From d5e88108a71c6c256b67948515f20955cc96fabf Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 18 Jul 2017 21:50:10 +0200 Subject: [PATCH 01/69] [WIP] Add Generalized Linear Model, issue #5975, initial commit --- sklearn/linear_model/__init__.py | 7 + sklearn/linear_model/glm.py | 872 +++++++++++++++++++++++++ sklearn/linear_model/tests/test_glm.py | 73 +++ 3 files changed, 952 insertions(+) create mode 100644 sklearn/linear_model/glm.py create mode 100644 sklearn/linear_model/tests/test_glm.py diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 2e01990ccce8c..5acc51e9dc87f 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,6 +18,12 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) +from .glm import (Link, IdentityLink, LogLink, + ExponentialDispersionModel, TweedieDistribution, + NormalDistribution, GaussianDistribution, + PoissonDistribution, GammaDistribution, + InverseGaussianDistribution, GeneralizedHyperbolicSecand, + GeneralizedLinearModel) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -38,6 +44,7 @@ 'BayesianRidge', 'ElasticNet', 'ElasticNetCV', + 'GeneralizedLinearModel', 'Hinge', 'Huber', 'HuberRegressor', diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py new file mode 100644 index 0000000000000..8b6eb8f3bf16c --- /dev/null +++ b/sklearn/linear_model/glm.py @@ -0,0 +1,872 @@ +""" +Generalized Linear Models with Exponential Dispersion Family +""" + +# Author: Christian Lorentzen +# License: BSD 3 clause + +# TODO: Which name? GeneralizedLinearModel vs GeneralizedLinearRegression. +# So far, it is GeneralizedLinearModel, since it could very easily +# extended by Bernoulli/Binomial distribution. +# TODO: Which name/symbol for coefficients and weights in docu? +# sklearn.linear_models uses w for coefficients. +# So far, coefficients=beta and weight=w (as standard literature) +# TODO: Add l2-penalty +# TODO: Add l1-penalty (elastic net) +# TODO: Add cross validation +# TODO: Write docu and examples + +# Design Decisions: +# - The link funtion (instance of class Link) is necessary for the evaluation +# of deviance, score, Fisher and Hessian matrix as functions of the +# coefficients, which is needed by optimizers. +# Solution: link as argument in those functions + +from __future__ import division +from abc import ABCMeta, abstractmethod, abstractproperty +import numbers +import numpy as np +from scipy import linalg, optimize, sparse +import warnings +from .base import LinearModel, LinearRegression +from ..base import RegressorMixin +from ..utils import check_X_y +from ..utils.extmath import safe_sparse_dot +from ..utils.optimize import newton_cg +from ..utils.validation import check_is_fitted + + + +class Link(metaclass=ABCMeta): + """Abstract base class for Link funtions + """ + + @abstractmethod + def link(self, mu): + """The link function g(mu) with argument mu=E[Y] returns the + linear predictor. + """ + raise NotImplementedError + + @abstractmethod + def derivative(self, mu): + """Derivative of the link g'(mu). + """ + raise NotImplementedError + + @abstractmethod + def inverse(self, lin_pred): + """The inverse link function h(lin_pred) with the linear predictor as + argument returns mu=E[Y]. 
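As an illustration of the Link contract defined above (not part of the patch), the log link and its inverse can be checked numerically with plain NumPy; all names below exist only for this example::

    # Numerical sketch: g(mu) -> eta and h(eta) -> mu are inverses,
    # and h'(eta) agrees with a finite-difference estimate.
    import numpy as np

    mu = np.array([0.5, 1.0, 2.0])
    eta = np.log(mu)                       # g(mu), the linear predictor
    assert np.allclose(np.exp(eta), mu)    # h(eta) = g^{-1}(eta)

    eps = 1e-6
    h_prime_fd = (np.exp(eta + eps) - np.exp(eta - eps)) / (2 * eps)
    assert np.allclose(h_prime_fd, np.exp(eta))    # h'(eta) = exp(eta)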
+ """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative(self, lin_pred): + """Derivative of the inverse link function h'(lin_pred). + """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative2(self, lin_pred): + """Second derivative of the inverse link function h''(lin_pred). + """ + raise NotImplementedError + +class IdentityLink(Link): + """The identity link function g(x)=x. + """ + + def link(self, mu): + return mu + + def derivative(self, mu): + return np.ones_like(mu) + + def inverse(self, lin_pred): + return lin_pred + + def inverse_derivative(self, lin_pred): + return np.ones_like(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.zeros_like(lin_pred) + + +class LogLink(Link): + """The log link function g(x)=log(x). + """ + + def link(self, mu): + return np.log(mu) + + def derivative(self, mu): + return 1./mu + + def inverse(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.exp(lin_pred) + + +class ExponentialDispersionModel(metaclass=ABCMeta): + """Base class for reproductive Exponential Dispersion Models (EDM). + + The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + + with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, + unit variance :math:`v(\mu)` and + unit deviance :math:`d(y,\mu)`. + + Attributes + ---------- + lower_bound + upper_bound + + Methods + ------- + in_y_range + unit_variance + unit_variance_derivative + variance + variance_derivative + unit_deviance + unit_deviance_derivative + deviance + deviance_derivative + starting_mu + + _score + _fisher_matrix + _observed_information + _deviance + _deviance_derivative + _deviance_hessian + + References + ---------- + See https://en.wikipedia.org/wiki/Exponential_dispersion_model. + """ + + @abstractproperty + def lower_bound(self): + """The lower bound of values of Y~EDM. + """ + raise NotImplementedError() + + @abstractproperty + def upper_bound(self): + """The upper bound of values of Y~EDM. + """ + raise NotImplementedError() + + @abstractmethod + def in_y_range(self, x): + """Returns true if x is in the valid range of Y~EDM. + """ + raise NotImplementedError() + + @abstractmethod + def unit_variance(self, mu): + """The unit variance :math:`v(mu)` determines the variance as + a function of the mean mu by + :math:`\mathrm{Var}[Y_i] = \phi/w_i*v(\mu_i)`. + It can also be derived from the unit deviance :math:`d(y,\mu)` as + + .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ + \partial\mu^2}}\big|_{y=\mu} + """ + raise NotImplementedError() + + @abstractmethod + def unit_variance_derivative(self, mu): + """The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. + """ + raise NotImplementedError() + + def variance(self, mu, phi=1, weight=1): + """The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is + :math:`\mathrm{Var}[Y_i]=\phi/w_i*v(\mu_i)`, + with unit variance v(mu). + """ + return phi/weight * self.unit_variance(mu) + + def variance_derivative(self, mu, phi=1, weight=1): + """The derivative of the variance w.r.t. mu, + :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] + =phi/w_i*v'(\mu_i)`, with unit variance v(mu). 
+ """ + return phi/weight * self.unit_variance_derivative(mu) + + @abstractmethod + def unit_deviance(self, y, mu): + """The unit_deviance :math:`d(y,\mu)`. + In terms of the log-likelihood it is given by + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` + """ + raise NotImplementedError() + + def unit_deviance_derivative(self, y, mu): + """The derivative w.r.t. mu of the unit_deviance + :math:`\frac{d}{d\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + with unit variance :math:`v(\mu)`. + + Returns + ------- + derivative: array, shape = (n_samples,) + """ + return -2*(y-mu)/self.unit_variance(mu) + + def deviance(self, y, mu, weight=1): + """The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) + with weight :math:`w_i` and unit_deviance :math:`d(y,mu)`. + In terms of the likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,\mu,\frac{phi}{w}) + - loglike(y,y,\frac{phi}{w})\right).` + """ + return np.sum(weight*self.unit_deviance(y,mu)) + + def _deviance(self, coef, X, y, weight, link): + """The deviance as a function of the coefficients ``coef`` + (:math:`beta`). + """ + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + return self.deviance(y, mu, weight) + + def deviance_derivative(self, y, mu, weight=1): + """The derivative w.r.t. mu of the deviance.` + """ + return weight*self.unit_deviance_derivative(y,mu) + + def _score(self, coef, phi, X, y, weight, link): + """The score function :math:`s` is the derivative of the + log-likelihood w.r.t. the ``coef`` (:math:`\beta`). + It is given by + + .. math: + + \mathbf{s}(\boldsymbol{\beta}) = \mathbf{X}^T \mathbf{D} + \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, + + with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and + :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}(y_1),\ldots)`. + """ + n_samples = X.shape[0] + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + d = link.inverse_derivative(lin_pred) + d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=False) + score = safe_sparse_dot(X.T, temp, dense_output=False) + return score + + def _fisher_matrix(self, coef, phi, X, y, weight, link): + """The Fisher information matrix, also known as expected + information matrix. It is given by + + .. math: + + \mathbf{F}(\boldsymbol{\beta}) = \mathrm{E}\left[ + -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} + \partial\boldsymbol{\beta}^T}\right] + = \mathbf{X}^T W \mathbf{X} \,, + + with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, + see score function. + """ + n_samples = X.shape[0] + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + d2 = link.inverse_derivative(lin_pred)**2 + d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) + fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) + return fisher_matrix + + def _observed_information(self, coef, phi, X, y, weight, link): + """The observed information matrix, also known as the negative of + the Hessian matrix of the log-likelihood. It is given by + + .. 
math: + + \mathbf{H}(\boldsymbol{\beta}) = + -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} + \partial\boldsymbol{\beta}^T} + = \mathbf{X}^T \legt[ + - \mathbf{D}' \mathbf{R} + + \mathbf{D}^2 \mathbf{V} \mathbf{R} + + \mathbf{D}^2 + \right] \boldsymbol{\Sigma}^{-1} \mathbf{X} \,, + + with :math:`\mathbf{R} = \mathrm{diag}(y_i - \mu_i)`, + :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{ + v(\mu_i)} + \right)`, + see score function and Fisher matrix. + """ + n_samples = X.shape[0] + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + dp = link.inverse_derivative2(lin_pred) + d2 = link.inverse_derivative(lin_pred)**2 + v = self.unit_variance_derivative(mu)/self.unit_variance(mu) + r = y - mu + temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(temp, X, dense_output=False) + observed_information = safe_sparse_dot(X.T, temp, dense_output=False) + return observed_information + + def _deviance_derivative(self, coef, X, y, weight, link): + """The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a + function of the coefficients ``coef``. + This is equivalent to :math:`-2\phi` times the score function + :math:`s` (derivative of the log-likelihood). + """ + score = self._score(coef=coef, phi=1, X=X, y=y, weight=weight, + link=link) + return -2*score + + def _deviance_hessian(self, coef, X, y, weight, link): + """The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance + as a function of the coefficients ``coef``. + This is equivalent to :math:`+2\phi` times the observed information + matrix. + """ + info_matrix = self._observed_information(coef=coef, phi=1, + X=X, y=y, weight=weight, link=link) + return 2*info_matrix + + def starting_mu(self, y, weight=1): + """Starting values for the mean mu_i in IRLS.""" + return (weight*y+np.mean(weight*y))/(2.*np.sum(np.ones_like(y)*weight)) + + +class TweedieDistribution(ExponentialDispersionModel): + """A class for the Tweedie distribution. + They have mu=E[X] and Var[X] \propto mu**power. + + Attributes + ---------- + power : float + The variance power of the unit_variance + :math:`v(mu) = mu^{power}`. 
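For orientation, the power attribute maps onto the familiar special cases; a small usage sketch, assuming the module is importable as sklearn.linear_model.glm once this patch is applied (as in the tests added below)::

    from sklearn.linear_model.glm import TweedieDistribution

    for power, name in [(0, 'Normal'), (1, 'Poisson'),
                        (2, 'Gamma'), (3, 'Inverse Gaussian')]:
        dist = TweedieDistribution(power=power)
        # unit variance v(mu) = mu**power evaluated at mu = 2
        print(name, dist.unit_variance(2.0))   # 1.0, 2.0, 4.0, 8.0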
+ """ + def __init__(self, power=0): + self.power = power + self._upper_bound = np.Inf + self._upper_compare = lambda x: np.less(x, self.upper_bound) + if power < 0: + #Extreme Stable + self._lower_bound = -np.Inf + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif power == 0: + #GaussianDistribution + self._lower_bound = -np.Inf + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif (power > 0) and (power < 1): + raise ValueError('For 0 1) and (power < 2): + #Compound Poisson + self._lower_bound = 0 + self._lower_compare = ( + lambda x: np.greater_equal(x, self.lower_bound)) + elif power == 2: + #GammaDistribution + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif (power > 2) and (power < 3): + #Positive Stable + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif power == 3: + #InverseGaussianDistribution + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif power > 3: + #Positive Stable + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + + @property + def power(self): + return self._power + + @power.setter + def power(self, power): + if not isinstance(power, numbers.Real): + raise TypeError('power must be a real number, input was {0}' + .format(power)) + self._power = power + + @property + def lower_bound(self): + return self._lower_bound + + @property + def upper_bound(self): + return self._upper_bound + + def in_y_range(self, x): + return np.logical_and(self._lower_compare(x), self._upper_compare(x)) + + def unit_variance(self, mu): + """The unit variance of a Tweedie distribution is v(mu)=mu**power. + """ + return np.power(mu, self.power) + + def unit_variance_derivative(self, mu): + """The derivative of the unit variance of a Tweedie distribution is + v(mu)=power*mu**(power-1). + """ + return self.power*np.power(mu, self.power-1) + + def unit_deviance(self, y, mu): + p = self.power + if p == 0: + #NormalDistribution + return (y-mu)**2 + if p == 1: + #PoissonDistribution + return 2 * (np.where(y==0,0,y*np.log(y/mu))-y+mu) + elif p == 2: + #GammaDistribution + return 2 * (np.log(mu/y)+y/mu-1) + else: + #return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) + # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) + return 2 * (np.power(np.maximum(y,0), 2-p)/((1-p)*(2-p)) + - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + + def likelihood(self, y, X, beta, phi, weight=1): + raise NotImplementedError('This function is not (yet) implemented.') + + +class NormalDistribution(TweedieDistribution): + """Class for the Normal (aka Gaussian) distribution""" + def __init__(self): + super(NormalDistribution, self).__init__(power=0) + +GaussianDistribution = NormalDistribution + +class PoissonDistribution(TweedieDistribution): + """Class for the scaled Poisson distribution""" + def __init__(self): + super(PoissonDistribution, self).__init__(power=1) + +class GammaDistribution(TweedieDistribution): + """Class for the Gamma distribution""" + def __init__(self): + super(GammaDistribution, self).__init__(power=2) + +class InverseGaussianDistribution(TweedieDistribution): + """Class for the scaled InverseGaussianDistribution distribution""" + def __init__(self): + super(InverseGaussianDistribution, self).__init__(power=3) + +class GeneralizedHyperbolicSecand(ExponentialDispersionModel): + """A class for the von Generalized Hyperbolic Secand (GHS) distribution. 
+ + The GHS distribution is for data y in (-inf, inf). + """ + def __init__(self): + self._lower_bound = -np.Inf + self._upper_bound = np.Inf + + @property + def lower_bound(self): + return self._lower_bound + + @property + def upper_bound(self): + return self._upper_bound + + def in_y_range(self, x): + np.logical_and( + np.greater(x, self.lower_bound), + np.less(x, self.lower_bound) + ) + + def unit_variance(self, mu): + return 1 + mu**2 + + def unit_variance_derivative(self, mu): + return 2*mu + + def unit_deviance(self, y, mu): + return (2*y*(np.arctan(y) - np.arctan(mu)) + + np.log((1+mu**2)/(1+y**2))) + + + +class GeneralizedLinearModel(LinearModel, RegressorMixin): + """ + Class to fit a Generalized Linear Model (GLM) based on reproductive + Exponential Dispersion Models (EDM). + + Assumptions: + + - The target values y_i are realizations of random variables + :math:`Y_i \sim \mathrm{EDM}(\mu_i, \frac{\phi}{w_i})` with dispersion + parameter :math:`\phi` and weights :math:`w_i`. + - The expectation of :math:`Y_i` is :math:`mu_i=\mathrm{E}[Y]=h(\eta_i)` + whith the linear predictor :math:`\eta=X*\beta`, inverse link function + :math:`h(\eta)`, design matrix :math:`X` and parameters :math:`\beta` + to be estimated. + + Note that the first assumption implies + :math:`\mathrm{Var}[Y_i]=\frac{\phi}{w_i} v(\mu_i)` with uni variance + function :math:`v(\mu)`. + + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments :math:`E[Y_i]=\mu_i=h(\eta_i)` and + :math:`Var[Y_i]=\frac{\phi}{w_i} v(\mu_i)` + + The parameters :math:`\beta` are estimated by maximum likelihood which is + equivalent to minimizing the deviance. + + TODO: Estimation of the dispersion parameter phi. + + TODO: Notes on 'scaled' Poisson and weights + + Parameters + ---------- + fit_intercept : boolean, optional, default True + whether to calculate the intercept for this model. If set + to False, no intercept will be used in calculations + (e.g. data is expected to be already centered). + + family : ExponentialDispersionModel, optional, default NormalDistribution() + the distributional assumption of the GLM + + link : Link, optional, default IdentityLink() + the link function (class) of the GLM + + fit_dispersion : {None, 'chisqr', 'deviance'}, defaul 'chisqr' + method for estimation of the dispersion parameter phi. Whether to use + the chi squared statisic or the deviance statistic. If None, the + dispersion is not estimated. + + solver : {'irls', 'newton-cg', 'lbfgs'}, defaul 'irls' + Algorithm to use in the optimization problem. + + - 'irls' is iterated reweighted least squares. It is the standard + algorithm for GLMs. + + - 'newton-cg', 'lbfgs' + + max_iter : int, default 100 + TODO + + tol : float + Stopping criterion. For the irls, newton-cg and lbfgs solvers, + the iteration will stop when ``max{|g_i | i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative of + the deviance). + + start_params : {array shape (n_features, ), 'ols'}, default None + sets the start values for coef_ in the fit. + If None, default values are taken. + If 'ols' the result of an ordinary least squares in the link space + (linear predictor) is taken. + If an array is given, these values are taken as coef_ to start with. + If fit_intercept is true, the first value is assumed to be the start + value for the intercept_. + + verbose : int, default: 0 + For the lbfgs solver set verbose to any positive + number for verbosity. 
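A usage sketch for the estimator documented above (assumes this patch is applied; it mirrors the log-link test added below, the data being made up for the example)::

    import numpy as np
    from sklearn.linear_model.glm import (GeneralizedLinearModel,
                                          PoissonDistribution, LogLink)

    X = np.array([[1., 1., 1., 1., 1.], [0., 1., 2., 3., 4.]]).T
    y = np.exp(np.dot(X, [1., 2.]))    # log-linear mean, Poisson-type target

    glm = GeneralizedLinearModel(family=PoissonDistribution(), link=LogLink(),
                                 fit_intercept=False, solver='irls',
                                 start_params='ols')
    glm.fit(X, y)
    print(glm.coef_)    # expected to be close to [1., 2.]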
+ + Attributes + ---------- + coef_ : array, shape (1, n_features) + Estimated coefficients for the linear predictor (X*coef_) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\phi` if fit_dispersion is set. + + n_iter_ : int + Actual number of iterations of the solver. + + Notes + ----- + + References + ---------- + TODO + """ + + def __init__(self, fit_intercept=True, family=NormalDistribution(), + link=IdentityLink(), fit_dispersion='chisqr', solver='irls', max_iter=100, + tol=1e-4, start_params=None, verbose=0): + self.fit_intercept = fit_intercept + self.family = family + self.link = link + self.fit_dispersion = fit_dispersion + self.solver = solver + self.max_iter = 100 + self.tol = tol + self.start_params = start_params + self.verbose = verbose + + def fit(self, X, y, weight=None): + """ + Fit a generalized linear model. + + Parameters + ---------- + X : numpy array or sparse matrix of shape [n_samples,n_features] + Training data + + y : numpy array of shape [n_samples] + Target values + + weight : numpy array of shape [n_samples] + Individual weights for each sample. + Var[Y_i]=phi/weight_i * v(mu) + If Y_i ~ EDM(mu, phi/w_i) then + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)) + + Returns + ------- + self : returns an instance of self. + """ + if not isinstance(self.family, ExponentialDispersionModel): + raise ValueError("The argument family must be an instance of class" + "ExponentialDispersionModel.") + if not isinstance(self.fit_intercept, bool): + raise ValueError("The argument fit_intercept must be bool," + " got {0}".format(self.fit_intercept)) + if not self.solver in ['irls', 'lbfgs', 'newton-cg']: + raise ValueError("GLM Regression supports only irls, lbfgs and" + "newton-cg solvers, got {0}".format(self.solver)) + if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: + raise ValueError("Maximum number of iteration must be positive;" + " got (max_iter={0!r})".format(self.max_iter)) + if not isinstance(self.tol, numbers.Number) or self.tol < 0: + raise ValueError("Tolerance for stopping criteria must be " + "positive; got (tol={0!r})".format(self.tol)) + start_params = self.start_params + if start_params is not None and start_params is not 'ols': + start_params = np.atleast_1d(start_params) + if start_params.shape[0] != X.shape[1] + self.fit_intercept: + raise ValueError("Start values for parameters must have the" + "right length; required length {0}, got {1}".format( + X.shape[1] + self.fit_intercept, start_params.shape[0])) + + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + y_numeric=True, multi_output=False) + y = y.astype(np.float64) + + if not np.all(self.family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}".format(self.family.__class__.__name__)) + + if weight is None: + weight = np.ones_like(y) + elif np.isscalar(weight): + weight = weight*np.ones_like(y) + else: + weight = np.atleast_1d(weight) + if weight.ndim > 1: + raise ValueError("Weights must be 1D array or scalar") + elif weight.shape[0] != y.shape[0]: + raise ValueError("Weights must have the same length as y") + + + if self.fit_intercept: + #intercept is first column <=> coef[0] is for intecept + if sparse.issparse(X): + Xnew = sparse.hstack([np.ones([X.shape[0],1]), X]) + else: + Xnew = np.concatenate((np.ones((X.shape[0],1)), X), axis=1) + else: + Xnew = X + + n_samples, n_features = Xnew.shape + + #Note: Since phi does not enter 
the estimation of mu_i=E[y_i] + # set it to 1 where convenient. + + #set start values for coef + coef = None + if start_params is None: + #Use mu_start and apply one irls step to calculate coef + mu = self.family.starting_mu(y, weight) + #linear predictor + eta = self.link.link(mu) + #h'(eta) + hp = self.link.inverse_derivative(eta) + #working weights w, in principle a diagonal matrix + #therefore here just as 1d array + w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + wroot = np.sqrt(w) + #working observations + yw = eta + (y-mu)/hp + #least squares rescaled with wroot + wroot = sparse.dia_matrix((wroot, 0), shape=(n_samples, n_samples)) + X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) + yw_rescale = safe_sparse_dot(wroot, y, dense_output=True) + coef = linalg.lstsq(X_rescale, yw_rescale)[0] + elif start_params is 'ols': + reg = LinearRegression(copy_X=False, + fit_intercept=False) + reg.fit(Xnew, self.link.link(y)) + coef = reg.coef_ + else: + coef = start_params + + #algorithms for optimiation + #TODO: Parallelize it + self.n_iter_ = 0 + converged = False + if self.solver == 'irls': + #linear predictor + eta = safe_sparse_dot(Xnew, coef, dense_output=True) + mu = self.link.inverse(eta) + while self.n_iter_ < self.max_iter: + self.n_iter_ += 1 + #coef_old not used so far. + #coef_old = coef + #h'(eta) + hp = self.link.inverse_derivative(eta) + #working weights w, in principle a diagonal matrix + #therefore here just as 1d array + w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + wroot = np.sqrt(w) + #working observations + yw = eta + (y-mu)/hp + #least squares rescaled with wroot + wroot = sparse.dia_matrix((wroot, 0), + shape=(n_samples, n_samples)) + X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) + yw_rescale = safe_sparse_dot(wroot, yw, dense_output=True) + coef, residues, rank, singular_ = ( + linalg.lstsq(X_rescale, yw_rescale)) + + #updated linear predictor + #do it here for updated values for tolerance + eta = safe_sparse_dot(Xnew, coef, dense_output=True) + mu = self.link.inverse(eta) + + #which tolerace? |coef - coef_old| or gradient? + #use gradient for compliance with newton-cg and lbfgs + #TODO: faster computation of gradient, use mu and eta directly + gradient = self.family._deviance_derivative(coef=coef, + X=Xnew, y=y, weight=weight, link=self.link) + if (np.max(np.abs(gradient)) <= self.tol): + converged = True + break + + if not converged: + warnings.warn("irls failed to converge. Increase the number " + "of iterations (currently {0})".format(self.max_iter)) + + #TODO: performance: make one function return both deviance and gradient + elif self.solver == 'lbfgs': + func = self.family._deviance + fprime = self.family._deviance_derivative + args = (Xnew, y, weight, self.link) + coef, loss, info = optimize.fmin_l_bfgs_b( + func, coef, fprime=fprime, + args=args, + iprint=(self.verbose > 0) - 1, pgtol=self.tol, + maxiter=self.max_iter) + if self.verbose > 0: + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." 
+ " Increase the number of iterations.") + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}".format( + info["task"])) + self.n_iter_ = info['nit'] + elif self.solver == 'newton-cg': + func = self.family._deviance + grad = self.family._deviance_derivative + def grad_hess(coef, X, y, weight, link): + grad = (self.family + ._deviance_derivative(coef, X, y, weight, link)) + hessian = (self.family + ._deviance_hessian(coef, X, y, weight,link)) + def Hs(s): + ret = np.dot(hessian, s) + return ret + return grad, Hs + hess = grad_hess + args = (Xnew, y, weight, self.link) + coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, + maxiter=self.max_iter, tol=self.tol) + self.coef_ = coef + + if self.fit_intercept is True: + self.intercept_ = coef[0] + self.coef_ = coef[1:] + else: + self.coef_ = coef + + if self.fit_dispersion in ['chisqr', 'deviance']: + self.dispersion_ = self.estimate_phi(y, X, weight) + + return self + + def predict(self, X, weight=1): + check_is_fitted(self, "coef_") + eta = safe_sparse_dot(X, self.coef_, dense_output=True) + if self.fit_intercept is True: + eta += self.intercept_ + mu = self.link.inverse(eta) + return mu*weight + + def estimate_phi(self, y, X, weight): + n_samples, n_features = X.shape + eta = safe_sparse_dot(X, self.coef_, dense_output=True) + if self.fit_intercept is True: + eta += self.intercept_ + mu = self.link.inverse(eta) + if self.fit_dispersion == 'chisqr': + chisq = np.sum(weight*(y-mu)**2/self.family.unit_variance(mu)) + return chisq/(n_samples - n_features) + elif self.fit_dispersion == 'deviance': + dev = self.family.deviance(y, mu, weight) + return dev/(n_samples - n_features) + + def score(self, X, y, weight=1): + """The natural score for a GLM is -deviance. + Returns the weight averaged negitive deviance (the better the score, + the better the fit). Maximum score is therefore 0. + """ + #RegressorMixin has R^2 score. 
+ #TODO: Make it more compatible with the score function in + # sklearn.metrics.regression.py + eta = safe_sparse_dot(X, self.coef_, dense_output=True) + if self.fit_intercept is True: + eta += self.intercept_ + mu = self.link.inverse(eta) + output_errors = self.family.unit_deviance(y,mu) + weight = weight * np.ones_like(y) + return np.average(output_errors, weights=weight) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py new file mode 100644 index 0000000000000..2a88a7ec899d9 --- /dev/null +++ b/sklearn/linear_model/tests/test_glm.py @@ -0,0 +1,73 @@ +import numpy as np + +from sklearn.linear_model.glm import (Link, IdentityLink, LogLink, + ExponentialDispersionModel, TweedieDistribution, + NormalDistribution, GaussianDistribution, + PoissonDistribution, GammaDistribution, + InverseGaussianDistribution, GeneralizedHyperbolicSecand, + GeneralizedLinearModel) + +from sklearn.utils.testing import (assert_equal, assert_array_equal, + assert_array_almost_equal) + +def test_family_bounds(): + """Test the valid range of distributions + """ + family = NormalDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [True, True, True]) + + family = PoissonDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, True, True]) + + family = TweedieDistribution(power=1.5) + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, True, True]) + + family = GammaDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, False, True]) + + family = InverseGaussianDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, False, True]) + + family = TweedieDistribution(power=4.5) + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, False, True]) + +def test_glm_identiy_regression(): + """Test linear regression on a simple dataset + """ + coef = [1,2] + X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + y = np.dot(X, coef) + for solver in ['irls', 'lbfgs', 'newton-cg']: + for family in (GaussianDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)): + glm = GeneralizedLinearModel(family=family, + fit_intercept=False, solver=solver) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + +def test_glm_log_regression(): + """Test linear regression on a simple dataset + """ + coef = [1,2] + X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + y = np.exp(np.dot(X, coef)) + #for solver in ['irls', 'lbfgs', 'newton-cg']: + for solver in ['irls']: + #for family in [GaussianDistribution(), PoissonDistribution(), + # GammaDistribution(), InverseGaussianDistribution(), + # TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)]: + for family in [GaussianDistribution()]: + glm = GeneralizedLinearModel(family=family, + link=LogLink(), + fit_intercept=False, solver=solver, start_params='ols') + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + +#TODO: Test compatibility with R's glm, glmnet From 2fc189d8351c9710c1329750545539afe3e6e40c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 19 Jul 2017 17:33:04 +0200 Subject: [PATCH 02/69] [WIP] Add Generalized Linear Models (#9405) * Fixed pep8 * Fixed flake8 * Rename GeneralizedLinearModel as GeneralizedLinearRegressor * Use of six.with_metaclass * PEP257: summary should be on same line as quotes * Docstring of class 
GeneralizedLinearRegressor: \ before mu * Arguments family and link accept strings * Use of ConvergenceWarning --- sklearn/linear_model/__init__.py | 13 +- sklearn/linear_model/glm.py | 231 ++++++++++++++----------- sklearn/linear_model/tests/test_glm.py | 75 ++++---- 3 files changed, 180 insertions(+), 139 deletions(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 5acc51e9dc87f..0c5840f343a3a 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,12 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from .glm import (Link, IdentityLink, LogLink, - ExponentialDispersionModel, TweedieDistribution, - NormalDistribution, GaussianDistribution, - PoissonDistribution, GammaDistribution, - InverseGaussianDistribution, GeneralizedHyperbolicSecand, - GeneralizedLinearModel) +from .glm import (TweedieDistribution, + GeneralizedLinearRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -44,7 +40,6 @@ 'BayesianRidge', 'ElasticNet', 'ElasticNetCV', - 'GeneralizedLinearModel', 'Hinge', 'Huber', 'HuberRegressor', @@ -84,4 +79,6 @@ 'orthogonal_mp', 'orthogonal_mp_gram', 'ridge_regression', - 'RANSACRegressor'] + 'RANSACRegressor', + 'GeneralizedLinearRegressor', + 'TweedieDistribution'] diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 8b6eb8f3bf16c..cf91a64fafc12 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -5,9 +5,6 @@ # Author: Christian Lorentzen # License: BSD 3 clause -# TODO: Which name? GeneralizedLinearModel vs GeneralizedLinearRegression. -# So far, it is GeneralizedLinearModel, since it could very easily -# extended by Bernoulli/Binomial distribution. # TODO: Which name/symbol for coefficients and weights in docu? # sklearn.linear_models uses w for coefficients. # So far, coefficients=beta and weight=w (as standard literature) @@ -17,6 +14,10 @@ # TODO: Write docu and examples # Design Decisions: +# - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. +# So far, it is GeneralizedLinearModel, since it could very easily +# extended by Bernoulli/Binomial distribution. +# Solution: GeneralizedLinearRegressor # - The link funtion (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. @@ -28,16 +29,17 @@ import numpy as np from scipy import linalg, optimize, sparse import warnings -from .base import LinearModel, LinearRegression -from ..base import RegressorMixin +from .base import LinearRegression +from ..base import BaseEstimator, RegressorMixin +from ..exceptions import ConvergenceWarning +from ..externals import six from ..utils import check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg from ..utils.validation import check_is_fitted - -class Link(metaclass=ABCMeta): +class Link(six.with_metaclass(ABCMeta)): """Abstract base class for Link funtions """ @@ -73,6 +75,7 @@ def inverse_derivative2(self, lin_pred): """ raise NotImplementedError + class IdentityLink(Link): """The identity link function g(x)=x. 
""" @@ -113,7 +116,7 @@ def inverse_derivative2(self, lin_pred): return np.exp(lin_pred) -class ExponentialDispersionModel(metaclass=ABCMeta): +class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): """Base class for reproductive Exponential Dispersion Models (EDM). The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by @@ -235,7 +238,7 @@ def deviance(self, y, mu, weight=1): \left(loglike(y,\mu,\frac{phi}{w}) - loglike(y,y,\frac{phi}{w})\right).` """ - return np.sum(weight*self.unit_deviance(y,mu)) + return np.sum(weight*self.unit_deviance(y, mu)) def _deviance(self, coef, X, y, weight, link): """The deviance as a function of the coefficients ``coef`` @@ -248,7 +251,7 @@ def _deviance(self, coef, X, y, weight, link): def deviance_derivative(self, y, mu, weight=1): """The derivative w.r.t. mu of the deviance.` """ - return weight*self.unit_deviance_derivative(y,mu) + return weight*self.unit_deviance_derivative(y, mu) def _score(self, coef, phi, X, y, weight, link): """The score function :math:`s` is the derivative of the @@ -269,7 +272,7 @@ def _score(self, coef, phi, X, y, weight, link): sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) d = link.inverse_derivative(lin_pred) d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=False) score = safe_sparse_dot(X.T, temp, dense_output=False) return score @@ -294,7 +297,7 @@ def _fisher_matrix(self, coef, phi, X, y, weight, link): sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) d2 = link.inverse_derivative(lin_pred)**2 d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) return fisher_matrix @@ -329,7 +332,7 @@ def _observed_information(self, coef, phi, X, y, weight, link): v = self.unit_variance_derivative(mu)/self.unit_variance(mu) r = y - mu temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) temp = safe_sparse_dot(temp, X, dense_output=False) observed_information = safe_sparse_dot(X.T, temp, dense_output=False) return observed_information @@ -341,7 +344,7 @@ def _deviance_derivative(self, coef, X, y, weight, link): :math:`s` (derivative of the log-likelihood). """ score = self._score(coef=coef, phi=1, X=X, y=y, weight=weight, - link=link) + link=link) return -2*score def _deviance_hessian(self, coef, X, y, weight, link): @@ -350,8 +353,8 @@ def _deviance_hessian(self, coef, X, y, weight, link): This is equivalent to :math:`+2\phi` times the observed information matrix. 
""" - info_matrix = self._observed_information(coef=coef, phi=1, - X=X, y=y, weight=weight, link=link) + info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, + weight=weight, link=link) return 2*info_matrix def starting_mu(self, y, weight=1): @@ -374,39 +377,39 @@ def __init__(self, power=0): self._upper_bound = np.Inf self._upper_compare = lambda x: np.less(x, self.upper_bound) if power < 0: - #Extreme Stable + # Extreme Stable self._lower_bound = -np.Inf self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif power == 0: - #GaussianDistribution + # GaussianDistribution self._lower_bound = -np.Inf self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif (power > 0) and (power < 1): raise ValueError('For 0 1) and (power < 2): - #Compound Poisson + # Compound Poisson self._lower_bound = 0 self._lower_compare = ( lambda x: np.greater_equal(x, self.lower_bound)) elif power == 2: - #GammaDistribution + # GammaDistribution self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif (power > 2) and (power < 3): - #Positive Stable + # Positive Stable self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif power == 3: - #InverseGaussianDistribution + # InverseGaussianDistribution self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif power > 3: - #Positive Stable + # Positive Stable self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) @@ -418,7 +421,7 @@ def power(self): def power(self, power): if not isinstance(power, numbers.Real): raise TypeError('power must be a real number, input was {0}' - .format(power)) + .format(power)) self._power = power @property @@ -446,19 +449,19 @@ def unit_variance_derivative(self, mu): def unit_deviance(self, y, mu): p = self.power if p == 0: - #NormalDistribution + # NormalDistribution return (y-mu)**2 if p == 1: - #PoissonDistribution - return 2 * (np.where(y==0,0,y*np.log(y/mu))-y+mu) + # PoissonDistribution + return 2 * (np.where(y == 0, 0, y*np.log(y/mu))-y+mu) elif p == 2: - #GammaDistribution + # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) else: - #return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) + # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) - return 2 * (np.power(np.maximum(y,0), 2-p)/((1-p)*(2-p)) - - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - + y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) def likelihood(self, y, X, beta, phi, weight=1): raise NotImplementedError('This function is not (yet) implemented.') @@ -469,23 +472,25 @@ class NormalDistribution(TweedieDistribution): def __init__(self): super(NormalDistribution, self).__init__(power=0) -GaussianDistribution = NormalDistribution class PoissonDistribution(TweedieDistribution): """Class for the scaled Poisson distribution""" def __init__(self): super(PoissonDistribution, self).__init__(power=1) + class GammaDistribution(TweedieDistribution): """Class for the Gamma distribution""" def __init__(self): super(GammaDistribution, self).__init__(power=2) + class InverseGaussianDistribution(TweedieDistribution): """Class for the scaled InverseGaussianDistribution distribution""" def __init__(self): super(InverseGaussianDistribution, self).__init__(power=3) + class GeneralizedHyperbolicSecand(ExponentialDispersionModel): """A class for the von Generalized Hyperbolic Secand (GHS) distribution. 
@@ -516,12 +521,11 @@ def unit_variance_derivative(self, mu): return 2*mu def unit_deviance(self, y, mu): - return (2*y*(np.arctan(y) - np.arctan(mu)) - + np.log((1+mu**2)/(1+y**2))) - + return (2*y*(np.arctan(y) - np.arctan(mu)) + + np.log((1+mu**2)/(1+y**2))) -class GeneralizedLinearModel(LinearModel, RegressorMixin): +class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """ Class to fit a Generalized Linear Model (GLM) based on reproductive Exponential Dispersion Models (EDM). @@ -531,7 +535,7 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): - The target values y_i are realizations of random variables :math:`Y_i \sim \mathrm{EDM}(\mu_i, \frac{\phi}{w_i})` with dispersion parameter :math:`\phi` and weights :math:`w_i`. - - The expectation of :math:`Y_i` is :math:`mu_i=\mathrm{E}[Y]=h(\eta_i)` + - The expectation of :math:`Y_i` is :math:`\mu_i=\mathrm{E}[Y]=h(\eta_i)` whith the linear predictor :math:`\eta=X*\beta`, inverse link function :math:`h(\eta)`, design matrix :math:`X` and parameters :math:`\beta` to be estimated. @@ -549,7 +553,9 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): TODO: Estimation of the dispersion parameter phi. - TODO: Notes on 'scaled' Poisson and weights + TODO: Notes on weights and 'scaled' Poisson, e.g. fit y = x/w with + with x=counts and w=exposure (time, money, persons, ...) => y is a + ratio with weights w. Parameters ---------- @@ -558,10 +564,12 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): to False, no intercept will be used in calculations (e.g. data is expected to be already centered). - family : ExponentialDispersionModel, optional, default NormalDistribution() - the distributional assumption of the GLM + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance + of a subclass of ExponentialDispersionModel, optional, default 'normal' + the distributional assumption of the GLM. - link : Link, optional, default IdentityLink() + link : {'identity', 'log'} or an instance of a subclass of Link, + optional, default IdentityLink() the link function (class) of the GLM fit_dispersion : {None, 'chisqr', 'deviance'}, defaul 'chisqr' @@ -622,8 +630,8 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): """ def __init__(self, fit_intercept=True, family=NormalDistribution(), - link=IdentityLink(), fit_dispersion='chisqr', solver='irls', max_iter=100, - tol=1e-4, start_params=None, verbose=0): + link=IdentityLink(), fit_dispersion='chisqr', solver='irls', + max_iter=100, tol=1e-4, start_params=None, verbose=0): self.fit_intercept = fit_intercept self.family = family self.link = link @@ -635,8 +643,7 @@ def __init__(self, fit_intercept=True, family=NormalDistribution(), self.verbose = verbose def fit(self, X, y, weight=None): - """ - Fit a generalized linear model. + """Fit a generalized linear model. Parameters ---------- @@ -657,12 +664,32 @@ def fit(self, X, y, weight=None): self : returns an instance of self. 
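To illustrate the weight semantics referred to in the TODO above (counts x observed under exposure w, modelled as the ratio y = x/w), a sketch assuming this patch is applied; the data are invented for the example::

    import numpy as np
    from sklearn.linear_model.glm import (GeneralizedLinearRegressor,
                                          PoissonDistribution, LogLink)

    exposure = np.array([10., 20., 30., 40.])   # e.g. observation time
    counts = np.array([5., 22., 31., 42.])
    X = np.array([[1., 0.], [1., 1.], [1., 2.], [1., 3.]])
    y = counts / exposure                       # modelled ratio

    glm = GeneralizedLinearRegressor(family=PoissonDistribution(),
                                     link=LogLink(), fit_intercept=False)
    glm.fit(X, y, weight=exposure)
    # predict() with weight gives expected counts = predicted rate * exposure
    print(glm.predict(X, weight=exposure))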
""" if not isinstance(self.family, ExponentialDispersionModel): - raise ValueError("The argument family must be an instance of class" - "ExponentialDispersionModel.") + if self.family == 'normal': + self.family = NormalDistribution() + elif self.family == 'poisson': + self.family = PoissonDistribution() + elif self.family == 'gamma': + self.family = GammaDistribution() + elif self.family == 'inverse.gaussian': + self.family = InverseGaussianDistribution() + else: + raise ValueError( + "The argument family must be an instance of class" + " ExponentialDispersionModel or an element of" + " ['normal', 'poisson', 'gamma', 'inverse.gaussian'].") + if not isinstance(self.link, Link): + if self.link == 'identity': + self.link = IdentityLink() + if self.link == 'log': + self.link = LogLink() + else: + raise ValueError( + "The argument link must be an instance of class Link or" + " an element of ['identity', 'log'].") if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool," " got {0}".format(self.fit_intercept)) - if not self.solver in ['irls', 'lbfgs', 'newton-cg']: + if self.solver not in ['irls', 'lbfgs', 'newton-cg']: raise ValueError("GLM Regression supports only irls, lbfgs and" "newton-cg solvers, got {0}".format(self.solver)) if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: @@ -676,8 +703,9 @@ def fit(self, X, y, weight=None): start_params = np.atleast_1d(start_params) if start_params.shape[0] != X.shape[1] + self.fit_intercept: raise ValueError("Start values for parameters must have the" - "right length; required length {0}, got {1}".format( - X.shape[1] + self.fit_intercept, start_params.shape[0])) + "right length; required length {0}, got {1}" + .format(X.shape[1] + self.fit_intercept, + start_params.shape[0])) X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], y_numeric=True, multi_output=False) @@ -685,7 +713,8 @@ def fit(self, X, y, weight=None): if not np.all(self.family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " - "range for family {0}".format(self.family.__class__.__name__)) + "range for family {0}" + .format(self.family.__class__.__name__)) if weight is None: weight = np.ones_like(y) @@ -698,96 +727,96 @@ def fit(self, X, y, weight=None): elif weight.shape[0] != y.shape[0]: raise ValueError("Weights must have the same length as y") - if self.fit_intercept: - #intercept is first column <=> coef[0] is for intecept + # intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): - Xnew = sparse.hstack([np.ones([X.shape[0],1]), X]) + Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) else: - Xnew = np.concatenate((np.ones((X.shape[0],1)), X), axis=1) + Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) else: Xnew = X n_samples, n_features = Xnew.shape - #Note: Since phi does not enter the estimation of mu_i=E[y_i] - # set it to 1 where convenient. + # Note: Since dispersion_ alias phi does not enter the estimation + # of mu_i=E[y_i] set it to 1 where convenient. 
- #set start values for coef + # set start values for coef coef = None if start_params is None: - #Use mu_start and apply one irls step to calculate coef + # Use mu_start and apply one irls step to calculate coef mu = self.family.starting_mu(y, weight) - #linear predictor + # linear predictor eta = self.link.link(mu) - #h'(eta) + # h'(eta) hp = self.link.inverse_derivative(eta) - #working weights w, in principle a diagonal matrix - #therefore here just as 1d array + # working weights w, in principle a diagonal matrix + # therefore here just as 1d array w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) - #working observations + # working observations yw = eta + (y-mu)/hp - #least squares rescaled with wroot + # least squares rescaled with wroot wroot = sparse.dia_matrix((wroot, 0), shape=(n_samples, n_samples)) X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) yw_rescale = safe_sparse_dot(wroot, y, dense_output=True) coef = linalg.lstsq(X_rescale, yw_rescale)[0] elif start_params is 'ols': - reg = LinearRegression(copy_X=False, - fit_intercept=False) + reg = LinearRegression(copy_X=False, fit_intercept=False) reg.fit(Xnew, self.link.link(y)) coef = reg.coef_ else: coef = start_params - #algorithms for optimiation - #TODO: Parallelize it + # algorithms for optimiation + # TODO: Parallelize it self.n_iter_ = 0 converged = False if self.solver == 'irls': - #linear predictor + # linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = self.link.inverse(eta) while self.n_iter_ < self.max_iter: self.n_iter_ += 1 - #coef_old not used so far. - #coef_old = coef - #h'(eta) + # coef_old not used so far. + # coef_old = coef + # h'(eta) hp = self.link.inverse_derivative(eta) - #working weights w, in principle a diagonal matrix - #therefore here just as 1d array + # working weights w, in principle a diagonal matrix + # therefore here just as 1d array w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) - #working observations + # working observations yw = eta + (y-mu)/hp - #least squares rescaled with wroot + # least squares rescaled with wroot wroot = sparse.dia_matrix((wroot, 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) yw_rescale = safe_sparse_dot(wroot, yw, dense_output=True) - coef, residues, rank, singular_ = ( + coef, residues, rank, singular_ = ( linalg.lstsq(X_rescale, yw_rescale)) - #updated linear predictor - #do it here for updated values for tolerance + # updated linear predictor + # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = self.link.inverse(eta) - #which tolerace? |coef - coef_old| or gradient? - #use gradient for compliance with newton-cg and lbfgs - #TODO: faster computation of gradient, use mu and eta directly - gradient = self.family._deviance_derivative(coef=coef, - X=Xnew, y=y, weight=weight, link=self.link) + # which tolerace? |coef - coef_old| or gradient? + # use gradient for compliance with newton-cg and lbfgs + # TODO: faster computation of gradient, use mu and eta directly + gradient = self.family._deviance_derivative( + coef=coef, X=Xnew, y=y, weight=weight, link=self.link) if (np.max(np.abs(gradient)) <= self.tol): converged = True break if not converged: warnings.warn("irls failed to converge. 
Increase the number " - "of iterations (currently {0})".format(self.max_iter)) + "of iterations (currently {0})" + .format(self.max_iter), ConvergenceWarning) - #TODO: performance: make one function return both deviance and gradient + # TODO: performance: make one function return both deviance and + # gradient of deviance elif self.solver == 'lbfgs': func = self.family._deviance fprime = self.family._deviance_derivative @@ -800,7 +829,8 @@ def fit(self, X, y, weight=None): if self.verbose > 0: if info["warnflag"] == 1: warnings.warn("lbfgs failed to converge." - " Increase the number of iterations.") + " Increase the number of iterations.", + ConvergenceWarning) elif info["warnflag"] == 2: warnings.warn("lbfgs failed for the reason: {0}".format( info["task"])) @@ -808,11 +838,13 @@ def fit(self, X, y, weight=None): elif self.solver == 'newton-cg': func = self.family._deviance grad = self.family._deviance_derivative + def grad_hess(coef, X, y, weight, link): - grad = (self.family - ._deviance_derivative(coef, X, y, weight, link)) - hessian = (self.family - ._deviance_hessian(coef, X, y, weight,link)) + grad = (self.family._deviance_derivative( + coef, X, y, weight, link)) + hessian = (self.family._deviance_hessian( + coef, X, y, weight, link)) + def Hs(s): ret = np.dot(hessian, s) return ret @@ -820,7 +852,7 @@ def Hs(s): hess = grad_hess args = (Xnew, y, weight, self.link) coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, - maxiter=self.max_iter, tol=self.tol) + maxiter=self.max_iter, tol=self.tol) self.coef_ = coef if self.fit_intercept is True: @@ -835,6 +867,9 @@ def Hs(s): return self def predict(self, X, weight=1): + """Prediction with features X. + If weights are given, returns prediction*weights. + """ check_is_fitted(self, "coef_") eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: @@ -860,13 +895,13 @@ def score(self, X, y, weight=1): Returns the weight averaged negitive deviance (the better the score, the better the fit). Maximum score is therefore 0. """ - #RegressorMixin has R^2 score. - #TODO: Make it more compatible with the score function in + # RegressorMixin has R^2 score. 
+ # TODO: Make it more compatible with the score function in # sklearn.metrics.regression.py eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ mu = self.link.inverse(eta) - output_errors = self.family.unit_deviance(y,mu) + output_errors = self.family.unit_deviance(y, mu) weight = weight * np.ones_like(y) return np.average(output_errors, weights=weight) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 2a88a7ec899d9..a4d4ea8650860 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,73 +1,82 @@ import numpy as np -from sklearn.linear_model.glm import (Link, IdentityLink, LogLink, - ExponentialDispersionModel, TweedieDistribution, - NormalDistribution, GaussianDistribution, - PoissonDistribution, GammaDistribution, - InverseGaussianDistribution, GeneralizedHyperbolicSecand, - GeneralizedLinearModel) +from sklearn.linear_model.glm import ( + # Link, IdentityLink, + LogLink, + TweedieDistribution, + NormalDistribution, PoissonDistribution, + GammaDistribution, InverseGaussianDistribution, + # GeneralizedHyperbolicSecand, + GeneralizedLinearRegressor) + +from sklearn.utils.testing import ( + # assert_equal, + assert_array_equal, assert_array_almost_equal) -from sklearn.utils.testing import (assert_equal, assert_array_equal, - assert_array_almost_equal) def test_family_bounds(): """Test the valid range of distributions """ family = NormalDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [True, True, True]) family = PoissonDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, True, True]) family = TweedieDistribution(power=1.5) - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, True, True]) family = GammaDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, False, True]) family = InverseGaussianDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, False, True]) family = TweedieDistribution(power=4.5) - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, False, True]) + def test_glm_identiy_regression(): """Test linear regression on a simple dataset """ - coef = [1,2] - X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + coef = [1, 2] + X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) + families = ( + NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) for solver in ['irls', 'lbfgs', 'newton-cg']: - for family in (GaussianDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)): - glm = GeneralizedLinearModel(family=family, - fit_intercept=False, solver=solver) + for family in families: + glm = GeneralizedLinearRegressor( + family=family, fit_intercept=False, solver=solver) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) + def test_glm_log_regression(): """Test linear regression on a simple dataset """ - coef = [1,2] - X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + coef = [1, 2] + X = 
np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) - #for solver in ['irls', 'lbfgs', 'newton-cg']: - for solver in ['irls']: - #for family in [GaussianDistribution(), PoissonDistribution(), - # GammaDistribution(), InverseGaussianDistribution(), - # TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)]: - for family in [GaussianDistribution()]: - glm = GeneralizedLinearModel(family=family, - link=LogLink(), - fit_intercept=False, solver=solver, start_params='ols') + families = ( + NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) + for solver in ['irls', 'lbfgs', 'newton-cg']: + for family in families: + glm = GeneralizedLinearRegressor( + family=family, link=LogLink(), fit_intercept=False, + solver=solver, start_params='ols') res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) -#TODO: Test compatibility with R's glm, glmnet + +# TODO: Test compatibility with R's glm, glmnet From a6137d85401ef72976327c211f44d721d9f81e00 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 19 Jul 2017 17:41:49 +0200 Subject: [PATCH 03/69] [WIP] Add Generalized Linear Models (#9405) * GeneralizedLinearRegressor added to doc/modules/classes.rst --- doc/modules/classes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index e09ca0422d8a7..2d451b6758eb1 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -720,6 +720,7 @@ Kernels: linear_model.BayesianRidge linear_model.ElasticNet linear_model.ElasticNetCV + linear_model.GeneralizedLinearRegressor linear_model.HuberRegressor linear_model.Lars linear_model.LarsCV From b0be167080588a35dc1f4b762d961edb897b1019 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 9 Aug 2017 13:38:49 +0200 Subject: [PATCH 04/69] [WIP] Add Generalized Linear Models (#9405) * fixed bug: init parameter max_iter * fix API for family and link: default parameter changed to string non public variables self._family_instance and self._link_instance * fixed bug in score, minus sign forgotten * added check_is_fitted to estimate_phi and score * added check_array(X) in predict * replaced lambda functions in TweedieDistribution * some documentation --- sklearn/linear_model/glm.py | 196 ++++++++++++++++++++++-------------- 1 file changed, 122 insertions(+), 74 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index cf91a64fafc12..0ee1564049329 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -15,9 +15,10 @@ # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. -# So far, it is GeneralizedLinearModel, since it could very easily -# extended by Bernoulli/Binomial distribution. -# Solution: GeneralizedLinearRegressor +# Estimators in sklearn are either regressors or classifiers. A Generalized +# Linear Model does both depending on the chosen distribution, e.g. Normal => +# regressor, Bernoulli/Binomial => classifier. +# Solution: GeneralizedLinearRegressor since this is the focus. # - The link funtion (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. 
@@ -33,7 +34,7 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six -from ..utils import check_X_y +from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg from ..utils.validation import check_is_fitted @@ -372,46 +373,67 @@ class TweedieDistribution(ExponentialDispersionModel): The variance power of the unit_variance :math:`v(mu) = mu^{power}`. """ + def _less_upper_bound(self, x): + return np.less(x, self.upper_bound) + + def _less_equal_upper_bound(self, x): + return np.less_equal(x, self.upper_bound) + + def _greater_lower_bound(self, x): + return np.greater(x, self.lower_bound) + + def _greater_equal_lower_bound(self, x): + return np.greater_equal(x, self.lower_bound) + def __init__(self, power=0): self.power = power self._upper_bound = np.Inf - self._upper_compare = lambda x: np.less(x, self.upper_bound) + # self._upper_compare = lambda x: np.less(x, self.upper_bound) + self._upper_compare = self._less_upper_bound if power < 0: # Extreme Stable self._lower_bound = -np.Inf - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif power == 0: - # GaussianDistribution + # NormalDistribution self._lower_bound = -np.Inf - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif (power > 0) and (power < 1): raise ValueError('For 0 1) and (power < 2): # Compound Poisson self._lower_bound = 0 - self._lower_compare = ( - lambda x: np.greater_equal(x, self.lower_bound)) + # self._lower_compare = ( + # lambda x: np.greater_equal(x, self.lower_bound)) + self._lower_compare = self._greater_equal_lower_bound elif power == 2: # GammaDistribution self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif (power > 2) and (power < 3): # Positive Stable self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif power == 3: # InverseGaussianDistribution self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif power > 3: # Positive Stable self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound @property def power(self): @@ -530,6 +552,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Class to fit a Generalized Linear Model (GLM) based on reproductive Exponential Dispersion Models (EDM). + #TODO: This belongs to User Guide Assumptions: - The target values y_i are realizations of random variables @@ -559,25 +582,26 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Parameters ---------- - fit_intercept : boolean, optional, default True - whether to calculate the intercept for this model. If set - to False, no intercept will be used in calculations - (e.g. 
data is expected to be already centered). + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance - of a subclass of ExponentialDispersionModel, optional, default 'normal' + of a subclass of ExponentialDispersionModel, optional + (default='normal') the distributional assumption of the GLM. link : {'identity', 'log'} or an instance of a subclass of Link, - optional, default IdentityLink() - the link function (class) of the GLM + optional (default='identity') + the link function of the GLM, i.e. mapping from linear predictor + (X*coef) to expectation (mu). - fit_dispersion : {None, 'chisqr', 'deviance'}, defaul 'chisqr' + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul='chisqr') method for estimation of the dispersion parameter phi. Whether to use the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'irls', 'newton-cg', 'lbfgs'}, defaul 'irls' + solver : {'irls', 'newton-cg', 'lbfgs'}, optional (defaul='irls') Algorithm to use in the optimization problem. - 'irls' is iterated reweighted least squares. It is the standard @@ -585,16 +609,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'newton-cg', 'lbfgs' - max_iter : int, default 100 + max_iter : int, optional (default=100) TODO - tol : float + tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i | i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative of the deviance). - start_params : {array shape (n_features, ), 'ols'}, default None + start_params : {array shape (n_features, ), 'ols'}, optional (default=None) sets the start values for coef_ in the fit. If None, default values are taken. If 'ols' the result of an ordinary least squares in the link space @@ -603,9 +627,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If fit_intercept is true, the first value is assumed to be the start value for the intercept_. - verbose : int, default: 0 - For the lbfgs solver set verbose to any positive - number for verbosity. + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. Attributes ---------- @@ -629,15 +652,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): TODO """ - def __init__(self, fit_intercept=True, family=NormalDistribution(), - link=IdentityLink(), fit_dispersion='chisqr', solver='irls', + def __init__(self, fit_intercept=True, family='normal', + link='identity', fit_dispersion='chisqr', solver='irls', max_iter=100, tol=1e-4, start_params=None, verbose=0): self.fit_intercept = fit_intercept self.family = family self.link = link self.fit_dispersion = fit_dispersion self.solver = solver - self.max_iter = 100 + self.max_iter = max_iter self.tol = tol self.start_params = start_params self.verbose = verbose @@ -663,29 +686,38 @@ def fit(self, X, y, weight=None): ------- self : returns an instance of self. 
""" - if not isinstance(self.family, ExponentialDispersionModel): + # Garantee that self._family_instance is an instance of class + # ExponentialDispersionModel + if isinstance(self.family, ExponentialDispersionModel): + self._family_instance = self.family + else: if self.family == 'normal': - self.family = NormalDistribution() + self._family_instance = NormalDistribution() elif self.family == 'poisson': - self.family = PoissonDistribution() + self._family_instance = PoissonDistribution() elif self.family == 'gamma': - self.family = GammaDistribution() + self._family_instance = GammaDistribution() elif self.family == 'inverse.gaussian': - self.family = InverseGaussianDistribution() + self._family_instance = InverseGaussianDistribution() else: raise ValueError( - "The argument family must be an instance of class" + "The family must be an instance of class" " ExponentialDispersionModel or an element of" " ['normal', 'poisson', 'gamma', 'inverse.gaussian'].") - if not isinstance(self.link, Link): + + # Garantee that self._link_instance is set to an instance of class Link + if isinstance(self.link, Link): + self._link_instance = self.link + else: if self.link == 'identity': - self.link = IdentityLink() - if self.link == 'log': - self.link = LogLink() + self._link_instance = IdentityLink() + elif self.link == 'log': + self._link_instance = LogLink() else: raise ValueError( - "The argument link must be an instance of class Link or" + "The link must be an instance of class Link or" " an element of ['identity', 'log'].") + if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool," " got {0}".format(self.fit_intercept)) @@ -711,10 +743,13 @@ def fit(self, X, y, weight=None): y_numeric=True, multi_output=False) y = y.astype(np.float64) - if not np.all(self.family.in_y_range(y)): + family = self._family_instance + link = self._link_instance + + if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" - .format(self.family.__class__.__name__)) + .format(family.__class__.__name__)) if weight is None: weight = np.ones_like(y) @@ -745,14 +780,14 @@ def fit(self, X, y, weight=None): coef = None if start_params is None: # Use mu_start and apply one irls step to calculate coef - mu = self.family.starting_mu(y, weight) + mu = family.starting_mu(y, weight) # linear predictor - eta = self.link.link(mu) + eta = link.link(mu) # h'(eta) - hp = self.link.inverse_derivative(eta) + hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -763,7 +798,7 @@ def fit(self, X, y, weight=None): coef = linalg.lstsq(X_rescale, yw_rescale)[0] elif start_params is 'ols': reg = LinearRegression(copy_X=False, fit_intercept=False) - reg.fit(Xnew, self.link.link(y)) + reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: coef = start_params @@ -775,16 +810,16 @@ def fit(self, X, y, weight=None): if self.solver == 'irls': # linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = self.link.inverse(eta) + mu = link.inverse(eta) while self.n_iter_ < self.max_iter: self.n_iter_ += 1 # coef_old not used so far. 
# coef_old = coef # h'(eta) - hp = self.link.inverse_derivative(eta) + hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -799,13 +834,13 @@ def fit(self, X, y, weight=None): # updated linear predictor # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = self.link.inverse(eta) + mu = link.inverse(eta) # which tolerace? |coef - coef_old| or gradient? # use gradient for compliance with newton-cg and lbfgs # TODO: faster computation of gradient, use mu and eta directly - gradient = self.family._deviance_derivative( - coef=coef, X=Xnew, y=y, weight=weight, link=self.link) + gradient = family._deviance_derivative( + coef=coef, X=Xnew, y=y, weight=weight, link=link) if (np.max(np.abs(gradient)) <= self.tol): converged = True break @@ -818,9 +853,9 @@ def fit(self, X, y, weight=None): # TODO: performance: make one function return both deviance and # gradient of deviance elif self.solver == 'lbfgs': - func = self.family._deviance - fprime = self.family._deviance_derivative - args = (Xnew, y, weight, self.link) + func = family._deviance + fprime = family._deviance_derivative + args = (Xnew, y, weight, link) coef, loss, info = optimize.fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, @@ -836,13 +871,13 @@ def fit(self, X, y, weight=None): info["task"])) self.n_iter_ = info['nit'] elif self.solver == 'newton-cg': - func = self.family._deviance - grad = self.family._deviance_derivative + func = family._deviance + grad = family._deviance_derivative def grad_hess(coef, X, y, weight, link): - grad = (self.family._deviance_derivative( + grad = (family._deviance_derivative( coef, X, y, weight, link)) - hessian = (self.family._deviance_hessian( + hessian = (family._deviance_hessian( coef, X, y, weight, link)) def Hs(s): @@ -850,7 +885,7 @@ def Hs(s): return ret return grad, Hs hess = grad_hess - args = (Xnew, y, weight, self.link) + args = (Xnew, y, weight, link) coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, maxiter=self.max_iter, tol=self.tol) self.coef_ = coef @@ -871,37 +906,50 @@ def predict(self, X, weight=1): If weights are given, returns prediction*weights. """ check_is_fitted(self, "coef_") + X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) + # TODO: validation of weight eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ - mu = self.link.inverse(eta) + mu = self._link_instance.inverse(eta) return mu*weight def estimate_phi(self, y, X, weight): + """Estimation of the dispersion parameter. + Returns the estimate. 
+ """ + check_is_fitted(self, "coef_") n_samples, n_features = X.shape eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ - mu = self.link.inverse(eta) + mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': - chisq = np.sum(weight*(y-mu)**2/self.family.unit_variance(mu)) + chisq = np.sum(weight*(y-mu)**2 / + self._family_instance.unit_variance(mu)) return chisq/(n_samples - n_features) elif self.fit_dispersion == 'deviance': - dev = self.family.deviance(y, mu, weight) + dev = self._family_instance.deviance(y, mu, weight) return dev/(n_samples - n_features) +# TODO: Fix "AssertionError: -0.28014056555724598 not greater than 0.5" +# in check_estimator for score +# from sklearn.utils.estimator_checks import check_estimator +# from sklearn.linear_model import GeneralizedLinearRegressor +# check_estimator(GeneralizedLinearRegressor) def score(self, X, y, weight=1): """The natural score for a GLM is -deviance. - Returns the weight averaged negitive deviance (the better the score, + Returns the weight averaged negative deviance (the better the score, the better the fit). Maximum score is therefore 0. """ # RegressorMixin has R^2 score. # TODO: Make it more compatible with the score function in # sklearn.metrics.regression.py + check_is_fitted(self, "coef_") eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ - mu = self.link.inverse(eta) - output_errors = self.family.unit_deviance(y, mu) + mu = self._link_instance.inverse(eta) + output_errors = self._family_instance.unit_deviance(y, mu) weight = weight * np.ones_like(y) - return np.average(output_errors, weights=weight) + return -np.average(output_errors, weights=weight) From 85c52ec9c6adb3b1f75650cfa7fe0b770393d24e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 13 Aug 2017 01:46:16 +0200 Subject: [PATCH 05/69] [WIP] Add Generalized Linear Models (#9405) * make raw docstrings where appropriate * make ExponentialDispersionModel (i.e. TweedieDistribution) pickable: ExponentialDispersionModel has new properties include_lower_bound, method in_y_range is not abstract anymore. * set self.intercept_=0 if fit_intercept=False, such that it is always defined. * set score to D2, a generalized R2 with deviance instead of squared error, as does glmnet. This also solves issues with check_regressors_train(GeneralizedLinearRegressor), which assumes R2 score. * change of names: weight to weights in ExponentialDispersionModel and to sample_weight in GeneralizedLinearRegressor * add class method linear_predictor --- sklearn/linear_model/glm.py | 330 ++++++++++++++++++++++-------------- 1 file changed, 199 insertions(+), 131 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 0ee1564049329..b80842f817f4d 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -5,13 +5,16 @@ # Author: Christian Lorentzen # License: BSD 3 clause +# TODO: Write more tests # TODO: Which name/symbol for coefficients and weights in docu? # sklearn.linear_models uses w for coefficients. 
-# So far, coefficients=beta and weight=w (as standard literature) -# TODO: Add l2-penalty +# So far, coefficients=beta and weights=w (as standard literature) +# TODO: Add l2-penalty (maybe more general w.P.w with P penalty matrix) # TODO: Add l1-penalty (elastic net) # TODO: Add cross validation # TODO: Write docu and examples +# TODO: Make it as much consistent to other estimators in linear_model as +# possible # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -118,7 +121,7 @@ def inverse_derivative2(self, lin_pred): class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): - """Base class for reproductive Exponential Dispersion Models (EDM). + r"""Base class for reproductive Exponential Dispersion Models (EDM). The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by @@ -136,6 +139,8 @@ class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): ---------- lower_bound upper_bound + include_lower_bound + include_upper_bound Methods ------- @@ -174,15 +179,39 @@ def upper_bound(self): """ raise NotImplementedError() - @abstractmethod + @abstractproperty + def include_lower_bound(self): + """If True, values of y may equal lower bound: y >= lower_bound. + """ + raise NotImplementedError() + + @abstractproperty + def include_upper_bound(self): + """If True, values of y may equal upper bound: y <= upper_bound. + """ + raise NotImplementedError() + def in_y_range(self, x): """Returns true if x is in the valid range of Y~EDM. """ - raise NotImplementedError() + if self.include_lower_bound: + if self.include_upper_bound: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less(x, self.upper_bound)) + else: + if self.include_upper_bound: + return np.logical_and(np.greater(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater(x, self.lower_bound), + np.less(x, self.upper_bound)) @abstractmethod def unit_variance(self, mu): - """The unit variance :math:`v(mu)` determines the variance as + r"""The unit variance :math:`v(mu)` determines the variance as a function of the mean mu by :math:`\mathrm{Var}[Y_i] = \phi/w_i*v(\mu_i)`. It can also be derived from the unit deviance :math:`d(y,\mu)` as @@ -194,27 +223,27 @@ def unit_variance(self, mu): @abstractmethod def unit_variance_derivative(self, mu): - """The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. + r"""The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. """ raise NotImplementedError() - def variance(self, mu, phi=1, weight=1): - """The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is + def variance(self, mu, phi=1, weights=1): + r"""The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is :math:`\mathrm{Var}[Y_i]=\phi/w_i*v(\mu_i)`, with unit variance v(mu). """ - return phi/weight * self.unit_variance(mu) + return phi/weights * self.unit_variance(mu) - def variance_derivative(self, mu, phi=1, weight=1): - """The derivative of the variance w.r.t. mu, + def variance_derivative(self, mu, phi=1, weights=1): + r"""The derivative of the variance w.r.t. mu, :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] =phi/w_i*v'(\mu_i)`, with unit variance v(mu). """ - return phi/weight * self.unit_variance_derivative(mu) + return phi/weights * self.unit_variance_derivative(mu) @abstractmethod def unit_deviance(self, y, mu): - """The unit_deviance :math:`d(y,\mu)`. + r"""The unit_deviance :math:`d(y,\mu)`. 
In terms of the log-likelihood it is given by :math:`d(y,\mu) = -2\phi\cdot \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` @@ -222,7 +251,7 @@ def unit_deviance(self, y, mu): raise NotImplementedError() def unit_deviance_derivative(self, y, mu): - """The derivative w.r.t. mu of the unit_deviance + r"""The derivative w.r.t. mu of the unit_deviance :math:`\frac{d}{d\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. @@ -232,30 +261,30 @@ def unit_deviance_derivative(self, y, mu): """ return -2*(y-mu)/self.unit_variance(mu) - def deviance(self, y, mu, weight=1): - """The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) - with weight :math:`w_i` and unit_deviance :math:`d(y,mu)`. + def deviance(self, y, mu, weights=1): + r"""The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) + with weights :math:`w_i` and unit_deviance :math:`d(y,mu)`. In terms of the likelihood it is :math:`D = -2\phi\cdot \left(loglike(y,\mu,\frac{phi}{w}) - loglike(y,y,\frac{phi}{w})\right).` """ - return np.sum(weight*self.unit_deviance(y, mu)) + return np.sum(weights*self.unit_deviance(y, mu)) - def _deviance(self, coef, X, y, weight, link): + def _deviance(self, coef, X, y, weights, link): """The deviance as a function of the coefficients ``coef`` (:math:`beta`). """ lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - return self.deviance(y, mu, weight) + return self.deviance(y, mu, weights) - def deviance_derivative(self, y, mu, weight=1): + def deviance_derivative(self, y, mu, weights=1): """The derivative w.r.t. mu of the deviance.` """ - return weight*self.unit_deviance_derivative(y, mu) + return weights*self.unit_deviance_derivative(y, mu) - def _score(self, coef, phi, X, y, weight, link): - """The score function :math:`s` is the derivative of the + def _score(self, coef, phi, X, y, weights, link): + r"""The score function :math:`s` is the derivative of the log-likelihood w.r.t. the ``coef`` (:math:`\beta`). It is given by @@ -270,7 +299,7 @@ def _score(self, coef, phi, X, y, weight, link): n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d = link.inverse_derivative(lin_pred) d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), shape=(n_samples, n_samples)) @@ -278,8 +307,8 @@ def _score(self, coef, phi, X, y, weight, link): score = safe_sparse_dot(X.T, temp, dense_output=False) return score - def _fisher_matrix(self, coef, phi, X, y, weight, link): - """The Fisher information matrix, also known as expected + def _fisher_matrix(self, coef, phi, X, y, weights, link): + r"""The Fisher information matrix, also known as expected information matrix. It is given by .. 
math: @@ -295,7 +324,7 @@ def _fisher_matrix(self, coef, phi, X, y, weight, link): n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d2 = link.inverse_derivative(lin_pred)**2 d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), shape=(n_samples, n_samples)) @@ -303,8 +332,8 @@ def _fisher_matrix(self, coef, phi, X, y, weight, link): fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) return fisher_matrix - def _observed_information(self, coef, phi, X, y, weight, link): - """The observed information matrix, also known as the negative of + def _observed_information(self, coef, phi, X, y, weights, link): + r"""The observed information matrix, also known as the negative of the Hessian matrix of the log-likelihood. It is given by .. math: @@ -327,7 +356,7 @@ def _observed_information(self, coef, phi, X, y, weight, link): n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) dp = link.inverse_derivative2(lin_pred) d2 = link.inverse_derivative(lin_pred)**2 v = self.unit_variance_derivative(mu)/self.unit_variance(mu) @@ -338,33 +367,34 @@ def _observed_information(self, coef, phi, X, y, weight, link): observed_information = safe_sparse_dot(X.T, temp, dense_output=False) return observed_information - def _deviance_derivative(self, coef, X, y, weight, link): - """The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a + def _deviance_derivative(self, coef, X, y, weights, link): + r"""The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a function of the coefficients ``coef``. This is equivalent to :math:`-2\phi` times the score function :math:`s` (derivative of the log-likelihood). """ - score = self._score(coef=coef, phi=1, X=X, y=y, weight=weight, + score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, link=link) return -2*score - def _deviance_hessian(self, coef, X, y, weight, link): - """The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance + def _deviance_hessian(self, coef, X, y, weights, link): + r"""The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance as a function of the coefficients ``coef``. This is equivalent to :math:`+2\phi` times the observed information matrix. """ info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, - weight=weight, link=link) + weights=weights, link=link) return 2*info_matrix - def starting_mu(self, y, weight=1): + def starting_mu(self, y, weights=1): """Starting values for the mean mu_i in IRLS.""" - return (weight*y+np.mean(weight*y))/(2.*np.sum(np.ones_like(y)*weight)) + return ((weights*y+np.mean(weights*y)) + / (2.*np.sum(np.ones_like(y)*weights))) class TweedieDistribution(ExponentialDispersionModel): - """A class for the Tweedie distribution. + r"""A class for the Tweedie distribution. They have mu=E[X] and Var[X] \propto mu**power. Attributes @@ -373,67 +403,44 @@ class TweedieDistribution(ExponentialDispersionModel): The variance power of the unit_variance :math:`v(mu) = mu^{power}`. 
""" - def _less_upper_bound(self, x): - return np.less(x, self.upper_bound) - - def _less_equal_upper_bound(self, x): - return np.less_equal(x, self.upper_bound) - - def _greater_lower_bound(self, x): - return np.greater(x, self.lower_bound) - - def _greater_equal_lower_bound(self, x): - return np.greater_equal(x, self.lower_bound) - def __init__(self, power=0): self.power = power self._upper_bound = np.Inf - # self._upper_compare = lambda x: np.less(x, self.upper_bound) - self._upper_compare = self._less_upper_bound + self._include_upper_bound = False if power < 0: # Extreme Stable self._lower_bound = -np.Inf - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif power == 0: # NormalDistribution self._lower_bound = -np.Inf - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif (power > 0) and (power < 1): raise ValueError('For 0 1) and (power < 2): # Compound Poisson self._lower_bound = 0 - # self._lower_compare = ( - # lambda x: np.greater_equal(x, self.lower_bound)) - self._lower_compare = self._greater_equal_lower_bound + self._include_lower_bound = True elif power == 2: # GammaDistribution self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif (power > 2) and (power < 3): # Positive Stable self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif power == 3: # InverseGaussianDistribution self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif power > 3: # Positive Stable self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False @property def power(self): @@ -454,8 +461,13 @@ def lower_bound(self): def upper_bound(self): return self._upper_bound - def in_y_range(self, x): - return np.logical_and(self._lower_compare(x), self._upper_compare(x)) + @property + def include_lower_bound(self): + return self._include_lower_bound + + @property + def include_upper_bound(self): + return self._include_upper_bound def unit_variance(self, mu): """The unit variance of a Tweedie distribution is v(mu)=mu**power. 
@@ -485,7 +497,7 @@ def unit_deviance(self, y, mu): return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) - def likelihood(self, y, X, beta, phi, weight=1): + def likelihood(self, y, X, beta, phi, weights=1): raise NotImplementedError('This function is not (yet) implemented.') @@ -521,6 +533,8 @@ class GeneralizedHyperbolicSecand(ExponentialDispersionModel): def __init__(self): self._lower_bound = -np.Inf self._upper_bound = np.Inf + self._include_lower_bound = False + self._include_upper_bound = False @property def lower_bound(self): @@ -530,11 +544,13 @@ def lower_bound(self): def upper_bound(self): return self._upper_bound - def in_y_range(self, x): - np.logical_and( - np.greater(x, self.lower_bound), - np.less(x, self.lower_bound) - ) + @property + def include_lower_bound(self): + return self._include_lower_bound + + @property + def include_upper_bound(self): + return self._include_upper_bound def unit_variance(self, mu): return 1 + mu**2 @@ -548,7 +564,7 @@ def unit_deviance(self, y, mu): class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - """ + r""" Class to fit a Generalized Linear Model (GLM) based on reproductive Exponential Dispersion Models (EDM). @@ -665,7 +681,7 @@ def __init__(self, fit_intercept=True, family='normal', self.start_params = start_params self.verbose = verbose - def fit(self, X, y, weight=None): + def fit(self, X, y, sample_weight=None): """Fit a generalized linear model. Parameters @@ -676,11 +692,12 @@ def fit(self, X, y, weight=None): y : numpy array of shape [n_samples] Target values - weight : numpy array of shape [n_samples] + sample_weight : numpy array of shape [n_samples] Individual weights for each sample. Var[Y_i]=phi/weight_i * v(mu) If Y_i ~ EDM(mu, phi/w_i) then - sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)) + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a + weighted average with weights=sample_weight. 
Returns ------- @@ -751,16 +768,17 @@ def fit(self, X, y, weight=None): "range for family {0}" .format(family.__class__.__name__)) - if weight is None: - weight = np.ones_like(y) - elif np.isscalar(weight): - weight = weight*np.ones_like(y) + if sample_weight is None: + weights = np.ones_like(y) + elif np.isscalar(sample_weight): + weights = sample_weight*np.ones_like(y) else: - weight = np.atleast_1d(weight) - if weight.ndim > 1: - raise ValueError("Weights must be 1D array or scalar") - elif weight.shape[0] != y.shape[0]: - raise ValueError("Weights must have the same length as y") + weights = np.atleast_1d(sample_weight) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != y.shape[0]: + raise ValueError("Sample weights must have the same length as" + " y") if self.fit_intercept: # intercept is first column <=> coef[0] is for intecept @@ -780,14 +798,14 @@ def fit(self, X, y, weight=None): coef = None if start_params is None: # Use mu_start and apply one irls step to calculate coef - mu = family.starting_mu(y, weight) + mu = family.starting_mu(y, weights) # linear predictor eta = link.link(mu) # h'(eta) hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weights=weights)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -819,7 +837,7 @@ def fit(self, X, y, weight=None): hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weights=weights)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -840,7 +858,7 @@ def fit(self, X, y, weight=None): # use gradient for compliance with newton-cg and lbfgs # TODO: faster computation of gradient, use mu and eta directly gradient = family._deviance_derivative( - coef=coef, X=Xnew, y=y, weight=weight, link=link) + coef=coef, X=Xnew, y=y, weights=weights, link=link) if (np.max(np.abs(gradient)) <= self.tol): converged = True break @@ -855,7 +873,7 @@ def fit(self, X, y, weight=None): elif self.solver == 'lbfgs': func = family._deviance fprime = family._deviance_derivative - args = (Xnew, y, weight, link) + args = (Xnew, y, weights, link) coef, loss, info = optimize.fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, @@ -874,47 +892,72 @@ def fit(self, X, y, weight=None): func = family._deviance grad = family._deviance_derivative - def grad_hess(coef, X, y, weight, link): + def grad_hess(coef, X, y, weights, link): grad = (family._deviance_derivative( - coef, X, y, weight, link)) + coef, X, y, weights, link)) hessian = (family._deviance_hessian( - coef, X, y, weight, link)) + coef, X, y, weights, link)) def Hs(s): ret = np.dot(hessian, s) return ret return grad, Hs hess = grad_hess - args = (Xnew, y, weight, link) + args = (Xnew, y, weights, link) coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, maxiter=self.max_iter, tol=self.tol) self.coef_ = coef - if self.fit_intercept is True: + if self.fit_intercept: self.intercept_ = coef[0] self.coef_ = coef[1:] else: + self.intercept_ = 0. 
self.coef_ = coef if self.fit_dispersion in ['chisqr', 'deviance']: - self.dispersion_ = self.estimate_phi(y, X, weight) + self.dispersion_ = self.estimate_phi(y, X, weights) return self - def predict(self, X, weight=1): - """Prediction with features X. - If weights are given, returns prediction*weights. + def linear_predictor(self, X): + """The linear_predictor X*coef_ + intercept_. + + Parameters + ---------- + X : numpy array or sparse matrix of shape [n_samples,n_features] + Samples. + + Returns + ------- + C : array, shape = (n_samples) + Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) - # TODO: validation of weight - eta = safe_sparse_dot(X, self.coef_, dense_output=True) - if self.fit_intercept is True: - eta += self.intercept_ + return safe_sparse_dot(X, self.coef_, + dense_output=True) + self.intercept_ + + def predict(self, X, sample_weight=1): + """Predict uing GLM with feature matrix X. + If sample_weight is given, returns prediction*sample_weight. + + Parameters + ---------- + X : numpy array or sparse matrix of shape [n_samples,n_features] + Samples. + + Returns + ------- + C : array, shape = (n_samples) + Returns predicted values times sample_weight. + """ + # TODO: validation of sample_weight + eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) - return mu*weight + return mu*sample_weight - def estimate_phi(self, y, X, weight): + def estimate_phi(self, y, X, sample_weight): """Estimation of the dispersion parameter. Returns the estimate. """ @@ -925,11 +968,11 @@ def estimate_phi(self, y, X, weight): eta += self.intercept_ mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': - chisq = np.sum(weight*(y-mu)**2 / + chisq = np.sum(sample_weight*(y-mu)**2 / self._family_instance.unit_variance(mu)) return chisq/(n_samples - n_features) elif self.fit_dispersion == 'deviance': - dev = self._family_instance.deviance(y, mu, weight) + dev = self._family_instance.deviance(y, mu, sample_weight) return dev/(n_samples - n_features) # TODO: Fix "AssertionError: -0.28014056555724598 not greater than 0.5" @@ -937,19 +980,44 @@ def estimate_phi(self, y, X, weight): # from sklearn.utils.estimator_checks import check_estimator # from sklearn.linear_model import GeneralizedLinearRegressor # check_estimator(GeneralizedLinearRegressor) - def score(self, X, y, weight=1): - """The natural score for a GLM is -deviance. - Returns the weight averaged negative deviance (the better the score, - the better the fit). Maximum score is therefore 0. + def score(self, X, y, sample_weight=None): + r"""Returns D^2, a generalization of the coefficient of determination + R^2, which uses deviance instead of squared error. + + D^2 is defined as + :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` + is the null deviance, i.e. the deviance of a model with intercept + alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean + :math:`\bar{y}` is average by sample_weight. In the case of a Normal + distribution, this D^2 equals R^2. + Best possible score is 1.0 and it can be negative (because the + model can be arbitrarily worse). + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + Test samples + + y : array-like of shape = (n_samples) + True valeus for X. + + sample_weight : array-like, shape = (n_samples), optional + Sample weights. + + Returns + ------- + score : float + D^2 of self.predict(X) wrt. y. """ - # RegressorMixin has R^2 score. 
- # TODO: Make it more compatible with the score function in - # sklearn.metrics.regression.py - check_is_fitted(self, "coef_") - eta = safe_sparse_dot(X, self.coef_, dense_output=True) - if self.fit_intercept is True: - eta += self.intercept_ - mu = self._link_instance.inverse(eta) - output_errors = self._family_instance.unit_deviance(y, mu) - weight = weight * np.ones_like(y) - return -np.average(output_errors, weights=weight) + # Note, default score defined in RegressorMixin is R^2 score. + # TODO: make D^2 a score function in module metrics (and thereby get + # input validation and so on) + if sample_weight is None: + weights = np.ones_like(y) + else: + weights = np.atleast_1d(sample_weight) + mu = self.predict(X) + dev = self._family_instance.deviance(y, mu, weights=weights) + y_mean = np.average(y, weights=weights) + dev_null = self._family_instance.deviance(y, y_mean, weights=weights) + return 1. - dev / dev_null From 0f4bdb3a8c5c45e80786b8156398da93bfc597e8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 18 Sep 2017 23:41:19 +0200 Subject: [PATCH 06/69] [WIP] Add Generalized Linear Models (#9405) * added L2 penalty * api change: alpha, l1_ratio, P1, P2, warm_start, check_input, copy_X * added entry in user guide * improved docstrings * helper function _irls_step --- doc/modules/linear_model.rst | 113 +++- sklearn/linear_model/glm.py | 702 ++++++++++++++++++------- sklearn/linear_model/tests/test_glm.py | 115 +++- 3 files changed, 721 insertions(+), 209 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 75b95f6c7a44f..51b3821fa6207 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -786,7 +786,7 @@ non-smooth `penalty="l1"`. This is therefore the solver of choice for sparse multinomial logistic regression. It is also the only solver that supports `penalty="elasticnet"`. -The "lbfgs" is an optimization algorithm that approximates the +The "lbfgs" is an optimization algorithm that approximates the Broyden–Fletcher–Goldfarb–Shanno algorithm [8]_, which belongs to quasi-Newton methods. The "lbfgs" solver is recommended for use for small data-sets but for larger datasets its performance suffers. [9]_ @@ -874,6 +874,117 @@ to warm-starting (see :term:`Glossary `). .. [9] `"Performance Evaluation of Lbfgs vs other solvers" `_ +.. _Generalized_linear_regression: + +Generalized linear regression +============================= + +:class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two +ways [1]_. First, the predicted values :math:`\hat{y}` are linked to a linear +combination of the input variables :math:`X` via an inverse link function +:math:`h` as + +.. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). + +Secondly, the squared loss function is replaced by the deviance :math:`D` of an +exponential dispersion model (EDM) [2]_. The objective function beeing minimized +becomes + +.. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 + +\frac{\alpha(1-\rho)}{2} w^T P_2 w + +with sample weights :math:`s`. +:math:`P_1` can be used to exclude some of the coefficients in the L1 +penalty, :math:`P_2` (must be positive semi-definite) allows for a more +versatile L2 penalty. + +Use cases, where a loss different from the squared loss might be appropriate, +are the following: + + * If the target values :math:`y` are counts (integer valued) or frequencies, you might try a Poisson deviance. + + * If the target values are positive valued and skewed, you might try a Gamma deviance. 
+ + * If the target values seem to be heavy tailed, you might try an Inverse Gaussian deviance (or even higher variance power of the Tweedie family). + +Since the linear predictor :math:`Xw` can be negative and +Poisson, Gamma and Inverse Gaussian distributions don't have negative values, +it is convenient to apply a link function different from the identity link +:math:`h(x)=x` that guarantees the non-negativeness, e.g. the log-link with +:math:`h(Xw)=\exp(Xw)`. + +Note that the feature matrix `X` should be standardized before fitting. This +ensures that the penalty treats features equally. + + >>> from sklearn import linear_model + >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, l1_ratio=0) + >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) + >>> reg.coef_ + array([ 0.24630255, 0.43373521]) + >>> reg.intercept_ + -0.76383575123143277 + +Mathematical formulation +------------------------ + +In the unpenalized case, the assumptions are the folowing: + + * The target values :math:`y_i` are realizations of random variables + :math:`Y_i \overset{i.i.d}{\sim} \mathrm{EDM}(\mu_i, \frac{\phi}{s_i})` + with expectation :math:`\mu_i=\mathrm{E}[Y]`, dispersion parameter + :math:`\phi` and sample weights :math:`s_i`. + * The aim is to predict the expectation :math:`\mu_i` with + :math:`\hat{y_i} = h(\eta_i)`, linear predictor + :math:`\eta_i=(Xw)_i` and inverse link function :math:`h(\eta)`. + +Note that the first assumption implies +:math:`\mathrm{Var}[Y_i]=\frac{\phi}{s_i} v(\mu_i)` with unit variance +function :math:`v(\mu)`. Specifying a particular distribution of an EDM is the +same as specifying a unit variance function (they are one-to-one). + +Including penalties helps to avoid overfitting or, in case of L1 penalty, to +obtain sparse solutions. But there are also other motivations to include them, +e.g. accounting fo dependence structure of :math:`y`. + +The objective function, which is independent of :math:`\phi`, is minimized with +respect to the coefficients :math:`w`. + +The deviance is defined by + +.. math:: D(y, \mu) = -2\phi\cdot + \left(loglike(y,\mu,\frac{\phi}{s}) + - loglike(y,y,\frac{\phi}{s})\right) + +===================================== ================================= +Distribution Variance Function :math:`v(\mu)` +===================================== ================================= +Normal ("normal") :math:`1` +Poisson ("poisson") :math:`\mu` +Gamma ("gamma") :math:`\mu^2` +Inverse Gaussian ("inverse.gaussian") :math:`\mu^3` +===================================== ================================= + +Two remarks: + +* The deviances for at least Normal, Poisson and Gamma distributions are + strictly consistent scoring functions for the mean :math:`\mu`, see Eq. + (19)-(20) in [3]_. + +* If you want to model a frequency, i.e. counts per exposure (time, volume, ...) + you can do so by a Poisson distribution and passing + :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together + with :math:`s=\mathrm{exposure}` as sample weights. + + +.. topic:: References: + + .. [1] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. [2] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. + See also `Exponential dispersion model. `_ + + .. [3] Gneiting, T. (2010). 
`Making and Evaluating Point Forecasts. `_ Stochastic Gradient Descent - SGD ================================= diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index b80842f817f4d..2db3c56d5e1c1 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -6,15 +6,15 @@ # License: BSD 3 clause # TODO: Write more tests -# TODO: Which name/symbol for coefficients and weights in docu? -# sklearn.linear_models uses w for coefficients. -# So far, coefficients=beta and weights=w (as standard literature) -# TODO: Add l2-penalty (maybe more general w.P.w with P penalty matrix) # TODO: Add l1-penalty (elastic net) +# TODO: deal with option self.copy_X +# TODO: Should the option `normalize` be included (like other linear models)? +# So far, it is not included. User must pass a normalized X. # TODO: Add cross validation -# TODO: Write docu and examples +# TODO: Write examples and more docu # TODO: Make it as much consistent to other estimators in linear_model as # possible +# TODO: options P1 and P2 in fit() or in __init__()??? # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -22,10 +22,21 @@ # Linear Model does both depending on the chosen distribution, e.g. Normal => # regressor, Bernoulli/Binomial => classifier. # Solution: GeneralizedLinearRegressor since this is the focus. +# - Allow for finer control of penalty terms: +# L1: ||P1*w||_1 with P1*w a componentwise product, this allows to exclude +# factors from the L1 penalty. +# L2: w*P2*w with P2 a (demi-) positive definite matrix, e.g. P2 could be +# a 1st or 2nd order difference matrix (compare B-spline penalties and +# Tikhonov regularization). # - The link funtion (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. # Solution: link as argument in those functions +# - Which name/symbol for sample_weight in docu? +# sklearn.linear_models uses w for coefficients, standard literature on +# GLMs use beta for coefficients and w for (sample) weights. +# So far, coefficients=w and sample weights=s. + from __future__ import division from abc import ABCMeta, abstractmethod, abstractproperty @@ -34,6 +45,8 @@ from scipy import linalg, optimize, sparse import warnings from .base import LinearRegression +from .coordinate_descent import ElasticNet +from .ridge import Ridge from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six @@ -164,7 +177,8 @@ class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): References ---------- - See https://en.wikipedia.org/wiki/Exponential_dispersion_model. + + https://en.wikipedia.org/wiki/Exponential_dispersion_model. """ @abstractproperty @@ -192,7 +206,7 @@ def include_upper_bound(self): raise NotImplementedError() def in_y_range(self, x): - """Returns true if x is in the valid range of Y~EDM. + """Returns true if `x` is in the valid range of Y~EDM. """ if self.include_lower_bound: if self.include_upper_bound: @@ -211,33 +225,36 @@ def in_y_range(self, x): @abstractmethod def unit_variance(self, mu): - r"""The unit variance :math:`v(mu)` determines the variance as - a function of the mean mu by - :math:`\mathrm{Var}[Y_i] = \phi/w_i*v(\mu_i)`. + r"""The unit variance :math:`v(\mu)` determines the variance as + a function of the mean :math:`\mu` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. 
It can also be derived from the unit deviance :math:`d(y,\mu)` as .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ \partial\mu^2}}\big|_{y=\mu} + + See also :func:`variance`. """ raise NotImplementedError() @abstractmethod def unit_variance_derivative(self, mu): - r"""The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. + r"""The derivative of the unit variance w.r.t. `mu`, :math:`v'(\mu)`. """ raise NotImplementedError() def variance(self, mu, phi=1, weights=1): - r"""The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is - :math:`\mathrm{Var}[Y_i]=\phi/w_i*v(\mu_i)`, - with unit variance v(mu). + r"""The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, + with unit variance :math:`v(\mu)` and weights :math:`s_i`. """ return phi/weights * self.unit_variance(mu) def variance_derivative(self, mu, phi=1, weights=1): - r"""The derivative of the variance w.r.t. mu, + r"""The derivative of the variance w.r.t. `mu`, :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] - =phi/w_i*v'(\mu_i)`, with unit variance v(mu). + =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` + and weights :math:`s_i`. """ return phi/weights * self.unit_variance_derivative(mu) @@ -251,8 +268,8 @@ def unit_deviance(self, y, mu): raise NotImplementedError() def unit_deviance_derivative(self, y, mu): - r"""The derivative w.r.t. mu of the unit_deviance - :math:`\frac{d}{d\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + r"""The derivative w.r.t. `mu` of the unit deviance + :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. Returns @@ -262,39 +279,39 @@ def unit_deviance_derivative(self, y, mu): return -2*(y-mu)/self.unit_variance(mu) def deviance(self, y, mu, weights=1): - r"""The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) - with weights :math:`w_i` and unit_deviance :math:`d(y,mu)`. + r"""The deviance is given by :math:`D = \sum_i s_i \cdot d(y, \mu) + with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. In terms of the likelihood it is :math:`D = -2\phi\cdot - \left(loglike(y,\mu,\frac{phi}{w}) - - loglike(y,y,\frac{phi}{w})\right).` + \left(loglike(y,\mu,\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. """ return np.sum(weights*self.unit_deviance(y, mu)) def _deviance(self, coef, X, y, weights, link): - """The deviance as a function of the coefficients ``coef`` - (:math:`beta`). + """The deviance as a function of the coefficients `coef` + (:math:`w`). """ lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) return self.deviance(y, mu, weights) def deviance_derivative(self, y, mu, weights=1): - """The derivative w.r.t. mu of the deviance.` + """The derivative w.r.t. `mu` of the deviance. """ return weights*self.unit_deviance_derivative(y, mu) def _score(self, coef, phi, X, y, weights, link): - r"""The score function :math:`s` is the derivative of the - log-likelihood w.r.t. the ``coef`` (:math:`\beta`). + r"""The score function is the derivative of the + log-likelihood w.r.t. `coef` (:math:`w`). It is given by .. math: - \mathbf{s}(\boldsymbol{\beta}) = \mathbf{X}^T \mathbf{D} + \mathbf{score}(\boldsymbol{w}) = \mathbf{X}^T \mathbf{D} \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and - :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}(y_1),\ldots)`. + :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. 
""" n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -303,23 +320,27 @@ def _score(self, coef, phi, X, y, weights, link): d = link.inverse_derivative(lin_pred) d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=False) - score = safe_sparse_dot(X.T, temp, dense_output=False) + temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=True) + score = safe_sparse_dot(X.T, temp, dense_output=True) return score def _fisher_matrix(self, coef, phi, X, y, weights, link): - r"""The Fisher information matrix, also known as expected - information matrix. It is given by + r"""The Fisher information matrix. + The Fisher information matrix, also known as expected information + matrix is given by .. math: - \mathbf{F}(\boldsymbol{\beta}) = \mathrm{E}\left[ - -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} - \partial\boldsymbol{\beta}^T}\right] + \mathbf{F}(\boldsymbol{w}) = + \mathrm{E}\left[-\frac{\partial\mathbf{score}}{\partial + \boldsymbol{w}} \right] + = \mathrm{E}\left[ + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T}\right] = \mathbf{X}^T W \mathbf{X} \,, with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, - see score function. + see func:`score_function`. """ n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -333,14 +354,15 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): - r"""The observed information matrix, also known as the negative of + r"""The observed information matrix. + The observed information matrix, also known as the negative of the Hessian matrix of the log-likelihood. It is given by .. math: - \mathbf{H}(\boldsymbol{\beta}) = - -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} - \partial\boldsymbol{\beta}^T} + \mathbf{H}(\boldsymbol{w}) = + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T} = \mathbf{X}^T \legt[ - \mathbf{D}' \mathbf{R} + \mathbf{D}^2 \mathbf{V} \mathbf{R} @@ -351,7 +373,7 @@ def _observed_information(self, coef, phi, X, y, weights, link): :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{ v(\mu_i)} \right)`, - see score function and Fisher matrix. + see :func:`score_` function and :func:`_fisher_matrix`. """ n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -368,18 +390,18 @@ def _observed_information(self, coef, phi, X, y, weights, link): return observed_information def _deviance_derivative(self, coef, X, y, weights, link): - r"""The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a - function of the coefficients ``coef``. + r"""The derivative w.r.t. `coef` (:math:`w`) of the deviance as a + function of the coefficients `coef`. This is equivalent to :math:`-2\phi` times the score function - :math:`s` (derivative of the log-likelihood). + :func:`score_function` (derivative of the log-likelihood). """ score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, link=link) return -2*score def _deviance_hessian(self, coef, X, y, weights, link): - r"""The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance - as a function of the coefficients ``coef``. + r"""The hessian matrix w.r.t. `coef` (:math:`w`) of the deviance + as a function of the coefficients `coef`. This is equivalent to :math:`+2\phi` times the observed information matrix. 
""" @@ -388,20 +410,21 @@ def _deviance_hessian(self, coef, X, y, weights, link): return 2*info_matrix def starting_mu(self, y, weights=1): - """Starting values for the mean mu_i in IRLS.""" - return ((weights*y+np.mean(weights*y)) - / (2.*np.sum(np.ones_like(y)*weights))) + """Starting values for the mean mu_i in (unpenalized) IRLS.""" + return ((weights*y+np.mean(weights*y)) / + (2.*np.sum(np.ones_like(y)*weights))) class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. - They have mu=E[X] and Var[X] \propto mu**power. + They have :math:`\mu=\mathrm{E}[Y]` and + :math:`\mathrm{Var}[Y] \propto \mu^power. Attributes ---------- power : float The variance power of the unit_variance - :math:`v(mu) = mu^{power}`. + :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): self.power = power @@ -497,7 +520,7 @@ def unit_deviance(self, y, mu): return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) - def likelihood(self, y, X, beta, phi, weights=1): + def likelihood(self, y, X, w, phi, weights=1): raise NotImplementedError('This function is not (yet) implemented.') @@ -563,51 +586,135 @@ def unit_deviance(self, y, mu): np.log((1+mu**2)/(1+y**2))) +def _irls_step(X, W, P2, z): + """One step in iteratively reweighted least squares + + Solve A w = b for w with + A = (X' W X + P2) + b = X' W z + z = eta + D^-1 (y-mu) + + See also fit method of :class:`GeneralizedLinearRegressor`. + + Parameters + ---------- + X : numpy array or sparse matrix of shape (n_samples, n_features) + Training data (with intercept included if present) + + W : numpy array of shape (n_samples, ) + + P2 : numpy array or sparse matrix of shape (n_features, n_features) + The l2-penalty matrix or vector (=diagonal matrix) + + z : numpy array of shape (n_samples, ) + Working observations + + Returns + ------- + coef: array, shape = (X.shape[1]) + """ + # TODO: scipy.linalg.solve if faster, but ordinary least squares uses + # scipy.linalg.lstsq. What is more appropriate? + n_samples, n_features = X.shape + if sparse.issparse(X): + W = sparse.dia_matrix((W, 0), shape=(n_samples, n_samples)).tocsr() + if P2.ndim == 1: + L2 = (sparse.dia_matrix((P2, 0), shape=(n_features, n_features)) + ).tocsr() + else: + L2 = sparse.csr_matrix(P2) + XtW = X.transpose() * W + A = XtW * X + L2 + b = XtW * z + coef = sparse.linalg.spsolve(A, b) + else: + XtW = (X.T * W) + A = XtW.dot(X) + if P2.ndim == 1: + A[np.diag_indices_from(A)] += P2 + else: + A += P2 + b = XtW.dot(z) + coef = linalg.solve(A, b) + return coef + + class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - r""" - Class to fit a Generalized Linear Model (GLM) based on reproductive - Exponential Dispersion Models (EDM). + """Regression via a Generalized Linear Model (GLM) based on reproductive + Exponential Dispersion Models (EDM) with combined L1 and L2 priors as + regularizer. + + Minimizes the objective function:: + + 1/(2s) * deviance(y, h(X*w)) + + alpha * l1_ratio * ||P1*w||_1 + + 1/2 * alpha * (1 - l1_ratio) * w*P2*w + + with inverse link function `h` and s=sum of `sample_weight` (which equals + n_samples for `sample_weight=None`). 
+ For `P1`=`P2`=identity, the penalty is the elastic net:: - #TODO: This belongs to User Guide - Assumptions: + alpha * l1_ratio * ||w||_1 + + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 - - The target values y_i are realizations of random variables - :math:`Y_i \sim \mathrm{EDM}(\mu_i, \frac{\phi}{w_i})` with dispersion - parameter :math:`\phi` and weights :math:`w_i`. - - The expectation of :math:`Y_i` is :math:`\mu_i=\mathrm{E}[Y]=h(\eta_i)` - whith the linear predictor :math:`\eta=X*\beta`, inverse link function - :math:`h(\eta)`, design matrix :math:`X` and parameters :math:`\beta` - to be estimated. + If you are interested in controlling the L1 and L2 penalty + separately, keep in mind that this is equivalent to:: - Note that the first assumption implies - :math:`\mathrm{Var}[Y_i]=\frac{\phi}{w_i} v(\mu_i)` with uni variance - function :math:`v(\mu)`. + a * L1 + b * L2 + + where:: + + alpha = a + b and l1_ratio = a / (a + b) + + The parameter `l1_ratio` corresponds to alpha in the glmnet R package while + alpha corresponds to the lambda parameter in glmnet. Specifically, l1_ratio + = 1 is the lasso penalty. + + Read more in the :ref:`User Guide `. The fit itself does not need Y to be from an EDM, but only assumes - the first two moments :math:`E[Y_i]=\mu_i=h(\eta_i)` and - :math:`Var[Y_i]=\frac{\phi}{w_i} v(\mu_i)` + the first two moments :math:`E[Y_i]=\\mu_i=h(\\eta_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{w_i} v(\\mu_i)`. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + (penalized) maximum likelihood which is equivalent to minimizing the + deviance. - The parameters :math:`\beta` are estimated by maximum likelihood which is - equivalent to minimizing the deviance. + TODO: For `alpha` > 0, the feature matrix `X` is assumed to be + standardized. Call + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. TODO: Estimation of the dispersion parameter phi. - TODO: Notes on weights and 'scaled' Poisson, e.g. fit y = x/w with - with x=counts and w=exposure (time, money, persons, ...) => y is a - ratio with weights w. + TODO: Notes on weights and 'scaled' distributions. For Poisson, this means + to fit y = z/w with z=counts and w=exposure (time, money, persons, ...) + => y is a ratio with weights w. Same for other distributions. Parameters ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms und thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + l1_ratio : float, optional (defaul=0) + The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For + ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it + is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a + combination of L1 and L2. + fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance - of a subclass of ExponentialDispersionModel, optional - (default='normal') + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ + of class ExponentialDispersionModel, optional(default='normal') the distributional assumption of the GLM. 
- link : {'identity', 'log'} or an instance of a subclass of Link, + link : {'identity', 'log'} or an instance of class Link, optional (default='identity') the link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). @@ -634,28 +741,41 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): where ``g_i`` is the i-th component of the gradient (derivative of the deviance). - start_params : {array shape (n_features, ), 'ols'}, optional (default=None) - sets the start values for coef_ in the fit. - If None, default values are taken. - If 'ols' the result of an ordinary least squares in the link space - (linear predictor) is taken. - If an array is given, these values are taken as coef_ to start with. - If fit_intercept is true, the first value is assumed to be the start - value for the intercept_. + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to fit as + initialization for ``coef_`` and ``intercept_`` (supersedes option + ``start_params``). If set to ``True`` or if the attribute ``coef_`` + does not exit (first call to fit), option ``start_params`` sets the + starting values for ``coef_`` and ``intercept_``. + + start_params : None or array of shape (n_features, ) or 'least_squares'}, \ + optional (default=None) + If an array of size n_features is supplied, use these as start values + for ``coef_`` in the fit. If ``fit_intercept=True``, the first element + is assumed to be the start value for the ``intercept_``. + If 'least_squares' is set, the result of a least squares fit in the + link space (linear predictor) is taken. If ``None``, the start values + are calculated by setting mu to family.starting_mu(..) and one step of + irls. + This option only applies if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not exist). + + copy_X : boolean, optional, default True + If ``True``, X will be copied; else, it may be overwritten. verbose : int, optional (default=0) For the lbfgs solver set verbose to any positive number for verbosity. Attributes ---------- - coef_ : array, shape (1, n_features) + coef_ : array, shape (n_features, ) Estimated coefficients for the linear predictor (X*coef_) in the GLM. intercept_ : float Intercept (a.k.a. bias) added to linear predictor. dispersion_ : float - The dispersion parameter :math:`\phi` if fit_dispersion is set. + The dispersion parameter :math:`\\phi` if fit_dispersion is set. n_iter_ : int Actual number of iterations of the solver. 
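For orientation, a minimal usage sketch consistent with the parameters and attributes documented above (toy data only; the exact fitted numbers are not claimed)::

    import numpy as np
    from sklearn.linear_model.glm import GeneralizedLinearRegressor

    X = np.array([[1.0, 0.0], [1.0, 1.0], [2.0, 2.0], [3.0, 1.0]])
    y = np.array([1.0, 2.0, 6.0, 4.0])   # strictly positive, valid for family='poisson'

    glm = GeneralizedLinearRegressor(alpha=0, family='poisson', link='log',
                                     fit_intercept=True)
    glm.fit(X, y)
    print(glm.coef_)        # array of shape (n_features, )
    print(glm.intercept_)   # scalar
    print(glm.n_iter_)      # iterations used by the solver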
@@ -667,10 +787,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): ---------- TODO """ - - def __init__(self, fit_intercept=True, family='normal', - link='identity', fit_dispersion='chisqr', solver='irls', - max_iter=100, tol=1e-4, start_params=None, verbose=0): + def __init__(self, alpha=1.0, l1_ratio=0, + fit_intercept=True, family='normal', link='identity', + fit_dispersion='chisqr', solver='irls', max_iter=100, + tol=1e-4, warm_start=False, start_params=None, copy_X=True, + verbose=0): + self.alpha = alpha + self.l1_ratio = l1_ratio self.fit_intercept = fit_intercept self.family = family self.link = link @@ -678,31 +801,86 @@ def __init__(self, fit_intercept=True, family='normal', self.solver = solver self.max_iter = max_iter self.tol = tol + self.warm_start = warm_start self.start_params = start_params + self.copy_X = copy_X self.verbose = verbose - def fit(self, X, y, sample_weight=None): + def fit(self, X, y, sample_weight=None, P1=None, P2=None, + check_input=True): """Fit a generalized linear model. Parameters ---------- - X : numpy array or sparse matrix of shape [n_samples,n_features] + X : numpy array or sparse matrix of shape (n_samples, n_features) Training data - y : numpy array of shape [n_samples] + y : numpy array of shape (n_samples, ) Target values - sample_weight : numpy array of shape [n_samples] + sample_weight : array of shape (n_samples, ) or None,\ + optinal (default=None) Individual weights for each sample. Var[Y_i]=phi/weight_i * v(mu) If Y_i ~ EDM(mu, phi/w_i) then sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. + P1 : None or array of shape (n_features*, ), optional\ + (default=None) + With this array, you can exclude coefficients from ths L1 penalty. + Set the corresponding value to 1 (include) or 0 (exclude). The + default value ``None`` is the same as an array of ones. + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). + + P2 : None or array of shape (n_features*, n_features*) + With this square matrix the L2 penalty is calculated as `w P2 w`. + This gives a fine control over this penalty (Tikhonov + regularization). + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). + + check_input : boolean, optional (default=True) + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. + Returns ------- self : returns an instance of self. """ + ####################################################################### + # 1. 
input validation # + ####################################################################### + # 1.1 validate arguments of fit ####################################### + _dtype = [np.float64, np.float32] + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=_dtype, y_numeric=True, multi_output=False) + y = y.astype(np.float64) + + if sample_weight is None: + weights = np.ones_like(y) + elif np.isscalar(sample_weight): + weights = sample_weight*np.ones_like(y) + else: + weights = np.atleast_1d(sample_weight) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != y.shape[0]: + raise ValueError("Sample weights must have the same length as" + " y") + # IMPORTANT NOTE: Since we want to minimize + # 1/(2*sum(sample_weight)) * deviance + L1 + L2, + # deviance = sum(sample_weight * unit_deviance), + # we rescale weights such that sum(weights) = 1 and this becomes + # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights = weights/np.sum(weights) + + if not isinstance(check_input, bool): + raise ValueError("The argument check_input must be bool; got " + "(check_input={0})".format(check_input)) + + # 1.2 validate arguments of __init__ ################################## # Garantee that self._family_instance is an instance of class # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): @@ -720,7 +898,8 @@ def fit(self, X, y, sample_weight=None): raise ValueError( "The family must be an instance of class" " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian'].") + " ['normal', 'poisson', 'gamma', 'inverse.gaussian'];" + " got (family={0})".format(self.family)) # Garantee that self._link_instance is set to an instance of class Link if isinstance(self.link, Link): @@ -733,132 +912,232 @@ def fit(self, X, y, sample_weight=None): else: raise ValueError( "The link must be an instance of class Link or" - " an element of ['identity', 'log'].") - + " an element of ['identity', 'log']; got (link={0})" + .format(self.link)) + + if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: + raise ValueError("Penalty term must be non-negative;" + " got (alpha={0})".format(self.alpha)) + if (not isinstance(self.l1_ratio, numbers.Number) or + self.l1_ratio < 0 or self.l1_ratio > 1): + raise ValueError("l1_ratio must be in interval [0, 1]; got" + " (l1_ratio={0]})".format(self.l1_ratio)) if not isinstance(self.fit_intercept, bool): - raise ValueError("The argument fit_intercept must be bool," + raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) if self.solver not in ['irls', 'lbfgs', 'newton-cg']: raise ValueError("GLM Regression supports only irls, lbfgs and" "newton-cg solvers, got {0}".format(self.solver)) + if self.alpha > 0: + if (self.l1_ratio > 0 and + self.solver not in []): + # TODO: Add solver for L1 + # raise ValueError("The solver option (solver={0}) is not " + # "appropriate for the chosen penalty which" + # " includes L1 (alpha={1})." 
+ # .format(self.solver, self.alpha)) + raise NotImplementedError("Currently, no solver is implemented" + " that can deal with L1 penalties.") if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: raise ValueError("Maximum number of iteration must be positive;" " got (max_iter={0!r})".format(self.max_iter)) if not isinstance(self.tol, numbers.Number) or self.tol < 0: raise ValueError("Tolerance for stopping criteria must be " "positive; got (tol={0!r})".format(self.tol)) + if not isinstance(self.warm_start, bool): + raise ValueError("The argument warm_start must be bool;" + " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is not None and start_params is not 'ols': + if start_params is not None and start_params is not 'least_squares': start_params = np.atleast_1d(start_params) - if start_params.shape[0] != X.shape[1] + self.fit_intercept: + if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or + (start_params.ndim != 1)): raise ValueError("Start values for parameters must have the" - "right length; required length {0}, got {1}" + "right length and dimension; required (length" + "={0}, ndim=1), got (length={1}, ndim={2})." .format(X.shape[1] + self.fit_intercept, - start_params.shape[0])) - - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - y_numeric=True, multi_output=False) - y = y.astype(np.float64) + start_params.shape[0], + start_params.ndim)) + if not isinstance(self.copy_X, bool): + raise ValueError("The argument copy_X must be bool;" + " got {0}".format(self.copy_X)) + + if P1 is None: + P1 = np.ones(X.shape[1]) + else: + P1 = np.atleast_1d(P1) + if (P1.shape[0] != X.shape[1]) or (P1.ndim != 1): + raise ValueError("P1 must be either None or an 1D array with " + "the length of X.shape[1]; " + "got (P1.shape[0]={0}), " + "needed (X.shape[1]={1})." 
+ .format(P1.shape[0], X.shape[1])) + if P2 is None: + P2 = np.ones(X.shape[1]) + if sparse.issparse(X): + P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), + shape=(X.shape[1], X.shape[1]))).tocsr() + else: + P2 = check_array(P2, accept_sparse=['csr', 'csc', 'coo'], + dtype="numeric", ensure_2d=True) + if ((P2.shape[0] != P2.shape[1]) or + (P2.shape[0] != X.shape[1]) or + (P2.ndim != 2)): + raise ValueError("P2 must be either None or an array of shape " + "(n_features, n_features) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0},{1})), needed ({3},{3})" + .format(P2.shape[0], P2.shape[1], X.shape[1])) family = self._family_instance link = self._link_instance - if not np.all(family.in_y_range(y)): - raise ValueError("Some value(s) of y are out of the valid " - "range for family {0}" - .format(family.__class__.__name__)) - - if sample_weight is None: - weights = np.ones_like(y) - elif np.isscalar(sample_weight): - weights = sample_weight*np.ones_like(y) - else: - weights = np.atleast_1d(sample_weight) - if weights.ndim > 1: - raise ValueError("Sample weight must be 1D array or scalar") - elif weights.shape[0] != y.shape[0]: - raise ValueError("Sample weights must have the same length as" - " y") - if self.fit_intercept: # intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) else: Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) + P1 = np.concatenate((np.array([0]), P1)) + if P2.ndim == 1: + P2 = np.concatenate((np.array([0]), P2)) + elif sparse.issparse(P2): + P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), + dtype=P2.dtype).tocsr() + else: + P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], + [np.zeros((X.shape[1], 1)), P2]]) else: Xnew = X n_samples, n_features = Xnew.shape - - # Note: Since dispersion_ alias phi does not enter the estimation - # of mu_i=E[y_i] set it to 1 where convenient. + l1 = self.alpha * self.l1_ratio + l2 = self.alpha * (1-self.l1_ratio) + P1 *= l1 + P2 *= l2 + + # 1.3 additional validations ########################################## + if check_input: + if not np.all(family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}" + .format(family.__class__.__name__)) + # TODO: if alpha=0 check that Xnew is not rank deficient + # TODO: what else to check? + + ####################################################################### + # 2. initialization of coef = (intercept_, coef_) # + ####################################################################### + # Note: Since phi=self.dispersion_ does not enter the estimation + # of mu_i=E[y_i], set it to 1. 
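For illustration, a standalone sketch of the intercept handling above: both penalty arrays receive a leading zero so that the intercept is never penalized (shapes assumed; this mirrors the dense branch of the code rather than adding new behaviour)::

    import numpy as np

    n_features = 3
    P1 = np.ones(n_features)          # per-feature L1 weights
    P2 = np.eye(n_features)           # dense Tikhonov matrix for the L2 term

    # fit_intercept=True: a first column of ones is prepended to X, and both
    # penalties get a leading zero so the intercept stays unpenalized
    P1 = np.concatenate((np.array([0]), P1))
    P2 = np.block([[np.zeros((1, 1)), np.zeros((1, n_features))],
                   [np.zeros((n_features, 1)), P2]])
    # P1 -> [0. 1. 1. 1.];  P2 -> 4x4 with zero first row and first column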
# set start values for coef coef = None - if start_params is None: - # Use mu_start and apply one irls step to calculate coef - mu = family.starting_mu(y, weights) - # linear predictor - eta = link.link(mu) - # h'(eta) - hp = link.inverse_derivative(eta) - # working weights w, in principle a diagonal matrix - # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weights=weights)) - wroot = np.sqrt(w) - # working observations - yw = eta + (y-mu)/hp - # least squares rescaled with wroot - wroot = sparse.dia_matrix((wroot, 0), shape=(n_samples, n_samples)) - X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) - yw_rescale = safe_sparse_dot(wroot, y, dense_output=True) - coef = linalg.lstsq(X_rescale, yw_rescale)[0] - elif start_params is 'ols': - reg = LinearRegression(copy_X=False, fit_intercept=False) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ + if self.warm_start and hasattr(self, "coef_"): + if self.fit_intercept: + coef = np.concatenate((self.intercept_, self.coef_)) + else: + coef = self.coef_ + elif self.start_params is None: + if self.l1_ratio == 0: + # See 3.1 IRLS + # Use mu_start and apply one irls step to calculate coef + mu = family.starting_mu(y, weights) + # linear predictor + eta = link.link(mu) + # h'(eta) + hp = link.inverse_derivative(eta) + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = (hp**2 / family.variance(mu, phi=1, weights=weights)) + # working observations + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(Xnew, W, P2, z) + else: + # with L1 penalty, start with coef = 0 + coef = np.zeros(n_features) + elif self.start_params is 'least_squares': + if self.alpha == 0: + reg = LinearRegression(copy_X=True, fit_intercept=False) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + elif self.l1_ratio <= 0.01: + # ElasticNet says l1_ratio <= 0.01 is not reliable, use Ridge + reg = Ridge(copy_X=True, fit_intercept=False, + alpha=self.alpha) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + else: + # TODO: Does this make sense? + reg = ElasticNet(copy_X=True, fit_intercept=False, + alpha=self.alpha, l1_ratio=self.l1_ratio) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ else: coef = start_params + ####################################################################### + # 3. fit # + ####################################################################### # algorithms for optimiation # TODO: Parallelize it self.n_iter_ = 0 converged = False + # 3.1 IRLS ############################################################ + # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' + # Obj = objective function = 1/2 Dev + l2/2 w P2 w + # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 + # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) + # D2 = link.inverse_derivative(eta)^2 = D^2 + # W = D2/V(mu) + # l2 = alpha * (1 - l1_ratio) + # Obj' = d(Obj)/d(w) = 1/2 Dev' + P2 w + # = -X' D (y-mu)/V(mu) + l2 P2 w + # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 + # Use Fisher matrix instead of full info matrix -X'(...) X, + # i.e. 
E[Dev''] with E[y-mu]=0: + # Obj'' ~ X' W X + l2 P2 + # (1): w = (X' W X + l2 P2)^-1 X' W z, with z = eta + D^-1 (y-mu) + # Note: P2 = l2*P2, see above if self.solver == 'irls': - # linear predictor + # eta = linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = link.inverse(eta) + # D = h'(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) while self.n_iter_ < self.max_iter: self.n_iter_ += 1 # coef_old not used so far. # coef_old = coef - # h'(eta) - hp = link.inverse_derivative(eta) - # working weights w, in principle a diagonal matrix + # working weights W, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weights=weights)) - wroot = np.sqrt(w) + W = (hp**2 / V) # working observations - yw = eta + (y-mu)/hp - # least squares rescaled with wroot - wroot = sparse.dia_matrix((wroot, 0), - shape=(n_samples, n_samples)) - X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) - yw_rescale = safe_sparse_dot(wroot, yw, dense_output=True) - coef, residues, rank, singular_ = ( - linalg.lstsq(X_rescale, yw_rescale)) + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(Xnew, W, P2, z) # updated linear predictor # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = link.inverse(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) # which tolerace? |coef - coef_old| or gradient? # use gradient for compliance with newton-cg and lbfgs - # TODO: faster computation of gradient, use mu and eta directly - gradient = family._deviance_derivative( - coef=coef, X=Xnew, y=y, weights=weights, link=link) + # gradient = family._deviance_derivative( + # coef=coef, X=Xnew, y=y, weights=weights, link=link) + # gradient = -X' D (y-mu)/V(mu) + l2 P2 w + gradient = -safe_sparse_dot(Xnew.T, hp*(y-mu)/V) + if P2.ndim == 1: + gradient += P2*coef + else: + gradient += safe_sparse_dot(P2, coef) if (np.max(np.abs(gradient)) <= self.tol): converged = True break @@ -868,50 +1147,73 @@ def fit(self, X, y, sample_weight=None): "of iterations (currently {0})" .format(self.max_iter), ConvergenceWarning) + # 3.2 L-BFGS and Newton-CG ############################################ # TODO: performance: make one function return both deviance and # gradient of deviance - elif self.solver == 'lbfgs': - func = family._deviance - fprime = family._deviance_derivative - args = (Xnew, y, weights, link) - coef, loss, info = optimize.fmin_l_bfgs_b( - func, coef, fprime=fprime, - args=args, - iprint=(self.verbose > 0) - 1, pgtol=self.tol, - maxiter=self.max_iter) - if self.verbose > 0: - if info["warnflag"] == 1: - warnings.warn("lbfgs failed to converge." 
- " Increase the number of iterations.", - ConvergenceWarning) - elif info["warnflag"] == 2: - warnings.warn("lbfgs failed for the reason: {0}".format( - info["task"])) - self.n_iter_ = info['nit'] - elif self.solver == 'newton-cg': - func = family._deviance - grad = family._deviance_derivative + elif self.solver in ['lbfgs', 'newton-cg']: + def func(coef, *args): + if P2.ndim == 1: + L2 = safe_sparse_dot(coef.T, P2*coef) + else: + L2 = safe_sparse_dot(coef.T, safe_sparse_dot(P2, coef)) + # A[np.diag_indices_from(A)] += P2 + return 0.5*family._deviance(coef, *args) + 0.5*L2 + + def fprime(coef, *args): + if P2.ndim == 1: + L2 = P2*coef + else: + L2 = safe_sparse_dot(P2, coef) + return 0.5*family._deviance_derivative(coef, *args) + L2 def grad_hess(coef, X, y, weights, link): - grad = (family._deviance_derivative( - coef, X, y, weights, link)) - hessian = (family._deviance_hessian( - coef, X, y, weights, link)) + if P2.ndim == 1: + L2 = P2*coef + else: + L2 = safe_sparse_dot(P2, coef) + grad = 0.5*family._deviance_derivative( + coef, X, y, weights, link) + L2 + hessian = 0.5*family._deviance_hessian( + coef, X, y, weights, link) + if P2.ndim == 1: + hessian[np.diag_indices_from(hessian)] += P2 + else: + hessian += P2 def Hs(s): - ret = np.dot(hessian, s) + ret = safe_sparse_dot(hessian, s) return ret return grad, Hs - hess = grad_hess + args = (Xnew, y, weights, link) - coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, - maxiter=self.max_iter, tol=self.tol) - self.coef_ = coef + if self.solver == 'lbfgs': + coef, loss, info = optimize.fmin_l_bfgs_b( + func, coef, fprime=fprime, args=args, + iprint=(self.verbose > 0) - 1, pgtol=self.tol, + maxiter=self.max_iter) + if self.verbose > 0: + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." + " Increase the number of iterations.", + ConvergenceWarning) + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}" + .format(info["task"])) + self.n_iter_ = info['nit'] + elif self.solver == 'newton-cg': + coef, n_iter_i = newton_cg(grad_hess, func, fprime, coef, + args=args, maxiter=self.max_iter, + tol=self.tol) + + ####################################################################### + # 4. postprocessing # + ####################################################################### if self.fit_intercept: self.intercept_ = coef[0] self.coef_ = coef[1:] else: + # set intercept to zero as the other linear models do self.intercept_ = 0. self.coef_ = coef @@ -988,8 +1290,8 @@ def score(self, X, y, sample_weight=None): :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` is the null deviance, i.e. the deviance of a model with intercept alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean - :math:`\bar{y}` is average by sample_weight. In the case of a Normal - distribution, this D^2 equals R^2. + :math:`\bar{y}` is averaged by sample_weight. In the case of a Normal + distribution, D^2 equals R^2. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). 
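For concreteness, a standalone sketch of this D² using the distribution classes of this module (the estimator's own `score` additionally handles sample weights and the intercept-only null fit internally)::

    import numpy as np
    from sklearn.linear_model.glm import PoissonDistribution

    family = PoissonDistribution()
    y_true = np.array([1.0, 2.0, 3.0, 4.0])
    y_pred = np.array([1.5, 1.5, 3.5, 3.5])
    weights = np.ones_like(y_true)

    dev = family.deviance(y_true, y_pred, weights)
    y_mean = np.average(y_true, weights=weights)      # null model: weighted mean of y
    dev_null = family.deviance(y_true, y_mean * np.ones_like(y_true), weights)
    d2 = 1 - dev / dev_null     # 1.0 for a perfect fit, can be negative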
diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index a4d4ea8650860..df0413b4d7836 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,19 +1,34 @@ import numpy as np from sklearn.linear_model.glm import ( - # Link, IdentityLink, + Link, + IdentityLink, LogLink, TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, - # GeneralizedHyperbolicSecand, + GeneralizedHyperbolicSecand, GeneralizedLinearRegressor) +from sklearn.linear_model.ridge import Ridge from sklearn.utils.testing import ( - # assert_equal, + assert_equal, assert_almost_equal, assert_array_equal, assert_array_almost_equal) +def test_link_properties(): + """Test link inverse and derivative + """ + rng = np.random.RandomState(0) + x = rng.rand(100)*100 + from sklearn.linear_model.glm import Link + for link in vars()['Link'].__subclasses__(): + link = link() + assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) + assert_almost_equal(link.inverse_derivative(link.link(x)), + 1/link.derivative(x), decimal=10) + + def test_family_bounds(): """Test the valid range of distributions """ @@ -42,8 +57,23 @@ def test_family_bounds(): assert_array_equal(result, [False, False, True]) +def test_deviance_zero(): + """Test deviance(y,y) = 0 for different families + """ + for family in [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=-2.5), + TweedieDistribution(power=-1), + TweedieDistribution(power=1.5), + TweedieDistribution(power=2.5), + TweedieDistribution(power=4), + GeneralizedHyperbolicSecand()]: + assert_almost_equal(family.deviance(0.1, 0.1), 0, decimal=10) + assert_almost_equal(family.deviance(1.5, 1.5), 0, decimal=10) + + def test_glm_identiy_regression(): - """Test linear regression on a simple dataset + """Test GLM regression with identity link on a simple dataset """ coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T @@ -55,13 +85,13 @@ def test_glm_identiy_regression(): for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( - family=family, fit_intercept=False, solver=solver) + alpha=0, family=family, fit_intercept=False, solver=solver) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) def test_glm_log_regression(): - """Test linear regression on a simple dataset + """Test GLM regression with log link on a simple dataset """ coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T @@ -73,10 +103,79 @@ def test_glm_log_regression(): for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( - family=family, link=LogLink(), fit_intercept=False, - solver=solver, start_params='ols') + alpha=0, family=family, link=LogLink(), fit_intercept=False, + solver=solver, start_params='least_squares') res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) +def test_normal_ridge(): + """Test ridge regression for Normal distributions + + Compare to test_ridge in test_ridge.py. 
+ """ + rng = np.random.RandomState(0) + alpha = 1.0 + + # With more samples than features + n_samples, n_features, n_predict = 6, 5, 10 + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + T = rng.randn(n_predict, n_features) + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge.fit(X, y) + for solver in ['irls', 'lbfgs', 'newton-cg']: + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=True, solver=solver) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=False, solver='irls') + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + # With more features than samples + n_samples, n_features, n_predict = 5, 10, 10 + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + T = rng.randn(n_predict, n_features) + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge.fit(X, y) + for solver in ['irls', 'lbfgs', 'newton-cg']: + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=True, solver=solver) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=False, solver='irls') + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + # TODO: Test compatibility with R's glm, glmnet From 5b46c23977a8e386987a2767b2c12d4296d332af Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 19 Sep 2017 00:40:34 +0200 Subject: [PATCH 07/69] [WIP] Add Generalized Linear Models (#9405) * fix some bugs in user guide linear_model.rst * fix some pep8 issues in test_glm.py --- doc/modules/linear_model.rst | 28 +++++++++++++++----------- sklearn/linear_model/tests/test_glm.py | 7 ++++--- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 51b3821fa6207..98736facd9b76 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -880,14 +880,14 @@ Generalized linear regression ============================= :class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two -ways [1]_. First, the predicted values :math:`\hat{y}` are linked to a linear +ways [8]_. First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as .. 
math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [2]_. The objective function beeing minimized +exponential dispersion model (EDM) [9]_. The objective function beeing minimized becomes .. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 @@ -914,16 +914,20 @@ it is convenient to apply a link function different from the identity link :math:`h(Xw)=\exp(Xw)`. Note that the feature matrix `X` should be standardized before fitting. This -ensures that the penalty treats features equally. +ensures that the penalty treats features equally. The estimator can be used as +follows:: - >>> from sklearn import linear_model - >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, l1_ratio=0) - >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> from sklearn.linear_model import GeneralizedLinearRegressor + >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) + GeneralizedLinearRegressor(alpha=0.5, copy_X=True, family='poisson', + fit_dispersion='chisqr', fit_intercept=True, l1_ratio=0, + link='log', max_iter=100, solver='irls', start_params=None, + tol=0.0001, verbose=0, warm_start=False) >>> reg.coef_ array([ 0.24630255, 0.43373521]) - >>> reg.intercept_ - -0.76383575123143277 + >>> reg.intercept_ #doctest: +ELLIPSIS + -0.76383575... Mathematical formulation ------------------------ @@ -969,7 +973,7 @@ Two remarks: * The deviances for at least Normal, Poisson and Gamma distributions are strictly consistent scoring functions for the mean :math:`\mu`, see Eq. - (19)-(20) in [3]_. + (19)-(20) in [10]_. * If you want to model a frequency, i.e. counts per exposure (time, volume, ...) you can do so by a Poisson distribution and passing @@ -979,12 +983,12 @@ Two remarks: .. topic:: References: - .. [1] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + .. [8] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - .. [2] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. + .. [9] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. See also `Exponential dispersion model. `_ - .. [3] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ + .. [10] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. 
`_ Stochastic Gradient Descent - SGD ================================= diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index df0413b4d7836..b62b51b5bcb9e 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -2,7 +2,7 @@ from sklearn.linear_model.glm import ( Link, - IdentityLink, + # IdentityLink, LogLink, TweedieDistribution, NormalDistribution, PoissonDistribution, @@ -21,8 +21,9 @@ def test_link_properties(): """ rng = np.random.RandomState(0) x = rng.rand(100)*100 - from sklearn.linear_model.glm import Link - for link in vars()['Link'].__subclasses__(): + # from sklearn.linear_model.glm import Link + # for link in vars()['Link'].__subclasses__(): + for link in Link.__subclasses__(): link = link() assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) assert_almost_equal(link.inverse_derivative(link.link(x)), From 10dd14603a5fc04f53ca4920621434aaff662064 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 3 Dec 2017 19:54:57 +0100 Subject: [PATCH 08/69] [WIP] Add Generalized Linear Models (#9405) * added test: ridge poisson with log-link compared to glmnet * fix ValueError message for l1_ratio * fix ValueError message for P2 * string comparison: use '==' and '!=' instead of 'is' and 'is not' * fix RuntimeWarnings in unit_deviance of poisson: x*log(x) as xlogy * added test for fisher matrix * added test for family argument --- sklearn/linear_model/glm.py | 29 ++++++----- sklearn/linear_model/tests/test_glm.py | 72 ++++++++++++++++++++++++-- 2 files changed, 84 insertions(+), 17 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 2db3c56d5e1c1..93ce358a8a874 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -15,6 +15,8 @@ # TODO: Make it as much consistent to other estimators in linear_model as # possible # TODO: options P1 and P2 in fit() or in __init__()??? +# TODO: Include further classes in class.rst? ExponentialDispersionModel? +# TweedieDistribution? # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -42,7 +44,7 @@ from abc import ABCMeta, abstractmethod, abstractproperty import numbers import numpy as np -from scipy import linalg, optimize, sparse +from scipy import linalg, optimize, sparse, special import warnings from .base import LinearRegression from .coordinate_descent import ElasticNet @@ -340,7 +342,7 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): = \mathbf{X}^T W \mathbf{X} \,, with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, - see func:`score_function`. + see func:`_score`. """ n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -363,7 +365,7 @@ def _observed_information(self, coef, phi, X, y, weights, link): \mathbf{H}(\boldsymbol{w}) = -\frac{\partial^2 loglike}{\partial\boldsymbol{w} \partial\boldsymbol{w}^T} - = \mathbf{X}^T \legt[ + = \mathbf{X}^T \left[ - \mathbf{D}' \mathbf{R} + \mathbf{D}^2 \mathbf{V} \mathbf{R} + \mathbf{D}^2 @@ -393,7 +395,7 @@ def _deviance_derivative(self, coef, X, y, weights, link): r"""The derivative w.r.t. `coef` (:math:`w`) of the deviance as a function of the coefficients `coef`. This is equivalent to :math:`-2\phi` times the score function - :func:`score_function` (derivative of the log-likelihood). + :func:`_score` (derivative of the log-likelihood). 
""" score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, link=link) @@ -510,7 +512,8 @@ def unit_deviance(self, y, mu): return (y-mu)**2 if p == 1: # PoissonDistribution - return 2 * (np.where(y == 0, 0, y*np.log(y/mu))-y+mu) + # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 + return 2 * (special.xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) @@ -921,7 +924,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, if (not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1): raise ValueError("l1_ratio must be in interval [0, 1]; got" - " (l1_ratio={0]})".format(self.l1_ratio)) + " (l1_ratio={0})".format(self.l1_ratio)) if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) @@ -948,7 +951,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is not None and start_params is not 'least_squares': + if start_params is not None and start_params != 'least_squares': start_params = np.atleast_1d(start_params) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or (start_params.ndim != 1)): @@ -986,7 +989,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " "n_features=X.shape[1]; " - "got (P2.shape=({0},{1})), needed ({3},{3})" + "got (P2.shape=({0}, {1})), needed ({2}, {2})" .format(P2.shape[0], P2.shape[1], X.shape[1])) family = self._family_instance @@ -1058,7 +1061,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, else: # with L1 penalty, start with coef = 0 coef = np.zeros(n_features) - elif self.start_params is 'least_squares': + elif self.start_params == 'least_squares': if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) reg.fit(Xnew, link.link(y)) @@ -1277,11 +1280,9 @@ def estimate_phi(self, y, X, sample_weight): dev = self._family_instance.deviance(y, mu, sample_weight) return dev/(n_samples - n_features) -# TODO: Fix "AssertionError: -0.28014056555724598 not greater than 0.5" -# in check_estimator for score -# from sklearn.utils.estimator_checks import check_estimator -# from sklearn.linear_model import GeneralizedLinearRegressor -# check_estimator(GeneralizedLinearRegressor) + # Note: check_estimator(GeneralizedLinearRegressor) might raise + # "AssertionError: -0.28014056555724598 not greater than 0.5" + # unless GeneralizedLinearRegressor has a score which passes the test. def score(self, X, y, sample_weight=None): r"""Returns D^2, a generalization of the coefficient of determination R^2, which uses deviance instead of squared error. diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index b62b51b5bcb9e..de7de90db967b 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,4 +1,6 @@ import numpy as np +from numpy.testing import assert_allclose +import scipy as sp from sklearn.linear_model.glm import ( Link, @@ -73,6 +75,46 @@ def test_deviance_zero(): assert_almost_equal(family.deviance(1.5, 1.5), 0, decimal=10) +def test_fisher_matrix(): + """Test the Fisher matrix numerically. 
+ Trick: Use numerical differentiation with y = mu""" + for family in [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution()]: + link = LogLink() + rng = np.random.RandomState(0) + coef = np.array([-2, 1, 0, 1, 2.5]) + phi = 0.5 + X = rng.randn(10, 5) + lin_pred = np.dot(X, coef) + mu = link.inverse(lin_pred) + weights = rng.randn(10)**2 + 1 + fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + approx = np.array([]).reshape(0, coef.shape[0]) + for i in range(coef.shape[0]): + def f(coef): + return -family._score(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link)[i] + approx = np.vstack( + [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)] + ) + assert_allclose(fisher, approx, rtol=1e-3) + + +def test_glm_family_argument(): + """Test GLM family argument set as string + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for (f, fam) in [('normal', NormalDistribution()), + ('poisson', PoissonDistribution()), + ('gamma', GammaDistribution()), + ('inverse.gaussian', InverseGaussianDistribution())]: + glm = GeneralizedLinearRegressor(family=f, fit_intercept=False, + alpha=0).fit(X, y) + assert_equal(type(glm._family_instance), type(fam)) + + def test_glm_identiy_regression(): """Test GLM regression with identity link on a simple dataset """ @@ -82,7 +124,8 @@ def test_glm_identiy_regression(): families = ( NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecand()) for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( @@ -100,7 +143,8 @@ def test_glm_log_regression(): families = ( NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecand()) for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( @@ -179,4 +223,26 @@ def test_normal_ridge(): assert_array_almost_equal(glm.predict(T), ridge.predict(T)) -# TODO: Test compatibility with R's glm, glmnet +def test_poisson_ridge(): + """Test ridge regression with poisson family and LogLink + + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.12889386979 + # a 0.29019207995 + # b 0.03741173122 + X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, family='poisson', + link='log', tol=1e-10) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, decimal=7) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=7) From 72485b63e89879e65381bca12152b54600fd3970 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 8 Jan 2018 22:13:45 +0100 Subject: [PATCH 09/69] [WIP] Add Generalized Linear Models (#9405) * put arguments P1, P2 and check_input from fit to __init__ * added check_input test: is P2 positive definite? 
* added solver option: 'auto' --- sklearn/linear_model/glm.py | 181 +++++++++++++++++++++++------------- 1 file changed, 117 insertions(+), 64 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 93ce358a8a874..3f6b91026ef9b 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -14,7 +14,7 @@ # TODO: Write examples and more docu # TODO: Make it as much consistent to other estimators in linear_model as # possible -# TODO: options P1 and P2 in fit() or in __init__()??? +# TODO: which dtype to force for y and X? Which for P1, P2? # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? @@ -709,6 +709,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. + P1 : None or array of shape (n_features*, ), optional\ + (default=None) + With this array, you can exclude coefficients from the L1 penalty. + Set the corresponding value to 1 (include) or 0 (exclude). The + default value ``None`` is the same as an array of ones. + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). + + P2 : None or array of shape (n_features*, n_features*) + With this square matrix the L2 penalty is calculated as `w P2 w`. + This gives a fine control over this penalty (Tikhonov + regularization). + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). P2 must be positive semi-definite. + fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). @@ -727,13 +742,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'irls', 'newton-cg', 'lbfgs'}, optional (defaul='irls') + solver : {'auto', 'irls', 'newton-cg', 'lbfgs'}, optional (defaul='auto') Algorithm to use in the optimization problem. - - 'irls' is iterated reweighted least squares. It is the standard - algorithm for GLMs. + - 'irls' is iterated reweighted least squares (Fisher scoring). + It is the standard algorithm for GLMs. Cannot deal with + L1 penalties. + + - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. - - 'newton-cg', 'lbfgs' + - 'auto' sets 'irls'. max_iter : int, optional (default=100) TODO @@ -766,6 +784,11 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): copy_X : boolean, optional, default True If ``True``, X will be copied; else, it may be overwritten. + check_input : boolean, optional (default=True) + Allow to bypass several checks on input: y values in range of family, + sample_weights non-negative, P2 positive semi-definite. + Don't use this parameter unless you know what you do. + verbose : int, optional (default=0) For the lbfgs solver set verbose to any positive number for verbosity. 
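A hedged construction sketch for the new `P1`/`P2` init parameters (illustrative values only; `l1_ratio=0` because, as noted above, no L1-capable solver is implemented yet)::

    import numpy as np
    from sklearn.linear_model.glm import GeneralizedLinearRegressor

    n_features = 3
    P1 = np.array([1.0, 0.0, 1.0])      # exclude the 2nd coefficient from the L1 term
    P2 = np.diag([1.0, 0.1, 1.0])       # positive semi-definite Tikhonov matrix

    glm = GeneralizedLinearRegressor(alpha=0.5, l1_ratio=0, P1=P1, P2=P2,
                                     family='poisson', link='log', solver='auto')
    # glm.fit(X, y) would then apply 0.5 * 1/2 * w' P2 w as the L2 penalty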
@@ -790,13 +813,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): ---------- TODO """ - def __init__(self, alpha=1.0, l1_ratio=0, + def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', - fit_dispersion='chisqr', solver='irls', max_iter=100, + fit_dispersion='chisqr', solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params=None, copy_X=True, - verbose=0): + check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio + self.P1 = P1 + self.P2 = P2 self.fit_intercept = fit_intercept self.family = family self.link = link @@ -807,10 +832,10 @@ def __init__(self, alpha=1.0, l1_ratio=0, self.warm_start = warm_start self.start_params = start_params self.copy_X = copy_X + self.check_input = check_input self.verbose = verbose - def fit(self, X, y, sample_weight=None, P1=None, P2=None, - check_input=True): + def fit(self, X, y, sample_weight=None): """Fit a generalized linear model. Parameters @@ -823,31 +848,13 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, sample_weight : array of shape (n_samples, ) or None,\ optinal (default=None) - Individual weights for each sample. - Var[Y_i]=phi/weight_i * v(mu) - If Y_i ~ EDM(mu, phi/w_i) then + Individual weights w_i for each sample. Note that for an + Exponential Dispersion Model (EDM), one has + Var[Y_i]=phi/w_i * v(mu). + If Y_i ~ EDM(mu, phi/w_i), then sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. - P1 : None or array of shape (n_features*, ), optional\ - (default=None) - With this array, you can exclude coefficients from ths L1 penalty. - Set the corresponding value to 1 (include) or 0 (exclude). The - default value ``None`` is the same as an array of ones. - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). - - P2 : None or array of shape (n_features*, n_features*) - With this square matrix the L2 penalty is calculated as `w P2 w`. - This gives a fine control over this penalty (Tikhonov - regularization). - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). - - check_input : boolean, optional (default=True) - Allow to bypass several input checking. - Don't use this parameter unless you know what you do. - Returns ------- self : returns an instance of self. 
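A small illustrative sketch of the weighted-average interpretation of `sample_weight` documented above: giving a sample weight 2 should act like duplicating it (toy data; unpenalized fit assumed to converge)::

    import numpy as np
    from numpy.testing import assert_allclose
    from sklearn.linear_model.glm import GeneralizedLinearRegressor

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([1.0, 2.0, 3.0, 5.0])

    glm_weighted = GeneralizedLinearRegressor(alpha=0, family='normal').fit(
        X, y, sample_weight=[1, 1, 1, 2])
    glm_repeated = GeneralizedLinearRegressor(alpha=0, family='normal').fit(
        np.vstack([X, X[-1:]]), np.append(y, y[-1]))
    assert_allclose(glm_weighted.coef_, glm_repeated.coef_)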
@@ -872,16 +879,6 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, elif weights.shape[0] != y.shape[0]: raise ValueError("Sample weights must have the same length as" " y") - # IMPORTANT NOTE: Since we want to minimize - # 1/(2*sum(sample_weight)) * deviance + L1 + L2, - # deviance = sum(sample_weight * unit_deviance), - # we rescale weights such that sum(weights) = 1 and this becomes - # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) - weights = weights/np.sum(weights) - - if not isinstance(check_input, bool): - raise ValueError("The argument check_input must be bool; got " - "(check_input={0})".format(check_input)) # 1.2 validate arguments of __init__ ################################## # Garantee that self._family_instance is an instance of class @@ -928,17 +925,22 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver not in ['irls', 'lbfgs', 'newton-cg']: - raise ValueError("GLM Regression supports only irls, lbfgs and" - "newton-cg solvers, got {0}".format(self.solver)) + if self.solver == 'auto': + solver = 'irls' + else: + solver = self.solver + if solver not in ['irls', 'lbfgs', 'newton-cg']: + raise ValueError("GeneralizedLinearRegressor supports only irls, " + "lbfgs and newton-cg solvers, got {0}" + "".format(solver)) if self.alpha > 0: if (self.l1_ratio > 0 and - self.solver not in []): + solver not in []): # TODO: Add solver for L1 # raise ValueError("The solver option (solver={0}) is not " # "appropriate for the chosen penalty which" # " includes L1 (alpha={1})." - # .format(self.solver, self.alpha)) + # .format(solver, self.alpha)) raise NotImplementedError("Currently, no solver is implemented" " that can deal with L1 penalties.") if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: @@ -964,28 +966,32 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) + if not isinstance(self.check_input, bool): + raise ValueError("The attribute check_input must be bool; got " + "(check_input={0})".format(self.check_input)) - if P1 is None: + if self.P1 is None: P1 = np.ones(X.shape[1]) else: - P1 = np.atleast_1d(P1) + P1 = np.atleast_1d(np.copy(self.P1)) if (P1.shape[0] != X.shape[1]) or (P1.ndim != 1): raise ValueError("P1 must be either None or an 1D array with " "the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." 
.format(P1.shape[0], X.shape[1])) - if P2 is None: + if self.P2 is None: P2 = np.ones(X.shape[1]) if sparse.issparse(X): P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), shape=(X.shape[1], X.shape[1]))).tocsr() else: - P2 = check_array(P2, accept_sparse=['csr', 'csc', 'coo'], + P2 = check_array(self.P2, copy=True, + accept_sparse=['csr', 'csc', 'coo'], dtype="numeric", ensure_2d=True) - if ((P2.shape[0] != P2.shape[1]) or - (P2.shape[0] != X.shape[1]) or - (P2.ndim != 2)): + if ((P2.ndim != 2) or + (P2.shape[0] != P2.shape[1]) or + (P2.shape[0] != X.shape[1])): raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " "n_features=X.shape[1]; " @@ -1020,16 +1026,39 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, P2 *= l2 # 1.3 additional validations ########################################## - if check_input: + if self.check_input: if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" .format(family.__class__.__name__)) + if not np.all(weights >= 0): + raise ValueError("Sample weights must be non-negative.") + # check that P2 is positive semidefinite + # np.linalg.cholesky(P2) 'only' asserts positive definite + if self.P2 is not None: + if sparse.issparse(P2): + # TODO: check sparse P2 for non-negativeness + raise NotImplementedError("Check sparse P2 for " + "non-negaitveness is not yet " + "implemented.") + elif P2.ndim == 2: + if not np.all(np.linalg.eigvals(P2) >= -1e-15): + raise ValueError("P2 must be positive definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? ####################################################################### - # 2. initialization of coef = (intercept_, coef_) # + # 2. rescaling of weights (sample_weight) # + ####################################################################### + # IMPORTANT NOTE: Since we want to minimize + # 1/(2*sum(sample_weight)) * deviance + L1 + L2, + # deviance = sum(sample_weight * unit_deviance), + # we rescale weights such that sum(weights) = 1 and this becomes + # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights = weights/np.sum(weights) + + ####################################################################### + # 3. initialization of coef = (intercept_, coef_) # ####################################################################### # Note: Since phi=self.dispersion_ does not enter the estimation # of mu_i=E[y_i], set it to 1. @@ -1082,13 +1111,13 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, coef = start_params ####################################################################### - # 3. fit # + # 4. 
fit # ####################################################################### # algorithms for optimiation # TODO: Parallelize it self.n_iter_ = 0 converged = False - # 3.1 IRLS ############################################################ + # 4.1 IRLS ############################################################ # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' # Obj = objective function = 1/2 Dev + l2/2 w P2 w # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 @@ -1104,7 +1133,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, # Obj'' ~ X' W X + l2 P2 # (1): w = (X' W X + l2 P2)^-1 X' W z, with z = eta + D^-1 (y-mu) # Note: P2 = l2*P2, see above - if self.solver == 'irls': + if solver == 'irls': # eta = linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = link.inverse(eta) @@ -1150,10 +1179,10 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, "of iterations (currently {0})" .format(self.max_iter), ConvergenceWarning) - # 3.2 L-BFGS and Newton-CG ############################################ + # 4.2 L-BFGS and Newton-CG ############################################ # TODO: performance: make one function return both deviance and # gradient of deviance - elif self.solver in ['lbfgs', 'newton-cg']: + elif solver in ['lbfgs', 'newton-cg']: def func(coef, *args): if P2.ndim == 1: L2 = safe_sparse_dot(coef.T, P2*coef) @@ -1190,7 +1219,7 @@ def Hs(s): args = (Xnew, y, weights, link) - if self.solver == 'lbfgs': + if solver == 'lbfgs': coef, loss, info = optimize.fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, @@ -1204,13 +1233,37 @@ def Hs(s): warnings.warn("lbfgs failed for the reason: {0}" .format(info["task"])) self.n_iter_ = info['nit'] - elif self.solver == 'newton-cg': + elif solver == 'newton-cg': coef, n_iter_i = newton_cg(grad_hess, func, fprime, coef, args=args, maxiter=self.max_iter, tol=self.tol) + # 4.3 coordinate descent ############################################## + # Reference: Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + # An Improved GLMNET for L1-regularized Logistic Regression, + # Journal of Machine Learning Research 13 (2012) 1999-2030 + # Note: Use Fisher matrix instead of Hessian + # + # 1. find optimal descent direction d by minimizing + # min_d F(w+d) = min_d F(w+d) - F(w) + # F = f + g; f(w) = 1/2 dev; g(w) = 1/2*w*P2*w + ||P1*w||_1 + # 2. quadrdatic approx of f(w+d)-f(w): + # q(d) = f'(w)*d +1/2 d*H*d + # min_d q(d) + g(w+d) - g(w) + # 3. coordinate descent by updating coordinate j (d -> d+z*e_j): + # min_z q(d+z*e_j) + g(w+d+z*e_j) - g(w) + # = min_z q(d+z e_j) - q(d) + g(w+d+z*e_j) - g(w+d) + # TODO + # elif solver == 'cd': + # line search parameters + # (beta, sigma) = (0.5, 0.01) + # for iteration k from 1 to maxiter + # for coordinate j sample at random + # np.random.choice(coord, replace = False) + # + ####################################################################### - # 4. postprocessing # + # 5. 
postprocessing # ####################################################################### if self.fit_intercept: self.intercept_ = coef[0] From 5c1369bde863a73aff46a502acd70e58e06dcb85 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 24 Jan 2018 15:22:08 +0100 Subject: [PATCH 10/69] [WIP] Add Generalized Linear Models (#9405) * added coordinate descent solver * skip doctest for GeneralizedLinearRegressor example * symmetrize P2 => use P2 = 1/2 (P2+P2') * better validation of parameter start_params --- doc/modules/linear_model.rst | 21 +- sklearn/linear_model/glm.py | 353 ++++++++++++++++++++----- sklearn/linear_model/tests/test_glm.py | 82 ++++-- 3 files changed, 365 insertions(+), 91 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 98736facd9b76..834466e494a4a 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -894,9 +894,9 @@ becomes +\frac{\alpha(1-\rho)}{2} w^T P_2 w with sample weights :math:`s`. -:math:`P_1` can be used to exclude some of the coefficients in the L1 -penalty, :math:`P_2` (must be positive semi-definite) allows for a more -versatile L2 penalty. +:math:`P_1` (diagonal matrix) can be used to exclude some of the coefficients in +the L1 penalty, the matrix :math:`P_2` (must be positive semi-definite) allows +for a more versatile L2 penalty. Use cases, where a loss different from the squared loss might be appropriate, are the following: @@ -908,22 +908,23 @@ are the following: * If the target values seem to be heavy tailed, you might try an Inverse Gaussian deviance (or even higher variance power of the Tweedie family). Since the linear predictor :math:`Xw` can be negative and -Poisson, Gamma and Inverse Gaussian distributions don't have negative values, +Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link :math:`h(x)=x` that guarantees the non-negativeness, e.g. the log-link with :math:`h(Xw)=\exp(Xw)`. Note that the feature matrix `X` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as -follows:: +follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') - >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) - GeneralizedLinearRegressor(alpha=0.5, copy_X=True, family='poisson', - fit_dispersion='chisqr', fit_intercept=True, l1_ratio=0, - link='log', max_iter=100, solver='irls', start_params=None, - tol=0.0001, verbose=0, warm_start=False) + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +SKIP + GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, + copy_X=True, family='poisson', fit_dispersion='chisqr', + fit_intercept=True, l1_ratio=0, link='log', max_iter=100, + random_state=None, selection='random', solver='auto', + start_params=None, tol=0.0001, verbose=0, warm_start=False) >>> reg.coef_ array([ 0.24630255, 0.43373521]) >>> reg.intercept_ #doctest: +ELLIPSIS diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 3f6b91026ef9b..3de82c20f33cf 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -3,20 +3,21 @@ """ # Author: Christian Lorentzen +# some parts and tricks stolen from other sklearn files. 
# License: BSD 3 clause # TODO: Write more tests -# TODO: Add l1-penalty (elastic net) +# TODO: Write examples and more docu # TODO: deal with option self.copy_X # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation -# TODO: Write examples and more docu -# TODO: Make it as much consistent to other estimators in linear_model as -# possible -# TODO: which dtype to force for y and X? Which for P1, P2? +# TODO: Should GeneralizedLinearRegressor inherit from LinearModel? +# So far, it does not. # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? +# TODO: Negative values in P1 are not allowed so far. They could be used form +# group lasse. # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -52,10 +53,11 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six +from ..externals.six.moves import xrange from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, check_random_state class Link(six.with_metaclass(ABCMeta)): @@ -309,7 +311,9 @@ def _score(self, coef, phi, X, y, weights, link): .. math: - \mathbf{score}(\boldsymbol{w}) = \mathbf{X}^T \mathbf{D} + \mathbf{score}(\boldsymbol{w}) + = \frac{\partial loglike}{\partial\boldsymbol{w}} + = \mathbf{X}^T \mathbf{D} \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and @@ -411,6 +415,29 @@ def _deviance_hessian(self, coef, X, y, weights, link): weights=weights, link=link) return 2*info_matrix + def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): + """Calculates eta (linear predictor), mu, score function (derivative + of log-likelihood) and Fisher matrix (all with phi=1) all in one go""" + n_samples, n_features = X.shape + # eta = linear predictor + eta = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(eta) + sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) + d1 = link.inverse_derivative(eta) # = h'(eta) + # Alternatively: + # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g + # d1 = 1./link.derivative(mu) + d1_sigma_inv = sparse.dia_matrix((sigma_inv*d1, 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d1_sigma_inv, (y-mu), dense_output=True) + score = safe_sparse_dot(X.T, temp, dense_output=True) + # + d2_sigma_inv = sparse.dia_matrix((sigma_inv*(d1**2), 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) + fisher = safe_sparse_dot(X.T, temp, dense_output=False) + return eta, mu, score, fisher + def starting_mu(self, y, weights=1): """Starting values for the mean mu_i in (unpenalized) IRLS.""" return ((weights*y+np.mean(weights*y)) / @@ -670,8 +697,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): alpha = a + b and l1_ratio = a / (a + b) The parameter `l1_ratio` corresponds to alpha in the glmnet R package while - alpha corresponds to the lambda parameter in glmnet. Specifically, l1_ratio - = 1 is the lasso penalty. + 'alpha' corresponds to the lambda parameter in glmnet. Specifically, + l1_ratio = 1 is the lasso penalty. Read more in the :ref:`User Guide `. 
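As a minimal sketch of the penalty parametrization described above (the function name and example values are illustrative only, not part of the estimator's API), the two terms controlled by ``alpha`` and ``l1_ratio`` can be written as:

    import numpy as np

    def elastic_net_penalty(w, alpha, l1_ratio, P1, P2):
        # L1 part: alpha * l1_ratio * ||P1 * w||_1
        l1 = alpha * l1_ratio * np.sum(np.abs(P1 * w))
        # L2 part: alpha * (1 - l1_ratio) / 2 * w' P2 w
        l2 = 0.5 * alpha * (1 - l1_ratio) * np.dot(w, np.dot(P2, w))
        return l1 + l2

    # l1_ratio=1 gives the lasso penalty, l1_ratio=0 a (generalized) ridge
    # penalty; in glmnet's notation, alpha here plays the role of lambda
    # and l1_ratio the role of alpha.
    w = np.array([0.5, -1.0])
    elastic_net_penalty(w, alpha=0.5, l1_ratio=0.3,
                        P1=np.ones(2), P2=np.eye(2))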
@@ -686,6 +713,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): TODO: For `alpha` > 0, the feature matrix `X` is assumed to be standardized. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. + Otherwise, the strength of the penalty is different for the features. TODO: Estimation of the dispersion parameter phi. @@ -742,19 +770,23 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'irls', 'newton-cg', 'lbfgs'}, optional (defaul='auto') + solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ + optional (defaul='auto') Algorithm to use in the optimization problem. + - 'auto' sets 'irls' if l1_ratio equals 0, else 'cd'. + - 'irls' is iterated reweighted least squares (Fisher scoring). It is the standard algorithm for GLMs. Cannot deal with L1 penalties. - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. - - 'auto' sets 'irls'. + - 'cd' is the coordinate descent algorithm. It can deal with L1 and + L2 penalties. max_iter : int, optional (default=100) - TODO + The maximal number of iterations for solver algorithms. tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, @@ -781,6 +813,23 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): This option only applies if ``warm_start=False`` or if fit is called the first time (``self.coef_`` does not exist). + selection : str, optional (default='random') + For the solver 'cd' (coordinate descent), the coordinates (features) + can be updated in either cyclic or random order. + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator that selects a random + feature to be updated for solver 'cd' (coordinate descent). + If int, random_state is the seed used by the random + number generator; if RandomState instance, random_state is the random + number generator; if None, the random number generator is the + RandomState instance used by `np.random`. Used when ``selection`` == + 'random'. + copy_X : boolean, optional, default True If ``True``, X will be copied; else, it may be overwritten. 
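The 'auto' rule documented above amounts to a small dispatch on ``l1_ratio``; a sketch (the helper name is illustrative only):

    def pick_solver(solver, l1_ratio):
        # 'auto' maps to IRLS for pure L2 problems and to coordinate
        # descent ('cd') as soon as an L1 term is present.
        if solver == 'auto':
            return 'irls' if l1_ratio == 0 else 'cd'
        # 'irls', 'lbfgs' and 'newton-cg' cannot handle an L1 penalty;
        # only 'cd' can.
        return solver

    pick_solver('auto', 0.0)  # -> 'irls'
    pick_solver('auto', 0.5)  # -> 'cd'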
@@ -816,7 +865,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', fit_dispersion='chisqr', solver='auto', max_iter=100, - tol=1e-4, warm_start=False, start_params=None, copy_X=True, + tol=1e-4, warm_start=False, start_params=None, + selection='random', random_state=None, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio @@ -831,6 +881,8 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, self.tol = tol self.warm_start = warm_start self.start_params = start_params + self.selection = selection + self.random_state = random_state self.copy_X = copy_X self.check_input = check_input self.verbose = verbose @@ -925,24 +977,21 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver == 'auto': - solver = 'irls' - else: - solver = self.solver - if solver not in ['irls', 'lbfgs', 'newton-cg']: + if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: raise ValueError("GeneralizedLinearRegressor supports only irls, " - "lbfgs and newton-cg solvers, got {0}" - "".format(solver)) - if self.alpha > 0: - if (self.l1_ratio > 0 and - solver not in []): - # TODO: Add solver for L1 - # raise ValueError("The solver option (solver={0}) is not " - # "appropriate for the chosen penalty which" - # " includes L1 (alpha={1})." - # .format(solver, self.alpha)) - raise NotImplementedError("Currently, no solver is implemented" - " that can deal with L1 penalties.") + "auto, lbfgs, newton-cg and cd solvers, got {0}" + "".format(self.solver)) + solver = self.solver + if self.solver == 'auto': + if self.l1_ratio == 0: + solver = 'irls' + else: + solver = 'cd' + if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): + raise ValueError("The chosen solver (solver={0}) can't deal " + "with L1 penalties, which are included with " + "(alpha={1}) and (l1_ratio={2})." 
+ .format(solver, self.alpha, self.l1_ratio)) if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: raise ValueError("Maximum number of iteration must be positive;" " got (max_iter={0!r})".format(self.max_iter)) @@ -953,7 +1002,14 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is not None and start_params != 'least_squares': + if start_params is None: + pass + elif isinstance(start_params, six.string_types): + if start_params not in ['least_squares']: + raise ValueError("The argument start_params must be None, " + "'least-squares' or an array of right length," + " got(start_params={0})".format(start_params)) + else: start_params = np.atleast_1d(start_params) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or (start_params.ndim != 1)): @@ -963,6 +1019,12 @@ def fit(self, X, y, sample_weight=None): .format(X.shape[1] + self.fit_intercept, start_params.shape[0], start_params.ndim)) + + if self.selection not in ['cyclic', 'random']: + raise ValueError("The argument selection must be 'cyclic' or " + "'random', got (selection={0})" + .format(self.selection)) + random_state = check_random_state(self.random_state) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) @@ -974,15 +1036,16 @@ def fit(self, X, y, sample_weight=None): P1 = np.ones(X.shape[1]) else: P1 = np.atleast_1d(np.copy(self.P1)) - if (P1.shape[0] != X.shape[1]) or (P1.ndim != 1): + if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): raise ValueError("P1 must be either None or an 1D array with " "the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." .format(P1.shape[0], X.shape[1])) if self.P2 is None: - P2 = np.ones(X.shape[1]) - if sparse.issparse(X): + if not sparse.issparse(X): + P2 = np.ones(X.shape[1]) + else: P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), shape=(X.shape[1], X.shape[1]))).tocsr() else: @@ -1024,6 +1087,12 @@ def fit(self, X, y, sample_weight=None): l2 = self.alpha * (1-self.l1_ratio) P1 *= l1 P2 *= l2 + # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') + # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric + if sparse.issparse(P2): + P2 = 0.5 * (P2 + P2.transpose()) + else: + P2 = 0.5 * (P2 + P2.T) # 1.3 additional validations ########################################## if self.check_input: @@ -1033,14 +1102,20 @@ def fit(self, X, y, sample_weight=None): .format(family.__class__.__name__)) if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") - # check that P2 is positive semidefinite + # check if P1 has only non-negative values, negative values might + # indicate group lasso in the future. 
+ if self.P1 is not None: + if not np.all(P1 >= 0): + raise ValueError("P1 must not have negative values.") + # check if P2 is positive semidefinite # np.linalg.cholesky(P2) 'only' asserts positive definite if self.P2 is not None: if sparse.issparse(P2): # TODO: check sparse P2 for non-negativeness - raise NotImplementedError("Check sparse P2 for " - "non-negaitveness is not yet " - "implemented.") + # raise NotImplementedError("Check sparse P2 for " + # "non-negaitveness is not yet " + # "implemented.") + pass elif P2.ndim == 2: if not np.all(np.linalg.eigvals(P2) >= -1e-15): raise ValueError("P2 must be positive definite.") @@ -1090,7 +1165,8 @@ def fit(self, X, y, sample_weight=None): else: # with L1 penalty, start with coef = 0 coef = np.zeros(n_features) - elif self.start_params == 'least_squares': + elif (isinstance(self.start_params, six.string_types) and + self.start_params == 'least_squares'): if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) reg.fit(Xnew, link.link(y)) @@ -1102,7 +1178,7 @@ def fit(self, X, y, sample_weight=None): reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: - # TODO: Does this make sense? + # TODO: Does this make sense at all? reg = ElasticNet(copy_X=True, fit_intercept=False, alpha=self.alpha, l1_ratio=self.l1_ratio) reg.fit(Xnew, link.link(y)) @@ -1125,14 +1201,17 @@ def fit(self, X, y, sample_weight=None): # D2 = link.inverse_derivative(eta)^2 = D^2 # W = D2/V(mu) # l2 = alpha * (1 - l1_ratio) - # Obj' = d(Obj)/d(w) = 1/2 Dev' + P2 w + # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w # = -X' D (y-mu)/V(mu) + l2 P2 w # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 # Use Fisher matrix instead of full info matrix -X'(...) X, # i.e. E[Dev''] with E[y-mu]=0: # Obj'' ~ X' W X + l2 P2 - # (1): w = (X' W X + l2 P2)^-1 X' W z, with z = eta + D^-1 (y-mu) - # Note: P2 = l2*P2, see above + # (1): w = (X' W X + l2 P2)^-1 X' W z, + # with z = eta + D^-1 (y-mu) + # Note: we already set P2 = l2*P2, see above + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: ' denotes derivative, but also transpose for matrices if solver == 'irls': # eta = linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) @@ -1150,9 +1229,8 @@ def fit(self, X, y, sample_weight=None): # working observations z = eta + (y-mu)/hp # solve A*coef = b - # A = X' W X + l2 P2, b = X' W z + # A = X' W X + P2, b = X' W z coef = _irls_step(Xnew, W, P2, z) - # updated linear predictor # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) @@ -1242,25 +1320,182 @@ def Hs(s): # Reference: Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin # An Improved GLMNET for L1-regularized Logistic Regression, # Journal of Machine Learning Research 13 (2012) 1999-2030 - # Note: Use Fisher matrix instead of Hessian + # Note: Use Fisher matrix instead of Hessian for H # # 1. find optimal descent direction d by minimizing # min_d F(w+d) = min_d F(w+d) - F(w) - # F = f + g; f(w) = 1/2 dev; g(w) = 1/2*w*P2*w + ||P1*w||_1 - # 2. quadrdatic approx of f(w+d)-f(w): - # q(d) = f'(w)*d +1/2 d*H*d - # min_d q(d) + g(w+d) - g(w) + # F = f + g, f(w) = 1/2 deviance, g(w) = 1/2 w*P2*w + ||P1*w||_1 + # 2. quadrdatic approximation of F(w+d)-F(w) = q(d): + # using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives + # q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + # + ||P1*(w+d)||_1 - ||P1*w||_1 + # min_d q(d) # 3. 
coordinate descent by updating coordinate j (d -> d+z*e_j): - # min_z q(d+z*e_j) + g(w+d+z*e_j) - g(w) - # = min_z q(d+z e_j) - q(d) + g(w+d+z*e_j) - g(w+d) - # TODO - # elif solver == 'cd': + # min_z q(d+z*e_j) + # = min_z q(d+z*e_j) - q(d) + # = min_z A_j z + 1/2 B_jj z^2 + # + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 + # A = f'(w) + d*H(w) + (w+d)*P2 + # B = H+P2 + # Note: we already set P2 = l2*P2, P1 = l1*P1, see above + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: f' = -score, H = Fisher matrix + elif solver == 'cd': # line search parameters - # (beta, sigma) = (0.5, 0.01) - # for iteration k from 1 to maxiter - # for coordinate j sample at random - # np.random.choice(coord, replace = False) - # + (beta, sigma) = (0.5, 0.01) + # max inner loops (cycles through all features) + max_inner_iter = 1000 + # some precalculations + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) + # initial stopping tolerance of inner loop + # use L1-norm of minimum-norm of subgradient of F + # fp_wP2 = f'(w) + w*P2 + if P2.ndim == 1: + fp_wP2 = -score + coef*P2 + else: + fp_wP2 = -score + safe_sparse_dot(coef, P2) + inner_tol = (np.where(coef == 0, + np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), + fp_wP2+np.sign(coef)*P1)) + inner_tol = linalg.norm(inner_tol, ord=1) + # outer loop + while self.n_iter_ < self.max_iter: + self.n_iter_ += 1 + # initialize search direction d (to be optimized) + d = np.zeros_like(coef) + # inner loop + # TODO: use sparsity (coefficient already 0 due to L1 penalty) + d = np.zeros_like(coef) + # A = f'(w) + d*H(w) + (w+d)*P2 + # B = H+P2 + # Note: f'=-score and H=fisher are updated at the end of outer + # iteration + B = fisher + if P2.ndim == 1: + coef_P2 = coef * P2 + B[np.diag_indices_from(B)] += P2 + else: + coef_P2 = safe_sparse_dot(coef, P2) + B += P2 + A = -score + coef_P2 # + d*(H+P2) but d=0 so far + inner_iter = 0 + while inner_iter < max_inner_iter: + inner_iter += 1 + if self.selection == 'random': + featurelist = random_state.permutation(n_features) + else: + featurelist = np.arange(n_features) + for j in featurelist: + # minimize_z: a z + 1/2 b z^2 + c |d+z| + # a = A_j + # b = B_jj > 0 + # c = |P1_j| = P1_j > 0, ee 1.3 + # d = w_j + d_j + # cf. https://arxiv.org/abs/0708.1485 Eqs. (3) - (4) + # with beta = z+d, beta_hat = d-a/b and gamma = c/b + # z = 1/b * S(bd-a,c) - d + # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding + a = A[j] + b = B[j, j] + if P1[j] == 0: + if b == 0: + z = 0 + else: + z = -a/b + elif a + P1[j] < b * (coef[j]+d[j]): + if b == 0: + z = 0 + else: + z = -(a + P1[j])/b + elif a - P1[j] > b * (coef[j]+d[j]): + if b == 0: + z = 0 + else: + z = -(a - P1[j])/b + else: + z = -(coef[j] + d[j]) + # update direction d + d[j] += z + # update A because d_j is now d_j+z + # A = f'(w) + d*H(w) + (w+d)*P2 + # => A += (H+P2)*e_j z = B_j * z + # Note: B is symmetric B = B.transpose + if sparse.issparse(B): + if sparse.isspmatrix_csc(B): + # slice columns + A += B[:, j].toarray().ravel() * z + else: + # slice rows + A += B[j, :].toarray().ravel() * z + else: + A += B[j, :] * z + # end of cycle + # stopping criterion for inner loop + # sum_i(|minimum-norm subgrad of q(d)_i|) + mn_subgrad = (np.where(coef + d == 0, + np.sign(A)*np.maximum(np.abs(A)-P1, 0), + A+np.sign(coef+d)*P1)) + mn_subgrad = np.sum(np.abs(mn_subgrad)) + if mn_subgrad <= inner_tol: + if inner_iter == 1: + inner_tol = inner_tol/4. 
+ break + # end of inner loop + # line search by sequence beta^k, k=0, 1, .. + # F(w + lambda d) - F(w) <= lambda * bound + # bound = sigma * (f'(w)*d + w*P2*d + # +||P1 (w+d)||_1 - ||P1 w||_1) + P1w_1 = linalg.norm(P1*coef, ord=1) + # Note: coef_P2 already calculated and still valid + bound = sigma * ( + safe_sparse_dot(-score, d) + + safe_sparse_dot(coef_P2, d) + + linalg.norm(P1*(coef+d), ord=1) - + P1w_1) + Fw = (0.5 * family.deviance(y, mu, weights) + + 0.5 * safe_sparse_dot(coef_P2, coef) + + P1w_1) + la = 1./beta + for k in range(20): + la *= beta # starts with la=1 + mu_wd = link.inverse(safe_sparse_dot(Xnew, coef+la*d, + dense_output=True)) + Fwd = (0.5 * family.deviance(y, mu_wd, weights) + + linalg.norm(P1*(coef+la*d), ord=1)) + if P2.ndim == 1: + Fwd += 0.5 * safe_sparse_dot((coef+la*d)*P2, coef+la*d) + else: + Fwd += 0.5 * (safe_sparse_dot(coef+la*d, + safe_sparse_dot(P2, coef+la*d))) + if Fwd-Fw <= sigma*la*bound: + break + # update coefficients + # coef_old = coef.copy() + coef += la * d + # calculate eta, mu, score, Fisher matrix for next iteration + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) + # stopping criterion for outer loop + # sum_i(|minimum-norm subgrad of F(w)_i|) + # fp_wP2 = f'(w) + w*P2 + # Note: eta, mu and score are already updated + if P2.ndim == 1: + fp_wP2 = -score + coef*P2 + else: + fp_wP2 = -score + safe_sparse_dot(coef, P2) + mn_subgrad = (np.where(coef == 0, + np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), + fp_wP2+np.sign(coef)*P1)) + mn_subgrad = np.sum(np.abs(mn_subgrad)) + if mn_subgrad <= self.tol: + converged = True + break + # end of outer loop + if not converged: + warnings.warn("Coordinate descent failed to converge. Increase" + " the number of iterations (currently {0})" + .format(self.max_iter), ConvergenceWarning) ####################################################################### # 5. postprocessing # diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index de7de90db967b..87cc8bea45f5b 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,6 +1,7 @@ import numpy as np from numpy.testing import assert_allclose import scipy as sp +from scipy import sparse from sklearn.linear_model.glm import ( Link, @@ -126,7 +127,7 @@ def test_glm_identiy_regression(): GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), GeneralizedHyperbolicSecand()) - for solver in ['irls', 'lbfgs', 'newton-cg']: + for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: for family in families: glm = GeneralizedLinearRegressor( alpha=0, family=family, fit_intercept=False, solver=solver) @@ -162,28 +163,31 @@ def test_normal_ridge(): rng = np.random.RandomState(0) alpha = 1.0 - # With more samples than features + # 1. 
With more samples than features n_samples, n_features, n_predict = 6, 5, 10 y = rng.randn(n_samples) X = rng.randn(n_samples, n_features) T = rng.randn(n_predict, n_features) # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, + solver='svd', normalize=False) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg']: + for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', - fit_intercept=True, solver=solver) + fit_intercept=True, tol=1e-6, + max_iter=100, solver=solver) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) assert_almost_equal(glm.intercept_, ridge.intercept_) assert_array_almost_equal(glm.predict(T), ridge.predict(T)) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + solver='svd', normalize=False) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, family='normal', link='identity', fit_intercept=False, solver='irls') glm.fit(X, y) @@ -192,28 +196,30 @@ def test_normal_ridge(): assert_almost_equal(glm.intercept_, ridge.intercept_) assert_array_almost_equal(glm.predict(T), ridge.predict(T)) - # With more features than samples + # 2. With more features than samples and sparse n_samples, n_features, n_predict = 5, 10, 10 y = rng.randn(n_samples) - X = rng.randn(n_samples, n_features) - T = rng.randn(n_predict, n_features) + X = sparse.csr_matrix(rng.randn(n_samples, n_features)) + T = sparse.csr_matrix(rng.randn(n_predict, n_features)) # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, + solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, family='normal', link='identity', fit_intercept=True, solver=solver) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, family='normal', link='identity', fit_intercept=False, solver='irls') glm.fit(X, y) @@ -240,9 +246,41 @@ def test_poisson_ridge(): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, family='poisson', - link='log', tol=1e-10) + s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 7, 'cd': 7} + for solver in 
['irls', 'lbfgs', 'newton-cg', 'cd']: + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, + fit_intercept=True, family='poisson', + link='log', tol=1e-7, + solver=solver, max_iter=200) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, + decimal=s_dec[solver]) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=s_dec[solver]) + + +def test_poisson_enet(): + """Test elastic net regression with poisson family and LogLink + + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0.5, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.03550978409 + # a 0.16936423283 + # b . + X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', tol=1e-7) glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.12889386979, decimal=7) - assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + assert_almost_equal(glm.intercept_, -0.03550978409, decimal=7) + assert_array_almost_equal(glm.coef_, [0.16936423283, 0.], decimal=7) From 91497a2abc4824cdcb72f88dc26c9fd347d54b0d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 24 Jan 2018 20:37:27 +0100 Subject: [PATCH 11/69] [WIP] Add Generalized Linear Models (#9405) * bug for sparse matrices for newton-cg solver, function grad_hess * reduce precision for solver newton-cg in test_poisson_ridge * remedy doctest issues in linear_model.rst for example of GeneralizedLinearRegressor * remove unused import of xrange from six --- doc/modules/linear_model.rst | 2 +- sklearn/linear_model/glm.py | 3 +-- sklearn/linear_model/tests/test_glm.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 834466e494a4a..1f0946e97b059 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -919,7 +919,7 @@ follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') - >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +SKIP + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, copy_X=True, family='poisson', fit_dispersion='chisqr', fit_intercept=True, l1_ratio=0, link='log', max_iter=100, diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 3de82c20f33cf..25f3ee1f52a2e 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -53,7 +53,6 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six -from ..externals.six.moves import xrange from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg @@ -1288,7 +1287,7 @@ def grad_hess(coef, X, y, weights, link): if P2.ndim == 1: hessian[np.diag_indices_from(hessian)] += P2 else: - hessian += P2 + hessian = hessian + P2 def Hs(s): ret = safe_sparse_dot(hessian, s) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 87cc8bea45f5b..c48c59ebd0eda 100644 --- 
a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -246,7 +246,7 @@ def test_poisson_ridge(): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 7, 'cd': 7} + s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 5, 'cd': 7} for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', From b9e5105ddb011a2a4efd74eeb3033ebb824fa5a8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 24 Jan 2018 21:44:10 +0100 Subject: [PATCH 12/69] [WIP] Add Generalized Linear Models (#9405) * bug in cd solver for sparse matrices * higer precision (smaller tol) in test_normal_ridge for sparse matrices * for each solver a separate precision (tol) in test_poisson_ridge --- sklearn/linear_model/glm.py | 2 +- sklearn/linear_model/tests/test_glm.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 25f3ee1f52a2e..b428ee7509d14 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1376,7 +1376,7 @@ def Hs(s): B[np.diag_indices_from(B)] += P2 else: coef_P2 = safe_sparse_dot(coef, P2) - B += P2 + B = B + P2 A = -score + coef_P2 # + d*(H+P2) but d=0 so far inner_iter = 0 while inner_iter < max_inner_iter: diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index c48c59ebd0eda..baad852dfb945 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -207,19 +207,20 @@ def test_normal_ridge(): solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, family='normal', link='identity', - fit_intercept=True, solver=solver) + fit_intercept=True, solver=solver, + max_iter=300) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, family='normal', link='identity', fit_intercept=False, solver='irls') glm.fit(X, y) @@ -247,11 +248,12 @@ def test_poisson_ridge(): X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 5, 'cd': 7} + s_tol = {'irls': 1e-8, 'lbfgs': 1e-7, 'newton-cg': 1e-7, 'cd': 1e-8} for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', - link='log', tol=1e-7, - solver=solver, max_iter=200) + link='log', tol=s_tol[solver], + solver=solver, max_iter=300) glm.fit(X, y) assert_almost_equal(glm.intercept_, -0.12889386979, decimal=s_dec[solver]) From e317422e9dd860c4ed5a3c6ac6191eb8e560c365 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 25 Jan 2018 21:44:04 +0100 Subject: [PATCH 13/69] [WIP] Add Generalized Linear Models 
(#9405) * improved documentation * additional option 'zero' for argument start_params * validation of sample_weight in function predict * input validation of estimate_phi * set default fit_dispersion=None * bug in estimate_phi because of weight rescaling * test for estimate_phi in normal ridge regression * extended tests for elastic net poisson --- sklearn/linear_model/glm.py | 116 ++++++++++++++++--------- sklearn/linear_model/tests/test_glm.py | 39 +++++++-- 2 files changed, 110 insertions(+), 45 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index b428ee7509d14..e5eda6108052c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -11,13 +11,13 @@ # TODO: deal with option self.copy_X # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. -# TODO: Add cross validation +# TODO: Add cross validation support # TODO: Should GeneralizedLinearRegressor inherit from LinearModel? # So far, it does not. # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? -# TODO: Negative values in P1 are not allowed so far. They could be used form -# group lasse. +# TODO: Negative values in P1 are not allowed so far. They could be used to +# for group lasso. # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -642,7 +642,7 @@ def _irls_step(X, W, P2, z): ------- coef: array, shape = (X.shape[1]) """ - # TODO: scipy.linalg.solve if faster, but ordinary least squares uses + # TODO: scipy.linalg.solve is faster, but ordinary least squares uses # scipy.linalg.lstsq. What is more appropriate? n_samples, n_features = X.shape if sparse.issparse(X): @@ -709,16 +709,20 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): (penalized) maximum likelihood which is equivalent to minimizing the deviance. - TODO: For `alpha` > 0, the feature matrix `X` is assumed to be - standardized. Call + For `alpha` > 0, the feature matrix `X` should be standardized in order to + penalize features equally strong. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - Otherwise, the strength of the penalty is different for the features. TODO: Estimation of the dispersion parameter phi. - TODO: Notes on weights and 'scaled' distributions. For Poisson, this means - to fit y = z/w with z=counts and w=exposure (time, money, persons, ...) - => y is a ratio with weights w. Same for other distributions. + If your target `y` is a ratio, you should also provide appropriate weights + `w`. As an example, consider Poission distributed counts `z` (integers) and + weights `w`=exposure (time, money, persons years, ...), then you fit + `y = z/w`, i.e. ``GeneralizedLinearModel(family='Poisson').fit(X, y, + sample_weight=w)``. You need the weights for the right mean, consider: + :math:`\bar(y) = \frac{\sum_i w_i y_i}{\sum_i w_i}`. + In this case one might say that y has a 'scaled' Poisson distributions. + The same holds for other distributions. Parameters ---------- @@ -800,8 +804,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): does not exit (first call to fit), option ``start_params`` sets the starting values for ``coef_`` and ``intercept_``. 
- start_params : None or array of shape (n_features, ) or 'least_squares'}, \ - optional (default=None) + start_params : {None, 'least_squares', 'zero'} or array of shape \ + (n_features, ) or }, optional (default=None) If an array of size n_features is supplied, use these as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. @@ -854,16 +858,18 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): n_iter_ : int Actual number of iterations of the solver. - Notes - ----- References ---------- - TODO + For the coordinate descent implementation: + .. [1] Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', - fit_dispersion='chisqr', solver='auto', max_iter=100, + fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params=None, selection='random', random_state=None, copy_X=True, check_input=True, verbose=0): @@ -1004,9 +1010,10 @@ def fit(self, X, y, sample_weight=None): if start_params is None: pass elif isinstance(start_params, six.string_types): - if start_params not in ['least_squares']: + if start_params not in ['least_squares', 'zero']: raise ValueError("The argument start_params must be None, " - "'least-squares' or an array of right length," + "'least-squares', 'zero' or an array of right" + " length," " got(start_params={0})".format(start_params)) else: start_params = np.atleast_1d(start_params) @@ -1129,6 +1136,7 @@ def fit(self, X, y, sample_weight=None): # deviance = sum(sample_weight * unit_deviance), # we rescale weights such that sum(weights) = 1 and this becomes # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights_sum = np.sum(weights) weights = weights/np.sum(weights) ####################################################################### @@ -1141,7 +1149,8 @@ def fit(self, X, y, sample_weight=None): coef = None if self.warm_start and hasattr(self, "coef_"): if self.fit_intercept: - coef = np.concatenate((self.intercept_, self.coef_)) + coef = np.concatenate((np.array([self.intercept_]), + self.coef_)) else: coef = self.coef_ elif self.start_params is None: @@ -1164,24 +1173,27 @@ def fit(self, X, y, sample_weight=None): else: # with L1 penalty, start with coef = 0 coef = np.zeros(n_features) - elif (isinstance(self.start_params, six.string_types) and - self.start_params == 'least_squares'): - if self.alpha == 0: - reg = LinearRegression(copy_X=True, fit_intercept=False) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ - elif self.l1_ratio <= 0.01: - # ElasticNet says l1_ratio <= 0.01 is not reliable, use Ridge - reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ - else: - # TODO: Does this make sense at all? 
- reg = ElasticNet(copy_X=True, fit_intercept=False, - alpha=self.alpha, l1_ratio=self.l1_ratio) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ + elif isinstance(self.start_params, six.string_types): + if self.start_params == 'zero': + coef = np.zeros(n_features) + elif self.start_params == 'least_squares': + if self.alpha == 0: + reg = LinearRegression(copy_X=True, fit_intercept=False) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + elif self.l1_ratio <= 0.01: + # ElasticNet says l1_ratio <= 0.01 is not reliable + # => use Ridge + reg = Ridge(copy_X=True, fit_intercept=False, + alpha=self.alpha) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + else: + # TODO: Does this make sense at all? + reg = ElasticNet(copy_X=True, fit_intercept=False, + alpha=self.alpha, l1_ratio=self.l1_ratio) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ else: coef = start_params @@ -1365,6 +1377,7 @@ def Hs(s): d = np.zeros_like(coef) # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) + # => active set of features for featurelist, see paper d = np.zeros_like(coef) # A = f'(w) + d*H(w) + (w+d)*P2 # B = H+P2 @@ -1508,7 +1521,8 @@ def Hs(s): self.coef_ = coef if self.fit_dispersion in ['chisqr', 'deviance']: - self.dispersion_ = self.estimate_phi(y, X, weights) + # attention because of rescaling of weights + self.dispersion_ = self.estimate_phi(y, X, weights)*weights_sum return self @@ -1544,9 +1558,23 @@ def predict(self, X, sample_weight=1): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ - # TODO: validation of sample_weight eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) + if sample_weight is None: + return mu + elif np.isscalar(sample_weight): + if sample_weight <= 0: + raise ValueError("Sample weight must be positive, " + "got (sample_weight={0})." + .format(sample_weight)) + else: + sample_weights = np.atleast_1d(sample_weight) + if sample_weight.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar.") + elif sample_weight.shape[0] != mu.shape[0]: + raise ValueError("Sample weights must have the same length as" + " X.shape[1].") + return mu*sample_weight def estimate_phi(self, y, X, sample_weight): @@ -1554,10 +1582,20 @@ def estimate_phi(self, y, X, sample_weight): Returns the estimate. """ check_is_fitted(self, "coef_") + _dtype = [np.float64, np.float32] + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=_dtype, y_numeric=True, multi_output=False) n_samples, n_features = X.shape eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ + n_features += 1 + if n_samples <= n_features: + raise ValueError("Estimation of dispersion parameter phi requires" + " more samples than features, got" + " samples=X.shape[0]={0} and" + " n_features=X.shape[1]+fit_intercept={1}." + .format(n_samples, n_features)) mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': chisq = np.sum(sample_weight*(y-mu)**2 / diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index baad852dfb945..ee90cd51c874d 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -164,7 +164,7 @@ def test_normal_ridge(): alpha = 1.0 # 1. 
With more samples than features - n_samples, n_features, n_predict = 6, 5, 10 + n_samples, n_features, n_predict = 10, 5, 10 y = rng.randn(n_samples) X = rng.randn(n_samples, n_features) T = rng.randn(n_predict, n_features) @@ -189,12 +189,16 @@ def test_normal_ridge(): ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, family='normal', link='identity', - fit_intercept=False, solver='irls') + fit_intercept=False, solver='irls', + fit_dispersion='chisqr') glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) assert_almost_equal(glm.intercept_, ridge.intercept_) assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + mu = glm.predict(X) + assert_almost_equal(glm.dispersion_, + np.sum((y-mu)**2/(n_samples-n_features))) # 2. With more features than samples and sparse n_samples, n_features, n_predict = 5, 10, 10 @@ -278,11 +282,34 @@ def test_poisson_enet(): # (Intercept) -0.03550978409 # a 0.16936423283 # b . + glmnet_intercept = -0.03550978409 + glmnet_coef = [0.16936423283, 0.] X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', tol=1e-7) + link='log', solver='cd', tol=1e-7) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + + # same for start_params='zero' with reduced precision + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', solver='cd', tol=1e-5, + start_params='zero') + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + # start_params='least_squares' with different alpha + glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, + family='poisson', + link='log', solver='cd', tol=1e-5, + start_params='zero') + glm.fit(X, y) + # warm start with original alpha and use of sparse matrices + glm.warm_start = True + glm.alpha = 1 + X = sparse.csr_matrix(X) glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.03550978409, decimal=7) - assert_array_almost_equal(glm.coef_, [0.16936423283, 0.], - decimal=7) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) From 9a9818441d605bf86547651997db81e969f41cdf Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 25 Jan 2018 22:59:46 +0100 Subject: [PATCH 14/69] [WIP] Add Generalized Linear Models (#9405) * new helper function _check_weights for validation of sample_weight * fix white space issue in doctest of linear_model.rst --- doc/modules/linear_model.rst | 8 ++--- sklearn/linear_model/glm.py | 59 +++++++++++++++++------------------- 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 1f0946e97b059..f7b0ca0cc7add 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -921,10 +921,10 @@ follows: >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, - copy_X=True, family='poisson', fit_dispersion='chisqr', - fit_intercept=True, l1_ratio=0, link='log', max_iter=100, - random_state=None, selection='random', solver='auto', - start_params=None, tol=0.0001, 
verbose=0, warm_start=False) + copy_X=True, family='poisson', fit_dispersion='chisqr', + fit_intercept=True, l1_ratio=0, link='log', max_iter=100, + random_state=None, selection='random', solver='auto', + start_params=None, tol=0.0001, verbose=0, warm_start=False) >>> reg.coef_ array([ 0.24630255, 0.43373521]) >>> reg.intercept_ #doctest: +ELLIPSIS diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index e5eda6108052c..138830ea431c9 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -59,6 +59,26 @@ from ..utils.validation import check_is_fitted, check_random_state +def _check_weights(sample_weight, n_samples): + if sample_weight is None: + weights = np.ones(n_samples) + elif np.isscalar(sample_weight): + if sample_weight < 0: + raise ValueError("Sample weights must be non-negative.") + weights = sample_weight*np.ones(n_samples) + else: + weights = np.atleast_1d(sample_weight) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != n_samples: + raise ValueError("Sample weights must have the same length as" + " y") + if not np.all(sample_weight >= 0): + raise ValueError("Sample weights must be non-negative.") + + return weights + + class Link(six.with_metaclass(ABCMeta)): """Abstract base class for Link funtions """ @@ -925,17 +945,7 @@ def fit(self, X, y, sample_weight=None): dtype=_dtype, y_numeric=True, multi_output=False) y = y.astype(np.float64) - if sample_weight is None: - weights = np.ones_like(y) - elif np.isscalar(sample_weight): - weights = sample_weight*np.ones_like(y) - else: - weights = np.atleast_1d(sample_weight) - if weights.ndim > 1: - raise ValueError("Sample weight must be 1D array or scalar") - elif weights.shape[0] != y.shape[0]: - raise ValueError("Sample weights must have the same length as" - " y") + weights = _check_weights(sample_weight, y.shape[0]) # 1.2 validate arguments of __init__ ################################## # Garantee that self._family_instance is an instance of class @@ -1544,7 +1554,7 @@ def linear_predictor(self, X): return safe_sparse_dot(X, self.coef_, dense_output=True) + self.intercept_ - def predict(self, X, sample_weight=1): + def predict(self, X, sample_weight=None): """Predict uing GLM with feature matrix X. If sample_weight is given, returns prediction*sample_weight. @@ -1558,26 +1568,13 @@ def predict(self, X, sample_weight=1): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ + weights = _check_weights(sample_weight, X.shape[0]) eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) - if sample_weight is None: - return mu - elif np.isscalar(sample_weight): - if sample_weight <= 0: - raise ValueError("Sample weight must be positive, " - "got (sample_weight={0})." - .format(sample_weight)) - else: - sample_weights = np.atleast_1d(sample_weight) - if sample_weight.ndim > 1: - raise ValueError("Sample weight must be 1D array or scalar.") - elif sample_weight.shape[0] != mu.shape[0]: - raise ValueError("Sample weights must have the same length as" - " X.shape[1].") - return mu*sample_weight + return mu*weights - def estimate_phi(self, y, X, sample_weight): + def estimate_phi(self, y, X, sample_weight=None): """Estimation of the dispersion parameter. Returns the estimate. 
""" @@ -1586,6 +1583,7 @@ def estimate_phi(self, y, X, sample_weight): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=_dtype, y_numeric=True, multi_output=False) n_samples, n_features = X.shape + weights = _check_weights(sample_weight, n_samples) eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ @@ -1640,10 +1638,7 @@ def score(self, X, y, sample_weight=None): # Note, default score defined in RegressorMixin is R^2 score. # TODO: make D^2 a score function in module metrics (and thereby get # input validation and so on) - if sample_weight is None: - weights = np.ones_like(y) - else: - weights = np.atleast_1d(sample_weight) + weights = _check_weights(sample_weight, y.shape[0]) mu = self.predict(X) dev = self._family_instance.deviance(y, mu, weights=weights) y_mean = np.average(y, weights=weights) From db9defe6b7637fe022034ca7f435f4fd37f6c118 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 08:33:59 +0100 Subject: [PATCH 15/69] [WIP] Add Generalized Linear Models (#9405) * fit_dispersion default=None also in docs. * improved docs. * fixed input validation of predict * fixed bug for sample_weight in estimate_phi --- sklearn/linear_model/glm.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 138830ea431c9..22a8be9e50828 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -754,7 +754,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): case, the design matrix X must have full column rank (no collinearities). - l1_ratio : float, optional (defaul=0) + l1_ratio : float, optional (default=0) The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a @@ -768,7 +768,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Note that n_features* = X.shape[1] = length of coef_ (intercept always excluded from counting). - P2 : None or array of shape (n_features*, n_features*) + P2 : None or array of shape (n_features*, n_features*), optional\ + (default=None) With this square matrix the L2 penalty is calculated as `w P2 w`. This gives a fine control over this penalty (Tikhonov regularization). @@ -781,20 +782,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ of class ExponentialDispersionModel, optional(default='normal') - the distributional assumption of the GLM. + the distributional assumption of the GLM, i.e. which loss function to + be minimized. link : {'identity', 'log'} or an instance of class Link, optional (default='identity') the link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul='chisqr') + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) method for estimation of the dispersion parameter phi. Whether to use the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ - optional (defaul='auto') + optional (default='auto') Algorithm to use in the optimization problem. - 'auto' sets 'irls' if l1_ratio equals 0, else 'cd'. 
@@ -830,11 +832,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. If 'least_squares' is set, the result of a least squares fit in the - link space (linear predictor) is taken. If ``None``, the start values - are calculated by setting mu to family.starting_mu(..) and one step of - irls. - This option only applies if ``warm_start=False`` or if fit is called - the first time (``self.coef_`` does not exist). + link space (linear predictor) is taken. + If 'zero' is set, all coefficients start with zero. + If ``None``, the start values are calculated by setting mu to + family.starting_mu(..) and one step of irls. + These options only apply if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). selection : str, optional (default='random') For the solver 'cd' (coordinate descent), the coordinates (features) @@ -1550,7 +1553,9 @@ def linear_predictor(self, X): Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") - X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) + X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype='numeric', copy=True, ensure_2d=True, + allow_nd=False) return safe_sparse_dot(X, self.coef_, dense_output=True) + self.intercept_ @@ -1568,9 +1573,10 @@ def predict(self, X, sample_weight=None): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ - weights = _check_weights(sample_weight, X.shape[0]) + # validation of X in linear_predictor eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) + weights = _check_weights(sample_weight, X.shape[0]) return mu*weights @@ -1596,11 +1602,11 @@ def estimate_phi(self, y, X, sample_weight=None): .format(n_samples, n_features)) mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': - chisq = np.sum(sample_weight*(y-mu)**2 / + chisq = np.sum(weights*(y-mu)**2 / self._family_instance.unit_variance(mu)) return chisq/(n_samples - n_features) elif self.fit_dispersion == 'deviance': - dev = self._family_instance.deviance(y, mu, sample_weight) + dev = self._family_instance.deviance(y, mu, weights) return dev/(n_samples - n_features) # Note: check_estimator(GeneralizedLinearRegressor) might raise From dc7fdd7f8f6a3a2276f5f96aa6d5dd6ad3ce853e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 08:41:24 +0100 Subject: [PATCH 16/69] [WIP] Add Generalized Linear Models (#9405) * improved docs --- sklearn/linear_model/tests/test_glm.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index ee90cd51c874d..0af837c9c73f3 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -177,7 +177,8 @@ def test_normal_ridge(): glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, tol=1e-6, - max_iter=100, solver=solver) + max_iter=100, solver=solver, + random_state=42) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) @@ -214,7 +215,7 @@ def test_normal_ridge(): glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, family='normal', link='identity', fit_intercept=True, solver=solver, - max_iter=300) + max_iter=300, random_state=42) glm.fit(X, y) assert_equal(glm.coef_.shape, 
(X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) @@ -257,7 +258,8 @@ def test_poisson_ridge(): glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', link='log', tol=s_tol[solver], - solver=solver, max_iter=300) + solver=solver, max_iter=300, + random_state=42) glm.fit(X, y) assert_almost_equal(glm.intercept_, -0.12889386979, decimal=s_dec[solver]) @@ -282,20 +284,23 @@ def test_poisson_enet(): # (Intercept) -0.03550978409 # a 0.16936423283 # b . + rand = 0 glmnet_intercept = -0.03550978409 glmnet_coef = [0.16936423283, 0.] X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', solver='cd', tol=1e-7) + link='log', solver='cd', tol=1e-7, + selection='random', random_state=42) glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) - # same for start_params='zero' with reduced precision + # same for start_params='zero' and selection='cyclic' + # with reduced precision glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', link='log', solver='cd', tol=1e-5, - start_params='zero') + selection='cyclic', start_params='zero') glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) From b11d06ba72865c14b0532c6d6c34d264a09d7ae4 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 16:57:45 +0100 Subject: [PATCH 17/69] [WIP] Add Generalized Linear Models (#9405) * fixed input validation of X in predict --- sklearn/linear_model/glm.py | 4 +++- sklearn/linear_model/tests/test_glm.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 22a8be9e50828..032ded86816dd 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1573,7 +1573,9 @@ def predict(self, X, sample_weight=None): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ - # validation of X in linear_predictor + X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype='numeric', copy=True, ensure_2d=True, + allow_nd=False) eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) weights = _check_weights(sample_weight, X.shape[0]) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 0af837c9c73f3..776edd8aeec46 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -284,7 +284,6 @@ def test_poisson_enet(): # (Intercept) -0.03550978409 # a 0.16936423283 # b . - rand = 0 glmnet_intercept = -0.03550978409 glmnet_coef = [0.16936423283, 0.] 
X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T From 9e6c01378a4cb245824bcb9429b3d566652af743 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 17:37:44 +0100 Subject: [PATCH 18/69] [WIP] Add Generalized Linear Models (#9405) * redundant line of code 'd = np.zeros_like(coef)' --- doc/modules/linear_model.rst | 2 +- sklearn/linear_model/glm.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index f7b0ca0cc7add..dcb35b6a5d941 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -921,7 +921,7 @@ follows: >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, - copy_X=True, family='poisson', fit_dispersion='chisqr', + copy_X=True, family='poisson', fit_dispersion=None, fit_intercept=True, l1_ratio=0, link='log', max_iter=100, random_state=None, selection='random', solver='auto', start_params=None, tol=0.0001, verbose=0, warm_start=False) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 032ded86816dd..8856af8ec698a 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -73,7 +73,7 @@ def _check_weights(sample_weight, n_samples): elif weights.shape[0] != n_samples: raise ValueError("Sample weights must have the same length as" " y") - if not np.all(sample_weight >= 0): + if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") return weights @@ -1132,7 +1132,7 @@ def fit(self, X, y, sample_weight=None): if sparse.issparse(P2): # TODO: check sparse P2 for non-negativeness # raise NotImplementedError("Check sparse P2 for " - # "non-negaitveness is not yet " + # "non-negativeness is not yet " # "implemented.") pass elif P2.ndim == 2: @@ -1391,7 +1391,6 @@ def Hs(s): # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) # => active set of features for featurelist, see paper - d = np.zeros_like(coef) # A = f'(w) + d*H(w) + (w+d)*P2 # B = H+P2 # Note: f'=-score and H=fisher are updated at the end of outer From bad0190a22623eae3a2f6dfcdb0fd8caee625111 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 27 Jan 2018 20:38:02 +0100 Subject: [PATCH 19/69] [WIP] Add Generalized Linear Models (#9405) * added test to compare to ElasticNet * deleted identical comment lines --- sklearn/linear_model/tests/test_glm.py | 29 +++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 776edd8aeec46..9990cafe2cbcf 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -12,7 +12,7 @@ GammaDistribution, InverseGaussianDistribution, GeneralizedHyperbolicSecand, GeneralizedLinearRegressor) -from sklearn.linear_model.ridge import Ridge +from sklearn.linear_model import ElasticNet, Ridge from sklearn.utils.testing import ( assert_equal, assert_almost_equal, @@ -267,14 +267,37 @@ def test_poisson_ridge(): decimal=s_dec[solver]) +def test_normal_enet(): + """Tet elastic net regression with normal/gaussian family""" + rng = np.random.RandomState(0) + alpha, l1_ratio = 0.3, 0.7 + n_samples, n_features = 20, 2 + X = rng.randn(n_samples, n_features).copy(order='F') + beta = rng.randn(n_features) + y = 2 + np.dot(X, beta) + rng.randn(n_samples) + + glm = 
GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, + family='normal', link='identity', + fit_intercept=True, tol=1e-7, + max_iter=100, selection='cyclic', + solver='cd', start_params='zero', + check_input=False) + glm.fit(X, y) + + enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, + normalize=False, tol=1e-7, copy_X=True) + enet.fit(X, y) + + assert_almost_equal(glm.intercept_, enet.intercept_) + assert_array_almost_equal(glm.coef_, enet.coef_) + + def test_poisson_enet(): """Test elastic net regression with poisson family and LogLink Compare to R's glmnet""" # library("glmnet") # options(digits=10) - # library("glmnet") - # options(digits=10) # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) # x <- data.matrix(df[,c("a", "b")]) # y <- df$y From 48137d86079c9a8efd15d57e719e3ee35f1644c9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 28 Jan 2018 11:52:19 +0100 Subject: [PATCH 20/69] [WIP] Add Generalized Linear Models (#9405) * increased precision in test_normal_enet --- sklearn/linear_model/tests/test_glm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 9990cafe2cbcf..6f8bdd3a72f40 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -278,18 +278,18 @@ def test_normal_enet(): glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, family='normal', link='identity', - fit_intercept=True, tol=1e-7, + fit_intercept=True, tol=1e-8, max_iter=100, selection='cyclic', solver='cd', start_params='zero', check_input=False) glm.fit(X, y) enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, - normalize=False, tol=1e-7, copy_X=True) + normalize=False, tol=1e-8, copy_X=True) enet.fit(X, y) - assert_almost_equal(glm.intercept_, enet.intercept_) - assert_array_almost_equal(glm.coef_, enet.coef_) + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) def test_poisson_enet(): From 2c2a077a2e8c57bdf3c945678526d6efdb5763e9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 28 Jan 2018 12:42:03 +0100 Subject: [PATCH 21/69] [WIP] Add Generalized Linear Models (#9405) * better doc for heavy tailed distributions --- doc/modules/linear_model.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index dcb35b6a5d941..5cb8e54afbb06 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -905,7 +905,8 @@ are the following: * If the target values are positive valued and skewed, you might try a Gamma deviance. - * If the target values seem to be heavy tailed, you might try an Inverse Gaussian deviance (or even higher variance power of the Tweedie family). + * If the target values seem to be heavier tailed than a Gamma distribution, you might try an Inverse Gaussian deviance (or even higher variance powers of the Tweedie family). + Keep in mind that the mean is not a good measure for very heavy tailed distributions, cf. extreme value theory. 
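A minimal, illustrative sketch of this rule of thumb (not part of the patch itself; it only assumes the ``GeneralizedLinearRegressor`` API introduced in this series, with made-up data)::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(100, 2))
    lam = np.exp(1.0 + X @ np.array([0.5, -0.2]))

    # counts or frequencies -> Poisson deviance with log-link
    y_counts = rng.poisson(lam)
    GeneralizedLinearRegressor(family='poisson', link='log').fit(X, y_counts)

    # positive, skewed targets -> Gamma deviance with log-link
    y_skewed = rng.gamma(shape=2.0, scale=lam / 2.0)
    GeneralizedLinearRegressor(family='gamma', link='log').fit(X, y_skewed)

    # still heavier tails -> Inverse Gaussian deviance (or a higher Tweedie power)
    GeneralizedLinearRegressor(family='inverse.gaussian', link='log').fit(X, y_skewed)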
Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, From 15931c3148b68c47aa3c3c19983525ae758a0981 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 28 Jan 2018 15:18:26 +0100 Subject: [PATCH 22/69] [WIP] Add Generalized Linear Models (#9405) * improved input validation and testing of them --- sklearn/linear_model/glm.py | 24 +++- sklearn/linear_model/tests/test_glm.py | 174 ++++++++++++++++++++++++- 2 files changed, 189 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 8856af8ec698a..33df6b4c9b850 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -63,11 +63,14 @@ def _check_weights(sample_weight, n_samples): if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): - if sample_weight < 0: + if sample_weight <= 0: raise ValueError("Sample weights must be non-negative.") weights = sample_weight*np.ones(n_samples) else: - weights = np.atleast_1d(sample_weight) + _dtype = [np.float64, np.float32] + weights = check_array(sample_weight, accept_sparse='csr', + force_all_finite=True, ensure_2d=False, + dtype=_dtype) if weights.ndim > 1: raise ValueError("Sample weight must be 1D array or scalar") elif weights.shape[0] != n_samples: @@ -75,6 +78,9 @@ def _check_weights(sample_weight, n_samples): " y") if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") + elif not np.sum(weights) > 0: + raise ValueError("Sample weights must have at least one positive " + "element.") return weights @@ -1010,10 +1016,12 @@ def fit(self, X, y, sample_weight=None): "with L1 penalties, which are included with " "(alpha={1}) and (l1_ratio={2})." 
.format(solver, self.alpha, self.l1_ratio)) - if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: - raise ValueError("Maximum number of iteration must be positive;" + if (not isinstance(self.max_iter, six.integer_types) + or self.max_iter <= 0): + raise ValueError("Maximum number of iteration must be a positive " + "integer;" " got (max_iter={0!r})".format(self.max_iter)) - if not isinstance(self.tol, numbers.Number) or self.tol < 0: + if not isinstance(self.tol, numbers.Number) or self.tol <= 0: raise ValueError("Tolerance for stopping criteria must be " "positive; got (tol={0!r})".format(self.tol)) if not isinstance(self.warm_start, bool): @@ -1029,7 +1037,9 @@ def fit(self, X, y, sample_weight=None): " length," " got(start_params={0})".format(start_params)) else: - start_params = np.atleast_1d(start_params) + start_params = check_array(start_params, accept_sparse='csr', + force_all_finite=True, ensure_2d=False, + dtype=_dtype, copy=True) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or (start_params.ndim != 1)): raise ValueError("Start values for parameters must have the" @@ -1160,7 +1170,7 @@ def fit(self, X, y, sample_weight=None): # set start values for coef coef = None - if self.warm_start and hasattr(self, "coef_"): + if self.warm_start and hasattr(self, 'coef_'): if self.fit_intercept: coef = np.concatenate((np.array([self.intercept_]), self.coef_)) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 6f8bdd3a72f40..1abbcf0540e28 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -5,7 +5,7 @@ from sklearn.linear_model.glm import ( Link, - # IdentityLink, + IdentityLink, LogLink, TweedieDistribution, NormalDistribution, PoissonDistribution, @@ -16,7 +16,8 @@ from sklearn.utils.testing import ( assert_equal, assert_almost_equal, - assert_array_equal, assert_array_almost_equal) + assert_array_equal, assert_array_almost_equal, + assert_raises) def test_link_properties(): @@ -102,6 +103,34 @@ def f(coef): assert_allclose(fisher, approx, rtol=1e-3) +def test_sample_weights_validation(): + """Test the raised errors in the validation of sample_weight""" + # 1. scalar value but not positive + X = [[1]] + y = [1] + weights = 0 + glm = GeneralizedLinearRegressor(fit_intercept=False) + assert_raises(ValueError, glm.fit, X, y, weights) + + # 2. 2d array + weights = [[0]] + assert_raises(ValueError, glm.fit, X, y, weights) + + # 3. 1d but wrong length + weights = [1, 0] + assert_raises(ValueError, glm.fit, X, y, weights) + + # 4. 1d but only zeros (sum not greater than 0) + weights = [0, 0] + X = [[0], [1]] + y = [1, 2] + assert_raises(ValueError, glm.fit, X, y, weights) + + # 5. 
1d but weith a negative value + weights = [2, -1] + assert_raises(ValueError, glm.fit, X, y, weights) + + def test_glm_family_argument(): """Test GLM family argument set as string """ @@ -115,6 +144,147 @@ def test_glm_family_argument(): alpha=0).fit(X, y) assert_equal(type(glm._family_instance), type(fam)) + glm = GeneralizedLinearRegressor(family='not a family', + fit_intercept=False) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_link_argument(): + """Test GLM link argument set as string + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for (l, link) in [('identity', IdentityLink()), + ('log', LogLink())]: + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + link=l).fit(X, y) + assert_equal(type(glm._link_instance), type(link)) + + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + link='not a link') + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_alpha_argument(): + """Test GLM alpha argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for alpha in ['not a number', -4.2]: + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + alpha=alpha) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_l1_ratio_argument(): + """Test GLM l1_ratio argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for l1_ratio in ['not a number', -4.2, 1.1, [1]]: + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + l1_ratio=l1_ratio) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_fit_intercept_argument(): + """Test GLM fit_intercept argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for fit_intercept in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_solver_argument(): + """Test GLM solver argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for solver in ['not a solver', 1, [1]]: + glm = GeneralizedLinearRegressor(solver=solver) + assert_raises(ValueError, glm.fit, X, y) + + # solver not suitable for L1 penalty + for solver in ['irls', 'lbfgs', 'newton-cg']: + glm = GeneralizedLinearRegressor(solver=solver, alpha=1, l1_ratio=0.1) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_max_iter_argument(): + """Test GLM max_iter argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for max_iter in ['not a number', 0, -1, 5.5, [1]]: + glm = GeneralizedLinearRegressor(max_iter=max_iter) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_tol_argument(): + """Test GLM tol argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for tol in ['not a number', 0, -1.0, [1e-3]]: + glm = GeneralizedLinearRegressor(tol=tol) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_warm_start_argument(): + """Test GLM warm_start argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for warm_start in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(warm_start=warm_start) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_start_params_argument(): + """Test GLM start_params argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for start_params in ['not a start_params', ['zero'], [0, 0, 0], + [[0, 0]], ['a', 'b']]: + glm = GeneralizedLinearRegressor(start_params=start_params) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_selection_argument(): + """Test GLM selection argument + """ + y = np.array([1, 2]) + X = np.array([[1], 
[1]]) + for selection in ['not a selection', 1, 0, ['cyclic']]: + glm = GeneralizedLinearRegressor(selection=selection) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_check_input_argument(): + """Test GLM check_input argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for check_input in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(check_input=check_input) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_random_state_argument(): + """Test GLM random_state argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for random_state in ['a string', 0.5, [0]]: + glm = GeneralizedLinearRegressor(random_state=random_state) + assert_raises(ValueError, glm.fit, X, y) + + +# TODO: check P1 and P2 +# TODO: check additional validations if check_input == True def test_glm_identiy_regression(): """Test GLM regression with identity link on a simple dataset From feedba379c2a8f53a7a9792e33041f29da7a4c95 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 30 Mar 2018 18:50:06 +0200 Subject: [PATCH 23/69] [MRG] Add Generalized Linear Models (#9405) * improved input validation and testing of P1 * test case for validation of argument P2 * test case for validation of argument copy_X --- sklearn/linear_model/glm.py | 3 ++ sklearn/linear_model/tests/test_glm.py | 48 +++++++++++++++++++++----- 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 33df6b4c9b850..eae4a56ea1d95 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1065,6 +1065,9 @@ def fit(self, X, y, sample_weight=None): P1 = np.ones(X.shape[1]) else: P1 = np.atleast_1d(np.copy(self.P1)) + if P1.dtype.kind not in ['b', 'i', 'u', 'f']: + raise ValueError("P1 must be a numeric value; " + "got (dtype={0}).".format(P1.dtype)) if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): raise ValueError("P1 must be either None or an 1D array with " "the length of X.shape[1]; " diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 1abbcf0540e28..edf579a416973 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -187,6 +187,27 @@ def test_glm_l1_ratio_argument(): assert_raises(ValueError, glm.fit, X, y) +def test_glm_P1_argument(): + """Test GLM P1 arguments + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for P1 in [['a string', 'a string'], [1, [2]], [1, 2, 3]]: + glm = GeneralizedLinearRegressor(P1=P1) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_P2_argument(): + """Test GLM P2 arguments + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for P2 in [np.full((2, 2), 'a string'), [[1, [2]], [3, 4]], [1, 2, 3], + [[1, 2]], [[1], [2]]]: + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False) + assert_raises(ValueError, glm.fit, X, y) + + def test_glm_fit_intercept_argument(): """Test GLM fit_intercept argument """ @@ -263,27 +284,36 @@ def test_glm_selection_argument(): assert_raises(ValueError, glm.fit, X, y) -def test_glm_check_input_argument(): - """Test GLM check_input argument +def test_glm_random_state_argument(): + """Test GLM random_state argument """ y = np.array([1, 2]) X = np.array([[1], [1]]) - for check_input in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(check_input=check_input) + for random_state in ['a string', 0.5, [0]]: + glm = GeneralizedLinearRegressor(random_state=random_state) assert_raises(ValueError, glm.fit, X, y) -def 
test_glm_random_state_argument(): - """Test GLM random_state argument +def test_glm_copy_X_argument(): + """Test GLM copy_X arguments """ y = np.array([1, 2]) X = np.array([[1], [1]]) - for random_state in ['a string', 0.5, [0]]: - glm = GeneralizedLinearRegressor(random_state=random_state) + for copy_X in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(copy_X=copy_X) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_check_input_argument(): + """Test GLM check_input argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for check_input in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(check_input=check_input) assert_raises(ValueError, glm.fit, X, y) -# TODO: check P1 and P2 # TODO: check additional validations if check_input == True def test_glm_identiy_regression(): From 6fdfb47428571b4c8e89046a7b1f481711832f61 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 30 Mar 2018 19:41:09 +0200 Subject: [PATCH 24/69] [MRG] Add Generalized Linear Models (#9405) * fix doctest failure in example of linear_model.rst * fix dtype issue in test_glm_P2_argument --- doc/modules/linear_model.rst | 4 ++-- sklearn/linear_model/tests/test_glm.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 5cb8e54afbb06..9f85da771c6f1 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -926,8 +926,8 @@ follows: fit_intercept=True, l1_ratio=0, link='log', max_iter=100, random_state=None, selection='random', solver='auto', start_params=None, tol=0.0001, verbose=0, warm_start=False) - >>> reg.coef_ - array([ 0.24630255, 0.43373521]) + >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE + array([0.24630255, 0.43373521]) >>> reg.intercept_ #doctest: +ELLIPSIS -0.76383575... diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index edf579a416973..c5d132d35bdb9 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -202,8 +202,8 @@ def test_glm_P2_argument(): """ y = np.array([1, 2]) X = np.array([[1], [1]]) - for P2 in [np.full((2, 2), 'a string'), [[1, [2]], [3, 4]], [1, 2, 3], - [[1, 2]], [[1], [2]]]: + for P2 in [np.full((2, 2), 'a string', dtype=np.dtype(' Date: Sun, 5 Aug 2018 14:48:33 +0200 Subject: [PATCH 25/69] [MRG] Add Generalized Linear Models (#9405) * fix typos in doc --- doc/modules/linear_model.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 9f85da771c6f1..a204ccb080cc9 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -934,7 +934,7 @@ follows: Mathematical formulation ------------------------ -In the unpenalized case, the assumptions are the folowing: +In the unpenalized case, the assumptions are the following: * The target values :math:`y_i` are realizations of random variables :math:`Y_i \overset{i.i.d}{\sim} \mathrm{EDM}(\mu_i, \frac{\phi}{s_i})` @@ -951,7 +951,7 @@ same as specifying a unit variance function (they are one-to-one). Including penalties helps to avoid overfitting or, in case of L1 penalty, to obtain sparse solutions. But there are also other motivations to include them, -e.g. accounting fo dependence structure of :math:`y`. +e.g. accounting for the dependence structure of :math:`y`. The objective function, which is independent of :math:`\phi`, is minimized with respect to the coefficients :math:`w`. 
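To make the minimized quantity concrete, the following worked sketch (illustrative only, not part of any patch in this series) evaluates the penalized objective by hand for the small Poisson example from the documentation: the deviance scaled by 1/(2*sum(s)) plus the elastic-net penalty with P1 = P2 = identity. Only ``GeneralizedLinearRegressor`` comes from this branch; the helper ``objective`` below is written purely for illustration::

    import numpy as np
    from scipy.special import xlogy
    from sklearn.linear_model import GeneralizedLinearRegressor

    X = np.array([[0, 0], [0, 1], [2, 2]])
    y = np.array([0, 1, 2])
    alpha, l1_ratio = 0.5, 0  # same settings as in the documentation example
    reg = GeneralizedLinearRegressor(alpha=alpha, family='poisson', link='log').fit(X, y)

    def objective(coef, intercept):
        mu = np.exp(intercept + X @ coef)                # inverse of the log-link
        dev = 2 * np.sum(xlogy(y, y / mu) - y + mu)      # Poisson deviance with s_i = 1
        penalty = alpha * (l1_ratio * np.sum(np.abs(coef))
                           + 0.5 * (1 - l1_ratio) * coef @ coef)
        return dev / (2 * X.shape[0]) + penalty

    # the fitted (unpenalized intercept, penalized coef) should approximately minimize it
    print(objective(reg.coef_, reg.intercept_))
    print(objective(reg.coef_ + 0.1, reg.intercept_))    # expected to be larger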
From 809e3a2747e451fc97b9a462cf682110d59fac25 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 26 Aug 2018 20:41:25 +0200 Subject: [PATCH 26/69] Remove test_glm_P2_argument --- sklearn/linear_model/tests/test_glm.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index c5d132d35bdb9..82b8ec2435543 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -197,15 +197,15 @@ def test_glm_P1_argument(): assert_raises(ValueError, glm.fit, X, y) -def test_glm_P2_argument(): - """Test GLM P2 arguments - """ - y = np.array([1, 2]) - X = np.array([[1], [1]]) - for P2 in [np.full((2, 2), 'a string', dtype=np.dtype(' Date: Thu, 30 Aug 2018 19:33:27 +0200 Subject: [PATCH 27/69] Filter out DeprecationWarning in old versions of scipy.sparse.linalg.spsolve about usage of umfpack --- sklearn/linear_model/tests/test_glm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 82b8ec2435543..e4be75ddb7a64 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -355,6 +355,7 @@ def test_glm_log_regression(): assert_array_almost_equal(res.coef_, coef) +@pytest.mark.filterwarnings('ignore:DeprecationWarning') def test_normal_ridge(): """Test ridge regression for Normal distributions From 46df5b6555938dc7c9acac67929b0633ea1354a3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 20:40:08 +0200 Subject: [PATCH 28/69] import pytest --- sklearn/linear_model/tests/test_glm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index e4be75ddb7a64..dfa205407a193 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,5 +1,6 @@ import numpy as np from numpy.testing import assert_allclose +import pytest import scipy as sp from scipy import sparse From 21f2136c083b4057868b7886880f356e58703611 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 21:17:53 +0200 Subject: [PATCH 29/69] Document arguments of abstact methods --- sklearn/linear_model/glm.py | 125 +++++++++++++++++++++++++++++++++--- 1 file changed, 115 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index eae4a56ea1d95..825ee6bfe8c45 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -93,12 +93,22 @@ class Link(six.with_metaclass(ABCMeta)): def link(self, mu): """The link function g(mu) with argument mu=E[Y] returns the linear predictor. + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the predicted mean. """ raise NotImplementedError @abstractmethod def derivative(self, mu): """Derivative of the link g'(mu). + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the predicted mean. """ raise NotImplementedError @@ -106,18 +116,33 @@ def derivative(self, mu): def inverse(self, lin_pred): """The inverse link function h(lin_pred) with the linear predictor as argument returns mu=E[Y]. + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (predicted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative(self, lin_pred): """Derivative of the inverse link function h'(lin_pred). 
+ + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (predicted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative2(self, lin_pred): """Second derivative of the inverse link function h''(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (predicted) linear predictor. """ raise NotImplementedError @@ -236,6 +261,11 @@ def include_upper_bound(self): def in_y_range(self, x): """Returns true if `x` is in the valid range of Y~EDM. + + Parameters + ---------- + x : array, shape (n_samples,) + Target values. """ if self.include_lower_bound: if self.include_upper_bound: @@ -263,12 +293,22 @@ def unit_variance(self, mu): \partial\mu^2}}\big|_{y=\mu} See also :func:`variance`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. """ raise NotImplementedError() @abstractmethod def unit_variance_derivative(self, mu): r"""The derivative of the unit variance w.r.t. `mu`, :math:`v'(\mu)`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Target values. """ raise NotImplementedError() @@ -276,6 +316,17 @@ def variance(self, mu, phi=1, weights=1): r"""The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return phi/weights * self.unit_variance(mu) @@ -284,6 +335,17 @@ def variance_derivative(self, mu, phi=1, weights=1): :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return phi/weights * self.unit_variance_derivative(mu) @@ -293,6 +355,14 @@ def unit_deviance(self, y, mu): In terms of the log-likelihood it is given by :math:`d(y,\mu) = -2\phi\cdot \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. """ raise NotImplementedError() @@ -301,9 +371,13 @@ def unit_deviance_derivative(self, y, mu): :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. - Returns - ------- - derivative: array, shape = (n_samples,) + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. """ return -2*(y-mu)/self.unit_variance(mu) @@ -313,6 +387,17 @@ def deviance(self, y, mu, weights=1): In terms of the likelihood it is :math:`D = -2\phi\cdot \left(loglike(y,\mu,\frac{phi}{s}) - loglike(y,y,\frac{phi}{s})\right)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return np.sum(weights*self.unit_deviance(y, mu)) @@ -326,6 +411,17 @@ def _deviance(self, coef, X, y, weights, link): def deviance_derivative(self, y, mu, weights=1): """The derivative w.r.t. 
`mu` of the deviance. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return weights*self.unit_deviance_derivative(y, mu) @@ -464,7 +560,16 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): return eta, mu, score, fisher def starting_mu(self, y, weights=1): - """Starting values for the mean mu_i in (unpenalized) IRLS.""" + """Starting values for the mean mu_i in (unpenalized) IRLS. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ return ((weights*y+np.mean(weights*y)) / (2.*np.sum(np.ones_like(y)*weights))) @@ -656,12 +761,12 @@ def _irls_step(X, W, P2, z): X : numpy array or sparse matrix of shape (n_samples, n_features) Training data (with intercept included if present) - W : numpy array of shape (n_samples, ) + W : numpy array of shape (n_samples,) P2 : numpy array or sparse matrix of shape (n_features, n_features) The l2-penalty matrix or vector (=diagonal matrix) - z : numpy array of shape (n_samples, ) + z : numpy array of shape (n_samples,) Working observations Returns @@ -927,12 +1032,12 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- X : numpy array or sparse matrix of shape (n_samples, n_features) - Training data + Training data. - y : numpy array of shape (n_samples, ) - Target values + y : numpy array of shape (n_samples,) + Target values. - sample_weight : array of shape (n_samples, ) or None,\ + sample_weight : array of shape (n_samples,) or None,\ optinal (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has From 1faedf87eecfd2c6a668e2aec6e28a9fff8780ec Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 21:18:51 +0200 Subject: [PATCH 30/69] Pytest filter warnings use two colons --- sklearn/linear_model/tests/test_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index dfa205407a193..1ac5ccd4d3d5c 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -356,7 +356,7 @@ def test_glm_log_regression(): assert_array_almost_equal(res.coef_, coef) -@pytest.mark.filterwarnings('ignore:DeprecationWarning') +@pytest.mark.filterwarnings('ignore::DeprecationWarning') def test_normal_ridge(): """Test ridge regression for Normal distributions From 992f9819838336a9372a7cadecc53eeafef439ff Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 22:07:33 +0200 Subject: [PATCH 31/69] Improve documentation of arguments that were so far undocumented --- sklearn/linear_model/glm.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 825ee6bfe8c45..a0d0266fa0efe 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -579,6 +579,17 @@ class TweedieDistribution(ExponentialDispersionModel): They have :math:`\mu=\mathrm{E}[Y]` and :math:`\mathrm{Var}[Y] \propto \mu^power. 
+ Special cases are: + + ===== ================ + Power Distribution + ===== ================ + 0 Normal + 1 Poisson + (0,1) Compound Poisson + 2 Gamma + 3 Inverse Gaussian + Attributes ---------- power : float @@ -586,6 +597,12 @@ class TweedieDistribution(ExponentialDispersionModel): :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): + """ + Parameters + ---------- + power : float (default=0) + Power of (of mu) of the variance function. + """ self.power = power self._upper_bound = np.Inf self._include_upper_bound = False @@ -623,6 +640,9 @@ def __init__(self, power=0): # Positive Stable self._lower_bound = 0 self._include_lower_bound = False + else: + raise ValueError('The power must be a float, i.e. real number, ' + 'got (power={})'.format(power)) @property def power(self): @@ -653,12 +673,22 @@ def include_upper_bound(self): def unit_variance(self, mu): """The unit variance of a Tweedie distribution is v(mu)=mu**power. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. """ return np.power(mu, self.power) def unit_variance_derivative(self, mu): """The derivative of the unit variance of a Tweedie distribution is v(mu)=power*mu**(power-1). + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. """ return self.power*np.power(mu, self.power-1) @@ -680,9 +710,6 @@ def unit_deviance(self, y, mu): return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) - def likelihood(self, y, X, w, phi, weights=1): - raise NotImplementedError('This function is not (yet) implemented.') - class NormalDistribution(TweedieDistribution): """Class for the Normal (aka Gaussian) distribution""" From 06b8451ea109040371615e9e7baaa8ff505197f0 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 00:26:39 +0200 Subject: [PATCH 32/69] Further improve documentation of arguments --- sklearn/linear_model/glm.py | 43 +++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index a0d0266fa0efe..9688b1e0c9e5c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -601,7 +601,7 @@ def __init__(self, power=0): Parameters ---------- power : float (default=0) - Power of (of mu) of the variance function. + Variance power of the `unit_variance` function. """ self.power = power self._upper_bound = np.Inf @@ -798,7 +798,7 @@ def _irls_step(X, W, P2, z): Returns ------- - coef: array, shape = (X.shape[1]) + coef: array, shape (X.shape[1]) """ # TODO: scipy.linalg.solve is faster, but ordinary least squares uses # scipy.linalg.lstsq. What is more appropriate? @@ -898,7 +898,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : None or array of shape (n_features*, ), optional\ + P1 : None or array of shape (n_features*,), optional\ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The @@ -1007,7 +1007,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Attributes ---------- - coef_ : array, shape (n_features, ) + coef_ : array, shape (n_features,) Estimated coefficients for the linear predictor (X*coef_) in the GLM. 
intercept_ : float @@ -1679,7 +1679,7 @@ def Hs(s): if self.fit_dispersion in ['chisqr', 'deviance']: # attention because of rescaling of weights - self.dispersion_ = self.estimate_phi(y, X, weights)*weights_sum + self.dispersion_ = self.estimate_phi(X, y, weights)*weights_sum return self @@ -1688,12 +1688,12 @@ def linear_predictor(self, X): Parameters ---------- - X : numpy array or sparse matrix of shape [n_samples,n_features] + X : numpy array or sparse matrix, shape (n_samples, n_features) Samples. Returns ------- - C : array, shape = (n_samples) + C : array, shape (n_samples) Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") @@ -1709,12 +1709,15 @@ def predict(self, X, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix of shape [n_samples,n_features] + X : numpy array or sparse matrix, shape (n_samples, n_features) Samples. + sample_weight : array of shape (n_samples,) or None , \ + (default=None) + Returns ------- - C : array, shape = (n_samples) + C : array, shape (n_samples,) Returns predicted values times sample_weight. """ X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], @@ -1726,9 +1729,21 @@ def predict(self, X, sample_weight=None): return mu*weights - def estimate_phi(self, y, X, sample_weight=None): - """Estimation of the dispersion parameter. + def estimate_phi(self, X, y, sample_weight=None): + """Estimation of the dispersion parameter phi. Returns the estimate. + + Parameters + ---------- + X : numpy array or sparse matrix of shape (n_samples, n_features) + Training data. + + y : numpy array, shape (n_samples,) + Target values. + + sample_weight : array of shape (n_samples,) or None,\ + optinal (default=None) + Sample weights. """ check_is_fitted(self, "coef_") _dtype = [np.float64, np.float32] @@ -1773,13 +1788,13 @@ def score(self, X, y, sample_weight=None): Parameters ---------- - X : array-like, shape = (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Test samples - y : array-like of shape = (n_samples) + y : array-like, shape (n_samples,) True valeus for X. - sample_weight : array-like, shape = (n_samples), optional + sample_weight : array-like, shape = (n_samples,), optional Sample weights. Returns From c93f60d9e98a5be0d493d513e37c3c9de5167542 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 08:25:01 +0200 Subject: [PATCH 33/69] Remove parameters docstring for __init__ --- sklearn/linear_model/glm.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 9688b1e0c9e5c..021927b598822 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -597,12 +597,6 @@ class TweedieDistribution(ExponentialDispersionModel): :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): - """ - Parameters - ---------- - power : float (default=0) - Variance power of the `unit_variance` function. 
- """ self.power = power self._upper_bound = np.Inf self._include_upper_bound = False From 66ec63b5157026f8541e2761c29ef3225d89a44c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 19:24:23 +0200 Subject: [PATCH 34/69] Fix typos in docstring of TweedieDistribution --- sklearn/linear_model/glm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 021927b598822..65abb42b043d5 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -577,7 +577,7 @@ def starting_mu(self, y, weights=1): class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. They have :math:`\mu=\mathrm{E}[Y]` and - :math:`\mathrm{Var}[Y] \propto \mu^power. + :math:`\mathrm{Var}[Y] \propto \mu^power`. Special cases are: @@ -593,7 +593,7 @@ class TweedieDistribution(ExponentialDispersionModel): Attributes ---------- power : float - The variance power of the unit_variance + The variance power of the `unit_variance` :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): From 53c69702c790223d3940cd650b9d52a61e39b244 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 22:20:43 +0200 Subject: [PATCH 35/69] Change docstring section of TweedieDistribution from Attributes to Parameters --- sklearn/linear_model/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 65abb42b043d5..fcb6c9754b826 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -590,7 +590,7 @@ class TweedieDistribution(ExponentialDispersionModel): 2 Gamma 3 Inverse Gaussian - Attributes + Parameters ---------- power : float The variance power of the `unit_variance` From 87d5ba38c5663224edd7ec10bc7efdf27172e7f8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 7 Oct 2018 19:43:42 +0200 Subject: [PATCH 36/69] Minor doc improvements of GeneralizedLinearRegressor --- sklearn/linear_model/glm.py | 87 +++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index fcb6c9754b826..01d5420773ffc 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -6,17 +6,16 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -# TODO: Write more tests -# TODO: Write examples and more docu -# TODO: deal with option self.copy_X +# TODO: Write examples +# TODO: Make option self.copy_X more meaningfull than just for start values. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. -# TODO: Add cross validation support +# TODO: Add cross validation support? # TODO: Should GeneralizedLinearRegressor inherit from LinearModel? # So far, it does not. # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? -# TODO: Negative values in P1 are not allowed so far. They could be used to +# TODO: Negative values in P1 are not allowed so far. They could be used # for group lasso. # Design Decisions: @@ -26,7 +25,7 @@ # regressor, Bernoulli/Binomial => classifier. # Solution: GeneralizedLinearRegressor since this is the focus. 
# - Allow for finer control of penalty terms: -# L1: ||P1*w||_1 with P1*w a componentwise product, this allows to exclude +# L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude # factors from the L1 penalty. # L2: w*P2*w with P2 a (demi-) positive definite matrix, e.g. P2 could be # a 1st or 2nd order difference matrix (compare B-spline penalties and @@ -322,7 +321,7 @@ def variance(self, mu, phi=1, weights=1): mu : array, shape (n_samples,) Predicted mean. - phi : float + phi : float (default=1) Dispersion parameter. weights : array, shape (n_samples,) (default=1) @@ -592,7 +591,7 @@ class TweedieDistribution(ExponentialDispersionModel): Parameters ---------- - power : float + power : float (default=0) The variance power of the `unit_variance` :math:`v(\mu) = \mu^{power}`. """ @@ -779,22 +778,22 @@ def _irls_step(X, W, P2, z): Parameters ---------- - X : numpy array or sparse matrix of shape (n_samples, n_features) + X : {numpy array, sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present) - W : numpy array of shape (n_samples,) + W : numpy array, shape (n_samples,) - P2 : numpy array or sparse matrix of shape (n_features, n_features) - The l2-penalty matrix or vector (=diagonal matrix) + P2 : {numpy array, sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix) - z : numpy array of shape (n_samples,) + z : numpy array, shape (n_samples,) Working observations Returns ------- coef: array, shape (X.shape[1]) """ - # TODO: scipy.linalg.solve is faster, but ordinary least squares uses + # TODO: scipy.linalg.solve seems faster, but ordinary least squares uses # scipy.linalg.lstsq. What is more appropriate? n_samples, n_features = X.shape if sparse.issparse(X): @@ -892,19 +891,20 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : None or array of shape (n_features*,), optional\ + P1 : {None, array-like}, shape (n_features*,), optional\ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The - default value ``None`` is the same as an array of ones. + default value ``None`` is the same as a 1d array of ones. Note that n_features* = X.shape[1] = length of coef_ (intercept always excluded from counting). - P2 : None or array of shape (n_features*, n_features*), optional\ - (default=None) + P2 : {None, array-like, sparse matrix}, shape \ + (n_features*, n_features*), optional (default=None) With this square matrix the L2 penalty is calculated as `w P2 w`. This gives a fine control over this penalty (Tikhonov regularization). + The default value ``None`` is the same as the idendity matrix. Note that n_features* = X.shape[1] = length of coef_ (intercept always excluded from counting). P2 must be positive semi-definite. @@ -939,8 +939,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. - - 'cd' is the coordinate descent algorithm. It can deal with L1 and - L2 penalties. + - 'cd' is the coordinate descent algorithm. It can + deal with L1 as well as L2 penalties. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. 
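In line with the solver validation tested earlier in this series, only 'cd' accepts a non-zero L1 part; a small, hedged sketch of the intended behaviour (assuming the estimator from this branch, with made-up data)::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor

    X = np.array([[0.0], [1.0], [2.0]])
    y = np.array([0.1, 1.0, 2.1])

    # 'cd' handles the combined L1/L2 penalty
    GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0.5, solver='cd').fit(X, y)

    try:
        GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0.5, solver='lbfgs').fit(X, y)
    except ValueError as exc:
        print(exc)  # this solver cannot be combined with an L1 penalty (alpha > 0, l1_ratio > 0)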
@@ -958,8 +958,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): does not exit (first call to fit), option ``start_params`` sets the starting values for ``coef_`` and ``intercept_``. - start_params : {None, 'least_squares', 'zero'} or array of shape \ - (n_features, ) or }, optional (default=None) + start_params : {None, 'least_squares', 'zero', array of shape \ + (n_features, )}, optional (default=None) If an array of size n_features is supplied, use these as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. @@ -979,7 +979,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. - random_state : int, RandomState instance or None, optional (default=None) + random_state : {int, RandomState instance, None}, optional (default=None) The seed of the pseudo random number generator that selects a random feature to be updated for solver 'cd' (coordinate descent). If int, random_state is the seed used by the random @@ -1052,13 +1052,13 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Training data. - y : numpy array of shape (n_samples,) + y : array-like, shape (n_samples,) Target values. - sample_weight : array of shape (n_samples,) or None,\ + sample_weight : {None, array-like}, shape (n_samples,),\ optinal (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has @@ -1190,12 +1190,12 @@ def fit(self, X, y, sample_weight=None): if self.P1 is None: P1 = np.ones(X.shape[1]) else: - P1 = np.atleast_1d(np.copy(self.P1)) + P1 = np.copy(np.atleast_1d(self.P1)) if P1.dtype.kind not in ['b', 'i', 'u', 'f']: raise ValueError("P1 must be a numeric value; " "got (dtype={0}).".format(P1.dtype)) if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): - raise ValueError("P1 must be either None or an 1D array with " + raise ValueError("P1 must be either None or a 1d array with " "the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." @@ -1324,6 +1324,7 @@ def fit(self, X, y, sample_weight=None): coef = _irls_step(Xnew, W, P2, z) else: # with L1 penalty, start with coef = 0 + # TODO: Are there better options? coef = np.zeros(n_features) elif isinstance(self.start_params, six.string_types): if self.start_params == 'zero': @@ -1353,7 +1354,7 @@ def fit(self, X, y, sample_weight=None): # 4. fit # ####################################################################### # algorithms for optimiation - # TODO: Parallelize it + # TODO: Parallelize it? self.n_iter_ = 0 converged = False # 4.1 IRLS ############################################################ @@ -1682,12 +1683,12 @@ def linear_predictor(self, X): Parameters ---------- - X : numpy array or sparse matrix, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Samples. Returns ------- - C : array, shape (n_samples) + C : array, shape (n_samples,) Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") @@ -1703,17 +1704,18 @@ def predict(self, X, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Samples. 
- sample_weight : array of shape (n_samples,) or None , \ - (default=None) + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) Returns ------- C : array, shape (n_samples,) Returns predicted values times sample_weight. """ + # TODO: Is copy=True necessary? X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', copy=True, ensure_2d=True, allow_nd=False) @@ -1729,14 +1731,14 @@ def estimate_phi(self, X, y, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Training data. - y : numpy array, shape (n_samples,) + y : array-like, shape (n_samples,) Target values. - sample_weight : array of shape (n_samples,) or None,\ - optinal (default=None) + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) Sample weights. """ check_is_fitted(self, "coef_") @@ -1782,13 +1784,14 @@ def score(self, X, y, sample_weight=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) - Test samples + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Test samples. y : array-like, shape (n_samples,) - True valeus for X. + True values of target. - sample_weight : array-like, shape = (n_samples,), optional + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) Sample weights. Returns From a9ae023ec331e782d7a23de18f7e6fbb0dd1f57d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 8 Oct 2018 20:21:35 +0200 Subject: [PATCH 37/69] Double escape in doctring of GeneralizedLinearRegressor --- sklearn/linear_model/glm.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 01d5420773ffc..535af60289f8e 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -866,13 +866,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): TODO: Estimation of the dispersion parameter phi. - If your target `y` is a ratio, you should also provide appropriate weights - `w`. As an example, consider Poission distributed counts `z` (integers) and - weights `w`=exposure (time, money, persons years, ...), then you fit + If the target `y` is a ratio, appropriate weights `w` should be provided. + As an example, consider Poission distributed counts `z` (integers) and + weights `w`=exposure (time, money, persons years, ...). Then you fit `y = z/w`, i.e. ``GeneralizedLinearModel(family='Poisson').fit(X, y, - sample_weight=w)``. You need the weights for the right mean, consider: - :math:`\bar(y) = \frac{\sum_i w_i y_i}{\sum_i w_i}`. - In this case one might say that y has a 'scaled' Poisson distributions. + sample_weight=w)``. The weights are necessary for the right mean, consider: + :math:`\\bar(y) = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`. + In this case one might say that 'y' has a 'scaled' Poisson distributions. The same holds for other distributions. Parameters @@ -891,7 +891,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : {None, array-like}, shape (n_features*,), optional\ + P1 : {None, array-like}, shape (n_features*,), optional \ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). 
The From bb62485166412d8ba6393e5ba753b015b806867c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 31 Dec 2018 16:54:40 +0100 Subject: [PATCH 38/69] Add example for GeneralizedLinearRegressor * add example * improve docstring of GeneralizedLinearRegressor * improve user guide for GeneralizedLinearRegressor --- doc/modules/linear_model.rst | 34 +++++--- .../plot_poisson_spline_regression.py | 83 +++++++++++++++++++ sklearn/linear_model/glm.py | 22 ++--- 3 files changed, 115 insertions(+), 24 deletions(-) create mode 100644 examples/linear_model/plot_poisson_spline_regression.py diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index a204ccb080cc9..d65f7ed121f8e 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -901,12 +901,15 @@ for a more versatile L2 penalty. Use cases, where a loss different from the squared loss might be appropriate, are the following: - * If the target values :math:`y` are counts (integer valued) or frequencies, you might try a Poisson deviance. + * If the target values :math:`y` are counts (non-negative integer valued) or + frequencies (non-negative), you might use a Poisson deviance with log-link. - * If the target values are positive valued and skewed, you might try a Gamma deviance. + * If the target values are positive valued and skewed, you might try a + Gamma deviance with log-link. - * If the target values seem to be heavier tailed than a Gamma distribution, you might try an Inverse Gaussian deviance (or even higher variance powers of the Tweedie family). - Keep in mind that the mean is not a good measure for very heavy tailed distributions, cf. extreme value theory. + * If the target values seem to be heavier tailed than a Gamma distribution, + you might try an Inverse Gaussian deviance (or even higher variance powers + of the Tweedie family). Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, @@ -931,6 +934,11 @@ follows: >>> reg.intercept_ #doctest: +ELLIPSIS -0.76383575... + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_spline_regression.py` + Mathematical formulation ------------------------ @@ -956,20 +964,20 @@ e.g. accounting for the dependence structure of :math:`y`. The objective function, which is independent of :math:`\phi`, is minimized with respect to the coefficients :math:`w`. -The deviance is defined by +The deviance is defined by the log of the EDM likelihood as .. 
math:: D(y, \mu) = -2\phi\cdot \left(loglike(y,\mu,\frac{\phi}{s}) - loglike(y,y,\frac{\phi}{s})\right) -===================================== ================================= -Distribution Variance Function :math:`v(\mu)` -===================================== ================================= -Normal ("normal") :math:`1` -Poisson ("poisson") :math:`\mu` -Gamma ("gamma") :math:`\mu^2` -Inverse Gaussian ("inverse.gaussian") :math:`\mu^3` -===================================== ================================= +===================================== =============================== ================================= ============================================ +Distribution Target Domain Variance Function :math:`v(\mu)` Deviance :math:`D(y, \mu)` +===================================== =============================== ================================= ============================================ +Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{/mu}-y+\mu)` +Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +===================================== =============================== ================================= ============================================ Two remarks: diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py new file mode 100644 index 0000000000000..b98bca5d8f867 --- /dev/null +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -0,0 +1,83 @@ +""" +================================= +Poisson Regression with B-Splines +================================= + +As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` +example, a Poisson regression with penalized B-splines (P-splines) [1]_ is +fitted on slightly different sinusodial, Poisson distributed data and +compared to an AdaBoost model with decision trees. +One can see, that this is a hard problem for both estimators. + +.. [1] Eilers, Paul H. C.; Marx, Brian D. "Flexible smoothing with B -splines + and penalties". Statist. Sci. 11 (1996), no. 2, 89--121. 
+ `doi:10.1214/ss/1038425655 + `_ + +""" +print(__doc__) + +# Author: Christian Lorentzen +# based on the AdaBoost regression example from Noel Dawe +# License: BSD 3 clause + +# importing necessary libraries +import numpy as np +from scipy.linalg import toeplitz +from scipy.interpolate import BSpline +import matplotlib.pyplot as plt +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import AdaBoostRegressor +from sklearn.linear_model import GeneralizedLinearRegressor + + +# Create the dataset +xmin, xmax = 0, 6 +rng = np.random.RandomState(1) +X = np.linspace(xmin, xmax, 500)[:, np.newaxis] +y_true = 0.5 * (2.1 + np.sin(X).ravel() + np.sin(6 * X).ravel()) +y = rng.poisson(y_true, X.shape[0]) + +# b-spline basis +nknots, degree = 40, 3 +ns = nknots - degree - 1 # number of base spline functions +dx = (xmax - xmin) / (nknots - 1 - 2 * degree) +knots = np.linspace(xmin - degree * dx, 6 + degree * dx, nknots) +coef = np.zeros(ns) +splineBasis = np.empty((X.shape[0], ns), dtype=float) +for i in range(ns): + coef[i] = 1 + splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ + .ravel() + coef[i] = 0 + +# second order difference matrix +P2 = toeplitz([2, -1] + [0] * (ns - 2)).astype(float) +P2[0, 0] = P2[-1, -1] = 1 + +# Fit regression model +regr_1 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), + n_estimators=10, random_state=rng) + +regr_2 = GeneralizedLinearRegressor(family='poisson', link='log', + fit_intercept=True, alpha=0.02, + l1_ratio=0.1, P2=P2) + +regr_1.fit(X, y) +regr_2.fit(splineBasis, y) + +# Predict +y_1 = regr_1.predict(X) +y_2 = regr_2.predict(splineBasis) + +# Plot the results +plt.figure() +plt.plot(X, y_true, c="b", label="true mean") +plt.scatter(X, y, c="k", marker='.', label="training samples") +plt.plot(X, y_1, c="g", label="AdaBoost n_estimator=10", linewidth=2) +plt.plot(X, y_2, c="r", label="Poisson GLM with B-splines", linewidth=2) +plt.xlabel("data") +plt.ylabel("target") +plt.title("Regression Comparison") +plt.legend() +plt.show() diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 535af60289f8e..37afc8da2d6db 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -6,7 +6,7 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -# TODO: Write examples +# TODO: Write more examples. # TODO: Make option self.copy_X more meaningfull than just for start values. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. @@ -832,7 +832,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): with inverse link function `h` and s=sum of `sample_weight` (which equals n_samples for `sample_weight=None`). - For `P1`=`P2`=identity, the penalty is the elastic net:: + For `P1=P2=identity`, the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 @@ -868,11 +868,11 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If the target `y` is a ratio, appropriate weights `w` should be provided. As an example, consider Poission distributed counts `z` (integers) and - weights `w`=exposure (time, money, persons years, ...). Then you fit - `y = z/w`, i.e. ``GeneralizedLinearModel(family='Poisson').fit(X, y, - sample_weight=w)``. The weights are necessary for the right mean, consider: - :math:`\\bar(y) = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`. 
- In this case one might say that 'y' has a 'scaled' Poisson distributions. + weights `w=exposure` (time, money, persons years, ...). Then you fit + `y = z/w`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + sample_weight=w)``. The weights are necessary for the right meanself. + Consider :math:`\\bar{y} = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`, + in this case one might say that `y` has a 'scaled' Poisson distributions. The same holds for other distributions. Parameters @@ -1017,10 +1017,10 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): References ---------- For the coordinate descent implementation: - .. [1] Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - An Improved GLMNET for L1-regularized Logistic Regression, - Journal of Machine Learning Research 13 (2012) 1999-2030 - https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', From 16d064db7cee1d59569d21631cc2fa41be8b3b14 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 1 Jan 2019 11:58:41 +0100 Subject: [PATCH 39/69] Resolve merge conflicts * resolve merge conflicts in linear_model.rst * replace BSpline by splev to support older scipy versions --- doc/modules/linear_model.rst | 14 +++++++------- .../linear_model/plot_poisson_spline_regression.py | 8 +++++--- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d65f7ed121f8e..09f14735c2907 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -876,18 +876,18 @@ to warm-starting (see :term:`Glossary `). .. _Generalized_linear_regression: -Generalized linear regression +Generalized Linear Regression ============================= :class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two -ways [8]_. First, the predicted values :math:`\hat{y}` are linked to a linear +ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as .. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [9]_. The objective function beeing minimized +exponential dispersion model (EDM) [11]_. The objective function beeing minimized becomes .. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 @@ -983,7 +983,7 @@ Two remarks: * The deviances for at least Normal, Poisson and Gamma distributions are strictly consistent scoring functions for the mean :math:`\mu`, see Eq. - (19)-(20) in [10]_. + (19)-(20) in [12]_. * If you want to model a frequency, i.e. counts per exposure (time, volume, ...) you can do so by a Poisson distribution and passing @@ -993,12 +993,12 @@ Two remarks: .. topic:: References: - .. [8] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - .. [9] Jørgensen, B. (1992). 
The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. + .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. See also `Exponential dispersion model. `_ - .. [10] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ + .. [12] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ Stochastic Gradient Descent - SGD ================================= diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py index b98bca5d8f867..fce85fae1ea8c 100644 --- a/examples/linear_model/plot_poisson_spline_regression.py +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -24,7 +24,8 @@ # importing necessary libraries import numpy as np from scipy.linalg import toeplitz -from scipy.interpolate import BSpline +# from scipy.interpolate import BSpline +from scipy.interpolate import splev import matplotlib.pyplot as plt from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import AdaBoostRegressor @@ -47,8 +48,9 @@ splineBasis = np.empty((X.shape[0], ns), dtype=float) for i in range(ns): coef[i] = 1 - splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ - .ravel() +# splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ +# .ravel() + splineBasis[:, i] = splev(X, (knots, coef, degree)).ravel() coef[i] = 0 # second order difference matrix From 1a02a901d1a6d99484241ca205178ac61fc47846 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 1 Jan 2019 12:59:30 +0100 Subject: [PATCH 40/69] Adapt for minimum numpy version * replace np.block --- sklearn/linear_model/glm.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 37afc8da2d6db..851767055c61c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1235,8 +1235,11 @@ def fit(self, X, y, sample_weight=None): P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), dtype=P2.dtype).tocsr() else: - P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], - [np.zeros((X.shape[1], 1)), P2]]) + # as of numpy 1.13 this would work: + # P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], + # [np.zeros((X.shape[1], 1)), P2]]) + P2 = np.hstack((np.zeros((X.shape[1], 1)), P2)) + P2 = np.vstack((np.zeros((1, X.shape[1]+1)), P2)) else: Xnew = X From 177eb4cc017a7262e472070b4a920250711a099c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 6 Jan 2019 19:46:19 +0100 Subject: [PATCH 41/69] Remove six dependencies as in #12639 * replace six.with_metaclass(ABCMeta) by metaclass=ABCMeta * replace six.integer_types by int * replace six.string_types by str * rebase * correct email address --- sklearn/linear_model/glm.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 851767055c61c..aca49ec7edf28 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -2,12 +2,13 @@ Generalized Linear Models with Exponential Dispersion Family """ -# Author: Christian Lorentzen +# Author: Christian Lorentzen # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause # TODO: Write more examples. -# TODO: Make option self.copy_X more meaningfull than just for start values. +# TODO: Make option self.copy_X more meaningful. 
+# So far, fit uses Xnew instead of X. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support? @@ -51,7 +52,6 @@ from .ridge import Ridge from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning -from ..externals import six from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg @@ -84,7 +84,7 @@ def _check_weights(sample_weight, n_samples): return weights -class Link(six.with_metaclass(ABCMeta)): +class Link(metaclass=ABCMeta): """Abstract base class for Link funtions """ @@ -186,7 +186,7 @@ def inverse_derivative2(self, lin_pred): return np.exp(lin_pred) -class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): +class ExponentialDispersionModel(metaclass=ABCMeta): r"""Base class for reproductive Exponential Dispersion Models (EDM). The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by @@ -1142,7 +1142,7 @@ def fit(self, X, y, sample_weight=None): "with L1 penalties, which are included with " "(alpha={1}) and (l1_ratio={2})." .format(solver, self.alpha, self.l1_ratio)) - if (not isinstance(self.max_iter, six.integer_types) + if (not isinstance(self.max_iter, int) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " "integer;" @@ -1156,7 +1156,7 @@ def fit(self, X, y, sample_weight=None): start_params = self.start_params if start_params is None: pass - elif isinstance(start_params, six.string_types): + elif isinstance(start_params, str): if start_params not in ['least_squares', 'zero']: raise ValueError("The argument start_params must be None, " "'least-squares', 'zero' or an array of right" @@ -1329,7 +1329,7 @@ def fit(self, X, y, sample_weight=None): # with L1 penalty, start with coef = 0 # TODO: Are there better options? coef = np.zeros(n_features) - elif isinstance(self.start_params, six.string_types): + elif isinstance(self.start_params, str): if self.start_params == 'zero': coef = np.zeros(n_features) elif self.start_params == 'least_squares': From 3d4c784df6797c244b31dd3393083d2c63114bf2 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 3 Feb 2019 21:38:24 +0100 Subject: [PATCH 42/69] Improve user guide, doc and fix penalty parameter for Ridge * move parts of docstring to new Notes section * improve user guide and doc * fix typos * fix scaling of penalty parameter in Ridge() * docstring for _check_weights * reduce tol for parameter initialization --- doc/modules/linear_model.rst | 18 +++--- sklearn/linear_model/glm.py | 114 +++++++++++++++++++---------------- 2 files changed, 73 insertions(+), 59 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 09f14735c2907..174d1e4eddae4 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -890,7 +890,7 @@ Secondly, the squared loss function is replaced by the deviance :math:`D` of an exponential dispersion model (EDM) [11]_. The objective function beeing minimized becomes -.. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 +.. math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1 +\frac{\alpha(1-\rho)}{2} w^T P_2 w with sample weights :math:`s`. 
@@ -914,7 +914,7 @@ are the following: Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link -:math:`h(x)=x` that guarantees the non-negativeness, e.g. the log-link with +:math:`h(Xw)=Xw` that guarantees the non-negativeness, e.g. the log-link with :math:`h(Xw)=\exp(Xw)`. Note that the feature matrix `X` should be standardized before fitting. This @@ -964,17 +964,19 @@ e.g. accounting for the dependence structure of :math:`y`. The objective function, which is independent of :math:`\phi`, is minimized with respect to the coefficients :math:`w`. -The deviance is defined by the log of the EDM likelihood as +The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` +likelihood as -.. math:: D(y, \mu) = -2\phi\cdot - \left(loglike(y,\mu,\frac{\phi}{s}) - - loglike(y,y,\frac{\phi}{s})\right) +.. math:: d(y, \mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) + - loglike(y,y,\phi)\right) \\ + D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) ===================================== =============================== ================================= ============================================ -Distribution Target Domain Variance Function :math:`v(\mu)` Deviance :math:`D(y, \mu)` +Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` ===================================== =============================== ================================= ============================================ Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` -Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{/mu}-y+\mu)` +Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` ===================================== =============================== ================================= ============================================ diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index aca49ec7edf28..5fc869f81195f 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -8,7 +8,7 @@ # TODO: Write more examples. # TODO: Make option self.copy_X more meaningful. -# So far, fit uses Xnew instead of X. +# So far, fit uses Xnew instead of X. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support? @@ -28,7 +28,7 @@ # - Allow for finer control of penalty terms: # L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude # factors from the L1 penalty. -# L2: w*P2*w with P2 a (demi-) positive definite matrix, e.g. P2 could be +# L2: w*P2*w with P2 a (semi-) positive definite matrix, e.g. P2 could be # a 1st or 2nd order difference matrix (compare B-spline penalties and # Tikhonov regularization). 
# - The link funtion (instance of class Link) is necessary for the evaluation @@ -59,6 +59,8 @@ def _check_weights(sample_weight, n_samples): + """Check that weights are non-negative and have the right shape + """ if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): @@ -594,6 +596,7 @@ class TweedieDistribution(ExponentialDispersionModel): power : float (default=0) The variance power of the `unit_variance` :math:`v(\mu) = \mu^{power}`. + For ``0`. - The fit itself does not need Y to be from an EDM, but only assumes - the first two moments :math:`E[Y_i]=\\mu_i=h(\\eta_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{w_i} v(\\mu_i)`. - - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - (penalized) maximum likelihood which is equivalent to minimizing the - deviance. - - For `alpha` > 0, the feature matrix `X` should be standardized in order to - penalize features equally strong. Call - :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - - TODO: Estimation of the dispersion parameter phi. - - If the target `y` is a ratio, appropriate weights `w` should be provided. - As an example, consider Poission distributed counts `z` (integers) and - weights `w=exposure` (time, money, persons years, ...). Then you fit - `y = z/w`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, - sample_weight=w)``. The weights are necessary for the right meanself. - Consider :math:`\\bar{y} = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`, - in this case one might say that `y` has a 'scaled' Poisson distributions. - The same holds for other distributions. - Parameters ---------- alpha : float, optional (default=1) @@ -891,22 +871,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : {None, array-like}, shape (n_features*,), optional \ + P1 : {None, array-like}, shape (n_features,), optional \ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The default value ``None`` is the same as a 1d array of ones. - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). + Note that n_features = X.shape[1]. P2 : {None, array-like, sparse matrix}, shape \ - (n_features*, n_features*), optional (default=None) + (n_features, n_features), optional (default=None) With this square matrix the L2 penalty is calculated as `w P2 w`. This gives a fine control over this penalty (Tikhonov - regularization). - The default value ``None`` is the same as the idendity matrix. - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). P2 must be positive semi-definite. + regularization). The diagonal zeros of a diagonal P2, for example, + exclude all corresponding coefficients from the L2 penalty. + The default value ``None`` is the same as the identity matrix. + Note that n_features = X.shape[1]. P2 must be positive semi-definite. fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be @@ -929,18 +908,22 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ optional (default='auto') - Algorithm to use in the optimization problem. + Algorithm to use in the optimization problem: - - 'auto' sets 'irls' if l1_ratio equals 0, else 'cd'. 
+ 'auto' + Sets 'irls' if l1_ratio equals 0, else 'cd'. - - 'irls' is iterated reweighted least squares (Fisher scoring). + 'irls' + iterated reweighted least squares (Fisher scoring). It is the standard algorithm for GLMs. Cannot deal with L1 penalties. - - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. + 'newton-cg', 'lbfgs' + Cannot deal with L1 penalties. - - 'cd' is the coordinate descent algorithm. It can - deal with L1 as well as L2 penalties. + 'cd' + coordinate descent algorithm. It can deal with L1 as well as L2 + penalties. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. @@ -959,10 +942,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): starting values for ``coef_`` and ``intercept_``. start_params : {None, 'least_squares', 'zero', array of shape \ - (n_features, )}, optional (default=None) - If an array of size n_features is supplied, use these as start values + (n_features*, )}, optional (default=None) + If an array of size n_features* is supplied, use it as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. + Note that n_features* = X.shape[1] + fit_intercept includes the + intercept in counting. If 'least_squares' is set, the result of a least squares fit in the link space (linear predictor) is taken. If 'zero' is set, all coefficients start with zero. @@ -1013,6 +998,30 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): n_iter_ : int Actual number of iterations of the solver. + Notes + ----- + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + (penalized) maximum likelihood which is equivalent to minimizing the + deviance. + + For `alpha` > 0, the feature matrix `X` should be standardized in order to + penalize features equally strong. Call + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. + + If the target `y` is a ratio, appropriate sample weights `s` should be + provided. + As an example, consider Poission distributed counts `z` (integers) and + weights `s=exposure` (time, money, persons years, ...). Then you fit + `y = z/s`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + sample_weight=s)``. The weights are necessary for the right (finite + sample) mean. + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + in this case one might say that `y` has a 'scaled' Poisson distributions. + The same holds for other distributions. References ---------- @@ -1138,10 +1147,10 @@ def fit(self, X, y, sample_weight=None): else: solver = 'cd' if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): - raise ValueError("The chosen solver (solver={0}) can't deal " - "with L1 penalties, which are included with " - "(alpha={1}) and (l1_ratio={2})." - .format(solver, self.alpha, self.l1_ratio)) + raise ValueError("The chosen solver (solver={0}) can't deal " + "with L1 penalties, which are included with " + "(alpha={1}) and (l1_ratio={2})." 
+ .format(solver, self.alpha, self.l1_ratio)) if (not isinstance(self.max_iter, int) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " @@ -1340,14 +1349,17 @@ def fit(self, X, y, sample_weight=None): elif self.l1_ratio <= 0.01: # ElasticNet says l1_ratio <= 0.01 is not reliable # => use Ridge + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha) + alpha=self.alpha*n_samples, + tol=np.max([self.tol, np.sqrt(self.tol)])) reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: # TODO: Does this make sense at all? reg = ElasticNet(copy_X=True, fit_intercept=False, - alpha=self.alpha, l1_ratio=self.l1_ratio) + alpha=self.alpha, l1_ratio=self.l1_ratio, + tol=np.max([self.tol, np.sqrt(self.tol)])) reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: @@ -1557,7 +1569,7 @@ def Hs(s): # minimize_z: a z + 1/2 b z^2 + c |d+z| # a = A_j # b = B_jj > 0 - # c = |P1_j| = P1_j > 0, ee 1.3 + # c = |P1_j| = P1_j > 0, see 1.3 # d = w_j + d_j # cf. https://arxiv.org/abs/0708.1485 Eqs. (3) - (4) # with beta = z+d, beta_hat = d-a/b and gamma = c/b From 919912c3f98e6d1190737e344c14f31c2eef9077 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 17 Feb 2019 18:38:36 +0100 Subject: [PATCH 43/69] Smarter intercept initialization and docstring improvements * smarter initialization of intercept * PEP 257 -- Docstring Conventions * minor docstring changes --- sklearn/linear_model/glm.py | 265 +++++++++++++++++++++--------------- 1 file changed, 153 insertions(+), 112 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 5fc869f81195f..d69ccd0a66486 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -59,14 +59,13 @@ def _check_weights(sample_weight, n_samples): - """Check that weights are non-negative and have the right shape - """ + """Check that weights are non-negative and have the right shape.""" if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): if sample_weight <= 0: raise ValueError("Sample weights must be non-negative.") - weights = sample_weight*np.ones(n_samples) + weights = sample_weight * np.ones(n_samples) else: _dtype = [np.float64, np.float32] weights = check_array(sample_weight, accept_sparse='csr', @@ -75,8 +74,8 @@ def _check_weights(sample_weight, n_samples): if weights.ndim > 1: raise ValueError("Sample weight must be 1D array or scalar") elif weights.shape[0] != n_samples: - raise ValueError("Sample weights must have the same length as" - " y") + raise ValueError("Sample weights must have the same length as " + "y") if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") elif not np.sum(weights) > 0: @@ -87,70 +86,72 @@ def _check_weights(sample_weight, n_samples): class Link(metaclass=ABCMeta): - """Abstract base class for Link funtions - """ + """Abstract base class for Link funtions.""" @abstractmethod def link(self, mu): - """The link function g(mu) with argument mu=E[Y] returns the - linear predictor. + """Compute the link function g(mu). + + The link function links the mean mu=E[Y] to the so called linear + predictor (X*w), i.e. g(mu) = linear predictor. Parameters ---------- mu : array, shape (n_samples,) - Usually the predicted mean. + Usually the (predicted) mean. """ raise NotImplementedError @abstractmethod def derivative(self, mu): - """Derivative of the link g'(mu). + """Compute the derivative of the link g'(mu). 
Parameters ---------- mu : array, shape (n_samples,) - Usually the predicted mean. + Usually the (predicted) mean. """ raise NotImplementedError @abstractmethod def inverse(self, lin_pred): - """The inverse link function h(lin_pred) with the linear predictor as - argument returns mu=E[Y]. + """Compute the inverse link function h(lin_pred). + + Gives the inverse relationship between linkear predictor and the mean + mu=E[Y], i.e. h(linear predictor) = mu. Parameters ---------- lin_pred : array, shape (n_samples,) - Usually the (predicted) linear predictor. + Usually the (fitted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative(self, lin_pred): - """Derivative of the inverse link function h'(lin_pred). + """Compute the derivative of the inverse link function h'(lin_pred). Parameters ---------- lin_pred : array, shape (n_samples,) - Usually the (predicted) linear predictor. + Usually the (fitted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative2(self, lin_pred): - """Second derivative of the inverse link function h''(lin_pred). + """Compute 2nd derivative of the inverse link function h''(lin_pred). Parameters ---------- lin_pred : array, shape (n_samples,) - Usually the (predicted) linear predictor. + Usually the (fitted) linear predictor. """ raise NotImplementedError class IdentityLink(Link): - """The identity link function g(x)=x. - """ + """The identity link function g(x)=x.""" def link(self, mu): return mu @@ -169,8 +170,7 @@ def inverse_derivative2(self, lin_pred): class LogLink(Link): - """The log link function g(x)=log(x). - """ + """The log link function g(x)=log(x).""" def link(self, mu): return np.log(mu) @@ -238,26 +238,22 @@ class ExponentialDispersionModel(metaclass=ABCMeta): @abstractproperty def lower_bound(self): - """The lower bound of values of Y~EDM. - """ + """The lower bound of values of Y~EDM.""" raise NotImplementedError() @abstractproperty def upper_bound(self): - """The upper bound of values of Y~EDM. - """ + """The upper bound of values of Y~EDM.""" raise NotImplementedError() @abstractproperty def include_lower_bound(self): - """If True, values of y may equal lower bound: y >= lower_bound. - """ + """If True, values of y may equal lower bound: y >= lower_bound.""" raise NotImplementedError() @abstractproperty def include_upper_bound(self): - """If True, values of y may equal upper bound: y <= upper_bound. - """ + """If True, values of y may equal upper bound: y <= upper_bound.""" raise NotImplementedError() def in_y_range(self, x): @@ -285,7 +281,9 @@ def in_y_range(self, x): @abstractmethod def unit_variance(self, mu): - r"""The unit variance :math:`v(\mu)` determines the variance as + r"""Compute the unit variance function. + + The unit variance :math:`v(\mu)` determines the variance as a function of the mean :math:`\mu` by :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. It can also be derived from the unit deviance :math:`d(y,\mu)` as @@ -304,7 +302,9 @@ def unit_variance(self, mu): @abstractmethod def unit_variance_derivative(self, mu): - r"""The derivative of the unit variance w.r.t. `mu`, :math:`v'(\mu)`. + r"""Compute the derivative of the unit variance w.r.t. mu. + + Return :math:`v'(\mu)`. Parameters ---------- @@ -314,7 +314,9 @@ def unit_variance_derivative(self, mu): raise NotImplementedError() def variance(self, mu, phi=1, weights=1): - r"""The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + r"""Compute the variance function. 
+ + The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. @@ -332,7 +334,9 @@ def variance(self, mu, phi=1, weights=1): return phi/weights * self.unit_variance(mu) def variance_derivative(self, mu, phi=1, weights=1): - r"""The derivative of the variance w.r.t. `mu`, + r"""Compute the derivative of the variance w.r.t. mu. + + Returns :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. @@ -352,10 +356,12 @@ def variance_derivative(self, mu, phi=1, weights=1): @abstractmethod def unit_deviance(self, y, mu): - r"""The unit_deviance :math:`d(y,\mu)`. - In terms of the log-likelihood it is given by + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as :math:`d(y,\mu) = -2\phi\cdot - \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` Parameters ---------- @@ -368,7 +374,9 @@ def unit_deviance(self, y, mu): raise NotImplementedError() def unit_deviance_derivative(self, y, mu): - r"""The derivative w.r.t. `mu` of the unit deviance + r"""Compute the derivative of the unit deviance w.r.t. mu. + + The derivative of the unit deviance is given by :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. @@ -383,9 +391,12 @@ def unit_deviance_derivative(self, y, mu): return -2*(y-mu)/self.unit_variance(mu) def deviance(self, y, mu, weights=1): - r"""The deviance is given by :math:`D = \sum_i s_i \cdot d(y, \mu) + r"""Compute the deviance. + + The deviance is a weighted sum of the per sample unit deviances, + :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. - In terms of the likelihood it is :math:`D = -2\phi\cdot + In terms of the log-likelihood it is :math:`D = -2\phi\cdot \left(loglike(y,\mu,\frac{phi}{s}) - loglike(y,y,\frac{phi}{s})\right)`. @@ -403,15 +414,15 @@ def deviance(self, y, mu, weights=1): return np.sum(weights*self.unit_deviance(y, mu)) def _deviance(self, coef, X, y, weights, link): - """The deviance as a function of the coefficients `coef` - (:math:`w`). - """ + """Compute the deviance as a function of the coefficients and data.""" lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) return self.deviance(y, mu, weights) def deviance_derivative(self, y, mu, weights=1): - """The derivative w.r.t. `mu` of the deviance. + """Compute the derivative of the deviance w.r.t. mu. + + It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. Parameters ---------- @@ -427,7 +438,9 @@ def deviance_derivative(self, y, mu, weights=1): return weights*self.unit_deviance_derivative(y, mu) def _score(self, coef, phi, X, y, weights, link): - r"""The score function is the derivative of the + r"""Compute the score function. + + The score function is the derivative of the log-likelihood w.r.t. `coef` (:math:`w`). It is given by @@ -453,7 +466,8 @@ def _score(self, coef, phi, X, y, weights, link): return score def _fisher_matrix(self, coef, phi, X, y, weights, link): - r"""The Fisher information matrix. + r"""Compute the Fisher information matrix. 
+ The Fisher information matrix, also known as expected information matrix is given by @@ -482,9 +496,10 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): - r"""The observed information matrix. + r"""Compute the observed information matrix. + The observed information matrix, also known as the negative of - the Hessian matrix of the log-likelihood. It is given by + the Hessian matrix of the log-likelihood, is given by .. math: @@ -518,8 +533,10 @@ def _observed_information(self, coef, phi, X, y, weights, link): return observed_information def _deviance_derivative(self, coef, X, y, weights, link): - r"""The derivative w.r.t. `coef` (:math:`w`) of the deviance as a - function of the coefficients `coef`. + r"""Compute the derivative of the deviance w.r.t. coef. + + The derivative of the deviance w.r.t. `coef` (:math:`w`) as a + function of the coefficients `coef` and the data. This is equivalent to :math:`-2\phi` times the score function :func:`_score` (derivative of the log-likelihood). """ @@ -528,9 +545,11 @@ def _deviance_derivative(self, coef, X, y, weights, link): return -2*score def _deviance_hessian(self, coef, X, y, weights, link): - r"""The hessian matrix w.r.t. `coef` (:math:`w`) of the deviance - as a function of the coefficients `coef`. - This is equivalent to :math:`+2\phi` times the observed information + r"""Compute the hessian matrix of the deviance w.r.t. coef. + + The hessian of the deviance w.r.t. `coef` (:math:`w`) is evaluated as + a function of the coefficients `coef` and the data. + It is equivalent to :math:`+2\phi` times the observed information matrix. """ info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, @@ -538,8 +557,12 @@ def _deviance_hessian(self, coef, X, y, weights, link): return 2*info_matrix def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): - """Calculates eta (linear predictor), mu, score function (derivative - of log-likelihood) and Fisher matrix (all with phi=1) all in one go""" + """Compute linear predictor, mean, score function and fisher matrix. + + It calculates the linear predictor, the mean, score function + (derivative of log-likelihood) and Fisher information matrix + all in one go as function of `coef` (:math:`w`) and the data. + """ n_samples, n_features = X.shape # eta = linear predictor eta = safe_sparse_dot(X, coef, dense_output=True) @@ -561,7 +584,9 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): return eta, mu, score, fisher def starting_mu(self, y, weights=1): - """Starting values for the mean mu_i in (unpenalized) IRLS. + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. Parameters ---------- @@ -577,7 +602,9 @@ def starting_mu(self, y, weights=1): class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. - They have :math:`\mu=\mathrm{E}[Y]` and + + A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely + defined by it's mean-variance relationship :math:`\mathrm{Var}[Y] \propto \mu^power`. Special cases are: @@ -668,7 +695,7 @@ def include_upper_bound(self): return self._include_upper_bound def unit_variance(self, mu): - """The unit variance of a Tweedie distribution is v(mu)=mu**power. + """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. 
Parameters ---------- @@ -678,8 +705,8 @@ def unit_variance(self, mu): return np.power(mu, self.power) def unit_variance_derivative(self, mu): - """The derivative of the unit variance of a Tweedie distribution is - v(mu)=power*mu**(power-1). + """Compute the derivative of the unit variance of a Tweedie + distribution v(mu)=power*mu**(power-1). Parameters ---------- @@ -732,9 +759,9 @@ def __init__(self): class GeneralizedHyperbolicSecand(ExponentialDispersionModel): - """A class for the von Generalized Hyperbolic Secand (GHS) distribution. + """A class for the Generalized Hyperbolic Secand (GHS) distribution. - The GHS distribution is for data y in (-inf, inf). + The GHS distribution is for tagets y in (-inf, inf). """ def __init__(self): self._lower_bound = -np.Inf @@ -770,7 +797,7 @@ def unit_deviance(self, y, mu): def _irls_step(X, W, P2, z): - """One step in iteratively reweighted least squares + """Compute one step in iteratively reweighted least squares. Solve A w = b for w with A = (X' W X + P2) @@ -823,18 +850,18 @@ def _irls_step(X, W, P2, z): class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - """Regression via a Generalized Linear Model (GLM) based on reproductive - Exponential Dispersion Models (EDM) with combined L1 and L2 priors as - regularizer. + """Regression via a Generalized Linear Model (GLM) with penalties. - Minimizes the objective function:: + GLMs based on a reproductive Exponential Dispersion Model (EDM) with + combined L1 and L2 priors as regularizer minimizes the following objective + function:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + alpha * l1_ratio * ||P1*w||_1 + 1/2 * alpha * (1 - l1_ratio) * w*P2*w with inverse link function `h` and s=`sample_weight` (for - `sample_weight=Nones` one has s=1 and sum(s) equals `n_samples`). + `sample_weight=None`, one has s=1 and sum(s)=`n_samples`). For `P1=P2=identity`, the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 @@ -893,16 +920,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ of class ExponentialDispersionModel, optional(default='normal') - the distributional assumption of the GLM, i.e. which loss function to - be minimized. + The distributional assumption of the GLM, i.e. which distribution from + the EDM, specifies the loss function to be minimized. link : {'identity', 'log'} or an instance of class Link, optional (default='identity') - the link function of the GLM, i.e. mapping from linear predictor + The link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) - method for estimation of the dispersion parameter phi. Whether to use + Method for estimation of the dispersion parameter phi. Whether to use the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. @@ -914,15 +941,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Sets 'irls' if l1_ratio equals 0, else 'cd'. 'irls' - iterated reweighted least squares (Fisher scoring). - It is the standard algorithm for GLMs. Cannot deal with + Iterated reweighted least squares (with Fisher scoring). + It is the standard algorithm for GLMs. It cannot deal with L1 penalties. 'newton-cg', 'lbfgs' Cannot deal with L1 penalties. 'cd' - coordinate descent algorithm. It can deal with L1 as well as L2 + Coordinate descent algorithm. It can deal with L1 as well as L2 penalties. 
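    To make the solver choices just listed concrete, here is a minimal
    sketch with made-up data; it assumes the ``GeneralizedLinearRegressor``
    from this patch series is importable from ``sklearn.linear_model``, as
    in the spline example above. With ``l1_ratio > 0`` the default
    ``solver='auto'`` resolves to ``'cd'``, the only listed solver that
    handles the L1 part; with ``l1_ratio=0`` it would resolve to ``'irls'``::

        import numpy as np
        from sklearn.linear_model import GeneralizedLinearRegressor

        # toy Poisson-distributed data (made-up)
        rng = np.random.RandomState(0)
        X = rng.uniform(size=(100, 3))
        y = rng.poisson(np.exp(X.dot(np.array([0.5, -0.2, 0.1]))))

        # elastic-net style penalty (l1_ratio > 0) => 'auto' picks 'cd'
        reg = GeneralizedLinearRegressor(family='poisson', link='log',
                                         alpha=0.1, l1_ratio=0.5,
                                         solver='auto')
        reg.fit(X, y)
        print(reg.intercept_, reg.coef_)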
max_iter : int, optional (default=100) @@ -930,15 +957,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, - the iteration will stop when ``max{|g_i | i = 1, ..., n} <= tol`` + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative of - the deviance). + the objective function). warm_start : boolean, optional (default=False) - If set to ``True``, reuse the solution of the previous call to fit as - initialization for ``coef_`` and ``intercept_`` (supersedes option + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` (supersedes option ``start_params``). If set to ``True`` or if the attribute ``coef_`` - does not exit (first call to fit), option ``start_params`` sets the + does not exit (first call to ``fit``), option ``start_params`` sets the starting values for ``coef_`` and ``intercept_``. start_params : {None, 'least_squares', 'zero', array of shape \ @@ -946,8 +973,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If an array of size n_features* is supplied, use it as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. - Note that n_features* = X.shape[1] + fit_intercept includes the - intercept in counting. + Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes + the intercept in counting. If 'least_squares' is set, the result of a least squares fit in the link space (linear predictor) is taken. If 'zero' is set, all coefficients start with zero. @@ -960,7 +987,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): For the solver 'cd' (coordinate descent), the coordinates (features) can be updated in either cyclic or random order. If set to 'random', a random coefficient is updated every iteration - rather than looping over features sequentially by default. This + rather than looping over features sequentially in the same order. This (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. @@ -1057,7 +1084,7 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, self.verbose = verbose def fit(self, X, y, sample_weight=None): - """Fit a generalized linear model. + """Fit a Generalized Linear Model. Parameters ---------- @@ -1087,12 +1114,14 @@ def fit(self, X, y, sample_weight=None): _dtype = [np.float64, np.float32] X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=_dtype, y_numeric=True, multi_output=False) + # Without converting y to float, deviance might raise + # ValueError: Integers to negative integer powers are not allowed. 
y = y.astype(np.float64) weights = _check_weights(sample_weight, y.shape[0]) # 1.2 validate arguments of __init__ ################################## - # Garantee that self._family_instance is an instance of class + # Guarantee that self._family_instance is an instance of class # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): self._family_instance = self.family @@ -1112,7 +1141,8 @@ def fit(self, X, y, sample_weight=None): " ['normal', 'poisson', 'gamma', 'inverse.gaussian'];" " got (family={0})".format(self.family)) - # Garantee that self._link_instance is set to an instance of class Link + # Guarantee that self._link_instance is set to an instance of + # class Link if isinstance(self.link, Link): self._link_instance = self.link else: @@ -1127,19 +1157,19 @@ def fit(self, X, y, sample_weight=None): .format(self.link)) if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: - raise ValueError("Penalty term must be non-negative;" + raise ValueError("Penalty term must be a non-negative number;" " got (alpha={0})".format(self.alpha)) if (not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1): - raise ValueError("l1_ratio must be in interval [0, 1]; got" - " (l1_ratio={0})".format(self.l1_ratio)) + raise ValueError("l1_ratio must be a number in interval [0, 1];" + " got (l1_ratio={0})".format(self.l1_ratio)) if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: - raise ValueError("GeneralizedLinearRegressor supports only irls, " - "auto, lbfgs, newton-cg and cd solvers, got {0}" - "".format(self.solver)) + raise ValueError("GeneralizedLinearRegressor supports only solvers" + " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';" + " got {0}".format(self.solver)) solver = self.solver if self.solver == 'auto': if self.l1_ratio == 0: @@ -1168,8 +1198,8 @@ def fit(self, X, y, sample_weight=None): elif isinstance(start_params, str): if start_params not in ['least_squares', 'zero']: raise ValueError("The argument start_params must be None, " - "'least-squares', 'zero' or an array of right" - " length," + "'least-squares', 'zero' or an array of " + " correct length;" " got(start_params={0})".format(start_params)) else: start_params = check_array(start_params, accept_sparse='csr', @@ -1179,21 +1209,21 @@ def fit(self, X, y, sample_weight=None): (start_params.ndim != 1)): raise ValueError("Start values for parameters must have the" "right length and dimension; required (length" - "={0}, ndim=1), got (length={1}, ndim={2})." + "={0}, ndim=1); got (length={1}, ndim={2})." 
.format(X.shape[1] + self.fit_intercept, start_params.shape[0], start_params.ndim)) if self.selection not in ['cyclic', 'random']: raise ValueError("The argument selection must be 'cyclic' or " - "'random', got (selection={0})" + "'random'; got (selection={0})" .format(self.selection)) random_state = check_random_state(self.random_state) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) if not isinstance(self.check_input, bool): - raise ValueError("The attribute check_input must be bool; got " + raise ValueError("The argument check_input must be bool; got " "(check_input={0})".format(self.check_input)) if self.P1 is None: @@ -1232,7 +1262,7 @@ def fit(self, X, y, sample_weight=None): link = self._link_instance if self.fit_intercept: - # intercept is first column <=> coef[0] is for intecept + # Note: intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) else: @@ -1259,10 +1289,11 @@ def fit(self, X, y, sample_weight=None): P2 *= l2 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric - if sparse.issparse(P2): - P2 = 0.5 * (P2 + P2.transpose()) - else: - P2 = 0.5 * (P2 + P2.T) + if P2.ndim == 2: + if sparse.issparse(P2): + P2 = 0.5 * (P2 + P2.transpose()) + else: + P2 = 0.5 * (P2 + P2.T) # 1.3 additional validations ########################################## if self.check_input: @@ -1301,7 +1332,7 @@ def fit(self, X, y, sample_weight=None): # we rescale weights such that sum(weights) = 1 and this becomes # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) weights_sum = np.sum(weights) - weights = weights/np.sum(weights) + weights = weights/weights_sum ####################################################################### # 3. initialization of coef = (intercept_, coef_) # @@ -1338,6 +1369,8 @@ def fit(self, X, y, sample_weight=None): # with L1 penalty, start with coef = 0 # TODO: Are there better options? coef = np.zeros(n_features) + if self.fit_intercept: + coef[0] = link.link(np.mean(y)) elif isinstance(self.start_params, str): if self.start_params == 'zero': coef = np.zeros(n_features) @@ -1546,6 +1579,8 @@ def Hs(s): # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) # => active set of features for featurelist, see paper + # of Improved GLMNET or Gap Safe Screening Rules + # https://arxiv.org/abs/1611.05780 # A = f'(w) + d*H(w) + (w+d)*P2 # B = H+P2 # Note: f'=-score and H=fisher are updated at the end of outer @@ -1694,7 +1729,7 @@ def Hs(s): return self def linear_predictor(self, X): - """The linear_predictor X*coef_ + intercept_. + """Compute the linear_predictor = X*coef_ + intercept_. Parameters ---------- @@ -1741,8 +1776,7 @@ def predict(self, X, sample_weight=None): return mu*weights def estimate_phi(self, X, y, sample_weight=None): - """Estimation of the dispersion parameter phi. - Returns the estimate. + """Estimate/fit the dispersion parameter phi. Parameters ---------- @@ -1755,6 +1789,11 @@ def estimate_phi(self, X, y, sample_weight=None): sample_weight : {None, array-like}, shape (n_samples,), optional \ (default=None) Sample weights. + + Returns + ------- + phi : float + Dispersion parameter. 
""" check_is_fitted(self, "coef_") _dtype = [np.float64, np.float32] @@ -1785,15 +1824,17 @@ def estimate_phi(self, X, y, sample_weight=None): # "AssertionError: -0.28014056555724598 not greater than 0.5" # unless GeneralizedLinearRegressor has a score which passes the test. def score(self, X, y, sample_weight=None): - r"""Returns D^2, a generalization of the coefficient of determination - R^2, which uses deviance instead of squared error. + r"""Compute D^2, the percentage of deviance explained. + + D^2 is a generalization of the coefficient of determination R^2. + R^2 uses squared error and D^2 deviance. Note that those two are equal + for family='normal'. D^2 is defined as :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` is the null deviance, i.e. the deviance of a model with intercept alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean - :math:`\bar{y}` is averaged by sample_weight. In the case of a Normal - distribution, D^2 equals R^2. + :math:`\bar{y}` is averaged by sample_weight. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). @@ -1812,7 +1853,7 @@ def score(self, X, y, sample_weight=None): Returns ------- score : float - D^2 of self.predict(X) wrt. y. + D^2 of self.predict(X) w.r.t. y. """ # Note, default score defined in RegressorMixin is R^2 score. # TODO: make D^2 a score function in module metrics (and thereby get From 01033e36d913756f7ff5e2214189cf1d7426dee1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 20 Feb 2019 17:20:09 +0100 Subject: [PATCH 44/69] Fix false formula in starting_mu and improve start_params --- sklearn/linear_model/glm.py | 108 ++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 48 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index d69ccd0a66486..fad7492acc2fb 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -583,7 +583,7 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): fisher = safe_sparse_dot(X.T, temp, dense_output=False) return eta, mu, score, fisher - def starting_mu(self, y, weights=1): + def starting_mu(self, y, weights=1, ind_weight=0.5): """Set starting values for the mean mu. These may be good starting points for the (unpenalized) IRLS solver. @@ -595,9 +595,13 @@ def starting_mu(self, y, weights=1): weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. + + ind_weight : float (default=0.5) + Must be between 0 and 1. Specifies how much weight is given to the + individual observations instead of the mean of y. """ - return ((weights*y+np.mean(weights*y)) / - (2.*np.sum(np.ones_like(y)*weights))) + return (ind_weight * y + + (1. - ind_weight) * np.average(y, weights=weights)) class TweedieDistribution(ExponentialDispersionModel): @@ -852,17 +856,19 @@ def _irls_step(X, W, P2, z): class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. - GLMs based on a reproductive Exponential Dispersion Model (EDM) with - combined L1 and L2 priors as regularizer minimizes the following objective - function:: + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean `mu=h(X*w)`. 
Therefore the fit minimizes + the following objective function with combined L1 and L2 priors as + regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + alpha * l1_ratio * ||P1*w||_1 + 1/2 * alpha * (1 - l1_ratio) * w*P2*w with inverse link function `h` and s=`sample_weight` (for - `sample_weight=None`, one has s=1 and sum(s)=`n_samples`). - For `P1=P2=identity`, the penalty is the elastic net:: + ``sample_weight=None``, one has s=1 and sum(s)=`n_samples`). + For `P1=P2=identity` (``P1=None``, ``P2=None``), the penalty is the + elastic net:: alpha * l1_ratio * ||w||_1 + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 @@ -966,24 +972,34 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): as initialization for ``coef_`` and ``intercept_`` (supersedes option ``start_params``). If set to ``True`` or if the attribute ``coef_`` does not exit (first call to ``fit``), option ``start_params`` sets the - starting values for ``coef_`` and ``intercept_``. + start values for ``coef_`` and ``intercept_``. + + start_params : {'irls', 'least_squares', 'zero', array of shape \ + (n_features*, )}, optional (default='irls') + Relevant only if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). + + 'irls' + Start values of mu are calculated by family.starting_mu(..). Then, + one step of irls obtains start values for ``coef_`. This gives + usually good results. - start_params : {None, 'least_squares', 'zero', array of shape \ - (n_features*, )}, optional (default=None) - If an array of size n_features* is supplied, use it as start values - for ``coef_`` in the fit. If ``fit_intercept=True``, the first element + 'least_squares' + Start values for ``coef_`` are obtained by a least squares fit in the + link space (y is transformed to the space of the linear predictor). + + 'zero' + All coefficients are set to zero. If ``fit_intercept=True``, the + start value for the intercept is obtained by the average of y. + + array + The array of size n_features* is directly used as start values + for ``coef_``. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes the intercept in counting. - If 'least_squares' is set, the result of a least squares fit in the - link space (linear predictor) is taken. - If 'zero' is set, all coefficients start with zero. - If ``None``, the start values are calculated by setting mu to - family.starting_mu(..) and one step of irls. - These options only apply if ``warm_start=False`` or if fit is called - the first time (``self.coef_`` does not yet exist). - selection : str, optional (default='random') + selection : str, optional (default='cyclic') For the solver 'cd' (coordinate descent), the coordinates (features) can be updated in either cyclic or random order. If set to 'random', a random coefficient is updated every iteration @@ -1005,7 +1021,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): check_input : boolean, optional (default=True) Allow to bypass several checks on input: y values in range of family, - sample_weights non-negative, P2 positive semi-definite. + sample_weight non-negative, P2 positive semi-definite. Don't use this parameter unless you know what you do. 
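    To make the roles of ``P1`` and ``P2`` in the objective function at the
    top of this docstring concrete, the penalty term alone can be written
    down directly. A small NumPy sketch with made-up numbers, where ``w``
    stands for the coefficients without the intercept (the intercept is
    never penalized)::

        import numpy as np

        w = np.array([0.5, -1.0, 2.0])      # made-up coefficients
        alpha, l1_ratio = 1.0, 0.5
        P1 = np.array([1.0, 0.0, 1.0])      # 0 excludes 2nd coef from L1
        P2 = np.array([[2.0, -1.0, 0.0],    # made-up positive definite
                       [-1.0, 2.0, -1.0],   # difference-type (Tikhonov)
                       [0.0, -1.0, 2.0]])   # penalty matrix

        # alpha * l1_ratio * ||P1*w||_1
        l1 = alpha * l1_ratio * np.sum(np.abs(P1 * w))
        # 1/2 * alpha * (1 - l1_ratio) * w*P2*w
        l2 = 0.5 * alpha * (1 - l1_ratio) * w.dot(P2).dot(w)
        print(l1 + l2)  # 1.25 + 3.875 = 5.125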
verbose : int, optional (default=0) @@ -1061,8 +1077,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', fit_dispersion=None, solver='auto', max_iter=100, - tol=1e-4, warm_start=False, start_params=None, - selection='random', random_state=None, copy_X=True, + tol=1e-4, warm_start=False, start_params='irls', + selection='cyclic', random_state=None, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio @@ -1193,11 +1209,9 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is None: - pass - elif isinstance(start_params, str): - if start_params not in ['least_squares', 'zero']: - raise ValueError("The argument start_params must be None, " + if isinstance(start_params, str): + if start_params not in ['irls', 'least_squares', 'zero']: + raise ValueError("The argument start_params must be 'irls', " "'least-squares', 'zero' or an array of " " correct length;" " got(start_params={0})".format(start_params)) @@ -1348,11 +1362,11 @@ def fit(self, X, y, sample_weight=None): self.coef_)) else: coef = self.coef_ - elif self.start_params is None: - if self.l1_ratio == 0: + elif isinstance(start_params, str): + if start_params == 'irls': # See 3.1 IRLS # Use mu_start and apply one irls step to calculate coef - mu = family.starting_mu(y, weights) + mu = family.starting_mu(y, weights=weights) # linear predictor eta = link.link(mu) # h'(eta) @@ -1365,16 +1379,9 @@ def fit(self, X, y, sample_weight=None): # solve A*coef = b # A = X' W X + l2 P2, b = X' W z coef = _irls_step(Xnew, W, P2, z) - else: - # with L1 penalty, start with coef = 0 - # TODO: Are there better options? - coef = np.zeros(n_features) - if self.fit_intercept: - coef[0] = link.link(np.mean(y)) - elif isinstance(self.start_params, str): - if self.start_params == 'zero': - coef = np.zeros(n_features) - elif self.start_params == 'least_squares': + elif start_params == 'least_squares': + # less restrictive tolerance for finding start values + tol = np.max([self.tol, np.sqrt(self.tol)]) if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) reg.fit(Xnew, link.link(y)) @@ -1384,18 +1391,21 @@ def fit(self, X, y, sample_weight=None): # => use Ridge # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha*n_samples, - tol=np.max([self.tol, np.sqrt(self.tol)])) + alpha=self.alpha*n_samples, tol=tol) reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: # TODO: Does this make sense at all? 
reg = ElasticNet(copy_X=True, fit_intercept=False, alpha=self.alpha, l1_ratio=self.l1_ratio, - tol=np.max([self.tol, np.sqrt(self.tol)])) + tol=tol) reg.fit(Xnew, link.link(y)) coef = reg.coef_ - else: + else: # start_params == 'zero' + coef = np.zeros(n_features) + if self.fit_intercept: + coef[0] = link.link(np.average(y, weights=weights)) + else: # assign given array as start values coef = start_params ####################################################################### @@ -1560,6 +1570,8 @@ def Hs(s): # some precalculations eta, mu, score, fisher = family._eta_mu_score_fisher( coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) + # set up space for search direction d for inner loop + d = np.zeros_like(coef) # initial stopping tolerance of inner loop # use L1-norm of minimum-norm of subgradient of F # fp_wP2 = f'(w) + w*P2 @@ -1574,8 +1586,8 @@ def Hs(s): # outer loop while self.n_iter_ < self.max_iter: self.n_iter_ += 1 - # initialize search direction d (to be optimized) - d = np.zeros_like(coef) + # initialize search direction d (to be optimized) with zero + d.fill(0) # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) # => active set of features for featurelist, see paper From 4071a8a54de0112fd1afd6d8fc5b5585708c84ea Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 20 Feb 2019 21:15:04 +0100 Subject: [PATCH 45/69] Improve argument handling of P1 and P2 * P2 also accepts 1d array and interprets it as diagonal matrix * improved input checks for P1 and P2 --- sklearn/linear_model/glm.py | 103 ++++++++++++++++--------- sklearn/linear_model/tests/test_glm.py | 31 ++++---- 2 files changed, 81 insertions(+), 53 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index fad7492acc2fb..33e0d75730e3a 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -45,7 +45,10 @@ from abc import ABCMeta, abstractmethod, abstractproperty import numbers import numpy as np -from scipy import linalg, optimize, sparse, special +from scipy import linalg, sparse +import scipy.sparse.linalg as splinalg +from scipy.optimize import fmin_l_bfgs_b +from scipy.special import xlogy import warnings from .base import LinearRegression from .coordinate_descent import ElasticNet @@ -727,7 +730,7 @@ def unit_deviance(self, y, mu): if p == 1: # PoissonDistribution # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 - return 2 * (special.xlogy(y, y/mu) - y + mu) + return 2 * (xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) @@ -840,7 +843,7 @@ def _irls_step(X, W, P2, z): XtW = X.transpose() * W A = XtW * X + L2 b = XtW * z - coef = sparse.linalg.spsolve(A, b) + coef = splinalg.spsolve(A, b) else: XtW = (X.T * W) A = XtW.dot(X) @@ -867,7 +870,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): with inverse link function `h` and s=`sample_weight` (for ``sample_weight=None``, one has s=1 and sum(s)=`n_samples`). - For `P1=P2=identity` (``P1=None``, ``P2=None``), the penalty is the + For ``P1=P2='identity'`` (``P1=None``, ``P2=None``), the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 @@ -904,21 +907,24 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. 
- P1 : {None, array-like}, shape (n_features,), optional \ - (default=None) + P1 : {'identity', array-like}, shape (n_features,), optional \ + (default='identity') With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The - default value ``None`` is the same as a 1d array of ones. + default value ``'identity'`` is the same as a 1d array of ones. Note that n_features = X.shape[1]. - P2 : {None, array-like, sparse matrix}, shape \ - (n_features, n_features), optional (default=None) - With this square matrix the L2 penalty is calculated as `w P2 w`. - This gives a fine control over this penalty (Tikhonov - regularization). The diagonal zeros of a diagonal P2, for example, - exclude all corresponding coefficients from the L2 penalty. - The default value ``None`` is the same as the identity matrix. - Note that n_features = X.shape[1]. P2 must be positive semi-definite. + P2 : {'identity', array-like, sparse matrix}, shape \ + (n_features,) or (n_features, n_features), optional \ + (default='identity') + With this option, you can set the P2 matrix in the L2 penalty `w*P2*w`. + This gives a fine control over this penalty (Tikhonov regularization). + A 2d array is directly used as the square matrix P2. A 1d array is + interpreted as diagonal (square) matrix. The default 'identity' sets + the identity matrix, which gives the usual squared L2-norm. If you just + want to exclude certain coefficients, pass a 1d array filled with 1, + and 0 for the coefficients to be excluded. + Note that P2 must be positive semi-definite. fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be @@ -1074,7 +1080,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Journal of Machine Learning Research 13 (2012) 1999-2030 https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ - def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, + def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', fit_intercept=True, family='normal', link='identity', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params='irls', @@ -1240,20 +1246,23 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument check_input must be bool; got " "(check_input={0})".format(self.check_input)) - if self.P1 is None: + if isinstance(self.P1, str) and self.P1 == 'identity': P1 = np.ones(X.shape[1]) else: - P1 = np.copy(np.atleast_1d(self.P1)) - if P1.dtype.kind not in ['b', 'i', 'u', 'f']: - raise ValueError("P1 must be a numeric value; " - "got (dtype={0}).".format(P1.dtype)) + P1 = np.atleast_1d(self.P1) + try: + P1 = P1.astype(np.float64, casting='safe', copy=True) + except TypeError: + raise TypeError("The given P1 cannot be converted to a numeric" + "array; got (P1.dtype={0})." + .format(P1.dtype)) if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): - raise ValueError("P1 must be either None or a 1d array with " - "the length of X.shape[1]; " + raise ValueError("P1 must be either 'identity' or a 1d array " + "with the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." 
.format(P1.shape[0], X.shape[1])) - if self.P2 is None: + if isinstance(self.P2, str) and self.P2 == 'identity': if not sparse.issparse(X): P2 = np.ones(X.shape[1]) else: @@ -1262,8 +1271,15 @@ def fit(self, X, y, sample_weight=None): else: P2 = check_array(self.P2, copy=True, accept_sparse=['csr', 'csc', 'coo'], - dtype="numeric", ensure_2d=True) - if ((P2.ndim != 2) or + dtype=_dtype, ensure_2d=False) + if P2.ndim == 1: + if P2.shape[0] != X.shape[1]: + raise ValueError("P2 should be a 1d array of shape " + "(n_features,) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0},)), needed ({1},)" + .format(P2.shape[0], X.shape[1])) + elif ((P2.ndim != 2) or (P2.shape[0] != P2.shape[1]) or (P2.shape[0] != X.shape[1])): raise ValueError("P2 must be either None or an array of shape " @@ -1319,21 +1335,32 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Sample weights must be non-negative.") # check if P1 has only non-negative values, negative values might # indicate group lasso in the future. - if self.P1 is not None: + if self.P1 != 'identity': if not np.all(P1 >= 0): raise ValueError("P1 must not have negative values.") # check if P2 is positive semidefinite # np.linalg.cholesky(P2) 'only' asserts positive definite - if self.P2 is not None: - if sparse.issparse(P2): - # TODO: check sparse P2 for non-negativeness - # raise NotImplementedError("Check sparse P2 for " - # "non-negativeness is not yet " - # "implemented.") - pass - elif P2.ndim == 2: - if not np.all(np.linalg.eigvals(P2) >= -1e-15): - raise ValueError("P2 must be positive definite.") + if self.P2 != 'identity': + # due to numerical precision, we allow eigenvalues to be a + # tiny bit negative + epsneg = 10 * np.finfo(P2.dtype).epsneg + if P2.ndim == 1 or P2.shape[0] == 1: + if not np.all(P2 >= 0): + raise ValueError("1d array P2 must not have negative " + "values.") + elif sparse.issparse(P2): + # for sparse matrices, not all eigenvals can be computed + # efficiently, use only half of n_features + # k = how many eigenvals to compute + k = np.min([10, n_features // 10 + 1]) + sigma = 0 # start searching near this value + which = 'SA' # find smallest algebraic eigenvalues first + if not np.all(splinalg.eigsh(P2, k=k, sigma=sigma, + which=which) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") + else: + if not np.all(linalg.eigvalsh(P2) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? 
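The dense branch of this new check reduces to testing that the smallest eigenvalue of P2 is not more than a tiny tolerance below zero. A standalone toy sketch of that idea (plain numpy/scipy, not part of the patch; note that the tolerance itself has to be a small negative number)::

    import numpy as np
    from scipy import linalg

    P2 = np.array([[2., -1.],
                   [-1., 2.]])  # hypothetical symmetric penalty matrix
    # allow eigenvalues to be a tiny bit negative due to rounding errors
    epsneg = -10 * np.finfo(P2.dtype).epsneg
    if not np.all(linalg.eigvalsh(P2) >= epsneg):
        raise ValueError("P2 must be positive semi-definite.")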
@@ -1520,7 +1547,7 @@ def Hs(s): args = (Xnew, y, weights, link) if solver == 'lbfgs': - coef, loss, info = optimize.fmin_l_bfgs_b( + coef, loss, info = fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, maxiter=self.max_iter) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 1ac5ccd4d3d5c..fde1604ad16e3 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -188,25 +188,26 @@ def test_glm_l1_ratio_argument(): assert_raises(ValueError, glm.fit, X, y) -def test_glm_P1_argument(): - """Test GLM P1 arguments - """ +@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3]]) +def test_glm_P1_argument(P1): + """Test GLM P1 arguments.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for P1 in [['a string', 'a string'], [1, [2]], [1, 2, 3]]: - glm = GeneralizedLinearRegressor(P1=P1) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(P1=P1) + with pytest.raises((ValueError, TypeError)): + glm.fit(X, y) -# def test_glm_P2_argument(): -# """Test GLM P2 arguments -# """ -# y = np.array([1, 2]) -# X = np.array([[1], [1]]) -# for P2 in [np.full((2, 2), 'a string', dtype=np.dtype(' Date: Wed, 20 Feb 2019 22:48:14 +0100 Subject: [PATCH 46/69] Fix doctest, test_poisson_enet, change IRLS to use lstsq, fix input checks * adapt examples of GeneralizedLinearModel to new defaults for P1, P2 and selection * fix precision/decimal issue in test_poisson_enet * use more robust least squares instead of solve in IRLS * fix sign error in input checks --- doc/modules/linear_model.rst | 16 +++++++++------- sklearn/linear_model/glm.py | 25 ++++++++++++++++--------- sklearn/linear_model/tests/test_glm.py | 24 ++++++++++++++++++++---- 3 files changed, 45 insertions(+), 20 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 174d1e4eddae4..e60e9e84a4747 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -924,15 +924,17 @@ follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE - GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, - copy_X=True, family='poisson', fit_dispersion=None, - fit_intercept=True, l1_ratio=0, link='log', max_iter=100, - random_state=None, selection='random', solver='auto', - start_params=None, tol=0.0001, verbose=0, warm_start=False) + GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, + check_input=True, copy_X=True, family='poisson', + fit_dispersion=None, fit_intercept=True, l1_ratio=0, + link='log', max_iter=100, random_state=None, + selection='cyclic', solver='auto', + start_params='irls', tol=0.0001, verbose=0, + warm_start=False) >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE - array([0.24630255, 0.43373521]) + array([0.24630169, 0.43373464]) >>> reg.intercept_ #doctest: +ELLIPSIS - -0.76383575... + -0.76383633... .. topic:: Examples: diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 33e0d75730e3a..bc1a0434fa3b0 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -830,8 +830,9 @@ def _irls_step(X, W, P2, z): ------- coef: array, shape (X.shape[1]) """ - # TODO: scipy.linalg.solve seems faster, but ordinary least squares uses - # scipy.linalg.lstsq. What is more appropriate? 
+ # Note: solve vs least squares, what is more appropriate? + # scipy.linalg.solve seems faster, but scipy.linalg.lstsq + # is more robust. n_samples, n_features = X.shape if sparse.issparse(X): W = sparse.dia_matrix((W, 0), shape=(n_samples, n_samples)).tocsr() @@ -843,7 +844,8 @@ def _irls_step(X, W, P2, z): XtW = X.transpose() * W A = XtW * X + L2 b = XtW * z - coef = splinalg.spsolve(A, b) + # coef = splinalg.spsolve(A, b) + coef, *_ = splinalg.lsmr(A, b) else: XtW = (X.T * W) A = XtW.dot(X) @@ -852,7 +854,8 @@ def _irls_step(X, W, P2, z): else: A += P2 b = XtW.dot(z) - coef = linalg.solve(A, b) + # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) + coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) return coef @@ -1340,12 +1343,15 @@ def fit(self, X, y, sample_weight=None): raise ValueError("P1 must not have negative values.") # check if P2 is positive semidefinite # np.linalg.cholesky(P2) 'only' asserts positive definite - if self.P2 != 'identity': + if not isinstance(self.P2, str): # self.P2 != 'identity' # due to numerical precision, we allow eigenvalues to be a # tiny bit negative - epsneg = 10 * np.finfo(P2.dtype).epsneg + epsneg = -10 * np.finfo(P2.dtype).epsneg if P2.ndim == 1 or P2.shape[0] == 1: - if not np.all(P2 >= 0): + p2 = P2 + if sparse.issparse(P2): + p2 = P2.toarray() + if not np.all(p2 >= 0): raise ValueError("1d array P2 must not have negative " "values.") elif sparse.issparse(P2): @@ -1360,6 +1366,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("P2 must be positive semi-definite.") else: if not np.all(linalg.eigvalsh(P2) >= epsneg): + return P2 raise ValueError("P2 must be positive semi-definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? @@ -1689,7 +1696,7 @@ def Hs(s): mn_subgrad = (np.where(coef + d == 0, np.sign(A)*np.maximum(np.abs(A)-P1, 0), A+np.sign(coef+d)*P1)) - mn_subgrad = np.sum(np.abs(mn_subgrad)) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= inner_tol: if inner_iter == 1: inner_tol = inner_tol/4. 
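The stopping criteria of the coordinate descent solver use the minimum-norm subgradient of the objective F(w) = f(w) + ||P1*w||_1, i.e. per coordinate the element of the subdifferential with the smallest absolute value. A toy numpy sketch of that elementwise formula with hypothetical values (not part of the patch)::

    import numpy as np

    fp = np.array([0.3, -0.2, 1.5])   # gradient of the smooth part f'(w)
    w = np.array([0.0, 0.0, -2.0])    # current coefficients
    P1 = np.ones_like(w)              # L1 penalty weights

    # where w_j == 0 the L1 term contributes the interval [-P1_j, P1_j],
    # so the gradient is shrunk towards zero by at most P1_j;
    # elsewhere the L1 term is differentiable with slope sign(w_j) * P1_j
    mn_subgrad = np.where(w == 0,
                          np.sign(fp) * np.maximum(np.abs(fp) - P1, 0),
                          fp + np.sign(w) * P1)
    np.abs(mn_subgrad).sum()          # L1 norm used for convergence: 0.5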
@@ -1740,7 +1747,7 @@ def Hs(s): mn_subgrad = (np.where(coef == 0, np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), fp_wP2+np.sign(coef)*P1)) - mn_subgrad = np.sum(np.abs(mn_subgrad)) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= self.tol: converged = True break diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index fde1604ad16e3..8893028d0176a 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -2,7 +2,7 @@ from numpy.testing import assert_allclose import pytest import scipy as sp -from scipy import sparse +from scipy import sparse, optimize from sklearn.linear_model.glm import ( Link, @@ -199,8 +199,7 @@ def test_glm_P1_argument(P1): @pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], - sparse.csr_matrix([1, 2, 3]), - sparse.lil_matrix([[1]])]) + sparse.csr_matrix([1, 2, 3])]) def test_glm_P2_argument(P2): """Test GLM P2 arguments.""" y = np.array([1, 2]) @@ -515,12 +514,29 @@ def test_poisson_enet(): X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', solver='cd', tol=1e-7, + link='log', solver='cd', tol=1e-8, selection='random', random_state=42) glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + # test results with general optimization procedure + def obj(coef): + pd = PoissonDistribution() + link = LogLink() + N = y.shape[0] + mu = link.inverse(X @ coef[1:]+coef[0]) + alpha, l1_ratio = (1, 0.5) + return 1./(2.*N) * pd.deviance(y, mu) \ + + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ + + alpha * l1_ratio * np.sum(np.abs(coef[1:])) + res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, + options={'maxiter': 1000, 'disp': False}) + assert_almost_equal(glm.intercept_, res.x[0], decimal=5) + assert_almost_equal(glm.coef_, res.x[1:], decimal=5) + assert_almost_equal(obj(np.concatenate(([glm.intercept_], glm.coef_))), + res.fun, decimal=8) + # same for start_params='zero' and selection='cyclic' # with reduced precision glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', From ed8e74f97d2b1921af4b8c2907c9e30629788bdc Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 23 Feb 2019 14:13:22 +0100 Subject: [PATCH 47/69] Use pytest decorators and pytest.raises --- sklearn/linear_model/glm.py | 7 +- sklearn/linear_model/tests/test_glm.py | 510 ++++++++++++------------- 2 files changed, 257 insertions(+), 260 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index bc1a0434fa3b0..f583e17433ee3 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -765,8 +765,8 @@ def __init__(self): super(InverseGaussianDistribution, self).__init__(power=3) -class GeneralizedHyperbolicSecand(ExponentialDispersionModel): - """A class for the Generalized Hyperbolic Secand (GHS) distribution. +class GeneralizedHyperbolicSecant(ExponentialDispersionModel): + """A class for the Generalized Hyperbolic Secant (GHS) distribution. The GHS distribution is for tagets y in (-inf, inf). """ @@ -1338,7 +1338,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Sample weights must be non-negative.") # check if P1 has only non-negative values, negative values might # indicate group lasso in the future. 
- if self.P1 != 'identity': + if not isinstance(self.P1, str): # if self.P1 != 'identity': if not np.all(P1 >= 0): raise ValueError("P1 must not have negative values.") # check if P2 is positive semidefinite @@ -1366,7 +1366,6 @@ def fit(self, X, y, sample_weight=None): raise ValueError("P2 must be positive semi-definite.") else: if not np.all(linalg.eigvalsh(P2) >= epsneg): - return P2 raise ValueError("P2 must be positive semi-definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 8893028d0176a..361a237f2cc9f 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -2,7 +2,7 @@ from numpy.testing import assert_allclose import pytest import scipy as sp -from scipy import sparse, optimize +from scipy import linalg, optimize, sparse from sklearn.linear_model.glm import ( Link, @@ -11,354 +11,355 @@ TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, - GeneralizedHyperbolicSecand, + GeneralizedHyperbolicSecant, GeneralizedLinearRegressor) from sklearn.linear_model import ElasticNet, Ridge from sklearn.utils.testing import ( assert_equal, assert_almost_equal, - assert_array_equal, assert_array_almost_equal, - assert_raises) + assert_array_equal, assert_array_almost_equal) -def test_link_properties(): - """Test link inverse and derivative - """ +@pytest.mark.parametrize('link', Link.__subclasses__()) +def test_link_properties(link): + """Test link inverse and derivative.""" rng = np.random.RandomState(0) x = rng.rand(100)*100 - # from sklearn.linear_model.glm import Link - # for link in vars()['Link'].__subclasses__(): - for link in Link.__subclasses__(): - link = link() - assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) - assert_almost_equal(link.inverse_derivative(link.link(x)), - 1/link.derivative(x), decimal=10) - - -def test_family_bounds(): - """Test the valid range of distributions - """ - family = NormalDistribution() - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [True, True, True]) - - family = PoissonDistribution() + link = link() # instatiate object + assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) + assert_almost_equal(link.inverse_derivative(link.link(x)), + 1/link.derivative(x), decimal=10) + + +@pytest.mark.parametrize( + 'family, expected', + [(NormalDistribution(), [True, True, True]), + (PoissonDistribution(), [False, True, True]), + (TweedieDistribution(power=1.5), [False, True, True]), + (GammaDistribution(), [False, False, True]), + (InverseGaussianDistribution(), [False, False, True]), + (TweedieDistribution(power=4.5), [False, False, True])]) +def test_family_bounds(family, expected): + """Test the valid range of distributions at -1, 0, 1.""" result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, True, True]) - - family = TweedieDistribution(power=1.5) - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, True, True]) - - family = GammaDistribution() - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, False, True]) - - family = InverseGaussianDistribution() - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, False, True]) - - family = TweedieDistribution(power=4.5) - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, False, True]) - - -def test_deviance_zero(): 
- """Test deviance(y,y) = 0 for different families - """ - for family in [NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=-2.5), - TweedieDistribution(power=-1), - TweedieDistribution(power=1.5), - TweedieDistribution(power=2.5), - TweedieDistribution(power=4), - GeneralizedHyperbolicSecand()]: - assert_almost_equal(family.deviance(0.1, 0.1), 0, decimal=10) - assert_almost_equal(family.deviance(1.5, 1.5), 0, decimal=10) - - -def test_fisher_matrix(): + assert_array_equal(result, expected) + + +@pytest.mark.parametrize( + 'family, chk_values', + [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), + (PoissonDistribution(), [0.1, 1.5]), + (GammaDistribution(), [0.1, 1.5]), + (InverseGaussianDistribution(), [0.1, 1.5]), + (TweedieDistribution(power=-2.5), [0.1, 1.5]), + (TweedieDistribution(power=-1), [0.1, 1.5]), + (TweedieDistribution(power=1.5), [0.1, 1.5]), + (TweedieDistribution(power=2.5), [0.1, 1.5]), + (TweedieDistribution(power=-4), [0.1, 1.5]), + (GeneralizedHyperbolicSecant(), [0.1, 1.5])]) +def test_deviance_zero(family, chk_values): + """Test deviance(y,y) = 0 for different families.""" + for x in chk_values: + assert_almost_equal(family.deviance(x, x), 0, decimal=10) + + +@pytest.mark.parametrize( + 'family, link', + [(NormalDistribution(), IdentityLink()), + (PoissonDistribution(), LogLink()), + (GammaDistribution(), LogLink()), + (InverseGaussianDistribution(), LogLink()), + (TweedieDistribution(power=1.5), LogLink()), + (TweedieDistribution(power=4.5), LogLink())]) +def test_fisher_matrix(family, link): """Test the Fisher matrix numerically. Trick: Use numerical differentiation with y = mu""" - for family in [NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution()]: - link = LogLink() - rng = np.random.RandomState(0) - coef = np.array([-2, 1, 0, 1, 2.5]) - phi = 0.5 - X = rng.randn(10, 5) - lin_pred = np.dot(X, coef) - mu = link.inverse(lin_pred) - weights = rng.randn(10)**2 + 1 - fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link) - approx = np.array([]).reshape(0, coef.shape[0]) - for i in range(coef.shape[0]): - def f(coef): - return -family._score(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link)[i] - approx = np.vstack( - [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)] - ) - assert_allclose(fisher, approx, rtol=1e-3) + rng = np.random.RandomState(0) + coef = np.array([-2, 1, 0, 1, 2.5]) + phi = 0.5 + X = rng.randn(10, 5) + lin_pred = np.dot(X, coef) + mu = link.inverse(lin_pred) + weights = rng.randn(10)**2 + 1 + fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + approx = np.array([]).reshape(0, coef.shape[0]) + for i in range(coef.shape[0]): + def f(coef): + return -family._score(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link)[i] + approx = np.vstack( + [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) + assert_allclose(fisher, approx, rtol=1e-3) def test_sample_weights_validation(): - """Test the raised errors in the validation of sample_weight""" + """Test the raised errors in the validation of sample_weight.""" # 1. scalar value but not positive X = [[1]] y = [1] weights = 0 glm = GeneralizedLinearRegressor(fit_intercept=False) - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 2. 
2d array weights = [[0]] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 3. 1d but wrong length weights = [1, 0] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 4. 1d but only zeros (sum not greater than 0) weights = [0, 0] X = [[0], [1]] y = [1, 2] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 5. 1d but weith a negative value weights = [2, -1] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) def test_glm_family_argument(): - """Test GLM family argument set as string - """ + """Test GLM family argument set as string.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) + X = np.array([[1], [2]]) for (f, fam) in [('normal', NormalDistribution()), ('poisson', PoissonDistribution()), ('gamma', GammaDistribution()), ('inverse.gaussian', InverseGaussianDistribution())]: - glm = GeneralizedLinearRegressor(family=f, fit_intercept=False, - alpha=0).fit(X, y) + glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) assert_equal(type(glm._family_instance), type(fam)) glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) - assert_raises(ValueError, glm.fit, X, y) + with pytest.raises(ValueError): + glm.fit(X, y) def test_glm_link_argument(): - """Test GLM link argument set as string - """ + """Test GLM link argument set as string.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) + X = np.array([[1], [2]]) for (l, link) in [('identity', IdentityLink()), ('log', LogLink())]: - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - link=l).fit(X, y) + glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) assert_equal(type(glm._link_instance), type(link)) - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - link='not a link') - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(family='normal', link='not a link') + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_alpha_argument(): - """Test GLM alpha argument - """ +@pytest.mark.parametrize('alpha', ['not a number', -4.2]) +def test_glm_alpha_argument(alpha): + """Test GLM for invalid alpha argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for alpha in ['not a number', -4.2]: - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - alpha=alpha) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', alpha=alpha) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_l1_ratio_argument(): - """Test GLM l1_ratio argument - """ +@pytest.mark.parametrize('l1_ratio', ['not a number', -4.2, 1.1, [1]]) +def test_glm_l1_ratio_argument(l1_ratio): + """Test GLM for invalid l1_ratio argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for l1_ratio in ['not a number', -4.2, 1.1, [1]]: - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - l1_ratio=l1_ratio) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) -@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3]]) +@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3], + [-1]]) def test_glm_P1_argument(P1): - """Test GLM P1 
arguments.""" + """Test GLM for invalid P1 argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(P1=P1) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(P1=P1, l1_ratio=0.5, check_input=True) with pytest.raises((ValueError, TypeError)): glm.fit(X, y) @pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], - sparse.csr_matrix([1, 2, 3])]) + sparse.csr_matrix([1, 2, 3]), [-1]]) def test_glm_P2_argument(P2): - """Test GLM P2 arguments.""" + """Test GLM for invalid P2 argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False) - with pytest.raises((ValueError, TypeError)): + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(P2=P2, check_input=True) + with pytest.raises(ValueError): glm.fit(X, y) -def test_glm_fit_intercept_argument(): - """Test GLM fit_intercept argument - """ - y = np.array([1, 2]) - X = np.array([[1], [1]]) - for fit_intercept in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) - assert_raises(ValueError, glm.fit, X, y) +def test_glm_P2_positive_semidefinite(): + """Test GLM for a positive semi-definite P2 argument.""" + n_samples, n_features = 10, 5 + rng = np.random.RandomState(42) + y = np.arange(n_samples) + X = np.zeros((n_samples, n_features)) + P2 = np.diag([100, 10, 5, 0, -1E-5]) + # construct random orthogonal matrix Q + Q, R = linalg.qr(rng.randn(n_features, n_features)) + P2 = Q.T @ P2 @ Q + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_solver_argument(): - """Test GLM solver argument - """ +@pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) +def test_glm_fit_intercept_argument(fit_intercept): + """Test GLM for invalid fit_intercept argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for solver in ['not a solver', 1, [1]]: - glm = GeneralizedLinearRegressor(solver=solver) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + with pytest.raises(ValueError): + glm.fit(X, y) - # solver not suitable for L1 penalty - for solver in ['irls', 'lbfgs', 'newton-cg']: - glm = GeneralizedLinearRegressor(solver=solver, alpha=1, l1_ratio=0.1) - assert_raises(ValueError, glm.fit, X, y) +@pytest.mark.parametrize('solver, l1_ratio', + [('not a solver', 0), (1, 0), ([1], 0), + ('irls', 0.5), ('lbfgs', 0.5), ('newton-cg', 0.5)]) +def test_glm_solver_argument(solver, l1_ratio): + """Test GLM for invalid solver argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(solver=solver, l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_max_iter_argument(): - """Test GLM max_iter argument - """ + +@pytest.mark.parametrize('max_iter', ['not a number', 0, -1, 5.5, [1]]) +def test_glm_max_iter_argument(max_iter): + """Test GLM for invalid max_iter argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for max_iter in ['not a number', 0, -1, 5.5, [1]]: - glm = GeneralizedLinearRegressor(max_iter=max_iter) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(max_iter=max_iter) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_tol_argument(): - """Test GLM tol argument - """ +@pytest.mark.parametrize('tol', ['not a number', 0, -1.0, [1e-3]]) +def test_glm_tol_argument(tol): + """Test GLM for 
invalid tol argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for tol in ['not a number', 0, -1.0, [1e-3]]: - glm = GeneralizedLinearRegressor(tol=tol) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(tol=tol) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_warm_start_argument(): - """Test GLM warm_start argument - """ +@pytest.mark.parametrize('warm_start', ['not bool', 1, 0, [True]]) +def test_glm_warm_start_argument(warm_start): + """Test GLM for invalid warm_start argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for warm_start in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(warm_start=warm_start) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(warm_start=warm_start) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_start_params_argument(): - """Test GLM start_params argument - """ +@pytest.mark.parametrize('start_params', + ['not a start_params', ['zero'], [0, 0, 0], + [[0, 0]], ['a', 'b']]) +def test_glm_start_params_argument(start_params): + """Test GLM for invalid start_params argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for start_params in ['not a start_params', ['zero'], [0, 0, 0], - [[0, 0]], ['a', 'b']]: - glm = GeneralizedLinearRegressor(start_params=start_params) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(start_params=start_params) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_selection_argument(): - """Test GLM selection argument - """ +@pytest.mark.parametrize('selection', ['not a selection', 1, 0, ['cyclic']]) +def test_glm_selection_argument(selection): + """Test GLM for invalid selection argument""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for selection in ['not a selection', 1, 0, ['cyclic']]: - glm = GeneralizedLinearRegressor(selection=selection) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(selection=selection) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_random_state_argument(): - """Test GLM random_state argument - """ +@pytest.mark.parametrize('random_state', ['a string', 0.5, [0]]) +def test_glm_random_state_argument(random_state): + """Test GLM for invalid random_state argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for random_state in ['a string', 0.5, [0]]: - glm = GeneralizedLinearRegressor(random_state=random_state) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(random_state=random_state) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_copy_X_argument(): - """Test GLM copy_X arguments - """ +@pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) +def test_glm_copy_X_argument(copy_X): + """Test GLM for invalid copy_X arguments.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for copy_X in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(copy_X=copy_X) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(copy_X=copy_X) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_check_input_argument(): - """Test GLM check_input argument - """ +@pytest.mark.parametrize('check_input', ['not bool', 1, 0, [True]]) +def test_glm_check_input_argument(check_input): + """Test GLM for invalid check_input argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for check_input in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(check_input=check_input) - 
assert_raises(ValueError, glm.fit, X, y) - + glm = GeneralizedLinearRegressor(check_input=check_input) + with pytest.raises(ValueError): + glm.fit(X, y) -# TODO: check additional validations if check_input == True -def test_glm_identiy_regression(): - """Test GLM regression with identity link on a simple dataset - """ +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecant()]) +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_glm_identiy_regression(family, solver): + """Test GLM regression with identity link on a simple dataset.""" coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) - families = ( - NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), - GeneralizedHyperbolicSecand()) - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - for family in families: - glm = GeneralizedLinearRegressor( - alpha=0, family=family, fit_intercept=False, solver=solver) - res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) - - -def test_glm_log_regression(): - """Test GLM regression with log link on a simple dataset - """ + glm = GeneralizedLinearRegressor(alpha=0, family=family, + fit_intercept=False, solver=solver) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + + +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecant()]) +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_glm_log_regression(family, solver): + """Test GLM regression with log link on a simple dataset.""" coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) - families = ( - NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), - GeneralizedHyperbolicSecand()) - for solver in ['irls', 'lbfgs', 'newton-cg']: - for family in families: - glm = GeneralizedLinearRegressor( + glm = GeneralizedLinearRegressor( alpha=0, family=family, link=LogLink(), fit_intercept=False, solver=solver, start_params='least_squares') - res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) @pytest.mark.filterwarnings('ignore::DeprecationWarning') -def test_normal_ridge(): - """Test ridge regression for Normal distributions +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_normal_ridge(solver): + """Test ridge regression for Normal distributions. Compare to test_ridge in test_ridge.py. 
""" @@ -375,25 +376,23 @@ def test_normal_ridge(): ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, solver='svd', normalize=False) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, - family='normal', link='identity', - fit_intercept=True, tol=1e-6, - max_iter=100, solver=solver, - random_state=42) - glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=1e-6, max_iter=100, solver=solver, + random_state=42) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, - family='normal', link='identity', - fit_intercept=False, solver='irls', - fit_dispersion='chisqr') + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=1e-6, max_iter=100, solver=solver, + random_state=42, fit_dispersion='chisqr') glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) @@ -413,23 +412,22 @@ def test_normal_ridge(): ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, - family='normal', link='identity', - fit_intercept=True, solver=solver, - max_iter=300, random_state=42) - glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, + family='normal', link='identity', + fit_intercept=True, solver=solver, + max_iter=300, random_state=42) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, family='normal', link='identity', - fit_intercept=False, solver='irls') + fit_intercept=False, solver=solver) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) @@ -438,7 +436,7 @@ def test_normal_ridge(): def test_poisson_ridge(): - """Test ridge regression with poisson family and LogLink + """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" # library("glmnet") @@ -470,7 +468,7 @@ def test_poisson_ridge(): def test_normal_enet(): - """Tet elastic net regression with normal/gaussian family""" + """Test elastic net regression with normal/gaussian family.""" rng = np.random.RandomState(0) alpha, l1_ratio = 0.3, 0.7 n_samples, n_features = 20, 2 @@ -495,7 +493,7 @@ def test_normal_enet(): def test_poisson_enet(): - """Test elastic net regression with poisson family and LogLink + """Test elastic net regression with poisson family and LogLink. Compare to R's glmnet""" # library("glmnet") From fe876da908a7d5aefe8fa9ac56f4c5130ccf83df Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 24 Feb 2019 12:45:55 +0100 Subject: [PATCH 48/69] Add Logistic regression=Binomial + Logit * add Binomial distribution * add Logit link * tests for binomial against LogisticRegression * option 'auto' for link * reduce code duplication by replacing @abstractproperty by @property --- sklearn/linear_model/glm.py | 160 +++++++++++++++---------- sklearn/linear_model/tests/test_glm.py | 57 +++++++-- 2 files changed, 149 insertions(+), 68 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index f583e17433ee3..01e40b322946c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -42,13 +42,12 @@ from __future__ import division -from abc import ABCMeta, abstractmethod, abstractproperty +from abc import ABCMeta, abstractmethod import numbers import numpy as np -from scipy import linalg, sparse +from scipy import linalg, sparse, special import scipy.sparse.linalg as splinalg from scipy.optimize import fmin_l_bfgs_b -from scipy.special import xlogy import warnings from .base import LinearRegression from .coordinate_descent import ElasticNet @@ -191,6 +190,28 @@ def inverse_derivative2(self, lin_pred): return np.exp(lin_pred) +class LogitLink(Link): + """The logit link function g(x)=logit(x).""" + + def link(self, mu): + return special.logit(mu) + + def derivative(self, mu): + return 1. / (mu * (1 - mu)) + + def inverse(self, lin_pred): + return special.expit(lin_pred) + + def inverse_derivative(self, lin_pred): + ep = special.expit(lin_pred) + return ep * (1. - ep) + + def inverse_derivative2(self, lin_pred): + ep = special.expit(lin_pred) + ep = special.expit(lin_pred) + return ep * (1. - ep) * (1. - 2 * ep) + + class ExponentialDispersionModel(metaclass=ABCMeta): r"""Base class for reproductive Exponential Dispersion Models (EDM). @@ -238,26 +259,25 @@ class ExponentialDispersionModel(metaclass=ABCMeta): https://en.wikipedia.org/wiki/Exponential_dispersion_model. 
""" - - @abstractproperty + @property def lower_bound(self): - """The lower bound of values of Y~EDM.""" - raise NotImplementedError() + """Get the lower bound of values for Y~EDM.""" + return self._lower_bound - @abstractproperty + @property def upper_bound(self): - """The upper bound of values of Y~EDM.""" - raise NotImplementedError() + """Get the upper bound of values for Y~EDM.""" + return self._upper_bound - @abstractproperty + @property def include_lower_bound(self): - """If True, values of y may equal lower bound: y >= lower_bound.""" - raise NotImplementedError() + """Get True if lower bound for y is included: y >= lower_bound.""" + return self._include_lower_bound - @abstractproperty + @property def include_upper_bound(self): - """If True, values of y may equal upper bound: y <= upper_bound.""" - raise NotImplementedError() + """Get True if upper bound for y is includede: y <= upper_bound.""" + return self._include_upper_bound def in_y_range(self, x): """Returns true if `x` is in the valid range of Y~EDM. @@ -685,22 +705,6 @@ def power(self, power): .format(power)) self._power = power - @property - def lower_bound(self): - return self._lower_bound - - @property - def upper_bound(self): - return self._upper_bound - - @property - def include_lower_bound(self): - return self._include_lower_bound - - @property - def include_upper_bound(self): - return self._include_upper_bound - def unit_variance(self, mu): """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. @@ -730,7 +734,7 @@ def unit_deviance(self, y, mu): if p == 1: # PoissonDistribution # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 - return 2 * (xlogy(y, y/mu) - y + mu) + return 2 * (special.xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) @@ -776,22 +780,6 @@ def __init__(self): self._include_lower_bound = False self._include_upper_bound = False - @property - def lower_bound(self): - return self._lower_bound - - @property - def upper_bound(self): - return self._upper_bound - - @property - def include_lower_bound(self): - return self._include_lower_bound - - @property - def include_upper_bound(self): - return self._include_upper_bound - def unit_variance(self, mu): return 1 + mu**2 @@ -803,6 +791,27 @@ def unit_deviance(self, y, mu): np.log((1+mu**2)/(1+y**2))) +class BinomialDistribution(ExponentialDispersionModel): + """A class for the Binomial distribution. + + The Binomial distribution is for tagets y in [0, 1]. + """ + def __init__(self): + self._lower_bound = 0 + self._upper_bound = 1 + self._include_lower_bound = True + self._include_upper_bound = True + + def unit_variance(self, mu): + return mu * (1 - mu) + + def unit_variance_derivative(self, mu): + return 1 - 2 * mu + + def unit_deviance(self, y, mu): + return 2*(special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) + + def _irls_step(X, W, P2, z): """Compute one step in iteratively reweighted least squares. @@ -933,15 +942,23 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ - of class ExponentialDispersionModel, optional(default='normal') + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian', 'binomial'} \ + or an instance of class ExponentialDispersionModel, \ + optional(default='normal') The distributional assumption of the GLM, i.e. 
which distribution from the EDM, specifies the loss function to be minimized. - link : {'identity', 'log'} or an instance of class Link, - optional (default='identity') + link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, + optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor - (X*coef) to expectation (mu). + (X*coef) to expectation (mu). Option 'auto' sets the link depending on + the chosen family as follows: + + - 'identity' for family 'normal' + + - 'log' for families 'poisson', 'gamma', 'inverse.gaussian' + + - 'logit' for family 'binomial' fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) Method for estimation of the dispersion parameter phi. Whether to use @@ -1084,7 +1101,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', - fit_intercept=True, family='normal', link='identity', + fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params='irls', selection='cyclic', random_state=None, copy_X=True, @@ -1159,27 +1176,48 @@ def fit(self, X, y, sample_weight=None): self._family_instance = GammaDistribution() elif self.family == 'inverse.gaussian': self._family_instance = InverseGaussianDistribution() + elif self.family == 'binomial': + self._family_instance = BinomialDistribution() else: raise ValueError( "The family must be an instance of class" " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian'];" - " got (family={0})".format(self.family)) + " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " + "'binomial']; got (family={0})".format(self.family)) # Guarantee that self._link_instance is set to an instance of # class Link if isinstance(self.link, Link): self._link_instance = self.link else: - if self.link == 'identity': + if self.link == 'auto': + if isinstance(self._family_instance, TweedieDistribution): + if self._family_instance.power <= 0: + self._link_instance = IdentityLink() + if self._family_instance.power >= 1: + self._link_instance = LogLink() + elif isinstance(self._family_instance, + GeneralizedHyperbolicSecant): + self._link_instance = IdentityLink() + elif isinstance(self._family_instance, BinomialDistribution): + self._link_instance = LogitLink() + else: + raise ValueError("No default link known for the " + "specified distribution family. Please " + "set link manually, i.e. 
not to 'auto'; " + "got (link='auto', family={}" + .format(self.family)) + elif self.link == 'identity': self._link_instance = IdentityLink() elif self.link == 'log': self._link_instance = LogLink() + elif self.link == 'logit': + self._link_instance = LogitLink() else: raise ValueError( - "The link must be an instance of class Link or" - " an element of ['identity', 'log']; got (link={0})" - .format(self.link)) + "The link must be an instance of class Link or " + "an element of ['auto', 'identity', 'log', 'logit']; " + "got (link={0})".format(self.link)) if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: raise ValueError("Penalty term must be a non-negative number;" diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 361a237f2cc9f..de0857a34fe3a 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -4,16 +4,18 @@ import scipy as sp from scipy import linalg, optimize, sparse +from sklearn.datasets import make_classification from sklearn.linear_model.glm import ( Link, IdentityLink, LogLink, + LogitLink, TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, - GeneralizedHyperbolicSecant, + GeneralizedHyperbolicSecant, BinomialDistribution, GeneralizedLinearRegressor) -from sklearn.linear_model import ElasticNet, Ridge +from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge from sklearn.utils.testing import ( assert_equal, assert_almost_equal, @@ -26,9 +28,19 @@ def test_link_properties(link): rng = np.random.RandomState(0) x = rng.rand(100)*100 link = link() # instatiate object - assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) - assert_almost_equal(link.inverse_derivative(link.link(x)), - 1/link.derivative(x), decimal=10) + decimal = 10 + if isinstance(link, LogitLink): + # careful for large x, note expit(36) = 1 + # limit max eta to 15 + x = x / 100 * 15 + decimal = 8 + assert_almost_equal(link.link(link.inverse(x)), x, decimal=decimal) + # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) + assert_almost_equal(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x), decimal=decimal) + # for LogitLink, in the following x should be between 0 and 1. 
+ # assert_almost_equal(link.inverse_derivative(link.link(x)), + # 1./link.derivative(x), decimal=decimal) @pytest.mark.parametrize( @@ -214,6 +226,12 @@ def test_glm_P2_positive_semidefinite(): with pytest.raises(ValueError): glm.fit(X, y) + P2 = sparse.csr_matrix(P2) + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + @pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) def test_glm_fit_intercept_argument(fit_intercept): @@ -331,7 +349,7 @@ def test_glm_identiy_regression(family, solver): coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) - glm = GeneralizedLinearRegressor(alpha=0, family=family, + glm = GeneralizedLinearRegressor(alpha=0, family=family, link='identity', fit_intercept=False, solver=solver) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) @@ -350,7 +368,7 @@ def test_glm_log_regression(family, solver): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) glm = GeneralizedLinearRegressor( - alpha=0, family=family, link=LogLink(), fit_intercept=False, + alpha=0, family=family, link='log', fit_intercept=False, solver=solver, start_params='least_squares') res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) @@ -557,3 +575,28 @@ def obj(coef): glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + +@pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) +def test_binomial_enet(alpha): + """Test elastic net regression with binomial family and LogitLink. + + Compare to LogisticRegression. + """ + l1_ratio = 0.5 + n_samples = 500 + X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, + n_informative=5, n_redundant=0, n_repeated=0, + random_state=0) + log = LogisticRegression( + penalty='elasticnet', random_state=0, fit_intercept=False, tol=1e-6, + max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), + solver='saga') + log.fit(X, y) + glm = GeneralizedLinearRegressor( + family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, + alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', + tol=1e-7) + glm.fit(X, y) + assert_almost_equal(log.intercept_[0], glm.intercept_, decimal=6) + assert_array_almost_equal(log.coef_[0, :], glm.coef_, decimal=6) From 2993e03dbfc89b068373718c82f65957639767ac Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 7 Apr 2019 15:33:27 +0200 Subject: [PATCH 49/69] More efficient sparse matrices and refactor of irls and cd solver * refactor into function _irls_solver * refactor into function _cd_solver * replace of safe_sparse_dot by matmul operator @ * more efficient handling of fisher matrix * sparse coo matrices are converted to csc or csr * sample weights don't except sparse matrices * minor doc changes --- sklearn/linear_model/glm.py | 1101 ++++++++++++++---------- sklearn/linear_model/tests/test_glm.py | 85 +- 2 files changed, 717 insertions(+), 469 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 01e40b322946c..b2de866a4b69d 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -55,13 +55,12 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y -from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg from ..utils.validation import check_is_fitted, 
check_random_state def _check_weights(sample_weight, n_samples): - """Check that weights are non-negative and have the right shape.""" + """Check that sample weights are non-negative and have the right shape.""" if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): @@ -70,7 +69,7 @@ def _check_weights(sample_weight, n_samples): weights = sample_weight * np.ones(n_samples) else: _dtype = [np.float64, np.float32] - weights = check_array(sample_weight, accept_sparse='csr', + weights = check_array(sample_weight, accept_sparse=False, force_all_finite=True, ensure_2d=False, dtype=_dtype) if weights.ndim > 1: @@ -247,12 +246,11 @@ class ExponentialDispersionModel(metaclass=ABCMeta): deviance_derivative starting_mu + _mu_deviance_derivative _score _fisher_matrix _observed_information - _deviance - _deviance_derivative - _deviance_hessian + _eta_mu_score_fisher References ---------- @@ -280,7 +278,7 @@ def include_upper_bound(self): return self._include_upper_bound def in_y_range(self, x): - """Returns true if `x` is in the valid range of Y~EDM. + """Returns ``True`` if x is in the valid range of Y~EDM. Parameters ---------- @@ -411,7 +409,7 @@ def unit_deviance_derivative(self, y, mu): mu : array, shape (n_samples,) Predicted mean. """ - return -2*(y-mu)/self.unit_variance(mu) + return -2 * (y - mu) / self.unit_variance(mu) def deviance(self, y, mu, weights=1): r"""Compute the deviance. @@ -434,13 +432,7 @@ def deviance(self, y, mu, weights=1): weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. """ - return np.sum(weights*self.unit_deviance(y, mu)) - - def _deviance(self, coef, X, y, weights, link): - """Compute the deviance as a function of the coefficients and data.""" - lin_pred = safe_sparse_dot(X, coef, dense_output=True) - mu = link.inverse(lin_pred) - return self.deviance(y, mu, weights) + return np.sum(weights * self.unit_deviance(y, mu)) def deviance_derivative(self, y, mu, weights=1): """Compute the derivative of the deviance w.r.t. mu. @@ -458,7 +450,36 @@ def deviance_derivative(self, y, mu, weights=1): weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. """ - return weights*self.unit_deviance_derivative(y, mu) + return weights * self.unit_deviance_derivative(y, mu) + + def starting_mu(self, y, weights=1, ind_weight=0.5): + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + + ind_weight : float (default=0.5) + Must be between 0 and 1. Specifies how much weight is given to the + individual observations instead of the mean of y. + """ + return (ind_weight * y + + (1. - ind_weight) * np.average(y, weights=weights)) + + def _mu_deviance_derivative(self, coef, X, y, weights, link): + """Compute mu, the deviance and it's derivative w.r.t coef.""" + lin_pred = X @ coef + mu = link.inverse(lin_pred) + dev = self.deviance(y, mu, weights) + d1 = link.inverse_derivative(lin_pred) + devp = X.T @ (d1 * self.deviance_derivative(y, mu, weights)) + return mu, dev, devp def _score(self, coef, phi, X, y, weights, link): r"""Compute the score function. 
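The chain rule behind ``_mu_deviance_derivative``, d(dev)/d(coef) = X.T @ (h'(eta) * d(dev)/d(mu)), can be verified against finite differences for the Poisson/log-link case; a standalone sketch with made-up data, not code from this patch::

    import numpy as np
    from scipy.special import xlogy
    from scipy.optimize import approx_fprime

    rng = np.random.RandomState(42)
    X = rng.rand(20, 3)
    y = rng.poisson(lam=2.0, size=20).astype(float)
    coef = 0.1 * rng.randn(3)

    def poisson_deviance(w):
        mu = np.exp(X @ w)                       # h(eta) = exp(eta) for the log link
        return 2.0 * np.sum(xlogy(y, y / mu) - y + mu)

    mu = np.exp(X @ coef)
    d1 = mu                                      # h'(eta) = exp(eta)
    ddev_dmu = -2.0 * (y - mu) / mu              # unit_deviance_derivative with v(mu) = mu
    analytic = X.T @ (d1 * ddev_dmu)             # simplifies to -2 * X.T @ (y - mu)
    numeric = approx_fprime(coef, poisson_deviance, 1e-7)
    assert np.allclose(analytic, numeric, rtol=1e-4, atol=1e-5)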
@@ -476,16 +497,14 @@ def _score(self, coef, phi, X, y, weights, link): with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. + Note: The derivative of the deviance w.r.t. coef equals -2 * score. """ - n_samples = X.shape[0] - lin_pred = safe_sparse_dot(X, coef, dense_output=True) + lin_pred = X @ coef mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d = link.inverse_derivative(lin_pred) - d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), - shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=True) - score = safe_sparse_dot(X.T, temp, dense_output=True) + temp = sigma_inv * d * (y - mu) + score = X.T @ temp return score def _fisher_matrix(self, coef, phi, X, y, weights, link): @@ -508,14 +527,13 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): see func:`_score`. """ n_samples = X.shape[0] - lin_pred = safe_sparse_dot(X, coef, dense_output=True) + lin_pred = X @ coef mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d2 = link.inverse_derivative(lin_pred)**2 d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) - fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) + fisher_matrix = X.T @ d2_sigma_inv @ X return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): @@ -542,7 +560,7 @@ def _observed_information(self, coef, phi, X, y, weights, link): see :func:`score_` function and :func:`_fisher_matrix`. """ n_samples = X.shape[0] - lin_pred = safe_sparse_dot(X, coef, dense_output=True) + lin_pred = X @ coef mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) dp = link.inverse_derivative2(lin_pred) @@ -551,80 +569,59 @@ def _observed_information(self, coef, phi, X, y, weights, link): r = y - mu temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), shape=(n_samples, n_samples)) - temp = safe_sparse_dot(temp, X, dense_output=False) - observed_information = safe_sparse_dot(X.T, temp, dense_output=False) + observed_information = X.T @ temp @ X return observed_information - def _deviance_derivative(self, coef, X, y, weights, link): - r"""Compute the derivative of the deviance w.r.t. coef. - - The derivative of the deviance w.r.t. `coef` (:math:`w`) as a - function of the coefficients `coef` and the data. - This is equivalent to :math:`-2\phi` times the score function - :func:`_score` (derivative of the log-likelihood). - """ - score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, - link=link) - return -2*score - - def _deviance_hessian(self, coef, X, y, weights, link): - r"""Compute the hessian matrix of the deviance w.r.t. coef. - - The hessian of the deviance w.r.t. `coef` (:math:`w`) is evaluated as - a function of the coefficients `coef` and the data. - It is equivalent to :math:`+2\phi` times the observed information - matrix. - """ - info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, - weights=weights, link=link) - return 2*info_matrix - - def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): + def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, + diag_fisher=False): """Compute linear predictor, mean, score function and fisher matrix. 
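The Fisher matrix used throughout is the sandwich product X.T @ diag(d) @ X with d = h'(eta)^2 / Var[y]. A standalone sketch with random data, showing that the product can be formed without materializing the diagonal matrix, by broadcasting in the dense case and via ``.multiply`` in the sparse case::

    import numpy as np
    from scipy import sparse

    rng = np.random.RandomState(0)
    X = rng.rand(50, 4)
    d = rng.rand(50) + 0.1                         # positive working weights

    fisher_dense = (X.T * d) @ X                   # dense: broadcast d over the rows of X
    Xs = sparse.csr_matrix(X)
    fisher_sparse = (Xs.transpose().multiply(d) @ Xs).toarray()
    assert np.allclose(fisher_dense, fisher_sparse)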
It calculates the linear predictor, the mean, score function (derivative of log-likelihood) and Fisher information matrix all in one go as function of `coef` (:math:`w`) and the data. + + Parameters + ---------- + diag_fisher : boolean, optional (default=False) + If ``True``, returns only an array d such that + fisher = X.T @ np.diag(d) @ X. + + Returns + ------- + (eta, mu, score, fisher) : tuple with 4 elements + The 4 elements are: + + * eta: ndarray, shape (X.shape[0],) + * mu: ndarray, shape (X.shape[0],) + * score: ndarray, shape (X.shape[0],) + * fisher: + + * If diag_fisher is ``False``, the full fisher matrix, + an array of shape (X.shape[1], X.shape[1]) + * If diag_fisher is ``True`, an array of shape (X.shape[0]) """ n_samples, n_features = X.shape # eta = linear predictor - eta = safe_sparse_dot(X, coef, dense_output=True) + eta = X @ coef mu = link.inverse(eta) sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) d1 = link.inverse_derivative(eta) # = h'(eta) # Alternatively: # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g # d1 = 1./link.derivative(mu) - d1_sigma_inv = sparse.dia_matrix((sigma_inv*d1, 0), - shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d1_sigma_inv, (y-mu), dense_output=True) - score = safe_sparse_dot(X.T, temp, dense_output=True) + score = X.T @ (sigma_inv * d1 * (y - mu)) # - d2_sigma_inv = sparse.dia_matrix((sigma_inv*(d1**2), 0), - shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) - fisher = safe_sparse_dot(X.T, temp, dense_output=False) - return eta, mu, score, fisher - - def starting_mu(self, y, weights=1, ind_weight=0.5): - """Set starting values for the mean mu. - - These may be good starting points for the (unpenalized) IRLS solver. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - - ind_weight : float (default=0.5) - Must be between 0 and 1. Specifies how much weight is given to the - individual observations instead of the mean of y. - """ - return (ind_weight * y + - (1. - ind_weight) * np.average(y, weights=weights)) + d2_sigma_inv = sigma_inv * (d1**2) + if diag_fisher: + return eta, mu, score, d2_sigma_inv + else: + if sparse.issparse(X): + d2_sigma_inv = sparse.dia_matrix((d2_sigma_inv, 0), + shape=(n_samples, n_samples)) + fisher = (X.T @ d2_sigma_inv @ X).toarray() + else: + fisher = (X.T * d2_sigma_inv) @ X + return eta, mu, score, fisher class TweedieDistribution(ExponentialDispersionModel): @@ -724,20 +721,20 @@ def unit_variance_derivative(self, mu): mu : array, shape (n_samples,) Predicted mean. 
""" - return self.power*np.power(mu, self.power-1) + return self.power * np.power(mu, self.power - 1) def unit_deviance(self, y, mu): p = self.power if p == 0: # NormalDistribution - return (y-mu)**2 + return (y - mu)**2 if p == 1: # PoissonDistribution # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 return 2 * (special.xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution - return 2 * (np.log(mu/y)+y/mu-1) + return 2 * (np.log(mu/y) + y/mu - 1) else: # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) @@ -784,11 +781,11 @@ def unit_variance(self, mu): return 1 + mu**2 def unit_variance_derivative(self, mu): - return 2*mu + return 2 * mu def unit_deviance(self, y, mu): - return (2*y*(np.arctan(y) - np.arctan(mu)) + - np.log((1+mu**2)/(1+y**2))) + return (2 * y * (np.arctan(y) - np.arctan(mu)) + + np.log((1 + mu**2)/(1 + y**2))) class BinomialDistribution(ExponentialDispersionModel): @@ -809,7 +806,7 @@ def unit_variance_derivative(self, mu): return 1 - 2 * mu def unit_deviance(self, y, mu): - return 2*(special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) + return 2 * (special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) def _irls_step(X, W, P2, z): @@ -824,20 +821,20 @@ def _irls_step(X, W, P2, z): Parameters ---------- - X : {numpy array, sparse matrix}, shape (n_samples, n_features) + X : {ndarray, sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present) - W : numpy array, shape (n_samples,) + W : ndarray, shape (n_samples,) - P2 : {numpy array, sparse matrix}, shape (n_features, n_features) + P2 : {ndarray, sparse matrix}, shape (n_features, n_features) The L2-penalty matrix or vector (=diagonal matrix) - z : numpy array, shape (n_samples,) + z : ndarray, shape (n_samples,) Working observations Returns ------- - coef: array, shape (X.shape[1]) + coef: ndarray, shape (X.shape[1]) """ # Note: solve vs least squares, what is more appropriate? # scipy.linalg.solve seems faster, but scipy.linalg.lstsq @@ -868,27 +865,422 @@ def _irls_step(X, W, P2, z): return coef +def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): + """Solve GLM with L2 penalty by IRLS algorithm. + + Note: If X is sparse, P2 must also be sparse. + """ + # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' + # Obj = objective function = 1/2 Dev + l2/2 w P2 w + # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 + # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) + # D2 = link.inverse_derivative(eta)^2 = D^2 + # W = D2/V(mu) + # l2 = alpha * (1 - l1_ratio) + # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w + # = -X' D (y-mu)/V(mu) + l2 P2 w + # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 + # Use Fisher matrix instead of full info matrix -X'(...) X, + # i.e. E[Dev''] with E[y-mu]=0: + # Obj'' ~ X' W X + l2 P2 + # (1): w = (X' W X + l2 P2)^-1 X' W z, + # with z = eta + D^-1 (y-mu) + # Note: P2 must be symmetrized + # Note: ' denotes derivative, but also transpose for matrices + + # eta = linear predictor + eta = X @ coef + mu = link.inverse(eta) + # D = h'(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + n_iter = 0 + while n_iter < max_iter: + n_iter += 1 + # coef_old not used so far. 
+ # coef_old = coef + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = hp**2 / V + # working observations + z = eta + (y - mu) / hp + # solve A*coef = b + # A = X' W X + P2, b = X' W z + coef = _irls_step(X, W, P2, z) + # updated linear predictor + # do it here for updated values for tolerance + eta = X @ coef + mu = link.inverse(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + + # which tolerace? |coef - coef_old| or gradient? + # use gradient for compliance with newton-cg and lbfgs + # gradient = -X' D (y-mu)/V(mu) + l2 P2 w + gradient = -(X.T @ (hp*(y-mu)/V)) + if P2.ndim == 1: + gradient += P2*coef + else: + gradient += P2 @ coef + if (np.max(np.abs(gradient)) <= tol): + converged = True + break + + if not converged: + warnings.warn("irls failed to converge. Increase the number " + "of iterations (currently {0})" + .format(max_iter), ConvergenceWarning) + + return coef, n_iter + + +def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=1000, selection='cyclic', + random_state=None, diag_fisher=False): + """Compute inner loop of coordinate descent = cycles through features. + + Minimization of 1-d subproblems:: + + min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 + + A = f'(w) + d*H(w) + (w+d)*P2 + B = H+P2 + Note: f'=-score and H=fisher are updated at the end of outer iteration. + """ + # TODO: use sparsity (coefficient already 0 due to L1 penalty) + # => active set of features for featurelist, see paper + # of Improved GLMNET or Gap Safe Screening Rules + # https://arxiv.org/abs/1611.05780 + n_samples, n_features = X.shape + B = fisher + if P2.ndim == 1: + coef_P2 = coef * P2 + if not diag_fisher: + B[np.diag_indices_from(B)] += P2 + else: + coef_P2 = P2 @ coef # P2 is symmetric, mat @ vec is usually faster + if not diag_fisher: + if sparse.issparse(P2): + B += P2.toarray() + else: + B += P2 + A = -score + coef_P2 # + d @ (H+P2) but d=0 so far + # inner loop + inner_iter = 0 + while inner_iter < max_inner_iter: + inner_iter += 1 + n_cycles += 1 + if selection == 'random': + featurelist = random_state.permutation(n_features) + else: + featurelist = np.arange(n_features) + for j in featurelist: + # minimize_z: a z + 1/2 b z^2 + c |d+z| + # a = A_j + # b = B_jj > 0 + # c = |P1_j| = P1_j > 0, see 1.3 + # d = w_j + d_j + # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) + # with beta = z+d, beta_hat = d-a/b and gamma = c/b + # z = 1/b * S(bd-a,c) - d + # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding + a = A[j] + if diag_fisher: + if sparse.issparse(X): + xj = X[:, j] + b = xj.transpose() @ xj.multiply(fisher[:, np.newaxis]) + b = b[0, 0] + else: + b = X[:, j] @ (fisher * X[:, j]) + + if P2.ndim == 1: + b += P2[j] + else: + b += P2[j, j] + else: + b = B[j, j] + + if b <= 0: + z = 0 + elif P1[j] == 0: + z = -a/b + elif a + P1[j] < b * (coef[j] + d[j]): + z = -(a + P1[j])/b + elif a - P1[j] > b * (coef[j] + d[j]): + z = -(a - P1[j])/b + else: + z = -(coef[j] + d[j]) + + # update direction d + d[j] += z + # update A because d_j is now d_j+z + # A = f'(w) + d*H(w) + (w+d)*P2 + # => A += (H+P2)*e_j z = B_j * z + # Note: B is symmetric B = B.transpose + if diag_fisher: + if sparse.issparse(X): + A += (X.transpose() @ + X[:, j].multiply(fisher[:, np.newaxis]) + ).toarray().ravel() * z + else: + # A += (X.T @ (fisher * X[:, j])) * z + # same without transpose of X + A += ((fisher * X[:, j]) @ X) * z + + if P2.ndim == 1: + A[j] += P2[j] * z + elif sparse.issparse(P2): + # slice columns as P2 is csc + A += P2[:, j].toarray().ravel() * z + else: + A += P2[:, j] * z + else: + # B is symmetric, C- or F-contiguous, but never sparse + if B.flags['F_CONTIGUOUS']: + # slice columns like for sparse csc + A += B[:, j] * z + else: # B.flags['C_CONTIGUOUS'] might be true + # slice rows + A += B[j, :] * z + # end of cycle + # stopping criterion for inner loop + # sum_i(|minimum of norm of subgrad of q(d)_i|) + mn_subgrad = np.where(coef + d == 0, + np.sign(A) * np.maximum(np.abs(A) - P1, 0), + A + np.sign(coef + d) * P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= inner_tol: + if inner_iter == 1: + inner_tol = inner_tol/4. + break + # end of inner loop + return d, coef_P2, n_cycles, inner_tol + + +def _cd_solver(coef, X, y, weights, P1, P2, family, link, + max_iter=100, max_inner_iter=1000, tol=1e-4, + selection='cyclic ', random_state=None, + diag_fisher=False, copy_X=True): + """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. + + The objective beeing minimized in the coefficients w=coef is:: + + F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 + + An Improved GLMNET for L1-regularized Logistic Regression: + + 1. Find optimal descent direction d by minimizing + min_d F(w+d) = min_d F(w+d) - F(w) + 2. Quadrdatic approximation of F(w+d)-F(w) = q(d): + using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: + q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + + ||P1*(w+d)||_1 - ||P1*w||_1 + Then minimize q(d): min_d q(d) + 3. Coordinate descent by updating coordinate j (d -> d+z*e_j): + min_z q(d+z*e_j) + = min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 + A = f'(w) + d*H(w) + (w+d)*P2 + B = H+P2 + + Repeat steps 1-3 until convergence. + Note: Use Fisher matrix instead of Hessian for H. + Note: f' = -score, H = Fisher matrix + + Parameters + ---------- + coef: ndarray, shape (n_features,) + + X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) + Training data (with intercept included if present). If not sparse, + pass directly as Fortran-contiguous data to avoid + unnecessary memory duplication. + + y : ndarray, shape (n_samples,) + Target values. + + weights: ndarray, shape (n_samples,) + Sample weights with which the deviance is weighted. The weights must + bee normalized and sum to 1. 
+ + P1 : {ndarray}, shape (n_features,) + The L1-penalty vector (=diagonal matrix) + + P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix). If a matrix is + passed, it must be symmetric. If X is sparse, P2 must also be sparse. + + family : ExponentialDispersionModel + + link : Link + + max_iter : int, optional (default=100) + Maximum numer of outer (Newton) iterations. + + max_inner_iter : int, optional (default=1000) + Maximum number of iterations, i.e. cycles over all features, in inner + loop. + + tol : float, optional (default=1e-4) + Covergence criterion is + sum_i(|minimum of norm of subgrad of objective_i|)<=tol. + + selection : str, optional (default='cyclic') + If 'random', randomly chose features in inner loop. + + random_state : {int, RandomState instance, None}, optional (default=None) + + diag_fisher : boolean, optional (default=False) + 'False' calculates full fisher matrix, 'True' only diagonal matrix s.t. + fisher = X.T @ diag @ X. This saves storage but needs more + matrix-vector multiplications. + + copy_X : boolean, optional (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + Returns + ------- + coef : ndarray, shape (n_features,) + + n_iter : numer of outer iterations = newton iterations + + n_cycles : number of cycles over features + + References + ---------- + Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + X = check_array(X, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if P2.ndim == 2: + P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if sparse.issparse(X): + if not sparse.isspmatrix_csc(X): + raise ValueError("If X is sparse, it must be in csc format" + "; got (format={})".format(X.format)) + if not sparse.isspmatrix_csc(P2): + raise ValueError("If X is sparse, P2 must also be sparse csc" + "format. Got P2 not sparse.") + random_state = check_random_state(random_state) + # Note: we already set P2 = l2*P2, P1 = l1*P1 + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + n_iter = 0 # number of outer iterations + n_cycles = 0 # number of (complete) cycles over features + converged = False + n_samples, n_features = X.shape + # line search parameters + (beta, sigma) = (0.5, 0.01) + # some precalculations + # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a + # 1d array representing a diagonal matrix. + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # set up space for search direction d for inner loop + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + # fp_wP2 = f'(w) + w*P2 + if P2.ndim == 1: + fp_wP2 = -score + coef * P2 + else: + # Note: P2 is symmetric and matrix @ vector is faster for sparse + # matrices. 
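The inner (and outer) stopping criterion is the L1 norm of the minimum-norm subgradient of F(w) = f(w) + ||P1*w||_1: at w_j = 0 the subgradient is the interval [g_j - P1_j, g_j + P1_j] and its element closest to zero is taken. A small standalone sketch with hypothetical numbers::

    import numpy as np

    def min_norm_subgrad(w, g, P1):
        # g = gradient of the smooth part f(w) + 1/2 w*P2*w
        return np.where(w == 0,
                        np.sign(g) * np.maximum(np.abs(g) - P1, 0),
                        g + np.sign(w) * P1)

    w = np.array([0.0, 0.5, -1.0])
    g = np.array([0.3, -2.0, 1.0])
    P1 = np.array([1.0, 1.0, 1.0])
    print(min_norm_subgrad(w, g, P1))   # [ 0. -1.  0.]; only w[1] violates optimality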
+ fp_wP2 = -score + P2 @ coef + inner_tol = np.where(coef == 0, + np.sign(fp_wP2) * np.maximum(np.abs(fp_wP2) - P1, 0), + fp_wP2 + np.sign(coef) * P1) + inner_tol = linalg.norm(inner_tol, ord=1) + # outer loop + while n_iter < max_iter: + n_iter += 1 + # initialize search direction d (to be optimized) with zero + d.fill(0) + # inner loop = _cd_cycle + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=max_inner_iter, selection=selection, + random_state=random_state, diag_fisher=diag_fisher) + # line search by sequence beta^k, k=0, 1, .. + # F(w + lambda d) - F(w) <= lambda * bound + # bound = sigma * (f'(w)*d + w*P2*d + # +||P1 (w+d)||_1 - ||P1 w||_1) + P1w_1 = linalg.norm(P1 * coef, ord=1) + # Note: coef_P2 already calculated and still valid + bound = sigma * (-(score @ d) + coef_P2 @ d + + linalg.norm(P1 * (coef + d), ord=1) - P1w_1) + Fw = (0.5 * family.deviance(y, mu, weights) + + 0.5 * (coef_P2 @ coef) + P1w_1) + la = 1./beta + for k in range(20): + la *= beta # starts with la=1 + coef_wd = coef + la * d + mu_wd = link.inverse(X @ coef_wd) + Fwd = (0.5 * family.deviance(y, mu_wd, weights) + + linalg.norm(P1 * coef_wd, ord=1)) + if P2.ndim == 1: + Fwd += 0.5 * ((coef_wd * P2) @ coef_wd) + else: + Fwd += 0.5 * (coef_wd @ (P2 @ coef_wd)) + if Fwd - Fw <= sigma * la * bound: + break + # update coefficients + # coef_old = coef.copy() + coef += la * d + # calculate eta, mu, score, Fisher matrix for next iteration + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # stopping criterion for outer loop + # sum_i(|minimum of norm of subgrad of F(w)_i|) + # fp_wP2 = f'(w) + w*P2 + # Note: eta, mu and score are already updated + if P2.ndim == 1: + fp_wP2 = -score + coef * P2 + else: + fp_wP2 = -score + P2 @ coef # P2 is symmetric, mat @ vec is faster + mn_subgrad = np.where(coef == 0, + np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), + fp_wP2 + np.sign(coef) * P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= tol: + converged = True + break + # end of outer loop + if not converged: + warnings.warn("Coordinate descent failed to converge. Increase" + " the number of iterations (currently {0})" + .format(max_iter), ConvergenceWarning) + + return coef, n_iter, n_cycles + + class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean `mu=h(X*w)`. Therefore the fit minimizes - the following objective function with combined L1 and L2 priors as - regularizer:: + fitting and predicting the mean of the target y as mu=h(X*w). Therefore, + the fit minimizes the following objective function with combined L1 and L2 + priors as regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + alpha * l1_ratio * ||P1*w||_1 + 1/2 * alpha * (1 - l1_ratio) * w*P2*w - with inverse link function `h` and s=`sample_weight` (for - ``sample_weight=None``, one has s=1 and sum(s)=`n_samples`). - For ``P1=P2='identity'`` (``P1=None``, ``P2=None``), the penalty is the - elastic net:: + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). 
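Written out in code, the objective above for the normal family (deviance = weighted squared error, identity link) and ``P1=P2='identity'`` looks as follows; a standalone sketch, not the estimator's internal implementation::

    import numpy as np

    def objective(w, X, y, s, alpha, l1_ratio):
        mu = X @ w                                 # identity link
        dev = np.sum(s * (y - mu) ** 2)            # normal family deviance
        return (dev / (2 * np.sum(s))
                + alpha * l1_ratio * np.sum(np.abs(w))
                + 0.5 * alpha * (1 - l1_ratio) * (w @ w))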
+ For ``P1=P2='identity'``, the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 - If you are interested in controlling the L1 and L2 penalty + If you are interested in controlling the L1 and L2 penalties separately, keep in mind that this is equivalent to:: a * L1 + b * L2 @@ -897,9 +1289,9 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): alpha = a + b and l1_ratio = a / (a + b) - The parameter `l1_ratio` corresponds to alpha in the glmnet R package while - 'alpha' corresponds to the lambda parameter in glmnet. Specifically, - l1_ratio = 1 is the lasso penalty. + The parameter ``l1_ratio`` corresponds to alpha in the R package glmnet, + while ``alpha`` corresponds to the lambda parameter in glmnet. + Specifically, l1_ratio = 1 is the lasso penalty. Read more in the :ref:`User Guide `. @@ -948,8 +1340,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The distributional assumption of the GLM, i.e. which distribution from the EDM, specifies the loss function to be minimized. - link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, - optional (default='auto') + link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, \ + optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). Option 'auto' sets the link depending on the chosen family as follows: @@ -982,7 +1374,10 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): 'cd' Coordinate descent algorithm. It can deal with L1 as well as L2 - penalties. + penalties. Note that in order to avoid unnecessary memory + duplication of the X argument in the ``fit`` method, X should be + directly passed as a Fortran-contiguous numpy array or sparse csc + matrix. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. @@ -990,8 +1385,10 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` - where ``g_i`` is the i-th component of the gradient (derivative of - the objective function). + where g_i is the i-th component of the gradient (derivative) of + the objective function. For the cd solver, covergence is reached + when ``sum_i(|minimum of norm of g_i|)``, where g_i is the + subgradient of the objective. warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` @@ -1007,7 +1404,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): 'irls' Start values of mu are calculated by family.starting_mu(..). Then, - one step of irls obtains start values for ``coef_`. This gives + one step of irls obtains start values for ``coef_``. This gives usually good results. 'least_squares' @@ -1042,6 +1439,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): RandomState instance used by `np.random`. Used when ``selection`` == 'random'. + diag_fisher : boolean, (default=False) + Only relevant for solver 'cd'. If ``False``, the full Fisher matrix + (expected Hessian) is computed in each outer iteretion (Newton + iteration). If ``True``, only a diagonal matrix (stored as 1d array) is + computed, such that fisher = X.T @ diag @ X. This saves memory and + matrix-matrix multiplications, but needs more matrix-vector + multiplications. 
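The trade-off can be stated in one line: with ``diag_fisher=True`` only the diagonal d is kept and every product fisher @ s is evaluated as X.T @ (d * (X @ s)) instead of forming the (n_features, n_features) matrix. A standalone sketch with random data::

    import numpy as np

    rng = np.random.RandomState(1)
    X = rng.rand(1000, 20)
    d = rng.rand(1000) + 0.1
    s = rng.randn(20)

    full = (X.T * d) @ X                 # diag_fisher=False: O(n_features**2) memory
    implicit = X.T @ (d * (X @ s))       # diag_fisher=True: matrix-vector products only
    assert np.allclose(full @ s, implicit)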
If you use large sparse X or if you have many + features, i.e. n_features >> n_samples, you might set this option to + ``True``. + copy_X : boolean, optional, default True If ``True``, X will be copied; else, it may be overwritten. @@ -1056,40 +1463,43 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Attributes ---------- coef_ : array, shape (n_features,) - Estimated coefficients for the linear predictor (X*coef_) in the GLM. + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. intercept_ : float Intercept (a.k.a. bias) added to linear predictor. dispersion_ : float - The dispersion parameter :math:`\\phi` if fit_dispersion is set. + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. n_iter_ : int - Actual number of iterations of the solver. + Actual number of iterations used in solver. Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + :ref:`User Guide `. The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - (penalized) maximum likelihood which is equivalent to minimizing the - deviance. + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. - For `alpha` > 0, the feature matrix `X` should be standardized in order to + For alpha > 0, the feature matrix X should be standardized in order to penalize features equally strong. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - If the target `y` is a ratio, appropriate sample weights `s` should be + If the target y is a ratio, appropriate sample weights s should be provided. - As an example, consider Poission distributed counts `z` (integers) and - weights `s=exposure` (time, money, persons years, ...). Then you fit - `y = z/s`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + As an example, consider Poission distributed counts z (integers) and + weights s=exposure (time, money, persons years, ...). Then you fit + y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, - in this case one might say that `y` has a 'scaled' Poisson distributions. + in this case one might say that y has a 'scaled' Poisson distributions. The same holds for other distributions. 
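A usage sketch of this exposure convention with made-up data, using the ``GeneralizedLinearRegressor`` added in this patch (import path as in the tests)::

    import numpy as np
    from sklearn.linear_model.glm import GeneralizedLinearRegressor

    rng = np.random.RandomState(0)
    X = rng.rand(100, 2)
    exposure = rng.uniform(0.5, 2.0, size=100)          # s, e.g. policy years
    lam = exposure * np.exp(0.5 + X @ np.array([1.0, -1.0]))
    z = rng.poisson(lam=lam)                            # observed counts
    y = z / exposure                                    # modeled ratio

    glm = GeneralizedLinearRegressor(family='poisson', link='log', alpha=0)
    glm.fit(X, y, sample_weight=exposure)               # weights = exposure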
References @@ -1104,8 +1514,8 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params='irls', - selection='cyclic', random_state=None, copy_X=True, - check_input=True, verbose=0): + selection='cyclic', random_state=None, diag_fisher=False, + copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio self.P1 = P1 @@ -1121,6 +1531,7 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', self.start_params = start_params self.selection = selection self.random_state = random_state + self.diag_fisher = diag_fisher self.copy_X = copy_X self.check_input = check_input self.verbose = verbose @@ -1154,11 +1565,13 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # 1.1 validate arguments of fit ####################################### _dtype = [np.float64, np.float32] - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - dtype=_dtype, y_numeric=True, multi_output=False) + X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], + dtype=_dtype, y_numeric=True, multi_output=False, + copy=self.copy_X) # Without converting y to float, deviance might raise # ValueError: Integers to negative integer powers are not allowed. - y = y.astype(np.float64) + # Also, y must not be sparse. + y = np.asarray(y, dtype=np.float64) weights = _check_weights(sample_weight, y.shape[0]) @@ -1263,7 +1676,7 @@ def fit(self, X, y, sample_weight=None): " correct length;" " got(start_params={0})".format(start_params)) else: - start_params = check_array(start_params, accept_sparse='csr', + start_params = check_array(start_params, accept_sparse=False, force_all_finite=True, ensure_2d=False, dtype=_dtype, copy=True) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or @@ -1274,12 +1687,14 @@ def fit(self, X, y, sample_weight=None): .format(X.shape[1] + self.fit_intercept, start_params.shape[0], start_params.ndim)) - if self.selection not in ['cyclic', 'random']: raise ValueError("The argument selection must be 'cyclic' or " "'random'; got (selection={0})" .format(self.selection)) random_state = check_random_state(self.random_state) + if not isinstance(self.diag_fisher, bool): + raise ValueError("The argument diag_fisher must be bool;" + " got {0}".format(self.diag_fisher)) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) @@ -1303,26 +1718,34 @@ def fit(self, X, y, sample_weight=None): "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." .format(P1.shape[0], X.shape[1])) + # If X is sparse, make P2 sparse, too. 
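As a usage sketch of the P1/P2 handling validated here: 1d arrays act as per-feature penalty weights, so e.g. the first feature can be left unpenalized (hypothetical data; the estimator is the one added in this patch)::

    import numpy as np
    from sklearn.linear_model.glm import GeneralizedLinearRegressor

    rng = np.random.RandomState(0)
    X = rng.rand(50, 4)
    y = X @ np.array([2.0, 1.0, 0.0, 0.0]) + 0.1

    P1 = np.array([0.0, 1.0, 1.0, 1.0])    # no L1 penalty on feature 0
    P2 = np.array([0.0, 1.0, 1.0, 1.0])    # no L2 penalty on feature 0
    glm = GeneralizedLinearRegressor(alpha=0.1, l1_ratio=0.5, P1=P1, P2=P2,
                                     solver='cd')
    glm.fit(X, y)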
if isinstance(self.P2, str) and self.P2 == 'identity': - if not sparse.issparse(X): - P2 = np.ones(X.shape[1]) - else: + if sparse.issparse(X): P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), shape=(X.shape[1], X.shape[1]))).tocsr() + else: + P2 = np.ones(X.shape[1]) else: P2 = check_array(self.P2, copy=True, - accept_sparse=['csr', 'csc', 'coo'], + accept_sparse=['csr', 'csc'], dtype=_dtype, ensure_2d=False) if P2.ndim == 1: + P2 = np.asarray(P2) if P2.shape[0] != X.shape[1]: raise ValueError("P2 should be a 1d array of shape " "(n_features,) with " "n_features=X.shape[1]; " "got (P2.shape=({0},)), needed ({1},)" .format(P2.shape[0], X.shape[1])) - elif ((P2.ndim != 2) or - (P2.shape[0] != P2.shape[1]) or - (P2.shape[0] != X.shape[1])): + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(X.shape[1], X.shape[1]))).tocsr() + elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and + P2.shape[0] == X.shape[1]): + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(X.shape[1], X.shape[1]))).tocsr() + else: raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " "n_features=X.shape[1]; " @@ -1335,7 +1758,8 @@ def fit(self, X, y, sample_weight=None): if self.fit_intercept: # Note: intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): - Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) + Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X], + format=X.format) else: Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) P1 = np.concatenate((np.array([0]), P1)) @@ -1343,6 +1767,7 @@ def fit(self, X, y, sample_weight=None): P2 = np.concatenate((np.array([0]), P2)) elif sparse.issparse(P2): P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), + format=P2.format, dtype=P2.dtype).tocsr() else: # as of numpy 1.13 this would work: @@ -1355,7 +1780,7 @@ def fit(self, X, y, sample_weight=None): n_samples, n_features = Xnew.shape l1 = self.alpha * self.l1_ratio - l2 = self.alpha * (1-self.l1_ratio) + l2 = self.alpha * (1 - self.l1_ratio) P1 *= l1 P2 *= l2 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') @@ -1484,315 +1909,114 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # algorithms for optimiation # TODO: Parallelize it? - self.n_iter_ = 0 - converged = False + # 4.1 IRLS ############################################################ - # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' - # Obj = objective function = 1/2 Dev + l2/2 w P2 w - # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 - # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) - # D2 = link.inverse_derivative(eta)^2 = D^2 - # W = D2/V(mu) - # l2 = alpha * (1 - l1_ratio) - # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w - # = -X' D (y-mu)/V(mu) + l2 P2 w - # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 - # Use Fisher matrix instead of full info matrix -X'(...) X, - # i.e. 
E[Dev''] with E[y-mu]=0: - # Obj'' ~ X' W X + l2 P2 - # (1): w = (X' W X + l2 P2)^-1 X' W z, - # with z = eta + D^-1 (y-mu) # Note: we already set P2 = l2*P2, see above # Note: we already symmetriezed P2 = 1/2 (P2 + P2') - # Note: ' denotes derivative, but also transpose for matrices if solver == 'irls': - # eta = linear predictor - eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = link.inverse(eta) - # D = h'(eta) - hp = link.inverse_derivative(eta) - V = family.variance(mu, phi=1, weights=weights) - while self.n_iter_ < self.max_iter: - self.n_iter_ += 1 - # coef_old not used so far. - # coef_old = coef - # working weights W, in principle a diagonal matrix - # therefore here just as 1d array - W = (hp**2 / V) - # working observations - z = eta + (y-mu)/hp - # solve A*coef = b - # A = X' W X + P2, b = X' W z - coef = _irls_step(Xnew, W, P2, z) - # updated linear predictor - # do it here for updated values for tolerance - eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = link.inverse(eta) - hp = link.inverse_derivative(eta) - V = family.variance(mu, phi=1, weights=weights) - - # which tolerace? |coef - coef_old| or gradient? - # use gradient for compliance with newton-cg and lbfgs - # gradient = family._deviance_derivative( - # coef=coef, X=Xnew, y=y, weights=weights, link=link) - # gradient = -X' D (y-mu)/V(mu) + l2 P2 w - gradient = -safe_sparse_dot(Xnew.T, hp*(y-mu)/V) + coef, self.n_iter_ = \ + _irls_solver(coef=coef, X=Xnew, y=y, weights=weights, P2=P2, + family=family, link=link, max_iter=self.max_iter, + tol=self.tol) + + # 4.2 L-BFGS ########################################################## + elif solver == 'lbfgs': + def func(coef, X, y, weights, P2, family, link): + mu, dev, devp = \ + family._mu_deviance_derivative(coef, X, y, weights, link) if P2.ndim == 1: - gradient += P2*coef + L2 = P2 * coef else: - gradient += safe_sparse_dot(P2, coef) - if (np.max(np.abs(gradient)) <= self.tol): - converged = True - break - - if not converged: - warnings.warn("irls failed to converge. Increase the number " - "of iterations (currently {0})" - .format(self.max_iter), ConvergenceWarning) - - # 4.2 L-BFGS and Newton-CG ############################################ - # TODO: performance: make one function return both deviance and - # gradient of deviance - elif solver in ['lbfgs', 'newton-cg']: - def func(coef, *args): + L2 = P2 @ coef + obj = 0.5 * dev + 0.5 * (coef @ L2) + objp = 0.5 * devp + L2 + return obj, objp + + args = (Xnew, y, weights, P2, family, link) + coef, loss, info = fmin_l_bfgs_b( + func, coef, fprime=None, args=args, + iprint=(self.verbose > 0) - 1, pgtol=self.tol, + maxiter=self.max_iter) + if self.verbose > 0: + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." + " Increase the number of iterations.", + ConvergenceWarning) + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}" + .format(info["task"])) + self.n_iter_ = info['nit'] + + # 4.3 Newton-CG ####################################################### + # We use again the fisher matrix instead of the hessian. More + # precisely, expected hessian of deviance. 
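The same pattern (objective and gradient returned together, ``fprime=None``) can be checked in isolation on an L2-penalized least-squares problem, where the minimizer has a closed form; a standalone sketch with made-up data::

    import numpy as np
    from scipy.optimize import fmin_l_bfgs_b

    rng = np.random.RandomState(0)
    X = rng.rand(50, 3)
    y = X @ np.array([1.0, 2.0, -1.0]) + 0.1 * rng.randn(50)
    l2 = 1.0

    def func(w):
        resid = y - X @ w
        obj = 0.5 * resid @ resid + 0.5 * l2 * (w @ w)
        grad = -X.T @ resid + l2 * w
        return obj, grad

    w_opt, loss, info = fmin_l_bfgs_b(func, np.zeros(3), factr=100, pgtol=1e-10)
    w_exact = np.linalg.solve(X.T @ X + l2 * np.eye(3), X.T @ y)
    assert np.allclose(w_opt, w_exact, atol=1e-4)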
+ elif solver == 'newton-cg': + def func(coef, X, y, weights, P2, family, link): if P2.ndim == 1: - L2 = safe_sparse_dot(coef.T, P2*coef) + L2 = coef @ (P2 * coef) else: - L2 = safe_sparse_dot(coef.T, safe_sparse_dot(P2, coef)) - # A[np.diag_indices_from(A)] += P2 - return 0.5*family._deviance(coef, *args) + 0.5*L2 + L2 = coef @ (P2 @ coef) + mu = link.inverse(X @ coef) + return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 - def fprime(coef, *args): + def grad(coef, X, y, weights, P2, family, link): if P2.ndim == 1: - L2 = P2*coef + L2 = P2 * coef else: - L2 = safe_sparse_dot(P2, coef) - return 0.5*family._deviance_derivative(coef, *args) + L2 + L2 = P2 @ coef + eta = X @ coef + mu = link.inverse(eta) + d1 = link.inverse_derivative(eta) + grad = X.T @ (d1 * family.deviance_derivative(y, mu, weights)) + return 0.5 * grad + L2 - def grad_hess(coef, X, y, weights, link): + def grad_hess(coef, X, y, weights, P2, family, link): if P2.ndim == 1: - L2 = P2*coef + L2 = P2 * coef else: - L2 = safe_sparse_dot(P2, coef) - grad = 0.5*family._deviance_derivative( - coef, X, y, weights, link) + L2 - hessian = 0.5*family._deviance_hessian( - coef, X, y, weights, link) - if P2.ndim == 1: - hessian[np.diag_indices_from(hessian)] += P2 - else: - hessian = hessian + P2 + L2 = P2 @ coef + eta = X @ coef + mu = link.inverse(eta) + d1 = link.inverse_derivative(eta) + grad = 0.5 * \ + (X.T @ (d1 * family.deviance_derivative(y, mu, weights))) \ + + L2 + # expected hessian = X.T @ diag_matrix @ X + # calculate only diag_matrix + diag = d1**2 / family.variance(mu, phi=1, weights=weights) def Hs(s): - ret = safe_sparse_dot(hessian, s) + ret = 0.5 * (X.T @ (diag * (X @ s))) + if P2.ndim == 1: + ret += P2 * s + else: + ret += P2 @ s return ret + return grad, Hs - args = (Xnew, y, weights, link) - - if solver == 'lbfgs': - coef, loss, info = fmin_l_bfgs_b( - func, coef, fprime=fprime, args=args, - iprint=(self.verbose > 0) - 1, pgtol=self.tol, - maxiter=self.max_iter) - if self.verbose > 0: - if info["warnflag"] == 1: - warnings.warn("lbfgs failed to converge." - " Increase the number of iterations.", - ConvergenceWarning) - elif info["warnflag"] == 2: - warnings.warn("lbfgs failed for the reason: {0}" - .format(info["task"])) - self.n_iter_ = info['nit'] - elif solver == 'newton-cg': - coef, n_iter_i = newton_cg(grad_hess, func, fprime, coef, - args=args, maxiter=self.max_iter, - tol=self.tol) - - # 4.3 coordinate descent ############################################## - # Reference: Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - # An Improved GLMNET for L1-regularized Logistic Regression, - # Journal of Machine Learning Research 13 (2012) 1999-2030 - # Note: Use Fisher matrix instead of Hessian for H - # - # 1. find optimal descent direction d by minimizing - # min_d F(w+d) = min_d F(w+d) - F(w) - # F = f + g, f(w) = 1/2 deviance, g(w) = 1/2 w*P2*w + ||P1*w||_1 - # 2. quadrdatic approximation of F(w+d)-F(w) = q(d): - # using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives - # q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d - # + ||P1*(w+d)||_1 - ||P1*w||_1 - # min_d q(d) - # 3. 
coordinate descent by updating coordinate j (d -> d+z*e_j): - # min_z q(d+z*e_j) - # = min_z q(d+z*e_j) - q(d) - # = min_z A_j z + 1/2 B_jj z^2 - # + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 - # A = f'(w) + d*H(w) + (w+d)*P2 - # B = H+P2 - # Note: we already set P2 = l2*P2, P1 = l1*P1, see above + args = (Xnew, y, weights, P2, family, link) + coef, n_iter_i = newton_cg(grad_hess, func, grad, coef, + args=args, maxiter=self.max_iter, + tol=self.tol) + + # 4.4 coordinate descent ############################################## + # Note: we already set P1 = l1*P1, see above + # Note: we already set P2 = l2*P2, see above # Note: we already symmetriezed P2 = 1/2 (P2 + P2') - # Note: f' = -score, H = Fisher matrix elif solver == 'cd': - # line search parameters - (beta, sigma) = (0.5, 0.01) - # max inner loops (cycles through all features) - max_inner_iter = 1000 - # some precalculations - eta, mu, score, fisher = family._eta_mu_score_fisher( - coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) - # set up space for search direction d for inner loop - d = np.zeros_like(coef) - # initial stopping tolerance of inner loop - # use L1-norm of minimum-norm of subgradient of F - # fp_wP2 = f'(w) + w*P2 - if P2.ndim == 1: - fp_wP2 = -score + coef*P2 - else: - fp_wP2 = -score + safe_sparse_dot(coef, P2) - inner_tol = (np.where(coef == 0, - np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), - fp_wP2+np.sign(coef)*P1)) - inner_tol = linalg.norm(inner_tol, ord=1) - # outer loop - while self.n_iter_ < self.max_iter: - self.n_iter_ += 1 - # initialize search direction d (to be optimized) with zero - d.fill(0) - # inner loop - # TODO: use sparsity (coefficient already 0 due to L1 penalty) - # => active set of features for featurelist, see paper - # of Improved GLMNET or Gap Safe Screening Rules - # https://arxiv.org/abs/1611.05780 - # A = f'(w) + d*H(w) + (w+d)*P2 - # B = H+P2 - # Note: f'=-score and H=fisher are updated at the end of outer - # iteration - B = fisher - if P2.ndim == 1: - coef_P2 = coef * P2 - B[np.diag_indices_from(B)] += P2 - else: - coef_P2 = safe_sparse_dot(coef, P2) - B = B + P2 - A = -score + coef_P2 # + d*(H+P2) but d=0 so far - inner_iter = 0 - while inner_iter < max_inner_iter: - inner_iter += 1 - if self.selection == 'random': - featurelist = random_state.permutation(n_features) - else: - featurelist = np.arange(n_features) - for j in featurelist: - # minimize_z: a z + 1/2 b z^2 + c |d+z| - # a = A_j - # b = B_jj > 0 - # c = |P1_j| = P1_j > 0, see 1.3 - # d = w_j + d_j - # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) - # with beta = z+d, beta_hat = d-a/b and gamma = c/b - # z = 1/b * S(bd-a,c) - d - # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding - a = A[j] - b = B[j, j] - if P1[j] == 0: - if b == 0: - z = 0 - else: - z = -a/b - elif a + P1[j] < b * (coef[j]+d[j]): - if b == 0: - z = 0 - else: - z = -(a + P1[j])/b - elif a - P1[j] > b * (coef[j]+d[j]): - if b == 0: - z = 0 - else: - z = -(a - P1[j])/b - else: - z = -(coef[j] + d[j]) - # update direction d - d[j] += z - # update A because d_j is now d_j+z - # A = f'(w) + d*H(w) + (w+d)*P2 - # => A += (H+P2)*e_j z = B_j * z - # Note: B is symmetric B = B.transpose - if sparse.issparse(B): - if sparse.isspmatrix_csc(B): - # slice columns - A += B[:, j].toarray().ravel() * z - else: - # slice rows - A += B[j, :].toarray().ravel() * z - else: - A += B[j, :] * z - # end of cycle - # stopping criterion for inner loop - # sum_i(|minimum-norm subgrad of q(d)_i|) - mn_subgrad = (np.where(coef + d == 0, - np.sign(A)*np.maximum(np.abs(A)-P1, 0), - A+np.sign(coef+d)*P1)) - mn_subgrad = linalg.norm(mn_subgrad, ord=1) - if mn_subgrad <= inner_tol: - if inner_iter == 1: - inner_tol = inner_tol/4. - break - # end of inner loop - # line search by sequence beta^k, k=0, 1, .. - # F(w + lambda d) - F(w) <= lambda * bound - # bound = sigma * (f'(w)*d + w*P2*d - # +||P1 (w+d)||_1 - ||P1 w||_1) - P1w_1 = linalg.norm(P1*coef, ord=1) - # Note: coef_P2 already calculated and still valid - bound = sigma * ( - safe_sparse_dot(-score, d) + - safe_sparse_dot(coef_P2, d) + - linalg.norm(P1*(coef+d), ord=1) - - P1w_1) - Fw = (0.5 * family.deviance(y, mu, weights) + - 0.5 * safe_sparse_dot(coef_P2, coef) + - P1w_1) - la = 1./beta - for k in range(20): - la *= beta # starts with la=1 - mu_wd = link.inverse(safe_sparse_dot(Xnew, coef+la*d, - dense_output=True)) - Fwd = (0.5 * family.deviance(y, mu_wd, weights) + - linalg.norm(P1*(coef+la*d), ord=1)) - if P2.ndim == 1: - Fwd += 0.5 * safe_sparse_dot((coef+la*d)*P2, coef+la*d) - else: - Fwd += 0.5 * (safe_sparse_dot(coef+la*d, - safe_sparse_dot(P2, coef+la*d))) - if Fwd-Fw <= sigma*la*bound: - break - # update coefficients - # coef_old = coef.copy() - coef += la * d - # calculate eta, mu, score, Fisher matrix for next iteration - eta, mu, score, fisher = family._eta_mu_score_fisher( - coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) - # stopping criterion for outer loop - # sum_i(|minimum-norm subgrad of F(w)_i|) - # fp_wP2 = f'(w) + w*P2 - # Note: eta, mu and score are already updated - if P2.ndim == 1: - fp_wP2 = -score + coef*P2 - else: - fp_wP2 = -score + safe_sparse_dot(coef, P2) - mn_subgrad = (np.where(coef == 0, - np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), - fp_wP2+np.sign(coef)*P1)) - mn_subgrad = linalg.norm(mn_subgrad, ord=1) - if mn_subgrad <= self.tol: - converged = True - break - # end of outer loop - if not converged: - warnings.warn("Coordinate descent failed to converge. 
Increase" - " the number of iterations (currently {0})" - .format(self.max_iter), ConvergenceWarning) + # For coordinate descent, if X is sparse, it should be csc format + # If X is sparse, P2 must also be csc + if sparse.issparse(Xnew): + Xnew = Xnew.tocsc(copy=self.copy_X) + P2 = sparse.csc_matrix(P2) + + coef, self.n_iter_, self._n_cycles = \ + _cd_solver(coef=coef, X=Xnew, y=y, weights=weights, P1=P1, + P2=P2, family=family, link=link, + max_iter=self.max_iter, tol=self.tol, + selection=self.selection, random_state=random_state, + diag_fisher=self.diag_fisher, copy_X=self.copy_X) ####################################################################### # 5. postprocessing # @@ -1828,8 +2052,7 @@ def linear_predictor(self, X): X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', copy=True, ensure_2d=True, allow_nd=False) - return safe_sparse_dot(X, self.coef_, - dense_output=True) + self.intercept_ + return X @ self.coef_ + self.intercept_ def predict(self, X, sample_weight=None): """Predict uing GLM with feature matrix X. @@ -1884,7 +2107,7 @@ def estimate_phi(self, X, y, sample_weight=None): dtype=_dtype, y_numeric=True, multi_output=False) n_samples, n_features = X.shape weights = _check_weights(sample_weight, n_samples) - eta = safe_sparse_dot(X, self.coef_, dense_output=True) + eta = X @ self.coef_ if self.fit_intercept is True: eta += self.intercept_ n_features += 1 @@ -1907,19 +2130,19 @@ def estimate_phi(self, X, y, sample_weight=None): # "AssertionError: -0.28014056555724598 not greater than 0.5" # unless GeneralizedLinearRegressor has a score which passes the test. def score(self, X, y, sample_weight=None): - r"""Compute D^2, the percentage of deviance explained. + """Compute D^2, the percentage of deviance explained. D^2 is a generalization of the coefficient of determination R^2. R^2 uses squared error and D^2 deviance. Note that those two are equal for family='normal'. D^2 is defined as - :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` - is the null deviance, i.e. the deviance of a model with intercept - alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean - :math:`\bar{y}` is averaged by sample_weight. - Best possible score is 1.0 and it can be negative (because the - model can be arbitrarily worse). + :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, + :math:`D_{null}` is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. + The mean :math:`\\bar{y}` is averaged by sample_weight. + Best possible score is 1.0 and it can be negative (because the model + can be arbitrarily worse). 
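For the normal family the deviance is the squared error, so D^2 reduces to the familiar R^2; a standalone sketch with made-up numbers::

    import numpy as np

    y_true = np.array([3.0, 1.0, 4.0, 1.0, 5.0])
    y_pred = np.array([2.5, 1.5, 3.5, 1.0, 4.5])
    weights = np.ones_like(y_true)

    dev = np.sum(weights * (y_true - y_pred) ** 2)         # D(y_true, y_pred)
    y_mean = np.average(y_true, weights=weights)
    dev_null = np.sum(weights * (y_true - y_mean) ** 2)    # D_null
    d2 = 1 - dev / dev_null                                # equals R^2 for this family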
Parameters ---------- diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index de0857a34fe3a..ae8a5f4cfc5e7 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -138,16 +138,18 @@ def test_sample_weights_validation(): glm.fit(X, y, weights) -def test_glm_family_argument(): +@pytest.mark.parametrize('f, fam', + [('normal', NormalDistribution()), + ('poisson', PoissonDistribution()), + ('gamma', GammaDistribution()), + ('inverse.gaussian', InverseGaussianDistribution()), + ('binomial', BinomialDistribution())]) +def test_glm_family_argument(f, fam): """Test GLM family argument set as string.""" - y = np.array([1, 2]) + y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) - for (f, fam) in [('normal', NormalDistribution()), - ('poisson', PoissonDistribution()), - ('gamma', GammaDistribution()), - ('inverse.gaussian', InverseGaussianDistribution())]: - glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) - assert_equal(type(glm._family_instance), type(fam)) + glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) + assert_equal(type(glm._family_instance), type(fam)) glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) @@ -155,14 +157,16 @@ def test_glm_family_argument(): glm.fit(X, y) -def test_glm_link_argument(): +@pytest.mark.parametrize('l, link', + [('identity', IdentityLink()), + ('log', LogLink()), + ('logit', LogitLink())]) +def test_glm_link_argument(l, link): """Test GLM link argument set as string.""" - y = np.array([1, 2]) + y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) - for (l, link) in [('identity', IdentityLink()), - ('log', LogLink())]: - glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) - assert_equal(type(glm._link_instance), type(link)) + glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) + assert_equal(type(glm._link_instance), type(link)) glm = GeneralizedLinearRegressor(family='normal', link='not a link') with pytest.raises(ValueError): @@ -317,6 +321,16 @@ def test_glm_random_state_argument(random_state): glm.fit(X, y) +@pytest.mark.parametrize('diag_fisher', ['not bool', 1, 0, [True]]) +def test_glm_diag_fisher_argument(diag_fisher): + """Test GLM for invalid diag_fisher arguments.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) + with pytest.raises(ValueError): + glm.fit(X, y) + + @pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) def test_glm_copy_X_argument(copy_X): """Test GLM for invalid copy_X arguments.""" @@ -453,7 +467,12 @@ def test_normal_ridge(solver): assert_array_almost_equal(glm.predict(T), ridge.predict(T)) -def test_poisson_ridge(): +@pytest.mark.parametrize('solver, decimal, tol', + [('irls', 7, 1e-8), + ('lbfgs', 5, 1e-7), + ('newton-cg', 5, 1e-7), + ('cd', 7, 1e-8)]) +def test_poisson_ridge(solver, decimal, tol): """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" @@ -470,22 +489,20 @@ def test_poisson_ridge(): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 5, 'cd': 7} - s_tol = {'irls': 1e-8, 'lbfgs': 1e-7, 'newton-cg': 1e-7, 'cd': 1e-8} - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, - fit_intercept=True, family='poisson', - link='log', tol=s_tol[solver], - solver=solver, max_iter=300, - random_state=42) - glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.12889386979, - decimal=s_dec[solver]) - assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], - decimal=s_dec[solver]) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, + fit_intercept=True, family='poisson', + link='log', tol=tol, + solver=solver, max_iter=300, + random_state=42) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, + decimal=decimal) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=decimal) -def test_normal_enet(): +@pytest.mark.parametrize('diag_fisher', [False, True]) +def test_normal_enet(diag_fisher): """Test elastic net regression with normal/gaussian family.""" rng = np.random.RandomState(0) alpha, l1_ratio = 0.3, 0.7 @@ -494,12 +511,14 @@ def test_normal_enet(): beta = rng.randn(n_features) y = 2 + np.dot(X, beta) + rng.randn(n_samples) + # 1. test normal enet on dense data glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, family='normal', link='identity', fit_intercept=True, tol=1e-8, max_iter=100, selection='cyclic', solver='cd', start_params='zero', - check_input=False) + check_input=False, + diag_fisher=diag_fisher) glm.fit(X, y) enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, @@ -509,6 +528,12 @@ def test_normal_enet(): assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + # 2. test normal enet on sparse data + X = sparse.csc_matrix(X) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + def test_poisson_enet(): """Test elastic net regression with poisson family and LogLink. From a6f9f13db9dd11d01d1dcab93819e67d5ca18b9d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 20 Apr 2019 15:59:39 +0200 Subject: [PATCH 50/69] Treat the intercept separately, i.e. X, P1, P2 never include intercept --- sklearn/linear_model/glm.py | 637 +++++++++++++++---------- sklearn/linear_model/tests/test_glm.py | 6 +- 2 files changed, 386 insertions(+), 257 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index b2de866a4b69d..2afd1ddf8c79c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -7,8 +7,6 @@ # License: BSD 3 clause # TODO: Write more examples. -# TODO: Make option self.copy_X more meaningful. -# So far, fit uses Xnew instead of X. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support? @@ -39,6 +37,7 @@ # sklearn.linear_models uses w for coefficients, standard literature on # GLMs use beta for coefficients and w for (sample) weights. # So far, coefficients=w and sample weights=s. +# - The intercept term is the first index, i.e. 
coef[0] from __future__ import division @@ -86,6 +85,83 @@ def _check_weights(sample_weight, n_samples): return weights +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +def _safe_sandwich_dot(X, d, intercept=False): + """Compute sandwich product X.T @ diag(d) @ X. + + With ``intercept=True``, X is treated as if a column of 1 were appended as + first column of X. + X can be sparse, d must be an ndarray. Always returns a ndarray.""" + if sparse.issparse(X): + temp = (X.transpose().multiply(d) @ X).toarray() + else: + temp = (X.T * d) @ X + if intercept: + dim = X.shape[1] + 1 + if sparse.issparse(X): + order = 'F' if sparse.isspmatrix_csc(X) else 'C' + else: + order = 'F' if X.flags['F_CONTIGUOUS'] else 'C' + res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order) + res[0, 0] = d.sum() + res[1:, 0] = d @ X + res[0, 1:] = res[1:, 0] + res[1:, 1:] = temp + else: + res = temp + return res + + +def _min_norm_sugrad(coef, grad, P2, P1): + """Compute the gradient of all subgradients with minimal L2-norm. + + subgrad = grad + P2 * coef + P1 * subgrad(|coef|_1) + + g_i = grad_i + (P2*coef)_i + + if coef_i > 0: g_i + P1_i + if coef_i < 0: g_i - P1_i + if coef_i = 0: sign(g_i) * max(|g_i|-P1_i, 0) + + Parameters + ---------- + coef : ndarray + coef[0] may be intercept. + + grad : ndarray, shape=coef.shape + + P2 : {1d or 2d array, None} + always without intercept, ``None`` means P2 = 0 + + P1 : ndarray + always without intercept + """ + intercept = (coef.size == P1.size + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + # compute grad + coef @ P2 without intercept + grad_wP2 = grad[idx:].copy() + if P2 is None: + pass + elif P2.ndim == 1: + grad_wP2 += coef[idx:] * P2 + else: + grad_wP2 += coef[idx:] @ P2 + res = np.where(coef[idx:] == 0, + np.sign(grad_wP2) * np.maximum(np.abs(grad_wP2) - P1, 0), + grad_wP2 + np.sign(coef[idx:]) * P1) + if intercept: + return np.concatenate(([grad[0]], res)) + else: + return res + + class Link(metaclass=ABCMeta): """Abstract base class for Link funtions.""" @@ -473,13 +549,16 @@ def starting_mu(self, y, weights=1, ind_weight=0.5): (1. - ind_weight) * np.average(y, weights=weights)) def _mu_deviance_derivative(self, coef, X, y, weights, link): - """Compute mu, the deviance and it's derivative w.r.t coef.""" - lin_pred = X @ coef + """Compute mu and the derivative of the deviance w.r.t coef.""" + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) - dev = self.deviance(y, mu, weights) d1 = link.inverse_derivative(lin_pred) - devp = X.T @ (d1 * self.deviance_derivative(y, mu, weights)) - return mu, dev, devp + temp = d1 * self.deviance_derivative(y, mu, weights) + if coef.size == X.shape[1] + 1: + devp = np.concatenate(([temp.sum()], temp @ X)) + else: + devp = temp @ X # sampe as X.T @ temp + return mu, devp def _score(self, coef, phi, X, y, weights, link): r"""Compute the score function. @@ -499,12 +578,15 @@ def _score(self, coef, phi, X, y, weights, link): :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. Note: The derivative of the deviance w.r.t. coef equals -2 * score. 
""" - lin_pred = X @ coef + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d = link.inverse_derivative(lin_pred) temp = sigma_inv * d * (y - mu) - score = X.T @ temp + if coef.size == X.shape[1] + 1: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as X.T @ temp return score def _fisher_matrix(self, coef, phi, X, y, weights, link): @@ -526,14 +608,14 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, see func:`_score`. """ - n_samples = X.shape[0] - lin_pred = X @ coef + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) - d2 = link.inverse_derivative(lin_pred)**2 - d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), - shape=(n_samples, n_samples)) - fisher_matrix = X.T @ d2_sigma_inv @ X + d = link.inverse_derivative(lin_pred) + d2_sigma_inv = sigma_inv * d * d + intercept = (coef.size == X.shape[1] + 1) + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): @@ -559,17 +641,17 @@ def _observed_information(self, coef, phi, X, y, weights, link): \right)`, see :func:`score_` function and :func:`_fisher_matrix`. """ - n_samples = X.shape[0] - lin_pred = X @ coef + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) dp = link.inverse_derivative2(lin_pred) d2 = link.inverse_derivative(lin_pred)**2 v = self.unit_variance_derivative(mu)/self.unit_variance(mu) r = y - mu - temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), - shape=(n_samples, n_samples)) - observed_information = X.T @ temp @ X + temp = sigma_inv * (-dp * r + d2 * v * r + d2) + intercept = (coef.size == X.shape[1] + 1) + observed_information = _safe_sandwich_dot(X, temp, + intercept=intercept) return observed_information def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, @@ -600,28 +682,29 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, an array of shape (X.shape[1], X.shape[1]) * If diag_fisher is ``True`, an array of shape (X.shape[0]) """ - n_samples, n_features = X.shape + intercept = (coef.size == X.shape[1] + 1) # eta = linear predictor - eta = X @ coef + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) d1 = link.inverse_derivative(eta) # = h'(eta) # Alternatively: # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g # d1 = 1./link.derivative(mu) - score = X.T @ (sigma_inv * d1 * (y - mu)) - # - d2_sigma_inv = sigma_inv * (d1**2) + d1_sigma_inv = d1 * sigma_inv + temp = d1_sigma_inv * (y - mu) + if intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X + + d2_sigma_inv = d1 * d1_sigma_inv if diag_fisher: - return eta, mu, score, d2_sigma_inv + fisher_matrix = d2_sigma_inv else: - if sparse.issparse(X): - d2_sigma_inv = sparse.dia_matrix((d2_sigma_inv, 0), - shape=(n_samples, n_samples)) - fisher = (X.T @ d2_sigma_inv @ X).toarray() - else: - fisher = (X.T * d2_sigma_inv) @ X - return eta, mu, score, fisher + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) + return eta, mu, score, fisher_matrix class TweedieDistribution(ExponentialDispersionModel): @@ -809,7 +892,7 @@ def unit_deviance(self, y, mu): return 2 * (special.xlogy(y, y/mu) 
+ special.xlogy(1-y, (1-y)/(1-mu))) -def _irls_step(X, W, P2, z): +def _irls_step(X, W, P2, z, fit_intercept=True): """Compute one step in iteratively reweighted least squares. Solve A w = b for w with @@ -829,43 +912,57 @@ def _irls_step(X, W, P2, z): P2 : {ndarray, sparse matrix}, shape (n_features, n_features) The L2-penalty matrix or vector (=diagonal matrix) - z : ndarray, shape (n_samples,) + z : ndarray, shape (n_samples,) Working observations + fit_intercept : boolean, optional (default=True) + Returns ------- - coef: ndarray, shape (X.shape[1]) + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. """ # Note: solve vs least squares, what is more appropriate? # scipy.linalg.solve seems faster, but scipy.linalg.lstsq # is more robust. - n_samples, n_features = X.shape - if sparse.issparse(X): - W = sparse.dia_matrix((W, 0), shape=(n_samples, n_samples)).tocsr() + # Note: X.T @ W @ X is not sparse, even when X is sparse. + # Sparse solver would splinalg.spsolve(A, b) or splinalg.lsmr(A, b) + if fit_intercept: + Wz = W * z + if sparse.issparse(X): + b = np.concatenate(([Wz.sum()], X.transpose() @ Wz)) + else: + b = np.concatenate(([Wz.sum()], X.T @ Wz)) + A = _safe_sandwich_dot(X, W, intercept=fit_intercept) if P2.ndim == 1: - L2 = (sparse.dia_matrix((P2, 0), shape=(n_features, n_features)) - ).tocsr() + idx = np.arange(start=1, stop=A.shape[0]) + A[(idx, idx)] += P2 # add to diag elements without intercept + elif sparse.issparse(P2): + A[1:, 1:] += P2.toarray() else: - L2 = sparse.csr_matrix(P2) - XtW = X.transpose() * W - A = XtW * X + L2 - b = XtW * z - # coef = splinalg.spsolve(A, b) - coef, *_ = splinalg.lsmr(A, b) + A[1:, 1:] += P2 else: - XtW = (X.T * W) - A = XtW.dot(X) + if sparse.issparse(X): + XtW = X.transpose().multiply(W) + A = (XtW @ X).toarray() + else: + XtW = (X.T * W) + A = XtW @ X + b = XtW @ z if P2.ndim == 1: A[np.diag_indices_from(A)] += P2 + elif sparse.issparse(P2): + A += P2.toarray() else: A += P2 - b = XtW.dot(z) - # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) - coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) + # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) + coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) return coef -def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): +def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, + max_iter, tol): """Solve GLM with L2 penalty by IRLS algorithm. Note: If X is sparse, P2 must also be sparse. @@ -889,7 +986,7 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): # Note: ' denotes derivative, but also transpose for matrices # eta = linear predictor - eta = X @ coef + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) # D = h'(eta) hp = link.inverse_derivative(eta) @@ -906,10 +1003,10 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): z = eta + (y - mu) / hp # solve A*coef = b # A = X' W X + P2, b = X' W z - coef = _irls_step(X, W, P2, z) + coef = _irls_step(X, W, P2, z, fit_intercept=fit_intercept) # updated linear predictor # do it here for updated values for tolerance - eta = X @ coef + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) hp = link.inverse_derivative(eta) V = family.variance(mu, phi=1, weights=weights) @@ -917,11 +1014,18 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): # which tolerace? |coef - coef_old| or gradient? 
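# Illustration (standalone sketch, not part of the patch): one penalized IRLS
# update for a Poisson GLM with log link and a scalar L2 penalty, mirroring
# the working weights W and working observations z used by _irls_step above.
# Toy data, dense arrays, no intercept.
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(20, 3)
y = rng.poisson(lam=np.exp(X @ np.array([0.5, -0.2, 0.1])))
l2 = 0.1                                  # P2 = l2 * identity
coef = np.zeros(X.shape[1])

for _ in range(25):
    eta = X @ coef                        # linear predictor
    mu = np.exp(eta)                      # inverse of the log link
    hp = mu                               # h'(eta) = exp(eta)
    W = hp**2 / mu                        # working weights, variance(mu) = mu
    z = eta + (y - mu) / hp               # working observations
    A = (X.T * W) @ X + l2 * np.eye(X.shape[1])
    b = X.T @ (W * z)
    coef_new = np.linalg.solve(A, b)
    if np.max(np.abs(coef_new - coef)) < 1e-8:
        coef = coef_new
        break
    coef = coef_new
# at the fixed point, X.T @ (y - mu) - l2 * coef == 0 (penalized score eq.)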
# use gradient for compliance with newton-cg and lbfgs # gradient = -X' D (y-mu)/V(mu) + l2 P2 w - gradient = -(X.T @ (hp*(y-mu)/V)) + temp = hp * (y - mu) / V + if sparse.issparse(X): + gradient = -(X.transpose() @ temp) + else: + gradient = -(X.T @ temp) + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - gradient += P2*coef + gradient += P2 * coef[idx:] else: - gradient += P2 @ coef + gradient += P2 @ coef[idx:] + if fit_intercept: + gradient = np.concatenate(([-temp.sum()], gradient)) if (np.max(np.abs(gradient)) <= tol): converged = True break @@ -937,7 +1041,7 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, max_inner_iter=1000, selection='cyclic', random_state=None, diag_fisher=False): - """Compute inner loop of coordinate descent = cycles through features. + """Compute inner loop of coordinate descent, i.e. cycles through features. Minimization of 1-d subproblems:: @@ -953,24 +1057,31 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, # of Improved GLMNET or Gap Safe Screening Rules # https://arxiv.org/abs/1611.05780 n_samples, n_features = X.shape + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept B = fisher if P2.ndim == 1: - coef_P2 = coef * P2 + coef_P2 = coef[idx:] * P2 if not diag_fisher: - B[np.diag_indices_from(B)] += P2 + idiag = np.arange(start=idx, stop=B.shape[0]) + # B[np.diag_indices_from(B)] += P2 + B[(idiag, idiag)] += P2 else: - coef_P2 = P2 @ coef # P2 is symmetric, mat @ vec is usually faster + coef_P2 = coef[idx:] @ P2 if not diag_fisher: if sparse.issparse(P2): - B += P2.toarray() + B[idx:, idx:] += P2.toarray() else: - B += P2 - A = -score + coef_P2 # + d @ (H+P2) but d=0 so far + B[idx:, idx:] += P2 + # A = -score + coef_P2 + A = -score + A[idx:] += coef_P2 + # A += d @ (H+P2) but so far d=0 # inner loop - inner_iter = 0 - while inner_iter < max_inner_iter: + for inner_iter in range(1, max_inner_iter+1): inner_iter += 1 n_cycles += 1 + # cycle through features, update intercept separately at the end if selection == 'random': featurelist = random_state.permutation(n_features) else: @@ -985,70 +1096,85 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, # with beta = z+d, beta_hat = d-a/b and gamma = c/b # z = 1/b * S(bd-a,c) - d # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding - a = A[j] + jdx = j+idx # index for arrays containing entries for intercept + a = A[jdx] if diag_fisher: + # Note: fisher is ndarray of shape (n_samples,) => no idx + # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway + Bj = np.zeros_like(A) + if intercept: + Bj[0] = fisher.sum() if sparse.issparse(X): - xj = X[:, j] - b = xj.transpose() @ xj.multiply(fisher[:, np.newaxis]) - b = b[0, 0] + Bj[idx:] = (X[:, j].transpose().multiply(fisher) @ X + ).toarray().ravel() else: - b = X[:, j] @ (fisher * X[:, j]) + Bj[idx:] = (fisher * X[:, j]) @ X if P2.ndim == 1: - b += P2[j] + Bj[idx:] += P2[j] else: - b += P2[j, j] + if sparse.issparse(P2): + # slice columns as P2 is csc + Bj[idx:] += P2[:, j].toarray().ravel() + else: + Bj[idx:] += P2[:, j] + b = Bj[jdx] else: - b = B[j, j] + b = B[jdx, jdx] + # those ten lines aree what it is all about if b <= 0: z = 0 elif P1[j] == 0: z = -a/b - elif a + P1[j] < b * (coef[j] + d[j]): + elif a + P1[j] < b * (coef[jdx] + d[jdx]): z = -(a + P1[j])/b - elif a - P1[j] > b * (coef[j] + d[j]): + elif a - P1[j] > b * 
(coef[jdx] + d[jdx]): z = -(a - P1[j])/b else: - z = -(coef[j] + d[j]) + z = -(coef[jdx] + d[jdx]) # update direction d - d[j] += z + d[jdx] += z # update A because d_j is now d_j+z # A = f'(w) + d*H(w) + (w+d)*P2 # => A += (H+P2)*e_j z = B_j * z # Note: B is symmetric B = B.transpose if diag_fisher: - if sparse.issparse(X): - A += (X.transpose() @ - X[:, j].multiply(fisher[:, np.newaxis]) - ).toarray().ravel() * z - else: - # A += (X.T @ (fisher * X[:, j])) * z - # same without transpose of X - A += ((fisher * X[:, j]) @ X) * z - - if P2.ndim == 1: - A[j] += P2[j] * z - elif sparse.issparse(P2): - # slice columns as P2 is csc - A += P2[:, j].toarray().ravel() * z - else: - A += P2[:, j] * z + # Bj = B[:, j] calculated above, still valid + A += Bj * z else: # B is symmetric, C- or F-contiguous, but never sparse if B.flags['F_CONTIGUOUS']: # slice columns like for sparse csc - A += B[:, j] * z + A += B[:, jdx] * z else: # B.flags['C_CONTIGUOUS'] might be true # slice rows - A += B[j, :] * z - # end of cycle + A += B[jdx, :] * z + # end of cycle over features + # update intercept + if intercept: + if diag_fisher: + Bj = np.zeros_like(A) + Bj[0] = fisher.sum() + Bj[1:] = fisher @ X + b = Bj[0] + else: + b = B[0, 0] + z = 0 if b <= 0 else -A[0]/b + d[0] += z + if diag_fisher: + A += Bj * z + else: + if B.flags['F_CONTIGUOUS']: + A += B[:, 0] * z + else: + A += B[0, :] * z + # end of complete cycle # stopping criterion for inner loop # sum_i(|minimum of norm of subgrad of q(d)_i|) - mn_subgrad = np.where(coef + d == 0, - np.sign(A) * np.maximum(np.abs(A) - P1, 0), - A + np.sign(coef + d) * P1) + # subgrad q(d) = A + subgrad ||P1*(w+d)||_1 + mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1) mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= inner_tol: if inner_iter == 1: @@ -1058,7 +1184,7 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, return d, coef_P2, n_cycles, inner_tol -def _cd_solver(coef, X, y, weights, P1, P2, family, link, +def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, max_iter=100, max_inner_iter=1000, tol=1e-4, selection='cyclic ', random_state=None, diag_fisher=False, copy_X=True): @@ -1083,7 +1209,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 A = f'(w) + d*H(w) + (w+d)*P2 - B = H+P2 + B = H + P2 Repeat steps 1-3 until convergence. Note: Use Fisher matrix instead of Hessian for H. @@ -1091,7 +1217,9 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, Parameters ---------- - coef: ndarray, shape (n_features,) + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present). If not sparse, @@ -1112,6 +1240,10 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, The L2-penalty matrix or vector (=diagonal matrix). If a matrix is passed, it must be symmetric. If X is sparse, P2 must also be sparse. + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + family : ExponentialDispersionModel link : Link @@ -1120,8 +1252,8 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, Maximum numer of outer (Newton) iterations. max_inner_iter : int, optional (default=1000) - Maximum number of iterations, i.e. 
cycles over all features, in inner - loop. + Maximum number of iterations in each inner loop, i.e. max number of + cycles over all features per inner loop. tol : float, optional (default=1e-4) Covergence criterion is @@ -1133,8 +1265,8 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, random_state : {int, RandomState instance, None}, optional (default=None) diag_fisher : boolean, optional (default=False) - 'False' calculates full fisher matrix, 'True' only diagonal matrix s.t. - fisher = X.T @ diag @ X. This saves storage but needs more + ``False`` calculates full fisher matrix, ``True`` only diagonal matrix + s.t. fisher = X.T @ diag @ X. This saves storage but needs more matrix-vector multiplications. copy_X : boolean, optional (default=True) @@ -1142,7 +1274,9 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, Returns ------- - coef : ndarray, shape (n_features,) + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. n_iter : numer of outer iterations = newton iterations @@ -1174,6 +1308,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, n_cycles = 0 # number of (complete) cycles over features converged = False n_samples, n_features = X.shape + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept # line search parameters (beta, sigma) = (0.5, 0.01) # some precalculations @@ -1186,16 +1321,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, d = np.zeros_like(coef) # initial stopping tolerance of inner loop # use L1-norm of minimum of norm of subgradient of F - # fp_wP2 = f'(w) + w*P2 - if P2.ndim == 1: - fp_wP2 = -score + coef * P2 - else: - # Note: P2 is symmetric and matrix @ vector is faster for sparse - # matrices. - fp_wP2 = -score + P2 @ coef - inner_tol = np.where(coef == 0, - np.sign(fp_wP2) * np.maximum(np.abs(fp_wP2) - P1, 0), - fp_wP2 + np.sign(coef) * P1) + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) inner_tol = linalg.norm(inner_tol, ord=1) # outer loop while n_iter < max_iter: @@ -1211,23 +1337,23 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, # F(w + lambda d) - F(w) <= lambda * bound # bound = sigma * (f'(w)*d + w*P2*d # +||P1 (w+d)||_1 - ||P1 w||_1) - P1w_1 = linalg.norm(P1 * coef, ord=1) + P1w_1 = linalg.norm(P1 * coef[idx:], ord=1) + P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1) # Note: coef_P2 already calculated and still valid - bound = sigma * (-(score @ d) + coef_P2 @ d + - linalg.norm(P1 * (coef + d), ord=1) - P1w_1) + bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1) Fw = (0.5 * family.deviance(y, mu, weights) + - 0.5 * (coef_P2 @ coef) + P1w_1) + 0.5 * (coef_P2 @ coef[idx:]) + P1w_1) la = 1./beta for k in range(20): la *= beta # starts with la=1 coef_wd = coef + la * d - mu_wd = link.inverse(X @ coef_wd) + mu_wd = link.inverse(_safe_lin_pred(X, coef_wd)) Fwd = (0.5 * family.deviance(y, mu_wd, weights) + - linalg.norm(P1 * coef_wd, ord=1)) + linalg.norm(P1 * coef_wd[idx:], ord=1)) if P2.ndim == 1: - Fwd += 0.5 * ((coef_wd * P2) @ coef_wd) + Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:]) else: - Fwd += 0.5 * (coef_wd @ (P2 @ coef_wd)) + Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:])) if Fwd - Fw <= sigma * la * bound: break # update coefficients @@ -1238,16 +1364,10 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, coef=coef, phi=1, X=X, y=y, weights=weights, link=link, diag_fisher=diag_fisher) # stopping criterion for outer loop - # sum_i(|minimum of norm of 
subgrad of F(w)_i|) + # sum_i(|minimum-norm of subgrad of F(w)_i|) # fp_wP2 = f'(w) + w*P2 # Note: eta, mu and score are already updated - if P2.ndim == 1: - fp_wP2 = -score + coef * P2 - else: - fp_wP2 = -score + P2 @ coef # P2 is symmetric, mat @ vec is faster - mn_subgrad = np.where(coef == 0, - np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), - fp_wP2 + np.sign(coef) * P1) + mn_subgrad = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= tol: converged = True @@ -1255,8 +1375,8 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, # end of outer loop if not converged: warnings.warn("Coordinate descent failed to converge. Increase" - " the number of iterations (currently {0})" - .format(max_iter), ConvergenceWarning) + " the maximum number of iterations max_iter" + " (currently {0})".format(max_iter), ConvergenceWarning) return coef, n_iter, n_cycles @@ -1387,8 +1507,9 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where g_i is the i-th component of the gradient (derivative) of the objective function. For the cd solver, covergence is reached - when ``sum_i(|minimum of norm of g_i|)``, where g_i is the - subgradient of the objective. + when ``sum_i(|minimum-norm of g_i|)``, where g_i is the + subgradient of the objective and minimum-norm of g_i is the element of + the subgradient g_i with the smallest L2-norm. warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` @@ -1563,19 +1684,7 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # 1. input validation # ####################################################################### - # 1.1 validate arguments of fit ####################################### - _dtype = [np.float64, np.float32] - X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], - dtype=_dtype, y_numeric=True, multi_output=False, - copy=self.copy_X) - # Without converting y to float, deviance might raise - # ValueError: Integers to negative integer powers are not allowed. - # Also, y must not be sparse. - y = np.asarray(y, dtype=np.float64) - - weights = _check_weights(sample_weight, y.shape[0]) - - # 1.2 validate arguments of __init__ ################################## + # 1.1 validate arguments of __init__ ################################## # Guarantee that self._family_instance is an instance of class # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): @@ -1668,25 +1777,6 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.warm_start, bool): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) - start_params = self.start_params - if isinstance(start_params, str): - if start_params not in ['irls', 'least_squares', 'zero']: - raise ValueError("The argument start_params must be 'irls', " - "'least-squares', 'zero' or an array of " - " correct length;" - " got(start_params={0})".format(start_params)) - else: - start_params = check_array(start_params, accept_sparse=False, - force_all_finite=True, ensure_2d=False, - dtype=_dtype, copy=True) - if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or - (start_params.ndim != 1)): - raise ValueError("Start values for parameters must have the" - "right length and dimension; required (length" - "={0}, ndim=1); got (length={1}, ndim={2})." 
- .format(X.shape[1] + self.fit_intercept, - start_params.shape[0], - start_params.ndim)) if self.selection not in ['cyclic', 'random']: raise ValueError("The argument selection must be 'cyclic' or " "'random'; got (selection={0})" @@ -1702,36 +1792,59 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument check_input must be bool; got " "(check_input={0})".format(self.check_input)) + family = self._family_instance + link = self._link_instance + + # 1.2 validate arguments of fit ####################################### + _dtype = [np.float64, np.float32] + if solver == 'cd': + _stype = ['csc'] + else: + _stype = ['csc', 'csr'] + X, y = check_X_y(X, y, accept_sparse=_stype, + dtype=_dtype, y_numeric=True, multi_output=False, + copy=self.copy_X) + # Without converting y to float, deviance might raise + # ValueError: Integers to negative integer powers are not allowed. + # Also, y must not be sparse. + y = np.asarray(y, dtype=np.float64) + + weights = _check_weights(sample_weight, y.shape[0]) + + n_samples, n_features = X.shape + + # 1.3 arguments to take special care ################################## + # P1, P2, start_params if isinstance(self.P1, str) and self.P1 == 'identity': - P1 = np.ones(X.shape[1]) + P1 = np.ones(n_features) else: P1 = np.atleast_1d(self.P1) try: - P1 = P1.astype(np.float64, casting='safe', copy=True) + P1 = P1.astype(np.float64, casting='safe', copy=False) except TypeError: raise TypeError("The given P1 cannot be converted to a numeric" "array; got (P1.dtype={0})." .format(P1.dtype)) - if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): + if (P1.ndim != 1) or (P1.shape[0] != n_features): raise ValueError("P1 must be either 'identity' or a 1d array " "with the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." - .format(P1.shape[0], X.shape[1])) + .format(P1.shape[0], n_features)) # If X is sparse, make P2 sparse, too. 
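# Illustration (hedged usage sketch, assuming this branch is installed and,
# as in the tests, exposes GeneralizedLinearRegressor from
# sklearn.linear_model.glm): because P1 and P2 exclude the intercept and are
# specified per feature, individual features can be left unpenalized by
# zeroing their entries.
import numpy as np
from sklearn.linear_model.glm import GeneralizedLinearRegressor

rng = np.random.RandomState(0)
X = rng.rand(50, 3)
y = rng.poisson(lam=np.exp(1 + X @ np.array([0.3, -0.2, 0.5])))

P1 = np.array([1., 1., 0.])   # per-feature L1 weights; feature 2 unpenalized
P2 = np.array([1., 1., 0.])   # diagonal of the L2 penalty matrix
glm = GeneralizedLinearRegressor(alpha=0.1, l1_ratio=0.5, P1=P1, P2=P2,
                                 family='poisson', link='log', solver='cd')
glm.fit(X, y)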
if isinstance(self.P2, str) and self.P2 == 'identity': if sparse.issparse(X): - P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), - shape=(X.shape[1], X.shape[1]))).tocsr() + P2 = (sparse.dia_matrix((np.ones(n_features), 0), + shape=(n_features, n_features))).tocsc() else: - P2 = np.ones(X.shape[1]) + P2 = np.ones(n_features) else: P2 = check_array(self.P2, copy=True, - accept_sparse=['csr', 'csc'], + accept_sparse=_stype, dtype=_dtype, ensure_2d=False) if P2.ndim == 1: P2 = np.asarray(P2) - if P2.shape[0] != X.shape[1]: + if P2.shape[0] != n_features: raise ValueError("P2 should be a 1d array of shape " "(n_features,) with " "n_features=X.shape[1]; " @@ -1739,12 +1852,12 @@ def fit(self, X, y, sample_weight=None): .format(P2.shape[0], X.shape[1])) if sparse.issparse(X): P2 = (sparse.dia_matrix((P2, 0), - shape=(X.shape[1], X.shape[1]))).tocsr() + shape=(n_features, n_features))).tocsc() elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and P2.shape[0] == X.shape[1]): if sparse.issparse(X): P2 = (sparse.dia_matrix((P2, 0), - shape=(X.shape[1], X.shape[1]))).tocsr() + shape=(n_features, n_features))).tocsc() else: raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " @@ -1752,37 +1865,31 @@ def fit(self, X, y, sample_weight=None): "got (P2.shape=({0}, {1})), needed ({2}, {2})" .format(P2.shape[0], P2.shape[1], X.shape[1])) - family = self._family_instance - link = self._link_instance - - if self.fit_intercept: - # Note: intercept is first column <=> coef[0] is for intecept - if sparse.issparse(X): - Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X], - format=X.format) - else: - Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) - P1 = np.concatenate((np.array([0]), P1)) - if P2.ndim == 1: - P2 = np.concatenate((np.array([0]), P2)) - elif sparse.issparse(P2): - P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), - format=P2.format, - dtype=P2.dtype).tocsr() - else: - # as of numpy 1.13 this would work: - # P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], - # [np.zeros((X.shape[1], 1)), P2]]) - P2 = np.hstack((np.zeros((X.shape[1], 1)), P2)) - P2 = np.vstack((np.zeros((1, X.shape[1]+1)), P2)) + start_params = self.start_params + if isinstance(start_params, str): + if start_params not in ['irls', 'least_squares', 'zero']: + raise ValueError("The argument start_params must be 'irls', " + "'least-squares', 'zero' or an array of " + " correct length;" + " got(start_params={0})".format(start_params)) else: - Xnew = X + start_params = check_array(start_params, accept_sparse=False, + force_all_finite=True, ensure_2d=False, + dtype=_dtype, copy=True) + if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or + (start_params.ndim != 1)): + raise ValueError("Start values for parameters must have the" + "right length and dimension; required (length" + "={0}, ndim=1); got (length={1}, ndim={2})." + .format(X.shape[1] + self.fit_intercept, + start_params.shape[0], + start_params.ndim)) - n_samples, n_features = Xnew.shape l1 = self.alpha * self.l1_ratio l2 = self.alpha * (1 - self.l1_ratio) - P1 *= l1 - P2 *= l2 + # P1 and P2 are now for sure copies + P1 = l1 * P1 + P2 = l2 * P2 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') # reason: w' P2 w = (w' P2 w)', i.e. 
it is symmetric if P2.ndim == 2: @@ -1791,14 +1898,12 @@ def fit(self, X, y, sample_weight=None): else: P2 = 0.5 * (P2 + P2.T) - # 1.3 additional validations ########################################## + # 1.4 additional validations ########################################## if self.check_input: if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" .format(family.__class__.__name__)) - if not np.all(weights >= 0): - raise ValueError("Sample weights must be non-negative.") # check if P1 has only non-negative values, negative values might # indicate group lasso in the future. if not isinstance(self.P1, str): # if self.P1 != 'identity': @@ -1830,7 +1935,7 @@ def fit(self, X, y, sample_weight=None): else: if not np.all(linalg.eigvalsh(P2) >= epsneg): raise ValueError("P2 must be positive semi-definite.") - # TODO: if alpha=0 check that Xnew is not rank deficient + # TODO: if alpha=0 check that X is not rank deficient # TODO: what else to check? ####################################################################### @@ -1874,13 +1979,14 @@ def fit(self, X, y, sample_weight=None): z = eta + (y-mu)/hp # solve A*coef = b # A = X' W X + l2 P2, b = X' W z - coef = _irls_step(Xnew, W, P2, z) + coef = _irls_step(X, W, P2, z, + fit_intercept=self.fit_intercept) elif start_params == 'least_squares': # less restrictive tolerance for finding start values tol = np.max([self.tol, np.sqrt(self.tol)]) if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) - reg.fit(Xnew, link.link(y)) + reg.fit(X, link.link(y)) coef = reg.coef_ elif self.l1_ratio <= 0.01: # ElasticNet says l1_ratio <= 0.01 is not reliable @@ -1888,19 +1994,21 @@ def fit(self, X, y, sample_weight=None): # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 reg = Ridge(copy_X=True, fit_intercept=False, alpha=self.alpha*n_samples, tol=tol) - reg.fit(Xnew, link.link(y)) + reg.fit(X, link.link(y)) coef = reg.coef_ else: # TODO: Does this make sense at all? 
reg = ElasticNet(copy_X=True, fit_intercept=False, alpha=self.alpha, l1_ratio=self.l1_ratio, tol=tol) - reg.fit(Xnew, link.link(y)) + reg.fit(X, link.link(y)) coef = reg.coef_ else: # start_params == 'zero' - coef = np.zeros(n_features) if self.fit_intercept: + coef = np.zeros(n_features+1) coef[0] = link.link(np.average(y, weights=weights)) + else: + coef = np.zeros(n_features) else: # assign given array as start values coef = start_params @@ -1915,24 +2023,28 @@ def fit(self, X, y, sample_weight=None): # Note: we already symmetriezed P2 = 1/2 (P2 + P2') if solver == 'irls': coef, self.n_iter_ = \ - _irls_solver(coef=coef, X=Xnew, y=y, weights=weights, P2=P2, - family=family, link=link, max_iter=self.max_iter, - tol=self.tol) + _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2, + fit_intercept=self.fit_intercept, family=family, + link=link, max_iter=self.max_iter, tol=self.tol) # 4.2 L-BFGS ########################################################## elif solver == 'lbfgs': def func(coef, X, y, weights, P2, family, link): - mu, dev, devp = \ + mu, devp = \ family._mu_deviance_derivative(coef, X, y, weights, link) + dev = family.deviance(y, mu, weights) + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = P2 * coef + L2 = P2 * coef[idx:] else: - L2 = P2 @ coef - obj = 0.5 * dev + 0.5 * (coef @ L2) - objp = 0.5 * devp + L2 + L2 = P2 @ coef[idx:] + obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2) + objp = 0.5 * devp + objp[idx:] += L2 return obj, objp - args = (Xnew, y, weights, P2, family, link) + args = (X, y, weights, P2, family, link) coef, loss, info = fmin_l_bfgs_b( func, coef, fprime=None, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, @@ -1952,50 +2064,66 @@ def func(coef, X, y, weights, P2, family, link): # precisely, expected hessian of deviance. 
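# Illustration (standalone toy check, independent of the estimator code):
# for the normal family with identity link, unit weights and P2 = alpha * I,
# the L2-only objective handed to fmin_l_bfgs_b above is, scaling of alpha
# and weights aside, an ordinary ridge problem, so the optimizer recovers the
# closed-form solution.
import numpy as np
from scipy.optimize import fmin_l_bfgs_b

rng = np.random.RandomState(0)
X = rng.randn(50, 3)
y = X @ np.array([1., -2., 0.5]) + 0.1 * rng.randn(50)
alpha = 0.7

def func(coef):
    resid = y - X @ coef                  # identity link: mu = X @ coef
    obj = 0.5 * resid @ resid + 0.5 * alpha * coef @ coef
    grad = -X.T @ resid + alpha * coef
    return obj, grad

coef, obj_min, info = fmin_l_bfgs_b(func, np.zeros(X.shape[1]))
closed_form = np.linalg.solve(X.T @ X + alpha * np.eye(X.shape[1]), X.T @ y)
assert np.allclose(coef, closed_form, atol=1e-4)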
elif solver == 'newton-cg': def func(coef, X, y, weights, P2, family, link): + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = coef @ (P2 * coef) + L2 = coef[idx:] @ (P2 * coef[idx:]) else: - L2 = coef @ (P2 @ coef) - mu = link.inverse(X @ coef) + L2 = coef[idx:] @ (P2 @ coef[idx:]) + mu = link.inverse(_safe_lin_pred(X, coef)) return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 def grad(coef, X, y, weights, P2, family, link): + mu, devp = \ + family._mu_deviance_derivative(coef, X, y, weights, link) + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = P2 * coef + L2 = P2 * coef[idx:] else: - L2 = P2 @ coef - eta = X @ coef - mu = link.inverse(eta) - d1 = link.inverse_derivative(eta) - grad = X.T @ (d1 * family.deviance_derivative(y, mu, weights)) - return 0.5 * grad + L2 + L2 = P2 @ coef[idx:] + objp = 0.5 * devp + objp[idx:] += L2 + return objp def grad_hess(coef, X, y, weights, P2, family, link): + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = P2 * coef + L2 = P2 * coef[idx:] else: - L2 = P2 @ coef - eta = X @ coef + L2 = P2 @ coef[idx:] + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) d1 = link.inverse_derivative(eta) - grad = 0.5 * \ - (X.T @ (d1 * family.deviance_derivative(y, mu, weights))) \ - + L2 - # expected hessian = X.T @ diag_matrix @ X + temp = d1 * family.deviance_derivative(y, mu, weights) + if intercept: + grad = np.concatenate(([0.5 * temp.sum()], + 0.5 * temp @ X + L2)) + else: + grad = 0.5 * temp @ X + L2 # sampe as 0.5* X.T @ temp + L2 + + # expected hessian = fisher = X.T @ diag_matrix @ X # calculate only diag_matrix diag = d1**2 / family.variance(mu, phi=1, weights=weights) - def Hs(s): - ret = 0.5 * (X.T @ (diag * (X @ s))) + def Hs(coef): + # return (0.5 * fisher + P2) @ coef + # ret = 0.5 * (X.T @ (diag * (X @ coef))) + ret = 0.5 * ((diag * (X @ coef[idx:])) @ X) if P2.ndim == 1: - ret += P2 * s + ret += P2 * coef[idx:] else: - ret += P2 @ s + ret += P2 @ coef[idx:] + if intercept: + h0i = np.concatenate(([diag.sum()], diag @ X)) + ret = np.concatenate(([0.5 * (h0i @ coef)], + ret + 0.5 * coef[0] * h0i[1:])) return ret return grad, Hs - args = (Xnew, y, weights, P2, family, link) + args = (X, y, weights, P2, family, link) coef, n_iter_i = newton_cg(grad_hess, func, grad, coef, args=args, maxiter=self.max_iter, tol=self.tol) @@ -2007,13 +2135,14 @@ def Hs(s): elif solver == 'cd': # For coordinate descent, if X is sparse, it should be csc format # If X is sparse, P2 must also be csc - if sparse.issparse(Xnew): - Xnew = Xnew.tocsc(copy=self.copy_X) + if sparse.issparse(X): + X = X.tocsc(copy=self.copy_X) P2 = sparse.csc_matrix(P2) coef, self.n_iter_, self._n_cycles = \ - _cd_solver(coef=coef, X=Xnew, y=y, weights=weights, P1=P1, - P2=P2, family=family, link=link, + _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, + P2=P2, fit_intercept=self.fit_intercept, + family=family, link=link, max_iter=self.max_iter, tol=self.tol, selection=self.selection, random_state=random_state, diag_fisher=self.diag_fisher, copy_X=self.copy_X) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index ae8a5f4cfc5e7..cdac151b77de6 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -566,7 +566,7 @@ def obj(coef): pd = PoissonDistribution() link = 
LogLink() N = y.shape[0] - mu = link.inverse(X @ coef[1:]+coef[0]) + mu = link.inverse(X @ coef[1:] + coef[0]) alpha, l1_ratio = (1, 0.5) return 1./(2.*N) * pd.deviance(y, mu) \ + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ @@ -587,9 +587,9 @@ def obj(coef): assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) - # start_params='least_squares' with different alpha + # check warm_start, therefore start with different alpha glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, - family='poisson', + family='poisson', max_iter=300, link='log', solver='cd', tol=1e-5, start_params='zero') glm.fit(X, y) From c9a7a95e89deaadc40b92edd8e1208d550998a72 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 21 Apr 2019 17:03:07 +0200 Subject: [PATCH 51/69] Revised option start_params * renamed option irls into guess * removed option least_squares * updated tests --- doc/modules/linear_model.rst | 12 +- sklearn/linear_model/glm.py | 223 ++++++++++++++----------- sklearn/linear_model/tests/test_glm.py | 140 ++++++++-------- 3 files changed, 207 insertions(+), 168 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index e60e9e84a4747..4bede17af581a 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -925,12 +925,12 @@ follows: >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, - check_input=True, copy_X=True, family='poisson', - fit_dispersion=None, fit_intercept=True, l1_ratio=0, - link='log', max_iter=100, random_state=None, - selection='cyclic', solver='auto', - start_params='irls', tol=0.0001, verbose=0, - warm_start=False) + check_input=True, copy_X=True, diag_fisher=False, + family='poisson', fit_dispersion=None, + fit_intercept=True, l1_ratio=0, link='log', + max_iter=100, random_state=None, selection='cyclic', + solver='auto', start_params='guess', tol=0.0001, + verbose=0, warm_start=False) >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE array([0.24630169, 0.43373464]) >>> reg.intercept_ #doctest: +ELLIPSIS diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 2afd1ddf8c79c..a53cc39ecd307 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -48,9 +48,6 @@ import scipy.sparse.linalg as splinalg from scipy.optimize import fmin_l_bfgs_b import warnings -from .base import LinearRegression -from .coordinate_descent import ElasticNet -from .ridge import Ridge from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y @@ -93,6 +90,14 @@ def _safe_lin_pred(X, coef): return X @ coef +def _safe_toarray(X): + """Returns a numpy array.""" + if sparse.issparse(X): + return X.toarray() + else: + return np.asarray(X) + + def _safe_sandwich_dot(X, d, intercept=False): """Compute sandwich product X.T @ diag(d) @ X. @@ -100,7 +105,9 @@ def _safe_sandwich_dot(X, d, intercept=False): first column of X. X can be sparse, d must be an ndarray. 
Always returns a ndarray.""" if sparse.issparse(X): - temp = (X.transpose().multiply(d) @ X).toarray() + temp = (X.transpose() @ X.multiply(d[:, np.newaxis])) + # for older versions of numpy and scipy, temp may be a np.matrix + temp = _safe_toarray(temp) else: temp = (X.T * d) @ X if intercept: @@ -945,7 +952,8 @@ def _irls_step(X, W, P2, z, fit_intercept=True): else: if sparse.issparse(X): XtW = X.transpose().multiply(W) - A = (XtW @ X).toarray() + # for older versions of numpy and scipy, A may be a np.matrix + A = _safe_toarray(XtW @ X) else: XtW = (X.T * W) A = XtW @ X @@ -1105,8 +1113,9 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, if intercept: Bj[0] = fisher.sum() if sparse.issparse(X): - Bj[idx:] = (X[:, j].transpose().multiply(fisher) @ X - ).toarray().ravel() + Bj[idx:] = _safe_toarray(X[:, j].transpose() @ + X.multiply(fisher[:, np.newaxis]) + ).ravel() else: Bj[idx:] = (fisher * X[:, j]) @ X @@ -1477,27 +1486,32 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ + solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ optional (default='auto') Algorithm to use in the optimization problem: 'auto' Sets 'irls' if l1_ratio equals 0, else 'cd'. + 'cd' + Coordinate descent algorithm. It can deal with L1 as well as L2 + penalties. Note that in order to avoid unnecessary memory + duplication of X in the ``fit`` method, X should be directly passed + as a Fortran-contiguous numpy array or sparse csc matrix. + 'irls' - Iterated reweighted least squares (with Fisher scoring). + Iterated reweighted least squares. It is the standard algorithm for GLMs. It cannot deal with L1 penalties. + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. It cannot deal with L1 penalties. + 'newton-cg', 'lbfgs' - Cannot deal with L1 penalties. + Newton conjugate gradient algorithm cannot deal with L1 penalties. - 'cd' - Coordinate descent algorithm. It can deal with L1 as well as L2 - penalties. Note that in order to avoid unnecessary memory - duplication of the X argument in the ``fit`` method, X should be - directly passed as a Fortran-contiguous numpy array or sparse csc - matrix. + Note that all solvers except lbfgs use the fisher matrix, i.e. the + expected Hessian instead of the Hessian matrix. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. @@ -1505,11 +1519,11 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` - where g_i is the i-th component of the gradient (derivative) of + where ``g_i`` is the i-th component of the gradient (derivative) of the objective function. For the cd solver, covergence is reached - when ``sum_i(|minimum-norm of g_i|)``, where g_i is the - subgradient of the objective and minimum-norm of g_i is the element of - the subgradient g_i with the smallest L2-norm. + when ``sum_i(|minimum-norm of g_i|)``, where ``g_i`` is the + subgradient of the objective and minimum-norm of ``g_i`` is the element + of the subgradient ``g_i`` with the smallest L2-norm. 
warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` @@ -1518,23 +1532,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): does not exit (first call to ``fit``), option ``start_params`` sets the start values for ``coef_`` and ``intercept_``. - start_params : {'irls', 'least_squares', 'zero', array of shape \ - (n_features*, )}, optional (default='irls') + start_params : {'guess', 'zero', array of shape (n_features*, )}, \ + optional (default='guess') Relevant only if ``warm_start=False`` or if fit is called the first time (``self.coef_`` does not yet exist). - 'irls' + 'guess' Start values of mu are calculated by family.starting_mu(..). Then, - one step of irls obtains start values for ``coef_``. This gives - usually good results. - - 'least_squares' - Start values for ``coef_`` are obtained by a least squares fit in the - link space (y is transformed to the space of the linear predictor). + one Newton step obtains start values for ``coef_``. If + ``solver='irls'``, it uses one irls step, else the Newton step is + calculated by the cd solver. + This gives usually good starting values. 'zero' All coefficients are set to zero. If ``fit_intercept=True``, the - start value for the intercept is obtained by the average of y. + start value for the intercept is obtained by the weighted average of y. array The array of size n_features* is directly used as start values @@ -1560,17 +1572,17 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): RandomState instance used by `np.random`. Used when ``selection`` == 'random'. - diag_fisher : boolean, (default=False) - Only relevant for solver 'cd'. If ``False``, the full Fisher matrix - (expected Hessian) is computed in each outer iteretion (Newton - iteration). If ``True``, only a diagonal matrix (stored as 1d array) is - computed, such that fisher = X.T @ diag @ X. This saves memory and - matrix-matrix multiplications, but needs more matrix-vector - multiplications. If you use large sparse X or if you have many - features, i.e. n_features >> n_samples, you might set this option to - ``True``. - - copy_X : boolean, optional, default True + diag_fisher : boolean, optional, (default=False) + Only relevant for solver 'cd' (see also ``start_params='guess'``). + If ``False``, the full Fisher matrix (expected Hessian) is computed in + each outer iteration (Newton iteration). If ``True``, only a diagonal + matrix (stored as 1d array) is computed, such that + fisher = X.T @ diag @ X. This saves memory and matrix-matrix + multiplications, but needs more matrix-vector multiplications. If you + use large sparse X or if you have many features, + i.e. n_features >> n_samples, you might set this option to ``True``. + + copy_X : boolean, optional, (default=True) If ``True``, X will be copied; else, it may be overwritten. 
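    As a usage sketch combining several of these options (assuming this
    branch is installed and exposes the estimator under
    ``sklearn.linear_model.glm``), a wide sparse design with ``solver='cd'``
    is the setting where ``diag_fisher=True`` is intended to pay off:

    import numpy as np
    from scipy import sparse
    from sklearn.linear_model.glm import GeneralizedLinearRegressor

    rng = np.random.RandomState(0)
    X = sparse.random(30, 200, density=0.05, format='csc', random_state=rng)
    y = rng.poisson(lam=np.exp(0.1 * np.asarray(X.sum(axis=1)).ravel()))

    glm = GeneralizedLinearRegressor(alpha=0.1, l1_ratio=0.9,
                                     family='poisson', link='log',
                                     solver='cd', diag_fisher=True,
                                     start_params='zero', selection='random',
                                     random_state=0)
    glm.fit(X, y)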
check_input : boolean, optional (default=True) @@ -1634,7 +1646,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, - tol=1e-4, warm_start=False, start_params='irls', + tol=1e-4, warm_start=False, start_params='guess', selection='cyclic', random_state=None, diag_fisher=False, copy_X=True, check_input=True, verbose=0): self.alpha = alpha @@ -1867,11 +1879,10 @@ def fit(self, X, y, sample_weight=None): start_params = self.start_params if isinstance(start_params, str): - if start_params not in ['irls', 'least_squares', 'zero']: - raise ValueError("The argument start_params must be 'irls', " - "'least-squares', 'zero' or an array of " - " correct length;" - " got(start_params={0})".format(start_params)) + if start_params not in ['guess', 'zero']: + raise ValueError("The argument start_params must be 'guess', " + "'zero' or an array of correct length; " + "got(start_params={0})".format(start_params)) else: start_params = check_array(start_params, accept_sparse=False, force_all_finite=True, ensure_2d=False, @@ -1894,10 +1905,17 @@ def fit(self, X, y, sample_weight=None): # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric if P2.ndim == 2: if sparse.issparse(P2): - P2 = 0.5 * (P2 + P2.transpose()) + if sparse.isspmatrix_csc(P2): + P2 = 0.5 * (P2 + P2.transpose()).tocsc() + else: + P2 = 0.5 * (P2 + P2.transpose()).tocsr() else: P2 = 0.5 * (P2 + P2.T) + # For coordinate descent, if X is sparse, P2 must also be csc + if solver == 'cd' and sparse.issparse(X): + P2 = sparse.csc_matrix(P2) + # 1.4 additional validations ########################################## if self.check_input: if not np.all(family.in_y_range(y)): @@ -1964,45 +1982,63 @@ def fit(self, X, y, sample_weight=None): else: coef = self.coef_ elif isinstance(start_params, str): - if start_params == 'irls': - # See 3.1 IRLS - # Use mu_start and apply one irls step to calculate coef + if start_params == 'guess': + # Set mu=starting_mu of the family and do one Newton step + # If solver=cd use cd, else irls mu = family.starting_mu(y, weights=weights) - # linear predictor - eta = link.link(mu) - # h'(eta) - hp = link.inverse_derivative(eta) - # working weights W, in principle a diagonal matrix - # therefore here just as 1d array - W = (hp**2 / family.variance(mu, phi=1, weights=weights)) - # working observations - z = eta + (y-mu)/hp - # solve A*coef = b - # A = X' W X + l2 P2, b = X' W z - coef = _irls_step(X, W, P2, z, - fit_intercept=self.fit_intercept) - elif start_params == 'least_squares': - # less restrictive tolerance for finding start values - tol = np.max([self.tol, np.sqrt(self.tol)]) - if self.alpha == 0: - reg = LinearRegression(copy_X=True, fit_intercept=False) - reg.fit(X, link.link(y)) - coef = reg.coef_ - elif self.l1_ratio <= 0.01: - # ElasticNet says l1_ratio <= 0.01 is not reliable - # => use Ridge - # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha*n_samples, tol=tol) - reg.fit(X, link.link(y)) - coef = reg.coef_ + eta = link.link(mu) # linear predictor + if solver in ['cd', 'lbfgs', 'newton-cg']: + # see function _cd_solver + sigma_inv = 1/family.variance(mu, phi=1, weights=weights) + d1 = link.inverse_derivative(eta) + temp = sigma_inv * d1 * (y - mu) + if self.fit_intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as 
X.T @ temp + + d2_sigma_inv = d1 * d1 * sigma_inv + diag_fisher = self.diag_fisher + if diag_fisher: + fisher = d2_sigma_inv + else: + fisher = \ + _safe_sandwich_dot(X, d2_sigma_inv, + intercept=self.fit_intercept) + # set up space for search direction d for inner loop + if self.fit_intercept: + coef = np.zeros(n_features+1) + else: + coef = np.zeros(n_features) + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + # use less restrictive tolerance for initial guess + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, + P1=P1) + inner_tol = 4 * linalg.norm(inner_tol, ord=1) + # just one outer loop = Newton step + n_cycles = 0 + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, + inner_tol, max_inner_iter=1000, + selection=self.selection, + random_state=self.random_state, + diag_fisher=self.diag_fisher) + coef += d # for simplicity no line search here else: - # TODO: Does this make sense at all? - reg = ElasticNet(copy_X=True, fit_intercept=False, - alpha=self.alpha, l1_ratio=self.l1_ratio, - tol=tol) - reg.fit(X, link.link(y)) - coef = reg.coef_ + # See _irls_solver + # h'(eta) + hp = link.inverse_derivative(eta) + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = (hp**2 / family.variance(mu, phi=1, weights=weights)) + # working observations + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(X, W, P2, z, + fit_intercept=self.fit_intercept) else: # start_params == 'zero' if self.fit_intercept: coef = np.zeros(n_features+1) @@ -2048,7 +2084,7 @@ def func(coef, X, y, weights, P2, family, link): coef, loss, info = fmin_l_bfgs_b( func, coef, fprime=None, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, - maxiter=self.max_iter) + maxiter=self.max_iter, factr=1e3) if self.verbose > 0: if info["warnflag"] == 1: warnings.warn("lbfgs failed to converge." 
@@ -2106,6 +2142,8 @@ def grad_hess(coef, X, y, weights, P2, family, link): # expected hessian = fisher = X.T @ diag_matrix @ X # calculate only diag_matrix diag = d1**2 / family.variance(mu, phi=1, weights=weights) + if intercept: + h0i = np.concatenate(([diag.sum()], diag @ X)) def Hs(coef): # return (0.5 * fisher + P2) @ coef @@ -2116,7 +2154,6 @@ def Hs(coef): else: ret += P2 @ coef[idx:] if intercept: - h0i = np.concatenate(([diag.sum()], diag @ X)) ret = np.concatenate(([0.5 * (h0i @ coef)], ret + 0.5 * coef[0] * h0i[1:])) return ret @@ -2124,21 +2161,15 @@ def Hs(coef): return grad, Hs args = (X, y, weights, P2, family, link) - coef, n_iter_i = newton_cg(grad_hess, func, grad, coef, - args=args, maxiter=self.max_iter, - tol=self.tol) + coef, self.n_iter_ = newton_cg(grad_hess, func, grad, coef, + args=args, maxiter=self.max_iter, + tol=self.tol) # 4.4 coordinate descent ############################################## # Note: we already set P1 = l1*P1, see above # Note: we already set P2 = l2*P2, see above # Note: we already symmetriezed P2 = 1/2 (P2 + P2') elif solver == 'cd': - # For coordinate descent, if X is sparse, it should be csc format - # If X is sparse, P2 must also be csc - if sparse.issparse(X): - X = X.tocsc(copy=self.copy_X) - P2 = sparse.csc_matrix(P2) - coef, self.n_iter_, self._n_cycles = \ _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, P2=P2, fit_intercept=self.fit_intercept, diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index cdac151b77de6..17535c067bbb6 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -4,7 +4,7 @@ import scipy as sp from scipy import linalg, optimize, sparse -from sklearn.datasets import make_classification +from sklearn.datasets import make_classification, make_regression from sklearn.linear_model.glm import ( Link, IdentityLink, @@ -22,10 +22,12 @@ assert_array_equal, assert_array_almost_equal) +rng = np.random.RandomState(42) + + @pytest.mark.parametrize('link', Link.__subclasses__()) def test_link_properties(link): """Test link inverse and derivative.""" - rng = np.random.RandomState(0) x = rng.rand(100)*100 link = link() # instatiate object decimal = 10 @@ -86,7 +88,6 @@ def test_deviance_zero(family, chk_values): def test_fisher_matrix(family, link): """Test the Fisher matrix numerically. 
Trick: Use numerical differentiation with y = mu""" - rng = np.random.RandomState(0) coef = np.array([-2, 1, 0, 1, 2.5]) phi = 0.5 X = rng.randn(10, 5) @@ -218,7 +219,6 @@ def test_glm_P2_argument(P2): def test_glm_P2_positive_semidefinite(): """Test GLM for a positive semi-definite P2 argument.""" n_samples, n_features = 10, 5 - rng = np.random.RandomState(42) y = np.arange(n_samples) X = np.zeros((n_samples, n_features)) P2 = np.diag([100, 10, 5, 0, -1E-5]) @@ -351,20 +351,15 @@ def test_glm_check_input_argument(check_input): glm.fit(X, y) -@pytest.mark.parametrize( - 'family', - [NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), - GeneralizedHyperbolicSecant()]) @pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_glm_identiy_regression(family, solver): +def test_glm_identiy_regression(solver): """Test GLM regression with identity link on a simple dataset.""" coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) - glm = GeneralizedLinearRegressor(alpha=0, family=family, link='identity', - fit_intercept=False, solver=solver) + glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', + fit_intercept=False, solver=solver, + start_params='zero', tol=1e-7) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) @@ -375,34 +370,42 @@ def test_glm_identiy_regression(family, solver): GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), GeneralizedHyperbolicSecant()]) -@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_glm_log_regression(family, solver): +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), + ('lbfgs', 1e-6, 6), + ('newton-cg', 1e-7, 6), + ('cd', 1e-7, 6)]) +def test_glm_log_regression(family, solver, tol, dec): """Test GLM regression with log link on a simple dataset.""" - coef = [1, 2] + coef = [0.2, -0.1] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) glm = GeneralizedLinearRegressor( alpha=0, family=family, link='log', fit_intercept=False, - solver=solver, start_params='least_squares') + solver=solver, start_params='guess', tol=tol) res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) + assert_array_almost_equal(res.coef_, coef, decimal=dec) @pytest.mark.filterwarnings('ignore::DeprecationWarning') -@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_normal_ridge(solver): +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), + ('lbfgs', 1e-6, 5), + ('newton-cg', 1e-6, 5), + ('cd', 1e-6, 6)]) +def test_normal_ridge(solver, tol, dec): """Test ridge regression for Normal distributions. Compare to test_ridge in test_ridge.py. """ - rng = np.random.RandomState(0) alpha = 1.0 # 1. 
With more samples than features - n_samples, n_features, n_predict = 10, 5, 10 - y = rng.randn(n_samples) - X = rng.randn(n_samples, n_features) - T = rng.randn(n_predict, n_features) + n_samples, n_features, n_predict = 100, 7, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, @@ -410,69 +413,74 @@ def test_normal_ridge(solver): ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, - tol=1e-6, max_iter=100, solver=solver, - random_state=42) + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=False, - tol=1e-6, max_iter=100, solver=solver, - random_state=42, fit_dispersion='chisqr') + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng, + fit_dispersion='chisqr') glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) mu = glm.predict(X) assert_almost_equal(glm.dispersion_, np.sum((y-mu)**2/(n_samples-n_features))) # 2. 
With more features than samples and sparse - n_samples, n_features, n_predict = 5, 10, 10 - y = rng.randn(n_samples) - X = sparse.csr_matrix(rng.randn(n_samples, n_features)) - T = sparse.csr_matrix(rng.randn(n_predict, n_features)) + n_samples, n_features, n_predict = 10, 100, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, - family='normal', link='identity', - fit_intercept=True, solver=solver, - max_iter=300, random_state=42) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=tol, max_iter=300, solver=solver, + check_input=False, random_state=rng) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, - family='normal', link='identity', - fit_intercept=False, solver=solver) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=tol*2, max_iter=300, solver=solver, + check_input=False, random_state=rng) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) -@pytest.mark.parametrize('solver, decimal, tol', - [('irls', 7, 1e-8), - ('lbfgs', 5, 1e-7), - ('newton-cg', 5, 1e-7), - ('cd', 7, 1e-8)]) -def test_poisson_ridge(solver, decimal, tol): +@pytest.mark.parametrize('solver, tol, dec', + [('irls', 1e-7, 6), + ('lbfgs', 1e-7, 5), + ('newton-cg', 1e-7, 5), + ('cd', 1e-7, 7)]) +def test_poisson_ridge(solver, tol, dec): """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" @@ -493,18 +501,17 @@ def test_poisson_ridge(solver, decimal, tol): fit_intercept=True, family='poisson', link='log', tol=tol, solver=solver, max_iter=300, - random_state=42) + random_state=rng) glm.fit(X, y) assert_almost_equal(glm.intercept_, -0.12889386979, - decimal=decimal) + decimal=dec) assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], - decimal=decimal) + decimal=dec) @pytest.mark.parametrize('diag_fisher', [False, True]) def test_normal_enet(diag_fisher): """Test elastic net regression with normal/gaussian family.""" - rng = np.random.RandomState(0) alpha, l1_ratio = 0.3, 0.7 n_samples, n_features = 20, 2 X = rng.randn(n_samples, n_features).copy(order='F') @@ -556,7 +563,8 @@ def test_poisson_enet(): y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', link='log', solver='cd', tol=1e-8, - selection='random', random_state=42) + selection='random', random_state=rng, + start_params='guess') glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) @@ -591,7 +599,7 @@ def obj(coef): glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, family='poisson', max_iter=300, link='log', solver='cd', tol=1e-5, - start_params='zero') + selection='cyclic', start_params='zero') glm.fit(X, y) # warm start with original alpha and use of sparse matrices glm.warm_start = True @@ -612,9 +620,9 @@ def test_binomial_enet(alpha): n_samples = 500 X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, n_informative=5, n_redundant=0, n_repeated=0, - random_state=0) + random_state=rng) log = LogisticRegression( - penalty='elasticnet', random_state=0, fit_intercept=False, tol=1e-6, + penalty='elasticnet', random_state=rng, fit_intercept=False, tol=1e-6, max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), solver='saga') log.fit(X, y) From a7755de2cdc5022b6d41285730ed12bd41628e66 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 4 Jun 2019 17:34:11 +0200 Subject: [PATCH 52/69] Fix a few typos --- doc/modules/linear_model.rst | 2 +- .../plot_poisson_spline_regression.py | 2 +- sklearn/linear_model/glm.py | 53 ++++++++++--------- sklearn/linear_model/tests/test_glm.py | 4 +- 4 files changed, 31 insertions(+), 30 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 4bede17af581a..8f17c67d950de 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -887,7 +887,7 @@ combination of the input variables :math:`X` via an inverse link function .. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [11]_. The objective function beeing minimized +exponential dispersion model (EDM) [11]_. The objective function being minimized becomes .. 
math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1 diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py index fce85fae1ea8c..30b5881bba1f5 100644 --- a/examples/linear_model/plot_poisson_spline_regression.py +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -5,7 +5,7 @@ As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` example, a Poisson regression with penalized B-splines (P-splines) [1]_ is -fitted on slightly different sinusodial, Poisson distributed data and +fitted on slightly different sinusoidal, Poisson distributed data and compared to an AdaBoost model with decision trees. One can see, that this is a hard problem for both estimators. diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index a53cc39ecd307..a35c68828de81 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -29,7 +29,7 @@ # L2: w*P2*w with P2 a (semi-) positive definite matrix, e.g. P2 could be # a 1st or 2nd order difference matrix (compare B-spline penalties and # Tikhonov regularization). -# - The link funtion (instance of class Link) is necessary for the evaluation +# - The link function (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. # Solution: link as argument in those functions @@ -170,7 +170,7 @@ def _min_norm_sugrad(coef, grad, P2, P1): class Link(metaclass=ABCMeta): - """Abstract base class for Link funtions.""" + """Abstract base class for Link functions.""" @abstractmethod def link(self, mu): @@ -201,7 +201,7 @@ def derivative(self, mu): def inverse(self, lin_pred): """Compute the inverse link function h(lin_pred). - Gives the inverse relationship between linkear predictor and the mean + Gives the inverse relationship between linker predictor and the mean mu=E[Y], i.e. h(linear predictor) = mu. Parameters @@ -357,7 +357,7 @@ def include_lower_bound(self): @property def include_upper_bound(self): - """Get True if upper bound for y is includede: y <= upper_bound.""" + """Get True if upper bound for y is included: y <= upper_bound.""" return self._include_upper_bound def in_y_range(self, x): @@ -859,7 +859,7 @@ def __init__(self): class GeneralizedHyperbolicSecant(ExponentialDispersionModel): """A class for the Generalized Hyperbolic Secant (GHS) distribution. - The GHS distribution is for tagets y in (-inf, inf). + The GHS distribution is for targets y in (-inf, inf). """ def __init__(self): self._lower_bound = -np.Inf @@ -881,7 +881,7 @@ def unit_deviance(self, y, mu): class BinomialDistribution(ExponentialDispersionModel): """A class for the Binomial distribution. - The Binomial distribution is for tagets y in [0, 1]. + The Binomial distribution is for targets y in [0, 1]. """ def __init__(self): self._lower_bound = 0 @@ -1131,7 +1131,7 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, else: b = B[jdx, jdx] - # those ten lines aree what it is all about + # those ten lines are what it is all about if b <= 0: z = 0 elif P1[j] == 0: @@ -1199,7 +1199,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, diag_fisher=False, copy_X=True): """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. 
- The objective beeing minimized in the coefficients w=coef is:: + The objective being minimized in the coefficients w=coef is:: F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 @@ -1207,7 +1207,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, 1. Find optimal descent direction d by minimizing min_d F(w+d) = min_d F(w+d) - F(w) - 2. Quadrdatic approximation of F(w+d)-F(w) = q(d): + 2. Quadratic approximation of F(w+d)-F(w) = q(d): using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + ||P1*(w+d)||_1 - ||P1*w||_1 @@ -1228,7 +1228,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, ---------- coef : ndarray, shape (c,) If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shapee[1] + 1. + If fit_intercept=True, then c=X.shape[1] + 1. X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present). If not sparse, @@ -1265,7 +1265,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, cycles over all features per inner loop. tol : float, optional (default=1e-4) - Covergence criterion is + Convergence criterion is sum_i(|minimum of norm of subgrad of objective_i|)<=tol. selection : str, optional (default='cyclic') @@ -1285,9 +1285,9 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, ------- coef : ndarray, shape (c,) If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shapee[1] + 1. + If fit_intercept=True, then c=X.shape[1] + 1. - n_iter : numer of outer iterations = newton iterations + n_iter : number of outer iterations = newton iterations n_cycles : number of cycles over features @@ -1312,7 +1312,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, "format. Got P2 not sparse.") random_state = check_random_state(random_state) # Note: we already set P2 = l2*P2, P1 = l1*P1 - # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: we already symmetrized P2 = 1/2 (P2 + P2') n_iter = 0 # number of outer iterations n_cycles = 0 # number of (complete) cycles over features converged = False @@ -1427,7 +1427,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Parameters ---------- alpha : float, optional (default=1) - Constant that multiplies the penalty terms und thus determines the + Constant that multiplies the penalty terms and thus determines the regularization strength. See the notes for the exact mathematical meaning of this parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this @@ -1481,9 +1481,9 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'logit' for family 'binomial' - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) Method for estimation of the dispersion parameter phi. Whether to use - the chi squared statisic or the deviance statistic. If None, the + the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ @@ -1520,7 +1520,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of - the objective function. 
For the cd solver, covergence is reached + the objective function. For the cd solver, convergence is reached when ``sum_i(|minimum-norm of g_i|)``, where ``g_i`` is the subgradient of the objective and minimum-norm of ``g_i`` is the element of the subgradient ``g_i`` with the smallest L2-norm. @@ -1626,7 +1626,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If the target y is a ratio, appropriate sample weights s should be provided. - As an example, consider Poission distributed counts z (integers) and + As an example, consider Poisson distributed counts z (integers) and weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite @@ -1681,7 +1681,7 @@ def fit(self, X, y, sample_weight=None): Target values. sample_weight : {None, array-like}, shape (n_samples,),\ - optinal (default=None) + optional (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has Var[Y_i]=phi/w_i * v(mu). @@ -1995,7 +1995,7 @@ def fit(self, X, y, sample_weight=None): if self.fit_intercept: score = np.concatenate(([temp.sum()], temp @ X)) else: - score = temp @ X # sampe as X.T @ temp + score = temp @ X # same as X.T @ temp d2_sigma_inv = d1 * d1 * sigma_inv diag_fisher = self.diag_fisher @@ -2051,12 +2051,12 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # 4. fit # ####################################################################### - # algorithms for optimiation + # algorithms for optimization # TODO: Parallelize it? # 4.1 IRLS ############################################################ # Note: we already set P2 = l2*P2, see above - # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: we already symmetrized P2 = 1/2 (P2 + P2') if solver == 'irls': coef, self.n_iter_ = \ _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2, @@ -2137,7 +2137,7 @@ def grad_hess(coef, X, y, weights, P2, family, link): grad = np.concatenate(([0.5 * temp.sum()], 0.5 * temp @ X + L2)) else: - grad = 0.5 * temp @ X + L2 # sampe as 0.5* X.T @ temp + L2 + grad = 0.5 * temp @ X + L2 # same as 0.5* X.T @ temp + L2 # expected hessian = fisher = X.T @ diag_matrix @ X # calculate only diag_matrix @@ -2168,7 +2168,7 @@ def Hs(coef): # 4.4 coordinate descent ############################################## # Note: we already set P1 = l1*P1, see above # Note: we already set P2 = l2*P2, see above - # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: we already symmetrized P2 = 1/2 (P2 + P2') elif solver == 'cd': coef, self.n_iter_, self._n_cycles = \ _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, @@ -2215,7 +2215,8 @@ def linear_predictor(self, X): return X @ self.coef_ + self.intercept_ def predict(self, X, sample_weight=None): - """Predict uing GLM with feature matrix X. + """Predict using GLM with feature matrix X. + If sample_weight is given, returns prediction*sample_weight. 
Parameters diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 17535c067bbb6..6172824cf1b79 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -29,7 +29,7 @@ def test_link_properties(link): """Test link inverse and derivative.""" x = rng.rand(100)*100 - link = link() # instatiate object + link = link() # instantiate object decimal = 10 if isinstance(link, LogitLink): # careful for large x, note expit(36) = 1 @@ -133,7 +133,7 @@ def test_sample_weights_validation(): with pytest.raises(ValueError): glm.fit(X, y, weights) - # 5. 1d but weith a negative value + # 5. 1d but with a negative value weights = [2, -1] with pytest.raises(ValueError): glm.fit(X, y, weights) From 9aa1fc41d726e5ac9007d9c867546aa70ea79ba7 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 4 Jun 2019 17:40:08 +0200 Subject: [PATCH 53/69] Make module private --- sklearn/linear_model/__init__.py | 4 ++-- sklearn/linear_model/{glm.py => _glm.py} | 0 sklearn/linear_model/tests/test_glm.py | 9 +++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) rename sklearn/linear_model/{glm.py => _glm.py} (100%) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 0c5840f343a3a..0f7856fcc2046 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,8 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from .glm import (TweedieDistribution, - GeneralizedLinearRegressor) +from ._glm import (TweedieDistribution, + GeneralizedLinearRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/_glm.py similarity index 100% rename from sklearn/linear_model/glm.py rename to sklearn/linear_model/_glm.py diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 6172824cf1b79..a148d11cb2632 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,3 +1,7 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause + import numpy as np from numpy.testing import assert_allclose import pytest @@ -5,7 +9,8 @@ from scipy import linalg, optimize, sparse from sklearn.datasets import make_classification, make_regression -from sklearn.linear_model.glm import ( +from sklearn.linear_model import GeneralizedLinearRegressor +from sklearn.linear_model._glm import ( Link, IdentityLink, LogLink, @@ -14,7 +19,7 @@ NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, GeneralizedHyperbolicSecant, BinomialDistribution, - GeneralizedLinearRegressor) +) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge from sklearn.utils.testing import ( From ca3eae24b4db9931eeac9fb925dba4f48199976c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 4 Jun 2019 21:35:11 +0200 Subject: [PATCH 54/69] Working on tests --- sklearn/linear_model/tests/test_glm.py | 52 +++++++++++++------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index a148d11cb2632..2c8a9c3d2c72c 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -23,7 +23,7 @@ from sklearn.linear_model import ElasticNet, 
LogisticRegression, Ridge from sklearn.utils.testing import ( - assert_equal, assert_almost_equal, + assert_almost_equal, assert_array_equal, assert_array_almost_equal) @@ -35,16 +35,14 @@ def test_link_properties(link): """Test link inverse and derivative.""" x = rng.rand(100)*100 link = link() # instantiate object - decimal = 10 if isinstance(link, LogitLink): # careful for large x, note expit(36) = 1 # limit max eta to 15 x = x / 100 * 15 - decimal = 8 - assert_almost_equal(link.link(link.inverse(x)), x, decimal=decimal) + assert_allclose(link.link(link.inverse(x)), x) # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) - assert_almost_equal(link.derivative(link.inverse(x)), - 1./link.inverse_derivative(x), decimal=decimal) + assert_allclose(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x)) # for LogitLink, in the following x should be between 0 and 1. # assert_almost_equal(link.inverse_derivative(link.link(x)), # 1./link.derivative(x), decimal=decimal) @@ -79,7 +77,7 @@ def test_family_bounds(family, expected): def test_deviance_zero(family, chk_values): """Test deviance(y,y) = 0 for different families.""" for x in chk_values: - assert_almost_equal(family.deviance(x, x), 0, decimal=10) + assert_allclose(family.deviance(x, x), 0, atol=1e-9) @pytest.mark.parametrize( @@ -155,7 +153,7 @@ def test_glm_family_argument(f, fam): y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) - assert_equal(type(glm._family_instance), type(fam)) + assert isinstance(glm._family_instance, fam.__class__) glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) @@ -172,7 +170,7 @@ def test_glm_link_argument(l, link): y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) - assert_equal(type(glm._link_instance), type(link)) + assert isinstance(glm._link_instance, link.__class__) glm = GeneralizedLinearRegressor(family='normal', link='not a link') with pytest.raises(ValueError): @@ -366,7 +364,7 @@ def test_glm_identiy_regression(solver): fit_intercept=False, solver=solver, start_params='zero', tol=1e-7) res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) + assert_allclose(res.coef_, coef) @pytest.mark.parametrize( @@ -375,11 +373,11 @@ def test_glm_identiy_regression(solver): GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), GeneralizedHyperbolicSecant()]) -@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), - ('lbfgs', 1e-6, 6), - ('newton-cg', 1e-7, 6), - ('cd', 1e-7, 6)]) -def test_glm_log_regression(family, solver, tol, dec): +@pytest.mark.parametrize('solver, tol', [('irls', 1e-6), + ('lbfgs', 1e-6), + ('newton-cg', 1e-7), + ('cd', 1e-7)]) +def test_glm_log_regression(family, solver, tol): """Test GLM regression with log link on a simple dataset.""" coef = [0.2, -0.1] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T @@ -388,7 +386,7 @@ def test_glm_log_regression(family, solver, tol, dec): alpha=0, family=family, link='log', fit_intercept=False, solver=solver, start_params='guess', tol=tol) res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef, decimal=dec) + assert_allclose(res.coef_, coef) @pytest.mark.filterwarnings('ignore::DeprecationWarning') @@ -421,10 +419,10 @@ def test_normal_ridge(solver, tol, dec): tol=tol, max_iter=100, solver=solver, check_input=False, random_state=rng) 
glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + assert glm.coef_.shape == (X.shape[1], ) + assert_allclose(glm.coef_, ridge.coef_) + assert glm.intercept_ == pytest.approx(ridge.intercept_) + assert_allclose(glm.predict(T), ridge.predict(T)) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) @@ -435,13 +433,13 @@ def test_normal_ridge(solver, tol, dec): check_input=False, random_state=rng, fit_dispersion='chisqr') glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert glm.coef_.shape == (X.shape[1], ) + assert_allclose(glm.coef_, ridge.coef_) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + assert_allclose(glm.predict(T), ridge.predict(T)) mu = glm.predict(X) - assert_almost_equal(glm.dispersion_, - np.sum((y-mu)**2/(n_samples-n_features))) + assert_allclose(glm.dispersion_, + np.sum((y-mu)**2/(n_samples-n_features))) # 2. With more features than samples and sparse n_samples, n_features, n_predict = 10, 100, 10 @@ -461,7 +459,7 @@ def test_normal_ridge(solver, tol, dec): tol=tol, max_iter=300, solver=solver, check_input=False, random_state=rng) glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) + assert glm.coef_.shape == (X.shape[1], ) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) @@ -474,7 +472,7 @@ def test_normal_ridge(solver, tol, dec): tol=tol*2, max_iter=300, solver=solver, check_input=False, random_state=rng) glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) + assert glm.coef_.shape == (X.shape[1], ) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) From 61bc6b8e2b1e227f539656744e9b4a4fa9f514f2 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 5 Jun 2019 18:45:20 +0200 Subject: [PATCH 55/69] Improve tests --- sklearn/linear_model/tests/test_glm.py | 82 +++++++++++++------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 2c8a9c3d2c72c..7d747c23ae441 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -22,9 +22,7 @@ ) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge -from sklearn.utils.testing import ( - assert_almost_equal, - assert_array_equal, assert_array_almost_equal) +from sklearn.utils.testing import assert_array_equal rng = np.random.RandomState(42) @@ -355,16 +353,16 @@ def test_glm_check_input_argument(check_input): @pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_glm_identiy_regression(solver): +def test_glm_identity_regression(solver): """Test GLM regression with identity link on a simple dataset.""" - coef = [1, 2] + coef = [1., 2.] 
X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', fit_intercept=False, solver=solver, start_params='zero', tol=1e-7) res = glm.fit(X, y) - assert_allclose(res.coef_, coef) + assert_allclose(res.coef_, coef, rtol=1e-6) @pytest.mark.parametrize( @@ -386,7 +384,7 @@ def test_glm_log_regression(family, solver, tol): alpha=0, family=family, link='log', fit_intercept=False, solver=solver, start_params='guess', tol=tol) res = glm.fit(X, y) - assert_allclose(res.coef_, coef) + assert_allclose(res.coef_, coef, rtol=5e-6) @pytest.mark.filterwarnings('ignore::DeprecationWarning') @@ -420,9 +418,9 @@ def test_normal_ridge(solver, tol, dec): check_input=False, random_state=rng) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_) - assert glm.intercept_ == pytest.approx(ridge.intercept_) - assert_allclose(glm.predict(T), ridge.predict(T)) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-6) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) @@ -434,9 +432,9 @@ def test_normal_ridge(solver, tol, dec): fit_dispersion='chisqr') glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_allclose(glm.predict(T), ridge.predict(T)) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-5) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) mu = glm.predict(X) assert_allclose(glm.dispersion_, np.sum((y-mu)**2/(n_samples-n_features))) @@ -452,7 +450,8 @@ def test_normal_ridge(solver, tol, dec): # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, - solver='sag', normalize=False, max_iter=100000) + solver='sag', normalize=False, max_iter=100000, + random_state=42) ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, @@ -460,22 +459,24 @@ def test_normal_ridge(solver, tol, dec): check_input=False, random_state=rng) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-6) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, - solver='sag', normalize=False, max_iter=1000) + solver='sag', normalize=False, max_iter=1000, + random_state=42) ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=False, tol=tol*2, max_iter=300, solver=solver, check_input=False, random_state=rng) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-4) + assert_allclose(glm.intercept_, 
ridge.intercept_, rtol=1e-5) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) @pytest.mark.parametrize('solver, tol, dec', @@ -506,10 +507,8 @@ def test_poisson_ridge(solver, tol, dec): solver=solver, max_iter=300, random_state=rng) glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.12889386979, - decimal=dec) - assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], - decimal=dec) + assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5) + assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-6) @pytest.mark.parametrize('diag_fisher', [False, True]) @@ -535,14 +534,14 @@ def test_normal_enet(diag_fisher): normalize=False, tol=1e-8, copy_X=True) enet.fit(X, y) - assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) - assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + assert_allclose(glm.intercept_, enet.intercept_, rtol=2e-7) + assert_allclose(glm.coef_, enet.coef_, rtol=5e-5) # 2. test normal enet on sparse data X = sparse.csc_matrix(X) glm.fit(X, y) - assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) - assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + assert_allclose(glm.intercept_, enet.intercept_, rtol=2e-7) + assert_allclose(glm.coef_, enet.coef_, rtol=5e-5) def test_poisson_enet(): @@ -569,8 +568,8 @@ def test_poisson_enet(): selection='random', random_state=rng, start_params='guess') glm.fit(X, y) - assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) - assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + assert_allclose(glm.intercept_, glmnet_intercept, rtol=2e-6) + assert_allclose(glm.coef_, glmnet_coef, rtol=2e-7) # test results with general optimization procedure def obj(coef): @@ -584,10 +583,10 @@ def obj(coef): + alpha * l1_ratio * np.sum(np.abs(coef[1:])) res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, options={'maxiter': 1000, 'disp': False}) - assert_almost_equal(glm.intercept_, res.x[0], decimal=5) - assert_almost_equal(glm.coef_, res.x[1:], decimal=5) - assert_almost_equal(obj(np.concatenate(([glm.intercept_], glm.coef_))), - res.fun, decimal=8) + assert_allclose(glm.intercept_, res.x[0], rtol=1e-5) + assert_allclose(glm.coef_, res.x[1:], rtol=1e-5, atol=1e-9) + assert_allclose(obj(np.concatenate(([glm.intercept_], glm.coef_))), + res.fun, rtol=1e-8) # same for start_params='zero' and selection='cyclic' # with reduced precision @@ -595,8 +594,8 @@ def obj(coef): link='log', solver='cd', tol=1e-5, selection='cyclic', start_params='zero') glm.fit(X, y) - assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) - assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-4) + assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) # check warm_start, therefore start with different alpha glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, @@ -609,8 +608,8 @@ def obj(coef): glm.alpha = 1 X = sparse.csr_matrix(X) glm.fit(X, y) - assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) - assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-4) + assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) @pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) @@ -629,10 +628,11 @@ def test_binomial_enet(alpha): max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), solver='saga') log.fit(X, y) + glm = GeneralizedLinearRegressor( family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, 
alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', tol=1e-7) glm.fit(X, y) - assert_almost_equal(log.intercept_[0], glm.intercept_, decimal=6) - assert_array_almost_equal(log.coef_[0, :], glm.coef_, decimal=6) + assert_allclose(log.intercept_[0], glm.intercept_, rtol=1e-6) + assert_allclose(log.coef_[0, :], glm.coef_, rtol=2e-6) From b24a7cab9fc7d9f5dcec5b9d7657fee2d0a94283 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 5 Jun 2019 18:53:58 +0200 Subject: [PATCH 56/69] Remove unused dec parameter in tests --- sklearn/linear_model/tests/test_glm.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 7d747c23ae441..a3e943403a7a7 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -388,11 +388,11 @@ def test_glm_log_regression(family, solver, tol): @pytest.mark.filterwarnings('ignore::DeprecationWarning') -@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), - ('lbfgs', 1e-6, 5), - ('newton-cg', 1e-6, 5), - ('cd', 1e-6, 6)]) -def test_normal_ridge(solver, tol, dec): +@pytest.mark.parametrize('solver, tol', [('irls', 1e-6), + ('lbfgs', 1e-6), + ('newton-cg', 1e-6), + ('cd', 1e-6)]) +def test_normal_ridge(solver, tol): """Test ridge regression for Normal distributions. Compare to test_ridge in test_ridge.py. @@ -479,12 +479,12 @@ def test_normal_ridge(solver, tol, dec): assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) -@pytest.mark.parametrize('solver, tol, dec', - [('irls', 1e-7, 6), - ('lbfgs', 1e-7, 5), - ('newton-cg', 1e-7, 5), - ('cd', 1e-7, 7)]) -def test_poisson_ridge(solver, tol, dec): +@pytest.mark.parametrize('solver, tol', + [('irls', 1e-7), + ('lbfgs', 1e-7), + ('newton-cg', 1e-7), + ('cd', 1e-7)]) +def test_poisson_ridge(solver, tol): """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" From f95b390c04fd4c0a333f08a649d9450ec44f0395 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 18 Jul 2017 21:50:10 +0200 Subject: [PATCH 57/69] ENH: add Generalized Linear Models, issue #5975 * new estimator GeneralizedLinearRegressor * loss functions for Tweedie family and Binomial * elasitc net penalties * control of penalties by matrix P2 and vector P1 * new solvers: coordinate descent, irls * tests * documentation * example for Poisson regression --- doc/modules/classes.rst | 1 + doc/modules/linear_model.rst | 129 + .../plot_poisson_spline_regression.py | 85 + sklearn/linear_model/__init__.py | 6 +- sklearn/linear_model/glm.py | 2331 +++++++++++++++++ sklearn/linear_model/tests/test_glm.py | 640 +++++ 6 files changed, 3191 insertions(+), 1 deletion(-) create mode 100644 examples/linear_model/plot_poisson_spline_regression.py create mode 100644 sklearn/linear_model/glm.py create mode 100644 sklearn/linear_model/tests/test_glm.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 2dcf582a6ab39..4158e34e8bb8c 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -727,6 +727,7 @@ Kernels: linear_model.BayesianRidge linear_model.ElasticNet linear_model.ElasticNetCV + linear_model.GeneralizedLinearRegressor linear_model.HuberRegressor linear_model.Lars linear_model.LarsCV diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 02f406f629e04..888566fab3601 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -893,6 +893,135 @@ to warm-starting (see :term:`Glossary `). .. [9] `"Performance Evaluation of Lbfgs vs other solvers" `_ +.. _Generalized_linear_regression: + +Generalized Linear Regression +============================= + +:class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two +ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear +combination of the input variables :math:`X` via an inverse link function +:math:`h` as + +.. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). + +Secondly, the squared loss function is replaced by the deviance :math:`D` of an +exponential dispersion model (EDM) [11]_. The objective function beeing minimized +becomes + +.. math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1 + +\frac{\alpha(1-\rho)}{2} w^T P_2 w + +with sample weights :math:`s`. +:math:`P_1` (diagonal matrix) can be used to exclude some of the coefficients in +the L1 penalty, the matrix :math:`P_2` (must be positive semi-definite) allows +for a more versatile L2 penalty. + +Use cases, where a loss different from the squared loss might be appropriate, +are the following: + + * If the target values :math:`y` are counts (non-negative integer valued) or + frequencies (non-negative), you might use a Poisson deviance with log-link. + + * If the target values are positive valued and skewed, you might try a + Gamma deviance with log-link. + + * If the target values seem to be heavier tailed than a Gamma distribution, + you might try an Inverse Gaussian deviance (or even higher variance powers + of the Tweedie family). + +Since the linear predictor :math:`Xw` can be negative and +Poisson, Gamma and Inverse Gaussian distributions don't support negative values, +it is convenient to apply a link function different from the identity link +:math:`h(Xw)=Xw` that guarantees the non-negativeness, e.g. the log-link with +:math:`h(Xw)=\exp(Xw)`. 
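+As a minimal illustration of this point (plain NumPy only, independent of the
+estimator interface shown below), the inverse log-link maps any real-valued
+linear predictor to a strictly positive mean::
+
+    import numpy as np
+
+    eta = np.array([-3.0, -0.5, 0.0, 2.0])  # linear predictor X @ w, may be negative
+    mu = np.exp(eta)                         # inverse log-link h(eta) = exp(eta)
+    # mu is strictly positive: approx. [0.0498, 0.6065, 1.0, 7.389]
+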
+ +Note that the feature matrix `X` should be standardized before fitting. This +ensures that the penalty treats features equally. The estimator can be used as +follows: + + >>> from sklearn.linear_model import GeneralizedLinearRegressor + >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE + GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, + check_input=True, copy_X=True, diag_fisher=False, + family='poisson', fit_dispersion=None, + fit_intercept=True, l1_ratio=0, link='log', + max_iter=100, random_state=None, selection='cyclic', + solver='auto', start_params='guess', tol=0.0001, + verbose=0, warm_start=False) + >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE + array([0.24630169, 0.43373464]) + >>> reg.intercept_ #doctest: +ELLIPSIS + -0.76383633... + + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_spline_regression.py` + +Mathematical formulation +------------------------ + +In the unpenalized case, the assumptions are the following: + + * The target values :math:`y_i` are realizations of random variables + :math:`Y_i \overset{i.i.d}{\sim} \mathrm{EDM}(\mu_i, \frac{\phi}{s_i})` + with expectation :math:`\mu_i=\mathrm{E}[Y]`, dispersion parameter + :math:`\phi` and sample weights :math:`s_i`. + * The aim is to predict the expectation :math:`\mu_i` with + :math:`\hat{y_i} = h(\eta_i)`, linear predictor + :math:`\eta_i=(Xw)_i` and inverse link function :math:`h(\eta)`. + +Note that the first assumption implies +:math:`\mathrm{Var}[Y_i]=\frac{\phi}{s_i} v(\mu_i)` with unit variance +function :math:`v(\mu)`. Specifying a particular distribution of an EDM is the +same as specifying a unit variance function (they are one-to-one). + +Including penalties helps to avoid overfitting or, in case of L1 penalty, to +obtain sparse solutions. But there are also other motivations to include them, +e.g. accounting for the dependence structure of :math:`y`. + +The objective function, which is independent of :math:`\phi`, is minimized with +respect to the coefficients :math:`w`. + +The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` +likelihood as + +.. math:: d(y, \mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) + - loglike(y,y,\phi)\right) \\ + D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) + +===================================== =============================== ================================= ============================================ +Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` +===================================== =============================== ================================= ============================================ +Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` +Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +===================================== =============================== ================================= ============================================ + +Two remarks: + +* The deviances for at least Normal, Poisson and Gamma distributions are + strictly consistent scoring functions for the mean :math:`\mu`, see Eq. + (19)-(20) in [12]_. 
+ +* If you want to model a frequency, i.e. counts per exposure (time, volume, ...) + you can do so by a Poisson distribution and passing + :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together + with :math:`s=\mathrm{exposure}` as sample weights. + + +.. topic:: References: + + .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. + See also `Exponential dispersion model. `_ + + .. [12] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ Stochastic Gradient Descent - SGD ================================= diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py new file mode 100644 index 0000000000000..fce85fae1ea8c --- /dev/null +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -0,0 +1,85 @@ +""" +================================= +Poisson Regression with B-Splines +================================= + +As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` +example, a Poisson regression with penalized B-splines (P-splines) [1]_ is +fitted on slightly different sinusodial, Poisson distributed data and +compared to an AdaBoost model with decision trees. +One can see, that this is a hard problem for both estimators. + +.. [1] Eilers, Paul H. C.; Marx, Brian D. "Flexible smoothing with B -splines + and penalties". Statist. Sci. 11 (1996), no. 2, 89--121. + `doi:10.1214/ss/1038425655 + `_ + +""" +print(__doc__) + +# Author: Christian Lorentzen +# based on the AdaBoost regression example from Noel Dawe +# License: BSD 3 clause + +# importing necessary libraries +import numpy as np +from scipy.linalg import toeplitz +# from scipy.interpolate import BSpline +from scipy.interpolate import splev +import matplotlib.pyplot as plt +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import AdaBoostRegressor +from sklearn.linear_model import GeneralizedLinearRegressor + + +# Create the dataset +xmin, xmax = 0, 6 +rng = np.random.RandomState(1) +X = np.linspace(xmin, xmax, 500)[:, np.newaxis] +y_true = 0.5 * (2.1 + np.sin(X).ravel() + np.sin(6 * X).ravel()) +y = rng.poisson(y_true, X.shape[0]) + +# b-spline basis +nknots, degree = 40, 3 +ns = nknots - degree - 1 # number of base spline functions +dx = (xmax - xmin) / (nknots - 1 - 2 * degree) +knots = np.linspace(xmin - degree * dx, 6 + degree * dx, nknots) +coef = np.zeros(ns) +splineBasis = np.empty((X.shape[0], ns), dtype=float) +for i in range(ns): + coef[i] = 1 +# splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ +# .ravel() + splineBasis[:, i] = splev(X, (knots, coef, degree)).ravel() + coef[i] = 0 + +# second order difference matrix +P2 = toeplitz([2, -1] + [0] * (ns - 2)).astype(float) +P2[0, 0] = P2[-1, -1] = 1 + +# Fit regression model +regr_1 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), + n_estimators=10, random_state=rng) + +regr_2 = GeneralizedLinearRegressor(family='poisson', link='log', + fit_intercept=True, alpha=0.02, + l1_ratio=0.1, P2=P2) + +regr_1.fit(X, y) +regr_2.fit(splineBasis, y) + +# Predict +y_1 = regr_1.predict(X) +y_2 = regr_2.predict(splineBasis) + +# Plot the results +plt.figure() +plt.plot(X, y_true, c="b", label="true mean") +plt.scatter(X, y, c="k", marker='.', label="training samples") +plt.plot(X, y_1, 
c="g", label="AdaBoost n_estimator=10", linewidth=2) +plt.plot(X, y_2, c="r", label="Poisson GLM with B-splines", linewidth=2) +plt.xlabel("data") +plt.ylabel("target") +plt.title("Regression Comparison") +plt.legend() +plt.show() diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 770a1a49b600e..cbb2ad8826358 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,6 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) +from .glm import (TweedieDistribution, + GeneralizedLinearRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -78,4 +80,6 @@ 'orthogonal_mp', 'orthogonal_mp_gram', 'ridge_regression', - 'RANSACRegressor'] + 'RANSACRegressor', + 'GeneralizedLinearRegressor', + 'TweedieDistribution'] diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py new file mode 100644 index 0000000000000..ac0007c1789a8 --- /dev/null +++ b/sklearn/linear_model/glm.py @@ -0,0 +1,2331 @@ +""" +Generalized Linear Models with Exponential Dispersion Family +""" + +# Author: Christian Lorentzen +# some parts and tricks stolen from other sklearn files. +# License: BSD 3 clause + +# TODO: Should the option `normalize` be included (like other linear models)? +# So far, it is not included. User must pass a normalized X. +# TODO: Add cross validation support, e.g. GCV? +# TODO: Should GeneralizedLinearRegressor inherit from LinearModel? +# So far, it does not. +# TODO: Include further classes in class.rst? ExponentialDispersionModel? +# TweedieDistribution? +# TODO: Negative values in P1 are not allowed so far. They could be used +# for group lasso. + +# Design Decisions: +# - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. +# Estimators in sklearn are either regressors or classifiers. A GLM can do +# both depending on the distr (Normal => regressor, Binomial => classifier). +# Solution: GeneralizedLinearRegressor since this is the focus. +# - Allow for finer control of penalty terms: +# L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude +# factors from the L1 penalty. +# L2: w*P2*w with P2 a positive (semi-) definite matrix, e.g. P2 could be +# a 1st or 2nd order difference matrix (compare B-spline penalties and +# Tikhonov regularization). +# - The link funtion (instance of class Link) is necessary for the evaluation +# of deviance, score, Fisher and Hessian matrix as a functions of the +# coefficients, which is needed by optimizers. +# Solution: link as argument in those functions +# - Which name/symbol for sample_weight in docu? +# sklearn.linear_models uses w for coefficients, standard literature on +# GLMs use beta for coefficients and w for (sample) weights. +# So far, coefficients=w and sample weights=s. +# - The intercept term is the first index, i.e. 
coef[0] + + +from __future__ import division +from abc import ABCMeta, abstractmethod +import numbers +import numpy as np +from scipy import linalg, sparse, special +import scipy.sparse.linalg as splinalg +from scipy.optimize import fmin_l_bfgs_b +import warnings +from ..base import BaseEstimator, RegressorMixin +from ..exceptions import ConvergenceWarning +from ..utils import check_array, check_X_y +from ..utils.optimize import newton_cg +from ..utils.validation import check_is_fitted, check_random_state + + +def _check_weights(sample_weight, n_samples): + """Check that sample weights are non-negative and have the right shape.""" + if sample_weight is None: + weights = np.ones(n_samples) + elif np.isscalar(sample_weight): + if sample_weight <= 0: + raise ValueError("Sample weights must be non-negative.") + weights = sample_weight * np.ones(n_samples) + else: + _dtype = [np.float64, np.float32] + weights = check_array(sample_weight, accept_sparse=False, + force_all_finite=True, ensure_2d=False, + dtype=_dtype) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != n_samples: + raise ValueError("Sample weights must have the same length as " + "y") + if not np.all(weights >= 0): + raise ValueError("Sample weights must be non-negative.") + elif not np.sum(weights) > 0: + raise ValueError("Sample weights must have at least one positive " + "element.") + + return weights + + +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +def _safe_toarray(X): + """Returns a numpy array.""" + if sparse.issparse(X): + return X.toarray() + else: + return np.asarray(X) + + +def _safe_sandwich_dot(X, d, intercept=False): + """Compute sandwich product X.T @ diag(d) @ X. + + With ``intercept=True``, X is treated as if a column of 1 were appended as + first column of X. + X can be sparse, d must be an ndarray. Always returns a ndarray.""" + if sparse.issparse(X): + temp = (X.transpose() @ X.multiply(d[:, np.newaxis])) + # for older versions of numpy and scipy, temp may be a np.matrix + temp = _safe_toarray(temp) + else: + temp = (X.T * d) @ X + if intercept: + dim = X.shape[1] + 1 + if sparse.issparse(X): + order = 'F' if sparse.isspmatrix_csc(X) else 'C' + else: + order = 'F' if X.flags['F_CONTIGUOUS'] else 'C' + res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order) + res[0, 0] = d.sum() + res[1:, 0] = d @ X + res[0, 1:] = res[1:, 0] + res[1:, 1:] = temp + else: + res = temp + return res + + +def _min_norm_sugrad(coef, grad, P2, P1): + """Compute the gradient of all subgradients with minimal L2-norm. + + subgrad = grad + P2 * coef + P1 * subgrad(|coef|_1) + + g_i = grad_i + (P2*coef)_i + + if coef_i > 0: g_i + P1_i + if coef_i < 0: g_i - P1_i + if coef_i = 0: sign(g_i) * max(|g_i|-P1_i, 0) + + Parameters + ---------- + coef : ndarray + coef[0] may be intercept. 
+ + grad : ndarray, shape=coef.shape + + P2 : {1d or 2d array, None} + always without intercept, ``None`` means P2 = 0 + + P1 : ndarray + always without intercept + """ + intercept = (coef.size == P1.size + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + # compute grad + coef @ P2 without intercept + grad_wP2 = grad[idx:].copy() + if P2 is None: + pass + elif P2.ndim == 1: + grad_wP2 += coef[idx:] * P2 + else: + grad_wP2 += coef[idx:] @ P2 + res = np.where(coef[idx:] == 0, + np.sign(grad_wP2) * np.maximum(np.abs(grad_wP2) - P1, 0), + grad_wP2 + np.sign(coef[idx:]) * P1) + if intercept: + return np.concatenate(([grad[0]], res)) + else: + return res + + +class Link(metaclass=ABCMeta): + """Abstract base class for Link funtions.""" + + @abstractmethod + def link(self, mu): + """Compute the link function g(mu). + + The link function links the mean mu=E[Y] to the so called linear + predictor (X*w), i.e. g(mu) = linear predictor. + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the (predicted) mean. + """ + raise NotImplementedError + + @abstractmethod + def derivative(self, mu): + """Compute the derivative of the link g'(mu). + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the (predicted) mean. + """ + raise NotImplementedError + + @abstractmethod + def inverse(self, lin_pred): + """Compute the inverse link function h(lin_pred). + + Gives the inverse relationship between linkear predictor and the mean + mu=E[Y], i.e. h(linear predictor) = mu. + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative(self, lin_pred): + """Compute the derivative of the inverse link function h'(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative2(self, lin_pred): + """Compute 2nd derivative of the inverse link function h''(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + raise NotImplementedError + + +class IdentityLink(Link): + """The identity link function g(x)=x.""" + + def link(self, mu): + return mu + + def derivative(self, mu): + return np.ones_like(mu) + + def inverse(self, lin_pred): + return lin_pred + + def inverse_derivative(self, lin_pred): + return np.ones_like(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.zeros_like(lin_pred) + + +class LogLink(Link): + """The log link function g(x)=log(x).""" + + def link(self, mu): + return np.log(mu) + + def derivative(self, mu): + return 1./mu + + def inverse(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.exp(lin_pred) + + +class LogitLink(Link): + """The logit link function g(x)=logit(x).""" + + def link(self, mu): + return special.logit(mu) + + def derivative(self, mu): + return 1. / (mu * (1 - mu)) + + def inverse(self, lin_pred): + return special.expit(lin_pred) + + def inverse_derivative(self, lin_pred): + ep = special.expit(lin_pred) + return ep * (1. - ep) + + def inverse_derivative2(self, lin_pred): + ep = special.expit(lin_pred) + ep = special.expit(lin_pred) + return ep * (1. - ep) * (1. 
- 2 * ep) + + +class ExponentialDispersionModel(metaclass=ABCMeta): + r"""Base class for reproductive Exponential Dispersion Models (EDM). + + The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + + with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, + unit variance :math:`v(\mu)` and + unit deviance :math:`d(y,\mu)`. + + Attributes + ---------- + lower_bound + upper_bound + include_lower_bound + include_upper_bound + + Methods + ------- + in_y_range + unit_variance + unit_variance_derivative + variance + variance_derivative + unit_deviance + unit_deviance_derivative + deviance + deviance_derivative + starting_mu + + _mu_deviance_derivative + _score + _fisher_matrix + _observed_information + _eta_mu_score_fisher + + References + ---------- + + https://en.wikipedia.org/wiki/Exponential_dispersion_model. + """ + @property + def lower_bound(self): + """Get the lower bound of values for Y~EDM.""" + return self._lower_bound + + @property + def upper_bound(self): + """Get the upper bound of values for Y~EDM.""" + return self._upper_bound + + @property + def include_lower_bound(self): + """Get True if lower bound for y is included: y >= lower_bound.""" + return self._include_lower_bound + + @property + def include_upper_bound(self): + """Get True if upper bound for y is includede: y <= upper_bound.""" + return self._include_upper_bound + + def in_y_range(self, x): + """Returns ``True`` if x is in the valid range of Y~EDM. + + Parameters + ---------- + x : array, shape (n_samples,) + Target values. + """ + if self.include_lower_bound: + if self.include_upper_bound: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less(x, self.upper_bound)) + else: + if self.include_upper_bound: + return np.logical_and(np.greater(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater(x, self.lower_bound), + np.less(x, self.upper_bound)) + + @abstractmethod + def unit_variance(self, mu): + r"""Compute the unit variance function. + + The unit variance :math:`v(\mu)` determines the variance as + a function of the mean :math:`\mu` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. + It can also be derived from the unit deviance :math:`d(y,\mu)` as + + .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ + \partial\mu^2}}\big|_{y=\mu} + + See also :func:`variance`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + raise NotImplementedError() + + @abstractmethod + def unit_variance_derivative(self, mu): + r"""Compute the derivative of the unit variance w.r.t. mu. + + Return :math:`v'(\mu)`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Target values. + """ + raise NotImplementedError() + + def variance(self, mu, phi=1, weights=1): + r"""Compute the variance function. + + The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, + with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. 
+ + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return phi/weights * self.unit_variance(mu) + + def variance_derivative(self, mu, phi=1, weights=1): + r"""Compute the derivative of the variance w.r.t. mu. + + Returns + :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] + =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` + and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return phi/weights * self.unit_variance_derivative(mu) + + @abstractmethod + def unit_deviance(self, y, mu): + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + """ + raise NotImplementedError() + + def unit_deviance_derivative(self, y, mu): + r"""Compute the derivative of the unit deviance w.r.t. mu. + + The derivative of the unit deviance is given by + :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + with unit variance :math:`v(\mu)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + """ + return -2 * (y - mu) / self.unit_variance(mu) + + def deviance(self, y, mu, weights=1): + r"""Compute the deviance. + + The deviance is a weighted sum of the per sample unit deviances, + :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` + with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. + In terms of the log-likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,\mu,\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return np.sum(weights * self.unit_deviance(y, mu)) + + def deviance_derivative(self, y, mu, weights=1): + """Compute the derivative of the deviance w.r.t. mu. + + It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return weights * self.unit_deviance_derivative(y, mu) + + def starting_mu(self, y, weights=1, ind_weight=0.5): + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + + ind_weight : float (default=0.5) + Must be between 0 and 1. Specifies how much weight is given to the + individual observations instead of the mean of y. + """ + return (ind_weight * y + + (1. 
- ind_weight) * np.average(y, weights=weights)) + + def _mu_deviance_derivative(self, coef, X, y, weights, link): + """Compute mu and the derivative of the deviance w.r.t coef.""" + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + d1 = link.inverse_derivative(lin_pred) + temp = d1 * self.deviance_derivative(y, mu, weights) + if coef.size == X.shape[1] + 1: + devp = np.concatenate(([temp.sum()], temp @ X)) + else: + devp = temp @ X # sampe as X.T @ temp + return mu, devp + + def _score(self, coef, phi, X, y, weights, link): + r"""Compute the score function. + + The score function is the derivative of the + log-likelihood w.r.t. `coef` (:math:`w`). + It is given by + + .. math: + + \mathbf{score}(\boldsymbol{w}) + = \frac{\partial loglike}{\partial\boldsymbol{w}} + = \mathbf{X}^T \mathbf{D} + \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, + + with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and + :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. + Note: The derivative of the deviance w.r.t. coef equals -2 * score. + """ + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) + d = link.inverse_derivative(lin_pred) + temp = sigma_inv * d * (y - mu) + if coef.size == X.shape[1] + 1: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as X.T @ temp + return score + + def _fisher_matrix(self, coef, phi, X, y, weights, link): + r"""Compute the Fisher information matrix. + + The Fisher information matrix, also known as expected information + matrix is given by + + .. math: + + \mathbf{F}(\boldsymbol{w}) = + \mathrm{E}\left[-\frac{\partial\mathbf{score}}{\partial + \boldsymbol{w}} \right] + = \mathrm{E}\left[ + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T}\right] + = \mathbf{X}^T W \mathbf{X} \,, + + with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, + see func:`_score`. + """ + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) + d = link.inverse_derivative(lin_pred) + d2_sigma_inv = sigma_inv * d * d + intercept = (coef.size == X.shape[1] + 1) + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) + return fisher_matrix + + def _observed_information(self, coef, phi, X, y, weights, link): + r"""Compute the observed information matrix. + + The observed information matrix, also known as the negative of + the Hessian matrix of the log-likelihood, is given by + + .. math: + + \mathbf{H}(\boldsymbol{w}) = + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T} + = \mathbf{X}^T \left[ + - \mathbf{D}' \mathbf{R} + + \mathbf{D}^2 \mathbf{V} \mathbf{R} + + \mathbf{D}^2 + \right] \boldsymbol{\Sigma}^{-1} \mathbf{X} \,, + + with :math:`\mathbf{R} = \mathrm{diag}(y_i - \mu_i)`, + :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{ + v(\mu_i)} + \right)`, + see :func:`score_` function and :func:`_fisher_matrix`. 
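+
+        Note: For canonical link functions (e.g. the log link for the
+        Poisson distribution) the two terms involving :math:`\mathbf{R}`
+        cancel, so the observed information coincides with the Fisher
+        matrix of :func:`_fisher_matrix`.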
+        """
+        lin_pred = _safe_lin_pred(X, coef)
+        mu = link.inverse(lin_pred)
+        sigma_inv = 1/self.variance(mu, phi=phi, weights=weights)
+        dp = link.inverse_derivative2(lin_pred)
+        d2 = link.inverse_derivative(lin_pred)**2
+        v = self.unit_variance_derivative(mu)/self.unit_variance(mu)
+        r = y - mu
+        temp = sigma_inv * (-dp * r + d2 * v * r + d2)
+        intercept = (coef.size == X.shape[1] + 1)
+        observed_information = _safe_sandwich_dot(X, temp,
+                                                  intercept=intercept)
+        return observed_information
+
+    def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link,
+                             diag_fisher=False):
+        """Compute linear predictor, mean, score function and Fisher matrix.
+
+        It calculates the linear predictor, the mean, the score function
+        (derivative of the log-likelihood) and the Fisher information matrix
+        all in one go as a function of `coef` (:math:`w`) and the data.
+
+        Parameters
+        ----------
+        diag_fisher : boolean, optional (default=False)
+            If ``True``, returns only an array d such that
+            fisher = X.T @ np.diag(d) @ X.
+
+        Returns
+        -------
+        (eta, mu, score, fisher) : tuple with 4 elements
+            The 4 elements are:
+
+            * eta: ndarray, shape (X.shape[0],)
+            * mu: ndarray, shape (X.shape[0],)
+            * score: ndarray, shape (X.shape[1],) or (X.shape[1] + 1,) if an
+              intercept is included
+            * fisher:
+
+                * If diag_fisher is ``False``, the full Fisher matrix,
+                  an array of shape (X.shape[1], X.shape[1])
+                * If diag_fisher is ``True``, an array of shape (X.shape[0],)
+        """
+        intercept = (coef.size == X.shape[1] + 1)
+        # eta = linear predictor
+        eta = _safe_lin_pred(X, coef)
+        mu = link.inverse(eta)
+        sigma_inv = 1./self.variance(mu, phi=phi, weights=weights)
+        d1 = link.inverse_derivative(eta)  # = h'(eta)
+        # Alternatively:
+        # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is the inverse of g
+        # d1 = 1./link.derivative(mu)
+        d1_sigma_inv = d1 * sigma_inv
+        temp = d1_sigma_inv * (y - mu)
+        if intercept:
+            score = np.concatenate(([temp.sum()], temp @ X))
+        else:
+            score = temp @ X
+
+        d2_sigma_inv = d1 * d1_sigma_inv
+        if diag_fisher:
+            fisher_matrix = d2_sigma_inv
+        else:
+            fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv,
+                                               intercept=intercept)
+        return eta, mu, score, fisher_matrix
+
+
+class TweedieDistribution(ExponentialDispersionModel):
+    r"""A class for the Tweedie distribution.
+
+    A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely
+    defined by its mean-variance relationship
+    :math:`\mathrm{Var}[Y] \propto \mu^{power}`.
+
+    Special cases are:
+
+    ===== ================
+    Power Distribution
+    ===== ================
+    0     Normal
+    1     Poisson
+    (1,2) Compound Poisson
+    2     Gamma
+    3     Inverse Gaussian
+    ===== ================
+
+    Parameters
+    ----------
+    power : float (default=0)
+            The variance power of the `unit_variance`
+            :math:`v(\mu) = \mu^{power}`.
+            For ``0 < power < 1``, no distribution exists.
+    """
+    def __init__(self, power=0):
+        self.power = power
+        self._upper_bound = np.Inf
+        self._include_upper_bound = False
+        if power < 0:
+            # Extreme Stable
+            self._lower_bound = -np.Inf
+            self._include_lower_bound = False
+        elif power == 0:
+            # NormalDistribution
+            self._lower_bound = -np.Inf
+            self._include_lower_bound = False
+        elif (power > 0) and (power < 1):
+            raise ValueError('For 0 < power < 1, no distribution exists.')
+        elif power == 1:
+            # PoissonDistribution
+            self._lower_bound = 0
+            self._include_lower_bound = True
+        elif (power > 1) and (power < 2):
+            # Compound Poisson
+            self._lower_bound = 0
+            self._include_lower_bound = True
+        elif power == 2:
+            # GammaDistribution
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        elif (power > 2) and (power < 3):
+            # Positive Stable
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        elif power == 3:
+            # InverseGaussianDistribution
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        elif power > 3:
+            # Positive Stable
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        else:
+            raise ValueError('The power must be a float, i.e.
real number, ' + 'got (power={})'.format(power)) + + @property + def power(self): + return self._power + + @power.setter + def power(self, power): + if not isinstance(power, numbers.Real): + raise TypeError('power must be a real number, input was {0}' + .format(power)) + self._power = power + + def unit_variance(self, mu): + """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + return np.power(mu, self.power) + + def unit_variance_derivative(self, mu): + """Compute the derivative of the unit variance of a Tweedie + distribution v(mu)=power*mu**(power-1). + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + return self.power * np.power(mu, self.power - 1) + + def unit_deviance(self, y, mu): + p = self.power + if p == 0: + # NormalDistribution + return (y - mu)**2 + if p == 1: + # PoissonDistribution + # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 + return 2 * (special.xlogy(y, y/mu) - y + mu) + elif p == 2: + # GammaDistribution + return 2 * (np.log(mu/y) + y/mu - 1) + else: + # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) + # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) + return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - + y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + + +class NormalDistribution(TweedieDistribution): + """Class for the Normal (aka Gaussian) distribution""" + def __init__(self): + super(NormalDistribution, self).__init__(power=0) + + +class PoissonDistribution(TweedieDistribution): + """Class for the scaled Poisson distribution""" + def __init__(self): + super(PoissonDistribution, self).__init__(power=1) + + +class GammaDistribution(TweedieDistribution): + """Class for the Gamma distribution""" + def __init__(self): + super(GammaDistribution, self).__init__(power=2) + + +class InverseGaussianDistribution(TweedieDistribution): + """Class for the scaled InverseGaussianDistribution distribution""" + def __init__(self): + super(InverseGaussianDistribution, self).__init__(power=3) + + +class GeneralizedHyperbolicSecant(ExponentialDispersionModel): + """A class for the Generalized Hyperbolic Secant (GHS) distribution. + + The GHS distribution is for tagets y in (-inf, inf). + """ + def __init__(self): + self._lower_bound = -np.Inf + self._upper_bound = np.Inf + self._include_lower_bound = False + self._include_upper_bound = False + + def unit_variance(self, mu): + return 1 + mu**2 + + def unit_variance_derivative(self, mu): + return 2 * mu + + def unit_deviance(self, y, mu): + return (2 * y * (np.arctan(y) - np.arctan(mu)) + + np.log((1 + mu**2)/(1 + y**2))) + + +class BinomialDistribution(ExponentialDispersionModel): + """A class for the Binomial distribution. + + The Binomial distribution is for tagets y in [0, 1]. + """ + def __init__(self): + self._lower_bound = 0 + self._upper_bound = 1 + self._include_lower_bound = True + self._include_upper_bound = True + + def unit_variance(self, mu): + return mu * (1 - mu) + + def unit_variance_derivative(self, mu): + return 1 - 2 * mu + + def unit_deviance(self, y, mu): + return 2 * (special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) + + +def _irls_step(X, W, P2, z, fit_intercept=True): + """Compute one step in iteratively reweighted least squares. + + Solve A w = b for w with + A = (X' W X + P2) + b = X' W z + z = eta + D^-1 (y-mu) + + See also fit method of :class:`GeneralizedLinearRegressor`. 
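+
+    For dense X, ``fit_intercept=False`` and a 2d ``P2``, this amounts
+    roughly to the following sketch::
+
+        A = (X.T * W) @ X + P2
+        b = X.T @ (W * z)
+        coef = scipy.linalg.lstsq(A, b)[0]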
+ + Parameters + ---------- + X : {ndarray, sparse matrix}, shape (n_samples, n_features) + Training data (with intercept included if present) + + W : ndarray, shape (n_samples,) + + P2 : {ndarray, sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix) + + z : ndarray, shape (n_samples,) + Working observations + + fit_intercept : boolean, optional (default=True) + + Returns + ------- + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. + """ + # Note: solve vs least squares, what is more appropriate? + # scipy.linalg.solve seems faster, but scipy.linalg.lstsq + # is more robust. + # Note: X.T @ W @ X is not sparse, even when X is sparse. + # Sparse solver would splinalg.spsolve(A, b) or splinalg.lsmr(A, b) + if fit_intercept: + Wz = W * z + if sparse.issparse(X): + b = np.concatenate(([Wz.sum()], X.transpose() @ Wz)) + else: + b = np.concatenate(([Wz.sum()], X.T @ Wz)) + A = _safe_sandwich_dot(X, W, intercept=fit_intercept) + if P2.ndim == 1: + idx = np.arange(start=1, stop=A.shape[0]) + A[(idx, idx)] += P2 # add to diag elements without intercept + elif sparse.issparse(P2): + A[1:, 1:] += P2.toarray() + else: + A[1:, 1:] += P2 + else: + if sparse.issparse(X): + XtW = X.transpose().multiply(W) + # for older versions of numpy and scipy, A may be a np.matrix + A = _safe_toarray(XtW @ X) + else: + XtW = (X.T * W) + A = XtW @ X + b = XtW @ z + if P2.ndim == 1: + A[np.diag_indices_from(A)] += P2 + elif sparse.issparse(P2): + A += P2.toarray() + else: + A += P2 + # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) + coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) + return coef + + +def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, + max_iter, tol): + """Solve GLM with L2 penalty by IRLS algorithm. + + Note: If X is sparse, P2 must also be sparse. + """ + # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' + # Obj = objective function = 1/2 Dev + l2/2 w P2 w + # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 + # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) + # D2 = link.inverse_derivative(eta)^2 = D^2 + # W = D2/V(mu) + # l2 = alpha * (1 - l1_ratio) + # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w + # = -X' D (y-mu)/V(mu) + l2 P2 w + # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 + # Use Fisher matrix instead of full info matrix -X'(...) X, + # i.e. E[Dev''] with E[y-mu]=0: + # Obj'' ~ X' W X + l2 P2 + # (1): w = (X' W X + l2 P2)^-1 X' W z, + # with z = eta + D^-1 (y-mu) + # Note: P2 must be symmetrized + # Note: ' denotes derivative, but also transpose for matrices + + # eta = linear predictor + eta = _safe_lin_pred(X, coef) + mu = link.inverse(eta) + # D = h'(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + n_iter = 0 + while n_iter < max_iter: + n_iter += 1 + # coef_old not used so far. + # coef_old = coef + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = hp**2 / V + # working observations + z = eta + (y - mu) / hp + # solve A*coef = b + # A = X' W X + P2, b = X' W z + coef = _irls_step(X, W, P2, z, fit_intercept=fit_intercept) + # updated linear predictor + # do it here for updated values for tolerance + eta = _safe_lin_pred(X, coef) + mu = link.inverse(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + + # which tolerace? |coef - coef_old| or gradient? 
+ # use gradient for compliance with newton-cg and lbfgs + # gradient = -X' D (y-mu)/V(mu) + l2 P2 w + temp = hp * (y - mu) / V + if sparse.issparse(X): + gradient = -(X.transpose() @ temp) + else: + gradient = -(X.T @ temp) + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept + if P2.ndim == 1: + gradient += P2 * coef[idx:] + else: + gradient += P2 @ coef[idx:] + if fit_intercept: + gradient = np.concatenate(([-temp.sum()], gradient)) + if (np.max(np.abs(gradient)) <= tol): + converged = True + break + + if not converged: + warnings.warn("irls failed to converge. Increase the number " + "of iterations (currently {0})" + .format(max_iter), ConvergenceWarning) + + return coef, n_iter + + +def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=1000, selection='cyclic', + random_state=None, diag_fisher=False): + """Compute inner loop of coordinate descent, i.e. cycles through features. + + Minimization of 1-d subproblems:: + + min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 + + A = f'(w) + d*H(w) + (w+d)*P2 + B = H+P2 + Note: f'=-score and H=fisher are updated at the end of outer iteration. + """ + # TODO: use sparsity (coefficient already 0 due to L1 penalty) + # => active set of features for featurelist, see paper + # of Improved GLMNET or Gap Safe Screening Rules + # https://arxiv.org/abs/1611.05780 + n_samples, n_features = X.shape + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + B = fisher + if P2.ndim == 1: + coef_P2 = coef[idx:] * P2 + if not diag_fisher: + idiag = np.arange(start=idx, stop=B.shape[0]) + # B[np.diag_indices_from(B)] += P2 + B[(idiag, idiag)] += P2 + else: + coef_P2 = coef[idx:] @ P2 + if not diag_fisher: + if sparse.issparse(P2): + B[idx:, idx:] += P2.toarray() + else: + B[idx:, idx:] += P2 + # A = -score + coef_P2 + A = -score + A[idx:] += coef_P2 + # A += d @ (H+P2) but so far d=0 + # inner loop + for inner_iter in range(1, max_inner_iter+1): + inner_iter += 1 + n_cycles += 1 + # cycle through features, update intercept separately at the end + if selection == 'random': + featurelist = random_state.permutation(n_features) + else: + featurelist = np.arange(n_features) + for j in featurelist: + # minimize_z: a z + 1/2 b z^2 + c |d+z| + # a = A_j + # b = B_jj > 0 + # c = |P1_j| = P1_j > 0, see 1.3 + # d = w_j + d_j + # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) + # with beta = z+d, beta_hat = d-a/b and gamma = c/b + # z = 1/b * S(bd-a,c) - d + # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding + jdx = j+idx # index for arrays containing entries for intercept + a = A[jdx] + if diag_fisher: + # Note: fisher is ndarray of shape (n_samples,) => no idx + # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway + Bj = np.zeros_like(A) + if intercept: + Bj[0] = fisher.sum() + if sparse.issparse(X): + Bj[idx:] = _safe_toarray(X[:, j].transpose() @ + X.multiply(fisher[:, np.newaxis]) + ).ravel() + else: + Bj[idx:] = (fisher * X[:, j]) @ X + + if P2.ndim == 1: + Bj[idx:] += P2[j] + else: + if sparse.issparse(P2): + # slice columns as P2 is csc + Bj[idx:] += P2[:, j].toarray().ravel() + else: + Bj[idx:] += P2[:, j] + b = Bj[jdx] + else: + b = B[jdx, jdx] + + # those ten lines aree what it is all about + if b <= 0: + z = 0 + elif P1[j] == 0: + z = -a/b + elif a + P1[j] < b * (coef[jdx] + d[jdx]): + z = -(a + P1[j])/b + elif a - P1[j] > b * (coef[jdx] + d[jdx]): + z = -(a - P1[j])/b + else: + z = -(coef[jdx] + d[jdx]) + + # update direction d + d[jdx] += z + # update A because d_j is now d_j+z + # A = f'(w) + d*H(w) + (w+d)*P2 + # => A += (H+P2)*e_j z = B_j * z + # Note: B is symmetric B = B.transpose + if diag_fisher: + # Bj = B[:, j] calculated above, still valid + A += Bj * z + else: + # B is symmetric, C- or F-contiguous, but never sparse + if B.flags['F_CONTIGUOUS']: + # slice columns like for sparse csc + A += B[:, jdx] * z + else: # B.flags['C_CONTIGUOUS'] might be true + # slice rows + A += B[jdx, :] * z + # end of cycle over features + # update intercept + if intercept: + if diag_fisher: + Bj = np.zeros_like(A) + Bj[0] = fisher.sum() + Bj[1:] = fisher @ X + b = Bj[0] + else: + b = B[0, 0] + z = 0 if b <= 0 else -A[0]/b + d[0] += z + if diag_fisher: + A += Bj * z + else: + if B.flags['F_CONTIGUOUS']: + A += B[:, 0] * z + else: + A += B[0, :] * z + # end of complete cycle + # stopping criterion for inner loop + # sum_i(|minimum of norm of subgrad of q(d)_i|) + # subgrad q(d) = A + subgrad ||P1*(w+d)||_1 + mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= inner_tol: + if inner_iter == 1: + inner_tol = inner_tol/4. + break + # end of inner loop + return d, coef_P2, n_cycles, inner_tol + + +def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, + max_iter=100, max_inner_iter=1000, tol=1e-4, + selection='cyclic ', random_state=None, + diag_fisher=False, copy_X=True): + """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. + + The objective beeing minimized in the coefficients w=coef is:: + + F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 + + An Improved GLMNET for L1-regularized Logistic Regression: + + 1. Find optimal descent direction d by minimizing + min_d F(w+d) = min_d F(w+d) - F(w) + 2. Quadrdatic approximation of F(w+d)-F(w) = q(d): + using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: + q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + + ||P1*(w+d)||_1 - ||P1*w||_1 + Then minimize q(d): min_d q(d) + 3. Coordinate descent by updating coordinate j (d -> d+z*e_j): + min_z q(d+z*e_j) + = min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 + A = f'(w) + d*H(w) + (w+d)*P2 + B = H + P2 + + Repeat steps 1-3 until convergence. + Note: Use Fisher matrix instead of Hessian for H. 
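+    Note: In step 3, the 1-d minimizer has the closed form (soft thresholding)
+          z = S(B_jj*(w_j+d_j) - A_j, P1_j)/B_jj - (w_j+d_j),
+          with S(u, c) = sign(u) * max(|u|-c, 0), see _cd_cycle.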
+ Note: f' = -score, H = Fisher matrix + + Parameters + ---------- + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. + + X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) + Training data (with intercept included if present). If not sparse, + pass directly as Fortran-contiguous data to avoid + unnecessary memory duplication. + + y : ndarray, shape (n_samples,) + Target values. + + weights: ndarray, shape (n_samples,) + Sample weights with which the deviance is weighted. The weights must + bee normalized and sum to 1. + + P1 : {ndarray}, shape (n_features,) + The L1-penalty vector (=diagonal matrix) + + P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix). If a matrix is + passed, it must be symmetric. If X is sparse, P2 must also be sparse. + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + family : ExponentialDispersionModel + + link : Link + + max_iter : int, optional (default=100) + Maximum numer of outer (Newton) iterations. + + max_inner_iter : int, optional (default=1000) + Maximum number of iterations in each inner loop, i.e. max number of + cycles over all features per inner loop. + + tol : float, optional (default=1e-4) + Covergence criterion is + sum_i(|minimum of norm of subgrad of objective_i|)<=tol. + + selection : str, optional (default='cyclic') + If 'random', randomly chose features in inner loop. + + random_state : {int, RandomState instance, None}, optional (default=None) + + diag_fisher : boolean, optional (default=False) + ``False`` calculates full fisher matrix, ``True`` only diagonal matrix + s.t. fisher = X.T @ diag @ X. This saves storage but needs more + matrix-vector multiplications. + + copy_X : boolean, optional (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + Returns + ------- + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. + + n_iter : numer of outer iterations = newton iterations + + n_cycles : number of cycles over features + + References + ---------- + Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + X = check_array(X, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if P2.ndim == 2: + P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if sparse.issparse(X): + if not sparse.isspmatrix_csc(X): + raise ValueError("If X is sparse, it must be in csc format" + "; got (format={})".format(X.format)) + if not sparse.isspmatrix_csc(P2): + raise ValueError("If X is sparse, P2 must also be sparse csc" + "format. 
Got P2 not sparse.") + random_state = check_random_state(random_state) + # Note: we already set P2 = l2*P2, P1 = l1*P1 + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + n_iter = 0 # number of outer iterations + n_cycles = 0 # number of (complete) cycles over features + converged = False + n_samples, n_features = X.shape + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept + # line search parameters + (beta, sigma) = (0.5, 0.01) + # some precalculations + # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a + # 1d array representing a diagonal matrix. + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # set up space for search direction d for inner loop + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) + inner_tol = linalg.norm(inner_tol, ord=1) + # outer loop + while n_iter < max_iter: + n_iter += 1 + # initialize search direction d (to be optimized) with zero + d.fill(0) + # inner loop = _cd_cycle + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=max_inner_iter, selection=selection, + random_state=random_state, diag_fisher=diag_fisher) + # line search by sequence beta^k, k=0, 1, .. + # F(w + lambda d) - F(w) <= lambda * bound + # bound = sigma * (f'(w)*d + w*P2*d + # +||P1 (w+d)||_1 - ||P1 w||_1) + P1w_1 = linalg.norm(P1 * coef[idx:], ord=1) + P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1) + # Note: coef_P2 already calculated and still valid + bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1) + Fw = (0.5 * family.deviance(y, mu, weights) + + 0.5 * (coef_P2 @ coef[idx:]) + P1w_1) + la = 1./beta + for k in range(20): + la *= beta # starts with la=1 + coef_wd = coef + la * d + mu_wd = link.inverse(_safe_lin_pred(X, coef_wd)) + Fwd = (0.5 * family.deviance(y, mu_wd, weights) + + linalg.norm(P1 * coef_wd[idx:], ord=1)) + if P2.ndim == 1: + Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:]) + else: + Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:])) + if Fwd - Fw <= sigma * la * bound: + break + # update coefficients + # coef_old = coef.copy() + coef += la * d + # calculate eta, mu, score, Fisher matrix for next iteration + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # stopping criterion for outer loop + # sum_i(|minimum-norm of subgrad of F(w)_i|) + # fp_wP2 = f'(w) + w*P2 + # Note: eta, mu and score are already updated + mn_subgrad = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= tol: + converged = True + break + # end of outer loop + if not converged: + warnings.warn("Coordinate descent failed to converge. Increase" + " the maximum number of iterations max_iter" + " (currently {0})".format(max_iter), ConvergenceWarning) + + return coef, n_iter, n_cycles + + +class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): + """Regression via a Generalized Linear Model (GLM) with penalties. + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as mu=h(X*w). 
Therefore, + the fit minimizes the following objective function with combined L1 and L2 + priors as regularizer:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + + alpha * l1_ratio * ||P1*w||_1 + + 1/2 * alpha * (1 - l1_ratio) * w*P2*w + + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + For ``P1=P2='identity'``, the penalty is the elastic net:: + + alpha * l1_ratio * ||w||_1 + + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 + + If you are interested in controlling the L1 and L2 penalties + separately, keep in mind that this is equivalent to:: + + a * L1 + b * L2 + + where:: + + alpha = a + b and l1_ratio = a / (a + b) + + The parameter ``l1_ratio`` corresponds to alpha in the R package glmnet, + while ``alpha`` corresponds to the lambda parameter in glmnet. + Specifically, l1_ratio = 1 is the lasso penalty. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms und thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + l1_ratio : float, optional (default=0) + The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For + ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it + is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a + combination of L1 and L2. + + P1 : {'identity', array-like}, shape (n_features,), optional \ + (default='identity') + With this array, you can exclude coefficients from the L1 penalty. + Set the corresponding value to 1 (include) or 0 (exclude). The + default value ``'identity'`` is the same as a 1d array of ones. + Note that n_features = X.shape[1]. + + P2 : {'identity', array-like, sparse matrix}, shape \ + (n_features,) or (n_features, n_features), optional \ + (default='identity') + With this option, you can set the P2 matrix in the L2 penalty `w*P2*w`. + This gives a fine control over this penalty (Tikhonov regularization). + A 2d array is directly used as the square matrix P2. A 1d array is + interpreted as diagonal (square) matrix. The default 'identity' sets + the identity matrix, which gives the usual squared L2-norm. If you just + want to exclude certain coefficients, pass a 1d array filled with 1, + and 0 for the coefficients to be excluded. + Note that P2 must be positive semi-definite. + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian', 'binomial'} \ + or an instance of class ExponentialDispersionModel, \ + optional(default='normal') + The distributional assumption of the GLM, i.e. which distribution from + the EDM, specifies the loss function to be minimized. + + link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, \ + optional (default='auto') + The link function of the GLM, i.e. mapping from linear predictor + (X*coef) to expectation (mu). 
Option 'auto' sets the link depending on + the chosen family as follows: + + - 'identity' for family 'normal' + + - 'log' for families 'poisson', 'gamma', 'inverse.gaussian' + + - 'logit' for family 'binomial' + + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) + Method for estimation of the dispersion parameter phi. Whether to use + the chi squared statisic or the deviance statistic. If None, the + dispersion is not estimated. + + solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ + optional (default='auto') + Algorithm to use in the optimization problem: + + 'auto' + Sets 'irls' if l1_ratio equals 0, else 'cd'. + + 'cd' + Coordinate descent algorithm. It can deal with L1 as well as L2 + penalties. Note that in order to avoid unnecessary memory + duplication of X in the ``fit`` method, X should be directly passed + as a Fortran-contiguous numpy array or sparse csc matrix. + + 'irls' + Iterated reweighted least squares. + It is the standard algorithm for GLMs. It cannot deal with + L1 penalties. + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. It cannot deal with L1 penalties. + + 'newton-cg', 'lbfgs' + Newton conjugate gradient algorithm cannot deal with L1 penalties. + + Note that all solvers except lbfgs use the fisher matrix, i.e. the + expected Hessian instead of the Hessian matrix. + + max_iter : int, optional (default=100) + The maximal number of iterations for solver algorithms. + + tol : float, optional (default=1e-4) + Stopping criterion. For the irls, newton-cg and lbfgs solvers, + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative) of + the objective function. For the cd solver, covergence is reached + when ``sum_i(|minimum-norm of g_i|)``, where ``g_i`` is the + subgradient of the objective and minimum-norm of ``g_i`` is the element + of the subgradient ``g_i`` with the smallest L2-norm. + + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` (supersedes option + ``start_params``). If set to ``True`` or if the attribute ``coef_`` + does not exit (first call to ``fit``), option ``start_params`` sets the + start values for ``coef_`` and ``intercept_``. + + start_params : {'guess', 'zero', array of shape (n_features*, )}, \ + optional (default='guess') + Relevant only if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). + + 'guess' + Start values of mu are calculated by family.starting_mu(..). Then, + one Newton step obtains start values for ``coef_``. If + ``solver='irls'``, it uses one irls step, else the Newton step is + calculated by the cd solver. + This gives usually good starting values. + + 'zero' + All coefficients are set to zero. If ``fit_intercept=True``, the + start value for the intercept is obtained by the weighted average of y. + + array + The array of size n_features* is directly used as start values + for ``coef_``. If ``fit_intercept=True``, the first element + is assumed to be the start value for the ``intercept_``. + Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes + the intercept in counting. + + selection : str, optional (default='cyclic') + For the solver 'cd' (coordinate descent), the coordinates (features) + can be updated in either cyclic or random order. 
+ If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially in the same order. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + random_state : {int, RandomState instance, None}, optional (default=None) + The seed of the pseudo random number generator that selects a random + feature to be updated for solver 'cd' (coordinate descent). + If int, random_state is the seed used by the random + number generator; if RandomState instance, random_state is the random + number generator; if None, the random number generator is the + RandomState instance used by `np.random`. Used when ``selection`` == + 'random'. + + diag_fisher : boolean, optional, (default=False) + Only relevant for solver 'cd' (see also ``start_params='guess'``). + If ``False``, the full Fisher matrix (expected Hessian) is computed in + each outer iteration (Newton iteration). If ``True``, only a diagonal + matrix (stored as 1d array) is computed, such that + fisher = X.T @ diag @ X. This saves memory and matrix-matrix + multiplications, but needs more matrix-vector multiplications. If you + use large sparse X or if you have many features, + i.e. n_features >> n_samples, you might set this option to ``True``. + + copy_X : boolean, optional, (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + check_input : boolean, optional (default=True) + Allow to bypass several checks on input: y values in range of family, + sample_weight non-negative, P2 positive semi-definite. + Don't use this parameter unless you know what you do. + + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array, shape (n_features,) + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. + + n_iter_ : int + Actual number of iterations used in solver. + + Notes + ----- + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + :ref:`User Guide `. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. + + For alpha > 0, the feature matrix X should be standardized in order to + penalize features equally strong. Call + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. + + If the target y is a ratio, appropriate sample weights s should be + provided. + As an example, consider Poission distributed counts z (integers) and + weights s=exposure (time, money, persons years, ...). Then you fit + y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + sample_weight=s)``. The weights are necessary for the right (finite + sample) mean. + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + in this case one might say that y has a 'scaled' Poisson distributions. + The same holds for other distributions. 
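+
+    A minimal usage sketch for this Poisson ratio example (assuming arrays
+    ``X``, counts ``z`` and exposure ``s`` are already defined)::
+
+        reg = GeneralizedLinearRegressor(family='poisson', link='log')
+        reg.fit(X, z/s, sample_weight=s)
+        rate = reg.predict(X)       # predicted ratio y = z/s
+        counts = s*rate             # expected counts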
+ + References + ---------- + For the coordinate descent implementation: + * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', + fit_intercept=True, family='normal', link='auto', + fit_dispersion=None, solver='auto', max_iter=100, + tol=1e-4, warm_start=False, start_params='guess', + selection='cyclic', random_state=None, diag_fisher=False, + copy_X=True, check_input=True, verbose=0): + self.alpha = alpha + self.l1_ratio = l1_ratio + self.P1 = P1 + self.P2 = P2 + self.fit_intercept = fit_intercept + self.family = family + self.link = link + self.fit_dispersion = fit_dispersion + self.solver = solver + self.max_iter = max_iter + self.tol = tol + self.warm_start = warm_start + self.start_params = start_params + self.selection = selection + self.random_state = random_state + self.diag_fisher = diag_fisher + self.copy_X = copy_X + self.check_input = check_input + self.verbose = verbose + + def fit(self, X, y, sample_weight=None): + """Fit a Generalized Linear Model. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. + + y : array-like, shape (n_samples,) + Target values. + + sample_weight : {None, array-like}, shape (n_samples,),\ + optinal (default=None) + Individual weights w_i for each sample. Note that for an + Exponential Dispersion Model (EDM), one has + Var[Y_i]=phi/w_i * v(mu). + If Y_i ~ EDM(mu, phi/w_i), then + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a + weighted average with weights=sample_weight. + + Returns + ------- + self : returns an instance of self. + """ + ####################################################################### + # 1. 
input validation # + ####################################################################### + # 1.1 validate arguments of __init__ ################################## + # Guarantee that self._family_instance is an instance of class + # ExponentialDispersionModel + if isinstance(self.family, ExponentialDispersionModel): + self._family_instance = self.family + else: + if self.family == 'normal': + self._family_instance = NormalDistribution() + elif self.family == 'poisson': + self._family_instance = PoissonDistribution() + elif self.family == 'gamma': + self._family_instance = GammaDistribution() + elif self.family == 'inverse.gaussian': + self._family_instance = InverseGaussianDistribution() + elif self.family == 'binomial': + self._family_instance = BinomialDistribution() + else: + raise ValueError( + "The family must be an instance of class" + " ExponentialDispersionModel or an element of" + " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " + "'binomial']; got (family={0})".format(self.family)) + + # Guarantee that self._link_instance is set to an instance of + # class Link + if isinstance(self.link, Link): + self._link_instance = self.link + else: + if self.link == 'auto': + if isinstance(self._family_instance, TweedieDistribution): + if self._family_instance.power <= 0: + self._link_instance = IdentityLink() + if self._family_instance.power >= 1: + self._link_instance = LogLink() + elif isinstance(self._family_instance, + GeneralizedHyperbolicSecant): + self._link_instance = IdentityLink() + elif isinstance(self._family_instance, BinomialDistribution): + self._link_instance = LogitLink() + else: + raise ValueError("No default link known for the " + "specified distribution family. Please " + "set link manually, i.e. not to 'auto'; " + "got (link='auto', family={}" + .format(self.family)) + elif self.link == 'identity': + self._link_instance = IdentityLink() + elif self.link == 'log': + self._link_instance = LogLink() + elif self.link == 'logit': + self._link_instance = LogitLink() + else: + raise ValueError( + "The link must be an instance of class Link or " + "an element of ['auto', 'identity', 'log', 'logit']; " + "got (link={0})".format(self.link)) + + # validate further arguments + if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: + raise ValueError("Penalty term must be a non-negative number;" + " got (alpha={0})".format(self.alpha)) + if (not isinstance(self.l1_ratio, numbers.Number) or + self.l1_ratio < 0 or self.l1_ratio > 1): + raise ValueError("l1_ratio must be a number in interval [0, 1];" + " got (l1_ratio={0})".format(self.l1_ratio)) + if not isinstance(self.fit_intercept, bool): + raise ValueError("The argument fit_intercept must be bool;" + " got {0}".format(self.fit_intercept)) + if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: + raise ValueError("GeneralizedLinearRegressor supports only solvers" + " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';" + " got {0}".format(self.solver)) + solver = self.solver + if self.solver == 'auto': + if self.l1_ratio == 0: + solver = 'irls' + else: + solver = 'cd' + if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): + raise ValueError("The chosen solver (solver={0}) can't deal " + "with L1 penalties, which are included with " + "(alpha={1}) and (l1_ratio={2})." 
+ .format(solver, self.alpha, self.l1_ratio)) + if (not isinstance(self.max_iter, int) + or self.max_iter <= 0): + raise ValueError("Maximum number of iteration must be a positive " + "integer;" + " got (max_iter={0!r})".format(self.max_iter)) + if not isinstance(self.tol, numbers.Number) or self.tol <= 0: + raise ValueError("Tolerance for stopping criteria must be " + "positive; got (tol={0!r})".format(self.tol)) + if not isinstance(self.warm_start, bool): + raise ValueError("The argument warm_start must be bool;" + " got {0}".format(self.warm_start)) + if self.selection not in ['cyclic', 'random']: + raise ValueError("The argument selection must be 'cyclic' or " + "'random'; got (selection={0})" + .format(self.selection)) + random_state = check_random_state(self.random_state) + if not isinstance(self.diag_fisher, bool): + raise ValueError("The argument diag_fisher must be bool;" + " got {0}".format(self.diag_fisher)) + if not isinstance(self.copy_X, bool): + raise ValueError("The argument copy_X must be bool;" + " got {0}".format(self.copy_X)) + if not isinstance(self.check_input, bool): + raise ValueError("The argument check_input must be bool; got " + "(check_input={0})".format(self.check_input)) + + family = self._family_instance + link = self._link_instance + + # 1.2 validate arguments of fit ####################################### + _dtype = [np.float64, np.float32] + if solver == 'cd': + _stype = ['csc'] + else: + _stype = ['csc', 'csr'] + X, y = check_X_y(X, y, accept_sparse=_stype, + dtype=_dtype, y_numeric=True, multi_output=False, + copy=self.copy_X) + # Without converting y to float, deviance might raise + # ValueError: Integers to negative integer powers are not allowed. + # Also, y must not be sparse. + y = np.asarray(y, dtype=np.float64) + + weights = _check_weights(sample_weight, y.shape[0]) + + n_samples, n_features = X.shape + + # 1.3 arguments to take special care ################################## + # P1, P2, start_params + if isinstance(self.P1, str) and self.P1 == 'identity': + P1 = np.ones(n_features) + else: + P1 = np.atleast_1d(self.P1) + try: + P1 = P1.astype(np.float64, casting='safe', copy=False) + except TypeError: + raise TypeError("The given P1 cannot be converted to a numeric" + "array; got (P1.dtype={0})." + .format(P1.dtype)) + if (P1.ndim != 1) or (P1.shape[0] != n_features): + raise ValueError("P1 must be either 'identity' or a 1d array " + "with the length of X.shape[1]; " + "got (P1.shape[0]={0}), " + "needed (X.shape[1]={1})." + .format(P1.shape[0], n_features)) + # If X is sparse, make P2 sparse, too. 
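+        # (Illustrative example: with n_features=3, P1=[1., 0., 1.] excludes
+        #  the second coefficient from the L1 penalty; a 1d P2=[1., 0., 1.]
+        #  does the same for the L2 penalty, while P2='identity' penalizes
+        #  all coefficients equally.)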
+ if isinstance(self.P2, str) and self.P2 == 'identity': + if sparse.issparse(X): + P2 = (sparse.dia_matrix((np.ones(n_features), 0), + shape=(n_features, n_features))).tocsc() + else: + P2 = np.ones(n_features) + else: + P2 = check_array(self.P2, copy=True, + accept_sparse=_stype, + dtype=_dtype, ensure_2d=False) + if P2.ndim == 1: + P2 = np.asarray(P2) + if P2.shape[0] != n_features: + raise ValueError("P2 should be a 1d array of shape " + "(n_features,) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0},)), needed ({1},)" + .format(P2.shape[0], X.shape[1])) + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(n_features, n_features))).tocsc() + elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and + P2.shape[0] == X.shape[1]): + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(n_features, n_features))).tocsc() + else: + raise ValueError("P2 must be either None or an array of shape " + "(n_features, n_features) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0}, {1})), needed ({2}, {2})" + .format(P2.shape[0], P2.shape[1], X.shape[1])) + + start_params = self.start_params + if isinstance(start_params, str): + if start_params not in ['guess', 'zero']: + raise ValueError("The argument start_params must be 'guess', " + "'zero' or an array of correct length; " + "got(start_params={0})".format(start_params)) + else: + start_params = check_array(start_params, accept_sparse=False, + force_all_finite=True, ensure_2d=False, + dtype=_dtype, copy=True) + if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or + (start_params.ndim != 1)): + raise ValueError("Start values for parameters must have the" + "right length and dimension; required (length" + "={0}, ndim=1); got (length={1}, ndim={2})." + .format(X.shape[1] + self.fit_intercept, + start_params.shape[0], + start_params.ndim)) + + l1 = self.alpha * self.l1_ratio + l2 = self.alpha * (1 - self.l1_ratio) + # P1 and P2 are now for sure copies + P1 = l1 * P1 + P2 = l2 * P2 + # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') + # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric + if P2.ndim == 2: + if sparse.issparse(P2): + if sparse.isspmatrix_csc(P2): + P2 = 0.5 * (P2 + P2.transpose()).tocsc() + else: + P2 = 0.5 * (P2 + P2.transpose()).tocsr() + else: + P2 = 0.5 * (P2 + P2.T) + + # For coordinate descent, if X is sparse, P2 must also be csc + if solver == 'cd' and sparse.issparse(X): + P2 = sparse.csc_matrix(P2) + + # 1.4 additional validations ########################################## + if self.check_input: + if not np.all(family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}" + .format(family.__class__.__name__)) + # check if P1 has only non-negative values, negative values might + # indicate group lasso in the future. 
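+            # (e.g. P1=[1., -1.] is rejected here, and a P2 with a negative
+            #  eigenvalue, such as [[1., 2.], [2., 1.]], is rejected by the
+            #  positive semi-definiteness check below)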
+ if not isinstance(self.P1, str): # if self.P1 != 'identity': + if not np.all(P1 >= 0): + raise ValueError("P1 must not have negative values.") + # check if P2 is positive semidefinite + # np.linalg.cholesky(P2) 'only' asserts positive definite + if not isinstance(self.P2, str): # self.P2 != 'identity' + # due to numerical precision, we allow eigenvalues to be a + # tiny bit negative + epsneg = -10 * np.finfo(P2.dtype).epsneg + if P2.ndim == 1 or P2.shape[0] == 1: + p2 = P2 + if sparse.issparse(P2): + p2 = P2.toarray() + if not np.all(p2 >= 0): + raise ValueError("1d array P2 must not have negative " + "values.") + elif sparse.issparse(P2): + # for sparse matrices, not all eigenvals can be computed + # efficiently, use only half of n_features + # k = how many eigenvals to compute + k = np.min([10, n_features // 10 + 1]) + sigma = 0 # start searching near this value + which = 'SA' # find smallest algebraic eigenvalues first + if not np.all(splinalg.eigsh(P2, k=k, sigma=sigma, + which=which) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") + else: + if not np.all(linalg.eigvalsh(P2) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") + # TODO: if alpha=0 check that X is not rank deficient + # TODO: what else to check? + + ####################################################################### + # 2. rescaling of weights (sample_weight) # + ####################################################################### + # IMPORTANT NOTE: Since we want to minimize + # 1/(2*sum(sample_weight)) * deviance + L1 + L2, + # deviance = sum(sample_weight * unit_deviance), + # we rescale weights such that sum(weights) = 1 and this becomes + # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights_sum = np.sum(weights) + weights = weights/weights_sum + + ####################################################################### + # 3. initialization of coef = (intercept_, coef_) # + ####################################################################### + # Note: Since phi=self.dispersion_ does not enter the estimation + # of mu_i=E[y_i], set it to 1. 
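+        # Overview of the start value strategies handled below (sketch):
+        #   warm_start=True      -> reuse (intercept_, coef_) of a prior fit
+        #   start_params='guess' -> mu = family.starting_mu(y, weights) and
+        #                           one IRLS/CD Newton step from there
+        #   start_params='zero'  -> coef = 0; with fit_intercept=True, the
+        #                           intercept starts at link(weighted mean of y)
+        #   start_params=array   -> use the given values directly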
+ + # set start values for coef + coef = None + if self.warm_start and hasattr(self, 'coef_'): + if self.fit_intercept: + coef = np.concatenate((np.array([self.intercept_]), + self.coef_)) + else: + coef = self.coef_ + elif isinstance(start_params, str): + if start_params == 'guess': + # Set mu=starting_mu of the family and do one Newton step + # If solver=cd use cd, else irls + mu = family.starting_mu(y, weights=weights) + eta = link.link(mu) # linear predictor + if solver in ['cd', 'lbfgs', 'newton-cg']: + # see function _cd_solver + sigma_inv = 1/family.variance(mu, phi=1, weights=weights) + d1 = link.inverse_derivative(eta) + temp = sigma_inv * d1 * (y - mu) + if self.fit_intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as X.T @ temp + + d2_sigma_inv = d1 * d1 * sigma_inv + diag_fisher = self.diag_fisher + if diag_fisher: + fisher = d2_sigma_inv + else: + fisher = \ + _safe_sandwich_dot(X, d2_sigma_inv, + intercept=self.fit_intercept) + # set up space for search direction d for inner loop + if self.fit_intercept: + coef = np.zeros(n_features+1) + else: + coef = np.zeros(n_features) + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + # use less restrictive tolerance for initial guess + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, + P1=P1) + inner_tol = 4 * linalg.norm(inner_tol, ord=1) + # just one outer loop = Newton step + n_cycles = 0 + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, + inner_tol, max_inner_iter=1000, + selection=self.selection, + random_state=self.random_state, + diag_fisher=self.diag_fisher) + coef += d # for simplicity no line search here + else: + # See _irls_solver + # h'(eta) + hp = link.inverse_derivative(eta) + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = (hp**2 / family.variance(mu, phi=1, weights=weights)) + # working observations + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(X, W, P2, z, + fit_intercept=self.fit_intercept) + else: # start_params == 'zero' + if self.fit_intercept: + coef = np.zeros(n_features+1) + coef[0] = link.link(np.average(y, weights=weights)) + else: + coef = np.zeros(n_features) + else: # assign given array as start values + coef = start_params + + ####################################################################### + # 4. fit # + ####################################################################### + # algorithms for optimiation + # TODO: Parallelize it? 
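+        # Solver dispatch (sketch): 'irls' and 'cd' call the dedicated
+        # solvers defined above; 'lbfgs' and 'newton-cg' minimize the smooth
+        # objective 0.5 * deviance(y, mu, weights) + 0.5 * w' P2 w via
+        # scipy.optimize.fmin_l_bfgs_b and sklearn's newton_cg, respectively.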
+
+ # 4.1 IRLS ############################################################
+ # Note: we already set P2 = l2*P2, see above
+ # Note: we already symmetrized P2 = 1/2 (P2 + P2')
+ if solver == 'irls':
+ coef, self.n_iter_ = \
+ _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2,
+ fit_intercept=self.fit_intercept, family=family,
+ link=link, max_iter=self.max_iter, tol=self.tol)
+
+ # 4.2 L-BFGS ##########################################################
+ elif solver == 'lbfgs':
+ def func(coef, X, y, weights, P2, family, link):
+ mu, devp = \
+ family._mu_deviance_derivative(coef, X, y, weights, link)
+ dev = family.deviance(y, mu, weights)
+ intercept = (coef.size == X.shape[1] + 1)
+ idx = 1 if intercept else 0 # offset if coef[0] is intercept
+ if P2.ndim == 1:
+ L2 = P2 * coef[idx:]
+ else:
+ L2 = P2 @ coef[idx:]
+ obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2)
+ objp = 0.5 * devp
+ objp[idx:] += L2
+ return obj, objp
+
+ args = (X, y, weights, P2, family, link)
+ coef, loss, info = fmin_l_bfgs_b(
+ func, coef, fprime=None, args=args,
+ iprint=(self.verbose > 0) - 1, pgtol=self.tol,
+ maxiter=self.max_iter, factr=1e3)
+ if self.verbose > 0:
+ if info["warnflag"] == 1:
+ warnings.warn("lbfgs failed to converge."
+ " Increase the number of iterations.",
+ ConvergenceWarning)
+ elif info["warnflag"] == 2:
+ warnings.warn("lbfgs failed for the reason: {0}"
+ .format(info["task"]))
+ self.n_iter_ = info['nit']
+
+ # 4.3 Newton-CG #######################################################
+ # We use again the fisher matrix instead of the hessian. More
+ # precisely, expected hessian of deviance.
+ elif solver == 'newton-cg':
+ def func(coef, X, y, weights, P2, family, link):
+ intercept = (coef.size == X.shape[1] + 1)
+ idx = 1 if intercept else 0 # offset if coef[0] is intercept
+ if P2.ndim == 1:
+ L2 = coef[idx:] @ (P2 * coef[idx:])
+ else:
+ L2 = coef[idx:] @ (P2 @ coef[idx:])
+ mu = link.inverse(_safe_lin_pred(X, coef))
+ return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2
+
+ def grad(coef, X, y, weights, P2, family, link):
+ mu, devp = \
+ family._mu_deviance_derivative(coef, X, y, weights, link)
+ intercept = (coef.size == X.shape[1] + 1)
+ idx = 1 if intercept else 0 # offset if coef[0] is intercept
+ if P2.ndim == 1:
+ L2 = P2 * coef[idx:]
+ else:
+ L2 = P2 @ coef[idx:]
+ objp = 0.5 * devp
+ objp[idx:] += L2
+ return objp
+
+ def grad_hess(coef, X, y, weights, P2, family, link):
+ intercept = (coef.size == X.shape[1] + 1)
+ idx = 1 if intercept else 0 # offset if coef[0] is intercept
+ if P2.ndim == 1:
+ L2 = P2 * coef[idx:]
+ else:
+ L2 = P2 @ coef[idx:]
+ eta = _safe_lin_pred(X, coef)
+ mu = link.inverse(eta)
+ d1 = link.inverse_derivative(eta)
+ temp = d1 * family.deviance_derivative(y, mu, weights)
+ if intercept:
+ grad = np.concatenate(([0.5 * temp.sum()],
+ 0.5 * temp @ X + L2))
+ else:
+ grad = 0.5 * temp @ X + L2 # same as 0.5* X.T @ temp + L2
+
+ # expected hessian = fisher = X.T @ diag_matrix @ X
+ # calculate only diag_matrix
+ diag = d1**2 / family.variance(mu, phi=1, weights=weights)
+ if intercept:
+ h0i = np.concatenate(([diag.sum()], diag @ X))
+
+ def Hs(coef):
+ # return (0.5 * fisher + P2) @ coef
+ # ret = 0.5 * (X.T @ (diag * (X @ coef)))
+ ret = 0.5 * ((diag * (X @ coef[idx:])) @ X)
+ if P2.ndim == 1:
+ ret += P2 * coef[idx:]
+ else:
+ ret += P2 @ coef[idx:]
+ if intercept:
+ ret = np.concatenate(([0.5 * (h0i @ coef)],
+ ret + 0.5 * coef[0] * h0i[1:]))
+ return ret
+
+ return grad, Hs
+
+ args = (X, y, weights, P2, family, link)
+ coef, self.n_iter_ = \
+ newton_cg(grad_hess, func, grad, coef,
+ args=args, maxiter=self.max_iter,
+ tol=self.tol)
+
+ # 4.4 coordinate descent ##############################################
+ # Note: we already set P1 = l1*P1, see above
+ # Note: we already set P2 = l2*P2, see above
+ # Note: we already symmetrized P2 = 1/2 (P2 + P2')
+ elif solver == 'cd':
+ coef, self.n_iter_, self._n_cycles = \
+ _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1,
+ P2=P2, fit_intercept=self.fit_intercept,
+ family=family, link=link,
+ max_iter=self.max_iter, tol=self.tol,
+ selection=self.selection, random_state=random_state,
+ diag_fisher=self.diag_fisher, copy_X=self.copy_X)
+
+ #######################################################################
+ # 5. postprocessing                                                   #
+ #######################################################################
+ if self.fit_intercept:
+ self.intercept_ = coef[0]
+ self.coef_ = coef[1:]
+ else:
+ # set intercept to zero as the other linear models do
+ self.intercept_ = 0.
+ self.coef_ = coef
+
+ if self.fit_dispersion in ['chisqr', 'deviance']:
+ # attention because of rescaling of weights
+ self.dispersion_ = self.estimate_phi(X, y, weights)*weights_sum
+
+ return self
+
+ def linear_predictor(self, X):
+ """Compute the linear_predictor = X*coef_ + intercept_.
+
+ Parameters
+ ----------
+ X : {array-like, sparse matrix}, shape (n_samples, n_features)
+ Samples.
+
+ Returns
+ -------
+ C : array, shape (n_samples,)
+ Returns predicted values of linear predictor.
+ """
+ check_is_fitted(self, "coef_")
+ X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
+ dtype='numeric', copy=True, ensure_2d=True,
+ allow_nd=False)
+ return X @ self.coef_ + self.intercept_
+
+ def predict(self, X, sample_weight=None):
+ """Predict using GLM with feature matrix X.
+ If sample_weight is given, returns prediction*sample_weight.
+
+ Parameters
+ ----------
+ X : {array-like, sparse matrix}, shape (n_samples, n_features)
+ Samples.
+
+ sample_weight : {None, array-like}, shape (n_samples,), optional \
+ (default=None)
+
+ Returns
+ -------
+ C : array, shape (n_samples,)
+ Returns predicted values times sample_weight.
+ """
+ # TODO: Is copy=True necessary?
+ X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
+ dtype='numeric', copy=True, ensure_2d=True,
+ allow_nd=False)
+ eta = self.linear_predictor(X)
+ mu = self._link_instance.inverse(eta)
+ weights = _check_weights(sample_weight, X.shape[0])
+
+ return mu*weights
+
+ def estimate_phi(self, X, y, sample_weight=None):
+ """Estimate/fit the dispersion parameter phi.
+
+ Parameters
+ ----------
+ X : {array-like, sparse matrix}, shape (n_samples, n_features)
+ Training data.
+
+ y : array-like, shape (n_samples,)
+ Target values.
+
+ sample_weight : {None, array-like}, shape (n_samples,), optional \
+ (default=None)
+ Sample weights.
+
+ Returns
+ -------
+ phi : float
+ Dispersion parameter.
+ """
+ check_is_fitted(self, "coef_")
+ _dtype = [np.float64, np.float32]
+ X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
+ dtype=_dtype, y_numeric=True, multi_output=False)
+ n_samples, n_features = X.shape
+ weights = _check_weights(sample_weight, n_samples)
+ eta = X @ self.coef_
+ if self.fit_intercept is True:
+ eta += self.intercept_
+ n_features += 1
+ if n_samples <= n_features:
+ raise ValueError("Estimation of dispersion parameter phi requires"
+ " more samples than features, got"
+ " samples=X.shape[0]={0} and"
+ " n_features=X.shape[1]+fit_intercept={1}."
+ .format(n_samples, n_features)) + mu = self._link_instance.inverse(eta) + if self.fit_dispersion == 'chisqr': + chisq = np.sum(weights*(y-mu)**2 / + self._family_instance.unit_variance(mu)) + return chisq/(n_samples - n_features) + elif self.fit_dispersion == 'deviance': + dev = self._family_instance.deviance(y, mu, weights) + return dev/(n_samples - n_features) + + # Note: check_estimator(GeneralizedLinearRegressor) might raise + # "AssertionError: -0.28014056555724598 not greater than 0.5" + # unless GeneralizedLinearRegressor has a score which passes the test. + def score(self, X, y, sample_weight=None): + """Compute D^2, the percentage of deviance explained. + + D^2 is a generalization of the coefficient of determination R^2. + R^2 uses squared error and D^2 deviance. Note that those two are equal + for family='normal'. + + D^2 is defined as + :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, + :math:`D_{null}` is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. + The mean :math:`\\bar{y}` is averaged by sample_weight. + Best possible score is 1.0 and it can be negative (because the model + can be arbitrarily worse). + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Test samples. + + y : array-like, shape (n_samples,) + True values of target. + + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) + Sample weights. + + Returns + ------- + score : float + D^2 of self.predict(X) w.r.t. y. + """ + # Note, default score defined in RegressorMixin is R^2 score. + # TODO: make D^2 a score function in module metrics (and thereby get + # input validation and so on) + weights = _check_weights(sample_weight, y.shape[0]) + mu = self.predict(X) + dev = self._family_instance.deviance(y, mu, weights=weights) + y_mean = np.average(y, weights=weights) + dev_null = self._family_instance.deviance(y, y_mean, weights=weights) + return 1. 
- dev / dev_null diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py new file mode 100644 index 0000000000000..7cb3f4a5b5969 --- /dev/null +++ b/sklearn/linear_model/tests/test_glm.py @@ -0,0 +1,640 @@ +import numpy as np +from numpy.testing import assert_allclose +import pytest +import scipy as sp +from scipy import linalg, optimize, sparse + +from sklearn.datasets import make_classification, make_regression +from sklearn.linear_model.glm import ( + Link, + IdentityLink, + LogLink, + LogitLink, + TweedieDistribution, + NormalDistribution, PoissonDistribution, + GammaDistribution, InverseGaussianDistribution, + GeneralizedHyperbolicSecant, BinomialDistribution, + GeneralizedLinearRegressor) +from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge + +from sklearn.utils.testing import ( + assert_equal, assert_almost_equal, + assert_array_equal, assert_array_almost_equal) + + +@pytest.mark.parametrize('link', Link.__subclasses__()) +def test_link_properties(link): + """Test link inverse and derivative.""" + rng = np.random.RandomState(42) + x = rng.rand(100)*100 + link = link() # instatiate object + decimal = 10 + if isinstance(link, LogitLink): + # careful for large x, note expit(36) = 1 + # limit max eta to 15 + x = x / 100 * 15 + decimal = 8 + assert_almost_equal(link.link(link.inverse(x)), x, decimal=decimal) + # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) + assert_almost_equal(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x), decimal=decimal) + # for LogitLink, in the following x should be between 0 and 1. + # assert_almost_equal(link.inverse_derivative(link.link(x)), + # 1./link.derivative(x), decimal=decimal) + + +@pytest.mark.parametrize( + 'family, expected', + [(NormalDistribution(), [True, True, True]), + (PoissonDistribution(), [False, True, True]), + (TweedieDistribution(power=1.5), [False, True, True]), + (GammaDistribution(), [False, False, True]), + (InverseGaussianDistribution(), [False, False, True]), + (TweedieDistribution(power=4.5), [False, False, True])]) +def test_family_bounds(family, expected): + """Test the valid range of distributions at -1, 0, 1.""" + result = family.in_y_range([-1, 0, 1]) + assert_array_equal(result, expected) + + +@pytest.mark.parametrize( + 'family, chk_values', + [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), + (PoissonDistribution(), [0.1, 1.5]), + (GammaDistribution(), [0.1, 1.5]), + (InverseGaussianDistribution(), [0.1, 1.5]), + (TweedieDistribution(power=-2.5), [0.1, 1.5]), + (TweedieDistribution(power=-1), [0.1, 1.5]), + (TweedieDistribution(power=1.5), [0.1, 1.5]), + (TweedieDistribution(power=2.5), [0.1, 1.5]), + (TweedieDistribution(power=-4), [0.1, 1.5]), + (GeneralizedHyperbolicSecant(), [0.1, 1.5])]) +def test_deviance_zero(family, chk_values): + """Test deviance(y,y) = 0 for different families.""" + for x in chk_values: + assert_almost_equal(family.deviance(x, x), 0, decimal=10) + + +@pytest.mark.parametrize( + 'family, link', + [(NormalDistribution(), IdentityLink()), + (PoissonDistribution(), LogLink()), + (GammaDistribution(), LogLink()), + (InverseGaussianDistribution(), LogLink()), + (TweedieDistribution(power=1.5), LogLink()), + (TweedieDistribution(power=4.5), LogLink())]) +def test_fisher_matrix(family, link): + """Test the Fisher matrix numerically. 
+ Trick: Use numerical differentiation with y = mu""" + coef = np.array([-2, 1, 0, 1, 2.5]) + phi = 0.5 + rng = np.random.RandomState(42) + X = rng.randn(10, 5) + lin_pred = np.dot(X, coef) + mu = link.inverse(lin_pred) + weights = rng.randn(10)**2 + 1 + fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + approx = np.array([]).reshape(0, coef.shape[0]) + for i in range(coef.shape[0]): + def f(coef): + return -family._score(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link)[i] + approx = np.vstack( + [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) + assert_allclose(fisher, approx, rtol=1e-3) + + +def test_sample_weights_validation(): + """Test the raised errors in the validation of sample_weight.""" + # 1. scalar value but not positive + X = [[1]] + y = [1] + weights = 0 + glm = GeneralizedLinearRegressor(fit_intercept=False) + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 2. 2d array + weights = [[0]] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 3. 1d but wrong length + weights = [1, 0] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 4. 1d but only zeros (sum not greater than 0) + weights = [0, 0] + X = [[0], [1]] + y = [1, 2] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 5. 1d but weith a negative value + weights = [2, -1] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + +@pytest.mark.parametrize('f, fam', + [('normal', NormalDistribution()), + ('poisson', PoissonDistribution()), + ('gamma', GammaDistribution()), + ('inverse.gaussian', InverseGaussianDistribution()), + ('binomial', BinomialDistribution())]) +def test_glm_family_argument(f, fam): + """Test GLM family argument set as string.""" + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) + assert_equal(type(glm._family_instance), type(fam)) + + glm = GeneralizedLinearRegressor(family='not a family', + fit_intercept=False) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('l, link', + [('identity', IdentityLink()), + ('log', LogLink()), + ('logit', LogitLink())]) +def test_glm_link_argument(l, link): + """Test GLM link argument set as string.""" + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) + assert_equal(type(glm._link_instance), type(link)) + + glm = GeneralizedLinearRegressor(family='normal', link='not a link') + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('alpha', ['not a number', -4.2]) +def test_glm_alpha_argument(alpha): + """Test GLM for invalid alpha argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', alpha=alpha) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('l1_ratio', ['not a number', -4.2, 1.1, [1]]) +def test_glm_l1_ratio_argument(l1_ratio): + """Test GLM for invalid l1_ratio argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3], + [-1]]) +def test_glm_P1_argument(P1): + """Test GLM for invalid P1 argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = 
GeneralizedLinearRegressor(P1=P1, l1_ratio=0.5, check_input=True) + with pytest.raises((ValueError, TypeError)): + glm.fit(X, y) + + +@pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], + sparse.csr_matrix([1, 2, 3]), [-1]]) +def test_glm_P2_argument(P2): + """Test GLM for invalid P2 argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(P2=P2, check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + + +def test_glm_P2_positive_semidefinite(): + """Test GLM for a positive semi-definite P2 argument.""" + n_samples, n_features = 10, 5 + y = np.arange(n_samples) + X = np.zeros((n_samples, n_features)) + P2 = np.diag([100, 10, 5, 0, -1E-5]) + rng = np.random.RandomState(42) + # construct random orthogonal matrix Q + Q, R = linalg.qr(rng.randn(n_features, n_features)) + P2 = Q.T @ P2 @ Q + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + + P2 = sparse.csr_matrix(P2) + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) +def test_glm_fit_intercept_argument(fit_intercept): + """Test GLM for invalid fit_intercept argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('solver, l1_ratio', + [('not a solver', 0), (1, 0), ([1], 0), + ('irls', 0.5), ('lbfgs', 0.5), ('newton-cg', 0.5)]) +def test_glm_solver_argument(solver, l1_ratio): + """Test GLM for invalid solver argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(solver=solver, l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('max_iter', ['not a number', 0, -1, 5.5, [1]]) +def test_glm_max_iter_argument(max_iter): + """Test GLM for invalid max_iter argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(max_iter=max_iter) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('tol', ['not a number', 0, -1.0, [1e-3]]) +def test_glm_tol_argument(tol): + """Test GLM for invalid tol argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(tol=tol) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('warm_start', ['not bool', 1, 0, [True]]) +def test_glm_warm_start_argument(warm_start): + """Test GLM for invalid warm_start argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(warm_start=warm_start) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('start_params', + ['not a start_params', ['zero'], [0, 0, 0], + [[0, 0]], ['a', 'b']]) +def test_glm_start_params_argument(start_params): + """Test GLM for invalid start_params argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(start_params=start_params) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('selection', ['not a selection', 1, 0, ['cyclic']]) +def test_glm_selection_argument(selection): + """Test GLM for invalid selection argument""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(selection=selection) + with pytest.raises(ValueError): + 
glm.fit(X, y) + + +@pytest.mark.parametrize('random_state', ['a string', 0.5, [0]]) +def test_glm_random_state_argument(random_state): + """Test GLM for invalid random_state argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(random_state=random_state) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('diag_fisher', ['not bool', 1, 0, [True]]) +def test_glm_diag_fisher_argument(diag_fisher): + """Test GLM for invalid diag_fisher arguments.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) +def test_glm_copy_X_argument(copy_X): + """Test GLM for invalid copy_X arguments.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(copy_X=copy_X) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('check_input', ['not bool', 1, 0, [True]]) +def test_glm_check_input_argument(check_input): + """Test GLM for invalid check_input argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(check_input=check_input) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_glm_identiy_regression(solver): + """Test GLM regression with identity link on a simple dataset.""" + coef = [1, 2] + X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T + y = np.dot(X, coef) + glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', + fit_intercept=False, solver=solver, + start_params='zero', tol=1e-7) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + + +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecant()]) +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), + ('lbfgs', 1e-6, 6), + ('newton-cg', 1e-7, 6), + ('cd', 1e-7, 6)]) +def test_glm_log_regression(family, solver, tol, dec): + """Test GLM regression with log link on a simple dataset.""" + coef = [0.2, -0.1] + X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T + y = np.exp(np.dot(X, coef)) + glm = GeneralizedLinearRegressor( + alpha=0, family=family, link='log', fit_intercept=False, + solver=solver, start_params='guess', tol=tol) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef, decimal=dec) + + +@pytest.mark.filterwarnings('ignore::DeprecationWarning') +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 5), + ('lbfgs', 1e-6, 5), + ('newton-cg', 1e-5, 5), + ('cd', 1e-6, 6)]) +def test_normal_ridge(solver, tol, dec): + """Test ridge regression for Normal distributions. + + Compare to test_ridge in test_ridge.py. + """ + rng = np.random.RandomState(42) + alpha = 1.0 + + # 1. 
With more samples than features + n_samples, n_features, n_predict = 100, 7, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, + solver='svd', normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + solver='svd', normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng, + fit_dispersion='chisqr') + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + mu = glm.predict(X) + assert_almost_equal(glm.dispersion_, + np.sum((y-mu)**2/(n_samples-n_features))) + + # 2. With more features than samples and sparse + n_samples, n_features, n_predict = 10, 100, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, + solver='sag', normalize=False, max_iter=100000) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=tol, max_iter=300, solver=solver, + check_input=False, random_state=rng) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, + solver='sag', normalize=False, max_iter=1000) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=tol*2, max_iter=300, solver=solver, + check_input=False, random_state=rng) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) + + +@pytest.mark.parametrize('solver, tol, dec', + [('irls', 1e-7, 6), + ('lbfgs', 1e-7, 5), + ('newton-cg', 1e-7, 5), + ('cd', 1e-7, 7)]) +def test_poisson_ridge(solver, tol, dec): + """Test ridge regression with poisson family and LogLink. 
+ + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.12889386979 + # a 0.29019207995 + # b 0.03741173122 + X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + rng = np.random.RandomState(42) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, + fit_intercept=True, family='poisson', + link='log', tol=tol, + solver=solver, max_iter=300, + random_state=rng) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, + decimal=dec) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=dec) + + +@pytest.mark.parametrize('diag_fisher', [False, True]) +def test_normal_enet(diag_fisher): + """Test elastic net regression with normal/gaussian family.""" + alpha, l1_ratio = 0.3, 0.7 + n_samples, n_features = 20, 2 + rng = np.random.RandomState(42) + X = rng.randn(n_samples, n_features).copy(order='F') + beta = rng.randn(n_features) + y = 2 + np.dot(X, beta) + rng.randn(n_samples) + + # 1. test normal enet on dense data + glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, + family='normal', link='identity', + fit_intercept=True, tol=1e-8, + max_iter=100, selection='cyclic', + solver='cd', start_params='zero', + check_input=False, + diag_fisher=diag_fisher) + glm.fit(X, y) + + enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, + normalize=False, tol=1e-8, copy_X=True) + enet.fit(X, y) + + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + + # 2. test normal enet on sparse data + X = sparse.csc_matrix(X) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + + +def test_poisson_enet(): + """Test elastic net regression with poisson family and LogLink. + + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0.5, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.03550978409 + # a 0.16936423283 + # b . + glmnet_intercept = -0.03550978409 + glmnet_coef = [0.16936423283, 0.] 
+ X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + rng = np.random.RandomState(42) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', solver='cd', tol=1e-8, + selection='random', random_state=rng, + start_params='guess') + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + + # test results with general optimization procedure + def obj(coef): + pd = PoissonDistribution() + link = LogLink() + N = y.shape[0] + mu = link.inverse(X @ coef[1:] + coef[0]) + alpha, l1_ratio = (1, 0.5) + return 1./(2.*N) * pd.deviance(y, mu) \ + + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ + + alpha * l1_ratio * np.sum(np.abs(coef[1:])) + res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, + options={'maxiter': 1000, 'disp': False}) + assert_almost_equal(glm.intercept_, res.x[0], decimal=5) + assert_almost_equal(glm.coef_, res.x[1:], decimal=5) + assert_almost_equal(obj(np.concatenate(([glm.intercept_], glm.coef_))), + res.fun, decimal=8) + + # same for start_params='zero' and selection='cyclic' + # with reduced precision + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', solver='cd', tol=1e-5, + selection='cyclic', start_params='zero') + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + # check warm_start, therefore start with different alpha + glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, + family='poisson', max_iter=300, + link='log', solver='cd', tol=1e-5, + selection='cyclic', start_params='zero') + glm.fit(X, y) + # warm start with original alpha and use of sparse matrices + glm.warm_start = True + glm.alpha = 1 + X = sparse.csr_matrix(X) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + +@pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) +def test_binomial_enet(alpha): + """Test elastic net regression with binomial family and LogitLink. + + Compare to LogisticRegression. 
+ """ + l1_ratio = 0.5 + n_samples = 500 + rng = np.random.RandomState(42) + X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, + n_informative=5, n_redundant=0, n_repeated=0, + random_state=rng) + log = LogisticRegression( + penalty='elasticnet', random_state=rng, fit_intercept=False, tol=1e-6, + max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), + solver='saga') + log.fit(X, y) + glm = GeneralizedLinearRegressor( + family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, + alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', + tol=1e-7) + glm.fit(X, y) + assert_almost_equal(log.intercept_[0], glm.intercept_, decimal=6) + assert_array_almost_equal(log.coef_[0, :], glm.coef_, decimal=6) From 9b574bdc0b3a4a1081f73563d0468c864fcbbb22 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 11 Jun 2019 07:30:52 -0500 Subject: [PATCH 58/69] Fix docstrings for the new print_changed_only=True by default --- doc/modules/linear_model.rst | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 52f28346cc047..b9f0e96734ae5 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -942,17 +942,11 @@ follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') - >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE - GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, - check_input=True, copy_X=True, diag_fisher=False, - family='poisson', fit_dispersion=None, - fit_intercept=True, l1_ratio=0, link='log', - max_iter=100, random_state=None, selection='cyclic', - solver='auto', start_params='guess', tol=0.0001, - verbose=0, warm_start=False) - >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) + GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> reg.coef_ array([0.24630169, 0.43373464]) - >>> reg.intercept_ #doctest: +ELLIPSIS + >>> reg.intercept_ -0.76383633... From 90299fdd8a4745965eb8f9dbe26ddb685f1531f1 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 12 Jun 2019 04:31:50 -0500 Subject: [PATCH 59/69] Increase coverage --- sklearn/linear_model/_glm.py | 22 +++----- sklearn/linear_model/tests/test_glm.py | 78 ++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 17 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 8152e84ac7253..055aacf26d747 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -182,7 +182,7 @@ def link(self, mu): mu : array, shape (n_samples,) Usually the (predicted) mean. """ - raise NotImplementedError + pass @abstractmethod def derivative(self, mu): @@ -193,7 +193,7 @@ def derivative(self, mu): mu : array, shape (n_samples,) Usually the (predicted) mean. """ - raise NotImplementedError + pass @abstractmethod def inverse(self, lin_pred): @@ -207,7 +207,7 @@ def inverse(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. """ - raise NotImplementedError + pass @abstractmethod def inverse_derivative(self, lin_pred): @@ -218,7 +218,7 @@ def inverse_derivative(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. 
""" - raise NotImplementedError + pass @abstractmethod def inverse_derivative2(self, lin_pred): @@ -229,7 +229,7 @@ def inverse_derivative2(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. """ - raise NotImplementedError + pass class IdentityLink(Link): @@ -400,7 +400,7 @@ def unit_variance(self, mu): mu : array, shape (n_samples,) Predicted mean. """ - raise NotImplementedError() + pass @abstractmethod def unit_variance_derivative(self, mu): @@ -413,7 +413,7 @@ def unit_variance_derivative(self, mu): mu : array, shape (n_samples,) Target values. """ - raise NotImplementedError() + pass def variance(self, mu, phi=1, weights=1): r"""Compute the variance function. @@ -473,7 +473,7 @@ def unit_deviance(self, y, mu): mu : array, shape (n_samples,) Predicted mean. """ - raise NotImplementedError() + pass def unit_deviance_derivative(self, y, mu): r"""Compute the derivative of the unit deviance w.r.t. mu. @@ -1079,7 +1079,6 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, B[idx:, idx:] += P2.toarray() else: B[idx:, idx:] += P2 - # A = -score + coef_P2 A = -score A[idx:] += coef_P2 # A += d @ (H+P2) but so far d=0 @@ -1302,9 +1301,6 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32], order='F', copy=copy_X) if sparse.issparse(X): - if not sparse.isspmatrix_csc(X): - raise ValueError("If X is sparse, it must be in csc format" - "; got (format={})".format(X.format)) if not sparse.isspmatrix_csc(P2): raise ValueError("If X is sparse, P2 must also be sparse csc" "format. Got P2 not sparse.") @@ -2021,7 +2017,7 @@ def fit(self, X, y, sample_weight=None): _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, max_inner_iter=1000, selection=self.selection, - random_state=self.random_state, + random_state=random_state, diag_fisher=self.diag_fisher) coef += d # for simplicity no line search here else: diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index d42a8739f6aa0..230bbdabae201 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -21,10 +21,20 @@ GeneralizedHyperbolicSecant, BinomialDistribution, ) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge +from sklearn.metrics import mean_absolute_error from sklearn.utils.testing import assert_array_equal +@pytest.fixture(scope="module") +def regression_data(): + X, y = make_regression(n_samples=107, + n_features=10, + n_informative=80, noise=0.5, + random_state=2) + return X, y + + @pytest.mark.parametrize('link', Link.__subclasses__()) def test_link_properties(link): """Test link inverse and derivative.""" @@ -39,6 +49,10 @@ def test_link_properties(link): # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) assert_allclose(link.derivative(link.inverse(x)), 1./link.inverse_derivative(x)) + + assert ( + link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) + # for LogitLink, in the following x should be between 0 and 1. # assert_almost_equal(link.inverse_derivative(link.link(x)), # 1./link.derivative(x), decimal=decimal) @@ -108,7 +122,7 @@ def f(coef): def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" - # 1. scalar value but not positive + # scalar value but not positive X = [[1]] y = [1] weights = 0 @@ -116,17 +130,20 @@ def test_sample_weights_validation(): with pytest.raises(ValueError): glm.fit(X, y, weights) - # 2. 
2d array + # Positive weights are accepted + glm.fit(X, y, sample_weight=1) + + # 2d array weights = [[0]] with pytest.raises(ValueError): glm.fit(X, y, weights) - # 3. 1d but wrong length + # 1d but wrong length weights = [1, 0] with pytest.raises(ValueError): glm.fit(X, y, weights) - # 4. 1d but only zeros (sum not greater than 0) + # 1d but only zeros (sum not greater than 0) weights = [0, 0] X = [[0], [1]] y = [1, 2] @@ -643,3 +660,56 @@ def test_binomial_enet(alpha): glm.fit(X, y) assert_allclose(log.intercept_[0], glm.intercept_, rtol=1e-6) assert_allclose(log.coef_[0, :], glm.coef_, rtol=5e-6) + + +@pytest.mark.parametrize( + "params", + [ + {"solver": "irls", "start_params": "guess"}, + {"solver": "irls", "start_params": "zero"}, + {"solver": "lbfgs", "start_params": "guess"}, + {"solver": "lbfgs", "start_params": "zero"}, + {"solver": "newton-cg"}, + {"solver": "cd", "selection": "cyclic", "diag_fisher": False}, + {"solver": "cd", "selection": "cyclic", "diag_fisher": True}, + {"solver": "cd", "selection": "random", "diag_fisher": False}, + ], + ids=lambda params: ', '.join("%s=%s" % (key, val) + for key, val in params.items()) +) +def test_solver_equivalence(params, regression_data): + X, y = regression_data + est_ref = GeneralizedLinearRegressor(random_state=2) + est_ref.fit(X, y) + + estimator = GeneralizedLinearRegressor(**params) + estimator.set_params(random_state=2) + + estimator.fit(X, y) + + assert_allclose(estimator.intercept_, est_ref.intercept_, rtol=1e-4) + assert_allclose(estimator.coef_, est_ref.coef_, rtol=1e-4) + assert_allclose( + mean_absolute_error(estimator.predict(X), y), + mean_absolute_error(est_ref.predict(X), y), + rtol=1e-4 + ) + + +def test_fit_dispersion(regression_data): + X, y = regression_data + + est1 = GeneralizedLinearRegressor(random_state=2) + est1.fit(X, y) + assert not hasattr(est1, "dispersion_") + + est2 = GeneralizedLinearRegressor(random_state=2, fit_dispersion="chisqr") + est2.fit(X, y) + assert isinstance(est2.dispersion_, float) + + est3 = GeneralizedLinearRegressor( + random_state=2, fit_dispersion="deviance") + est3.fit(X, y) + assert isinstance(est3.dispersion_, float) + + assert_allclose(est2.dispersion_, est3.dispersion_) From e3a5a9aa6855928d89bdc5dad7f3a46ba934fb8e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 12 Jun 2019 12:00:28 -0500 Subject: [PATCH 60/69] More tests and addressing some review comments --- sklearn/linear_model/_glm.py | 51 +++++++++++++------------- sklearn/linear_model/tests/test_glm.py | 43 +++++++++++++++++++++- 2 files changed, 67 insertions(+), 27 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 055aacf26d747..acce438b2b5e2 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -6,8 +6,6 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -# TODO: Should the option `normalize` be included (like other linear models)? -# So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support, e.g. GCV? # TODO: Should GeneralizedLinearRegressor inherit from LinearModel? # So far, it does not. @@ -287,7 +285,6 @@ def inverse_derivative(self, lin_pred): return ep * (1. - ep) def inverse_derivative2(self, lin_pred): - ep = special.expit(lin_pred) ep = special.expit(lin_pred) return ep * (1. - ep) * (1. 
- 2 * ep) @@ -738,7 +735,19 @@ class TweedieDistribution(ExponentialDispersionModel): For ``0 0) - 1, pgtol=self.tol, maxiter=self.max_iter, factr=1e3) - if self.verbose > 0: - if info["warnflag"] == 1: - warnings.warn("lbfgs failed to converge." - " Increase the number of iterations.", - ConvergenceWarning) - elif info["warnflag"] == 2: - warnings.warn("lbfgs failed for the reason: {0}" - .format(info["task"])) + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." + " Increase the number of iterations.", + ConvergenceWarning) + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}" + .format(info["task"])) self.n_iter_ = info['nit'] # 4.3 Newton-CG ####################################################### diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 230bbdabae201..7229f21840829 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -22,6 +22,7 @@ ) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge from sklearn.metrics import mean_absolute_error +from sklearn.exceptions import ConvergenceWarning from sklearn.utils.testing import assert_array_equal @@ -72,6 +73,23 @@ def test_family_bounds(family, expected): assert_array_equal(result, expected) +def test_tweedie_distribution_power(): + with pytest.raises(ValueError, match="no distribution exists"): + TweedieDistribution(power=0.5) + + with pytest.raises(TypeError, match="must be a real number"): + TweedieDistribution(power=1j) + + with pytest.raises(TypeError, match="must be a real number"): + dist = TweedieDistribution() + dist.power = 1j + + dist = TweedieDistribution() + assert dist._include_lower_bound is False + dist.power = 1 + assert dist._include_lower_bound is True + + @pytest.mark.parametrize( 'family, chk_values', [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), @@ -97,7 +115,8 @@ def test_deviance_zero(family, chk_values): (GammaDistribution(), LogLink()), (InverseGaussianDistribution(), LogLink()), (TweedieDistribution(power=1.5), LogLink()), - (TweedieDistribution(power=4.5), LogLink())]) + (TweedieDistribution(power=4.5), LogLink())], + ids=lambda args: args.__class__.__name__) def test_fisher_matrix(family, link): """Test the Fisher matrix numerically. 
Trick: Use numerical differentiation with y = mu""" @@ -110,6 +129,11 @@ def test_fisher_matrix(family, link): weights = rng.randn(10)**2 + 1 fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, weights=weights, link=link) + # check that the Fisher matrix is square and positive definite + assert fisher.ndim == 2 + assert fisher.shape[0] == fisher.shape[1] + assert np.all(np.linalg.eigvals(fisher) >= 0) + approx = np.array([]).reshape(0, coef.shape[0]) for i in range(coef.shape[0]): def f(coef): @@ -119,6 +143,13 @@ def f(coef): [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) assert_allclose(fisher, approx, rtol=1e-3) + # check the observed information matrix + oim = family._observed_information(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + assert oim.ndim == 2 + assert oim.shape == fisher.shape + assert_allclose(oim, fisher) + def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" @@ -713,3 +744,13 @@ def test_fit_dispersion(regression_data): assert isinstance(est3.dispersion_, float) assert_allclose(est2.dispersion_, est3.dispersion_) + + +@pytest.mark.parametrize("solver", ["irls", "lbfgs", "newton-cg", "cd"]) +def test_convergence_warning(solver, regression_data): + X, y = regression_data + + est = GeneralizedLinearRegressor(solver=solver, random_state=2, + max_iter=1, tol=1e-20) + with pytest.warns(ConvergenceWarning): + est.fit(X, y) From 54b80b8db780fbd3b438c6d5dc0001cdafe7a47e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 13 Jun 2019 09:50:53 -0500 Subject: [PATCH 61/69] TST More specific checks of error messages in tests --- sklearn/linear_model/_glm.py | 6 ++-- sklearn/linear_model/tests/test_glm.py | 44 ++++++++++++++------------ 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index acce438b2b5e2..c4f8cf7a975d3 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -1941,8 +1941,10 @@ def fit(self, X, y, sample_weight=None): k = np.min([10, n_features // 10 + 1]) sigma = 0 # start searching near this value which = 'SA' # find smallest algebraic eigenvalues first - if not np.all(splinalg.eigsh(P2, k=k, sigma=sigma, - which=which) >= epsneg): + eigenvalues = splinalg.eigsh(P2, k=k, sigma=sigma, + which=which, + return_eigenvectors=False) + if not np.all(eigenvalues >= epsneg): raise ValueError("P2 must be positive semi-definite.") else: if not np.all(linalg.eigvalsh(P2) >= epsneg): diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 7229f21840829..de1a5262b36ce 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -158,7 +158,7 @@ def test_sample_weights_validation(): y = [1] weights = 0 glm = GeneralizedLinearRegressor(fit_intercept=False) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="weights must be non-negative"): glm.fit(X, y, weights) # Positive weights are accepted @@ -166,24 +166,26 @@ def test_sample_weights_validation(): # 2d array weights = [[0]] - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="must be 1D array or scalar"): glm.fit(X, y, weights) # 1d but wrong length weights = [1, 0] - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="weights must have the same length as y"): glm.fit(X, y, weights) # 1d but only zeros (sum not greater than 0) weights = [0, 0] X = [[0], [1]] y = [1, 2] - with 
pytest.raises(ValueError): + with pytest.raises(ValueError, + match="must have at least one positive element"): glm.fit(X, y, weights) # 5. 1d but with a negative value weights = [2, -1] - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="weights must be non-negative"): glm.fit(X, y, weights) @@ -202,7 +204,7 @@ def test_glm_family_argument(f, fam): glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="family must be"): glm.fit(X, y) @@ -218,7 +220,7 @@ def test_glm_link_argument(l, link): assert isinstance(glm._link_instance, link.__class__) glm = GeneralizedLinearRegressor(family='normal', link='not a link') - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="link must be"): glm.fit(X, y) @@ -228,7 +230,8 @@ def test_glm_alpha_argument(alpha): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family='normal', alpha=alpha) - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="Penalty term must be a non-negative"): glm.fit(X, y) @@ -238,7 +241,8 @@ def test_glm_l1_ratio_argument(l1_ratio): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="l1_ratio must be a number in interval.*0, 1"): glm.fit(X, y) @@ -276,13 +280,13 @@ def test_glm_P2_positive_semidefinite(): P2 = Q.T @ P2 @ Q glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, check_input=True) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="P2 must be positive semi-definite"): glm.fit(X, y) P2 = sparse.csr_matrix(P2) glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, check_input=True) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="P2 must be positive semi-definite"): glm.fit(X, y) @@ -292,7 +296,7 @@ def test_glm_fit_intercept_argument(fit_intercept): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="fit_intercept must be bool"): glm.fit(X, y) @@ -314,7 +318,7 @@ def test_glm_max_iter_argument(max_iter): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(max_iter=max_iter) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="must be a positive integer"): glm.fit(X, y) @@ -324,7 +328,7 @@ def test_glm_tol_argument(tol): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(tol=tol) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="stopping criteria must be positive"): glm.fit(X, y) @@ -334,7 +338,7 @@ def test_glm_warm_start_argument(warm_start): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(warm_start=warm_start) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="warm_start must be bool"): glm.fit(X, y) @@ -356,7 +360,7 @@ def test_glm_selection_argument(selection): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(selection=selection) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="argument selection must be"): glm.fit(X, y) @@ -366,7 +370,7 @@ def test_glm_random_state_argument(random_state): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(random_state=random_state) - with 
pytest.raises(ValueError): + with pytest.raises(ValueError, match="cannot be used to seed"): glm.fit(X, y) @@ -376,7 +380,7 @@ def test_glm_diag_fisher_argument(diag_fisher): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="diag_fisher must be bool"): glm.fit(X, y) @@ -386,7 +390,7 @@ def test_glm_copy_X_argument(copy_X): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(copy_X=copy_X) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="copy_X must be bool"): glm.fit(X, y) @@ -396,7 +400,7 @@ def test_glm_check_input_argument(check_input): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(check_input=check_input) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="check_input must be bool"): glm.fit(X, y) From 7db0320f460676d8bed8e01bf64657a89532e2cb Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 14 Jun 2019 10:44:16 -0500 Subject: [PATCH 62/69] Add PoissonRegressor alias --- sklearn/linear_model/__init__.py | 5 +- sklearn/linear_model/_glm.py | 163 +++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index e5ede64413eb5..121418f901a1a 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -19,7 +19,7 @@ MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) from ._glm import (TweedieDistribution, - GeneralizedLinearRegressor) + GeneralizedLinearRegressor, PoissonRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -82,4 +82,5 @@ 'ridge_regression', 'RANSACRegressor', 'GeneralizedLinearRegressor', - 'TweedieDistribution'] + 'TweedieDistribution', + 'PoissonRegressor'] diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index c4f8cf7a975d3..b18731e73f328 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -2326,3 +2326,166 @@ def score(self, X, y, sample_weight=None): y_mean = np.average(y, weights=weights) dev_null = self._family_instance.deviance(y, y_mean, weights=weights) return 1. - dev / dev_null + + def _more_tags(self): + return {"requires_positive_y": True} + + +class PoissonRegressor(GeneralizedLinearRegressor): + """Regression with the response variable y following a Poisson distribution + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as mu=h(X*w). + The fit minimizes the following objective function with L2 regularization:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 + + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms and thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. 
bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) + Method for estimation of the dispersion parameter phi. Whether to use + the chi squared statistic or the deviance statistic. If None, the + dispersion is not estimated. + + solver : {'irls', 'lbfgs', 'newton-cg'}, optional (default='irls') + Algorithm to use in the optimization problem: + + 'irls' + Iterated reweighted least squares. It is the standard algorithm + for GLMs. + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + 'newton-cg' + Newton conjugate gradient algorithm. + + Note that all solvers except lbfgs use the fisher matrix, i.e. the + expected Hessian instead of the Hessian matrix. + + max_iter : int, optional (default=100) + The maximal number of iterations for solver algorithms. + + tol : float, optional (default=1e-4) + Stopping criterion. For the irls, newton-cg and lbfgs solvers, + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative) of + the objective function. + + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` (supersedes option + ``start_params``). If set to ``True`` or if the attribute ``coef_`` + does not exit (first call to ``fit``), option ``start_params`` sets the + start values for ``coef_`` and ``intercept_``. + + start_params : {'guess', 'zero', array of shape (n_features*, )}, \ + optional (default='guess') + Relevant only if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). + + 'guess' + Start values of mu are calculated by family.starting_mu(..). Then, + one Newton step obtains start values for ``coef_``. If + ``solver='irls'``, it uses one irls step. This gives usually good + starting values. + + 'zero' + All coefficients are set to zero. If ``fit_intercept=True``, the + start value for the intercept is obtained by the weighted average of y. + + array + The array of size n_features* is directly used as start values + for ``coef_``. If ``fit_intercept=True``, the first element + is assumed to be the start value for the ``intercept_``. + Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes + the intercept in counting. + + random_state : {int, RandomState instance, None}, optional (default=None) + If int, random_state is the seed used by the random + number generator; if RandomState instance, random_state is the random + number generator; if None, the random number generator is the + RandomState instance used by `np.random`. Used when ``selection`` == + 'random'. + + copy_X : boolean, optional, (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array, shape (n_features,) + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. + + n_iter_ : int + Actual number of iterations used in solver. 
+
+    Notes
+    -----
+    The fit itself does not need Y to be from an EDM, but only assumes
+    the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and
+    :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function
+    :math:`v(\\mu_i)` is a property of the specific EDM, see
+    :ref:`User Guide `.
+
+    The parameters :math:`w` (`coef_` and `intercept_`) are estimated by
+    minimizing the deviance plus penalty term, which is equivalent to
+    (penalized) maximum likelihood estimation.
+
+    For alpha > 0, the feature matrix X should be standardized in order to
+    penalize features equally strongly.
+
+    If the target y is a ratio, appropriate sample weights s should be
+    provided.
+    As an example, consider Poisson distributed counts z (integers) and
+    weights s=exposure (time, money, person years, ...). Then you fit
+    y = z/s, i.e. ``PoissonRegressor().fit(X, y, sample_weight=s)``.
+    The weights are necessary for the right (finite sample) mean.
+    Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`;
+    in this case one might say that y has a 'scaled' Poisson distribution.
+
+    References
+    ----------
+    For the coordinate descent implementation:
+        * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin
+          An Improved GLMNET for L1-regularized Logistic Regression,
+          Journal of Machine Learning Research 13 (2012) 1999-2030
+          https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf
+    """
+    def __init__(self, alpha=1.0, fit_intercept=True, fit_dispersion=None,
+                 solver='irls', max_iter=100,
+                 tol=1e-4, warm_start=False, start_params='guess',
+                 random_state=None, copy_X=True, check_input=True, verbose=0):
+
+        super().__init__(alpha=alpha, fit_intercept=fit_intercept,
+                         family="poisson", link='log',
+                         fit_dispersion=fit_dispersion, solver=solver,
+                         max_iter=max_iter, tol=tol, warm_start=warm_start,
+                         start_params=start_params, random_state=random_state,
+                         copy_X=copy_X, check_input=check_input,
+                         verbose=verbose)

From dcfe9edaf83509d2b6d98ee840c28f674ea4f496 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Thu, 27 Jun 2019 08:41:04 -0500
Subject: [PATCH 63/69] TST Simplify comparison with ridge

---
 sklearn/linear_model/tests/test_glm.py | 94 ++++++--------------------
 1 file changed, 22 insertions(+), 72 deletions(-)

diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py
index de1a5262b36ce..1416bdcfad680 100644
--- a/sklearn/linear_model/tests/test_glm.py
+++ b/sklearn/linear_model/tests/test_glm.py
@@ -26,6 +26,8 @@
 from sklearn.utils.testing import assert_array_equal
 
+GLM_SOLVERS = ['irls', 'lbfgs', 'newton-cg', 'cd']
+
 
 @pytest.fixture(scope="module")
 def regression_data():
@@ -404,7 +406,7 @@ def test_glm_check_input_argument(check_input):
     glm.fit(X, y)
 
 
-@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd'])
+@pytest.mark.parametrize('solver', GLM_SOLVERS)
 def test_glm_identity_regression(solver):
     """Test GLM regression with identity link on a simple dataset."""
     coef = [1., 2.]
@@ -442,97 +444,45 @@ def test_glm_log_regression(family, solver, tol): # newton-cg may issue a LineSearchWarning, which we filter out @pytest.mark.filterwarnings('ignore:The line search algorithm') @pytest.mark.filterwarnings('ignore:Line Search failed') -@pytest.mark.parametrize('solver, tol', [('irls', 1e-6), - ('lbfgs', 1e-6), - ('newton-cg', 1e-6), - ('cd', 1e-6)]) -def test_normal_ridge(solver, tol): +@pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) +@pytest.mark.parametrize('fit_intercept', [True, False]) +@pytest.mark.parametrize('solver', GLM_SOLVERS) +def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): """Test ridge regression for Normal distributions. + Case n_samples >> n_features + Compare to test_ridge in test_ridge.py. """ - rng = np.random.RandomState(42) alpha = 1.0 - - # 1. With more samples than features - n_samples, n_features, n_predict = 100, 7, 10 + n_predict = 10 X, y, coef = make_regression(n_samples=n_samples+n_predict, n_features=n_features, n_informative=n_features-2, noise=0.5, - coef=True, random_state=rng) + coef=True, random_state=42) y = y[0:n_samples] X, T = X[0:n_samples], X[n_samples:] - # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, - solver='svd', normalize=False) - ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', - link='identity', fit_intercept=True, - tol=tol, max_iter=100, solver=solver, - check_input=False, random_state=rng) - glm.fit(X, y) - assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_, rtol=1e-6) - assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) - assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) - - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, - solver='svd', normalize=False) - ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', - link='identity', fit_intercept=False, - tol=tol, max_iter=100, solver=solver, - check_input=False, random_state=rng, - fit_dispersion='chisqr') - glm.fit(X, y) - assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_, rtol=1e-5) - assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) - assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) - mu = glm.predict(X) - assert_allclose(glm.dispersion_, - np.sum((y-mu)**2/(n_samples-n_features))) - - # 2. 
With more features than samples and sparse - n_samples, n_features, n_predict = 10, 100, 10 - X, y, coef = make_regression(n_samples=n_samples+n_predict, - n_features=n_features, - n_informative=n_features-2, noise=0.5, - coef=True, random_state=rng) - y = y[0:n_samples] - X, T = X[0:n_samples], X[n_samples:] + if n_samples > n_features: + ridge_params = {"solver": "svd"} + else: + ridge_params = {"solver": "sag", "max_iter": 10000, "tol": 1e-9} # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, - solver='sag', normalize=False, max_iter=100000, - random_state=42) + ridge = Ridge(alpha=alpha*n_samples, normalize=False, + random_state=42, **ridge_params) ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, - tol=tol, max_iter=300, solver=solver, - check_input=False, random_state=rng) + max_iter=300, solver=solver, tol=1e-6, + check_input=False, random_state=42) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, rtol=5e-6) assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, - solver='sag', normalize=False, max_iter=1000, - random_state=42) - ridge.fit(X, y) - - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', - link='identity', fit_intercept=False, - tol=tol*2, max_iter=300, solver=solver, - check_input=False, random_state=rng) - glm.fit(X, y) - assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_, rtol=1e-4) - assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) - assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) - @pytest.mark.parametrize('solver, tol', [('irls', 1e-7), @@ -559,7 +509,7 @@ def test_poisson_ridge(solver, tol): rng = np.random.RandomState(42) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', - link='log', tol=tol, + link='log', tol=1e-7, solver=solver, max_iter=300, random_state=rng) glm.fit(X, y) @@ -750,7 +700,7 @@ def test_fit_dispersion(regression_data): assert_allclose(est2.dispersion_, est3.dispersion_) -@pytest.mark.parametrize("solver", ["irls", "lbfgs", "newton-cg", "cd"]) +@pytest.mark.parametrize("solver", GLM_SOLVERS) def test_convergence_warning(solver, regression_data): X, y = regression_data From 4879bb6e057a95c38ee4950d13ef2dbd98da0a19 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 28 Jun 2019 09:54:59 -0500 Subject: [PATCH 64/69] EXA Add plot_tweedie_regression_insurance_claims.py --- ...lot_tweedie_regression_insurance_claims.py | 500 ++++++++++++++++++ 1 file changed, 500 insertions(+) create mode 100644 examples/linear_model/plot_tweedie_regression_insurance_claims.py diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py new file mode 100644 index 0000000000000..cb2ff667e8379 --- /dev/null +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -0,0 +1,500 @@ +""" +====================================== +Tweedie regression on insurance claims +====================================== + +This example illustrate the use Poisson, Gamma and Tweedie regression +on the French Motor Third-Party Liability Claims dataset, and is inspired +by an R tutorial [1]. 
+ +Insurance claims data consist of the number of claims and the total claim +amount. Often, the final goal is to predict the expected value, i.e. the mean, +of the total claim amount. There are several possibilities to do that, two of +which are: + +1. Model the number of claims with a Poisson distribution, the average + claim amount as a Gamma distribution and multiply the predictions, to get + the total claim amount. +2. Model total claim amount directly, typically with a Tweedie distribution. + +In this example we will illustrate both approaches. We start by defining a few +helper functions for loading the data and visualizing results. + + +.. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor + Third-Party Liability Claims (November 8, 2018). + `doi:10.2139/ssrn.3164764 `_ + +""" +print(__doc__) + +# Authors: Christian Lorentzen +# Roman Yurchak +# License: BSD 3 clause +from functools import partial + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +from sklearn.compose import ColumnTransformer +from sklearn.linear_model import GeneralizedLinearRegressor +from sklearn.linear_model._glm import TweedieDistribution +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer + +from sklearn.metrics import mean_absolute_error + + +def load_mtpl2(n_samples=100000): + """Fetcher for French Motor Third-Party Liability Claims dataset + + Parameters + ---------- + n_samples: int, default=100000 + number of samples to select (for faster run time). + """ + + # Note: this should use the OpenML DataFrame fetcher in the future + df_freq = pd.read_csv( + "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.csv", + dtype={"IDpol": np.int}, + index_col=0, + ) + + df_sev = pd.read_csv( + "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", + index_col=0, + ) + + # sum ClaimAmount over identical IDs + df_sev = df_sev.groupby(level=0).sum() + + df = df_freq.join(df_sev, how="left") + df["ClaimAmount"].fillna(0, inplace=True) + + # unquote string fields + for column_name in df.columns[df.dtypes.values == np.object]: + df[column_name] = df[column_name].str.strip("'") + return df.iloc[:n_samples] + + +def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, + title=None, kind_weight=None, ax=None): + """Plot observed and predicted - aggregated per feature level. 
+ + Parameters + ---------- + df : DataFrame with at least one column named feature + observed : str + a column name of the observed target + predicted : frame + a dataframe, with the same index as df, with the predicted target + weight : str + column name with the values of weights/exposure + """ + # aggregate observed and predicted variables by feature level + df_ = df.loc[:, [feature, weight]].copy() + df_["observed"] = df[observed] * df[weight] + df_["predicted"] = y_predicted * df[weight] + df_ = ( + df_.groupby([feature])[weight, "observed", "predicted"] + .sum() + .assign(observed=lambda x: x["observed"] / x[weight]) + .assign(predicted=lambda x: x["predicted"] / x[weight]) + ) + + ax = df_.loc[:, ["observed", "predicted"]].plot(style=".", ax=ax) + y_max = df_.loc[:, ["observed", "predicted"]].values.max() * 0.8 + ax.fill_between( + df_.index, + 0, + y_max * df_[weight] / df_[weight].values.max(), + color="g", + alpha=0.1, + ) + ax.set( + ylabel=y_label if y_label is not None else None, + title=title if title is not None else "Train: Observed vs Predicted", + ) + + +############################################################################## +# +# 1. Loading datasets and pre-processing +# -------------------------------------- +# +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# containing the number of claims (``ClaimNb``) with the freMTPL2sev table +# containing the claim amount (``ClaimAmount``) for the same user ids. + +df = load_mtpl2(n_samples=100000) + +# Note: filter out claims with zero amount, as the severity model +# requires a strictly positive target values. +df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 + +# correct for unreasonable observations (that might be data error) +df["ClaimNb"].clip(upper=4, inplace=True) +df["Exposure"].clip(upper=1, inplace=True) + +column_trans = ColumnTransformer( + [ + ("Veh_Driv_Age", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ( + "Veh_Brand_Gas_Region", + OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"], + ), + ("BonusMalus", "passthrough", ["BonusMalus"]), + ( + "Density_log", + make_pipeline( + FunctionTransformer(np.log, validate=False), StandardScaler() + ), + ["Density"], + ), + ], + remainder="drop", +) +X = column_trans.fit_transform(df) + + +df["Frequency"] = df.ClaimNb / df.Exposure +df["AvgClaimAmount"] = df.ClaimAmount / np.fmax(df.ClaimNb, 1) + +print(df[df.ClaimAmount > 0].head()) + +############################################################################## +# +# 2. Frequency model -- Poisson distribution +# ------------------------------------------- +# +# The number of claims (``ClaimNb``) is a positive integer that can be modeled +# as a Poisson distribution. It is then assumed to be the number of discrete +# events occuring with a constant rate in a given time interval (``Exposure``). +# Here we model the frequency ``y = ClaimNb / Exposure``, +# which is still a (scaled) Poisson distribution. +# +# A very important property of the Poisson distribution is its mean-variance +# relation: The variance is proportional to the mean. 
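A quick way to see this mean-variance relation is a small simulation on synthetic Poisson samples (a standalone sketch, independent of the dataset used in this example)::

    import numpy as np

    rng = np.random.RandomState(0)
    for mean in [0.5, 2.0, 10.0]:
        sample = rng.poisson(lam=mean, size=100000)
        # for Poisson samples the empirical variance tracks the empirical mean
        print("mean=%.3f  variance=%.3f" % (sample.mean(), sample.var()))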
+ +df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) + +glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0) +glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) + + +def mean_deviance(estimator, y, y_pred, weights): + if hasattr(estimator, "_family_instance"): + return estimator._family_instance.deviance(y, y_pred, weights) / len(y) + else: + return np.nan + + +def score_estimator( + estimator, X_train, X_test, df_train, df_test, target, weights +): + res = [] + + for subset_label, X, df in [ + ("train", X_train, df_train), + ("test", X_test, df_test), + ]: + y, _weights = df[target], df[weights] + + for score_label, metric in [ + ("D² explaned", None), + ("mean deviance", partial(mean_deviance, estimator)), + ("mean abs. error", mean_absolute_error), + ]: + if estimator.__class__.__name__ == "ClaimProdEstimator": + # ClaimProdEstimator is the product of the frequency and + # severity models, together with a denormalized by the exposure + # values. It does not fully follow the scikit-learn API and we + # must handle it separately. + y_pred = estimator.predict(X, exposure=df.Exposure.values) + else: + y_pred = estimator.predict(X) + if metric is None: + if not hasattr(estimator, "score"): + continue + score = estimator.score(X, y, _weights) + else: + score = metric(y, y_pred, _weights) + + res.append( + {"subset": subset_label, "metric": score_label, "score": score} + ) + + res = ( + pd.DataFrame(res) + .set_index(["metric", "subset"]) + .score.unstack(-1) + .round(3) + ) + return res + + +scores = score_estimator( + glm_freq, + X_train, + X_test, + df_train, + df_test, + target="Frequency", + weights="Exposure", +) +print(scores) + +############################################################################## +# +# We can visually compare observed and predicted values, aggregated by +# the drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance +# bonus/penalty (``BonusMalus``), + +fig, ax = plt.subplots(2, 2, figsize=(16, 8)) +fig.subplots_adjust(hspace=0.3, wspace=0.2) + +plot_obs_pred( + df_train, + "DrivAge", + "Frequency", + glm_freq.predict(X_train), + weight="Exposure", + y_label="Claim Frequency", + title="train data", + ax=ax[0, 0], +) + +plot_obs_pred( + df_test, + "DrivAge", + "Frequency", + glm_freq.predict(X_test), + weight="Exposure", + y_label="Claim Frequency", + title="test data", + ax=ax[0, 1], +) + +plot_obs_pred( + df_test, + "VehAge", + "Frequency", + glm_freq.predict(X_test), + weight="Exposure", + y_label="Claim Frequency", + title="test data", + ax=ax[1, 0], +) + +plot_obs_pred( + df_test, + "BonusMalus", + "Frequency", + glm_freq.predict(X_test), + weight="Exposure", + y_label="Claim Frequency", + title="test data", + ax=ax[1, 1], +) + + +############################################################################## +# +# 3. Severity model - Gamma Distribution +# --------------------------------------- +# The mean claim amount or severity (`AvgClaimAmount`) can be empirically +# shown to follow a Gamma distribution. We fit a GLM model for the severity +# with the same features as the frequency model. +# +# Note: +# - We filter out ``ClaimAmount == 0``` as the Gamma distribution as support +# on :math:`(0, \infty)` not :math:`[0, \infty)`. +# - We use ``ClaimNb`` as sample weights. 
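For reference, the ``D² explained`` score reported by ``score_estimator`` above is one minus the ratio of the deviance of the fitted model to the deviance of a constant, mean-only model. A self-contained sketch for the Poisson case, using made-up numbers rather than this dataset::

    import numpy as np
    from scipy.special import xlogy

    def poisson_deviance(y, mu):
        # summed unit deviances: 2 * (y * log(y / mu) - y + mu)
        return np.sum(2 * (xlogy(y, y / mu) - y + mu))

    y = np.array([0., 1., 3., 2.])
    mu_model = np.array([0.4, 1.2, 2.6, 1.9])  # hypothetical predictions
    mu_null = np.full_like(y, y.mean())        # intercept-only baseline

    d2 = 1 - poisson_deviance(y, mu_model) / poisson_deviance(y, mu_null)
    print("D^2 explained: %.3f" % d2)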
+ +mask_train = df_train["ClaimAmount"] > 0 +mask_test = df_test["ClaimAmount"] > 0 + +glm_sev = GeneralizedLinearRegressor(family="gamma", alpha=1) + +glm_sev.fit( + X_train[mask_train.values], + df_train.loc[mask_train, "AvgClaimAmount"], + sample_weight=df_train.loc[mask_train, "ClaimNb"], +) + + +scores = score_estimator( + glm_sev, + X_train[mask_train.values], + X_test[mask_test.values], + df_train[mask_train], + df_test[mask_test], + target="AvgClaimAmount", + weights="ClaimNb", +) +print(scores) + +############################################################################## +# +# Note that the resulting model is conditional on having at least one claim, +# and cannot be used to predict the average claim amount in general, + +print( + "Mean AvgClaim Amount: %.2f " + % df_train.AvgClaimAmount.mean() +) +print( + "Mean AvgClaim Amount | NbClaim > 0: %.2f" + % df_train.AvgClaimAmount[df_train.AvgClaimAmount > 0].mean() +) +print( + "Predicted Mean AvgClaim Amount: %.2f" + % glm_sev.predict(X_train).mean() +) + + +############################################################################## +# +# We can visually compare observed and predicted values, aggregated for +# the drivers age (``Driv Age``), + +fig, ax = plt.subplots(1, 2, figsize=(16, 4)) + +# plot DivAge +plot_obs_pred( + df_train.loc[mask_train], + "DrivAge", + "AvgClaimAmount", + glm_sev.predict(X_train[mask_train.values]), + weight="Exposure", + y_label="Average Claim Severity", + title="train data", + ax=ax[0], +) + +plot_obs_pred( + df_test.loc[mask_test], + "DrivAge", + "AvgClaimAmount", + glm_sev.predict(X_test[mask_test.values]), + weight="Exposure", + y_label="Average Claim Severity", + title="test data", + ax=ax[1], +) + + +############################################################################## +# +# 3. Total Claims Amount -- Compound Poisson distribution +# ------------------------------------------------------- +# +# As mentionned in the introduction, the total claim amount can be modeled +# either as the product of the frequency model by the severity model. + + +class ClaimProdEstimator: + """Total claim amount estimator + + Computed as the product of the frequency model by the serverity model, + denormalized by exposure. + """ + + def __init__(self, est_freq, est_sev): + self.est_freq = est_freq + self.est_sev = est_sev + + def predict(self, X, exposure): + """Predict the total claim amount + + The predict method is not compatible with the scikit-learn API. + """ + return exposure * self.est_freq.predict(X) * self.est_sev.predict(X) + + +est_prod = ClaimProdEstimator(glm_freq, glm_sev) + +scores = score_estimator( + est_prod, + X_train, + X_test, + df_train, + df_test, + target="ClaimAmount", + weights="Exposure", +) +print(scores) + + +############################################################################## +# +# or as a unique Compound Poisson model, also corresponding to a Tweedie model +# with a power :math:`p \in (1, 2)`. 
We determine the optimal hyperparameter +# ``p`` with a grid search, + +from sklearn.model_selection import GridSearchCV + +# this takes a while +params = { + "family": [ + TweedieDistribution(power=power) for power in np.linspace(1, 2, 8) + ] +} + +glm_total = GridSearchCV( + GeneralizedLinearRegressor(), cv=3, param_grid=params, n_jobs=-1 +) +glm_total.fit( + X_train, df_train["ClaimAmount"], sample_weight=df_train["Exposure"] +) + + +print( + "Best hyperparameters: power=%.2f\n" + % glm_total.best_estimator_.family.power +) + +scores = score_estimator( + glm_total.best_estimator_, + X_train, + X_test, + df_train, + df_test, + target="ClaimAmount", + weights="Exposure", +) +print(scores) + +############################################################################## +# +# In this example, the mean absolute error is lower for the Compound Poisson +# model than when using separate models for frequency and severity. +# +# We can additionally validate these models by comparing observed and predicted +# total claim amount over the test and train subsets. + +res = [] +for subset_label, X, df in [ + ("train", X_train, df_train), + ("test", X_test, df_test), +]: + res.append( + { + "subset": subset_label, + "observed": df.ClaimAmount.values.sum(), + "predicted, frequency*severity model": np.sum( + est_prod.predict(X, exposure=df.Exposure.values) + ), + "predicted, tweedie, p=%.2f" + % glm_total.best_estimator_.family.power: np.sum( + glm_total.best_estimator_.predict(X) + ), + } + ) + +print(pd.DataFrame(res).set_index("subset").T) From 56069e5b3f5b453d9e8a487c9d27e20900ce4d63 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 28 Jun 2019 10:11:37 -0500 Subject: [PATCH 65/69] EXA Fix issues with older pandas versions in example --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index cb2ff667e8379..063d12e6e291b 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -137,8 +137,8 @@ def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 # correct for unreasonable observations (that might be data error) -df["ClaimNb"].clip(upper=4, inplace=True) -df["Exposure"].clip(upper=1, inplace=True) +df["ClaimNb"] = df["ClaimNb"].clip(upper=4) +df["Exposure"] = df["Exposure"].clip(upper=1) column_trans = ColumnTransformer( [ From 53f3c5f6670bbdefcd5ddf6d63e405401909cf06 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 09:14:30 -0500 Subject: [PATCH 66/69] DOC Add second poisson regression example --- ...plot_poisson_regression_non_normal_loss.py | 257 ++++++++++++++++++ 1 file changed, 257 insertions(+) create mode 100644 examples/linear_model/plot_poisson_regression_non_normal_loss.py diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py new file mode 100644 index 0000000000000..b06adcb787560 --- /dev/null +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -0,0 +1,257 @@ +""" +====================================== +Poisson regression and non normal loss +====================================== + +This example illustrate the use linear Poisson regression +on the French Motor 
Third-Party Liability Claims dataset [1] and compare +it with learning models with least squared error. + + +We start by defining a few helper functions for loading the data and +visualizing results. + + +.. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor + Third-Party Liability Claims (November 8, 2018). + `doi:10.2139/ssrn.3164764 `_ + +""" +print(__doc__) + +# Authors: Christian Lorentzen +# Roman Yurchak +# License: BSD 3 clause +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd +from scipy.special import xlogy + +from sklearn.compose import ColumnTransformer +from sklearn.linear_model import GeneralizedLinearRegressor, LinearRegression +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer +from sklearn.ensemble import GradientBoostingRegressor + +from sklearn.metrics import mean_squared_error, mean_absolute_error + + +def load_mtpl2(n_samples=100000): + """Fetcher for French Motor Third-Party Liability Claims dataset + + Parameters + ---------- + n_samples: int, default=100000 + number of samples to select (for faster run time). + """ + + # Note: this should use the OpenML DataFrame fetcher in the future + df_freq = pd.read_csv( + "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.csv", + dtype={"IDpol": np.int}, + index_col=0, + ) + + df_sev = pd.read_csv( + "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", + index_col=0, + ) + + # sum ClaimAmount over identical IDs + df_sev = df_sev.groupby(level=0).sum() + + df = df_freq.join(df_sev, how="left") + df["ClaimAmount"].fillna(0, inplace=True) + + # unquote string fields + for column_name in df.columns[df.dtypes.values == np.object]: + df[column_name] = df[column_name].str.strip("'") + return df.iloc[:n_samples] + + +############################################################################## +# +# 1. Loading datasets and pre-processing +# -------------------------------------- +# +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# containing the number of claims (``ClaimNb``) with the freMTPL2sev table +# containing the claim amount (``ClaimAmount``) for the same user ids. + +df = load_mtpl2(n_samples=100000) + +# Note: filter out claims with zero amount, as the severity model +# requires a strictly positive target values. +df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 + +# correct for unreasonable observations (that might be data error) +df["ClaimNb"] = df["ClaimNb"].clip(upper=4) +df["Exposure"] = df["Exposure"].clip(upper=1) + +column_trans = ColumnTransformer( + [ + ("Veh_Driv_Age", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ( + "Veh_Brand_Gas_Region", + OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"], + ), + ("BonusMalus", "passthrough", ["BonusMalus"]), + ( + "Density_log", + make_pipeline( + FunctionTransformer(np.log, validate=False), StandardScaler() + ), + ["Density"], + ), + ], + remainder="drop", +) +X = column_trans.fit_transform(df) + +############################################################################## +# +# The number of claims (``ClaimNb``) is a positive integer that can be modeled +# as a Poisson distribution. It is then assumed to be the number of discrete +# events occurring with a constant rate in a given time interval +# (``Exposure``). 
+# Here we model the frequency ``y = ClaimNb / Exposure``,
+# which is still a (scaled) Poisson distribution.
+#
+# A very important property of the Poisson distribution is its mean-variance
+# relation: The variance is proportional to the mean.
+
+df["Frequency"] = df.ClaimNb / df.Exposure
+
+print(
+    pd.cut(df.Frequency, [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts()
+)
+
+##############################################################################
+#
+# It is worth noting that 96 % of users have 0 claims, and if we were to
+# convert this problem into a binary classification task, it would be
+# significantly imbalanced.
+#
+# To evaluate the pertinence of the metrics used below, we will consider as a
+# baseline an estimator that constantly predicts 0.
+
+df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2)
+
+
+def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None):
+    y_true = np.atleast_1d(y_true)
+    y_pred = np.atleast_1d(y_pred)
+    dev = 2 * (xlogy(y_true, y_true/y_pred) - y_true + y_pred)
+    return np.average(dev, weights=sample_weights)
+
+
+eps = 1e-5
+print("MSE: %.3f" % mean_squared_error(
+    df_test.Frequency.values, np.zeros(len(df_test)),
+    df_test.Exposure.values))
+print("MAE: %.3f" % mean_absolute_error(
+    df_test.Frequency.values, np.zeros(len(df_test)),
+    df_test.Exposure.values))
+print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score(
+    df_test.Frequency.values, eps + np.zeros(len(df_test)),
+    df_test.Exposure.values))
+
+
+##############################################################################
+#
+# We start by modeling the target variable with the least squares linear
+# regression model,
+
+
+linregr = LinearRegression()
+linregr.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure)
+
+print("LinearRegression")
+print("MSE: %.3f" % mean_squared_error(
+    df_test.Frequency.values, linregr.predict(X_test),
+    df_test.Exposure.values))
+print("MAE: %.3f" % mean_absolute_error(
+    df_test.Frequency.values, linregr.predict(X_test),
+    df_test.Exposure.values))
+print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score(
+    df_test.Frequency.values, np.fmax(linregr.predict(X_test), eps),
+    df_test.Exposure.values))
+
+##############################################################################
+#
+# The Poisson deviance cannot be computed on the raw predictions, because the
+# model predicts negative values (hence the flooring at ``eps`` above),
+
+print('Number of negative predictions: %s / total: %s' % (
+    (linregr.predict(X_test) < 0).sum(), X_test.shape[0]))
+
+##############################################################################
+#
+# Next we fit the Poisson regressor on the target variable,
+
+glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0)
+glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure)
+
+print("PoissonRegressor")
+print("MSE: %.3f" % mean_squared_error(
+    df_test.Frequency.values, glm_freq.predict(X_test),
+    df_test.Exposure.values))
+print("MAE: %.3f" % mean_absolute_error(
+    df_test.Frequency.values, glm_freq.predict(X_test),
+    df_test.Exposure.values))
+print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score(
+    df_test.Frequency.values, glm_freq.predict(X_test),
+    df_test.Exposure.values))
+
+##############################################################################
+#
+# Finally, we will consider a non-linear model with gradient boosting that
+# still minimizes the least squares error.
+ + +gbr = GradientBoostingRegressor(max_depth=3) +gbr.fit(X_train, df_train.Frequency.values, + sample_weight=df_train.Exposure.values) + + +print("GradientBoostingRegressor") +print("MSE: %.3f" % mean_squared_error( + df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) +print("MAE: %.3f" % mean_absolute_error( + df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) +print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( + df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) + +############################################################################## +# +# In this example, although Gradient boosting minimizes the least square error, +# because of a higher predictive power it also results in a smaller Poisson +# deviance than the Poisson regression model. +# +# Evaluating models with a single train / test split is prone to numerical +# errors, we can verify that we would also get equivalent resuts with the +# cross-validation score. +# +# The difference between these models can also be visualized by comparing the +# histogram of observed target values with that of predicted values, + + +fig, ax = plt.subplots(1, 4, figsize=(16, 3)) + +df_train.Frequency.hist(bins=np.linspace(-1, 10, 50), ax=ax[0]) + +ax[0].set_title('Experimental data') + +for idx, model in enumerate([linregr, glm_freq, gbr]): + y_pred = model.predict(X_train) + + pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=ax[idx+1]) + ax[idx+1].set_title(model.__class__.__name__) + +for axi in ax: + axi.set( + yscale='log', + xlabel="y (Frequency)" + ) From e58d8e3a35ad0d5a6ceda167d5452d885ec855a3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 14 Jul 2019 18:20:09 +0200 Subject: [PATCH 67/69] EXA wording and score in plot_tweedie_regression_insurance_claims.html --- ...lot_tweedie_regression_insurance_claims.py | 137 ++++++++++-------- 1 file changed, 78 insertions(+), 59 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 063d12e6e291b..00111b811f923 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -3,7 +3,7 @@ Tweedie regression on insurance claims ====================================== -This example illustrate the use Poisson, Gamma and Tweedie regression +This example illustrates the use of Poisson, Gamma and Tweedie regression on the French Motor Third-Party Liability Claims dataset, and is inspired by an R tutorial [1]. @@ -13,9 +13,10 @@ which are: 1. Model the number of claims with a Poisson distribution, the average - claim amount as a Gamma distribution and multiply the predictions, to get - the total claim amount. -2. Model total claim amount directly, typically with a Tweedie distribution. + claim amount as a Gamma distribution and multiply the predictions of both in + order to get the total claim amount. +2. Model total claim amount directly, typically with a Tweedie distribution of + Tweedie power :math:`p \\in (1, 2)`. In this example we will illustrate both approaches. We start by defining a few helper functions for loading the data and visualizing results. @@ -49,7 +50,7 @@ def load_mtpl2(n_samples=100000): - """Fetcher for French Motor Third-Party Liability Claims dataset + """Fetch the French Motor Third-Party Liability Claims dataset. 
Parameters ---------- @@ -81,24 +82,27 @@ def load_mtpl2(n_samples=100000): return df.iloc[:n_samples] -def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, +def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, title=None, kind_weight=None, ax=None): """Plot observed and predicted - aggregated per feature level. Parameters ---------- - df : DataFrame with at least one column named feature + df : DataFrame with at least three columns named feature, weight and + observed + feature: str + a column name of df for the feature to be plotted + weight : str + column name of df with the values of weights or exposure observed : str - a column name of the observed target + a column name of df with the observed target predicted : frame a dataframe, with the same index as df, with the predicted target - weight : str - column name with the values of weights/exposure """ # aggregate observed and predicted variables by feature level df_ = df.loc[:, [feature, weight]].copy() df_["observed"] = df[observed] * df[weight] - df_["predicted"] = y_predicted * df[weight] + df_["predicted"] = predicted * df[weight] df_ = ( df_.groupby([feature])[weight, "observed", "predicted"] .sum() @@ -126,9 +130,10 @@ def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, # 1. Loading datasets and pre-processing # -------------------------------------- # -# We construct the freMTPL2 dataset by joining the freMTPL2freq table, -# containing the number of claims (``ClaimNb``) with the freMTPL2sev table -# containing the claim amount (``ClaimAmount``) for the same user ids. +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# containing the number of claims (``ClaimNb``), with the freMTPL2sev table, +# containing the claim amount (``ClaimAmount``) for the same policy ids +# (``IDpol``). df = load_mtpl2(n_samples=100000) @@ -206,14 +211,14 @@ def score_estimator( y, _weights = df[target], df[weights] for score_label, metric in [ - ("D² explaned", None), + ("D² explained", None), ("mean deviance", partial(mean_deviance, estimator)), ("mean abs. error", mean_absolute_error), ]: if estimator.__class__.__name__ == "ClaimProdEstimator": - # ClaimProdEstimator is the product of the frequency and - # severity models, together with a denormalized by the exposure - # values. It does not fully follow the scikit-learn API and we + # ClaimProdEstimator is the product of frequency and severity + # models, denormalized by the exposure values. + # It does not fully follow the scikit-learn API and we # must handle it separately. y_pred = estimator.predict(X, exposure=df.Exposure.values) else: @@ -253,50 +258,50 @@ def score_estimator( # # We can visually compare observed and predicted values, aggregated by # the drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance -# bonus/penalty (``BonusMalus``), +# bonus/malus (``BonusMalus``). 
fig, ax = plt.subplots(2, 2, figsize=(16, 8)) fig.subplots_adjust(hspace=0.3, wspace=0.2) plot_obs_pred( - df_train, - "DrivAge", - "Frequency", - glm_freq.predict(X_train), + df=df_train, + feature="DrivAge", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_train), y_label="Claim Frequency", title="train data", ax=ax[0, 0], ) plot_obs_pred( - df_test, - "DrivAge", - "Frequency", - glm_freq.predict(X_test), + df=df_test, + feature="DrivAge", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), y_label="Claim Frequency", title="test data", ax=ax[0, 1], ) plot_obs_pred( - df_test, - "VehAge", - "Frequency", - glm_freq.predict(X_test), + df=df_test, + feature="VehAge", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), y_label="Claim Frequency", title="test data", ax=ax[1, 0], ) plot_obs_pred( - df_test, - "BonusMalus", - "Frequency", - glm_freq.predict(X_test), + df=df_test, + feature="BonusMalus", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), y_label="Claim Frequency", title="test data", ax=ax[1, 1], @@ -308,12 +313,13 @@ def score_estimator( # 3. Severity model - Gamma Distribution # --------------------------------------- # The mean claim amount or severity (`AvgClaimAmount`) can be empirically -# shown to follow a Gamma distribution. We fit a GLM model for the severity -# with the same features as the frequency model. +# shown to follow approximately a Gamma distribution. We fit a GLM model for +# the severity with the same features as the frequency model. # # Note: -# - We filter out ``ClaimAmount == 0``` as the Gamma distribution as support -# on :math:`(0, \infty)` not :math:`[0, \infty)`. +# +# - We filter out ``ClaimAmount == 0`` as the Gamma distribution has support +# on :math:`(0, \infty)`, not :math:`[0, \infty)`. # - We use ``ClaimNb`` as sample weights. mask_train = df_train["ClaimAmount"] > 0 @@ -341,19 +347,20 @@ def score_estimator( ############################################################################## # -# Note that the resulting model is conditional on having at least one claim, -# and cannot be used to predict the average claim amount in general, +# Note that the resulting model is the average claim amount per claim. As such, +# it is conditional on having at least one claim, and cannot be used to predict +# the average claim amount per policy in general. print( - "Mean AvgClaim Amount: %.2f " + "Mean AvgClaim Amount per policy: %.2f " % df_train.AvgClaimAmount.mean() ) print( - "Mean AvgClaim Amount | NbClaim > 0: %.2f" + "Mean AvgClaim Amount | NbClaim > 0: %.2f" % df_train.AvgClaimAmount[df_train.AvgClaimAmount > 0].mean() ) print( - "Predicted Mean AvgClaim Amount: %.2f" + "Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f" % glm_sev.predict(X_train).mean() ) @@ -361,28 +368,28 @@ def score_estimator( ############################################################################## # # We can visually compare observed and predicted values, aggregated for -# the drivers age (``Driv Age``), +# the drivers age (``DrivAge``). 
fig, ax = plt.subplots(1, 2, figsize=(16, 4)) # plot DivAge plot_obs_pred( - df_train.loc[mask_train], - "DrivAge", - "AvgClaimAmount", - glm_sev.predict(X_train[mask_train.values]), + df=df_train.loc[mask_train], + feature="DrivAge", weight="Exposure", + observed="AvgClaimAmount", + predicted=glm_sev.predict(X_train[mask_train.values]), y_label="Average Claim Severity", title="train data", ax=ax[0], ) plot_obs_pred( - df_test.loc[mask_test], - "DrivAge", - "AvgClaimAmount", - glm_sev.predict(X_test[mask_test.values]), + df=df_test.loc[mask_test], + feature="DrivAge", weight="Exposure", + observed="AvgClaimAmount", + predicted=glm_sev.predict(X_test[mask_test.values]), y_label="Average Claim Severity", title="test data", ax=ax[1], @@ -391,31 +398,41 @@ def score_estimator( ############################################################################## # -# 3. Total Claims Amount -- Compound Poisson distribution +# 4. Total Claims Amount -- Compound Poisson distribution # ------------------------------------------------------- # # As mentionned in the introduction, the total claim amount can be modeled -# either as the product of the frequency model by the severity model. +# either as the product of the frequency model by the severity model, class ClaimProdEstimator: - """Total claim amount estimator + """Total claim amount estimator. Computed as the product of the frequency model by the serverity model, - denormalized by exposure. + denormalized by exposure. Use Tweedie deviance with `p=1.5`. """ def __init__(self, est_freq, est_sev): self.est_freq = est_freq self.est_sev = est_sev + self._family_instance = TweedieDistribution(power=1.5) def predict(self, X, exposure): - """Predict the total claim amount + """Predict the total claim amount. The predict method is not compatible with the scikit-learn API. """ return exposure * self.est_freq.predict(X) * self.est_sev.predict(X) + def score(self, X, y, sample_weight=None): + """Compute D², the percentage of deviance explained.""" + mu = self.predict(X, exposure=sample_weight) + dev = self._family_instance.deviance(y, mu, weights=sample_weight) + y_mean = np.average(y, weights=sample_weight) + dev_null = self._family_instance.deviance(y, y_mean, + weights=sample_weight) + return 1. - dev / dev_null + est_prod = ClaimProdEstimator(glm_freq, glm_sev) @@ -476,7 +493,9 @@ def predict(self, X, exposure): # model than when using separate models for frequency and severity. # # We can additionally validate these models by comparing observed and predicted -# total claim amount over the test and train subsets. +# total claim amount over the test and train subsets. We see that in our case +# the frequency-severity model underestimates the total claim amount, whereas +# the Tweedie model overestimates. 
res = [] for subset_label, X, df in [ From c3fc3929f23d38b0a655cfcbf60a600d79777666 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 15 Jul 2019 14:15:26 +0200 Subject: [PATCH 68/69] Address review comments --- .../plot_tweedie_regression_insurance_claims.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 00111b811f923..1c8dd42df336d 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -46,7 +46,7 @@ from sklearn.preprocessing import FunctionTransformer, OneHotEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer -from sklearn.metrics import mean_absolute_error +from sklearn.metrics import mean_absolute_error, mean_squared_error def load_mtpl2(n_samples=100000): @@ -83,7 +83,7 @@ def load_mtpl2(n_samples=100000): def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, - title=None, kind_weight=None, ax=None): + title=None, ax=None): """Plot observed and predicted - aggregated per feature level. Parameters @@ -141,9 +141,11 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # requires a strictly positive target values. df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 -# correct for unreasonable observations (that might be data error) +# Correct for unreasonable observations (that might be data error) +# and a few exceptionally large claim amounts df["ClaimNb"] = df["ClaimNb"].clip(upper=4) df["Exposure"] = df["Exposure"].clip(upper=1) +df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000) column_trans = ColumnTransformer( [ @@ -188,7 +190,9 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) -glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0) +# Some of the features are colinear, we use a weak penalization to avoid +# numerical issues. +glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=1e-2) glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) @@ -214,6 +218,7 @@ def score_estimator( ("D² explained", None), ("mean deviance", partial(mean_deviance, estimator)), ("mean abs. 
error", mean_absolute_error), + ("mean squared error", mean_squared_error), ]: if estimator.__class__.__name__ == "ClaimProdEstimator": # ClaimProdEstimator is the product of frequency and severity @@ -325,7 +330,7 @@ def score_estimator( mask_train = df_train["ClaimAmount"] > 0 mask_test = df_test["ClaimAmount"] > 0 -glm_sev = GeneralizedLinearRegressor(family="gamma", alpha=1) +glm_sev = GeneralizedLinearRegressor(family="gamma") glm_sev.fit( X_train[mask_train.values], From 98054bc9a4416c49b26a3a253b9a7bef16a1e27b Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 27 Nov 2019 22:03:20 +0100 Subject: [PATCH 69/69] fix sparse P2 cases --- sklearn/linear_model/_glm.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index b18731e73f328..b3c303ee9770b 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -1862,8 +1862,7 @@ def fit(self, X, y, sample_weight=None): elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and P2.shape[0] == X.shape[1]): if sparse.issparse(X): - P2 = (sparse.dia_matrix((P2, 0), - shape=(n_features, n_features))).tocsc() + P2 = sparse.csc_matrix(P2) else: raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " @@ -1939,7 +1938,7 @@ def fit(self, X, y, sample_weight=None): # efficiently, use only half of n_features # k = how many eigenvals to compute k = np.min([10, n_features // 10 + 1]) - sigma = 0 # start searching near this value + sigma = -1000 * epsneg # start searching near this value which = 'SA' # find smallest algebraic eigenvalues first eigenvalues = splinalg.eigsh(P2, k=k, sigma=sigma, which=which,