From d5e88108a71c6c256b67948515f20955cc96fabf Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 18 Jul 2017 21:50:10 +0200 Subject: [PATCH 001/209] [WIP] Add Generalized Linear Model, issue #5975, initial commit --- sklearn/linear_model/__init__.py | 7 + sklearn/linear_model/glm.py | 872 +++++++++++++++++++++++++ sklearn/linear_model/tests/test_glm.py | 73 +++ 3 files changed, 952 insertions(+) create mode 100644 sklearn/linear_model/glm.py create mode 100644 sklearn/linear_model/tests/test_glm.py diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 2e01990ccce8c..5acc51e9dc87f 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,6 +18,12 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) +from .glm import (Link, IdentityLink, LogLink, + ExponentialDispersionModel, TweedieDistribution, + NormalDistribution, GaussianDistribution, + PoissonDistribution, GammaDistribution, + InverseGaussianDistribution, GeneralizedHyperbolicSecand, + GeneralizedLinearModel) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -38,6 +44,7 @@ 'BayesianRidge', 'ElasticNet', 'ElasticNetCV', + 'GeneralizedLinearModel', 'Hinge', 'Huber', 'HuberRegressor', diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py new file mode 100644 index 0000000000000..8b6eb8f3bf16c --- /dev/null +++ b/sklearn/linear_model/glm.py @@ -0,0 +1,872 @@ +""" +Generalized Linear Models with Exponential Dispersion Family +""" + +# Author: Christian Lorentzen +# License: BSD 3 clause + +# TODO: Which name? GeneralizedLinearModel vs GeneralizedLinearRegression. +# So far, it is GeneralizedLinearModel, since it could very easily +# extended by Bernoulli/Binomial distribution. +# TODO: Which name/symbol for coefficients and weights in docu? +# sklearn.linear_models uses w for coefficients. +# So far, coefficients=beta and weight=w (as standard literature) +# TODO: Add l2-penalty +# TODO: Add l1-penalty (elastic net) +# TODO: Add cross validation +# TODO: Write docu and examples + +# Design Decisions: +# - The link funtion (instance of class Link) is necessary for the evaluation +# of deviance, score, Fisher and Hessian matrix as functions of the +# coefficients, which is needed by optimizers. +# Solution: link as argument in those functions + +from __future__ import division +from abc import ABCMeta, abstractmethod, abstractproperty +import numbers +import numpy as np +from scipy import linalg, optimize, sparse +import warnings +from .base import LinearModel, LinearRegression +from ..base import RegressorMixin +from ..utils import check_X_y +from ..utils.extmath import safe_sparse_dot +from ..utils.optimize import newton_cg +from ..utils.validation import check_is_fitted + + + +class Link(metaclass=ABCMeta): + """Abstract base class for Link funtions + """ + + @abstractmethod + def link(self, mu): + """The link function g(mu) with argument mu=E[Y] returns the + linear predictor. + """ + raise NotImplementedError + + @abstractmethod + def derivative(self, mu): + """Derivative of the link g'(mu). + """ + raise NotImplementedError + + @abstractmethod + def inverse(self, lin_pred): + """The inverse link function h(lin_pred) with the linear predictor as + argument returns mu=E[Y]. 
+ """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative(self, lin_pred): + """Derivative of the inverse link function h'(lin_pred). + """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative2(self, lin_pred): + """Second derivative of the inverse link function h''(lin_pred). + """ + raise NotImplementedError + +class IdentityLink(Link): + """The identity link function g(x)=x. + """ + + def link(self, mu): + return mu + + def derivative(self, mu): + return np.ones_like(mu) + + def inverse(self, lin_pred): + return lin_pred + + def inverse_derivative(self, lin_pred): + return np.ones_like(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.zeros_like(lin_pred) + + +class LogLink(Link): + """The log link function g(x)=log(x). + """ + + def link(self, mu): + return np.log(mu) + + def derivative(self, mu): + return 1./mu + + def inverse(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.exp(lin_pred) + + +class ExponentialDispersionModel(metaclass=ABCMeta): + """Base class for reproductive Exponential Dispersion Models (EDM). + + The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + + with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, + unit variance :math:`v(\mu)` and + unit deviance :math:`d(y,\mu)`. + + Attributes + ---------- + lower_bound + upper_bound + + Methods + ------- + in_y_range + unit_variance + unit_variance_derivative + variance + variance_derivative + unit_deviance + unit_deviance_derivative + deviance + deviance_derivative + starting_mu + + _score + _fisher_matrix + _observed_information + _deviance + _deviance_derivative + _deviance_hessian + + References + ---------- + See https://en.wikipedia.org/wiki/Exponential_dispersion_model. + """ + + @abstractproperty + def lower_bound(self): + """The lower bound of values of Y~EDM. + """ + raise NotImplementedError() + + @abstractproperty + def upper_bound(self): + """The upper bound of values of Y~EDM. + """ + raise NotImplementedError() + + @abstractmethod + def in_y_range(self, x): + """Returns true if x is in the valid range of Y~EDM. + """ + raise NotImplementedError() + + @abstractmethod + def unit_variance(self, mu): + """The unit variance :math:`v(mu)` determines the variance as + a function of the mean mu by + :math:`\mathrm{Var}[Y_i] = \phi/w_i*v(\mu_i)`. + It can also be derived from the unit deviance :math:`d(y,\mu)` as + + .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ + \partial\mu^2}}\big|_{y=\mu} + """ + raise NotImplementedError() + + @abstractmethod + def unit_variance_derivative(self, mu): + """The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. + """ + raise NotImplementedError() + + def variance(self, mu, phi=1, weight=1): + """The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is + :math:`\mathrm{Var}[Y_i]=\phi/w_i*v(\mu_i)`, + with unit variance v(mu). + """ + return phi/weight * self.unit_variance(mu) + + def variance_derivative(self, mu, phi=1, weight=1): + """The derivative of the variance w.r.t. mu, + :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] + =phi/w_i*v'(\mu_i)`, with unit variance v(mu). 
+ """ + return phi/weight * self.unit_variance_derivative(mu) + + @abstractmethod + def unit_deviance(self, y, mu): + """The unit_deviance :math:`d(y,\mu)`. + In terms of the log-likelihood it is given by + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` + """ + raise NotImplementedError() + + def unit_deviance_derivative(self, y, mu): + """The derivative w.r.t. mu of the unit_deviance + :math:`\frac{d}{d\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + with unit variance :math:`v(\mu)`. + + Returns + ------- + derivative: array, shape = (n_samples,) + """ + return -2*(y-mu)/self.unit_variance(mu) + + def deviance(self, y, mu, weight=1): + """The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) + with weight :math:`w_i` and unit_deviance :math:`d(y,mu)`. + In terms of the likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,\mu,\frac{phi}{w}) + - loglike(y,y,\frac{phi}{w})\right).` + """ + return np.sum(weight*self.unit_deviance(y,mu)) + + def _deviance(self, coef, X, y, weight, link): + """The deviance as a function of the coefficients ``coef`` + (:math:`beta`). + """ + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + return self.deviance(y, mu, weight) + + def deviance_derivative(self, y, mu, weight=1): + """The derivative w.r.t. mu of the deviance.` + """ + return weight*self.unit_deviance_derivative(y,mu) + + def _score(self, coef, phi, X, y, weight, link): + """The score function :math:`s` is the derivative of the + log-likelihood w.r.t. the ``coef`` (:math:`\beta`). + It is given by + + .. math: + + \mathbf{s}(\boldsymbol{\beta}) = \mathbf{X}^T \mathbf{D} + \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, + + with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and + :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}(y_1),\ldots)`. + """ + n_samples = X.shape[0] + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + d = link.inverse_derivative(lin_pred) + d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=False) + score = safe_sparse_dot(X.T, temp, dense_output=False) + return score + + def _fisher_matrix(self, coef, phi, X, y, weight, link): + """The Fisher information matrix, also known as expected + information matrix. It is given by + + .. math: + + \mathbf{F}(\boldsymbol{\beta}) = \mathrm{E}\left[ + -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} + \partial\boldsymbol{\beta}^T}\right] + = \mathbf{X}^T W \mathbf{X} \,, + + with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, + see score function. + """ + n_samples = X.shape[0] + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + d2 = link.inverse_derivative(lin_pred)**2 + d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) + fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) + return fisher_matrix + + def _observed_information(self, coef, phi, X, y, weight, link): + """The observed information matrix, also known as the negative of + the Hessian matrix of the log-likelihood. It is given by + + .. 
math: + + \mathbf{H}(\boldsymbol{\beta}) = + -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} + \partial\boldsymbol{\beta}^T} + = \mathbf{X}^T \legt[ + - \mathbf{D}' \mathbf{R} + + \mathbf{D}^2 \mathbf{V} \mathbf{R} + + \mathbf{D}^2 + \right] \boldsymbol{\Sigma}^{-1} \mathbf{X} \,, + + with :math:`\mathbf{R} = \mathrm{diag}(y_i - \mu_i)`, + :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{ + v(\mu_i)} + \right)`, + see score function and Fisher matrix. + """ + n_samples = X.shape[0] + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + dp = link.inverse_derivative2(lin_pred) + d2 = link.inverse_derivative(lin_pred)**2 + v = self.unit_variance_derivative(mu)/self.unit_variance(mu) + r = y - mu + temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(temp, X, dense_output=False) + observed_information = safe_sparse_dot(X.T, temp, dense_output=False) + return observed_information + + def _deviance_derivative(self, coef, X, y, weight, link): + """The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a + function of the coefficients ``coef``. + This is equivalent to :math:`-2\phi` times the score function + :math:`s` (derivative of the log-likelihood). + """ + score = self._score(coef=coef, phi=1, X=X, y=y, weight=weight, + link=link) + return -2*score + + def _deviance_hessian(self, coef, X, y, weight, link): + """The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance + as a function of the coefficients ``coef``. + This is equivalent to :math:`+2\phi` times the observed information + matrix. + """ + info_matrix = self._observed_information(coef=coef, phi=1, + X=X, y=y, weight=weight, link=link) + return 2*info_matrix + + def starting_mu(self, y, weight=1): + """Starting values for the mean mu_i in IRLS.""" + return (weight*y+np.mean(weight*y))/(2.*np.sum(np.ones_like(y)*weight)) + + +class TweedieDistribution(ExponentialDispersionModel): + """A class for the Tweedie distribution. + They have mu=E[X] and Var[X] \propto mu**power. + + Attributes + ---------- + power : float + The variance power of the unit_variance + :math:`v(mu) = mu^{power}`. 
+ """ + def __init__(self, power=0): + self.power = power + self._upper_bound = np.Inf + self._upper_compare = lambda x: np.less(x, self.upper_bound) + if power < 0: + #Extreme Stable + self._lower_bound = -np.Inf + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif power == 0: + #GaussianDistribution + self._lower_bound = -np.Inf + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif (power > 0) and (power < 1): + raise ValueError('For 0 1) and (power < 2): + #Compound Poisson + self._lower_bound = 0 + self._lower_compare = ( + lambda x: np.greater_equal(x, self.lower_bound)) + elif power == 2: + #GammaDistribution + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif (power > 2) and (power < 3): + #Positive Stable + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif power == 3: + #InverseGaussianDistribution + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif power > 3: + #Positive Stable + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + + @property + def power(self): + return self._power + + @power.setter + def power(self, power): + if not isinstance(power, numbers.Real): + raise TypeError('power must be a real number, input was {0}' + .format(power)) + self._power = power + + @property + def lower_bound(self): + return self._lower_bound + + @property + def upper_bound(self): + return self._upper_bound + + def in_y_range(self, x): + return np.logical_and(self._lower_compare(x), self._upper_compare(x)) + + def unit_variance(self, mu): + """The unit variance of a Tweedie distribution is v(mu)=mu**power. + """ + return np.power(mu, self.power) + + def unit_variance_derivative(self, mu): + """The derivative of the unit variance of a Tweedie distribution is + v(mu)=power*mu**(power-1). + """ + return self.power*np.power(mu, self.power-1) + + def unit_deviance(self, y, mu): + p = self.power + if p == 0: + #NormalDistribution + return (y-mu)**2 + if p == 1: + #PoissonDistribution + return 2 * (np.where(y==0,0,y*np.log(y/mu))-y+mu) + elif p == 2: + #GammaDistribution + return 2 * (np.log(mu/y)+y/mu-1) + else: + #return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) + # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) + return 2 * (np.power(np.maximum(y,0), 2-p)/((1-p)*(2-p)) + - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + + def likelihood(self, y, X, beta, phi, weight=1): + raise NotImplementedError('This function is not (yet) implemented.') + + +class NormalDistribution(TweedieDistribution): + """Class for the Normal (aka Gaussian) distribution""" + def __init__(self): + super(NormalDistribution, self).__init__(power=0) + +GaussianDistribution = NormalDistribution + +class PoissonDistribution(TweedieDistribution): + """Class for the scaled Poisson distribution""" + def __init__(self): + super(PoissonDistribution, self).__init__(power=1) + +class GammaDistribution(TweedieDistribution): + """Class for the Gamma distribution""" + def __init__(self): + super(GammaDistribution, self).__init__(power=2) + +class InverseGaussianDistribution(TweedieDistribution): + """Class for the scaled InverseGaussianDistribution distribution""" + def __init__(self): + super(InverseGaussianDistribution, self).__init__(power=3) + +class GeneralizedHyperbolicSecand(ExponentialDispersionModel): + """A class for the von Generalized Hyperbolic Secand (GHS) distribution. 
+ + The GHS distribution is for data y in (-inf, inf). + """ + def __init__(self): + self._lower_bound = -np.Inf + self._upper_bound = np.Inf + + @property + def lower_bound(self): + return self._lower_bound + + @property + def upper_bound(self): + return self._upper_bound + + def in_y_range(self, x): + np.logical_and( + np.greater(x, self.lower_bound), + np.less(x, self.lower_bound) + ) + + def unit_variance(self, mu): + return 1 + mu**2 + + def unit_variance_derivative(self, mu): + return 2*mu + + def unit_deviance(self, y, mu): + return (2*y*(np.arctan(y) - np.arctan(mu)) + + np.log((1+mu**2)/(1+y**2))) + + + +class GeneralizedLinearModel(LinearModel, RegressorMixin): + """ + Class to fit a Generalized Linear Model (GLM) based on reproductive + Exponential Dispersion Models (EDM). + + Assumptions: + + - The target values y_i are realizations of random variables + :math:`Y_i \sim \mathrm{EDM}(\mu_i, \frac{\phi}{w_i})` with dispersion + parameter :math:`\phi` and weights :math:`w_i`. + - The expectation of :math:`Y_i` is :math:`mu_i=\mathrm{E}[Y]=h(\eta_i)` + whith the linear predictor :math:`\eta=X*\beta`, inverse link function + :math:`h(\eta)`, design matrix :math:`X` and parameters :math:`\beta` + to be estimated. + + Note that the first assumption implies + :math:`\mathrm{Var}[Y_i]=\frac{\phi}{w_i} v(\mu_i)` with uni variance + function :math:`v(\mu)`. + + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments :math:`E[Y_i]=\mu_i=h(\eta_i)` and + :math:`Var[Y_i]=\frac{\phi}{w_i} v(\mu_i)` + + The parameters :math:`\beta` are estimated by maximum likelihood which is + equivalent to minimizing the deviance. + + TODO: Estimation of the dispersion parameter phi. + + TODO: Notes on 'scaled' Poisson and weights + + Parameters + ---------- + fit_intercept : boolean, optional, default True + whether to calculate the intercept for this model. If set + to False, no intercept will be used in calculations + (e.g. data is expected to be already centered). + + family : ExponentialDispersionModel, optional, default NormalDistribution() + the distributional assumption of the GLM + + link : Link, optional, default IdentityLink() + the link function (class) of the GLM + + fit_dispersion : {None, 'chisqr', 'deviance'}, defaul 'chisqr' + method for estimation of the dispersion parameter phi. Whether to use + the chi squared statisic or the deviance statistic. If None, the + dispersion is not estimated. + + solver : {'irls', 'newton-cg', 'lbfgs'}, defaul 'irls' + Algorithm to use in the optimization problem. + + - 'irls' is iterated reweighted least squares. It is the standard + algorithm for GLMs. + + - 'newton-cg', 'lbfgs' + + max_iter : int, default 100 + TODO + + tol : float + Stopping criterion. For the irls, newton-cg and lbfgs solvers, + the iteration will stop when ``max{|g_i | i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative of + the deviance). + + start_params : {array shape (n_features, ), 'ols'}, default None + sets the start values for coef_ in the fit. + If None, default values are taken. + If 'ols' the result of an ordinary least squares in the link space + (linear predictor) is taken. + If an array is given, these values are taken as coef_ to start with. + If fit_intercept is true, the first value is assumed to be the start + value for the intercept_. + + verbose : int, default: 0 + For the lbfgs solver set verbose to any positive + number for verbosity. 
+ + Attributes + ---------- + coef_ : array, shape (1, n_features) + Estimated coefficients for the linear predictor (X*coef_) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\phi` if fit_dispersion is set. + + n_iter_ : int + Actual number of iterations of the solver. + + Notes + ----- + + References + ---------- + TODO + """ + + def __init__(self, fit_intercept=True, family=NormalDistribution(), + link=IdentityLink(), fit_dispersion='chisqr', solver='irls', max_iter=100, + tol=1e-4, start_params=None, verbose=0): + self.fit_intercept = fit_intercept + self.family = family + self.link = link + self.fit_dispersion = fit_dispersion + self.solver = solver + self.max_iter = 100 + self.tol = tol + self.start_params = start_params + self.verbose = verbose + + def fit(self, X, y, weight=None): + """ + Fit a generalized linear model. + + Parameters + ---------- + X : numpy array or sparse matrix of shape [n_samples,n_features] + Training data + + y : numpy array of shape [n_samples] + Target values + + weight : numpy array of shape [n_samples] + Individual weights for each sample. + Var[Y_i]=phi/weight_i * v(mu) + If Y_i ~ EDM(mu, phi/w_i) then + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)) + + Returns + ------- + self : returns an instance of self. + """ + if not isinstance(self.family, ExponentialDispersionModel): + raise ValueError("The argument family must be an instance of class" + "ExponentialDispersionModel.") + if not isinstance(self.fit_intercept, bool): + raise ValueError("The argument fit_intercept must be bool," + " got {0}".format(self.fit_intercept)) + if not self.solver in ['irls', 'lbfgs', 'newton-cg']: + raise ValueError("GLM Regression supports only irls, lbfgs and" + "newton-cg solvers, got {0}".format(self.solver)) + if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: + raise ValueError("Maximum number of iteration must be positive;" + " got (max_iter={0!r})".format(self.max_iter)) + if not isinstance(self.tol, numbers.Number) or self.tol < 0: + raise ValueError("Tolerance for stopping criteria must be " + "positive; got (tol={0!r})".format(self.tol)) + start_params = self.start_params + if start_params is not None and start_params is not 'ols': + start_params = np.atleast_1d(start_params) + if start_params.shape[0] != X.shape[1] + self.fit_intercept: + raise ValueError("Start values for parameters must have the" + "right length; required length {0}, got {1}".format( + X.shape[1] + self.fit_intercept, start_params.shape[0])) + + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + y_numeric=True, multi_output=False) + y = y.astype(np.float64) + + if not np.all(self.family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}".format(self.family.__class__.__name__)) + + if weight is None: + weight = np.ones_like(y) + elif np.isscalar(weight): + weight = weight*np.ones_like(y) + else: + weight = np.atleast_1d(weight) + if weight.ndim > 1: + raise ValueError("Weights must be 1D array or scalar") + elif weight.shape[0] != y.shape[0]: + raise ValueError("Weights must have the same length as y") + + + if self.fit_intercept: + #intercept is first column <=> coef[0] is for intecept + if sparse.issparse(X): + Xnew = sparse.hstack([np.ones([X.shape[0],1]), X]) + else: + Xnew = np.concatenate((np.ones((X.shape[0],1)), X), axis=1) + else: + Xnew = X + + n_samples, n_features = Xnew.shape + + #Note: Since phi does not enter 
the estimation of mu_i=E[y_i] + # set it to 1 where convenient. + + #set start values for coef + coef = None + if start_params is None: + #Use mu_start and apply one irls step to calculate coef + mu = self.family.starting_mu(y, weight) + #linear predictor + eta = self.link.link(mu) + #h'(eta) + hp = self.link.inverse_derivative(eta) + #working weights w, in principle a diagonal matrix + #therefore here just as 1d array + w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + wroot = np.sqrt(w) + #working observations + yw = eta + (y-mu)/hp + #least squares rescaled with wroot + wroot = sparse.dia_matrix((wroot, 0), shape=(n_samples, n_samples)) + X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) + yw_rescale = safe_sparse_dot(wroot, y, dense_output=True) + coef = linalg.lstsq(X_rescale, yw_rescale)[0] + elif start_params is 'ols': + reg = LinearRegression(copy_X=False, + fit_intercept=False) + reg.fit(Xnew, self.link.link(y)) + coef = reg.coef_ + else: + coef = start_params + + #algorithms for optimiation + #TODO: Parallelize it + self.n_iter_ = 0 + converged = False + if self.solver == 'irls': + #linear predictor + eta = safe_sparse_dot(Xnew, coef, dense_output=True) + mu = self.link.inverse(eta) + while self.n_iter_ < self.max_iter: + self.n_iter_ += 1 + #coef_old not used so far. + #coef_old = coef + #h'(eta) + hp = self.link.inverse_derivative(eta) + #working weights w, in principle a diagonal matrix + #therefore here just as 1d array + w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + wroot = np.sqrt(w) + #working observations + yw = eta + (y-mu)/hp + #least squares rescaled with wroot + wroot = sparse.dia_matrix((wroot, 0), + shape=(n_samples, n_samples)) + X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) + yw_rescale = safe_sparse_dot(wroot, yw, dense_output=True) + coef, residues, rank, singular_ = ( + linalg.lstsq(X_rescale, yw_rescale)) + + #updated linear predictor + #do it here for updated values for tolerance + eta = safe_sparse_dot(Xnew, coef, dense_output=True) + mu = self.link.inverse(eta) + + #which tolerace? |coef - coef_old| or gradient? + #use gradient for compliance with newton-cg and lbfgs + #TODO: faster computation of gradient, use mu and eta directly + gradient = self.family._deviance_derivative(coef=coef, + X=Xnew, y=y, weight=weight, link=self.link) + if (np.max(np.abs(gradient)) <= self.tol): + converged = True + break + + if not converged: + warnings.warn("irls failed to converge. Increase the number " + "of iterations (currently {0})".format(self.max_iter)) + + #TODO: performance: make one function return both deviance and gradient + elif self.solver == 'lbfgs': + func = self.family._deviance + fprime = self.family._deviance_derivative + args = (Xnew, y, weight, self.link) + coef, loss, info = optimize.fmin_l_bfgs_b( + func, coef, fprime=fprime, + args=args, + iprint=(self.verbose > 0) - 1, pgtol=self.tol, + maxiter=self.max_iter) + if self.verbose > 0: + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." 
+ " Increase the number of iterations.") + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}".format( + info["task"])) + self.n_iter_ = info['nit'] + elif self.solver == 'newton-cg': + func = self.family._deviance + grad = self.family._deviance_derivative + def grad_hess(coef, X, y, weight, link): + grad = (self.family + ._deviance_derivative(coef, X, y, weight, link)) + hessian = (self.family + ._deviance_hessian(coef, X, y, weight,link)) + def Hs(s): + ret = np.dot(hessian, s) + return ret + return grad, Hs + hess = grad_hess + args = (Xnew, y, weight, self.link) + coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, + maxiter=self.max_iter, tol=self.tol) + self.coef_ = coef + + if self.fit_intercept is True: + self.intercept_ = coef[0] + self.coef_ = coef[1:] + else: + self.coef_ = coef + + if self.fit_dispersion in ['chisqr', 'deviance']: + self.dispersion_ = self.estimate_phi(y, X, weight) + + return self + + def predict(self, X, weight=1): + check_is_fitted(self, "coef_") + eta = safe_sparse_dot(X, self.coef_, dense_output=True) + if self.fit_intercept is True: + eta += self.intercept_ + mu = self.link.inverse(eta) + return mu*weight + + def estimate_phi(self, y, X, weight): + n_samples, n_features = X.shape + eta = safe_sparse_dot(X, self.coef_, dense_output=True) + if self.fit_intercept is True: + eta += self.intercept_ + mu = self.link.inverse(eta) + if self.fit_dispersion == 'chisqr': + chisq = np.sum(weight*(y-mu)**2/self.family.unit_variance(mu)) + return chisq/(n_samples - n_features) + elif self.fit_dispersion == 'deviance': + dev = self.family.deviance(y, mu, weight) + return dev/(n_samples - n_features) + + def score(self, X, y, weight=1): + """The natural score for a GLM is -deviance. + Returns the weight averaged negitive deviance (the better the score, + the better the fit). Maximum score is therefore 0. + """ + #RegressorMixin has R^2 score. 
+ #TODO: Make it more compatible with the score function in + # sklearn.metrics.regression.py + eta = safe_sparse_dot(X, self.coef_, dense_output=True) + if self.fit_intercept is True: + eta += self.intercept_ + mu = self.link.inverse(eta) + output_errors = self.family.unit_deviance(y,mu) + weight = weight * np.ones_like(y) + return np.average(output_errors, weights=weight) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py new file mode 100644 index 0000000000000..2a88a7ec899d9 --- /dev/null +++ b/sklearn/linear_model/tests/test_glm.py @@ -0,0 +1,73 @@ +import numpy as np + +from sklearn.linear_model.glm import (Link, IdentityLink, LogLink, + ExponentialDispersionModel, TweedieDistribution, + NormalDistribution, GaussianDistribution, + PoissonDistribution, GammaDistribution, + InverseGaussianDistribution, GeneralizedHyperbolicSecand, + GeneralizedLinearModel) + +from sklearn.utils.testing import (assert_equal, assert_array_equal, + assert_array_almost_equal) + +def test_family_bounds(): + """Test the valid range of distributions + """ + family = NormalDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [True, True, True]) + + family = PoissonDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, True, True]) + + family = TweedieDistribution(power=1.5) + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, True, True]) + + family = GammaDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, False, True]) + + family = InverseGaussianDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, False, True]) + + family = TweedieDistribution(power=4.5) + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, False, True]) + +def test_glm_identiy_regression(): + """Test linear regression on a simple dataset + """ + coef = [1,2] + X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + y = np.dot(X, coef) + for solver in ['irls', 'lbfgs', 'newton-cg']: + for family in (GaussianDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)): + glm = GeneralizedLinearModel(family=family, + fit_intercept=False, solver=solver) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + +def test_glm_log_regression(): + """Test linear regression on a simple dataset + """ + coef = [1,2] + X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + y = np.exp(np.dot(X, coef)) + #for solver in ['irls', 'lbfgs', 'newton-cg']: + for solver in ['irls']: + #for family in [GaussianDistribution(), PoissonDistribution(), + # GammaDistribution(), InverseGaussianDistribution(), + # TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)]: + for family in [GaussianDistribution()]: + glm = GeneralizedLinearModel(family=family, + link=LogLink(), + fit_intercept=False, solver=solver, start_params='ols') + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + +#TODO: Test compatibility with R's glm, glmnet From 2fc189d8351c9710c1329750545539afe3e6e40c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 19 Jul 2017 17:33:04 +0200 Subject: [PATCH 002/209] [WIP] Add Generalized Linear Models (#9405) * Fixed pep8 * Fixed flake8 * Rename GeneralizedLinearModel as GeneralizedLinearRegressor * Use of six.with_metaclass * PEP257: summary should be on same line as quotes * Docstring of class 
GeneralizedLinearRegressor: \ before mu * Arguments family and link accept strings * Use of ConvergenceWarning --- sklearn/linear_model/__init__.py | 13 +- sklearn/linear_model/glm.py | 231 ++++++++++++++----------- sklearn/linear_model/tests/test_glm.py | 75 ++++---- 3 files changed, 180 insertions(+), 139 deletions(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 5acc51e9dc87f..0c5840f343a3a 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,12 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from .glm import (Link, IdentityLink, LogLink, - ExponentialDispersionModel, TweedieDistribution, - NormalDistribution, GaussianDistribution, - PoissonDistribution, GammaDistribution, - InverseGaussianDistribution, GeneralizedHyperbolicSecand, - GeneralizedLinearModel) +from .glm import (TweedieDistribution, + GeneralizedLinearRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -44,7 +40,6 @@ 'BayesianRidge', 'ElasticNet', 'ElasticNetCV', - 'GeneralizedLinearModel', 'Hinge', 'Huber', 'HuberRegressor', @@ -84,4 +79,6 @@ 'orthogonal_mp', 'orthogonal_mp_gram', 'ridge_regression', - 'RANSACRegressor'] + 'RANSACRegressor', + 'GeneralizedLinearRegressor', + 'TweedieDistribution'] diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 8b6eb8f3bf16c..cf91a64fafc12 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -5,9 +5,6 @@ # Author: Christian Lorentzen # License: BSD 3 clause -# TODO: Which name? GeneralizedLinearModel vs GeneralizedLinearRegression. -# So far, it is GeneralizedLinearModel, since it could very easily -# extended by Bernoulli/Binomial distribution. # TODO: Which name/symbol for coefficients and weights in docu? # sklearn.linear_models uses w for coefficients. # So far, coefficients=beta and weight=w (as standard literature) @@ -17,6 +14,10 @@ # TODO: Write docu and examples # Design Decisions: +# - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. +# So far, it is GeneralizedLinearModel, since it could very easily +# extended by Bernoulli/Binomial distribution. +# Solution: GeneralizedLinearRegressor # - The link funtion (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. @@ -28,16 +29,17 @@ import numpy as np from scipy import linalg, optimize, sparse import warnings -from .base import LinearModel, LinearRegression -from ..base import RegressorMixin +from .base import LinearRegression +from ..base import BaseEstimator, RegressorMixin +from ..exceptions import ConvergenceWarning +from ..externals import six from ..utils import check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg from ..utils.validation import check_is_fitted - -class Link(metaclass=ABCMeta): +class Link(six.with_metaclass(ABCMeta)): """Abstract base class for Link funtions """ @@ -73,6 +75,7 @@ def inverse_derivative2(self, lin_pred): """ raise NotImplementedError + class IdentityLink(Link): """The identity link function g(x)=x. 
""" @@ -113,7 +116,7 @@ def inverse_derivative2(self, lin_pred): return np.exp(lin_pred) -class ExponentialDispersionModel(metaclass=ABCMeta): +class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): """Base class for reproductive Exponential Dispersion Models (EDM). The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by @@ -235,7 +238,7 @@ def deviance(self, y, mu, weight=1): \left(loglike(y,\mu,\frac{phi}{w}) - loglike(y,y,\frac{phi}{w})\right).` """ - return np.sum(weight*self.unit_deviance(y,mu)) + return np.sum(weight*self.unit_deviance(y, mu)) def _deviance(self, coef, X, y, weight, link): """The deviance as a function of the coefficients ``coef`` @@ -248,7 +251,7 @@ def _deviance(self, coef, X, y, weight, link): def deviance_derivative(self, y, mu, weight=1): """The derivative w.r.t. mu of the deviance.` """ - return weight*self.unit_deviance_derivative(y,mu) + return weight*self.unit_deviance_derivative(y, mu) def _score(self, coef, phi, X, y, weight, link): """The score function :math:`s` is the derivative of the @@ -269,7 +272,7 @@ def _score(self, coef, phi, X, y, weight, link): sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) d = link.inverse_derivative(lin_pred) d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=False) score = safe_sparse_dot(X.T, temp, dense_output=False) return score @@ -294,7 +297,7 @@ def _fisher_matrix(self, coef, phi, X, y, weight, link): sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) d2 = link.inverse_derivative(lin_pred)**2 d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) return fisher_matrix @@ -329,7 +332,7 @@ def _observed_information(self, coef, phi, X, y, weight, link): v = self.unit_variance_derivative(mu)/self.unit_variance(mu) r = y - mu temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) temp = safe_sparse_dot(temp, X, dense_output=False) observed_information = safe_sparse_dot(X.T, temp, dense_output=False) return observed_information @@ -341,7 +344,7 @@ def _deviance_derivative(self, coef, X, y, weight, link): :math:`s` (derivative of the log-likelihood). """ score = self._score(coef=coef, phi=1, X=X, y=y, weight=weight, - link=link) + link=link) return -2*score def _deviance_hessian(self, coef, X, y, weight, link): @@ -350,8 +353,8 @@ def _deviance_hessian(self, coef, X, y, weight, link): This is equivalent to :math:`+2\phi` times the observed information matrix. 
""" - info_matrix = self._observed_information(coef=coef, phi=1, - X=X, y=y, weight=weight, link=link) + info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, + weight=weight, link=link) return 2*info_matrix def starting_mu(self, y, weight=1): @@ -374,39 +377,39 @@ def __init__(self, power=0): self._upper_bound = np.Inf self._upper_compare = lambda x: np.less(x, self.upper_bound) if power < 0: - #Extreme Stable + # Extreme Stable self._lower_bound = -np.Inf self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif power == 0: - #GaussianDistribution + # GaussianDistribution self._lower_bound = -np.Inf self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif (power > 0) and (power < 1): raise ValueError('For 0 1) and (power < 2): - #Compound Poisson + # Compound Poisson self._lower_bound = 0 self._lower_compare = ( lambda x: np.greater_equal(x, self.lower_bound)) elif power == 2: - #GammaDistribution + # GammaDistribution self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif (power > 2) and (power < 3): - #Positive Stable + # Positive Stable self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif power == 3: - #InverseGaussianDistribution + # InverseGaussianDistribution self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif power > 3: - #Positive Stable + # Positive Stable self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) @@ -418,7 +421,7 @@ def power(self): def power(self, power): if not isinstance(power, numbers.Real): raise TypeError('power must be a real number, input was {0}' - .format(power)) + .format(power)) self._power = power @property @@ -446,19 +449,19 @@ def unit_variance_derivative(self, mu): def unit_deviance(self, y, mu): p = self.power if p == 0: - #NormalDistribution + # NormalDistribution return (y-mu)**2 if p == 1: - #PoissonDistribution - return 2 * (np.where(y==0,0,y*np.log(y/mu))-y+mu) + # PoissonDistribution + return 2 * (np.where(y == 0, 0, y*np.log(y/mu))-y+mu) elif p == 2: - #GammaDistribution + # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) else: - #return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) + # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) - return 2 * (np.power(np.maximum(y,0), 2-p)/((1-p)*(2-p)) - - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - + y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) def likelihood(self, y, X, beta, phi, weight=1): raise NotImplementedError('This function is not (yet) implemented.') @@ -469,23 +472,25 @@ class NormalDistribution(TweedieDistribution): def __init__(self): super(NormalDistribution, self).__init__(power=0) -GaussianDistribution = NormalDistribution class PoissonDistribution(TweedieDistribution): """Class for the scaled Poisson distribution""" def __init__(self): super(PoissonDistribution, self).__init__(power=1) + class GammaDistribution(TweedieDistribution): """Class for the Gamma distribution""" def __init__(self): super(GammaDistribution, self).__init__(power=2) + class InverseGaussianDistribution(TweedieDistribution): """Class for the scaled InverseGaussianDistribution distribution""" def __init__(self): super(InverseGaussianDistribution, self).__init__(power=3) + class GeneralizedHyperbolicSecand(ExponentialDispersionModel): """A class for the von Generalized Hyperbolic Secand (GHS) distribution. 
@@ -516,12 +521,11 @@ def unit_variance_derivative(self, mu): return 2*mu def unit_deviance(self, y, mu): - return (2*y*(np.arctan(y) - np.arctan(mu)) - + np.log((1+mu**2)/(1+y**2))) - + return (2*y*(np.arctan(y) - np.arctan(mu)) + + np.log((1+mu**2)/(1+y**2))) -class GeneralizedLinearModel(LinearModel, RegressorMixin): +class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """ Class to fit a Generalized Linear Model (GLM) based on reproductive Exponential Dispersion Models (EDM). @@ -531,7 +535,7 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): - The target values y_i are realizations of random variables :math:`Y_i \sim \mathrm{EDM}(\mu_i, \frac{\phi}{w_i})` with dispersion parameter :math:`\phi` and weights :math:`w_i`. - - The expectation of :math:`Y_i` is :math:`mu_i=\mathrm{E}[Y]=h(\eta_i)` + - The expectation of :math:`Y_i` is :math:`\mu_i=\mathrm{E}[Y]=h(\eta_i)` whith the linear predictor :math:`\eta=X*\beta`, inverse link function :math:`h(\eta)`, design matrix :math:`X` and parameters :math:`\beta` to be estimated. @@ -549,7 +553,9 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): TODO: Estimation of the dispersion parameter phi. - TODO: Notes on 'scaled' Poisson and weights + TODO: Notes on weights and 'scaled' Poisson, e.g. fit y = x/w with + with x=counts and w=exposure (time, money, persons, ...) => y is a + ratio with weights w. Parameters ---------- @@ -558,10 +564,12 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): to False, no intercept will be used in calculations (e.g. data is expected to be already centered). - family : ExponentialDispersionModel, optional, default NormalDistribution() - the distributional assumption of the GLM + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance + of a subclass of ExponentialDispersionModel, optional, default 'normal' + the distributional assumption of the GLM. - link : Link, optional, default IdentityLink() + link : {'identity', 'log'} or an instance of a subclass of Link, + optional, default IdentityLink() the link function (class) of the GLM fit_dispersion : {None, 'chisqr', 'deviance'}, defaul 'chisqr' @@ -622,8 +630,8 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): """ def __init__(self, fit_intercept=True, family=NormalDistribution(), - link=IdentityLink(), fit_dispersion='chisqr', solver='irls', max_iter=100, - tol=1e-4, start_params=None, verbose=0): + link=IdentityLink(), fit_dispersion='chisqr', solver='irls', + max_iter=100, tol=1e-4, start_params=None, verbose=0): self.fit_intercept = fit_intercept self.family = family self.link = link @@ -635,8 +643,7 @@ def __init__(self, fit_intercept=True, family=NormalDistribution(), self.verbose = verbose def fit(self, X, y, weight=None): - """ - Fit a generalized linear model. + """Fit a generalized linear model. Parameters ---------- @@ -657,12 +664,32 @@ def fit(self, X, y, weight=None): self : returns an instance of self. 
""" if not isinstance(self.family, ExponentialDispersionModel): - raise ValueError("The argument family must be an instance of class" - "ExponentialDispersionModel.") + if self.family == 'normal': + self.family = NormalDistribution() + elif self.family == 'poisson': + self.family = PoissonDistribution() + elif self.family == 'gamma': + self.family = GammaDistribution() + elif self.family == 'inverse.gaussian': + self.family = InverseGaussianDistribution() + else: + raise ValueError( + "The argument family must be an instance of class" + " ExponentialDispersionModel or an element of" + " ['normal', 'poisson', 'gamma', 'inverse.gaussian'].") + if not isinstance(self.link, Link): + if self.link == 'identity': + self.link = IdentityLink() + if self.link == 'log': + self.link = LogLink() + else: + raise ValueError( + "The argument link must be an instance of class Link or" + " an element of ['identity', 'log'].") if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool," " got {0}".format(self.fit_intercept)) - if not self.solver in ['irls', 'lbfgs', 'newton-cg']: + if self.solver not in ['irls', 'lbfgs', 'newton-cg']: raise ValueError("GLM Regression supports only irls, lbfgs and" "newton-cg solvers, got {0}".format(self.solver)) if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: @@ -676,8 +703,9 @@ def fit(self, X, y, weight=None): start_params = np.atleast_1d(start_params) if start_params.shape[0] != X.shape[1] + self.fit_intercept: raise ValueError("Start values for parameters must have the" - "right length; required length {0}, got {1}".format( - X.shape[1] + self.fit_intercept, start_params.shape[0])) + "right length; required length {0}, got {1}" + .format(X.shape[1] + self.fit_intercept, + start_params.shape[0])) X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], y_numeric=True, multi_output=False) @@ -685,7 +713,8 @@ def fit(self, X, y, weight=None): if not np.all(self.family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " - "range for family {0}".format(self.family.__class__.__name__)) + "range for family {0}" + .format(self.family.__class__.__name__)) if weight is None: weight = np.ones_like(y) @@ -698,96 +727,96 @@ def fit(self, X, y, weight=None): elif weight.shape[0] != y.shape[0]: raise ValueError("Weights must have the same length as y") - if self.fit_intercept: - #intercept is first column <=> coef[0] is for intecept + # intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): - Xnew = sparse.hstack([np.ones([X.shape[0],1]), X]) + Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) else: - Xnew = np.concatenate((np.ones((X.shape[0],1)), X), axis=1) + Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) else: Xnew = X n_samples, n_features = Xnew.shape - #Note: Since phi does not enter the estimation of mu_i=E[y_i] - # set it to 1 where convenient. + # Note: Since dispersion_ alias phi does not enter the estimation + # of mu_i=E[y_i] set it to 1 where convenient. 
- #set start values for coef + # set start values for coef coef = None if start_params is None: - #Use mu_start and apply one irls step to calculate coef + # Use mu_start and apply one irls step to calculate coef mu = self.family.starting_mu(y, weight) - #linear predictor + # linear predictor eta = self.link.link(mu) - #h'(eta) + # h'(eta) hp = self.link.inverse_derivative(eta) - #working weights w, in principle a diagonal matrix - #therefore here just as 1d array + # working weights w, in principle a diagonal matrix + # therefore here just as 1d array w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) - #working observations + # working observations yw = eta + (y-mu)/hp - #least squares rescaled with wroot + # least squares rescaled with wroot wroot = sparse.dia_matrix((wroot, 0), shape=(n_samples, n_samples)) X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) yw_rescale = safe_sparse_dot(wroot, y, dense_output=True) coef = linalg.lstsq(X_rescale, yw_rescale)[0] elif start_params is 'ols': - reg = LinearRegression(copy_X=False, - fit_intercept=False) + reg = LinearRegression(copy_X=False, fit_intercept=False) reg.fit(Xnew, self.link.link(y)) coef = reg.coef_ else: coef = start_params - #algorithms for optimiation - #TODO: Parallelize it + # algorithms for optimiation + # TODO: Parallelize it self.n_iter_ = 0 converged = False if self.solver == 'irls': - #linear predictor + # linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = self.link.inverse(eta) while self.n_iter_ < self.max_iter: self.n_iter_ += 1 - #coef_old not used so far. - #coef_old = coef - #h'(eta) + # coef_old not used so far. + # coef_old = coef + # h'(eta) hp = self.link.inverse_derivative(eta) - #working weights w, in principle a diagonal matrix - #therefore here just as 1d array + # working weights w, in principle a diagonal matrix + # therefore here just as 1d array w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) - #working observations + # working observations yw = eta + (y-mu)/hp - #least squares rescaled with wroot + # least squares rescaled with wroot wroot = sparse.dia_matrix((wroot, 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) yw_rescale = safe_sparse_dot(wroot, yw, dense_output=True) - coef, residues, rank, singular_ = ( + coef, residues, rank, singular_ = ( linalg.lstsq(X_rescale, yw_rescale)) - #updated linear predictor - #do it here for updated values for tolerance + # updated linear predictor + # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = self.link.inverse(eta) - #which tolerace? |coef - coef_old| or gradient? - #use gradient for compliance with newton-cg and lbfgs - #TODO: faster computation of gradient, use mu and eta directly - gradient = self.family._deviance_derivative(coef=coef, - X=Xnew, y=y, weight=weight, link=self.link) + # which tolerace? |coef - coef_old| or gradient? + # use gradient for compliance with newton-cg and lbfgs + # TODO: faster computation of gradient, use mu and eta directly + gradient = self.family._deviance_derivative( + coef=coef, X=Xnew, y=y, weight=weight, link=self.link) if (np.max(np.abs(gradient)) <= self.tol): converged = True break if not converged: warnings.warn("irls failed to converge. 
Increase the number " - "of iterations (currently {0})".format(self.max_iter)) + "of iterations (currently {0})" + .format(self.max_iter), ConvergenceWarning) - #TODO: performance: make one function return both deviance and gradient + # TODO: performance: make one function return both deviance and + # gradient of deviance elif self.solver == 'lbfgs': func = self.family._deviance fprime = self.family._deviance_derivative @@ -800,7 +829,8 @@ def fit(self, X, y, weight=None): if self.verbose > 0: if info["warnflag"] == 1: warnings.warn("lbfgs failed to converge." - " Increase the number of iterations.") + " Increase the number of iterations.", + ConvergenceWarning) elif info["warnflag"] == 2: warnings.warn("lbfgs failed for the reason: {0}".format( info["task"])) @@ -808,11 +838,13 @@ def fit(self, X, y, weight=None): elif self.solver == 'newton-cg': func = self.family._deviance grad = self.family._deviance_derivative + def grad_hess(coef, X, y, weight, link): - grad = (self.family - ._deviance_derivative(coef, X, y, weight, link)) - hessian = (self.family - ._deviance_hessian(coef, X, y, weight,link)) + grad = (self.family._deviance_derivative( + coef, X, y, weight, link)) + hessian = (self.family._deviance_hessian( + coef, X, y, weight, link)) + def Hs(s): ret = np.dot(hessian, s) return ret @@ -820,7 +852,7 @@ def Hs(s): hess = grad_hess args = (Xnew, y, weight, self.link) coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, - maxiter=self.max_iter, tol=self.tol) + maxiter=self.max_iter, tol=self.tol) self.coef_ = coef if self.fit_intercept is True: @@ -835,6 +867,9 @@ def Hs(s): return self def predict(self, X, weight=1): + """Prediction with features X. + If weights are given, returns prediction*weights. + """ check_is_fitted(self, "coef_") eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: @@ -860,13 +895,13 @@ def score(self, X, y, weight=1): Returns the weight averaged negitive deviance (the better the score, the better the fit). Maximum score is therefore 0. """ - #RegressorMixin has R^2 score. - #TODO: Make it more compatible with the score function in + # RegressorMixin has R^2 score. 
+ # TODO: Make it more compatible with the score function in # sklearn.metrics.regression.py eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ mu = self.link.inverse(eta) - output_errors = self.family.unit_deviance(y,mu) + output_errors = self.family.unit_deviance(y, mu) weight = weight * np.ones_like(y) return np.average(output_errors, weights=weight) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 2a88a7ec899d9..a4d4ea8650860 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,73 +1,82 @@ import numpy as np -from sklearn.linear_model.glm import (Link, IdentityLink, LogLink, - ExponentialDispersionModel, TweedieDistribution, - NormalDistribution, GaussianDistribution, - PoissonDistribution, GammaDistribution, - InverseGaussianDistribution, GeneralizedHyperbolicSecand, - GeneralizedLinearModel) +from sklearn.linear_model.glm import ( + # Link, IdentityLink, + LogLink, + TweedieDistribution, + NormalDistribution, PoissonDistribution, + GammaDistribution, InverseGaussianDistribution, + # GeneralizedHyperbolicSecand, + GeneralizedLinearRegressor) + +from sklearn.utils.testing import ( + # assert_equal, + assert_array_equal, assert_array_almost_equal) -from sklearn.utils.testing import (assert_equal, assert_array_equal, - assert_array_almost_equal) def test_family_bounds(): """Test the valid range of distributions """ family = NormalDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [True, True, True]) family = PoissonDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, True, True]) family = TweedieDistribution(power=1.5) - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, True, True]) family = GammaDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, False, True]) family = InverseGaussianDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, False, True]) family = TweedieDistribution(power=4.5) - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, False, True]) + def test_glm_identiy_regression(): """Test linear regression on a simple dataset """ - coef = [1,2] - X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + coef = [1, 2] + X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) + families = ( + NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) for solver in ['irls', 'lbfgs', 'newton-cg']: - for family in (GaussianDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)): - glm = GeneralizedLinearModel(family=family, - fit_intercept=False, solver=solver) + for family in families: + glm = GeneralizedLinearRegressor( + family=family, fit_intercept=False, solver=solver) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) + def test_glm_log_regression(): """Test linear regression on a simple dataset """ - coef = [1,2] - X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + coef = [1, 2] + X = 
np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) - #for solver in ['irls', 'lbfgs', 'newton-cg']: - for solver in ['irls']: - #for family in [GaussianDistribution(), PoissonDistribution(), - # GammaDistribution(), InverseGaussianDistribution(), - # TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)]: - for family in [GaussianDistribution()]: - glm = GeneralizedLinearModel(family=family, - link=LogLink(), - fit_intercept=False, solver=solver, start_params='ols') + families = ( + NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) + for solver in ['irls', 'lbfgs', 'newton-cg']: + for family in families: + glm = GeneralizedLinearRegressor( + family=family, link=LogLink(), fit_intercept=False, + solver=solver, start_params='ols') res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) -#TODO: Test compatibility with R's glm, glmnet + +# TODO: Test compatibility with R's glm, glmnet From a6137d85401ef72976327c211f44d721d9f81e00 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 19 Jul 2017 17:41:49 +0200 Subject: [PATCH 003/209] [WIP] Add Generalized Linear Models (#9405) * GeneralizedLinearRegressor added to doc/modules/classes.rst --- doc/modules/classes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index e09ca0422d8a7..2d451b6758eb1 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -720,6 +720,7 @@ Kernels: linear_model.BayesianRidge linear_model.ElasticNet linear_model.ElasticNetCV + linear_model.GeneralizedLinearRegressor linear_model.HuberRegressor linear_model.Lars linear_model.LarsCV From b0be167080588a35dc1f4b762d961edb897b1019 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 9 Aug 2017 13:38:49 +0200 Subject: [PATCH 004/209] [WIP] Add Generalized Linear Models (#9405) * fixed bug: init parameter max_iter * fix API for family and link: default parameter changed to string non public variables self._family_instance and self._link_instance * fixed bug in score, minus sign forgotten * added check_is_fitted to estimate_phi and score * added check_array(X) in predict * replaced lambda functions in TweedieDistribution * some documentation --- sklearn/linear_model/glm.py | 196 ++++++++++++++++++++++-------------- 1 file changed, 122 insertions(+), 74 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index cf91a64fafc12..0ee1564049329 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -15,9 +15,10 @@ # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. -# So far, it is GeneralizedLinearModel, since it could very easily -# extended by Bernoulli/Binomial distribution. -# Solution: GeneralizedLinearRegressor +# Estimators in sklearn are either regressors or classifiers. A Generalized +# Linear Model does both depending on the chosen distribution, e.g. Normal => +# regressor, Bernoulli/Binomial => classifier. +# Solution: GeneralizedLinearRegressor since this is the focus. # - The link funtion (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. 
@@ -33,7 +34,7 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six -from ..utils import check_X_y +from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg from ..utils.validation import check_is_fitted @@ -372,46 +373,67 @@ class TweedieDistribution(ExponentialDispersionModel): The variance power of the unit_variance :math:`v(mu) = mu^{power}`. """ + def _less_upper_bound(self, x): + return np.less(x, self.upper_bound) + + def _less_equal_upper_bound(self, x): + return np.less_equal(x, self.upper_bound) + + def _greater_lower_bound(self, x): + return np.greater(x, self.lower_bound) + + def _greater_equal_lower_bound(self, x): + return np.greater_equal(x, self.lower_bound) + def __init__(self, power=0): self.power = power self._upper_bound = np.Inf - self._upper_compare = lambda x: np.less(x, self.upper_bound) + # self._upper_compare = lambda x: np.less(x, self.upper_bound) + self._upper_compare = self._less_upper_bound if power < 0: # Extreme Stable self._lower_bound = -np.Inf - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif power == 0: - # GaussianDistribution + # NormalDistribution self._lower_bound = -np.Inf - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif (power > 0) and (power < 1): raise ValueError('For 0 1) and (power < 2): # Compound Poisson self._lower_bound = 0 - self._lower_compare = ( - lambda x: np.greater_equal(x, self.lower_bound)) + # self._lower_compare = ( + # lambda x: np.greater_equal(x, self.lower_bound)) + self._lower_compare = self._greater_equal_lower_bound elif power == 2: # GammaDistribution self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif (power > 2) and (power < 3): # Positive Stable self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif power == 3: # InverseGaussianDistribution self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif power > 3: # Positive Stable self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound @property def power(self): @@ -530,6 +552,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Class to fit a Generalized Linear Model (GLM) based on reproductive Exponential Dispersion Models (EDM). + #TODO: This belongs to User Guide Assumptions: - The target values y_i are realizations of random variables @@ -559,25 +582,26 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Parameters ---------- - fit_intercept : boolean, optional, default True - whether to calculate the intercept for this model. If set - to False, no intercept will be used in calculations - (e.g. 
data is expected to be already centered). + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance - of a subclass of ExponentialDispersionModel, optional, default 'normal' + of a subclass of ExponentialDispersionModel, optional + (default='normal') the distributional assumption of the GLM. link : {'identity', 'log'} or an instance of a subclass of Link, - optional, default IdentityLink() - the link function (class) of the GLM + optional (default='identity') + the link function of the GLM, i.e. mapping from linear predictor + (X*coef) to expectation (mu). - fit_dispersion : {None, 'chisqr', 'deviance'}, defaul 'chisqr' + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul='chisqr') method for estimation of the dispersion parameter phi. Whether to use the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'irls', 'newton-cg', 'lbfgs'}, defaul 'irls' + solver : {'irls', 'newton-cg', 'lbfgs'}, optional (defaul='irls') Algorithm to use in the optimization problem. - 'irls' is iterated reweighted least squares. It is the standard @@ -585,16 +609,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'newton-cg', 'lbfgs' - max_iter : int, default 100 + max_iter : int, optional (default=100) TODO - tol : float + tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i | i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative of the deviance). - start_params : {array shape (n_features, ), 'ols'}, default None + start_params : {array shape (n_features, ), 'ols'}, optional (default=None) sets the start values for coef_ in the fit. If None, default values are taken. If 'ols' the result of an ordinary least squares in the link space @@ -603,9 +627,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If fit_intercept is true, the first value is assumed to be the start value for the intercept_. - verbose : int, default: 0 - For the lbfgs solver set verbose to any positive - number for verbosity. + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. Attributes ---------- @@ -629,15 +652,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): TODO """ - def __init__(self, fit_intercept=True, family=NormalDistribution(), - link=IdentityLink(), fit_dispersion='chisqr', solver='irls', + def __init__(self, fit_intercept=True, family='normal', + link='identity', fit_dispersion='chisqr', solver='irls', max_iter=100, tol=1e-4, start_params=None, verbose=0): self.fit_intercept = fit_intercept self.family = family self.link = link self.fit_dispersion = fit_dispersion self.solver = solver - self.max_iter = 100 + self.max_iter = max_iter self.tol = tol self.start_params = start_params self.verbose = verbose @@ -663,29 +686,38 @@ def fit(self, X, y, weight=None): ------- self : returns an instance of self. 
""" - if not isinstance(self.family, ExponentialDispersionModel): + # Garantee that self._family_instance is an instance of class + # ExponentialDispersionModel + if isinstance(self.family, ExponentialDispersionModel): + self._family_instance = self.family + else: if self.family == 'normal': - self.family = NormalDistribution() + self._family_instance = NormalDistribution() elif self.family == 'poisson': - self.family = PoissonDistribution() + self._family_instance = PoissonDistribution() elif self.family == 'gamma': - self.family = GammaDistribution() + self._family_instance = GammaDistribution() elif self.family == 'inverse.gaussian': - self.family = InverseGaussianDistribution() + self._family_instance = InverseGaussianDistribution() else: raise ValueError( - "The argument family must be an instance of class" + "The family must be an instance of class" " ExponentialDispersionModel or an element of" " ['normal', 'poisson', 'gamma', 'inverse.gaussian'].") - if not isinstance(self.link, Link): + + # Garantee that self._link_instance is set to an instance of class Link + if isinstance(self.link, Link): + self._link_instance = self.link + else: if self.link == 'identity': - self.link = IdentityLink() - if self.link == 'log': - self.link = LogLink() + self._link_instance = IdentityLink() + elif self.link == 'log': + self._link_instance = LogLink() else: raise ValueError( - "The argument link must be an instance of class Link or" + "The link must be an instance of class Link or" " an element of ['identity', 'log'].") + if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool," " got {0}".format(self.fit_intercept)) @@ -711,10 +743,13 @@ def fit(self, X, y, weight=None): y_numeric=True, multi_output=False) y = y.astype(np.float64) - if not np.all(self.family.in_y_range(y)): + family = self._family_instance + link = self._link_instance + + if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" - .format(self.family.__class__.__name__)) + .format(family.__class__.__name__)) if weight is None: weight = np.ones_like(y) @@ -745,14 +780,14 @@ def fit(self, X, y, weight=None): coef = None if start_params is None: # Use mu_start and apply one irls step to calculate coef - mu = self.family.starting_mu(y, weight) + mu = family.starting_mu(y, weight) # linear predictor - eta = self.link.link(mu) + eta = link.link(mu) # h'(eta) - hp = self.link.inverse_derivative(eta) + hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -763,7 +798,7 @@ def fit(self, X, y, weight=None): coef = linalg.lstsq(X_rescale, yw_rescale)[0] elif start_params is 'ols': reg = LinearRegression(copy_X=False, fit_intercept=False) - reg.fit(Xnew, self.link.link(y)) + reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: coef = start_params @@ -775,16 +810,16 @@ def fit(self, X, y, weight=None): if self.solver == 'irls': # linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = self.link.inverse(eta) + mu = link.inverse(eta) while self.n_iter_ < self.max_iter: self.n_iter_ += 1 # coef_old not used so far. 
# coef_old = coef # h'(eta) - hp = self.link.inverse_derivative(eta) + hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -799,13 +834,13 @@ def fit(self, X, y, weight=None): # updated linear predictor # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = self.link.inverse(eta) + mu = link.inverse(eta) # which tolerace? |coef - coef_old| or gradient? # use gradient for compliance with newton-cg and lbfgs # TODO: faster computation of gradient, use mu and eta directly - gradient = self.family._deviance_derivative( - coef=coef, X=Xnew, y=y, weight=weight, link=self.link) + gradient = family._deviance_derivative( + coef=coef, X=Xnew, y=y, weight=weight, link=link) if (np.max(np.abs(gradient)) <= self.tol): converged = True break @@ -818,9 +853,9 @@ def fit(self, X, y, weight=None): # TODO: performance: make one function return both deviance and # gradient of deviance elif self.solver == 'lbfgs': - func = self.family._deviance - fprime = self.family._deviance_derivative - args = (Xnew, y, weight, self.link) + func = family._deviance + fprime = family._deviance_derivative + args = (Xnew, y, weight, link) coef, loss, info = optimize.fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, @@ -836,13 +871,13 @@ def fit(self, X, y, weight=None): info["task"])) self.n_iter_ = info['nit'] elif self.solver == 'newton-cg': - func = self.family._deviance - grad = self.family._deviance_derivative + func = family._deviance + grad = family._deviance_derivative def grad_hess(coef, X, y, weight, link): - grad = (self.family._deviance_derivative( + grad = (family._deviance_derivative( coef, X, y, weight, link)) - hessian = (self.family._deviance_hessian( + hessian = (family._deviance_hessian( coef, X, y, weight, link)) def Hs(s): @@ -850,7 +885,7 @@ def Hs(s): return ret return grad, Hs hess = grad_hess - args = (Xnew, y, weight, self.link) + args = (Xnew, y, weight, link) coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, maxiter=self.max_iter, tol=self.tol) self.coef_ = coef @@ -871,37 +906,50 @@ def predict(self, X, weight=1): If weights are given, returns prediction*weights. """ check_is_fitted(self, "coef_") + X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) + # TODO: validation of weight eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ - mu = self.link.inverse(eta) + mu = self._link_instance.inverse(eta) return mu*weight def estimate_phi(self, y, X, weight): + """Estimation of the dispersion parameter. + Returns the estimate. 
+ """ + check_is_fitted(self, "coef_") n_samples, n_features = X.shape eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ - mu = self.link.inverse(eta) + mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': - chisq = np.sum(weight*(y-mu)**2/self.family.unit_variance(mu)) + chisq = np.sum(weight*(y-mu)**2 / + self._family_instance.unit_variance(mu)) return chisq/(n_samples - n_features) elif self.fit_dispersion == 'deviance': - dev = self.family.deviance(y, mu, weight) + dev = self._family_instance.deviance(y, mu, weight) return dev/(n_samples - n_features) +# TODO: Fix "AssertionError: -0.28014056555724598 not greater than 0.5" +# in check_estimator for score +# from sklearn.utils.estimator_checks import check_estimator +# from sklearn.linear_model import GeneralizedLinearRegressor +# check_estimator(GeneralizedLinearRegressor) def score(self, X, y, weight=1): """The natural score for a GLM is -deviance. - Returns the weight averaged negitive deviance (the better the score, + Returns the weight averaged negative deviance (the better the score, the better the fit). Maximum score is therefore 0. """ # RegressorMixin has R^2 score. # TODO: Make it more compatible with the score function in # sklearn.metrics.regression.py + check_is_fitted(self, "coef_") eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ - mu = self.link.inverse(eta) - output_errors = self.family.unit_deviance(y, mu) + mu = self._link_instance.inverse(eta) + output_errors = self._family_instance.unit_deviance(y, mu) weight = weight * np.ones_like(y) - return np.average(output_errors, weights=weight) + return -np.average(output_errors, weights=weight) From 85c52ec9c6adb3b1f75650cfa7fe0b770393d24e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 13 Aug 2017 01:46:16 +0200 Subject: [PATCH 005/209] [WIP] Add Generalized Linear Models (#9405) * make raw docstrings where appropriate * make ExponentialDispersionModel (i.e. TweedieDistribution) pickable: ExponentialDispersionModel has new properties include_lower_bound, method in_y_range is not abstract anymore. * set self.intercept_=0 if fit_intercept=False, such that it is always defined. * set score to D2, a generalized R2 with deviance instead of squared error, as does glmnet. This also solves issues with check_regressors_train(GeneralizedLinearRegressor), which assumes R2 score. * change of names: weight to weights in ExponentialDispersionModel and to sample_weight in GeneralizedLinearRegressor * add class method linear_predictor --- sklearn/linear_model/glm.py | 330 ++++++++++++++++++++++-------------- 1 file changed, 199 insertions(+), 131 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 0ee1564049329..b80842f817f4d 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -5,13 +5,16 @@ # Author: Christian Lorentzen # License: BSD 3 clause +# TODO: Write more tests # TODO: Which name/symbol for coefficients and weights in docu? # sklearn.linear_models uses w for coefficients. 
-# So far, coefficients=beta and weight=w (as standard literature) -# TODO: Add l2-penalty +# So far, coefficients=beta and weights=w (as standard literature) +# TODO: Add l2-penalty (maybe more general w.P.w with P penalty matrix) # TODO: Add l1-penalty (elastic net) # TODO: Add cross validation # TODO: Write docu and examples +# TODO: Make it as much consistent to other estimators in linear_model as +# possible # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -118,7 +121,7 @@ def inverse_derivative2(self, lin_pred): class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): - """Base class for reproductive Exponential Dispersion Models (EDM). + r"""Base class for reproductive Exponential Dispersion Models (EDM). The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by @@ -136,6 +139,8 @@ class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): ---------- lower_bound upper_bound + include_lower_bound + include_upper_bound Methods ------- @@ -174,15 +179,39 @@ def upper_bound(self): """ raise NotImplementedError() - @abstractmethod + @abstractproperty + def include_lower_bound(self): + """If True, values of y may equal lower bound: y >= lower_bound. + """ + raise NotImplementedError() + + @abstractproperty + def include_upper_bound(self): + """If True, values of y may equal upper bound: y <= upper_bound. + """ + raise NotImplementedError() + def in_y_range(self, x): """Returns true if x is in the valid range of Y~EDM. """ - raise NotImplementedError() + if self.include_lower_bound: + if self.include_upper_bound: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less(x, self.upper_bound)) + else: + if self.include_upper_bound: + return np.logical_and(np.greater(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater(x, self.lower_bound), + np.less(x, self.upper_bound)) @abstractmethod def unit_variance(self, mu): - """The unit variance :math:`v(mu)` determines the variance as + r"""The unit variance :math:`v(mu)` determines the variance as a function of the mean mu by :math:`\mathrm{Var}[Y_i] = \phi/w_i*v(\mu_i)`. It can also be derived from the unit deviance :math:`d(y,\mu)` as @@ -194,27 +223,27 @@ def unit_variance(self, mu): @abstractmethod def unit_variance_derivative(self, mu): - """The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. + r"""The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. """ raise NotImplementedError() - def variance(self, mu, phi=1, weight=1): - """The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is + def variance(self, mu, phi=1, weights=1): + r"""The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is :math:`\mathrm{Var}[Y_i]=\phi/w_i*v(\mu_i)`, with unit variance v(mu). """ - return phi/weight * self.unit_variance(mu) + return phi/weights * self.unit_variance(mu) - def variance_derivative(self, mu, phi=1, weight=1): - """The derivative of the variance w.r.t. mu, + def variance_derivative(self, mu, phi=1, weights=1): + r"""The derivative of the variance w.r.t. mu, :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] =phi/w_i*v'(\mu_i)`, with unit variance v(mu). """ - return phi/weight * self.unit_variance_derivative(mu) + return phi/weights * self.unit_variance_derivative(mu) @abstractmethod def unit_deviance(self, y, mu): - """The unit_deviance :math:`d(y,\mu)`. + r"""The unit_deviance :math:`d(y,\mu)`. 
In terms of the log-likelihood it is given by :math:`d(y,\mu) = -2\phi\cdot \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` @@ -222,7 +251,7 @@ def unit_deviance(self, y, mu): raise NotImplementedError() def unit_deviance_derivative(self, y, mu): - """The derivative w.r.t. mu of the unit_deviance + r"""The derivative w.r.t. mu of the unit_deviance :math:`\frac{d}{d\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. @@ -232,30 +261,30 @@ def unit_deviance_derivative(self, y, mu): """ return -2*(y-mu)/self.unit_variance(mu) - def deviance(self, y, mu, weight=1): - """The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) - with weight :math:`w_i` and unit_deviance :math:`d(y,mu)`. + def deviance(self, y, mu, weights=1): + r"""The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) + with weights :math:`w_i` and unit_deviance :math:`d(y,mu)`. In terms of the likelihood it is :math:`D = -2\phi\cdot \left(loglike(y,\mu,\frac{phi}{w}) - loglike(y,y,\frac{phi}{w})\right).` """ - return np.sum(weight*self.unit_deviance(y, mu)) + return np.sum(weights*self.unit_deviance(y, mu)) - def _deviance(self, coef, X, y, weight, link): + def _deviance(self, coef, X, y, weights, link): """The deviance as a function of the coefficients ``coef`` (:math:`beta`). """ lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - return self.deviance(y, mu, weight) + return self.deviance(y, mu, weights) - def deviance_derivative(self, y, mu, weight=1): + def deviance_derivative(self, y, mu, weights=1): """The derivative w.r.t. mu of the deviance.` """ - return weight*self.unit_deviance_derivative(y, mu) + return weights*self.unit_deviance_derivative(y, mu) - def _score(self, coef, phi, X, y, weight, link): - """The score function :math:`s` is the derivative of the + def _score(self, coef, phi, X, y, weights, link): + r"""The score function :math:`s` is the derivative of the log-likelihood w.r.t. the ``coef`` (:math:`\beta`). It is given by @@ -270,7 +299,7 @@ def _score(self, coef, phi, X, y, weight, link): n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d = link.inverse_derivative(lin_pred) d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), shape=(n_samples, n_samples)) @@ -278,8 +307,8 @@ def _score(self, coef, phi, X, y, weight, link): score = safe_sparse_dot(X.T, temp, dense_output=False) return score - def _fisher_matrix(self, coef, phi, X, y, weight, link): - """The Fisher information matrix, also known as expected + def _fisher_matrix(self, coef, phi, X, y, weights, link): + r"""The Fisher information matrix, also known as expected information matrix. It is given by .. 
math: @@ -295,7 +324,7 @@ def _fisher_matrix(self, coef, phi, X, y, weight, link): n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d2 = link.inverse_derivative(lin_pred)**2 d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), shape=(n_samples, n_samples)) @@ -303,8 +332,8 @@ def _fisher_matrix(self, coef, phi, X, y, weight, link): fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) return fisher_matrix - def _observed_information(self, coef, phi, X, y, weight, link): - """The observed information matrix, also known as the negative of + def _observed_information(self, coef, phi, X, y, weights, link): + r"""The observed information matrix, also known as the negative of the Hessian matrix of the log-likelihood. It is given by .. math: @@ -327,7 +356,7 @@ def _observed_information(self, coef, phi, X, y, weight, link): n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) dp = link.inverse_derivative2(lin_pred) d2 = link.inverse_derivative(lin_pred)**2 v = self.unit_variance_derivative(mu)/self.unit_variance(mu) @@ -338,33 +367,34 @@ def _observed_information(self, coef, phi, X, y, weight, link): observed_information = safe_sparse_dot(X.T, temp, dense_output=False) return observed_information - def _deviance_derivative(self, coef, X, y, weight, link): - """The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a + def _deviance_derivative(self, coef, X, y, weights, link): + r"""The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a function of the coefficients ``coef``. This is equivalent to :math:`-2\phi` times the score function :math:`s` (derivative of the log-likelihood). """ - score = self._score(coef=coef, phi=1, X=X, y=y, weight=weight, + score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, link=link) return -2*score - def _deviance_hessian(self, coef, X, y, weight, link): - """The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance + def _deviance_hessian(self, coef, X, y, weights, link): + r"""The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance as a function of the coefficients ``coef``. This is equivalent to :math:`+2\phi` times the observed information matrix. """ info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, - weight=weight, link=link) + weights=weights, link=link) return 2*info_matrix - def starting_mu(self, y, weight=1): + def starting_mu(self, y, weights=1): """Starting values for the mean mu_i in IRLS.""" - return (weight*y+np.mean(weight*y))/(2.*np.sum(np.ones_like(y)*weight)) + return ((weights*y+np.mean(weights*y)) + / (2.*np.sum(np.ones_like(y)*weights))) class TweedieDistribution(ExponentialDispersionModel): - """A class for the Tweedie distribution. + r"""A class for the Tweedie distribution. They have mu=E[X] and Var[X] \propto mu**power. Attributes @@ -373,67 +403,44 @@ class TweedieDistribution(ExponentialDispersionModel): The variance power of the unit_variance :math:`v(mu) = mu^{power}`. 
""" - def _less_upper_bound(self, x): - return np.less(x, self.upper_bound) - - def _less_equal_upper_bound(self, x): - return np.less_equal(x, self.upper_bound) - - def _greater_lower_bound(self, x): - return np.greater(x, self.lower_bound) - - def _greater_equal_lower_bound(self, x): - return np.greater_equal(x, self.lower_bound) - def __init__(self, power=0): self.power = power self._upper_bound = np.Inf - # self._upper_compare = lambda x: np.less(x, self.upper_bound) - self._upper_compare = self._less_upper_bound + self._include_upper_bound = False if power < 0: # Extreme Stable self._lower_bound = -np.Inf - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif power == 0: # NormalDistribution self._lower_bound = -np.Inf - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif (power > 0) and (power < 1): raise ValueError('For 0 1) and (power < 2): # Compound Poisson self._lower_bound = 0 - # self._lower_compare = ( - # lambda x: np.greater_equal(x, self.lower_bound)) - self._lower_compare = self._greater_equal_lower_bound + self._include_lower_bound = True elif power == 2: # GammaDistribution self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif (power > 2) and (power < 3): # Positive Stable self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif power == 3: # InverseGaussianDistribution self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif power > 3: # Positive Stable self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False @property def power(self): @@ -454,8 +461,13 @@ def lower_bound(self): def upper_bound(self): return self._upper_bound - def in_y_range(self, x): - return np.logical_and(self._lower_compare(x), self._upper_compare(x)) + @property + def include_lower_bound(self): + return self._include_lower_bound + + @property + def include_upper_bound(self): + return self._include_upper_bound def unit_variance(self, mu): """The unit variance of a Tweedie distribution is v(mu)=mu**power. 
@@ -485,7 +497,7 @@ def unit_deviance(self, y, mu): return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) - def likelihood(self, y, X, beta, phi, weight=1): + def likelihood(self, y, X, beta, phi, weights=1): raise NotImplementedError('This function is not (yet) implemented.') @@ -521,6 +533,8 @@ class GeneralizedHyperbolicSecand(ExponentialDispersionModel): def __init__(self): self._lower_bound = -np.Inf self._upper_bound = np.Inf + self._include_lower_bound = False + self._include_upper_bound = False @property def lower_bound(self): @@ -530,11 +544,13 @@ def lower_bound(self): def upper_bound(self): return self._upper_bound - def in_y_range(self, x): - np.logical_and( - np.greater(x, self.lower_bound), - np.less(x, self.lower_bound) - ) + @property + def include_lower_bound(self): + return self._include_lower_bound + + @property + def include_upper_bound(self): + return self._include_upper_bound def unit_variance(self, mu): return 1 + mu**2 @@ -548,7 +564,7 @@ def unit_deviance(self, y, mu): class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - """ + r""" Class to fit a Generalized Linear Model (GLM) based on reproductive Exponential Dispersion Models (EDM). @@ -665,7 +681,7 @@ def __init__(self, fit_intercept=True, family='normal', self.start_params = start_params self.verbose = verbose - def fit(self, X, y, weight=None): + def fit(self, X, y, sample_weight=None): """Fit a generalized linear model. Parameters @@ -676,11 +692,12 @@ def fit(self, X, y, weight=None): y : numpy array of shape [n_samples] Target values - weight : numpy array of shape [n_samples] + sample_weight : numpy array of shape [n_samples] Individual weights for each sample. Var[Y_i]=phi/weight_i * v(mu) If Y_i ~ EDM(mu, phi/w_i) then - sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)) + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a + weighted average with weights=sample_weight. 
Returns ------- @@ -751,16 +768,17 @@ def fit(self, X, y, weight=None): "range for family {0}" .format(family.__class__.__name__)) - if weight is None: - weight = np.ones_like(y) - elif np.isscalar(weight): - weight = weight*np.ones_like(y) + if sample_weight is None: + weights = np.ones_like(y) + elif np.isscalar(sample_weight): + weights = sample_weight*np.ones_like(y) else: - weight = np.atleast_1d(weight) - if weight.ndim > 1: - raise ValueError("Weights must be 1D array or scalar") - elif weight.shape[0] != y.shape[0]: - raise ValueError("Weights must have the same length as y") + weights = np.atleast_1d(sample_weight) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != y.shape[0]: + raise ValueError("Sample weights must have the same length as" + " y") if self.fit_intercept: # intercept is first column <=> coef[0] is for intecept @@ -780,14 +798,14 @@ def fit(self, X, y, weight=None): coef = None if start_params is None: # Use mu_start and apply one irls step to calculate coef - mu = family.starting_mu(y, weight) + mu = family.starting_mu(y, weights) # linear predictor eta = link.link(mu) # h'(eta) hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weights=weights)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -819,7 +837,7 @@ def fit(self, X, y, weight=None): hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weights=weights)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -840,7 +858,7 @@ def fit(self, X, y, weight=None): # use gradient for compliance with newton-cg and lbfgs # TODO: faster computation of gradient, use mu and eta directly gradient = family._deviance_derivative( - coef=coef, X=Xnew, y=y, weight=weight, link=link) + coef=coef, X=Xnew, y=y, weights=weights, link=link) if (np.max(np.abs(gradient)) <= self.tol): converged = True break @@ -855,7 +873,7 @@ def fit(self, X, y, weight=None): elif self.solver == 'lbfgs': func = family._deviance fprime = family._deviance_derivative - args = (Xnew, y, weight, link) + args = (Xnew, y, weights, link) coef, loss, info = optimize.fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, @@ -874,47 +892,72 @@ def fit(self, X, y, weight=None): func = family._deviance grad = family._deviance_derivative - def grad_hess(coef, X, y, weight, link): + def grad_hess(coef, X, y, weights, link): grad = (family._deviance_derivative( - coef, X, y, weight, link)) + coef, X, y, weights, link)) hessian = (family._deviance_hessian( - coef, X, y, weight, link)) + coef, X, y, weights, link)) def Hs(s): ret = np.dot(hessian, s) return ret return grad, Hs hess = grad_hess - args = (Xnew, y, weight, link) + args = (Xnew, y, weights, link) coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, maxiter=self.max_iter, tol=self.tol) self.coef_ = coef - if self.fit_intercept is True: + if self.fit_intercept: self.intercept_ = coef[0] self.coef_ = coef[1:] else: + self.intercept_ = 0. 
self.coef_ = coef if self.fit_dispersion in ['chisqr', 'deviance']: - self.dispersion_ = self.estimate_phi(y, X, weight) + self.dispersion_ = self.estimate_phi(y, X, weights) return self - def predict(self, X, weight=1): - """Prediction with features X. - If weights are given, returns prediction*weights. + def linear_predictor(self, X): + """The linear_predictor X*coef_ + intercept_. + + Parameters + ---------- + X : numpy array or sparse matrix of shape [n_samples,n_features] + Samples. + + Returns + ------- + C : array, shape = (n_samples) + Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) - # TODO: validation of weight - eta = safe_sparse_dot(X, self.coef_, dense_output=True) - if self.fit_intercept is True: - eta += self.intercept_ + return safe_sparse_dot(X, self.coef_, + dense_output=True) + self.intercept_ + + def predict(self, X, sample_weight=1): + """Predict uing GLM with feature matrix X. + If sample_weight is given, returns prediction*sample_weight. + + Parameters + ---------- + X : numpy array or sparse matrix of shape [n_samples,n_features] + Samples. + + Returns + ------- + C : array, shape = (n_samples) + Returns predicted values times sample_weight. + """ + # TODO: validation of sample_weight + eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) - return mu*weight + return mu*sample_weight - def estimate_phi(self, y, X, weight): + def estimate_phi(self, y, X, sample_weight): """Estimation of the dispersion parameter. Returns the estimate. """ @@ -925,11 +968,11 @@ def estimate_phi(self, y, X, weight): eta += self.intercept_ mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': - chisq = np.sum(weight*(y-mu)**2 / + chisq = np.sum(sample_weight*(y-mu)**2 / self._family_instance.unit_variance(mu)) return chisq/(n_samples - n_features) elif self.fit_dispersion == 'deviance': - dev = self._family_instance.deviance(y, mu, weight) + dev = self._family_instance.deviance(y, mu, sample_weight) return dev/(n_samples - n_features) # TODO: Fix "AssertionError: -0.28014056555724598 not greater than 0.5" @@ -937,19 +980,44 @@ def estimate_phi(self, y, X, weight): # from sklearn.utils.estimator_checks import check_estimator # from sklearn.linear_model import GeneralizedLinearRegressor # check_estimator(GeneralizedLinearRegressor) - def score(self, X, y, weight=1): - """The natural score for a GLM is -deviance. - Returns the weight averaged negative deviance (the better the score, - the better the fit). Maximum score is therefore 0. + def score(self, X, y, sample_weight=None): + r"""Returns D^2, a generalization of the coefficient of determination + R^2, which uses deviance instead of squared error. + + D^2 is defined as + :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` + is the null deviance, i.e. the deviance of a model with intercept + alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean + :math:`\bar{y}` is average by sample_weight. In the case of a Normal + distribution, this D^2 equals R^2. + Best possible score is 1.0 and it can be negative (because the + model can be arbitrarily worse). + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + Test samples + + y : array-like of shape = (n_samples) + True valeus for X. + + sample_weight : array-like, shape = (n_samples), optional + Sample weights. + + Returns + ------- + score : float + D^2 of self.predict(X) wrt. y. """ - # RegressorMixin has R^2 score. 
- # TODO: Make it more compatible with the score function in - # sklearn.metrics.regression.py - check_is_fitted(self, "coef_") - eta = safe_sparse_dot(X, self.coef_, dense_output=True) - if self.fit_intercept is True: - eta += self.intercept_ - mu = self._link_instance.inverse(eta) - output_errors = self._family_instance.unit_deviance(y, mu) - weight = weight * np.ones_like(y) - return -np.average(output_errors, weights=weight) + # Note, default score defined in RegressorMixin is R^2 score. + # TODO: make D^2 a score function in module metrics (and thereby get + # input validation and so on) + if sample_weight is None: + weights = np.ones_like(y) + else: + weights = np.atleast_1d(sample_weight) + mu = self.predict(X) + dev = self._family_instance.deviance(y, mu, weights=weights) + y_mean = np.average(y, weights=weights) + dev_null = self._family_instance.deviance(y, y_mean, weights=weights) + return 1. - dev / dev_null From 0f4bdb3a8c5c45e80786b8156398da93bfc597e8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 18 Sep 2017 23:41:19 +0200 Subject: [PATCH 006/209] [WIP] Add Generalized Linear Models (#9405) * added L2 penalty * api change: alpha, l1_ratio, P1, P2, warm_start, check_input, copy_X * added entry in user guide * improved docstrings * helper function _irls_step --- doc/modules/linear_model.rst | 113 +++- sklearn/linear_model/glm.py | 702 ++++++++++++++++++------- sklearn/linear_model/tests/test_glm.py | 115 +++- 3 files changed, 721 insertions(+), 209 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 75b95f6c7a44f..51b3821fa6207 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -786,7 +786,7 @@ non-smooth `penalty="l1"`. This is therefore the solver of choice for sparse multinomial logistic regression. It is also the only solver that supports `penalty="elasticnet"`. -The "lbfgs" is an optimization algorithm that approximates the +The "lbfgs" is an optimization algorithm that approximates the Broyden–Fletcher–Goldfarb–Shanno algorithm [8]_, which belongs to quasi-Newton methods. The "lbfgs" solver is recommended for use for small data-sets but for larger datasets its performance suffers. [9]_ @@ -874,6 +874,117 @@ to warm-starting (see :term:`Glossary `). .. [9] `"Performance Evaluation of Lbfgs vs other solvers" `_ +.. _Generalized_linear_regression: + +Generalized linear regression +============================= + +:class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two +ways [1]_. First, the predicted values :math:`\hat{y}` are linked to a linear +combination of the input variables :math:`X` via an inverse link function +:math:`h` as + +.. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). + +Secondly, the squared loss function is replaced by the deviance :math:`D` of an +exponential dispersion model (EDM) [2]_. The objective function beeing minimized +becomes + +.. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 + +\frac{\alpha(1-\rho)}{2} w^T P_2 w + +with sample weights :math:`s`. +:math:`P_1` can be used to exclude some of the coefficients in the L1 +penalty, :math:`P_2` (must be positive semi-definite) allows for a more +versatile L2 penalty. + +Use cases, where a loss different from the squared loss might be appropriate, +are the following: + + * If the target values :math:`y` are counts (integer valued) or frequencies, you might try a Poisson deviance. 
+ + * If the target values are positive valued and skewed, you might try a Gamma deviance. + + * If the target values seem to be heavy tailed, you might try an Inverse Gaussian deviance (or even higher variance power of the Tweedie family). + +Since the linear predictor :math:`Xw` can be negative and +Poisson, Gamma and Inverse Gaussian distributions don't have negative values, +it is convenient to apply a link function different from the identity link +:math:`h(x)=x` that guarantees the non-negativeness, e.g. the log-link with +:math:`h(Xw)=\exp(Xw)`. + +Note that the feature matrix `X` should be standardized before fitting. This +ensures that the penalty treats features equally. + + >>> from sklearn import linear_model + >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, l1_ratio=0) + >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) + >>> reg.coef_ + array([ 0.24630255, 0.43373521]) + >>> reg.intercept_ + -0.76383575123143277 + +Mathematical formulation +------------------------ + +In the unpenalized case, the assumptions are the folowing: + + * The target values :math:`y_i` are realizations of random variables + :math:`Y_i \overset{i.i.d}{\sim} \mathrm{EDM}(\mu_i, \frac{\phi}{s_i})` + with expectation :math:`\mu_i=\mathrm{E}[Y]`, dispersion parameter + :math:`\phi` and sample weights :math:`s_i`. + * The aim is to predict the expectation :math:`\mu_i` with + :math:`\hat{y_i} = h(\eta_i)`, linear predictor + :math:`\eta_i=(Xw)_i` and inverse link function :math:`h(\eta)`. + +Note that the first assumption implies +:math:`\mathrm{Var}[Y_i]=\frac{\phi}{s_i} v(\mu_i)` with unit variance +function :math:`v(\mu)`. Specifying a particular distribution of an EDM is the +same as specifying a unit variance function (they are one-to-one). + +Including penalties helps to avoid overfitting or, in case of L1 penalty, to +obtain sparse solutions. But there are also other motivations to include them, +e.g. accounting fo dependence structure of :math:`y`. + +The objective function, which is independent of :math:`\phi`, is minimized with +respect to the coefficients :math:`w`. + +The deviance is defined by + +.. math:: D(y, \mu) = -2\phi\cdot + \left(loglike(y,\mu,\frac{\phi}{s}) + - loglike(y,y,\frac{\phi}{s})\right) + +===================================== ================================= +Distribution Variance Function :math:`v(\mu)` +===================================== ================================= +Normal ("normal") :math:`1` +Poisson ("poisson") :math:`\mu` +Gamma ("gamma") :math:`\mu^2` +Inverse Gaussian ("inverse.gaussian") :math:`\mu^3` +===================================== ================================= + +Two remarks: + +* The deviances for at least Normal, Poisson and Gamma distributions are + strictly consistent scoring functions for the mean :math:`\mu`, see Eq. + (19)-(20) in [3]_. + +* If you want to model a frequency, i.e. counts per exposure (time, volume, ...) + you can do so by a Poisson distribution and passing + :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together + with :math:`s=\mathrm{exposure}` as sample weights. + + +.. topic:: References: + + .. [1] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. [2] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. 
+ See also `Exponential dispersion model. `_ + + .. [3] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ Stochastic Gradient Descent - SGD ================================= diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index b80842f817f4d..2db3c56d5e1c1 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -6,15 +6,15 @@ # License: BSD 3 clause # TODO: Write more tests -# TODO: Which name/symbol for coefficients and weights in docu? -# sklearn.linear_models uses w for coefficients. -# So far, coefficients=beta and weights=w (as standard literature) -# TODO: Add l2-penalty (maybe more general w.P.w with P penalty matrix) # TODO: Add l1-penalty (elastic net) +# TODO: deal with option self.copy_X +# TODO: Should the option `normalize` be included (like other linear models)? +# So far, it is not included. User must pass a normalized X. # TODO: Add cross validation -# TODO: Write docu and examples +# TODO: Write examples and more docu # TODO: Make it as much consistent to other estimators in linear_model as # possible +# TODO: options P1 and P2 in fit() or in __init__()??? # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -22,10 +22,21 @@ # Linear Model does both depending on the chosen distribution, e.g. Normal => # regressor, Bernoulli/Binomial => classifier. # Solution: GeneralizedLinearRegressor since this is the focus. +# - Allow for finer control of penalty terms: +# L1: ||P1*w||_1 with P1*w a componentwise product, this allows to exclude +# factors from the L1 penalty. +# L2: w*P2*w with P2 a (demi-) positive definite matrix, e.g. P2 could be +# a 1st or 2nd order difference matrix (compare B-spline penalties and +# Tikhonov regularization). # - The link funtion (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. # Solution: link as argument in those functions +# - Which name/symbol for sample_weight in docu? +# sklearn.linear_models uses w for coefficients, standard literature on +# GLMs use beta for coefficients and w for (sample) weights. +# So far, coefficients=w and sample weights=s. + from __future__ import division from abc import ABCMeta, abstractmethod, abstractproperty @@ -34,6 +45,8 @@ from scipy import linalg, optimize, sparse import warnings from .base import LinearRegression +from .coordinate_descent import ElasticNet +from .ridge import Ridge from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six @@ -164,7 +177,8 @@ class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): References ---------- - See https://en.wikipedia.org/wiki/Exponential_dispersion_model. + + https://en.wikipedia.org/wiki/Exponential_dispersion_model. """ @abstractproperty @@ -192,7 +206,7 @@ def include_upper_bound(self): raise NotImplementedError() def in_y_range(self, x): - """Returns true if x is in the valid range of Y~EDM. + """Returns true if `x` is in the valid range of Y~EDM. """ if self.include_lower_bound: if self.include_upper_bound: @@ -211,33 +225,36 @@ def in_y_range(self, x): @abstractmethod def unit_variance(self, mu): - r"""The unit variance :math:`v(mu)` determines the variance as - a function of the mean mu by - :math:`\mathrm{Var}[Y_i] = \phi/w_i*v(\mu_i)`. 
+ r"""The unit variance :math:`v(\mu)` determines the variance as + a function of the mean :math:`\mu` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. It can also be derived from the unit deviance :math:`d(y,\mu)` as .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ \partial\mu^2}}\big|_{y=\mu} + + See also :func:`variance`. """ raise NotImplementedError() @abstractmethod def unit_variance_derivative(self, mu): - r"""The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. + r"""The derivative of the unit variance w.r.t. `mu`, :math:`v'(\mu)`. """ raise NotImplementedError() def variance(self, mu, phi=1, weights=1): - r"""The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is - :math:`\mathrm{Var}[Y_i]=\phi/w_i*v(\mu_i)`, - with unit variance v(mu). + r"""The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, + with unit variance :math:`v(\mu)` and weights :math:`s_i`. """ return phi/weights * self.unit_variance(mu) def variance_derivative(self, mu, phi=1, weights=1): - r"""The derivative of the variance w.r.t. mu, + r"""The derivative of the variance w.r.t. `mu`, :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] - =phi/w_i*v'(\mu_i)`, with unit variance v(mu). + =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` + and weights :math:`s_i`. """ return phi/weights * self.unit_variance_derivative(mu) @@ -251,8 +268,8 @@ def unit_deviance(self, y, mu): raise NotImplementedError() def unit_deviance_derivative(self, y, mu): - r"""The derivative w.r.t. mu of the unit_deviance - :math:`\frac{d}{d\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + r"""The derivative w.r.t. `mu` of the unit deviance + :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. Returns @@ -262,39 +279,39 @@ def unit_deviance_derivative(self, y, mu): return -2*(y-mu)/self.unit_variance(mu) def deviance(self, y, mu, weights=1): - r"""The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) - with weights :math:`w_i` and unit_deviance :math:`d(y,mu)`. + r"""The deviance is given by :math:`D = \sum_i s_i \cdot d(y, \mu) + with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. In terms of the likelihood it is :math:`D = -2\phi\cdot - \left(loglike(y,\mu,\frac{phi}{w}) - - loglike(y,y,\frac{phi}{w})\right).` + \left(loglike(y,\mu,\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. """ return np.sum(weights*self.unit_deviance(y, mu)) def _deviance(self, coef, X, y, weights, link): - """The deviance as a function of the coefficients ``coef`` - (:math:`beta`). + """The deviance as a function of the coefficients `coef` + (:math:`w`). """ lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) return self.deviance(y, mu, weights) def deviance_derivative(self, y, mu, weights=1): - """The derivative w.r.t. mu of the deviance.` + """The derivative w.r.t. `mu` of the deviance. """ return weights*self.unit_deviance_derivative(y, mu) def _score(self, coef, phi, X, y, weights, link): - r"""The score function :math:`s` is the derivative of the - log-likelihood w.r.t. the ``coef`` (:math:`\beta`). + r"""The score function is the derivative of the + log-likelihood w.r.t. `coef` (:math:`w`). It is given by .. 
math: - \mathbf{s}(\boldsymbol{\beta}) = \mathbf{X}^T \mathbf{D} + \mathbf{score}(\boldsymbol{w}) = \mathbf{X}^T \mathbf{D} \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and - :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}(y_1),\ldots)`. + :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. """ n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -303,23 +320,27 @@ def _score(self, coef, phi, X, y, weights, link): d = link.inverse_derivative(lin_pred) d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=False) - score = safe_sparse_dot(X.T, temp, dense_output=False) + temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=True) + score = safe_sparse_dot(X.T, temp, dense_output=True) return score def _fisher_matrix(self, coef, phi, X, y, weights, link): - r"""The Fisher information matrix, also known as expected - information matrix. It is given by + r"""The Fisher information matrix. + The Fisher information matrix, also known as expected information + matrix is given by .. math: - \mathbf{F}(\boldsymbol{\beta}) = \mathrm{E}\left[ - -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} - \partial\boldsymbol{\beta}^T}\right] + \mathbf{F}(\boldsymbol{w}) = + \mathrm{E}\left[-\frac{\partial\mathbf{score}}{\partial + \boldsymbol{w}} \right] + = \mathrm{E}\left[ + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T}\right] = \mathbf{X}^T W \mathbf{X} \,, with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, - see score function. + see func:`score_function`. """ n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -333,14 +354,15 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): - r"""The observed information matrix, also known as the negative of + r"""The observed information matrix. + The observed information matrix, also known as the negative of the Hessian matrix of the log-likelihood. It is given by .. math: - \mathbf{H}(\boldsymbol{\beta}) = - -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} - \partial\boldsymbol{\beta}^T} + \mathbf{H}(\boldsymbol{w}) = + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T} = \mathbf{X}^T \legt[ - \mathbf{D}' \mathbf{R} + \mathbf{D}^2 \mathbf{V} \mathbf{R} @@ -351,7 +373,7 @@ def _observed_information(self, coef, phi, X, y, weights, link): :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{ v(\mu_i)} \right)`, - see score function and Fisher matrix. + see :func:`score_` function and :func:`_fisher_matrix`. """ n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -368,18 +390,18 @@ def _observed_information(self, coef, phi, X, y, weights, link): return observed_information def _deviance_derivative(self, coef, X, y, weights, link): - r"""The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a - function of the coefficients ``coef``. + r"""The derivative w.r.t. `coef` (:math:`w`) of the deviance as a + function of the coefficients `coef`. This is equivalent to :math:`-2\phi` times the score function - :math:`s` (derivative of the log-likelihood). + :func:`score_function` (derivative of the log-likelihood). 
""" score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, link=link) return -2*score def _deviance_hessian(self, coef, X, y, weights, link): - r"""The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance - as a function of the coefficients ``coef``. + r"""The hessian matrix w.r.t. `coef` (:math:`w`) of the deviance + as a function of the coefficients `coef`. This is equivalent to :math:`+2\phi` times the observed information matrix. """ @@ -388,20 +410,21 @@ def _deviance_hessian(self, coef, X, y, weights, link): return 2*info_matrix def starting_mu(self, y, weights=1): - """Starting values for the mean mu_i in IRLS.""" - return ((weights*y+np.mean(weights*y)) - / (2.*np.sum(np.ones_like(y)*weights))) + """Starting values for the mean mu_i in (unpenalized) IRLS.""" + return ((weights*y+np.mean(weights*y)) / + (2.*np.sum(np.ones_like(y)*weights))) class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. - They have mu=E[X] and Var[X] \propto mu**power. + They have :math:`\mu=\mathrm{E}[Y]` and + :math:`\mathrm{Var}[Y] \propto \mu^power. Attributes ---------- power : float The variance power of the unit_variance - :math:`v(mu) = mu^{power}`. + :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): self.power = power @@ -497,7 +520,7 @@ def unit_deviance(self, y, mu): return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) - def likelihood(self, y, X, beta, phi, weights=1): + def likelihood(self, y, X, w, phi, weights=1): raise NotImplementedError('This function is not (yet) implemented.') @@ -563,51 +586,135 @@ def unit_deviance(self, y, mu): np.log((1+mu**2)/(1+y**2))) +def _irls_step(X, W, P2, z): + """One step in iteratively reweighted least squares + + Solve A w = b for w with + A = (X' W X + P2) + b = X' W z + z = eta + D^-1 (y-mu) + + See also fit method of :class:`GeneralizedLinearRegressor`. + + Parameters + ---------- + X : numpy array or sparse matrix of shape (n_samples, n_features) + Training data (with intercept included if present) + + W : numpy array of shape (n_samples, ) + + P2 : numpy array or sparse matrix of shape (n_features, n_features) + The l2-penalty matrix or vector (=diagonal matrix) + + z : numpy array of shape (n_samples, ) + Working observations + + Returns + ------- + coef: array, shape = (X.shape[1]) + """ + # TODO: scipy.linalg.solve if faster, but ordinary least squares uses + # scipy.linalg.lstsq. What is more appropriate? + n_samples, n_features = X.shape + if sparse.issparse(X): + W = sparse.dia_matrix((W, 0), shape=(n_samples, n_samples)).tocsr() + if P2.ndim == 1: + L2 = (sparse.dia_matrix((P2, 0), shape=(n_features, n_features)) + ).tocsr() + else: + L2 = sparse.csr_matrix(P2) + XtW = X.transpose() * W + A = XtW * X + L2 + b = XtW * z + coef = sparse.linalg.spsolve(A, b) + else: + XtW = (X.T * W) + A = XtW.dot(X) + if P2.ndim == 1: + A[np.diag_indices_from(A)] += P2 + else: + A += P2 + b = XtW.dot(z) + coef = linalg.solve(A, b) + return coef + + class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - r""" - Class to fit a Generalized Linear Model (GLM) based on reproductive - Exponential Dispersion Models (EDM). + """Regression via a Generalized Linear Model (GLM) based on reproductive + Exponential Dispersion Models (EDM) with combined L1 and L2 priors as + regularizer. 
+ + Minimizes the objective function:: + + 1/(2s) * deviance(y, h(X*w)) + + alpha * l1_ratio * ||P1*w||_1 + + 1/2 * alpha * (1 - l1_ratio) * w*P2*w + + with inverse link function `h` and s=sum of `sample_weight` (which equals + n_samples for `sample_weight=None`). + For `P1`=`P2`=identity, the penalty is the elastic net:: - #TODO: This belongs to User Guide - Assumptions: + alpha * l1_ratio * ||w||_1 + + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 - - The target values y_i are realizations of random variables - :math:`Y_i \sim \mathrm{EDM}(\mu_i, \frac{\phi}{w_i})` with dispersion - parameter :math:`\phi` and weights :math:`w_i`. - - The expectation of :math:`Y_i` is :math:`\mu_i=\mathrm{E}[Y]=h(\eta_i)` - whith the linear predictor :math:`\eta=X*\beta`, inverse link function - :math:`h(\eta)`, design matrix :math:`X` and parameters :math:`\beta` - to be estimated. + If you are interested in controlling the L1 and L2 penalty + separately, keep in mind that this is equivalent to:: - Note that the first assumption implies - :math:`\mathrm{Var}[Y_i]=\frac{\phi}{w_i} v(\mu_i)` with uni variance - function :math:`v(\mu)`. + a * L1 + b * L2 + + where:: + + alpha = a + b and l1_ratio = a / (a + b) + + The parameter `l1_ratio` corresponds to alpha in the glmnet R package while + alpha corresponds to the lambda parameter in glmnet. Specifically, l1_ratio + = 1 is the lasso penalty. + + Read more in the :ref:`User Guide `. The fit itself does not need Y to be from an EDM, but only assumes - the first two moments :math:`E[Y_i]=\mu_i=h(\eta_i)` and - :math:`Var[Y_i]=\frac{\phi}{w_i} v(\mu_i)` + the first two moments :math:`E[Y_i]=\\mu_i=h(\\eta_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{w_i} v(\\mu_i)`. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + (penalized) maximum likelihood which is equivalent to minimizing the + deviance. - The parameters :math:`\beta` are estimated by maximum likelihood which is - equivalent to minimizing the deviance. + TODO: For `alpha` > 0, the feature matrix `X` is assumed to be + standardized. Call + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. TODO: Estimation of the dispersion parameter phi. - TODO: Notes on weights and 'scaled' Poisson, e.g. fit y = x/w with - with x=counts and w=exposure (time, money, persons, ...) => y is a - ratio with weights w. + TODO: Notes on weights and 'scaled' distributions. For Poisson, this means + to fit y = z/w with z=counts and w=exposure (time, money, persons, ...) + => y is a ratio with weights w. Same for other distributions. Parameters ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms und thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + l1_ratio : float, optional (defaul=0) + The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For + ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it + is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a + combination of L1 and L2. + fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). 
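To make the correspondence above concrete, separately chosen L1 and L2 strengths ``a`` and
``b`` translate into this parametrization as follows (illustrative numbers only)::

    a, b = 0.3, 0.7            # desired L1 and L2 strengths
    alpha = a + b              # -> 1.0
    l1_ratio = a / (a + b)     # -> 0.3
    # then alpha * l1_ratio == a and alpha * (1 - l1_ratio) == b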
- family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance - of a subclass of ExponentialDispersionModel, optional - (default='normal') + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ + of class ExponentialDispersionModel, optional(default='normal') the distributional assumption of the GLM. - link : {'identity', 'log'} or an instance of a subclass of Link, + link : {'identity', 'log'} or an instance of class Link, optional (default='identity') the link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). @@ -634,28 +741,41 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): where ``g_i`` is the i-th component of the gradient (derivative of the deviance). - start_params : {array shape (n_features, ), 'ols'}, optional (default=None) - sets the start values for coef_ in the fit. - If None, default values are taken. - If 'ols' the result of an ordinary least squares in the link space - (linear predictor) is taken. - If an array is given, these values are taken as coef_ to start with. - If fit_intercept is true, the first value is assumed to be the start - value for the intercept_. + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to fit as + initialization for ``coef_`` and ``intercept_`` (supersedes option + ``start_params``). If set to ``True`` or if the attribute ``coef_`` + does not exit (first call to fit), option ``start_params`` sets the + starting values for ``coef_`` and ``intercept_``. + + start_params : None or array of shape (n_features, ) or 'least_squares'}, \ + optional (default=None) + If an array of size n_features is supplied, use these as start values + for ``coef_`` in the fit. If ``fit_intercept=True``, the first element + is assumed to be the start value for the ``intercept_``. + If 'least_squares' is set, the result of a least squares fit in the + link space (linear predictor) is taken. If ``None``, the start values + are calculated by setting mu to family.starting_mu(..) and one step of + irls. + This option only applies if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not exist). + + copy_X : boolean, optional, default True + If ``True``, X will be copied; else, it may be overwritten. verbose : int, optional (default=0) For the lbfgs solver set verbose to any positive number for verbosity. Attributes ---------- - coef_ : array, shape (1, n_features) + coef_ : array, shape (n_features, ) Estimated coefficients for the linear predictor (X*coef_) in the GLM. intercept_ : float Intercept (a.k.a. bias) added to linear predictor. dispersion_ : float - The dispersion parameter :math:`\phi` if fit_dispersion is set. + The dispersion parameter :math:`\\phi` if fit_dispersion is set. n_iter_ : int Actual number of iterations of the solver. 
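A short usage sketch of the interplay of ``warm_start`` and ``start_params`` described
above (hypothetical data; behaviour as documented in this patch)::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor

    X = np.array([[1.], [2.], [3.], [4.]])
    y = np.array([1., 2., 3., 5.])

    reg = GeneralizedLinearRegressor(family='poisson', link='log',
                                     alpha=1.0, warm_start=True)
    reg.fit(X, y)                 # first fit: start values come from start_params
    reg.set_params(alpha=0.1)
    reg.fit(X, y)                 # refit: starts from the previous coef_/intercept_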
@@ -667,10 +787,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): ---------- TODO """ - - def __init__(self, fit_intercept=True, family='normal', - link='identity', fit_dispersion='chisqr', solver='irls', - max_iter=100, tol=1e-4, start_params=None, verbose=0): + def __init__(self, alpha=1.0, l1_ratio=0, + fit_intercept=True, family='normal', link='identity', + fit_dispersion='chisqr', solver='irls', max_iter=100, + tol=1e-4, warm_start=False, start_params=None, copy_X=True, + verbose=0): + self.alpha = alpha + self.l1_ratio = l1_ratio self.fit_intercept = fit_intercept self.family = family self.link = link @@ -678,31 +801,86 @@ def __init__(self, fit_intercept=True, family='normal', self.solver = solver self.max_iter = max_iter self.tol = tol + self.warm_start = warm_start self.start_params = start_params + self.copy_X = copy_X self.verbose = verbose - def fit(self, X, y, sample_weight=None): + def fit(self, X, y, sample_weight=None, P1=None, P2=None, + check_input=True): """Fit a generalized linear model. Parameters ---------- - X : numpy array or sparse matrix of shape [n_samples,n_features] + X : numpy array or sparse matrix of shape (n_samples, n_features) Training data - y : numpy array of shape [n_samples] + y : numpy array of shape (n_samples, ) Target values - sample_weight : numpy array of shape [n_samples] + sample_weight : array of shape (n_samples, ) or None,\ + optinal (default=None) Individual weights for each sample. Var[Y_i]=phi/weight_i * v(mu) If Y_i ~ EDM(mu, phi/w_i) then sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. + P1 : None or array of shape (n_features*, ), optional\ + (default=None) + With this array, you can exclude coefficients from ths L1 penalty. + Set the corresponding value to 1 (include) or 0 (exclude). The + default value ``None`` is the same as an array of ones. + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). + + P2 : None or array of shape (n_features*, n_features*) + With this square matrix the L2 penalty is calculated as `w P2 w`. + This gives a fine control over this penalty (Tikhonov + regularization). + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). + + check_input : boolean, optional (default=True) + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. + Returns ------- self : returns an instance of self. """ + ####################################################################### + # 1. 
input validation # + ####################################################################### + # 1.1 validate arguments of fit ####################################### + _dtype = [np.float64, np.float32] + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=_dtype, y_numeric=True, multi_output=False) + y = y.astype(np.float64) + + if sample_weight is None: + weights = np.ones_like(y) + elif np.isscalar(sample_weight): + weights = sample_weight*np.ones_like(y) + else: + weights = np.atleast_1d(sample_weight) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != y.shape[0]: + raise ValueError("Sample weights must have the same length as" + " y") + # IMPORTANT NOTE: Since we want to minimize + # 1/(2*sum(sample_weight)) * deviance + L1 + L2, + # deviance = sum(sample_weight * unit_deviance), + # we rescale weights such that sum(weights) = 1 and this becomes + # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights = weights/np.sum(weights) + + if not isinstance(check_input, bool): + raise ValueError("The argument check_input must be bool; got " + "(check_input={0})".format(check_input)) + + # 1.2 validate arguments of __init__ ################################## # Garantee that self._family_instance is an instance of class # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): @@ -720,7 +898,8 @@ def fit(self, X, y, sample_weight=None): raise ValueError( "The family must be an instance of class" " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian'].") + " ['normal', 'poisson', 'gamma', 'inverse.gaussian'];" + " got (family={0})".format(self.family)) # Garantee that self._link_instance is set to an instance of class Link if isinstance(self.link, Link): @@ -733,132 +912,232 @@ def fit(self, X, y, sample_weight=None): else: raise ValueError( "The link must be an instance of class Link or" - " an element of ['identity', 'log'].") - + " an element of ['identity', 'log']; got (link={0})" + .format(self.link)) + + if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: + raise ValueError("Penalty term must be non-negative;" + " got (alpha={0})".format(self.alpha)) + if (not isinstance(self.l1_ratio, numbers.Number) or + self.l1_ratio < 0 or self.l1_ratio > 1): + raise ValueError("l1_ratio must be in interval [0, 1]; got" + " (l1_ratio={0]})".format(self.l1_ratio)) if not isinstance(self.fit_intercept, bool): - raise ValueError("The argument fit_intercept must be bool," + raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) if self.solver not in ['irls', 'lbfgs', 'newton-cg']: raise ValueError("GLM Regression supports only irls, lbfgs and" "newton-cg solvers, got {0}".format(self.solver)) + if self.alpha > 0: + if (self.l1_ratio > 0 and + self.solver not in []): + # TODO: Add solver for L1 + # raise ValueError("The solver option (solver={0}) is not " + # "appropriate for the chosen penalty which" + # " includes L1 (alpha={1})." 
+ # .format(self.solver, self.alpha)) + raise NotImplementedError("Currently, no solver is implemented" + " that can deal with L1 penalties.") if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: raise ValueError("Maximum number of iteration must be positive;" " got (max_iter={0!r})".format(self.max_iter)) if not isinstance(self.tol, numbers.Number) or self.tol < 0: raise ValueError("Tolerance for stopping criteria must be " "positive; got (tol={0!r})".format(self.tol)) + if not isinstance(self.warm_start, bool): + raise ValueError("The argument warm_start must be bool;" + " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is not None and start_params is not 'ols': + if start_params is not None and start_params is not 'least_squares': start_params = np.atleast_1d(start_params) - if start_params.shape[0] != X.shape[1] + self.fit_intercept: + if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or + (start_params.ndim != 1)): raise ValueError("Start values for parameters must have the" - "right length; required length {0}, got {1}" + "right length and dimension; required (length" + "={0}, ndim=1), got (length={1}, ndim={2})." .format(X.shape[1] + self.fit_intercept, - start_params.shape[0])) - - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - y_numeric=True, multi_output=False) - y = y.astype(np.float64) + start_params.shape[0], + start_params.ndim)) + if not isinstance(self.copy_X, bool): + raise ValueError("The argument copy_X must be bool;" + " got {0}".format(self.copy_X)) + + if P1 is None: + P1 = np.ones(X.shape[1]) + else: + P1 = np.atleast_1d(P1) + if (P1.shape[0] != X.shape[1]) or (P1.ndim != 1): + raise ValueError("P1 must be either None or an 1D array with " + "the length of X.shape[1]; " + "got (P1.shape[0]={0}), " + "needed (X.shape[1]={1})." 
+ .format(P1.shape[0], X.shape[1])) + if P2 is None: + P2 = np.ones(X.shape[1]) + if sparse.issparse(X): + P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), + shape=(X.shape[1], X.shape[1]))).tocsr() + else: + P2 = check_array(P2, accept_sparse=['csr', 'csc', 'coo'], + dtype="numeric", ensure_2d=True) + if ((P2.shape[0] != P2.shape[1]) or + (P2.shape[0] != X.shape[1]) or + (P2.ndim != 2)): + raise ValueError("P2 must be either None or an array of shape " + "(n_features, n_features) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0},{1})), needed ({3},{3})" + .format(P2.shape[0], P2.shape[1], X.shape[1])) family = self._family_instance link = self._link_instance - if not np.all(family.in_y_range(y)): - raise ValueError("Some value(s) of y are out of the valid " - "range for family {0}" - .format(family.__class__.__name__)) - - if sample_weight is None: - weights = np.ones_like(y) - elif np.isscalar(sample_weight): - weights = sample_weight*np.ones_like(y) - else: - weights = np.atleast_1d(sample_weight) - if weights.ndim > 1: - raise ValueError("Sample weight must be 1D array or scalar") - elif weights.shape[0] != y.shape[0]: - raise ValueError("Sample weights must have the same length as" - " y") - if self.fit_intercept: # intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) else: Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) + P1 = np.concatenate((np.array([0]), P1)) + if P2.ndim == 1: + P2 = np.concatenate((np.array([0]), P2)) + elif sparse.issparse(P2): + P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), + dtype=P2.dtype).tocsr() + else: + P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], + [np.zeros((X.shape[1], 1)), P2]]) else: Xnew = X n_samples, n_features = Xnew.shape - - # Note: Since dispersion_ alias phi does not enter the estimation - # of mu_i=E[y_i] set it to 1 where convenient. + l1 = self.alpha * self.l1_ratio + l2 = self.alpha * (1-self.l1_ratio) + P1 *= l1 + P2 *= l2 + + # 1.3 additional validations ########################################## + if check_input: + if not np.all(family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}" + .format(family.__class__.__name__)) + # TODO: if alpha=0 check that Xnew is not rank deficient + # TODO: what else to check? + + ####################################################################### + # 2. initialization of coef = (intercept_, coef_) # + ####################################################################### + # Note: Since phi=self.dispersion_ does not enter the estimation + # of mu_i=E[y_i], set it to 1. 
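A small sketch of the penalty padding performed just above when ``fit_intercept=True``:
the intercept becomes coefficient 0 and must never be penalized, so a leading zero
row/column is prepended (values purely illustrative, not from the patch)::

    import numpy as np

    n_features = 3
    P1 = np.array([1., 1., 0.])              # exclude the third feature from L1
    P1 = np.concatenate(([0.], P1))          # -> [0., 1., 1., 0.], intercept unpenalized

    P2 = np.eye(n_features)                  # dense Tikhonov matrix for the features
    P2 = np.block([[np.zeros((1, 1)),          np.zeros((1, n_features))],
                   [np.zeros((n_features, 1)), P2]])   # pad a zero row and column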
# set start values for coef coef = None - if start_params is None: - # Use mu_start and apply one irls step to calculate coef - mu = family.starting_mu(y, weights) - # linear predictor - eta = link.link(mu) - # h'(eta) - hp = link.inverse_derivative(eta) - # working weights w, in principle a diagonal matrix - # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weights=weights)) - wroot = np.sqrt(w) - # working observations - yw = eta + (y-mu)/hp - # least squares rescaled with wroot - wroot = sparse.dia_matrix((wroot, 0), shape=(n_samples, n_samples)) - X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) - yw_rescale = safe_sparse_dot(wroot, y, dense_output=True) - coef = linalg.lstsq(X_rescale, yw_rescale)[0] - elif start_params is 'ols': - reg = LinearRegression(copy_X=False, fit_intercept=False) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ + if self.warm_start and hasattr(self, "coef_"): + if self.fit_intercept: + coef = np.concatenate((self.intercept_, self.coef_)) + else: + coef = self.coef_ + elif self.start_params is None: + if self.l1_ratio == 0: + # See 3.1 IRLS + # Use mu_start and apply one irls step to calculate coef + mu = family.starting_mu(y, weights) + # linear predictor + eta = link.link(mu) + # h'(eta) + hp = link.inverse_derivative(eta) + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = (hp**2 / family.variance(mu, phi=1, weights=weights)) + # working observations + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(Xnew, W, P2, z) + else: + # with L1 penalty, start with coef = 0 + coef = np.zeros(n_features) + elif self.start_params is 'least_squares': + if self.alpha == 0: + reg = LinearRegression(copy_X=True, fit_intercept=False) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + elif self.l1_ratio <= 0.01: + # ElasticNet says l1_ratio <= 0.01 is not reliable, use Ridge + reg = Ridge(copy_X=True, fit_intercept=False, + alpha=self.alpha) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + else: + # TODO: Does this make sense? + reg = ElasticNet(copy_X=True, fit_intercept=False, + alpha=self.alpha, l1_ratio=self.l1_ratio) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ else: coef = start_params + ####################################################################### + # 3. fit # + ####################################################################### # algorithms for optimiation # TODO: Parallelize it self.n_iter_ = 0 converged = False + # 3.1 IRLS ############################################################ + # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' + # Obj = objective function = 1/2 Dev + l2/2 w P2 w + # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 + # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) + # D2 = link.inverse_derivative(eta)^2 = D^2 + # W = D2/V(mu) + # l2 = alpha * (1 - l1_ratio) + # Obj' = d(Obj)/d(w) = 1/2 Dev' + P2 w + # = -X' D (y-mu)/V(mu) + l2 P2 w + # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 + # Use Fisher matrix instead of full info matrix -X'(...) X, + # i.e. 
E[Dev''] with E[y-mu]=0: + # Obj'' ~ X' W X + l2 P2 + # (1): w = (X' W X + l2 P2)^-1 X' W z, with z = eta + D^-1 (y-mu) + # Note: P2 = l2*P2, see above if self.solver == 'irls': - # linear predictor + # eta = linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = link.inverse(eta) + # D = h'(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) while self.n_iter_ < self.max_iter: self.n_iter_ += 1 # coef_old not used so far. # coef_old = coef - # h'(eta) - hp = link.inverse_derivative(eta) - # working weights w, in principle a diagonal matrix + # working weights W, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weights=weights)) - wroot = np.sqrt(w) + W = (hp**2 / V) # working observations - yw = eta + (y-mu)/hp - # least squares rescaled with wroot - wroot = sparse.dia_matrix((wroot, 0), - shape=(n_samples, n_samples)) - X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) - yw_rescale = safe_sparse_dot(wroot, yw, dense_output=True) - coef, residues, rank, singular_ = ( - linalg.lstsq(X_rescale, yw_rescale)) + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(Xnew, W, P2, z) # updated linear predictor # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = link.inverse(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) # which tolerace? |coef - coef_old| or gradient? # use gradient for compliance with newton-cg and lbfgs - # TODO: faster computation of gradient, use mu and eta directly - gradient = family._deviance_derivative( - coef=coef, X=Xnew, y=y, weights=weights, link=link) + # gradient = family._deviance_derivative( + # coef=coef, X=Xnew, y=y, weights=weights, link=link) + # gradient = -X' D (y-mu)/V(mu) + l2 P2 w + gradient = -safe_sparse_dot(Xnew.T, hp*(y-mu)/V) + if P2.ndim == 1: + gradient += P2*coef + else: + gradient += safe_sparse_dot(P2, coef) if (np.max(np.abs(gradient)) <= self.tol): converged = True break @@ -868,50 +1147,73 @@ def fit(self, X, y, sample_weight=None): "of iterations (currently {0})" .format(self.max_iter), ConvergenceWarning) + # 3.2 L-BFGS and Newton-CG ############################################ # TODO: performance: make one function return both deviance and # gradient of deviance - elif self.solver == 'lbfgs': - func = family._deviance - fprime = family._deviance_derivative - args = (Xnew, y, weights, link) - coef, loss, info = optimize.fmin_l_bfgs_b( - func, coef, fprime=fprime, - args=args, - iprint=(self.verbose > 0) - 1, pgtol=self.tol, - maxiter=self.max_iter) - if self.verbose > 0: - if info["warnflag"] == 1: - warnings.warn("lbfgs failed to converge." 
- " Increase the number of iterations.", - ConvergenceWarning) - elif info["warnflag"] == 2: - warnings.warn("lbfgs failed for the reason: {0}".format( - info["task"])) - self.n_iter_ = info['nit'] - elif self.solver == 'newton-cg': - func = family._deviance - grad = family._deviance_derivative + elif self.solver in ['lbfgs', 'newton-cg']: + def func(coef, *args): + if P2.ndim == 1: + L2 = safe_sparse_dot(coef.T, P2*coef) + else: + L2 = safe_sparse_dot(coef.T, safe_sparse_dot(P2, coef)) + # A[np.diag_indices_from(A)] += P2 + return 0.5*family._deviance(coef, *args) + 0.5*L2 + + def fprime(coef, *args): + if P2.ndim == 1: + L2 = P2*coef + else: + L2 = safe_sparse_dot(P2, coef) + return 0.5*family._deviance_derivative(coef, *args) + L2 def grad_hess(coef, X, y, weights, link): - grad = (family._deviance_derivative( - coef, X, y, weights, link)) - hessian = (family._deviance_hessian( - coef, X, y, weights, link)) + if P2.ndim == 1: + L2 = P2*coef + else: + L2 = safe_sparse_dot(P2, coef) + grad = 0.5*family._deviance_derivative( + coef, X, y, weights, link) + L2 + hessian = 0.5*family._deviance_hessian( + coef, X, y, weights, link) + if P2.ndim == 1: + hessian[np.diag_indices_from(hessian)] += P2 + else: + hessian += P2 def Hs(s): - ret = np.dot(hessian, s) + ret = safe_sparse_dot(hessian, s) return ret return grad, Hs - hess = grad_hess + args = (Xnew, y, weights, link) - coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, - maxiter=self.max_iter, tol=self.tol) - self.coef_ = coef + if self.solver == 'lbfgs': + coef, loss, info = optimize.fmin_l_bfgs_b( + func, coef, fprime=fprime, args=args, + iprint=(self.verbose > 0) - 1, pgtol=self.tol, + maxiter=self.max_iter) + if self.verbose > 0: + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." + " Increase the number of iterations.", + ConvergenceWarning) + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}" + .format(info["task"])) + self.n_iter_ = info['nit'] + elif self.solver == 'newton-cg': + coef, n_iter_i = newton_cg(grad_hess, func, fprime, coef, + args=args, maxiter=self.max_iter, + tol=self.tol) + + ####################################################################### + # 4. postprocessing # + ####################################################################### if self.fit_intercept: self.intercept_ = coef[0] self.coef_ = coef[1:] else: + # set intercept to zero as the other linear models do self.intercept_ = 0. self.coef_ = coef @@ -988,8 +1290,8 @@ def score(self, X, y, sample_weight=None): :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` is the null deviance, i.e. the deviance of a model with intercept alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean - :math:`\bar{y}` is average by sample_weight. In the case of a Normal - distribution, this D^2 equals R^2. + :math:`\bar{y}` is averaged by sample_weight. In the case of a Normal + distribution, D^2 equals R^2. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). 
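The D^2 defined here can be reproduced by hand; with unit sample weights and the Normal
deviance it reduces to the familiar R^2. A standalone sketch with made-up predictions
(not part of the patch)::

    import numpy as np

    y_true = np.array([1.0, 2.0, 3.0, 4.0])
    y_pred = np.array([1.1, 1.9, 3.2, 3.8])

    # the Normal/Gaussian unit deviance is the squared error, so D is just the RSS
    dev = np.sum((y_true - y_pred) ** 2)
    dev_null = np.sum((y_true - y_true.mean()) ** 2)   # intercept-only model
    d2 = 1 - dev / dev_null                            # identical to R^2 here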
diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index a4d4ea8650860..df0413b4d7836 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,19 +1,34 @@ import numpy as np from sklearn.linear_model.glm import ( - # Link, IdentityLink, + Link, + IdentityLink, LogLink, TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, - # GeneralizedHyperbolicSecand, + GeneralizedHyperbolicSecand, GeneralizedLinearRegressor) +from sklearn.linear_model.ridge import Ridge from sklearn.utils.testing import ( - # assert_equal, + assert_equal, assert_almost_equal, assert_array_equal, assert_array_almost_equal) +def test_link_properties(): + """Test link inverse and derivative + """ + rng = np.random.RandomState(0) + x = rng.rand(100)*100 + from sklearn.linear_model.glm import Link + for link in vars()['Link'].__subclasses__(): + link = link() + assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) + assert_almost_equal(link.inverse_derivative(link.link(x)), + 1/link.derivative(x), decimal=10) + + def test_family_bounds(): """Test the valid range of distributions """ @@ -42,8 +57,23 @@ def test_family_bounds(): assert_array_equal(result, [False, False, True]) +def test_deviance_zero(): + """Test deviance(y,y) = 0 for different families + """ + for family in [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=-2.5), + TweedieDistribution(power=-1), + TweedieDistribution(power=1.5), + TweedieDistribution(power=2.5), + TweedieDistribution(power=4), + GeneralizedHyperbolicSecand()]: + assert_almost_equal(family.deviance(0.1, 0.1), 0, decimal=10) + assert_almost_equal(family.deviance(1.5, 1.5), 0, decimal=10) + + def test_glm_identiy_regression(): - """Test linear regression on a simple dataset + """Test GLM regression with identity link on a simple dataset """ coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T @@ -55,13 +85,13 @@ def test_glm_identiy_regression(): for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( - family=family, fit_intercept=False, solver=solver) + alpha=0, family=family, fit_intercept=False, solver=solver) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) def test_glm_log_regression(): - """Test linear regression on a simple dataset + """Test GLM regression with log link on a simple dataset """ coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T @@ -73,10 +103,79 @@ def test_glm_log_regression(): for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( - family=family, link=LogLink(), fit_intercept=False, - solver=solver, start_params='ols') + alpha=0, family=family, link=LogLink(), fit_intercept=False, + solver=solver, start_params='least_squares') res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) +def test_normal_ridge(): + """Test ridge regression for Normal distributions + + Compare to test_ridge in test_ridge.py. 
+ """ + rng = np.random.RandomState(0) + alpha = 1.0 + + # With more samples than features + n_samples, n_features, n_predict = 6, 5, 10 + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + T = rng.randn(n_predict, n_features) + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge.fit(X, y) + for solver in ['irls', 'lbfgs', 'newton-cg']: + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=True, solver=solver) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=False, solver='irls') + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + # With more features than samples + n_samples, n_features, n_predict = 5, 10, 10 + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + T = rng.randn(n_predict, n_features) + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge.fit(X, y) + for solver in ['irls', 'lbfgs', 'newton-cg']: + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=True, solver=solver) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=False, solver='irls') + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + # TODO: Test compatibility with R's glm, glmnet From 5b46c23977a8e386987a2767b2c12d4296d332af Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 19 Sep 2017 00:40:34 +0200 Subject: [PATCH 007/209] [WIP] Add Generalized Linear Models (#9405) * fix some bugs in user guide linear_model.rst * fix some pep8 issues in test_glm.py --- doc/modules/linear_model.rst | 28 +++++++++++++++----------- sklearn/linear_model/tests/test_glm.py | 7 ++++--- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 51b3821fa6207..98736facd9b76 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -880,14 +880,14 @@ Generalized linear regression ============================= :class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two -ways [1]_. First, the predicted values :math:`\hat{y}` are linked to a linear +ways [8]_. 
First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as .. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [2]_. The objective function beeing minimized +exponential dispersion model (EDM) [9]_. The objective function beeing minimized becomes .. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 @@ -914,16 +914,20 @@ it is convenient to apply a link function different from the identity link :math:`h(Xw)=\exp(Xw)`. Note that the feature matrix `X` should be standardized before fitting. This -ensures that the penalty treats features equally. +ensures that the penalty treats features equally. The estimator can be used as +follows:: - >>> from sklearn import linear_model - >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, l1_ratio=0) - >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> from sklearn.linear_model import GeneralizedLinearRegressor + >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) + GeneralizedLinearRegressor(alpha=0.5, copy_X=True, family='poisson', + fit_dispersion='chisqr', fit_intercept=True, l1_ratio=0, + link='log', max_iter=100, solver='irls', start_params=None, + tol=0.0001, verbose=0, warm_start=False) >>> reg.coef_ array([ 0.24630255, 0.43373521]) - >>> reg.intercept_ - -0.76383575123143277 + >>> reg.intercept_ #doctest: +ELLIPSIS + -0.76383575... Mathematical formulation ------------------------ @@ -969,7 +973,7 @@ Two remarks: * The deviances for at least Normal, Poisson and Gamma distributions are strictly consistent scoring functions for the mean :math:`\mu`, see Eq. - (19)-(20) in [3]_. + (19)-(20) in [10]_. * If you want to model a frequency, i.e. counts per exposure (time, volume, ...) you can do so by a Poisson distribution and passing @@ -979,12 +983,12 @@ Two remarks: .. topic:: References: - .. [1] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + .. [8] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - .. [2] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. + .. [9] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. See also `Exponential dispersion model. `_ - .. [3] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ + .. [10] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. 
`_ Stochastic Gradient Descent - SGD ================================= diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index df0413b4d7836..b62b51b5bcb9e 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -2,7 +2,7 @@ from sklearn.linear_model.glm import ( Link, - IdentityLink, + # IdentityLink, LogLink, TweedieDistribution, NormalDistribution, PoissonDistribution, @@ -21,8 +21,9 @@ def test_link_properties(): """ rng = np.random.RandomState(0) x = rng.rand(100)*100 - from sklearn.linear_model.glm import Link - for link in vars()['Link'].__subclasses__(): + # from sklearn.linear_model.glm import Link + # for link in vars()['Link'].__subclasses__(): + for link in Link.__subclasses__(): link = link() assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) assert_almost_equal(link.inverse_derivative(link.link(x)), From 10dd14603a5fc04f53ca4920621434aaff662064 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 3 Dec 2017 19:54:57 +0100 Subject: [PATCH 008/209] [WIP] Add Generalized Linear Models (#9405) * added test: ridge poisson with log-link compared to glmnet * fix ValueError message for l1_ratio * fix ValueError message for P2 * string comparison: use '==' and '!=' instead of 'is' and 'is not' * fix RuntimeWarnings in unit_deviance of poisson: x*log(x) as xlogy * added test for fisher matrix * added test for family argument --- sklearn/linear_model/glm.py | 29 ++++++----- sklearn/linear_model/tests/test_glm.py | 72 ++++++++++++++++++++++++-- 2 files changed, 84 insertions(+), 17 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 2db3c56d5e1c1..93ce358a8a874 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -15,6 +15,8 @@ # TODO: Make it as much consistent to other estimators in linear_model as # possible # TODO: options P1 and P2 in fit() or in __init__()??? +# TODO: Include further classes in class.rst? ExponentialDispersionModel? +# TweedieDistribution? # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -42,7 +44,7 @@ from abc import ABCMeta, abstractmethod, abstractproperty import numbers import numpy as np -from scipy import linalg, optimize, sparse +from scipy import linalg, optimize, sparse, special import warnings from .base import LinearRegression from .coordinate_descent import ElasticNet @@ -340,7 +342,7 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): = \mathbf{X}^T W \mathbf{X} \,, with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, - see func:`score_function`. + see func:`_score`. """ n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -363,7 +365,7 @@ def _observed_information(self, coef, phi, X, y, weights, link): \mathbf{H}(\boldsymbol{w}) = -\frac{\partial^2 loglike}{\partial\boldsymbol{w} \partial\boldsymbol{w}^T} - = \mathbf{X}^T \legt[ + = \mathbf{X}^T \left[ - \mathbf{D}' \mathbf{R} + \mathbf{D}^2 \mathbf{V} \mathbf{R} + \mathbf{D}^2 @@ -393,7 +395,7 @@ def _deviance_derivative(self, coef, X, y, weights, link): r"""The derivative w.r.t. `coef` (:math:`w`) of the deviance as a function of the coefficients `coef`. This is equivalent to :math:`-2\phi` times the score function - :func:`score_function` (derivative of the log-likelihood). + :func:`_score` (derivative of the log-likelihood). 
""" score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, link=link) @@ -510,7 +512,8 @@ def unit_deviance(self, y, mu): return (y-mu)**2 if p == 1: # PoissonDistribution - return 2 * (np.where(y == 0, 0, y*np.log(y/mu))-y+mu) + # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 + return 2 * (special.xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) @@ -921,7 +924,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, if (not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1): raise ValueError("l1_ratio must be in interval [0, 1]; got" - " (l1_ratio={0]})".format(self.l1_ratio)) + " (l1_ratio={0})".format(self.l1_ratio)) if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) @@ -948,7 +951,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is not None and start_params is not 'least_squares': + if start_params is not None and start_params != 'least_squares': start_params = np.atleast_1d(start_params) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or (start_params.ndim != 1)): @@ -986,7 +989,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " "n_features=X.shape[1]; " - "got (P2.shape=({0},{1})), needed ({3},{3})" + "got (P2.shape=({0}, {1})), needed ({2}, {2})" .format(P2.shape[0], P2.shape[1], X.shape[1])) family = self._family_instance @@ -1058,7 +1061,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, else: # with L1 penalty, start with coef = 0 coef = np.zeros(n_features) - elif self.start_params is 'least_squares': + elif self.start_params == 'least_squares': if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) reg.fit(Xnew, link.link(y)) @@ -1277,11 +1280,9 @@ def estimate_phi(self, y, X, sample_weight): dev = self._family_instance.deviance(y, mu, sample_weight) return dev/(n_samples - n_features) -# TODO: Fix "AssertionError: -0.28014056555724598 not greater than 0.5" -# in check_estimator for score -# from sklearn.utils.estimator_checks import check_estimator -# from sklearn.linear_model import GeneralizedLinearRegressor -# check_estimator(GeneralizedLinearRegressor) + # Note: check_estimator(GeneralizedLinearRegressor) might raise + # "AssertionError: -0.28014056555724598 not greater than 0.5" + # unless GeneralizedLinearRegressor has a score which passes the test. def score(self, X, y, sample_weight=None): r"""Returns D^2, a generalization of the coefficient of determination R^2, which uses deviance instead of squared error. diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index b62b51b5bcb9e..de7de90db967b 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,4 +1,6 @@ import numpy as np +from numpy.testing import assert_allclose +import scipy as sp from sklearn.linear_model.glm import ( Link, @@ -73,6 +75,46 @@ def test_deviance_zero(): assert_almost_equal(family.deviance(1.5, 1.5), 0, decimal=10) +def test_fisher_matrix(): + """Test the Fisher matrix numerically. 
+ Trick: Use numerical differentiation with y = mu""" + for family in [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution()]: + link = LogLink() + rng = np.random.RandomState(0) + coef = np.array([-2, 1, 0, 1, 2.5]) + phi = 0.5 + X = rng.randn(10, 5) + lin_pred = np.dot(X, coef) + mu = link.inverse(lin_pred) + weights = rng.randn(10)**2 + 1 + fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + approx = np.array([]).reshape(0, coef.shape[0]) + for i in range(coef.shape[0]): + def f(coef): + return -family._score(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link)[i] + approx = np.vstack( + [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)] + ) + assert_allclose(fisher, approx, rtol=1e-3) + + +def test_glm_family_argument(): + """Test GLM family argument set as string + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for (f, fam) in [('normal', NormalDistribution()), + ('poisson', PoissonDistribution()), + ('gamma', GammaDistribution()), + ('inverse.gaussian', InverseGaussianDistribution())]: + glm = GeneralizedLinearRegressor(family=f, fit_intercept=False, + alpha=0).fit(X, y) + assert_equal(type(glm._family_instance), type(fam)) + + def test_glm_identiy_regression(): """Test GLM regression with identity link on a simple dataset """ @@ -82,7 +124,8 @@ def test_glm_identiy_regression(): families = ( NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecand()) for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( @@ -100,7 +143,8 @@ def test_glm_log_regression(): families = ( NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecand()) for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( @@ -179,4 +223,26 @@ def test_normal_ridge(): assert_array_almost_equal(glm.predict(T), ridge.predict(T)) -# TODO: Test compatibility with R's glm, glmnet +def test_poisson_ridge(): + """Test ridge regression with poisson family and LogLink + + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.12889386979 + # a 0.29019207995 + # b 0.03741173122 + X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, family='poisson', + link='log', tol=1e-10) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, decimal=7) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=7) From 72485b63e89879e65381bca12152b54600fd3970 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 8 Jan 2018 22:13:45 +0100 Subject: [PATCH 009/209] [WIP] Add Generalized Linear Models (#9405) * put arguments P1, P2 and check_input from fit to __init__ * added check_input test: is P2 positive definite? 
* added solver option: 'auto' --- sklearn/linear_model/glm.py | 181 +++++++++++++++++++++++------------- 1 file changed, 117 insertions(+), 64 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 93ce358a8a874..3f6b91026ef9b 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -14,7 +14,7 @@ # TODO: Write examples and more docu # TODO: Make it as much consistent to other estimators in linear_model as # possible -# TODO: options P1 and P2 in fit() or in __init__()??? +# TODO: which dtype to force for y and X? Which for P1, P2? # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? @@ -709,6 +709,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. + P1 : None or array of shape (n_features*, ), optional\ + (default=None) + With this array, you can exclude coefficients from the L1 penalty. + Set the corresponding value to 1 (include) or 0 (exclude). The + default value ``None`` is the same as an array of ones. + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). + + P2 : None or array of shape (n_features*, n_features*) + With this square matrix the L2 penalty is calculated as `w P2 w`. + This gives a fine control over this penalty (Tikhonov + regularization). + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). P2 must be positive semi-definite. + fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). @@ -727,13 +742,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'irls', 'newton-cg', 'lbfgs'}, optional (defaul='irls') + solver : {'auto', 'irls', 'newton-cg', 'lbfgs'}, optional (defaul='auto') Algorithm to use in the optimization problem. - - 'irls' is iterated reweighted least squares. It is the standard - algorithm for GLMs. + - 'irls' is iterated reweighted least squares (Fisher scoring). + It is the standard algorithm for GLMs. Cannot deal with + L1 penalties. + + - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. - - 'newton-cg', 'lbfgs' + - 'auto' sets 'irls'. max_iter : int, optional (default=100) TODO @@ -766,6 +784,11 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): copy_X : boolean, optional, default True If ``True``, X will be copied; else, it may be overwritten. + check_input : boolean, optional (default=True) + Allow to bypass several checks on input: y values in range of family, + sample_weights non-negative, P2 positive semi-definite. + Don't use this parameter unless you know what you do. + verbose : int, optional (default=0) For the lbfgs solver set verbose to any positive number for verbosity. 
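With ``P1``/``P2`` now available in ``__init__``, a fine-grained (Tikhonov) L2 penalty can
be set up once and reused across fits. An illustrative construction, not taken from the
patch, that penalizes differences between neighbouring coefficients::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor

    n_features = 5
    D = np.diff(np.eye(n_features), axis=0)    # first-order difference matrix
    P2 = D.T @ D                               # positive semi-definite penalty matrix

    reg = GeneralizedLinearRegressor(alpha=0.5, l1_ratio=0, P2=P2,
                                     family='poisson', link='log')
    # reg.fit(X, y) would then shrink jumps between adjacent coefficients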
@@ -790,13 +813,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): ---------- TODO """ - def __init__(self, alpha=1.0, l1_ratio=0, + def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', - fit_dispersion='chisqr', solver='irls', max_iter=100, + fit_dispersion='chisqr', solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params=None, copy_X=True, - verbose=0): + check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio + self.P1 = P1 + self.P2 = P2 self.fit_intercept = fit_intercept self.family = family self.link = link @@ -807,10 +832,10 @@ def __init__(self, alpha=1.0, l1_ratio=0, self.warm_start = warm_start self.start_params = start_params self.copy_X = copy_X + self.check_input = check_input self.verbose = verbose - def fit(self, X, y, sample_weight=None, P1=None, P2=None, - check_input=True): + def fit(self, X, y, sample_weight=None): """Fit a generalized linear model. Parameters @@ -823,31 +848,13 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, sample_weight : array of shape (n_samples, ) or None,\ optinal (default=None) - Individual weights for each sample. - Var[Y_i]=phi/weight_i * v(mu) - If Y_i ~ EDM(mu, phi/w_i) then + Individual weights w_i for each sample. Note that for an + Exponential Dispersion Model (EDM), one has + Var[Y_i]=phi/w_i * v(mu). + If Y_i ~ EDM(mu, phi/w_i), then sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. - P1 : None or array of shape (n_features*, ), optional\ - (default=None) - With this array, you can exclude coefficients from ths L1 penalty. - Set the corresponding value to 1 (include) or 0 (exclude). The - default value ``None`` is the same as an array of ones. - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). - - P2 : None or array of shape (n_features*, n_features*) - With this square matrix the L2 penalty is calculated as `w P2 w`. - This gives a fine control over this penalty (Tikhonov - regularization). - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). - - check_input : boolean, optional (default=True) - Allow to bypass several input checking. - Don't use this parameter unless you know what you do. - Returns ------- self : returns an instance of self. 
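One practical reading of ``sample_weight`` as documented above is modelling a rate per
exposure (the 'scaled' Poisson case mentioned in the class notes). A hedged sketch with
hypothetical counts and exposures::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor

    counts = np.array([0., 1., 3., 10.])
    exposure = np.array([1., 1., 2., 5.])      # e.g. observation time per row
    X = np.array([[0.], [0.], [1.], [1.]])

    y = counts / exposure                       # model the rate ...
    reg = GeneralizedLinearRegressor(alpha=0, family='poisson', link='log')
    reg.fit(X, y, sample_weight=exposure)       # ... each row weighted by its exposure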
@@ -872,16 +879,6 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, elif weights.shape[0] != y.shape[0]: raise ValueError("Sample weights must have the same length as" " y") - # IMPORTANT NOTE: Since we want to minimize - # 1/(2*sum(sample_weight)) * deviance + L1 + L2, - # deviance = sum(sample_weight * unit_deviance), - # we rescale weights such that sum(weights) = 1 and this becomes - # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) - weights = weights/np.sum(weights) - - if not isinstance(check_input, bool): - raise ValueError("The argument check_input must be bool; got " - "(check_input={0})".format(check_input)) # 1.2 validate arguments of __init__ ################################## # Garantee that self._family_instance is an instance of class @@ -928,17 +925,22 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver not in ['irls', 'lbfgs', 'newton-cg']: - raise ValueError("GLM Regression supports only irls, lbfgs and" - "newton-cg solvers, got {0}".format(self.solver)) + if self.solver == 'auto': + solver = 'irls' + else: + solver = self.solver + if solver not in ['irls', 'lbfgs', 'newton-cg']: + raise ValueError("GeneralizedLinearRegressor supports only irls, " + "lbfgs and newton-cg solvers, got {0}" + "".format(solver)) if self.alpha > 0: if (self.l1_ratio > 0 and - self.solver not in []): + solver not in []): # TODO: Add solver for L1 # raise ValueError("The solver option (solver={0}) is not " # "appropriate for the chosen penalty which" # " includes L1 (alpha={1})." - # .format(self.solver, self.alpha)) + # .format(solver, self.alpha)) raise NotImplementedError("Currently, no solver is implemented" " that can deal with L1 penalties.") if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: @@ -964,28 +966,32 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) + if not isinstance(self.check_input, bool): + raise ValueError("The attribute check_input must be bool; got " + "(check_input={0})".format(self.check_input)) - if P1 is None: + if self.P1 is None: P1 = np.ones(X.shape[1]) else: - P1 = np.atleast_1d(P1) + P1 = np.atleast_1d(np.copy(self.P1)) if (P1.shape[0] != X.shape[1]) or (P1.ndim != 1): raise ValueError("P1 must be either None or an 1D array with " "the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." 
.format(P1.shape[0], X.shape[1])) - if P2 is None: + if self.P2 is None: P2 = np.ones(X.shape[1]) if sparse.issparse(X): P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), shape=(X.shape[1], X.shape[1]))).tocsr() else: - P2 = check_array(P2, accept_sparse=['csr', 'csc', 'coo'], + P2 = check_array(self.P2, copy=True, + accept_sparse=['csr', 'csc', 'coo'], dtype="numeric", ensure_2d=True) - if ((P2.shape[0] != P2.shape[1]) or - (P2.shape[0] != X.shape[1]) or - (P2.ndim != 2)): + if ((P2.ndim != 2) or + (P2.shape[0] != P2.shape[1]) or + (P2.shape[0] != X.shape[1])): raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " "n_features=X.shape[1]; " @@ -1020,16 +1026,39 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, P2 *= l2 # 1.3 additional validations ########################################## - if check_input: + if self.check_input: if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" .format(family.__class__.__name__)) + if not np.all(weights >= 0): + raise ValueError("Sample weights must be non-negative.") + # check that P2 is positive semidefinite + # np.linalg.cholesky(P2) 'only' asserts positive definite + if self.P2 is not None: + if sparse.issparse(P2): + # TODO: check sparse P2 for non-negativeness + raise NotImplementedError("Check sparse P2 for " + "non-negaitveness is not yet " + "implemented.") + elif P2.ndim == 2: + if not np.all(np.linalg.eigvals(P2) >= -1e-15): + raise ValueError("P2 must be positive definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? ####################################################################### - # 2. initialization of coef = (intercept_, coef_) # + # 2. rescaling of weights (sample_weight) # + ####################################################################### + # IMPORTANT NOTE: Since we want to minimize + # 1/(2*sum(sample_weight)) * deviance + L1 + L2, + # deviance = sum(sample_weight * unit_deviance), + # we rescale weights such that sum(weights) = 1 and this becomes + # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights = weights/np.sum(weights) + + ####################################################################### + # 3. initialization of coef = (intercept_, coef_) # ####################################################################### # Note: Since phi=self.dispersion_ does not enter the estimation # of mu_i=E[y_i], set it to 1. @@ -1082,13 +1111,13 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, coef = start_params ####################################################################### - # 3. fit # + # 4. 
fit # ####################################################################### # algorithms for optimiation # TODO: Parallelize it self.n_iter_ = 0 converged = False - # 3.1 IRLS ############################################################ + # 4.1 IRLS ############################################################ # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' # Obj = objective function = 1/2 Dev + l2/2 w P2 w # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 @@ -1104,7 +1133,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, # Obj'' ~ X' W X + l2 P2 # (1): w = (X' W X + l2 P2)^-1 X' W z, with z = eta + D^-1 (y-mu) # Note: P2 = l2*P2, see above - if self.solver == 'irls': + if solver == 'irls': # eta = linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = link.inverse(eta) @@ -1150,10 +1179,10 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, "of iterations (currently {0})" .format(self.max_iter), ConvergenceWarning) - # 3.2 L-BFGS and Newton-CG ############################################ + # 4.2 L-BFGS and Newton-CG ############################################ # TODO: performance: make one function return both deviance and # gradient of deviance - elif self.solver in ['lbfgs', 'newton-cg']: + elif solver in ['lbfgs', 'newton-cg']: def func(coef, *args): if P2.ndim == 1: L2 = safe_sparse_dot(coef.T, P2*coef) @@ -1190,7 +1219,7 @@ def Hs(s): args = (Xnew, y, weights, link) - if self.solver == 'lbfgs': + if solver == 'lbfgs': coef, loss, info = optimize.fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, @@ -1204,13 +1233,37 @@ def Hs(s): warnings.warn("lbfgs failed for the reason: {0}" .format(info["task"])) self.n_iter_ = info['nit'] - elif self.solver == 'newton-cg': + elif solver == 'newton-cg': coef, n_iter_i = newton_cg(grad_hess, func, fprime, coef, args=args, maxiter=self.max_iter, tol=self.tol) + # 4.3 coordinate descent ############################################## + # Reference: Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + # An Improved GLMNET for L1-regularized Logistic Regression, + # Journal of Machine Learning Research 13 (2012) 1999-2030 + # Note: Use Fisher matrix instead of Hessian + # + # 1. find optimal descent direction d by minimizing + # min_d F(w+d) = min_d F(w+d) - F(w) + # F = f + g; f(w) = 1/2 dev; g(w) = 1/2*w*P2*w + ||P1*w||_1 + # 2. quadrdatic approx of f(w+d)-f(w): + # q(d) = f'(w)*d +1/2 d*H*d + # min_d q(d) + g(w+d) - g(w) + # 3. coordinate descent by updating coordinate j (d -> d+z*e_j): + # min_z q(d+z*e_j) + g(w+d+z*e_j) - g(w) + # = min_z q(d+z e_j) - q(d) + g(w+d+z*e_j) - g(w+d) + # TODO + # elif solver == 'cd': + # line search parameters + # (beta, sigma) = (0.5, 0.01) + # for iteration k from 1 to maxiter + # for coordinate j sample at random + # np.random.choice(coord, replace = False) + # + ####################################################################### - # 4. postprocessing # + # 5. 
postprocessing # ####################################################################### if self.fit_intercept: self.intercept_ = coef[0] From 5c1369bde863a73aff46a502acd70e58e06dcb85 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 24 Jan 2018 15:22:08 +0100 Subject: [PATCH 010/209] [WIP] Add Generalized Linear Models (#9405) * added coordinate descent solver * skip doctest for GeneralizedLinearRegressor example * symmetrize P2 => use P2 = 1/2 (P2+P2') * better validation of parameter start_params --- doc/modules/linear_model.rst | 21 +- sklearn/linear_model/glm.py | 353 ++++++++++++++++++++----- sklearn/linear_model/tests/test_glm.py | 82 ++++-- 3 files changed, 365 insertions(+), 91 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 98736facd9b76..834466e494a4a 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -894,9 +894,9 @@ becomes +\frac{\alpha(1-\rho)}{2} w^T P_2 w with sample weights :math:`s`. -:math:`P_1` can be used to exclude some of the coefficients in the L1 -penalty, :math:`P_2` (must be positive semi-definite) allows for a more -versatile L2 penalty. +:math:`P_1` (diagonal matrix) can be used to exclude some of the coefficients in +the L1 penalty, the matrix :math:`P_2` (must be positive semi-definite) allows +for a more versatile L2 penalty. Use cases, where a loss different from the squared loss might be appropriate, are the following: @@ -908,22 +908,23 @@ are the following: * If the target values seem to be heavy tailed, you might try an Inverse Gaussian deviance (or even higher variance power of the Tweedie family). Since the linear predictor :math:`Xw` can be negative and -Poisson, Gamma and Inverse Gaussian distributions don't have negative values, +Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link :math:`h(x)=x` that guarantees the non-negativeness, e.g. the log-link with :math:`h(Xw)=\exp(Xw)`. Note that the feature matrix `X` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as -follows:: +follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') - >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) - GeneralizedLinearRegressor(alpha=0.5, copy_X=True, family='poisson', - fit_dispersion='chisqr', fit_intercept=True, l1_ratio=0, - link='log', max_iter=100, solver='irls', start_params=None, - tol=0.0001, verbose=0, warm_start=False) + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +SKIP + GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, + copy_X=True, family='poisson', fit_dispersion='chisqr', + fit_intercept=True, l1_ratio=0, link='log', max_iter=100, + random_state=None, selection='random', solver='auto', + start_params=None, tol=0.0001, verbose=0, warm_start=False) >>> reg.coef_ array([ 0.24630255, 0.43373521]) >>> reg.intercept_ #doctest: +ELLIPSIS diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 3f6b91026ef9b..3de82c20f33cf 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -3,20 +3,21 @@ """ # Author: Christian Lorentzen +# some parts and tricks stolen from other sklearn files. 
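The documentation example above fits on raw features; since the penalty strength would otherwise differ per feature, a sketch of the recommended workflow is to standardize first, for instance in a pipeline (illustrative only, assuming this patch is installed so that ``GeneralizedLinearRegressor`` is importable)::

    import numpy as np
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import GeneralizedLinearRegressor

    X = np.array([[0., 0.], [0., 1.], [2., 2.], [3., 1.]])
    y = np.array([0., 1., 2., 2.])

    # scale features to zero mean and unit variance, then fit the penalized GLM
    reg = make_pipeline(
        StandardScaler(),
        GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log'))
    reg.fit(X, y)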
# License: BSD 3 clause # TODO: Write more tests -# TODO: Add l1-penalty (elastic net) +# TODO: Write examples and more docu # TODO: deal with option self.copy_X # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation -# TODO: Write examples and more docu -# TODO: Make it as much consistent to other estimators in linear_model as -# possible -# TODO: which dtype to force for y and X? Which for P1, P2? +# TODO: Should GeneralizedLinearRegressor inherit from LinearModel? +# So far, it does not. # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? +# TODO: Negative values in P1 are not allowed so far. They could be used form +# group lasse. # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -52,10 +53,11 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six +from ..externals.six.moves import xrange from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, check_random_state class Link(six.with_metaclass(ABCMeta)): @@ -309,7 +311,9 @@ def _score(self, coef, phi, X, y, weights, link): .. math: - \mathbf{score}(\boldsymbol{w}) = \mathbf{X}^T \mathbf{D} + \mathbf{score}(\boldsymbol{w}) + = \frac{\partial loglike}{\partial\boldsymbol{w}} + = \mathbf{X}^T \mathbf{D} \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and @@ -411,6 +415,29 @@ def _deviance_hessian(self, coef, X, y, weights, link): weights=weights, link=link) return 2*info_matrix + def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): + """Calculates eta (linear predictor), mu, score function (derivative + of log-likelihood) and Fisher matrix (all with phi=1) all in one go""" + n_samples, n_features = X.shape + # eta = linear predictor + eta = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(eta) + sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) + d1 = link.inverse_derivative(eta) # = h'(eta) + # Alternatively: + # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g + # d1 = 1./link.derivative(mu) + d1_sigma_inv = sparse.dia_matrix((sigma_inv*d1, 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d1_sigma_inv, (y-mu), dense_output=True) + score = safe_sparse_dot(X.T, temp, dense_output=True) + # + d2_sigma_inv = sparse.dia_matrix((sigma_inv*(d1**2), 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) + fisher = safe_sparse_dot(X.T, temp, dense_output=False) + return eta, mu, score, fisher + def starting_mu(self, y, weights=1): """Starting values for the mean mu_i in (unpenalized) IRLS.""" return ((weights*y+np.mean(weights*y)) / @@ -670,8 +697,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): alpha = a + b and l1_ratio = a / (a + b) The parameter `l1_ratio` corresponds to alpha in the glmnet R package while - alpha corresponds to the lambda parameter in glmnet. Specifically, l1_ratio - = 1 is the lasso penalty. + 'alpha' corresponds to the lambda parameter in glmnet. Specifically, + l1_ratio = 1 is the lasso penalty. Read more in the :ref:`User Guide `. 
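For intuition, a dense NumPy sketch of the quantities computed by the ``_eta_mu_score_fisher`` helper added above, specialized to a Poisson GLM with log link, unit weights and phi=1 (the helper itself also supports sparse matrices and arbitrary families and links)::

    import numpy as np

    def eta_mu_score_fisher(coef, X, y):
        eta = X @ coef                 # linear predictor
        mu = np.exp(eta)               # inverse log link h(eta)
        sigma_inv = 1. / mu            # 1/Var[Y], Poisson unit variance v(mu) = mu
        d1 = mu                        # h'(eta) = exp(eta) for the log link
        # score = X' D Sigma^{-1} (y - mu)
        score = X.T @ (d1 * sigma_inv * (y - mu))
        # Fisher matrix = X' D^2 Sigma^{-1} X
        fisher = (X * (sigma_inv * d1 ** 2)[:, np.newaxis]).T @ X
        return eta, mu, score, fisher

    rng = np.random.RandomState(42)
    X = rng.rand(20, 3)
    y = rng.poisson(lam=np.exp(X @ np.array([0.1, -0.2, 0.3])))
    eta, mu, score, fisher = eta_mu_score_fisher(np.zeros(3), X, y)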
@@ -686,6 +713,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): TODO: For `alpha` > 0, the feature matrix `X` is assumed to be standardized. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. + Otherwise, the strength of the penalty is different for the features. TODO: Estimation of the dispersion parameter phi. @@ -742,19 +770,23 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'irls', 'newton-cg', 'lbfgs'}, optional (defaul='auto') + solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ + optional (defaul='auto') Algorithm to use in the optimization problem. + - 'auto' sets 'irls' if l1_ratio equals 0, else 'cd'. + - 'irls' is iterated reweighted least squares (Fisher scoring). It is the standard algorithm for GLMs. Cannot deal with L1 penalties. - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. - - 'auto' sets 'irls'. + - 'cd' is the coordinate descent algorithm. It can deal with L1 and + L2 penalties. max_iter : int, optional (default=100) - TODO + The maximal number of iterations for solver algorithms. tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, @@ -781,6 +813,23 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): This option only applies if ``warm_start=False`` or if fit is called the first time (``self.coef_`` does not exist). + selection : str, optional (default='random') + For the solver 'cd' (coordinate descent), the coordinates (features) + can be updated in either cyclic or random order. + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator that selects a random + feature to be updated for solver 'cd' (coordinate descent). + If int, random_state is the seed used by the random + number generator; if RandomState instance, random_state is the random + number generator; if None, the random number generator is the + RandomState instance used by `np.random`. Used when ``selection`` == + 'random'. + copy_X : boolean, optional, default True If ``True``, X will be copied; else, it may be overwritten. 
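As a plain-Python sketch of the dispatch rule for ``solver='auto'`` described above, together with the scaling of the penalty terms applied in ``fit`` (``P1`` is multiplied by ``alpha * l1_ratio``, ``P2`` by ``alpha * (1 - l1_ratio)``); the helper name is illustrative only::

    def resolve_solver(solver, l1_ratio):
        # 'auto' chooses IRLS for a pure L2 penalty, coordinate descent otherwise
        if solver == 'auto':
            return 'irls' if l1_ratio == 0 else 'cd'
        return solver

    alpha, l1_ratio = 0.5, 0.3
    l1 = alpha * l1_ratio          # multiplies the L1 term ||P1 w||_1
    l2 = alpha * (1 - l1_ratio)    # multiplies the L2 term 1/2 w' P2 w

    assert resolve_solver('auto', 0) == 'irls'
    assert resolve_solver('auto', 0.3) == 'cd'
    assert resolve_solver('lbfgs', 0) == 'lbfgs'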
@@ -816,7 +865,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', fit_dispersion='chisqr', solver='auto', max_iter=100, - tol=1e-4, warm_start=False, start_params=None, copy_X=True, + tol=1e-4, warm_start=False, start_params=None, + selection='random', random_state=None, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio @@ -831,6 +881,8 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, self.tol = tol self.warm_start = warm_start self.start_params = start_params + self.selection = selection + self.random_state = random_state self.copy_X = copy_X self.check_input = check_input self.verbose = verbose @@ -925,24 +977,21 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver == 'auto': - solver = 'irls' - else: - solver = self.solver - if solver not in ['irls', 'lbfgs', 'newton-cg']: + if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: raise ValueError("GeneralizedLinearRegressor supports only irls, " - "lbfgs and newton-cg solvers, got {0}" - "".format(solver)) - if self.alpha > 0: - if (self.l1_ratio > 0 and - solver not in []): - # TODO: Add solver for L1 - # raise ValueError("The solver option (solver={0}) is not " - # "appropriate for the chosen penalty which" - # " includes L1 (alpha={1})." - # .format(solver, self.alpha)) - raise NotImplementedError("Currently, no solver is implemented" - " that can deal with L1 penalties.") + "auto, lbfgs, newton-cg and cd solvers, got {0}" + "".format(self.solver)) + solver = self.solver + if self.solver == 'auto': + if self.l1_ratio == 0: + solver = 'irls' + else: + solver = 'cd' + if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): + raise ValueError("The chosen solver (solver={0}) can't deal " + "with L1 penalties, which are included with " + "(alpha={1}) and (l1_ratio={2})." 
+ .format(solver, self.alpha, self.l1_ratio)) if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: raise ValueError("Maximum number of iteration must be positive;" " got (max_iter={0!r})".format(self.max_iter)) @@ -953,7 +1002,14 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is not None and start_params != 'least_squares': + if start_params is None: + pass + elif isinstance(start_params, six.string_types): + if start_params not in ['least_squares']: + raise ValueError("The argument start_params must be None, " + "'least-squares' or an array of right length," + " got(start_params={0})".format(start_params)) + else: start_params = np.atleast_1d(start_params) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or (start_params.ndim != 1)): @@ -963,6 +1019,12 @@ def fit(self, X, y, sample_weight=None): .format(X.shape[1] + self.fit_intercept, start_params.shape[0], start_params.ndim)) + + if self.selection not in ['cyclic', 'random']: + raise ValueError("The argument selection must be 'cyclic' or " + "'random', got (selection={0})" + .format(self.selection)) + random_state = check_random_state(self.random_state) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) @@ -974,15 +1036,16 @@ def fit(self, X, y, sample_weight=None): P1 = np.ones(X.shape[1]) else: P1 = np.atleast_1d(np.copy(self.P1)) - if (P1.shape[0] != X.shape[1]) or (P1.ndim != 1): + if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): raise ValueError("P1 must be either None or an 1D array with " "the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." .format(P1.shape[0], X.shape[1])) if self.P2 is None: - P2 = np.ones(X.shape[1]) - if sparse.issparse(X): + if not sparse.issparse(X): + P2 = np.ones(X.shape[1]) + else: P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), shape=(X.shape[1], X.shape[1]))).tocsr() else: @@ -1024,6 +1087,12 @@ def fit(self, X, y, sample_weight=None): l2 = self.alpha * (1-self.l1_ratio) P1 *= l1 P2 *= l2 + # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') + # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric + if sparse.issparse(P2): + P2 = 0.5 * (P2 + P2.transpose()) + else: + P2 = 0.5 * (P2 + P2.T) # 1.3 additional validations ########################################## if self.check_input: @@ -1033,14 +1102,20 @@ def fit(self, X, y, sample_weight=None): .format(family.__class__.__name__)) if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") - # check that P2 is positive semidefinite + # check if P1 has only non-negative values, negative values might + # indicate group lasso in the future. 
+ if self.P1 is not None: + if not np.all(P1 >= 0): + raise ValueError("P1 must not have negative values.") + # check if P2 is positive semidefinite # np.linalg.cholesky(P2) 'only' asserts positive definite if self.P2 is not None: if sparse.issparse(P2): # TODO: check sparse P2 for non-negativeness - raise NotImplementedError("Check sparse P2 for " - "non-negaitveness is not yet " - "implemented.") + # raise NotImplementedError("Check sparse P2 for " + # "non-negaitveness is not yet " + # "implemented.") + pass elif P2.ndim == 2: if not np.all(np.linalg.eigvals(P2) >= -1e-15): raise ValueError("P2 must be positive definite.") @@ -1090,7 +1165,8 @@ def fit(self, X, y, sample_weight=None): else: # with L1 penalty, start with coef = 0 coef = np.zeros(n_features) - elif self.start_params == 'least_squares': + elif (isinstance(self.start_params, six.string_types) and + self.start_params == 'least_squares'): if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) reg.fit(Xnew, link.link(y)) @@ -1102,7 +1178,7 @@ def fit(self, X, y, sample_weight=None): reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: - # TODO: Does this make sense? + # TODO: Does this make sense at all? reg = ElasticNet(copy_X=True, fit_intercept=False, alpha=self.alpha, l1_ratio=self.l1_ratio) reg.fit(Xnew, link.link(y)) @@ -1125,14 +1201,17 @@ def fit(self, X, y, sample_weight=None): # D2 = link.inverse_derivative(eta)^2 = D^2 # W = D2/V(mu) # l2 = alpha * (1 - l1_ratio) - # Obj' = d(Obj)/d(w) = 1/2 Dev' + P2 w + # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w # = -X' D (y-mu)/V(mu) + l2 P2 w # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 # Use Fisher matrix instead of full info matrix -X'(...) X, # i.e. E[Dev''] with E[y-mu]=0: # Obj'' ~ X' W X + l2 P2 - # (1): w = (X' W X + l2 P2)^-1 X' W z, with z = eta + D^-1 (y-mu) - # Note: P2 = l2*P2, see above + # (1): w = (X' W X + l2 P2)^-1 X' W z, + # with z = eta + D^-1 (y-mu) + # Note: we already set P2 = l2*P2, see above + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: ' denotes derivative, but also transpose for matrices if solver == 'irls': # eta = linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) @@ -1150,9 +1229,8 @@ def fit(self, X, y, sample_weight=None): # working observations z = eta + (y-mu)/hp # solve A*coef = b - # A = X' W X + l2 P2, b = X' W z + # A = X' W X + P2, b = X' W z coef = _irls_step(Xnew, W, P2, z) - # updated linear predictor # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) @@ -1242,25 +1320,182 @@ def Hs(s): # Reference: Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin # An Improved GLMNET for L1-regularized Logistic Regression, # Journal of Machine Learning Research 13 (2012) 1999-2030 - # Note: Use Fisher matrix instead of Hessian + # Note: Use Fisher matrix instead of Hessian for H # # 1. find optimal descent direction d by minimizing # min_d F(w+d) = min_d F(w+d) - F(w) - # F = f + g; f(w) = 1/2 dev; g(w) = 1/2*w*P2*w + ||P1*w||_1 - # 2. quadrdatic approx of f(w+d)-f(w): - # q(d) = f'(w)*d +1/2 d*H*d - # min_d q(d) + g(w+d) - g(w) + # F = f + g, f(w) = 1/2 deviance, g(w) = 1/2 w*P2*w + ||P1*w||_1 + # 2. quadrdatic approximation of F(w+d)-F(w) = q(d): + # using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives + # q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + # + ||P1*(w+d)||_1 - ||P1*w||_1 + # min_d q(d) # 3. 
coordinate descent by updating coordinate j (d -> d+z*e_j): - # min_z q(d+z*e_j) + g(w+d+z*e_j) - g(w) - # = min_z q(d+z e_j) - q(d) + g(w+d+z*e_j) - g(w+d) - # TODO - # elif solver == 'cd': + # min_z q(d+z*e_j) + # = min_z q(d+z*e_j) - q(d) + # = min_z A_j z + 1/2 B_jj z^2 + # + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 + # A = f'(w) + d*H(w) + (w+d)*P2 + # B = H+P2 + # Note: we already set P2 = l2*P2, P1 = l1*P1, see above + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: f' = -score, H = Fisher matrix + elif solver == 'cd': # line search parameters - # (beta, sigma) = (0.5, 0.01) - # for iteration k from 1 to maxiter - # for coordinate j sample at random - # np.random.choice(coord, replace = False) - # + (beta, sigma) = (0.5, 0.01) + # max inner loops (cycles through all features) + max_inner_iter = 1000 + # some precalculations + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) + # initial stopping tolerance of inner loop + # use L1-norm of minimum-norm of subgradient of F + # fp_wP2 = f'(w) + w*P2 + if P2.ndim == 1: + fp_wP2 = -score + coef*P2 + else: + fp_wP2 = -score + safe_sparse_dot(coef, P2) + inner_tol = (np.where(coef == 0, + np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), + fp_wP2+np.sign(coef)*P1)) + inner_tol = linalg.norm(inner_tol, ord=1) + # outer loop + while self.n_iter_ < self.max_iter: + self.n_iter_ += 1 + # initialize search direction d (to be optimized) + d = np.zeros_like(coef) + # inner loop + # TODO: use sparsity (coefficient already 0 due to L1 penalty) + d = np.zeros_like(coef) + # A = f'(w) + d*H(w) + (w+d)*P2 + # B = H+P2 + # Note: f'=-score and H=fisher are updated at the end of outer + # iteration + B = fisher + if P2.ndim == 1: + coef_P2 = coef * P2 + B[np.diag_indices_from(B)] += P2 + else: + coef_P2 = safe_sparse_dot(coef, P2) + B += P2 + A = -score + coef_P2 # + d*(H+P2) but d=0 so far + inner_iter = 0 + while inner_iter < max_inner_iter: + inner_iter += 1 + if self.selection == 'random': + featurelist = random_state.permutation(n_features) + else: + featurelist = np.arange(n_features) + for j in featurelist: + # minimize_z: a z + 1/2 b z^2 + c |d+z| + # a = A_j + # b = B_jj > 0 + # c = |P1_j| = P1_j > 0, ee 1.3 + # d = w_j + d_j + # cf. https://arxiv.org/abs/0708.1485 Eqs. (3) - (4) + # with beta = z+d, beta_hat = d-a/b and gamma = c/b + # z = 1/b * S(bd-a,c) - d + # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding + a = A[j] + b = B[j, j] + if P1[j] == 0: + if b == 0: + z = 0 + else: + z = -a/b + elif a + P1[j] < b * (coef[j]+d[j]): + if b == 0: + z = 0 + else: + z = -(a + P1[j])/b + elif a - P1[j] > b * (coef[j]+d[j]): + if b == 0: + z = 0 + else: + z = -(a - P1[j])/b + else: + z = -(coef[j] + d[j]) + # update direction d + d[j] += z + # update A because d_j is now d_j+z + # A = f'(w) + d*H(w) + (w+d)*P2 + # => A += (H+P2)*e_j z = B_j * z + # Note: B is symmetric B = B.transpose + if sparse.issparse(B): + if sparse.isspmatrix_csc(B): + # slice columns + A += B[:, j].toarray().ravel() * z + else: + # slice rows + A += B[j, :].toarray().ravel() * z + else: + A += B[j, :] * z + # end of cycle + # stopping criterion for inner loop + # sum_i(|minimum-norm subgrad of q(d)_i|) + mn_subgrad = (np.where(coef + d == 0, + np.sign(A)*np.maximum(np.abs(A)-P1, 0), + A+np.sign(coef+d)*P1)) + mn_subgrad = np.sum(np.abs(mn_subgrad)) + if mn_subgrad <= inner_tol: + if inner_iter == 1: + inner_tol = inner_tol/4. 
+ break + # end of inner loop + # line search by sequence beta^k, k=0, 1, .. + # F(w + lambda d) - F(w) <= lambda * bound + # bound = sigma * (f'(w)*d + w*P2*d + # +||P1 (w+d)||_1 - ||P1 w||_1) + P1w_1 = linalg.norm(P1*coef, ord=1) + # Note: coef_P2 already calculated and still valid + bound = sigma * ( + safe_sparse_dot(-score, d) + + safe_sparse_dot(coef_P2, d) + + linalg.norm(P1*(coef+d), ord=1) - + P1w_1) + Fw = (0.5 * family.deviance(y, mu, weights) + + 0.5 * safe_sparse_dot(coef_P2, coef) + + P1w_1) + la = 1./beta + for k in range(20): + la *= beta # starts with la=1 + mu_wd = link.inverse(safe_sparse_dot(Xnew, coef+la*d, + dense_output=True)) + Fwd = (0.5 * family.deviance(y, mu_wd, weights) + + linalg.norm(P1*(coef+la*d), ord=1)) + if P2.ndim == 1: + Fwd += 0.5 * safe_sparse_dot((coef+la*d)*P2, coef+la*d) + else: + Fwd += 0.5 * (safe_sparse_dot(coef+la*d, + safe_sparse_dot(P2, coef+la*d))) + if Fwd-Fw <= sigma*la*bound: + break + # update coefficients + # coef_old = coef.copy() + coef += la * d + # calculate eta, mu, score, Fisher matrix for next iteration + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) + # stopping criterion for outer loop + # sum_i(|minimum-norm subgrad of F(w)_i|) + # fp_wP2 = f'(w) + w*P2 + # Note: eta, mu and score are already updated + if P2.ndim == 1: + fp_wP2 = -score + coef*P2 + else: + fp_wP2 = -score + safe_sparse_dot(coef, P2) + mn_subgrad = (np.where(coef == 0, + np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), + fp_wP2+np.sign(coef)*P1)) + mn_subgrad = np.sum(np.abs(mn_subgrad)) + if mn_subgrad <= self.tol: + converged = True + break + # end of outer loop + if not converged: + warnings.warn("Coordinate descent failed to converge. Increase" + " the number of iterations (currently {0})" + .format(self.max_iter), ConvergenceWarning) ####################################################################### # 5. postprocessing # diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index de7de90db967b..87cc8bea45f5b 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,6 +1,7 @@ import numpy as np from numpy.testing import assert_allclose import scipy as sp +from scipy import sparse from sklearn.linear_model.glm import ( Link, @@ -126,7 +127,7 @@ def test_glm_identiy_regression(): GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), GeneralizedHyperbolicSecand()) - for solver in ['irls', 'lbfgs', 'newton-cg']: + for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: for family in families: glm = GeneralizedLinearRegressor( alpha=0, family=family, fit_intercept=False, solver=solver) @@ -162,28 +163,31 @@ def test_normal_ridge(): rng = np.random.RandomState(0) alpha = 1.0 - # With more samples than features + # 1. 
With more samples than features n_samples, n_features, n_predict = 6, 5, 10 y = rng.randn(n_samples) X = rng.randn(n_samples, n_features) T = rng.randn(n_predict, n_features) # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, + solver='svd', normalize=False) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg']: + for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', - fit_intercept=True, solver=solver) + fit_intercept=True, tol=1e-6, + max_iter=100, solver=solver) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) assert_almost_equal(glm.intercept_, ridge.intercept_) assert_array_almost_equal(glm.predict(T), ridge.predict(T)) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + solver='svd', normalize=False) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, family='normal', link='identity', fit_intercept=False, solver='irls') glm.fit(X, y) @@ -192,28 +196,30 @@ def test_normal_ridge(): assert_almost_equal(glm.intercept_, ridge.intercept_) assert_array_almost_equal(glm.predict(T), ridge.predict(T)) - # With more features than samples + # 2. With more features than samples and sparse n_samples, n_features, n_predict = 5, 10, 10 y = rng.randn(n_samples) - X = rng.randn(n_samples, n_features) - T = rng.randn(n_predict, n_features) + X = sparse.csr_matrix(rng.randn(n_samples, n_features)) + T = sparse.csr_matrix(rng.randn(n_predict, n_features)) # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, + solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, family='normal', link='identity', fit_intercept=True, solver=solver) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, family='normal', link='identity', fit_intercept=False, solver='irls') glm.fit(X, y) @@ -240,9 +246,41 @@ def test_poisson_ridge(): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, family='poisson', - link='log', tol=1e-10) + s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 7, 'cd': 7} + for solver in 
['irls', 'lbfgs', 'newton-cg', 'cd']: + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, + fit_intercept=True, family='poisson', + link='log', tol=1e-7, + solver=solver, max_iter=200) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, + decimal=s_dec[solver]) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=s_dec[solver]) + + +def test_poisson_enet(): + """Test elastic net regression with poisson family and LogLink + + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0.5, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.03550978409 + # a 0.16936423283 + # b . + X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', tol=1e-7) glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.12889386979, decimal=7) - assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + assert_almost_equal(glm.intercept_, -0.03550978409, decimal=7) + assert_array_almost_equal(glm.coef_, [0.16936423283, 0.], decimal=7) From 91497a2abc4824cdcb72f88dc26c9fd347d54b0d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 24 Jan 2018 20:37:27 +0100 Subject: [PATCH 011/209] [WIP] Add Generalized Linear Models (#9405) * bug for sparse matrices for newton-cg solver, function grad_hess * reduce precision for solver newton-cg in test_poisson_ridge * remedy doctest issues in linear_model.rst for example of GeneralizedLinearRegressor * remove unused import of xrange from six --- doc/modules/linear_model.rst | 2 +- sklearn/linear_model/glm.py | 3 +-- sklearn/linear_model/tests/test_glm.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 834466e494a4a..1f0946e97b059 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -919,7 +919,7 @@ follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') - >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +SKIP + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, copy_X=True, family='poisson', fit_dispersion='chisqr', fit_intercept=True, l1_ratio=0, link='log', max_iter=100, diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 3de82c20f33cf..25f3ee1f52a2e 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -53,7 +53,6 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six -from ..externals.six.moves import xrange from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg @@ -1288,7 +1287,7 @@ def grad_hess(coef, X, y, weights, link): if P2.ndim == 1: hessian[np.diag_indices_from(hessian)] += P2 else: - hessian += P2 + hessian = hessian + P2 def Hs(s): ret = safe_sparse_dot(hessian, s) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 87cc8bea45f5b..c48c59ebd0eda 100644 --- 
a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -246,7 +246,7 @@ def test_poisson_ridge(): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 7, 'cd': 7} + s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 5, 'cd': 7} for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', From b9e5105ddb011a2a4efd74eeb3033ebb824fa5a8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 24 Jan 2018 21:44:10 +0100 Subject: [PATCH 012/209] [WIP] Add Generalized Linear Models (#9405) * bug in cd solver for sparse matrices * higer precision (smaller tol) in test_normal_ridge for sparse matrices * for each solver a separate precision (tol) in test_poisson_ridge --- sklearn/linear_model/glm.py | 2 +- sklearn/linear_model/tests/test_glm.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 25f3ee1f52a2e..b428ee7509d14 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1376,7 +1376,7 @@ def Hs(s): B[np.diag_indices_from(B)] += P2 else: coef_P2 = safe_sparse_dot(coef, P2) - B += P2 + B = B + P2 A = -score + coef_P2 # + d*(H+P2) but d=0 so far inner_iter = 0 while inner_iter < max_inner_iter: diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index c48c59ebd0eda..baad852dfb945 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -207,19 +207,20 @@ def test_normal_ridge(): solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, family='normal', link='identity', - fit_intercept=True, solver=solver) + fit_intercept=True, solver=solver, + max_iter=300) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, family='normal', link='identity', fit_intercept=False, solver='irls') glm.fit(X, y) @@ -247,11 +248,12 @@ def test_poisson_ridge(): X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 5, 'cd': 7} + s_tol = {'irls': 1e-8, 'lbfgs': 1e-7, 'newton-cg': 1e-7, 'cd': 1e-8} for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', - link='log', tol=1e-7, - solver=solver, max_iter=200) + link='log', tol=s_tol[solver], + solver=solver, max_iter=300) glm.fit(X, y) assert_almost_equal(glm.intercept_, -0.12889386979, decimal=s_dec[solver]) From e317422e9dd860c4ed5a3c6ac6191eb8e560c365 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 25 Jan 2018 21:44:04 +0100 Subject: [PATCH 013/209] [WIP] Add Generalized Linear Models 
(#9405) * improved documentation * additional option 'zero' for argument start_params * validation of sample_weight in function predict * input validation of estimate_phi * set default fit_dispersion=None * bug in estimate_phi because of weight rescaling * test for estimate_phi in normal ridge regression * extended tests for elastic net poisson --- sklearn/linear_model/glm.py | 116 ++++++++++++++++--------- sklearn/linear_model/tests/test_glm.py | 39 +++++++-- 2 files changed, 110 insertions(+), 45 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index b428ee7509d14..e5eda6108052c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -11,13 +11,13 @@ # TODO: deal with option self.copy_X # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. -# TODO: Add cross validation +# TODO: Add cross validation support # TODO: Should GeneralizedLinearRegressor inherit from LinearModel? # So far, it does not. # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? -# TODO: Negative values in P1 are not allowed so far. They could be used form -# group lasse. +# TODO: Negative values in P1 are not allowed so far. They could be used to +# for group lasso. # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -642,7 +642,7 @@ def _irls_step(X, W, P2, z): ------- coef: array, shape = (X.shape[1]) """ - # TODO: scipy.linalg.solve if faster, but ordinary least squares uses + # TODO: scipy.linalg.solve is faster, but ordinary least squares uses # scipy.linalg.lstsq. What is more appropriate? n_samples, n_features = X.shape if sparse.issparse(X): @@ -709,16 +709,20 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): (penalized) maximum likelihood which is equivalent to minimizing the deviance. - TODO: For `alpha` > 0, the feature matrix `X` is assumed to be - standardized. Call + For `alpha` > 0, the feature matrix `X` should be standardized in order to + penalize features equally strong. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - Otherwise, the strength of the penalty is different for the features. TODO: Estimation of the dispersion parameter phi. - TODO: Notes on weights and 'scaled' distributions. For Poisson, this means - to fit y = z/w with z=counts and w=exposure (time, money, persons, ...) - => y is a ratio with weights w. Same for other distributions. + If your target `y` is a ratio, you should also provide appropriate weights + `w`. As an example, consider Poission distributed counts `z` (integers) and + weights `w`=exposure (time, money, persons years, ...), then you fit + `y = z/w`, i.e. ``GeneralizedLinearModel(family='Poisson').fit(X, y, + sample_weight=w)``. You need the weights for the right mean, consider: + :math:`\bar(y) = \frac{\sum_i w_i y_i}{\sum_i w_i}`. + In this case one might say that y has a 'scaled' Poisson distributions. + The same holds for other distributions. Parameters ---------- @@ -800,8 +804,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): does not exit (first call to fit), option ``start_params`` sets the starting values for ``coef_`` and ``intercept_``. 
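A sketch of the 'scaled' Poisson case described above (illustrative only, assuming this patch is installed): with counts ``z`` and exposure ``w``, fit the ratio ``y = z / w`` and pass ``w`` as ``sample_weight``; the weighted mean of ``y`` then equals total counts over total exposure::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor

    rng = np.random.RandomState(0)
    X = rng.rand(100, 2)
    exposure = rng.uniform(0.5, 2.0, size=100)                 # w: time, money, ...
    counts = rng.poisson(lam=exposure * np.exp(X @ np.array([0.3, -0.2])))  # z
    y = counts / exposure                                      # ratio target

    reg = GeneralizedLinearRegressor(alpha=0, family='poisson', link='log')
    reg.fit(X, y, sample_weight=exposure)

    # weighted mean: sum(w_i * y_i) / sum(w_i) == total counts / total exposure
    assert np.isclose(np.average(y, weights=exposure),
                      counts.sum() / exposure.sum())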
- start_params : None or array of shape (n_features, ) or 'least_squares'}, \ - optional (default=None) + start_params : {None, 'least_squares', 'zero'} or array of shape \ + (n_features, ) or }, optional (default=None) If an array of size n_features is supplied, use these as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. @@ -854,16 +858,18 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): n_iter_ : int Actual number of iterations of the solver. - Notes - ----- References ---------- - TODO + For the coordinate descent implementation: + .. [1] Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', - fit_dispersion='chisqr', solver='auto', max_iter=100, + fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params=None, selection='random', random_state=None, copy_X=True, check_input=True, verbose=0): @@ -1004,9 +1010,10 @@ def fit(self, X, y, sample_weight=None): if start_params is None: pass elif isinstance(start_params, six.string_types): - if start_params not in ['least_squares']: + if start_params not in ['least_squares', 'zero']: raise ValueError("The argument start_params must be None, " - "'least-squares' or an array of right length," + "'least-squares', 'zero' or an array of right" + " length," " got(start_params={0})".format(start_params)) else: start_params = np.atleast_1d(start_params) @@ -1129,6 +1136,7 @@ def fit(self, X, y, sample_weight=None): # deviance = sum(sample_weight * unit_deviance), # we rescale weights such that sum(weights) = 1 and this becomes # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights_sum = np.sum(weights) weights = weights/np.sum(weights) ####################################################################### @@ -1141,7 +1149,8 @@ def fit(self, X, y, sample_weight=None): coef = None if self.warm_start and hasattr(self, "coef_"): if self.fit_intercept: - coef = np.concatenate((self.intercept_, self.coef_)) + coef = np.concatenate((np.array([self.intercept_]), + self.coef_)) else: coef = self.coef_ elif self.start_params is None: @@ -1164,24 +1173,27 @@ def fit(self, X, y, sample_weight=None): else: # with L1 penalty, start with coef = 0 coef = np.zeros(n_features) - elif (isinstance(self.start_params, six.string_types) and - self.start_params == 'least_squares'): - if self.alpha == 0: - reg = LinearRegression(copy_X=True, fit_intercept=False) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ - elif self.l1_ratio <= 0.01: - # ElasticNet says l1_ratio <= 0.01 is not reliable, use Ridge - reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ - else: - # TODO: Does this make sense at all? 
- reg = ElasticNet(copy_X=True, fit_intercept=False, - alpha=self.alpha, l1_ratio=self.l1_ratio) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ + elif isinstance(self.start_params, six.string_types): + if self.start_params == 'zero': + coef = np.zeros(n_features) + elif self.start_params == 'least_squares': + if self.alpha == 0: + reg = LinearRegression(copy_X=True, fit_intercept=False) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + elif self.l1_ratio <= 0.01: + # ElasticNet says l1_ratio <= 0.01 is not reliable + # => use Ridge + reg = Ridge(copy_X=True, fit_intercept=False, + alpha=self.alpha) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + else: + # TODO: Does this make sense at all? + reg = ElasticNet(copy_X=True, fit_intercept=False, + alpha=self.alpha, l1_ratio=self.l1_ratio) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ else: coef = start_params @@ -1365,6 +1377,7 @@ def Hs(s): d = np.zeros_like(coef) # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) + # => active set of features for featurelist, see paper d = np.zeros_like(coef) # A = f'(w) + d*H(w) + (w+d)*P2 # B = H+P2 @@ -1508,7 +1521,8 @@ def Hs(s): self.coef_ = coef if self.fit_dispersion in ['chisqr', 'deviance']: - self.dispersion_ = self.estimate_phi(y, X, weights) + # attention because of rescaling of weights + self.dispersion_ = self.estimate_phi(y, X, weights)*weights_sum return self @@ -1544,9 +1558,23 @@ def predict(self, X, sample_weight=1): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ - # TODO: validation of sample_weight eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) + if sample_weight is None: + return mu + elif np.isscalar(sample_weight): + if sample_weight <= 0: + raise ValueError("Sample weight must be positive, " + "got (sample_weight={0})." + .format(sample_weight)) + else: + sample_weights = np.atleast_1d(sample_weight) + if sample_weight.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar.") + elif sample_weight.shape[0] != mu.shape[0]: + raise ValueError("Sample weights must have the same length as" + " X.shape[1].") + return mu*sample_weight def estimate_phi(self, y, X, sample_weight): @@ -1554,10 +1582,20 @@ def estimate_phi(self, y, X, sample_weight): Returns the estimate. """ check_is_fitted(self, "coef_") + _dtype = [np.float64, np.float32] + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=_dtype, y_numeric=True, multi_output=False) n_samples, n_features = X.shape eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ + n_features += 1 + if n_samples <= n_features: + raise ValueError("Estimation of dispersion parameter phi requires" + " more samples than features, got" + " samples=X.shape[0]={0} and" + " n_features=X.shape[1]+fit_intercept={1}." + .format(n_samples, n_features)) mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': chisq = np.sum(sample_weight*(y-mu)**2 / diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index baad852dfb945..ee90cd51c874d 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -164,7 +164,7 @@ def test_normal_ridge(): alpha = 1.0 # 1. 
With more samples than features - n_samples, n_features, n_predict = 6, 5, 10 + n_samples, n_features, n_predict = 10, 5, 10 y = rng.randn(n_samples) X = rng.randn(n_samples, n_features) T = rng.randn(n_predict, n_features) @@ -189,12 +189,16 @@ def test_normal_ridge(): ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, family='normal', link='identity', - fit_intercept=False, solver='irls') + fit_intercept=False, solver='irls', + fit_dispersion='chisqr') glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) assert_almost_equal(glm.intercept_, ridge.intercept_) assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + mu = glm.predict(X) + assert_almost_equal(glm.dispersion_, + np.sum((y-mu)**2/(n_samples-n_features))) # 2. With more features than samples and sparse n_samples, n_features, n_predict = 5, 10, 10 @@ -278,11 +282,34 @@ def test_poisson_enet(): # (Intercept) -0.03550978409 # a 0.16936423283 # b . + glmnet_intercept = -0.03550978409 + glmnet_coef = [0.16936423283, 0.] X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', tol=1e-7) + link='log', solver='cd', tol=1e-7) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + + # same for start_params='zero' with reduced precision + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', solver='cd', tol=1e-5, + start_params='zero') + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + # start_params='least_squares' with different alpha + glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, + family='poisson', + link='log', solver='cd', tol=1e-5, + start_params='zero') + glm.fit(X, y) + # warm start with original alpha and use of sparse matrices + glm.warm_start = True + glm.alpha = 1 + X = sparse.csr_matrix(X) glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.03550978409, decimal=7) - assert_array_almost_equal(glm.coef_, [0.16936423283, 0.], - decimal=7) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) From 9a9818441d605bf86547651997db81e969f41cdf Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 25 Jan 2018 22:59:46 +0100 Subject: [PATCH 014/209] [WIP] Add Generalized Linear Models (#9405) * new helper function _check_weights for validation of sample_weight * fix white space issue in doctest of linear_model.rst --- doc/modules/linear_model.rst | 8 ++--- sklearn/linear_model/glm.py | 59 +++++++++++++++++------------------- 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 1f0946e97b059..f7b0ca0cc7add 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -921,10 +921,10 @@ follows: >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, - copy_X=True, family='poisson', fit_dispersion='chisqr', - fit_intercept=True, l1_ratio=0, link='log', max_iter=100, - random_state=None, selection='random', solver='auto', - start_params=None, tol=0.0001, 
verbose=0, warm_start=False) + copy_X=True, family='poisson', fit_dispersion='chisqr', + fit_intercept=True, l1_ratio=0, link='log', max_iter=100, + random_state=None, selection='random', solver='auto', + start_params=None, tol=0.0001, verbose=0, warm_start=False) >>> reg.coef_ array([ 0.24630255, 0.43373521]) >>> reg.intercept_ #doctest: +ELLIPSIS diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index e5eda6108052c..138830ea431c9 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -59,6 +59,26 @@ from ..utils.validation import check_is_fitted, check_random_state +def _check_weights(sample_weight, n_samples): + if sample_weight is None: + weights = np.ones(n_samples) + elif np.isscalar(sample_weight): + if sample_weight < 0: + raise ValueError("Sample weights must be non-negative.") + weights = sample_weight*np.ones(n_samples) + else: + weights = np.atleast_1d(sample_weight) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != n_samples: + raise ValueError("Sample weights must have the same length as" + " y") + if not np.all(sample_weight >= 0): + raise ValueError("Sample weights must be non-negative.") + + return weights + + class Link(six.with_metaclass(ABCMeta)): """Abstract base class for Link funtions """ @@ -925,17 +945,7 @@ def fit(self, X, y, sample_weight=None): dtype=_dtype, y_numeric=True, multi_output=False) y = y.astype(np.float64) - if sample_weight is None: - weights = np.ones_like(y) - elif np.isscalar(sample_weight): - weights = sample_weight*np.ones_like(y) - else: - weights = np.atleast_1d(sample_weight) - if weights.ndim > 1: - raise ValueError("Sample weight must be 1D array or scalar") - elif weights.shape[0] != y.shape[0]: - raise ValueError("Sample weights must have the same length as" - " y") + weights = _check_weights(sample_weight, y.shape[0]) # 1.2 validate arguments of __init__ ################################## # Garantee that self._family_instance is an instance of class @@ -1544,7 +1554,7 @@ def linear_predictor(self, X): return safe_sparse_dot(X, self.coef_, dense_output=True) + self.intercept_ - def predict(self, X, sample_weight=1): + def predict(self, X, sample_weight=None): """Predict uing GLM with feature matrix X. If sample_weight is given, returns prediction*sample_weight. @@ -1558,26 +1568,13 @@ def predict(self, X, sample_weight=1): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ + weights = _check_weights(sample_weight, X.shape[0]) eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) - if sample_weight is None: - return mu - elif np.isscalar(sample_weight): - if sample_weight <= 0: - raise ValueError("Sample weight must be positive, " - "got (sample_weight={0})." - .format(sample_weight)) - else: - sample_weights = np.atleast_1d(sample_weight) - if sample_weight.ndim > 1: - raise ValueError("Sample weight must be 1D array or scalar.") - elif sample_weight.shape[0] != mu.shape[0]: - raise ValueError("Sample weights must have the same length as" - " X.shape[1].") - return mu*sample_weight + return mu*weights - def estimate_phi(self, y, X, sample_weight): + def estimate_phi(self, y, X, sample_weight=None): """Estimation of the dispersion parameter. Returns the estimate. 
""" @@ -1586,6 +1583,7 @@ def estimate_phi(self, y, X, sample_weight): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=_dtype, y_numeric=True, multi_output=False) n_samples, n_features = X.shape + weights = _check_weights(sample_weight, n_samples) eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ @@ -1640,10 +1638,7 @@ def score(self, X, y, sample_weight=None): # Note, default score defined in RegressorMixin is R^2 score. # TODO: make D^2 a score function in module metrics (and thereby get # input validation and so on) - if sample_weight is None: - weights = np.ones_like(y) - else: - weights = np.atleast_1d(sample_weight) + weights = _check_weights(sample_weight, y.shape[0]) mu = self.predict(X) dev = self._family_instance.deviance(y, mu, weights=weights) y_mean = np.average(y, weights=weights) From db9defe6b7637fe022034ca7f435f4fd37f6c118 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 08:33:59 +0100 Subject: [PATCH 015/209] [WIP] Add Generalized Linear Models (#9405) * fit_dispersion default=None also in docs. * improved docs. * fixed input validation of predict * fixed bug for sample_weight in estimate_phi --- sklearn/linear_model/glm.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 138830ea431c9..22a8be9e50828 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -754,7 +754,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): case, the design matrix X must have full column rank (no collinearities). - l1_ratio : float, optional (defaul=0) + l1_ratio : float, optional (default=0) The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a @@ -768,7 +768,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Note that n_features* = X.shape[1] = length of coef_ (intercept always excluded from counting). - P2 : None or array of shape (n_features*, n_features*) + P2 : None or array of shape (n_features*, n_features*), optional\ + (default=None) With this square matrix the L2 penalty is calculated as `w P2 w`. This gives a fine control over this penalty (Tikhonov regularization). @@ -781,20 +782,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ of class ExponentialDispersionModel, optional(default='normal') - the distributional assumption of the GLM. + the distributional assumption of the GLM, i.e. which loss function to + be minimized. link : {'identity', 'log'} or an instance of class Link, optional (default='identity') the link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul='chisqr') + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) method for estimation of the dispersion parameter phi. Whether to use the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ - optional (defaul='auto') + optional (default='auto') Algorithm to use in the optimization problem. - 'auto' sets 'irls' if l1_ratio equals 0, else 'cd'. 
@@ -830,11 +832,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. If 'least_squares' is set, the result of a least squares fit in the - link space (linear predictor) is taken. If ``None``, the start values - are calculated by setting mu to family.starting_mu(..) and one step of - irls. - This option only applies if ``warm_start=False`` or if fit is called - the first time (``self.coef_`` does not exist). + link space (linear predictor) is taken. + If 'zero' is set, all coefficients start with zero. + If ``None``, the start values are calculated by setting mu to + family.starting_mu(..) and one step of irls. + These options only apply if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). selection : str, optional (default='random') For the solver 'cd' (coordinate descent), the coordinates (features) @@ -1550,7 +1553,9 @@ def linear_predictor(self, X): Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") - X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) + X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype='numeric', copy=True, ensure_2d=True, + allow_nd=False) return safe_sparse_dot(X, self.coef_, dense_output=True) + self.intercept_ @@ -1568,9 +1573,10 @@ def predict(self, X, sample_weight=None): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ - weights = _check_weights(sample_weight, X.shape[0]) + # validation of X in linear_predictor eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) + weights = _check_weights(sample_weight, X.shape[0]) return mu*weights @@ -1596,11 +1602,11 @@ def estimate_phi(self, y, X, sample_weight=None): .format(n_samples, n_features)) mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': - chisq = np.sum(sample_weight*(y-mu)**2 / + chisq = np.sum(weights*(y-mu)**2 / self._family_instance.unit_variance(mu)) return chisq/(n_samples - n_features) elif self.fit_dispersion == 'deviance': - dev = self._family_instance.deviance(y, mu, sample_weight) + dev = self._family_instance.deviance(y, mu, weights) return dev/(n_samples - n_features) # Note: check_estimator(GeneralizedLinearRegressor) might raise From dc7fdd7f8f6a3a2276f5f96aa6d5dd6ad3ce853e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 08:41:24 +0100 Subject: [PATCH 016/209] [WIP] Add Generalized Linear Models (#9405) * improved docs --- sklearn/linear_model/tests/test_glm.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index ee90cd51c874d..0af837c9c73f3 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -177,7 +177,8 @@ def test_normal_ridge(): glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, tol=1e-6, - max_iter=100, solver=solver) + max_iter=100, solver=solver, + random_state=42) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) @@ -214,7 +215,7 @@ def test_normal_ridge(): glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, family='normal', link='identity', fit_intercept=True, solver=solver, - max_iter=300) + max_iter=300, random_state=42) glm.fit(X, y) 
assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) @@ -257,7 +258,8 @@ def test_poisson_ridge(): glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', link='log', tol=s_tol[solver], - solver=solver, max_iter=300) + solver=solver, max_iter=300, + random_state=42) glm.fit(X, y) assert_almost_equal(glm.intercept_, -0.12889386979, decimal=s_dec[solver]) @@ -282,20 +284,23 @@ def test_poisson_enet(): # (Intercept) -0.03550978409 # a 0.16936423283 # b . + rand = 0 glmnet_intercept = -0.03550978409 glmnet_coef = [0.16936423283, 0.] X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', solver='cd', tol=1e-7) + link='log', solver='cd', tol=1e-7, + selection='random', random_state=42) glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) - # same for start_params='zero' with reduced precision + # same for start_params='zero' and selection='cyclic' + # with reduced precision glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', link='log', solver='cd', tol=1e-5, - start_params='zero') + selection='cyclic', start_params='zero') glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) From b11d06ba72865c14b0532c6d6c34d264a09d7ae4 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 16:57:45 +0100 Subject: [PATCH 017/209] [WIP] Add Generalized Linear Models (#9405) * fixed input validation of X in predict --- sklearn/linear_model/glm.py | 4 +++- sklearn/linear_model/tests/test_glm.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 22a8be9e50828..032ded86816dd 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1573,7 +1573,9 @@ def predict(self, X, sample_weight=None): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ - # validation of X in linear_predictor + X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype='numeric', copy=True, ensure_2d=True, + allow_nd=False) eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) weights = _check_weights(sample_weight, X.shape[0]) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 0af837c9c73f3..776edd8aeec46 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -284,7 +284,6 @@ def test_poisson_enet(): # (Intercept) -0.03550978409 # a 0.16936423283 # b . - rand = 0 glmnet_intercept = -0.03550978409 glmnet_coef = [0.16936423283, 0.] 
X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T From 9e6c01378a4cb245824bcb9429b3d566652af743 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 17:37:44 +0100 Subject: [PATCH 018/209] [WIP] Add Generalized Linear Models (#9405) * redundant line of code 'd = np.zeros_like(coef)' --- doc/modules/linear_model.rst | 2 +- sklearn/linear_model/glm.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index f7b0ca0cc7add..dcb35b6a5d941 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -921,7 +921,7 @@ follows: >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, - copy_X=True, family='poisson', fit_dispersion='chisqr', + copy_X=True, family='poisson', fit_dispersion=None, fit_intercept=True, l1_ratio=0, link='log', max_iter=100, random_state=None, selection='random', solver='auto', start_params=None, tol=0.0001, verbose=0, warm_start=False) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 032ded86816dd..8856af8ec698a 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -73,7 +73,7 @@ def _check_weights(sample_weight, n_samples): elif weights.shape[0] != n_samples: raise ValueError("Sample weights must have the same length as" " y") - if not np.all(sample_weight >= 0): + if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") return weights @@ -1132,7 +1132,7 @@ def fit(self, X, y, sample_weight=None): if sparse.issparse(P2): # TODO: check sparse P2 for non-negativeness # raise NotImplementedError("Check sparse P2 for " - # "non-negaitveness is not yet " + # "non-negativeness is not yet " # "implemented.") pass elif P2.ndim == 2: @@ -1391,7 +1391,6 @@ def Hs(s): # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) # => active set of features for featurelist, see paper - d = np.zeros_like(coef) # A = f'(w) + d*H(w) + (w+d)*P2 # B = H+P2 # Note: f'=-score and H=fisher are updated at the end of outer From bad0190a22623eae3a2f6dfcdb0fd8caee625111 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 27 Jan 2018 20:38:02 +0100 Subject: [PATCH 019/209] [WIP] Add Generalized Linear Models (#9405) * added test to compare to ElasticNet * deleted identical comment lines --- sklearn/linear_model/tests/test_glm.py | 29 +++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 776edd8aeec46..9990cafe2cbcf 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -12,7 +12,7 @@ GammaDistribution, InverseGaussianDistribution, GeneralizedHyperbolicSecand, GeneralizedLinearRegressor) -from sklearn.linear_model.ridge import Ridge +from sklearn.linear_model import ElasticNet, Ridge from sklearn.utils.testing import ( assert_equal, assert_almost_equal, @@ -267,14 +267,37 @@ def test_poisson_ridge(): decimal=s_dec[solver]) +def test_normal_enet(): + """Tet elastic net regression with normal/gaussian family""" + rng = np.random.RandomState(0) + alpha, l1_ratio = 0.3, 0.7 + n_samples, n_features = 20, 2 + X = rng.randn(n_samples, n_features).copy(order='F') + beta = rng.randn(n_features) + y = 2 + np.dot(X, beta) + rng.randn(n_samples) + + glm 
= GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, + family='normal', link='identity', + fit_intercept=True, tol=1e-7, + max_iter=100, selection='cyclic', + solver='cd', start_params='zero', + check_input=False) + glm.fit(X, y) + + enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, + normalize=False, tol=1e-7, copy_X=True) + enet.fit(X, y) + + assert_almost_equal(glm.intercept_, enet.intercept_) + assert_array_almost_equal(glm.coef_, enet.coef_) + + def test_poisson_enet(): """Test elastic net regression with poisson family and LogLink Compare to R's glmnet""" # library("glmnet") # options(digits=10) - # library("glmnet") - # options(digits=10) # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) # x <- data.matrix(df[,c("a", "b")]) # y <- df$y From 48137d86079c9a8efd15d57e719e3ee35f1644c9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 28 Jan 2018 11:52:19 +0100 Subject: [PATCH 020/209] [WIP] Add Generalized Linear Models (#9405) * increased precision in test_normal_enet --- sklearn/linear_model/tests/test_glm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 9990cafe2cbcf..6f8bdd3a72f40 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -278,18 +278,18 @@ def test_normal_enet(): glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, family='normal', link='identity', - fit_intercept=True, tol=1e-7, + fit_intercept=True, tol=1e-8, max_iter=100, selection='cyclic', solver='cd', start_params='zero', check_input=False) glm.fit(X, y) enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, - normalize=False, tol=1e-7, copy_X=True) + normalize=False, tol=1e-8, copy_X=True) enet.fit(X, y) - assert_almost_equal(glm.intercept_, enet.intercept_) - assert_array_almost_equal(glm.coef_, enet.coef_) + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) def test_poisson_enet(): From 2c2a077a2e8c57bdf3c945678526d6efdb5763e9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 28 Jan 2018 12:42:03 +0100 Subject: [PATCH 021/209] [WIP] Add Generalized Linear Models (#9405) * better doc for heavy tailed distributions --- doc/modules/linear_model.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index dcb35b6a5d941..5cb8e54afbb06 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -905,7 +905,8 @@ are the following: * If the target values are positive valued and skewed, you might try a Gamma deviance. - * If the target values seem to be heavy tailed, you might try an Inverse Gaussian deviance (or even higher variance power of the Tweedie family). + * If the target values seem to be heavier tailed than a Gamma distribution, you might try an Inverse Gaussian deviance (or even higher variance powers of the Tweedie family). + Keep in mind that the mean is not a good measure for very heavy tailed distributions, cf. extreme value theory. 
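The advice above follows from the Tweedie variance function :math:`v(\mu) = \mu^p`: the higher the power :math:`p`, the faster the variance grows with the mean and the heavier the implied tail. A plain-numpy illustration (the mean values are arbitrary)::

    import numpy as np

    mu = np.array([0.5, 1.0, 5.0, 20.0])
    for p, name in [(0, 'normal'), (1, 'poisson'), (2, 'gamma'), (3, 'inverse.gaussian')]:
        # unit variance v(mu) = mu**p
        print(name, np.power(mu, p))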
Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, From 15931c3148b68c47aa3c3c19983525ae758a0981 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 28 Jan 2018 15:18:26 +0100 Subject: [PATCH 022/209] [WIP] Add Generalized Linear Models (#9405) * improved input validation and testing of them --- sklearn/linear_model/glm.py | 24 +++- sklearn/linear_model/tests/test_glm.py | 174 ++++++++++++++++++++++++- 2 files changed, 189 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 8856af8ec698a..33df6b4c9b850 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -63,11 +63,14 @@ def _check_weights(sample_weight, n_samples): if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): - if sample_weight < 0: + if sample_weight <= 0: raise ValueError("Sample weights must be non-negative.") weights = sample_weight*np.ones(n_samples) else: - weights = np.atleast_1d(sample_weight) + _dtype = [np.float64, np.float32] + weights = check_array(sample_weight, accept_sparse='csr', + force_all_finite=True, ensure_2d=False, + dtype=_dtype) if weights.ndim > 1: raise ValueError("Sample weight must be 1D array or scalar") elif weights.shape[0] != n_samples: @@ -75,6 +78,9 @@ def _check_weights(sample_weight, n_samples): " y") if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") + elif not np.sum(weights) > 0: + raise ValueError("Sample weights must have at least one positive " + "element.") return weights @@ -1010,10 +1016,12 @@ def fit(self, X, y, sample_weight=None): "with L1 penalties, which are included with " "(alpha={1}) and (l1_ratio={2})." 
.format(solver, self.alpha, self.l1_ratio)) - if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: - raise ValueError("Maximum number of iteration must be positive;" + if (not isinstance(self.max_iter, six.integer_types) + or self.max_iter <= 0): + raise ValueError("Maximum number of iteration must be a positive " + "integer;" " got (max_iter={0!r})".format(self.max_iter)) - if not isinstance(self.tol, numbers.Number) or self.tol < 0: + if not isinstance(self.tol, numbers.Number) or self.tol <= 0: raise ValueError("Tolerance for stopping criteria must be " "positive; got (tol={0!r})".format(self.tol)) if not isinstance(self.warm_start, bool): @@ -1029,7 +1037,9 @@ def fit(self, X, y, sample_weight=None): " length," " got(start_params={0})".format(start_params)) else: - start_params = np.atleast_1d(start_params) + start_params = check_array(start_params, accept_sparse='csr', + force_all_finite=True, ensure_2d=False, + dtype=_dtype, copy=True) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or (start_params.ndim != 1)): raise ValueError("Start values for parameters must have the" @@ -1160,7 +1170,7 @@ def fit(self, X, y, sample_weight=None): # set start values for coef coef = None - if self.warm_start and hasattr(self, "coef_"): + if self.warm_start and hasattr(self, 'coef_'): if self.fit_intercept: coef = np.concatenate((np.array([self.intercept_]), self.coef_)) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 6f8bdd3a72f40..1abbcf0540e28 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -5,7 +5,7 @@ from sklearn.linear_model.glm import ( Link, - # IdentityLink, + IdentityLink, LogLink, TweedieDistribution, NormalDistribution, PoissonDistribution, @@ -16,7 +16,8 @@ from sklearn.utils.testing import ( assert_equal, assert_almost_equal, - assert_array_equal, assert_array_almost_equal) + assert_array_equal, assert_array_almost_equal, + assert_raises) def test_link_properties(): @@ -102,6 +103,34 @@ def f(coef): assert_allclose(fisher, approx, rtol=1e-3) +def test_sample_weights_validation(): + """Test the raised errors in the validation of sample_weight""" + # 1. scalar value but not positive + X = [[1]] + y = [1] + weights = 0 + glm = GeneralizedLinearRegressor(fit_intercept=False) + assert_raises(ValueError, glm.fit, X, y, weights) + + # 2. 2d array + weights = [[0]] + assert_raises(ValueError, glm.fit, X, y, weights) + + # 3. 1d but wrong length + weights = [1, 0] + assert_raises(ValueError, glm.fit, X, y, weights) + + # 4. 1d but only zeros (sum not greater than 0) + weights = [0, 0] + X = [[0], [1]] + y = [1, 2] + assert_raises(ValueError, glm.fit, X, y, weights) + + # 5. 
1d but weith a negative value + weights = [2, -1] + assert_raises(ValueError, glm.fit, X, y, weights) + + def test_glm_family_argument(): """Test GLM family argument set as string """ @@ -115,6 +144,147 @@ def test_glm_family_argument(): alpha=0).fit(X, y) assert_equal(type(glm._family_instance), type(fam)) + glm = GeneralizedLinearRegressor(family='not a family', + fit_intercept=False) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_link_argument(): + """Test GLM link argument set as string + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for (l, link) in [('identity', IdentityLink()), + ('log', LogLink())]: + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + link=l).fit(X, y) + assert_equal(type(glm._link_instance), type(link)) + + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + link='not a link') + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_alpha_argument(): + """Test GLM alpha argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for alpha in ['not a number', -4.2]: + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + alpha=alpha) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_l1_ratio_argument(): + """Test GLM l1_ratio argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for l1_ratio in ['not a number', -4.2, 1.1, [1]]: + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + l1_ratio=l1_ratio) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_fit_intercept_argument(): + """Test GLM fit_intercept argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for fit_intercept in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_solver_argument(): + """Test GLM solver argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for solver in ['not a solver', 1, [1]]: + glm = GeneralizedLinearRegressor(solver=solver) + assert_raises(ValueError, glm.fit, X, y) + + # solver not suitable for L1 penalty + for solver in ['irls', 'lbfgs', 'newton-cg']: + glm = GeneralizedLinearRegressor(solver=solver, alpha=1, l1_ratio=0.1) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_max_iter_argument(): + """Test GLM max_iter argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for max_iter in ['not a number', 0, -1, 5.5, [1]]: + glm = GeneralizedLinearRegressor(max_iter=max_iter) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_tol_argument(): + """Test GLM tol argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for tol in ['not a number', 0, -1.0, [1e-3]]: + glm = GeneralizedLinearRegressor(tol=tol) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_warm_start_argument(): + """Test GLM warm_start argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for warm_start in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(warm_start=warm_start) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_start_params_argument(): + """Test GLM start_params argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for start_params in ['not a start_params', ['zero'], [0, 0, 0], + [[0, 0]], ['a', 'b']]: + glm = GeneralizedLinearRegressor(start_params=start_params) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_selection_argument(): + """Test GLM selection argument + """ + y = np.array([1, 2]) + X = np.array([[1], 
[1]]) + for selection in ['not a selection', 1, 0, ['cyclic']]: + glm = GeneralizedLinearRegressor(selection=selection) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_check_input_argument(): + """Test GLM check_input argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for check_input in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(check_input=check_input) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_random_state_argument(): + """Test GLM random_state argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for random_state in ['a string', 0.5, [0]]: + glm = GeneralizedLinearRegressor(random_state=random_state) + assert_raises(ValueError, glm.fit, X, y) + + +# TODO: check P1 and P2 +# TODO: check additional validations if check_input == True def test_glm_identiy_regression(): """Test GLM regression with identity link on a simple dataset From feedba379c2a8f53a7a9792e33041f29da7a4c95 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 30 Mar 2018 18:50:06 +0200 Subject: [PATCH 023/209] [MRG] Add Generalized Linear Models (#9405) * improved input validation and testing of P1 * test case for validation of argument P2 * test case for validation of argument copy_X --- sklearn/linear_model/glm.py | 3 ++ sklearn/linear_model/tests/test_glm.py | 48 +++++++++++++++++++++----- 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 33df6b4c9b850..eae4a56ea1d95 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1065,6 +1065,9 @@ def fit(self, X, y, sample_weight=None): P1 = np.ones(X.shape[1]) else: P1 = np.atleast_1d(np.copy(self.P1)) + if P1.dtype.kind not in ['b', 'i', 'u', 'f']: + raise ValueError("P1 must be a numeric value; " + "got (dtype={0}).".format(P1.dtype)) if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): raise ValueError("P1 must be either None or an 1D array with " "the length of X.shape[1]; " diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 1abbcf0540e28..edf579a416973 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -187,6 +187,27 @@ def test_glm_l1_ratio_argument(): assert_raises(ValueError, glm.fit, X, y) +def test_glm_P1_argument(): + """Test GLM P1 arguments + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for P1 in [['a string', 'a string'], [1, [2]], [1, 2, 3]]: + glm = GeneralizedLinearRegressor(P1=P1) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_P2_argument(): + """Test GLM P2 arguments + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for P2 in [np.full((2, 2), 'a string'), [[1, [2]], [3, 4]], [1, 2, 3], + [[1, 2]], [[1], [2]]]: + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False) + assert_raises(ValueError, glm.fit, X, y) + + def test_glm_fit_intercept_argument(): """Test GLM fit_intercept argument """ @@ -263,27 +284,36 @@ def test_glm_selection_argument(): assert_raises(ValueError, glm.fit, X, y) -def test_glm_check_input_argument(): - """Test GLM check_input argument +def test_glm_random_state_argument(): + """Test GLM random_state argument """ y = np.array([1, 2]) X = np.array([[1], [1]]) - for check_input in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(check_input=check_input) + for random_state in ['a string', 0.5, [0]]: + glm = GeneralizedLinearRegressor(random_state=random_state) assert_raises(ValueError, glm.fit, X, y) -def 
test_glm_random_state_argument(): - """Test GLM random_state argument +def test_glm_copy_X_argument(): + """Test GLM copy_X arguments """ y = np.array([1, 2]) X = np.array([[1], [1]]) - for random_state in ['a string', 0.5, [0]]: - glm = GeneralizedLinearRegressor(random_state=random_state) + for copy_X in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(copy_X=copy_X) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_check_input_argument(): + """Test GLM check_input argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for check_input in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(check_input=check_input) assert_raises(ValueError, glm.fit, X, y) -# TODO: check P1 and P2 # TODO: check additional validations if check_input == True def test_glm_identiy_regression(): From 6fdfb47428571b4c8e89046a7b1f481711832f61 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 30 Mar 2018 19:41:09 +0200 Subject: [PATCH 024/209] [MRG] Add Generalized Linear Models (#9405) * fix doctest failure in example of linear_model.rst * fix dtype issue in test_glm_P2_argument --- doc/modules/linear_model.rst | 4 ++-- sklearn/linear_model/tests/test_glm.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 5cb8e54afbb06..9f85da771c6f1 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -926,8 +926,8 @@ follows: fit_intercept=True, l1_ratio=0, link='log', max_iter=100, random_state=None, selection='random', solver='auto', start_params=None, tol=0.0001, verbose=0, warm_start=False) - >>> reg.coef_ - array([ 0.24630255, 0.43373521]) + >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE + array([0.24630255, 0.43373521]) >>> reg.intercept_ #doctest: +ELLIPSIS -0.76383575... diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index edf579a416973..c5d132d35bdb9 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -202,8 +202,8 @@ def test_glm_P2_argument(): """ y = np.array([1, 2]) X = np.array([[1], [1]]) - for P2 in [np.full((2, 2), 'a string'), [[1, [2]], [3, 4]], [1, 2, 3], - [[1, 2]], [[1], [2]]]: + for P2 in [np.full((2, 2), 'a string', dtype=np.dtype(' Date: Sun, 5 Aug 2018 14:48:33 +0200 Subject: [PATCH 025/209] [MRG] Add Generalized Linear Models (#9405) * fix typos in doc --- doc/modules/linear_model.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 9f85da771c6f1..a204ccb080cc9 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -934,7 +934,7 @@ follows: Mathematical formulation ------------------------ -In the unpenalized case, the assumptions are the folowing: +In the unpenalized case, the assumptions are the following: * The target values :math:`y_i` are realizations of random variables :math:`Y_i \overset{i.i.d}{\sim} \mathrm{EDM}(\mu_i, \frac{\phi}{s_i})` @@ -951,7 +951,7 @@ same as specifying a unit variance function (they are one-to-one). Including penalties helps to avoid overfitting or, in case of L1 penalty, to obtain sparse solutions. But there are also other motivations to include them, -e.g. accounting fo dependence structure of :math:`y`. +e.g. accounting for the dependence structure of :math:`y`. The objective function, which is independent of :math:`\phi`, is minimized with respect to the coefficients :math:`w`. 
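To make the objective concrete for the Poisson case with log-link, here is a small numpy sketch built from the Poisson unit deviance :math:`2(y\log\frac{y}{\mu} - y + \mu)` and the elastic-net terms described above; it only illustrates the function being minimized (with P1 and P2 taken as identity), not the solvers actually used, and the exact scaling may differ from the implementation::

    import numpy as np
    from scipy.special import xlogy

    def poisson_objective(w, X, y, s, alpha, l1_ratio):
        """Weighted Poisson deviance plus elastic-net penalty."""
        mu = np.exp(X @ w)                          # log-link: mu = h(Xw) = exp(Xw)
        dev = 2 * (xlogy(y, y / mu) - y + mu)       # unit deviance, xlogy handles y = 0
        obj = np.sum(s * dev) / (2 * np.sum(s))
        obj += alpha * l1_ratio * np.sum(np.abs(w))            # L1 part
        obj += 0.5 * alpha * (1 - l1_ratio) * np.dot(w, w)     # L2 part
        return obj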
From 809e3a2747e451fc97b9a462cf682110d59fac25 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 26 Aug 2018 20:41:25 +0200 Subject: [PATCH 026/209] Remove test_glm_P2_argument --- sklearn/linear_model/tests/test_glm.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index c5d132d35bdb9..82b8ec2435543 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -197,15 +197,15 @@ def test_glm_P1_argument(): assert_raises(ValueError, glm.fit, X, y) -def test_glm_P2_argument(): - """Test GLM P2 arguments - """ - y = np.array([1, 2]) - X = np.array([[1], [1]]) - for P2 in [np.full((2, 2), 'a string', dtype=np.dtype(' Date: Thu, 30 Aug 2018 19:33:27 +0200 Subject: [PATCH 027/209] Filter out DeprecationWarning in old versions of scipy.sparse.linalg.spsolve about usage of umfpack --- sklearn/linear_model/tests/test_glm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 82b8ec2435543..e4be75ddb7a64 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -355,6 +355,7 @@ def test_glm_log_regression(): assert_array_almost_equal(res.coef_, coef) +@pytest.mark.filterwarnings('ignore:DeprecationWarning') def test_normal_ridge(): """Test ridge regression for Normal distributions From 46df5b6555938dc7c9acac67929b0633ea1354a3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 20:40:08 +0200 Subject: [PATCH 028/209] import pytest --- sklearn/linear_model/tests/test_glm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index e4be75ddb7a64..dfa205407a193 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,5 +1,6 @@ import numpy as np from numpy.testing import assert_allclose +import pytest import scipy as sp from scipy import sparse From 21f2136c083b4057868b7886880f356e58703611 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 21:17:53 +0200 Subject: [PATCH 029/209] Document arguments of abstact methods --- sklearn/linear_model/glm.py | 125 +++++++++++++++++++++++++++++++++--- 1 file changed, 115 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index eae4a56ea1d95..825ee6bfe8c45 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -93,12 +93,22 @@ class Link(six.with_metaclass(ABCMeta)): def link(self, mu): """The link function g(mu) with argument mu=E[Y] returns the linear predictor. + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the predicted mean. """ raise NotImplementedError @abstractmethod def derivative(self, mu): """Derivative of the link g'(mu). + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the predicted mean. """ raise NotImplementedError @@ -106,18 +116,33 @@ def derivative(self, mu): def inverse(self, lin_pred): """The inverse link function h(lin_pred) with the linear predictor as argument returns mu=E[Y]. + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (predicted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative(self, lin_pred): """Derivative of the inverse link function h'(lin_pred). 
+ + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (predicted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative2(self, lin_pred): """Second derivative of the inverse link function h''(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (predicted) linear predictor. """ raise NotImplementedError @@ -236,6 +261,11 @@ def include_upper_bound(self): def in_y_range(self, x): """Returns true if `x` is in the valid range of Y~EDM. + + Parameters + ---------- + x : array, shape (n_samples,) + Target values. """ if self.include_lower_bound: if self.include_upper_bound: @@ -263,12 +293,22 @@ def unit_variance(self, mu): \partial\mu^2}}\big|_{y=\mu} See also :func:`variance`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. """ raise NotImplementedError() @abstractmethod def unit_variance_derivative(self, mu): r"""The derivative of the unit variance w.r.t. `mu`, :math:`v'(\mu)`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Target values. """ raise NotImplementedError() @@ -276,6 +316,17 @@ def variance(self, mu, phi=1, weights=1): r"""The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return phi/weights * self.unit_variance(mu) @@ -284,6 +335,17 @@ def variance_derivative(self, mu, phi=1, weights=1): :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return phi/weights * self.unit_variance_derivative(mu) @@ -293,6 +355,14 @@ def unit_deviance(self, y, mu): In terms of the log-likelihood it is given by :math:`d(y,\mu) = -2\phi\cdot \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. """ raise NotImplementedError() @@ -301,9 +371,13 @@ def unit_deviance_derivative(self, y, mu): :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. - Returns - ------- - derivative: array, shape = (n_samples,) + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. """ return -2*(y-mu)/self.unit_variance(mu) @@ -313,6 +387,17 @@ def deviance(self, y, mu, weights=1): In terms of the likelihood it is :math:`D = -2\phi\cdot \left(loglike(y,\mu,\frac{phi}{s}) - loglike(y,y,\frac{phi}{s})\right)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return np.sum(weights*self.unit_deviance(y, mu)) @@ -326,6 +411,17 @@ def _deviance(self, coef, X, y, weights, link): def deviance_derivative(self, y, mu, weights=1): """The derivative w.r.t. 
`mu` of the deviance. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return weights*self.unit_deviance_derivative(y, mu) @@ -464,7 +560,16 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): return eta, mu, score, fisher def starting_mu(self, y, weights=1): - """Starting values for the mean mu_i in (unpenalized) IRLS.""" + """Starting values for the mean mu_i in (unpenalized) IRLS. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ return ((weights*y+np.mean(weights*y)) / (2.*np.sum(np.ones_like(y)*weights))) @@ -656,12 +761,12 @@ def _irls_step(X, W, P2, z): X : numpy array or sparse matrix of shape (n_samples, n_features) Training data (with intercept included if present) - W : numpy array of shape (n_samples, ) + W : numpy array of shape (n_samples,) P2 : numpy array or sparse matrix of shape (n_features, n_features) The l2-penalty matrix or vector (=diagonal matrix) - z : numpy array of shape (n_samples, ) + z : numpy array of shape (n_samples,) Working observations Returns @@ -927,12 +1032,12 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- X : numpy array or sparse matrix of shape (n_samples, n_features) - Training data + Training data. - y : numpy array of shape (n_samples, ) - Target values + y : numpy array of shape (n_samples,) + Target values. - sample_weight : array of shape (n_samples, ) or None,\ + sample_weight : array of shape (n_samples,) or None,\ optinal (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has From 1faedf87eecfd2c6a668e2aec6e28a9fff8780ec Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 21:18:51 +0200 Subject: [PATCH 030/209] Pytest filter warnings use two colons --- sklearn/linear_model/tests/test_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index dfa205407a193..1ac5ccd4d3d5c 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -356,7 +356,7 @@ def test_glm_log_regression(): assert_array_almost_equal(res.coef_, coef) -@pytest.mark.filterwarnings('ignore:DeprecationWarning') +@pytest.mark.filterwarnings('ignore::DeprecationWarning') def test_normal_ridge(): """Test ridge regression for Normal distributions From 992f9819838336a9372a7cadecc53eeafef439ff Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 22:07:33 +0200 Subject: [PATCH 031/209] Improve documentation of arguments that were so far undocumented --- sklearn/linear_model/glm.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 825ee6bfe8c45..a0d0266fa0efe 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -579,6 +579,17 @@ class TweedieDistribution(ExponentialDispersionModel): They have :math:`\mu=\mathrm{E}[Y]` and :math:`\mathrm{Var}[Y] \propto \mu^power. 
+ Special cases are: + + ===== ================ + Power Distribution + ===== ================ + 0 Normal + 1 Poisson + (0,1) Compound Poisson + 2 Gamma + 3 Inverse Gaussian + Attributes ---------- power : float @@ -586,6 +597,12 @@ class TweedieDistribution(ExponentialDispersionModel): :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): + """ + Parameters + ---------- + power : float (default=0) + Power of (of mu) of the variance function. + """ self.power = power self._upper_bound = np.Inf self._include_upper_bound = False @@ -623,6 +640,9 @@ def __init__(self, power=0): # Positive Stable self._lower_bound = 0 self._include_lower_bound = False + else: + raise ValueError('The power must be a float, i.e. real number, ' + 'got (power={})'.format(power)) @property def power(self): @@ -653,12 +673,22 @@ def include_upper_bound(self): def unit_variance(self, mu): """The unit variance of a Tweedie distribution is v(mu)=mu**power. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. """ return np.power(mu, self.power) def unit_variance_derivative(self, mu): """The derivative of the unit variance of a Tweedie distribution is v(mu)=power*mu**(power-1). + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. """ return self.power*np.power(mu, self.power-1) @@ -680,9 +710,6 @@ def unit_deviance(self, y, mu): return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) - def likelihood(self, y, X, w, phi, weights=1): - raise NotImplementedError('This function is not (yet) implemented.') - class NormalDistribution(TweedieDistribution): """Class for the Normal (aka Gaussian) distribution""" From 06b8451ea109040371615e9e7baaa8ff505197f0 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 00:26:39 +0200 Subject: [PATCH 032/209] Further improve documentation of arguments --- sklearn/linear_model/glm.py | 43 +++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index a0d0266fa0efe..9688b1e0c9e5c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -601,7 +601,7 @@ def __init__(self, power=0): Parameters ---------- power : float (default=0) - Power of (of mu) of the variance function. + Variance power of the `unit_variance` function. """ self.power = power self._upper_bound = np.Inf @@ -798,7 +798,7 @@ def _irls_step(X, W, P2, z): Returns ------- - coef: array, shape = (X.shape[1]) + coef: array, shape (X.shape[1]) """ # TODO: scipy.linalg.solve is faster, but ordinary least squares uses # scipy.linalg.lstsq. What is more appropriate? @@ -898,7 +898,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : None or array of shape (n_features*, ), optional\ + P1 : None or array of shape (n_features*,), optional\ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The @@ -1007,7 +1007,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Attributes ---------- - coef_ : array, shape (n_features, ) + coef_ : array, shape (n_features,) Estimated coefficients for the linear predictor (X*coef_) in the GLM. 
intercept_ : float @@ -1679,7 +1679,7 @@ def Hs(s): if self.fit_dispersion in ['chisqr', 'deviance']: # attention because of rescaling of weights - self.dispersion_ = self.estimate_phi(y, X, weights)*weights_sum + self.dispersion_ = self.estimate_phi(X, y, weights)*weights_sum return self @@ -1688,12 +1688,12 @@ def linear_predictor(self, X): Parameters ---------- - X : numpy array or sparse matrix of shape [n_samples,n_features] + X : numpy array or sparse matrix, shape (n_samples, n_features) Samples. Returns ------- - C : array, shape = (n_samples) + C : array, shape (n_samples) Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") @@ -1709,12 +1709,15 @@ def predict(self, X, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix of shape [n_samples,n_features] + X : numpy array or sparse matrix, shape (n_samples, n_features) Samples. + sample_weight : array of shape (n_samples,) or None , \ + (default=None) + Returns ------- - C : array, shape = (n_samples) + C : array, shape (n_samples,) Returns predicted values times sample_weight. """ X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], @@ -1726,9 +1729,21 @@ def predict(self, X, sample_weight=None): return mu*weights - def estimate_phi(self, y, X, sample_weight=None): - """Estimation of the dispersion parameter. + def estimate_phi(self, X, y, sample_weight=None): + """Estimation of the dispersion parameter phi. Returns the estimate. + + Parameters + ---------- + X : numpy array or sparse matrix of shape (n_samples, n_features) + Training data. + + y : numpy array, shape (n_samples,) + Target values. + + sample_weight : array of shape (n_samples,) or None,\ + optinal (default=None) + Sample weights. """ check_is_fitted(self, "coef_") _dtype = [np.float64, np.float32] @@ -1773,13 +1788,13 @@ def score(self, X, y, sample_weight=None): Parameters ---------- - X : array-like, shape = (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Test samples - y : array-like of shape = (n_samples) + y : array-like, shape (n_samples,) True valeus for X. - sample_weight : array-like, shape = (n_samples), optional + sample_weight : array-like, shape = (n_samples,), optional Sample weights. Returns From c93f60d9e98a5be0d493d513e37c3c9de5167542 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 08:25:01 +0200 Subject: [PATCH 033/209] Remove parameters docstring for __init__ --- sklearn/linear_model/glm.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 9688b1e0c9e5c..021927b598822 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -597,12 +597,6 @@ class TweedieDistribution(ExponentialDispersionModel): :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): - """ - Parameters - ---------- - power : float (default=0) - Variance power of the `unit_variance` function. 
- """ self.power = power self._upper_bound = np.Inf self._include_upper_bound = False From 66ec63b5157026f8541e2761c29ef3225d89a44c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 19:24:23 +0200 Subject: [PATCH 034/209] Fix typos in docstring of TweedieDistribution --- sklearn/linear_model/glm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 021927b598822..65abb42b043d5 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -577,7 +577,7 @@ def starting_mu(self, y, weights=1): class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. They have :math:`\mu=\mathrm{E}[Y]` and - :math:`\mathrm{Var}[Y] \propto \mu^power. + :math:`\mathrm{Var}[Y] \propto \mu^power`. Special cases are: @@ -593,7 +593,7 @@ class TweedieDistribution(ExponentialDispersionModel): Attributes ---------- power : float - The variance power of the unit_variance + The variance power of the `unit_variance` :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): From 53c69702c790223d3940cd650b9d52a61e39b244 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 22:20:43 +0200 Subject: [PATCH 035/209] Change docstring section of TweedieDistribution from Attributes to Parameters --- sklearn/linear_model/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 65abb42b043d5..fcb6c9754b826 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -590,7 +590,7 @@ class TweedieDistribution(ExponentialDispersionModel): 2 Gamma 3 Inverse Gaussian - Attributes + Parameters ---------- power : float The variance power of the `unit_variance` From 87d5ba38c5663224edd7ec10bc7efdf27172e7f8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 7 Oct 2018 19:43:42 +0200 Subject: [PATCH 036/209] Minor doc improvements of GeneralizedLinearRegressor --- sklearn/linear_model/glm.py | 87 +++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index fcb6c9754b826..01d5420773ffc 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -6,17 +6,16 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -# TODO: Write more tests -# TODO: Write examples and more docu -# TODO: deal with option self.copy_X +# TODO: Write examples +# TODO: Make option self.copy_X more meaningfull than just for start values. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. -# TODO: Add cross validation support +# TODO: Add cross validation support? # TODO: Should GeneralizedLinearRegressor inherit from LinearModel? # So far, it does not. # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? -# TODO: Negative values in P1 are not allowed so far. They could be used to +# TODO: Negative values in P1 are not allowed so far. They could be used # for group lasso. # Design Decisions: @@ -26,7 +25,7 @@ # regressor, Bernoulli/Binomial => classifier. # Solution: GeneralizedLinearRegressor since this is the focus. 
# - Allow for finer control of penalty terms: -# L1: ||P1*w||_1 with P1*w a componentwise product, this allows to exclude +# L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude # factors from the L1 penalty. # L2: w*P2*w with P2 a (demi-) positive definite matrix, e.g. P2 could be # a 1st or 2nd order difference matrix (compare B-spline penalties and @@ -322,7 +321,7 @@ def variance(self, mu, phi=1, weights=1): mu : array, shape (n_samples,) Predicted mean. - phi : float + phi : float (default=1) Dispersion parameter. weights : array, shape (n_samples,) (default=1) @@ -592,7 +591,7 @@ class TweedieDistribution(ExponentialDispersionModel): Parameters ---------- - power : float + power : float (default=0) The variance power of the `unit_variance` :math:`v(\mu) = \mu^{power}`. """ @@ -779,22 +778,22 @@ def _irls_step(X, W, P2, z): Parameters ---------- - X : numpy array or sparse matrix of shape (n_samples, n_features) + X : {numpy array, sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present) - W : numpy array of shape (n_samples,) + W : numpy array, shape (n_samples,) - P2 : numpy array or sparse matrix of shape (n_features, n_features) - The l2-penalty matrix or vector (=diagonal matrix) + P2 : {numpy array, sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix) - z : numpy array of shape (n_samples,) + z : numpy array, shape (n_samples,) Working observations Returns ------- coef: array, shape (X.shape[1]) """ - # TODO: scipy.linalg.solve is faster, but ordinary least squares uses + # TODO: scipy.linalg.solve seems faster, but ordinary least squares uses # scipy.linalg.lstsq. What is more appropriate? n_samples, n_features = X.shape if sparse.issparse(X): @@ -892,19 +891,20 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : None or array of shape (n_features*,), optional\ + P1 : {None, array-like}, shape (n_features*,), optional\ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The - default value ``None`` is the same as an array of ones. + default value ``None`` is the same as a 1d array of ones. Note that n_features* = X.shape[1] = length of coef_ (intercept always excluded from counting). - P2 : None or array of shape (n_features*, n_features*), optional\ - (default=None) + P2 : {None, array-like, sparse matrix}, shape \ + (n_features*, n_features*), optional (default=None) With this square matrix the L2 penalty is calculated as `w P2 w`. This gives a fine control over this penalty (Tikhonov regularization). + The default value ``None`` is the same as the idendity matrix. Note that n_features* = X.shape[1] = length of coef_ (intercept always excluded from counting). P2 must be positive semi-definite. @@ -939,8 +939,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. - - 'cd' is the coordinate descent algorithm. It can deal with L1 and - L2 penalties. + - 'cd' is the coordinate descent algorithm. It can + deal with L1 as well as L2 penalties. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. 
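For intuition about the 'irls' solver named above, the textbook form of one penalized IRLS update solves the linear system :math:`(X^T W X + P_2)\,w = X^T W z` with working weights :math:`W` and working observations :math:`z`. A dense-matrix sketch (illustration only; the actual ``_irls_step`` also handles sparse input and may solve the system differently)::

    import numpy as np
    from scipy import linalg

    def irls_step_sketch(X, W, P2, z):
        """One penalized IRLS update: solve (X'WX + P2) w = X'Wz."""
        XtW = X.T * W                 # equivalent to X.T @ np.diag(W)
        A = XtW @ X + P2              # P2: L2 penalty matrix
        b = XtW @ z                   # z: working observations
        return linalg.solve(A, b)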
@@ -958,8 +958,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): does not exit (first call to fit), option ``start_params`` sets the starting values for ``coef_`` and ``intercept_``. - start_params : {None, 'least_squares', 'zero'} or array of shape \ - (n_features, ) or }, optional (default=None) + start_params : {None, 'least_squares', 'zero', array of shape \ + (n_features, )}, optional (default=None) If an array of size n_features is supplied, use these as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. @@ -979,7 +979,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. - random_state : int, RandomState instance or None, optional (default=None) + random_state : {int, RandomState instance, None}, optional (default=None) The seed of the pseudo random number generator that selects a random feature to be updated for solver 'cd' (coordinate descent). If int, random_state is the seed used by the random @@ -1052,13 +1052,13 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Training data. - y : numpy array of shape (n_samples,) + y : array-like, shape (n_samples,) Target values. - sample_weight : array of shape (n_samples,) or None,\ + sample_weight : {None, array-like}, shape (n_samples,),\ optinal (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has @@ -1190,12 +1190,12 @@ def fit(self, X, y, sample_weight=None): if self.P1 is None: P1 = np.ones(X.shape[1]) else: - P1 = np.atleast_1d(np.copy(self.P1)) + P1 = np.copy(np.atleast_1d(self.P1)) if P1.dtype.kind not in ['b', 'i', 'u', 'f']: raise ValueError("P1 must be a numeric value; " "got (dtype={0}).".format(P1.dtype)) if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): - raise ValueError("P1 must be either None or an 1D array with " + raise ValueError("P1 must be either None or a 1d array with " "the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." @@ -1324,6 +1324,7 @@ def fit(self, X, y, sample_weight=None): coef = _irls_step(Xnew, W, P2, z) else: # with L1 penalty, start with coef = 0 + # TODO: Are there better options? coef = np.zeros(n_features) elif isinstance(self.start_params, six.string_types): if self.start_params == 'zero': @@ -1353,7 +1354,7 @@ def fit(self, X, y, sample_weight=None): # 4. fit # ####################################################################### # algorithms for optimiation - # TODO: Parallelize it + # TODO: Parallelize it? self.n_iter_ = 0 converged = False # 4.1 IRLS ############################################################ @@ -1682,12 +1683,12 @@ def linear_predictor(self, X): Parameters ---------- - X : numpy array or sparse matrix, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Samples. Returns ------- - C : array, shape (n_samples) + C : array, shape (n_samples,) Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") @@ -1703,17 +1704,18 @@ def predict(self, X, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Samples. 
- sample_weight : array of shape (n_samples,) or None , \ - (default=None) + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) Returns ------- C : array, shape (n_samples,) Returns predicted values times sample_weight. """ + # TODO: Is copy=True necessary? X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', copy=True, ensure_2d=True, allow_nd=False) @@ -1729,14 +1731,14 @@ def estimate_phi(self, X, y, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Training data. - y : numpy array, shape (n_samples,) + y : array-like, shape (n_samples,) Target values. - sample_weight : array of shape (n_samples,) or None,\ - optinal (default=None) + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) Sample weights. """ check_is_fitted(self, "coef_") @@ -1782,13 +1784,14 @@ def score(self, X, y, sample_weight=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) - Test samples + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Test samples. y : array-like, shape (n_samples,) - True valeus for X. + True values of target. - sample_weight : array-like, shape = (n_samples,), optional + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) Sample weights. Returns From a9ae023ec331e782d7a23de18f7e6fbb0dd1f57d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 8 Oct 2018 20:21:35 +0200 Subject: [PATCH 037/209] Double escape in doctring of GeneralizedLinearRegressor --- sklearn/linear_model/glm.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 01d5420773ffc..535af60289f8e 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -866,13 +866,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): TODO: Estimation of the dispersion parameter phi. - If your target `y` is a ratio, you should also provide appropriate weights - `w`. As an example, consider Poission distributed counts `z` (integers) and - weights `w`=exposure (time, money, persons years, ...), then you fit + If the target `y` is a ratio, appropriate weights `w` should be provided. + As an example, consider Poission distributed counts `z` (integers) and + weights `w`=exposure (time, money, persons years, ...). Then you fit `y = z/w`, i.e. ``GeneralizedLinearModel(family='Poisson').fit(X, y, - sample_weight=w)``. You need the weights for the right mean, consider: - :math:`\bar(y) = \frac{\sum_i w_i y_i}{\sum_i w_i}`. - In this case one might say that y has a 'scaled' Poisson distributions. + sample_weight=w)``. The weights are necessary for the right mean, consider: + :math:`\\bar(y) = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`. + In this case one might say that 'y' has a 'scaled' Poisson distributions. The same holds for other distributions. Parameters @@ -891,7 +891,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : {None, array-like}, shape (n_features*,), optional\ + P1 : {None, array-like}, shape (n_features*,), optional \ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). 
The From bb62485166412d8ba6393e5ba753b015b806867c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 31 Dec 2018 16:54:40 +0100 Subject: [PATCH 038/209] Add example for GeneralizedLinearRegressor * add example * improve docstring of GeneralizedLinearRegressor * improve user guide for GeneralizedLinearRegressor --- doc/modules/linear_model.rst | 34 +++++--- .../plot_poisson_spline_regression.py | 83 +++++++++++++++++++ sklearn/linear_model/glm.py | 22 ++--- 3 files changed, 115 insertions(+), 24 deletions(-) create mode 100644 examples/linear_model/plot_poisson_spline_regression.py diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index a204ccb080cc9..d65f7ed121f8e 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -901,12 +901,15 @@ for a more versatile L2 penalty. Use cases, where a loss different from the squared loss might be appropriate, are the following: - * If the target values :math:`y` are counts (integer valued) or frequencies, you might try a Poisson deviance. + * If the target values :math:`y` are counts (non-negative integer valued) or + frequencies (non-negative), you might use a Poisson deviance with log-link. - * If the target values are positive valued and skewed, you might try a Gamma deviance. + * If the target values are positive valued and skewed, you might try a + Gamma deviance with log-link. - * If the target values seem to be heavier tailed than a Gamma distribution, you might try an Inverse Gaussian deviance (or even higher variance powers of the Tweedie family). - Keep in mind that the mean is not a good measure for very heavy tailed distributions, cf. extreme value theory. + * If the target values seem to be heavier tailed than a Gamma distribution, + you might try an Inverse Gaussian deviance (or even higher variance powers + of the Tweedie family). Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, @@ -931,6 +934,11 @@ follows: >>> reg.intercept_ #doctest: +ELLIPSIS -0.76383575... + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_spline_regression.py` + Mathematical formulation ------------------------ @@ -956,20 +964,20 @@ e.g. accounting for the dependence structure of :math:`y`. The objective function, which is independent of :math:`\phi`, is minimized with respect to the coefficients :math:`w`. -The deviance is defined by +The deviance is defined by the log of the EDM likelihood as .. 
math:: D(y, \mu) = -2\phi\cdot \left(loglike(y,\mu,\frac{\phi}{s}) - loglike(y,y,\frac{\phi}{s})\right) -===================================== ================================= -Distribution Variance Function :math:`v(\mu)` -===================================== ================================= -Normal ("normal") :math:`1` -Poisson ("poisson") :math:`\mu` -Gamma ("gamma") :math:`\mu^2` -Inverse Gaussian ("inverse.gaussian") :math:`\mu^3` -===================================== ================================= +===================================== =============================== ================================= ============================================ +Distribution Target Domain Variance Function :math:`v(\mu)` Deviance :math:`D(y, \mu)` +===================================== =============================== ================================= ============================================ +Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{/mu}-y+\mu)` +Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +===================================== =============================== ================================= ============================================ Two remarks: diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py new file mode 100644 index 0000000000000..b98bca5d8f867 --- /dev/null +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -0,0 +1,83 @@ +""" +================================= +Poisson Regression with B-Splines +================================= + +As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` +example, a Poisson regression with penalized B-splines (P-splines) [1]_ is +fitted on slightly different sinusodial, Poisson distributed data and +compared to an AdaBoost model with decision trees. +One can see, that this is a hard problem for both estimators. + +.. [1] Eilers, Paul H. C.; Marx, Brian D. "Flexible smoothing with B -splines + and penalties". Statist. Sci. 11 (1996), no. 2, 89--121. 
+ `doi:10.1214/ss/1038425655 + `_ + +""" +print(__doc__) + +# Author: Christian Lorentzen +# based on the AdaBoost regression example from Noel Dawe +# License: BSD 3 clause + +# importing necessary libraries +import numpy as np +from scipy.linalg import toeplitz +from scipy.interpolate import BSpline +import matplotlib.pyplot as plt +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import AdaBoostRegressor +from sklearn.linear_model import GeneralizedLinearRegressor + + +# Create the dataset +xmin, xmax = 0, 6 +rng = np.random.RandomState(1) +X = np.linspace(xmin, xmax, 500)[:, np.newaxis] +y_true = 0.5 * (2.1 + np.sin(X).ravel() + np.sin(6 * X).ravel()) +y = rng.poisson(y_true, X.shape[0]) + +# b-spline basis +nknots, degree = 40, 3 +ns = nknots - degree - 1 # number of base spline functions +dx = (xmax - xmin) / (nknots - 1 - 2 * degree) +knots = np.linspace(xmin - degree * dx, 6 + degree * dx, nknots) +coef = np.zeros(ns) +splineBasis = np.empty((X.shape[0], ns), dtype=float) +for i in range(ns): + coef[i] = 1 + splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ + .ravel() + coef[i] = 0 + +# second order difference matrix +P2 = toeplitz([2, -1] + [0] * (ns - 2)).astype(float) +P2[0, 0] = P2[-1, -1] = 1 + +# Fit regression model +regr_1 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), + n_estimators=10, random_state=rng) + +regr_2 = GeneralizedLinearRegressor(family='poisson', link='log', + fit_intercept=True, alpha=0.02, + l1_ratio=0.1, P2=P2) + +regr_1.fit(X, y) +regr_2.fit(splineBasis, y) + +# Predict +y_1 = regr_1.predict(X) +y_2 = regr_2.predict(splineBasis) + +# Plot the results +plt.figure() +plt.plot(X, y_true, c="b", label="true mean") +plt.scatter(X, y, c="k", marker='.', label="training samples") +plt.plot(X, y_1, c="g", label="AdaBoost n_estimator=10", linewidth=2) +plt.plot(X, y_2, c="r", label="Poisson GLM with B-splines", linewidth=2) +plt.xlabel("data") +plt.ylabel("target") +plt.title("Regression Comparison") +plt.legend() +plt.show() diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 535af60289f8e..37afc8da2d6db 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -6,7 +6,7 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -# TODO: Write examples +# TODO: Write more examples. # TODO: Make option self.copy_X more meaningfull than just for start values. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. @@ -832,7 +832,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): with inverse link function `h` and s=sum of `sample_weight` (which equals n_samples for `sample_weight=None`). - For `P1`=`P2`=identity, the penalty is the elastic net:: + For `P1=P2=identity`, the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 @@ -868,11 +868,11 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If the target `y` is a ratio, appropriate weights `w` should be provided. As an example, consider Poission distributed counts `z` (integers) and - weights `w`=exposure (time, money, persons years, ...). Then you fit - `y = z/w`, i.e. ``GeneralizedLinearModel(family='Poisson').fit(X, y, - sample_weight=w)``. The weights are necessary for the right mean, consider: - :math:`\\bar(y) = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`. 
- In this case one might say that 'y' has a 'scaled' Poisson distributions. + weights `w=exposure` (time, money, persons years, ...). Then you fit + `y = z/w`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + sample_weight=w)``. The weights are necessary for the right meanself. + Consider :math:`\\bar{y} = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`, + in this case one might say that `y` has a 'scaled' Poisson distributions. The same holds for other distributions. Parameters @@ -1017,10 +1017,10 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): References ---------- For the coordinate descent implementation: - .. [1] Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - An Improved GLMNET for L1-regularized Logistic Regression, - Journal of Machine Learning Research 13 (2012) 1999-2030 - https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', From 16d064db7cee1d59569d21631cc2fa41be8b3b14 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 1 Jan 2019 11:58:41 +0100 Subject: [PATCH 039/209] Resolve merge conflicts * resolve merge conflicts in linear_model.rst * replace BSpline by splev to support older scipy versions --- doc/modules/linear_model.rst | 14 +++++++------- .../linear_model/plot_poisson_spline_regression.py | 8 +++++--- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d65f7ed121f8e..09f14735c2907 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -876,18 +876,18 @@ to warm-starting (see :term:`Glossary `). .. _Generalized_linear_regression: -Generalized linear regression +Generalized Linear Regression ============================= :class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two -ways [8]_. First, the predicted values :math:`\hat{y}` are linked to a linear +ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as .. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [9]_. The objective function beeing minimized +exponential dispersion model (EDM) [11]_. The objective function beeing minimized becomes .. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 @@ -983,7 +983,7 @@ Two remarks: * The deviances for at least Normal, Poisson and Gamma distributions are strictly consistent scoring functions for the mean :math:`\mu`, see Eq. - (19)-(20) in [10]_. + (19)-(20) in [12]_. * If you want to model a frequency, i.e. counts per exposure (time, volume, ...) you can do so by a Poisson distribution and passing @@ -993,12 +993,12 @@ Two remarks: .. topic:: References: - .. [8] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - .. [9] Jørgensen, B. (1992). 
The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. + .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. See also `Exponential dispersion model. `_ - .. [10] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ + .. [12] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ Stochastic Gradient Descent - SGD ================================= diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py index b98bca5d8f867..fce85fae1ea8c 100644 --- a/examples/linear_model/plot_poisson_spline_regression.py +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -24,7 +24,8 @@ # importing necessary libraries import numpy as np from scipy.linalg import toeplitz -from scipy.interpolate import BSpline +# from scipy.interpolate import BSpline +from scipy.interpolate import splev import matplotlib.pyplot as plt from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import AdaBoostRegressor @@ -47,8 +48,9 @@ splineBasis = np.empty((X.shape[0], ns), dtype=float) for i in range(ns): coef[i] = 1 - splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ - .ravel() +# splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ +# .ravel() + splineBasis[:, i] = splev(X, (knots, coef, degree)).ravel() coef[i] = 0 # second order difference matrix From 1a02a901d1a6d99484241ca205178ac61fc47846 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 1 Jan 2019 12:59:30 +0100 Subject: [PATCH 040/209] Adapt for minimum numpy version * replace np.block --- sklearn/linear_model/glm.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 37afc8da2d6db..851767055c61c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1235,8 +1235,11 @@ def fit(self, X, y, sample_weight=None): P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), dtype=P2.dtype).tocsr() else: - P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], - [np.zeros((X.shape[1], 1)), P2]]) + # as of numpy 1.13 this would work: + # P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], + # [np.zeros((X.shape[1], 1)), P2]]) + P2 = np.hstack((np.zeros((X.shape[1], 1)), P2)) + P2 = np.vstack((np.zeros((1, X.shape[1]+1)), P2)) else: Xnew = X From 177eb4cc017a7262e472070b4a920250711a099c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 6 Jan 2019 19:46:19 +0100 Subject: [PATCH 041/209] Remove six dependencies as in #12639 * replace six.with_metaclass(ABCMeta) by metaclass=ABCMeta * replace six.integer_types by int * replace six.string_types by str * rebase * correct email address --- sklearn/linear_model/glm.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 851767055c61c..aca49ec7edf28 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -2,12 +2,13 @@ Generalized Linear Models with Exponential Dispersion Family """ -# Author: Christian Lorentzen +# Author: Christian Lorentzen # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause # TODO: Write more examples. -# TODO: Make option self.copy_X more meaningfull than just for start values. +# TODO: Make option self.copy_X more meaningful. 
+# So far, fit uses Xnew instead of X. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support? @@ -51,7 +52,6 @@ from .ridge import Ridge from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning -from ..externals import six from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg @@ -84,7 +84,7 @@ def _check_weights(sample_weight, n_samples): return weights -class Link(six.with_metaclass(ABCMeta)): +class Link(metaclass=ABCMeta): """Abstract base class for Link funtions """ @@ -186,7 +186,7 @@ def inverse_derivative2(self, lin_pred): return np.exp(lin_pred) -class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): +class ExponentialDispersionModel(metaclass=ABCMeta): r"""Base class for reproductive Exponential Dispersion Models (EDM). The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by @@ -1142,7 +1142,7 @@ def fit(self, X, y, sample_weight=None): "with L1 penalties, which are included with " "(alpha={1}) and (l1_ratio={2})." .format(solver, self.alpha, self.l1_ratio)) - if (not isinstance(self.max_iter, six.integer_types) + if (not isinstance(self.max_iter, int) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " "integer;" @@ -1156,7 +1156,7 @@ def fit(self, X, y, sample_weight=None): start_params = self.start_params if start_params is None: pass - elif isinstance(start_params, six.string_types): + elif isinstance(start_params, str): if start_params not in ['least_squares', 'zero']: raise ValueError("The argument start_params must be None, " "'least-squares', 'zero' or an array of right" @@ -1329,7 +1329,7 @@ def fit(self, X, y, sample_weight=None): # with L1 penalty, start with coef = 0 # TODO: Are there better options? coef = np.zeros(n_features) - elif isinstance(self.start_params, six.string_types): + elif isinstance(self.start_params, str): if self.start_params == 'zero': coef = np.zeros(n_features) elif self.start_params == 'least_squares': From 3d4c784df6797c244b31dd3393083d2c63114bf2 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 3 Feb 2019 21:38:24 +0100 Subject: [PATCH 042/209] Improve user guide, doc and fix penalty parameter for Ridge * move parts of docstring to new Notes section * improve user guide and doc * fix typos * fix scaling of penalty parameter in Ridge() * docstring for _check_weights * reduce tol for parameter initialization --- doc/modules/linear_model.rst | 18 +++--- sklearn/linear_model/glm.py | 114 +++++++++++++++++++---------------- 2 files changed, 73 insertions(+), 59 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 09f14735c2907..174d1e4eddae4 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -890,7 +890,7 @@ Secondly, the squared loss function is replaced by the deviance :math:`D` of an exponential dispersion model (EDM) [11]_. The objective function beeing minimized becomes -.. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 +.. math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1 +\frac{\alpha(1-\rho)}{2} w^T P_2 w with sample weights :math:`s`. 
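To make the weighted objective above concrete, here is a small NumPy sketch that
evaluates it for the normal family, where the unit deviance is the squared
error; the function name and data are made up for illustration::

    import numpy as np

    def objective(w, X, y, s, alpha, rho, P1, P2):
        # 1/(2*sum(s)) * D(y, h(Xw); s) + alpha*rho*||P1*w||_1 + alpha*(1-rho)/2 * w'P2 w
        mu = X @ w                                 # identity link, normal family
        deviance = np.sum(s * (y - mu) ** 2)       # D(y, mu; s) = sum_i s_i * d(y_i, mu_i)
        l1 = alpha * rho * np.sum(np.abs(P1 * w))  # P1*w is an element-wise product
        l2 = 0.5 * alpha * (1 - rho) * w @ P2 @ w
        return deviance / (2 * np.sum(s)) + l1 + l2

    rng = np.random.RandomState(0)
    X = rng.randn(50, 4)
    y = X @ np.array([1.0, 0.0, -2.0, 0.5]) + rng.randn(50)
    print(objective(np.zeros(4), X, y, s=np.ones(50), alpha=0.1, rho=0.5,
                    P1=np.ones(4), P2=np.eye(4)))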
@@ -914,7 +914,7 @@ are the following: Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link -:math:`h(x)=x` that guarantees the non-negativeness, e.g. the log-link with +:math:`h(Xw)=Xw` that guarantees the non-negativeness, e.g. the log-link with :math:`h(Xw)=\exp(Xw)`. Note that the feature matrix `X` should be standardized before fitting. This @@ -964,17 +964,19 @@ e.g. accounting for the dependence structure of :math:`y`. The objective function, which is independent of :math:`\phi`, is minimized with respect to the coefficients :math:`w`. -The deviance is defined by the log of the EDM likelihood as +The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` +likelihood as -.. math:: D(y, \mu) = -2\phi\cdot - \left(loglike(y,\mu,\frac{\phi}{s}) - - loglike(y,y,\frac{\phi}{s})\right) +.. math:: d(y, \mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) + - loglike(y,y,\phi)\right) \\ + D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) ===================================== =============================== ================================= ============================================ -Distribution Target Domain Variance Function :math:`v(\mu)` Deviance :math:`D(y, \mu)` +Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` ===================================== =============================== ================================= ============================================ Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` -Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{/mu}-y+\mu)` +Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` ===================================== =============================== ================================= ============================================ diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index aca49ec7edf28..5fc869f81195f 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -8,7 +8,7 @@ # TODO: Write more examples. # TODO: Make option self.copy_X more meaningful. -# So far, fit uses Xnew instead of X. +# So far, fit uses Xnew instead of X. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support? @@ -28,7 +28,7 @@ # - Allow for finer control of penalty terms: # L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude # factors from the L1 penalty. -# L2: w*P2*w with P2 a (demi-) positive definite matrix, e.g. P2 could be +# L2: w*P2*w with P2 a (semi-) positive definite matrix, e.g. P2 could be # a 1st or 2nd order difference matrix (compare B-spline penalties and # Tikhonov regularization). 
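The design note above mentions difference matrices as a typical choice for
``P2`` (B-spline penalties, Tikhonov regularization); the spline example added
earlier in this series builds exactly such a matrix with ``toeplitz``. A short
sketch of that construction and its equivalent form as ``D.T @ D`` for a
difference operator ``D``::

    import numpy as np
    from scipy.linalg import toeplitz

    n_features = 6

    # banded penalty matrix as in the P-spline example of this series;
    # interior rows carry the difference stencil [-1, 2, -1]
    P2 = toeplitz([2.0, -1.0] + [0.0] * (n_features - 2))
    P2[0, 0] = P2[-1, -1] = 1.0

    # equivalent construction: w @ P2 @ w == sum((w[j+1] - w[j])**2),
    # i.e. a Tikhonov-style roughness penalty on neighbouring coefficients
    D = np.diff(np.eye(n_features), axis=0)
    assert np.allclose(P2, D.T @ D)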
# - The link funtion (instance of class Link) is necessary for the evaluation @@ -59,6 +59,8 @@ def _check_weights(sample_weight, n_samples): + """Check that weights are non-negative and have the right shape + """ if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): @@ -594,6 +596,7 @@ class TweedieDistribution(ExponentialDispersionModel): power : float (default=0) The variance power of the `unit_variance` :math:`v(\mu) = \mu^{power}`. + For ``0`. - The fit itself does not need Y to be from an EDM, but only assumes - the first two moments :math:`E[Y_i]=\\mu_i=h(\\eta_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{w_i} v(\\mu_i)`. - - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - (penalized) maximum likelihood which is equivalent to minimizing the - deviance. - - For `alpha` > 0, the feature matrix `X` should be standardized in order to - penalize features equally strong. Call - :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - - TODO: Estimation of the dispersion parameter phi. - - If the target `y` is a ratio, appropriate weights `w` should be provided. - As an example, consider Poission distributed counts `z` (integers) and - weights `w=exposure` (time, money, persons years, ...). Then you fit - `y = z/w`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, - sample_weight=w)``. The weights are necessary for the right meanself. - Consider :math:`\\bar{y} = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`, - in this case one might say that `y` has a 'scaled' Poisson distributions. - The same holds for other distributions. - Parameters ---------- alpha : float, optional (default=1) @@ -891,22 +871,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : {None, array-like}, shape (n_features*,), optional \ + P1 : {None, array-like}, shape (n_features,), optional \ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The default value ``None`` is the same as a 1d array of ones. - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). + Note that n_features = X.shape[1]. P2 : {None, array-like, sparse matrix}, shape \ - (n_features*, n_features*), optional (default=None) + (n_features, n_features), optional (default=None) With this square matrix the L2 penalty is calculated as `w P2 w`. This gives a fine control over this penalty (Tikhonov - regularization). - The default value ``None`` is the same as the idendity matrix. - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). P2 must be positive semi-definite. + regularization). The diagonal zeros of a diagonal P2, for example, + exclude all corresponding coefficients from the L2 penalty. + The default value ``None`` is the same as the identity matrix. + Note that n_features = X.shape[1]. P2 must be positive semi-definite. fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be @@ -929,18 +908,22 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ optional (default='auto') - Algorithm to use in the optimization problem. + Algorithm to use in the optimization problem: - - 'auto' sets 'irls' if l1_ratio equals 0, else 'cd'. 
+ 'auto' + Sets 'irls' if l1_ratio equals 0, else 'cd'. - - 'irls' is iterated reweighted least squares (Fisher scoring). + 'irls' + iterated reweighted least squares (Fisher scoring). It is the standard algorithm for GLMs. Cannot deal with L1 penalties. - - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. + 'newton-cg', 'lbfgs' + Cannot deal with L1 penalties. - - 'cd' is the coordinate descent algorithm. It can - deal with L1 as well as L2 penalties. + 'cd' + coordinate descent algorithm. It can deal with L1 as well as L2 + penalties. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. @@ -959,10 +942,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): starting values for ``coef_`` and ``intercept_``. start_params : {None, 'least_squares', 'zero', array of shape \ - (n_features, )}, optional (default=None) - If an array of size n_features is supplied, use these as start values + (n_features*, )}, optional (default=None) + If an array of size n_features* is supplied, use it as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. + Note that n_features* = X.shape[1] + fit_intercept includes the + intercept in counting. If 'least_squares' is set, the result of a least squares fit in the link space (linear predictor) is taken. If 'zero' is set, all coefficients start with zero. @@ -1013,6 +998,30 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): n_iter_ : int Actual number of iterations of the solver. + Notes + ----- + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + (penalized) maximum likelihood which is equivalent to minimizing the + deviance. + + For `alpha` > 0, the feature matrix `X` should be standardized in order to + penalize features equally strong. Call + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. + + If the target `y` is a ratio, appropriate sample weights `s` should be + provided. + As an example, consider Poission distributed counts `z` (integers) and + weights `s=exposure` (time, money, persons years, ...). Then you fit + `y = z/s`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + sample_weight=s)``. The weights are necessary for the right (finite + sample) mean. + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + in this case one might say that `y` has a 'scaled' Poisson distributions. + The same holds for other distributions. References ---------- @@ -1138,10 +1147,10 @@ def fit(self, X, y, sample_weight=None): else: solver = 'cd' if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): - raise ValueError("The chosen solver (solver={0}) can't deal " - "with L1 penalties, which are included with " - "(alpha={1}) and (l1_ratio={2})." - .format(solver, self.alpha, self.l1_ratio)) + raise ValueError("The chosen solver (solver={0}) can't deal " + "with L1 penalties, which are included with " + "(alpha={1}) and (l1_ratio={2})." 
+ .format(solver, self.alpha, self.l1_ratio)) if (not isinstance(self.max_iter, int) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " @@ -1340,14 +1349,17 @@ def fit(self, X, y, sample_weight=None): elif self.l1_ratio <= 0.01: # ElasticNet says l1_ratio <= 0.01 is not reliable # => use Ridge + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha) + alpha=self.alpha*n_samples, + tol=np.max([self.tol, np.sqrt(self.tol)])) reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: # TODO: Does this make sense at all? reg = ElasticNet(copy_X=True, fit_intercept=False, - alpha=self.alpha, l1_ratio=self.l1_ratio) + alpha=self.alpha, l1_ratio=self.l1_ratio, + tol=np.max([self.tol, np.sqrt(self.tol)])) reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: @@ -1557,7 +1569,7 @@ def Hs(s): # minimize_z: a z + 1/2 b z^2 + c |d+z| # a = A_j # b = B_jj > 0 - # c = |P1_j| = P1_j > 0, ee 1.3 + # c = |P1_j| = P1_j > 0, see 1.3 # d = w_j + d_j # cf. https://arxiv.org/abs/0708.1485 Eqs. (3) - (4) # with beta = z+d, beta_hat = d-a/b and gamma = c/b From 919912c3f98e6d1190737e344c14f31c2eef9077 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 17 Feb 2019 18:38:36 +0100 Subject: [PATCH 043/209] Smarter intercept initialization and docstring improvements * smarter initialization of intercept * PEP 257 -- Docstring Conventions * minor docstring changes --- sklearn/linear_model/glm.py | 265 +++++++++++++++++++++--------------- 1 file changed, 153 insertions(+), 112 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 5fc869f81195f..d69ccd0a66486 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -59,14 +59,13 @@ def _check_weights(sample_weight, n_samples): - """Check that weights are non-negative and have the right shape - """ + """Check that weights are non-negative and have the right shape.""" if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): if sample_weight <= 0: raise ValueError("Sample weights must be non-negative.") - weights = sample_weight*np.ones(n_samples) + weights = sample_weight * np.ones(n_samples) else: _dtype = [np.float64, np.float32] weights = check_array(sample_weight, accept_sparse='csr', @@ -75,8 +74,8 @@ def _check_weights(sample_weight, n_samples): if weights.ndim > 1: raise ValueError("Sample weight must be 1D array or scalar") elif weights.shape[0] != n_samples: - raise ValueError("Sample weights must have the same length as" - " y") + raise ValueError("Sample weights must have the same length as " + "y") if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") elif not np.sum(weights) > 0: @@ -87,70 +86,72 @@ def _check_weights(sample_weight, n_samples): class Link(metaclass=ABCMeta): - """Abstract base class for Link funtions - """ + """Abstract base class for Link funtions.""" @abstractmethod def link(self, mu): - """The link function g(mu) with argument mu=E[Y] returns the - linear predictor. + """Compute the link function g(mu). + + The link function links the mean mu=E[Y] to the so called linear + predictor (X*w), i.e. g(mu) = linear predictor. Parameters ---------- mu : array, shape (n_samples,) - Usually the predicted mean. + Usually the (predicted) mean. """ raise NotImplementedError @abstractmethod def derivative(self, mu): - """Derivative of the link g'(mu). + """Compute the derivative of the link g'(mu). 
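To illustrate what a concrete implementation of this interface has to provide,
here is a hypothetical square-root link written against the five methods above;
it is a self-contained sketch (on this branch it would subclass ``Link`` from
this module)::

    import numpy as np

    class SqrtLink:
        """Hypothetical link g(mu) = sqrt(mu) with inverse h(eta) = eta**2."""

        def link(self, mu):
            return np.sqrt(mu)

        def derivative(self, mu):
            return 0.5 / np.sqrt(mu)

        def inverse(self, lin_pred):
            return lin_pred ** 2

        def inverse_derivative(self, lin_pred):
            return 2 * lin_pred

        def inverse_derivative2(self, lin_pred):
            return 2 * np.ones_like(lin_pred)

    link = SqrtLink()
    mu = np.array([0.5, 1.0, 4.0])
    # round trip h(g(mu)) = mu and chain rule h'(g(mu)) * g'(mu) = 1
    assert np.allclose(link.inverse(link.link(mu)), mu)
    assert np.allclose(link.inverse_derivative(link.link(mu)) * link.derivative(mu), 1.0)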
Parameters ---------- mu : array, shape (n_samples,) - Usually the predicted mean. + Usually the (predicted) mean. """ raise NotImplementedError @abstractmethod def inverse(self, lin_pred): - """The inverse link function h(lin_pred) with the linear predictor as - argument returns mu=E[Y]. + """Compute the inverse link function h(lin_pred). + + Gives the inverse relationship between linkear predictor and the mean + mu=E[Y], i.e. h(linear predictor) = mu. Parameters ---------- lin_pred : array, shape (n_samples,) - Usually the (predicted) linear predictor. + Usually the (fitted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative(self, lin_pred): - """Derivative of the inverse link function h'(lin_pred). + """Compute the derivative of the inverse link function h'(lin_pred). Parameters ---------- lin_pred : array, shape (n_samples,) - Usually the (predicted) linear predictor. + Usually the (fitted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative2(self, lin_pred): - """Second derivative of the inverse link function h''(lin_pred). + """Compute 2nd derivative of the inverse link function h''(lin_pred). Parameters ---------- lin_pred : array, shape (n_samples,) - Usually the (predicted) linear predictor. + Usually the (fitted) linear predictor. """ raise NotImplementedError class IdentityLink(Link): - """The identity link function g(x)=x. - """ + """The identity link function g(x)=x.""" def link(self, mu): return mu @@ -169,8 +170,7 @@ def inverse_derivative2(self, lin_pred): class LogLink(Link): - """The log link function g(x)=log(x). - """ + """The log link function g(x)=log(x).""" def link(self, mu): return np.log(mu) @@ -238,26 +238,22 @@ class ExponentialDispersionModel(metaclass=ABCMeta): @abstractproperty def lower_bound(self): - """The lower bound of values of Y~EDM. - """ + """The lower bound of values of Y~EDM.""" raise NotImplementedError() @abstractproperty def upper_bound(self): - """The upper bound of values of Y~EDM. - """ + """The upper bound of values of Y~EDM.""" raise NotImplementedError() @abstractproperty def include_lower_bound(self): - """If True, values of y may equal lower bound: y >= lower_bound. - """ + """If True, values of y may equal lower bound: y >= lower_bound.""" raise NotImplementedError() @abstractproperty def include_upper_bound(self): - """If True, values of y may equal upper bound: y <= upper_bound. - """ + """If True, values of y may equal upper bound: y <= upper_bound.""" raise NotImplementedError() def in_y_range(self, x): @@ -285,7 +281,9 @@ def in_y_range(self, x): @abstractmethod def unit_variance(self, mu): - r"""The unit variance :math:`v(\mu)` determines the variance as + r"""Compute the unit variance function. + + The unit variance :math:`v(\mu)` determines the variance as a function of the mean :math:`\mu` by :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. It can also be derived from the unit deviance :math:`d(y,\mu)` as @@ -304,7 +302,9 @@ def unit_variance(self, mu): @abstractmethod def unit_variance_derivative(self, mu): - r"""The derivative of the unit variance w.r.t. `mu`, :math:`v'(\mu)`. + r"""Compute the derivative of the unit variance w.r.t. mu. + + Return :math:`v'(\mu)`. Parameters ---------- @@ -314,7 +314,9 @@ def unit_variance_derivative(self, mu): raise NotImplementedError() def variance(self, mu, phi=1, weights=1): - r"""The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + r"""Compute the variance function. 
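As a quick illustration of the relationship
:math:`\mathrm{Var}[Y_i]=\phi/s_i \cdot v(\mu_i)` implemented here, the
following sketch assumes the ``TweedieDistribution`` class of this branch is
importable from ``sklearn.linear_model.glm`` (the import path and the numbers
are assumptions for illustration only)::

    import numpy as np
    from sklearn.linear_model.glm import TweedieDistribution  # only on this branch

    mu = np.array([0.5, 1.0, 2.0])
    weights = np.array([1.0, 2.0, 4.0])
    phi = 1.5

    gamma_like = TweedieDistribution(power=2)   # Gamma: unit variance v(mu) = mu**2
    var = gamma_like.variance(mu, phi=phi, weights=weights)

    # written out: Var[Y_i] = phi / s_i * v(mu_i)
    assert np.allclose(var, phi / weights * mu ** 2)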
+ + The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. @@ -332,7 +334,9 @@ def variance(self, mu, phi=1, weights=1): return phi/weights * self.unit_variance(mu) def variance_derivative(self, mu, phi=1, weights=1): - r"""The derivative of the variance w.r.t. `mu`, + r"""Compute the derivative of the variance w.r.t. mu. + + Returns :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. @@ -352,10 +356,12 @@ def variance_derivative(self, mu, phi=1, weights=1): @abstractmethod def unit_deviance(self, y, mu): - r"""The unit_deviance :math:`d(y,\mu)`. - In terms of the log-likelihood it is given by + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as :math:`d(y,\mu) = -2\phi\cdot - \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` Parameters ---------- @@ -368,7 +374,9 @@ def unit_deviance(self, y, mu): raise NotImplementedError() def unit_deviance_derivative(self, y, mu): - r"""The derivative w.r.t. `mu` of the unit deviance + r"""Compute the derivative of the unit deviance w.r.t. mu. + + The derivative of the unit deviance is given by :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. @@ -383,9 +391,12 @@ def unit_deviance_derivative(self, y, mu): return -2*(y-mu)/self.unit_variance(mu) def deviance(self, y, mu, weights=1): - r"""The deviance is given by :math:`D = \sum_i s_i \cdot d(y, \mu) + r"""Compute the deviance. + + The deviance is a weighted sum of the per sample unit deviances, + :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. - In terms of the likelihood it is :math:`D = -2\phi\cdot + In terms of the log-likelihood it is :math:`D = -2\phi\cdot \left(loglike(y,\mu,\frac{phi}{s}) - loglike(y,y,\frac{phi}{s})\right)`. @@ -403,15 +414,15 @@ def deviance(self, y, mu, weights=1): return np.sum(weights*self.unit_deviance(y, mu)) def _deviance(self, coef, X, y, weights, link): - """The deviance as a function of the coefficients `coef` - (:math:`w`). - """ + """Compute the deviance as a function of the coefficients and data.""" lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) return self.deviance(y, mu, weights) def deviance_derivative(self, y, mu, weights=1): - """The derivative w.r.t. `mu` of the deviance. + """Compute the derivative of the deviance w.r.t. mu. + + It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. Parameters ---------- @@ -427,7 +438,9 @@ def deviance_derivative(self, y, mu, weights=1): return weights*self.unit_deviance_derivative(y, mu) def _score(self, coef, phi, X, y, weights, link): - r"""The score function is the derivative of the + r"""Compute the score function. + + The score function is the derivative of the log-likelihood w.r.t. `coef` (:math:`w`). It is given by @@ -453,7 +466,8 @@ def _score(self, coef, phi, X, y, weights, link): return score def _fisher_matrix(self, coef, phi, X, y, weights, link): - r"""The Fisher information matrix. + r"""Compute the Fisher information matrix. 
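As a sanity check of the score and Fisher matrix defined here, the following
NumPy sketch writes both out for a Poisson GLM with log link and
:math:`\phi=1`; with this canonical link they reduce to the familiar
expressions ``X'(y - mu)`` and ``X' diag(mu) X`` (all names below are
illustrative)::

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.randn(200, 3)
    coef = np.array([0.1, -0.2, 0.3])
    y = rng.poisson(np.exp(X @ coef)).astype(float)
    s, phi = np.ones(200), 1.0                    # sample weights and dispersion

    eta = X @ coef                                # linear predictor
    mu = np.exp(eta)                              # log link: h(eta) = h'(eta) = exp(eta)
    h_prime, v_mu = np.exp(eta), mu               # Poisson unit variance v(mu) = mu

    # score = X' D diag(s / (phi * v(mu))) (y - mu)   with D = diag(h'(eta))
    score = X.T @ (h_prime * s / (phi * v_mu) * (y - mu))
    # Fisher matrix F = X' diag(s * h'(eta)**2 / (phi * v(mu))) X
    fisher = (X * (s * h_prime ** 2 / (phi * v_mu))[:, None]).T @ X

    assert np.allclose(score, X.T @ (y - mu))
    assert np.allclose(fisher, (X * mu[:, None]).T @ X)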
+ The Fisher information matrix, also known as expected information matrix is given by @@ -482,9 +496,10 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): - r"""The observed information matrix. + r"""Compute the observed information matrix. + The observed information matrix, also known as the negative of - the Hessian matrix of the log-likelihood. It is given by + the Hessian matrix of the log-likelihood, is given by .. math: @@ -518,8 +533,10 @@ def _observed_information(self, coef, phi, X, y, weights, link): return observed_information def _deviance_derivative(self, coef, X, y, weights, link): - r"""The derivative w.r.t. `coef` (:math:`w`) of the deviance as a - function of the coefficients `coef`. + r"""Compute the derivative of the deviance w.r.t. coef. + + The derivative of the deviance w.r.t. `coef` (:math:`w`) as a + function of the coefficients `coef` and the data. This is equivalent to :math:`-2\phi` times the score function :func:`_score` (derivative of the log-likelihood). """ @@ -528,9 +545,11 @@ def _deviance_derivative(self, coef, X, y, weights, link): return -2*score def _deviance_hessian(self, coef, X, y, weights, link): - r"""The hessian matrix w.r.t. `coef` (:math:`w`) of the deviance - as a function of the coefficients `coef`. - This is equivalent to :math:`+2\phi` times the observed information + r"""Compute the hessian matrix of the deviance w.r.t. coef. + + The hessian of the deviance w.r.t. `coef` (:math:`w`) is evaluated as + a function of the coefficients `coef` and the data. + It is equivalent to :math:`+2\phi` times the observed information matrix. """ info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, @@ -538,8 +557,12 @@ def _deviance_hessian(self, coef, X, y, weights, link): return 2*info_matrix def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): - """Calculates eta (linear predictor), mu, score function (derivative - of log-likelihood) and Fisher matrix (all with phi=1) all in one go""" + """Compute linear predictor, mean, score function and fisher matrix. + + It calculates the linear predictor, the mean, score function + (derivative of log-likelihood) and Fisher information matrix + all in one go as function of `coef` (:math:`w`) and the data. + """ n_samples, n_features = X.shape # eta = linear predictor eta = safe_sparse_dot(X, coef, dense_output=True) @@ -561,7 +584,9 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): return eta, mu, score, fisher def starting_mu(self, y, weights=1): - """Starting values for the mean mu_i in (unpenalized) IRLS. + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. Parameters ---------- @@ -577,7 +602,9 @@ def starting_mu(self, y, weights=1): class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. - They have :math:`\mu=\mathrm{E}[Y]` and + + A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely + defined by it's mean-variance relationship :math:`\mathrm{Var}[Y] \propto \mu^power`. Special cases are: @@ -668,7 +695,7 @@ def include_upper_bound(self): return self._include_upper_bound def unit_variance(self, mu): - """The unit variance of a Tweedie distribution is v(mu)=mu**power. + """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. 
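The power relationship of the Tweedie unit variance and its derivative can be
checked in a few lines of NumPy; the helper functions below simply restate the
formulas of the two methods above and compare the derivative against a finite
difference::

    import numpy as np

    def unit_variance(mu, power):
        return mu ** power                        # v(mu) = mu**power

    def unit_variance_derivative(mu, power):
        return power * mu ** (power - 1)          # v'(mu) = power * mu**(power-1)

    mu, eps = np.array([0.5, 1.0, 3.0]), 1e-6
    for power in (0, 1, 2, 3):                    # normal, Poisson, gamma, inverse Gaussian
        num = (unit_variance(mu + eps, power) - unit_variance(mu - eps, power)) / (2 * eps)
        assert np.allclose(num, unit_variance_derivative(mu, power), atol=1e-4)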
Parameters ---------- @@ -678,8 +705,8 @@ def unit_variance(self, mu): return np.power(mu, self.power) def unit_variance_derivative(self, mu): - """The derivative of the unit variance of a Tweedie distribution is - v(mu)=power*mu**(power-1). + """Compute the derivative of the unit variance of a Tweedie + distribution v(mu)=power*mu**(power-1). Parameters ---------- @@ -732,9 +759,9 @@ def __init__(self): class GeneralizedHyperbolicSecand(ExponentialDispersionModel): - """A class for the von Generalized Hyperbolic Secand (GHS) distribution. + """A class for the Generalized Hyperbolic Secand (GHS) distribution. - The GHS distribution is for data y in (-inf, inf). + The GHS distribution is for tagets y in (-inf, inf). """ def __init__(self): self._lower_bound = -np.Inf @@ -770,7 +797,7 @@ def unit_deviance(self, y, mu): def _irls_step(X, W, P2, z): - """One step in iteratively reweighted least squares + """Compute one step in iteratively reweighted least squares. Solve A w = b for w with A = (X' W X + P2) @@ -823,18 +850,18 @@ def _irls_step(X, W, P2, z): class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - """Regression via a Generalized Linear Model (GLM) based on reproductive - Exponential Dispersion Models (EDM) with combined L1 and L2 priors as - regularizer. + """Regression via a Generalized Linear Model (GLM) with penalties. - Minimizes the objective function:: + GLMs based on a reproductive Exponential Dispersion Model (EDM) with + combined L1 and L2 priors as regularizer minimizes the following objective + function:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + alpha * l1_ratio * ||P1*w||_1 + 1/2 * alpha * (1 - l1_ratio) * w*P2*w with inverse link function `h` and s=`sample_weight` (for - `sample_weight=Nones` one has s=1 and sum(s) equals `n_samples`). + `sample_weight=None`, one has s=1 and sum(s)=`n_samples`). For `P1=P2=identity`, the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 @@ -893,16 +920,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ of class ExponentialDispersionModel, optional(default='normal') - the distributional assumption of the GLM, i.e. which loss function to - be minimized. + The distributional assumption of the GLM, i.e. which distribution from + the EDM, specifies the loss function to be minimized. link : {'identity', 'log'} or an instance of class Link, optional (default='identity') - the link function of the GLM, i.e. mapping from linear predictor + The link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) - method for estimation of the dispersion parameter phi. Whether to use + Method for estimation of the dispersion parameter phi. Whether to use the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. @@ -914,15 +941,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Sets 'irls' if l1_ratio equals 0, else 'cd'. 'irls' - iterated reweighted least squares (Fisher scoring). - It is the standard algorithm for GLMs. Cannot deal with + Iterated reweighted least squares (with Fisher scoring). + It is the standard algorithm for GLMs. It cannot deal with L1 penalties. 'newton-cg', 'lbfgs' Cannot deal with L1 penalties. 'cd' - coordinate descent algorithm. It can deal with L1 as well as L2 + Coordinate descent algorithm. It can deal with L1 as well as L2 penalties. 
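The solver rules above boil down to: any solver handles a pure L2 penalty, but
only ``'cd'`` can handle an L1 part. A small usage sketch of that rule, again
assuming the estimator from this branch (data and penalty values are
arbitrary)::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor  # only on this branch

    rng = np.random.RandomState(42)
    X = rng.randn(100, 5)
    y = rng.poisson(np.exp(1.0 + 0.3 * X[:, 0] - 0.2 * X[:, 1])).astype(float)

    # pure L2 penalty (l1_ratio=0): solver='auto' resolves to IRLS
    ridge_glm = GeneralizedLinearRegressor(family='poisson', link='log',
                                           alpha=0.1, l1_ratio=0, solver='auto')
    # with an L1 part, coordinate descent is the only solver that applies
    enet_glm = GeneralizedLinearRegressor(family='poisson', link='log',
                                          alpha=0.1, l1_ratio=0.5, solver='cd')
    ridge_glm.fit(X, y)
    enet_glm.fit(X, y)
    print(ridge_glm.coef_, enet_glm.coef_)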
max_iter : int, optional (default=100) @@ -930,15 +957,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, - the iteration will stop when ``max{|g_i | i = 1, ..., n} <= tol`` + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative of - the deviance). + the objective function). warm_start : boolean, optional (default=False) - If set to ``True``, reuse the solution of the previous call to fit as - initialization for ``coef_`` and ``intercept_`` (supersedes option + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` (supersedes option ``start_params``). If set to ``True`` or if the attribute ``coef_`` - does not exit (first call to fit), option ``start_params`` sets the + does not exit (first call to ``fit``), option ``start_params`` sets the starting values for ``coef_`` and ``intercept_``. start_params : {None, 'least_squares', 'zero', array of shape \ @@ -946,8 +973,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If an array of size n_features* is supplied, use it as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. - Note that n_features* = X.shape[1] + fit_intercept includes the - intercept in counting. + Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes + the intercept in counting. If 'least_squares' is set, the result of a least squares fit in the link space (linear predictor) is taken. If 'zero' is set, all coefficients start with zero. @@ -960,7 +987,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): For the solver 'cd' (coordinate descent), the coordinates (features) can be updated in either cyclic or random order. If set to 'random', a random coefficient is updated every iteration - rather than looping over features sequentially by default. This + rather than looping over features sequentially in the same order. This (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. @@ -1057,7 +1084,7 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, self.verbose = verbose def fit(self, X, y, sample_weight=None): - """Fit a generalized linear model. + """Fit a Generalized Linear Model. Parameters ---------- @@ -1087,12 +1114,14 @@ def fit(self, X, y, sample_weight=None): _dtype = [np.float64, np.float32] X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=_dtype, y_numeric=True, multi_output=False) + # Without converting y to float, deviance might raise + # ValueError: Integers to negative integer powers are not allowed. 
y = y.astype(np.float64) weights = _check_weights(sample_weight, y.shape[0]) # 1.2 validate arguments of __init__ ################################## - # Garantee that self._family_instance is an instance of class + # Guarantee that self._family_instance is an instance of class # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): self._family_instance = self.family @@ -1112,7 +1141,8 @@ def fit(self, X, y, sample_weight=None): " ['normal', 'poisson', 'gamma', 'inverse.gaussian'];" " got (family={0})".format(self.family)) - # Garantee that self._link_instance is set to an instance of class Link + # Guarantee that self._link_instance is set to an instance of + # class Link if isinstance(self.link, Link): self._link_instance = self.link else: @@ -1127,19 +1157,19 @@ def fit(self, X, y, sample_weight=None): .format(self.link)) if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: - raise ValueError("Penalty term must be non-negative;" + raise ValueError("Penalty term must be a non-negative number;" " got (alpha={0})".format(self.alpha)) if (not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1): - raise ValueError("l1_ratio must be in interval [0, 1]; got" - " (l1_ratio={0})".format(self.l1_ratio)) + raise ValueError("l1_ratio must be a number in interval [0, 1];" + " got (l1_ratio={0})".format(self.l1_ratio)) if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: - raise ValueError("GeneralizedLinearRegressor supports only irls, " - "auto, lbfgs, newton-cg and cd solvers, got {0}" - "".format(self.solver)) + raise ValueError("GeneralizedLinearRegressor supports only solvers" + " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';" + " got {0}".format(self.solver)) solver = self.solver if self.solver == 'auto': if self.l1_ratio == 0: @@ -1168,8 +1198,8 @@ def fit(self, X, y, sample_weight=None): elif isinstance(start_params, str): if start_params not in ['least_squares', 'zero']: raise ValueError("The argument start_params must be None, " - "'least-squares', 'zero' or an array of right" - " length," + "'least-squares', 'zero' or an array of " + " correct length;" " got(start_params={0})".format(start_params)) else: start_params = check_array(start_params, accept_sparse='csr', @@ -1179,21 +1209,21 @@ def fit(self, X, y, sample_weight=None): (start_params.ndim != 1)): raise ValueError("Start values for parameters must have the" "right length and dimension; required (length" - "={0}, ndim=1), got (length={1}, ndim={2})." + "={0}, ndim=1); got (length={1}, ndim={2})." 
.format(X.shape[1] + self.fit_intercept, start_params.shape[0], start_params.ndim)) if self.selection not in ['cyclic', 'random']: raise ValueError("The argument selection must be 'cyclic' or " - "'random', got (selection={0})" + "'random'; got (selection={0})" .format(self.selection)) random_state = check_random_state(self.random_state) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) if not isinstance(self.check_input, bool): - raise ValueError("The attribute check_input must be bool; got " + raise ValueError("The argument check_input must be bool; got " "(check_input={0})".format(self.check_input)) if self.P1 is None: @@ -1232,7 +1262,7 @@ def fit(self, X, y, sample_weight=None): link = self._link_instance if self.fit_intercept: - # intercept is first column <=> coef[0] is for intecept + # Note: intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) else: @@ -1259,10 +1289,11 @@ def fit(self, X, y, sample_weight=None): P2 *= l2 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric - if sparse.issparse(P2): - P2 = 0.5 * (P2 + P2.transpose()) - else: - P2 = 0.5 * (P2 + P2.T) + if P2.ndim == 2: + if sparse.issparse(P2): + P2 = 0.5 * (P2 + P2.transpose()) + else: + P2 = 0.5 * (P2 + P2.T) # 1.3 additional validations ########################################## if self.check_input: @@ -1301,7 +1332,7 @@ def fit(self, X, y, sample_weight=None): # we rescale weights such that sum(weights) = 1 and this becomes # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) weights_sum = np.sum(weights) - weights = weights/np.sum(weights) + weights = weights/weights_sum ####################################################################### # 3. initialization of coef = (intercept_, coef_) # @@ -1338,6 +1369,8 @@ def fit(self, X, y, sample_weight=None): # with L1 penalty, start with coef = 0 # TODO: Are there better options? coef = np.zeros(n_features) + if self.fit_intercept: + coef[0] = link.link(np.mean(y)) elif isinstance(self.start_params, str): if self.start_params == 'zero': coef = np.zeros(n_features) @@ -1546,6 +1579,8 @@ def Hs(s): # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) # => active set of features for featurelist, see paper + # of Improved GLMNET or Gap Safe Screening Rules + # https://arxiv.org/abs/1611.05780 # A = f'(w) + d*H(w) + (w+d)*P2 # B = H+P2 # Note: f'=-score and H=fisher are updated at the end of outer @@ -1694,7 +1729,7 @@ def Hs(s): return self def linear_predictor(self, X): - """The linear_predictor X*coef_ + intercept_. + """Compute the linear_predictor = X*coef_ + intercept_. Parameters ---------- @@ -1741,8 +1776,7 @@ def predict(self, X, sample_weight=None): return mu*weights def estimate_phi(self, X, y, sample_weight=None): - """Estimation of the dispersion parameter phi. - Returns the estimate. + """Estimate/fit the dispersion parameter phi. Parameters ---------- @@ -1755,6 +1789,11 @@ def estimate_phi(self, X, y, sample_weight=None): sample_weight : {None, array-like}, shape (n_samples,), optional \ (default=None) Sample weights. + + Returns + ------- + phi : float + Dispersion parameter. 
""" check_is_fitted(self, "coef_") _dtype = [np.float64, np.float32] @@ -1785,15 +1824,17 @@ def estimate_phi(self, X, y, sample_weight=None): # "AssertionError: -0.28014056555724598 not greater than 0.5" # unless GeneralizedLinearRegressor has a score which passes the test. def score(self, X, y, sample_weight=None): - r"""Returns D^2, a generalization of the coefficient of determination - R^2, which uses deviance instead of squared error. + r"""Compute D^2, the percentage of deviance explained. + + D^2 is a generalization of the coefficient of determination R^2. + R^2 uses squared error and D^2 deviance. Note that those two are equal + for family='normal'. D^2 is defined as :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` is the null deviance, i.e. the deviance of a model with intercept alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean - :math:`\bar{y}` is averaged by sample_weight. In the case of a Normal - distribution, D^2 equals R^2. + :math:`\bar{y}` is averaged by sample_weight. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). @@ -1812,7 +1853,7 @@ def score(self, X, y, sample_weight=None): Returns ------- score : float - D^2 of self.predict(X) wrt. y. + D^2 of self.predict(X) w.r.t. y. """ # Note, default score defined in RegressorMixin is R^2 score. # TODO: make D^2 a score function in module metrics (and thereby get From 01033e36d913756f7ff5e2214189cf1d7426dee1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 20 Feb 2019 17:20:09 +0100 Subject: [PATCH 044/209] Fix false formula in starting_mu and improve start_params --- sklearn/linear_model/glm.py | 108 ++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 48 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index d69ccd0a66486..fad7492acc2fb 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -583,7 +583,7 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): fisher = safe_sparse_dot(X.T, temp, dense_output=False) return eta, mu, score, fisher - def starting_mu(self, y, weights=1): + def starting_mu(self, y, weights=1, ind_weight=0.5): """Set starting values for the mean mu. These may be good starting points for the (unpenalized) IRLS solver. @@ -595,9 +595,13 @@ def starting_mu(self, y, weights=1): weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. + + ind_weight : float (default=0.5) + Must be between 0 and 1. Specifies how much weight is given to the + individual observations instead of the mean of y. """ - return ((weights*y+np.mean(weights*y)) / - (2.*np.sum(np.ones_like(y)*weights))) + return (ind_weight * y + + (1. - ind_weight) * np.average(y, weights=weights)) class TweedieDistribution(ExponentialDispersionModel): @@ -852,17 +856,19 @@ def _irls_step(X, W, P2, z): class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. - GLMs based on a reproductive Exponential Dispersion Model (EDM) with - combined L1 and L2 priors as regularizer minimizes the following objective - function:: + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean `mu=h(X*w)`. 
Therefore the fit minimizes + the following objective function with combined L1 and L2 priors as + regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + alpha * l1_ratio * ||P1*w||_1 + 1/2 * alpha * (1 - l1_ratio) * w*P2*w with inverse link function `h` and s=`sample_weight` (for - `sample_weight=None`, one has s=1 and sum(s)=`n_samples`). - For `P1=P2=identity`, the penalty is the elastic net:: + ``sample_weight=None``, one has s=1 and sum(s)=`n_samples`). + For `P1=P2=identity` (``P1=None``, ``P2=None``), the penalty is the + elastic net:: alpha * l1_ratio * ||w||_1 + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 @@ -966,24 +972,34 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): as initialization for ``coef_`` and ``intercept_`` (supersedes option ``start_params``). If set to ``True`` or if the attribute ``coef_`` does not exit (first call to ``fit``), option ``start_params`` sets the - starting values for ``coef_`` and ``intercept_``. + start values for ``coef_`` and ``intercept_``. + + start_params : {'irls', 'least_squares', 'zero', array of shape \ + (n_features*, )}, optional (default='irls') + Relevant only if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). + + 'irls' + Start values of mu are calculated by family.starting_mu(..). Then, + one step of irls obtains start values for ``coef_`. This gives + usually good results. - start_params : {None, 'least_squares', 'zero', array of shape \ - (n_features*, )}, optional (default=None) - If an array of size n_features* is supplied, use it as start values - for ``coef_`` in the fit. If ``fit_intercept=True``, the first element + 'least_squares' + Start values for ``coef_`` are obtained by a least squares fit in the + link space (y is transformed to the space of the linear predictor). + + 'zero' + All coefficients are set to zero. If ``fit_intercept=True``, the + start value for the intercept is obtained by the average of y. + + array + The array of size n_features* is directly used as start values + for ``coef_``. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes the intercept in counting. - If 'least_squares' is set, the result of a least squares fit in the - link space (linear predictor) is taken. - If 'zero' is set, all coefficients start with zero. - If ``None``, the start values are calculated by setting mu to - family.starting_mu(..) and one step of irls. - These options only apply if ``warm_start=False`` or if fit is called - the first time (``self.coef_`` does not yet exist). - selection : str, optional (default='random') + selection : str, optional (default='cyclic') For the solver 'cd' (coordinate descent), the coordinates (features) can be updated in either cyclic or random order. If set to 'random', a random coefficient is updated every iteration @@ -1005,7 +1021,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): check_input : boolean, optional (default=True) Allow to bypass several checks on input: y values in range of family, - sample_weights non-negative, P2 positive semi-definite. + sample_weight non-negative, P2 positive semi-definite. Don't use this parameter unless you know what you do. 
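A typical use of ``warm_start=True`` is fitting a path of decreasing ``alpha``
values, where every fit continues from the previous solution instead of the
``start_params`` strategy. A sketch of that pattern, assuming the estimator
from this branch (the path values are arbitrary)::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor  # only on this branch

    rng = np.random.RandomState(0)
    X = rng.randn(200, 10)
    y = rng.poisson(np.exp(X @ np.r_[0.5, -0.5, np.zeros(8)])).astype(float)

    glm = GeneralizedLinearRegressor(family='poisson', link='log', l1_ratio=0.9,
                                     solver='cd', warm_start=True)
    coefs = []
    for alpha in np.logspace(0, -3, 10):      # from strong to weak penalty
        glm.set_params(alpha=alpha)
        glm.fit(X, y)                         # reuses the previous coef_ as start value
        coefs.append(glm.coef_.copy())
    print([int(np.count_nonzero(c)) for c in coefs])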
verbose : int, optional (default=0) @@ -1061,8 +1077,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', fit_dispersion=None, solver='auto', max_iter=100, - tol=1e-4, warm_start=False, start_params=None, - selection='random', random_state=None, copy_X=True, + tol=1e-4, warm_start=False, start_params='irls', + selection='cyclic', random_state=None, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio @@ -1193,11 +1209,9 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is None: - pass - elif isinstance(start_params, str): - if start_params not in ['least_squares', 'zero']: - raise ValueError("The argument start_params must be None, " + if isinstance(start_params, str): + if start_params not in ['irls', 'least_squares', 'zero']: + raise ValueError("The argument start_params must be 'irls', " "'least-squares', 'zero' or an array of " " correct length;" " got(start_params={0})".format(start_params)) @@ -1348,11 +1362,11 @@ def fit(self, X, y, sample_weight=None): self.coef_)) else: coef = self.coef_ - elif self.start_params is None: - if self.l1_ratio == 0: + elif isinstance(start_params, str): + if start_params == 'irls': # See 3.1 IRLS # Use mu_start and apply one irls step to calculate coef - mu = family.starting_mu(y, weights) + mu = family.starting_mu(y, weights=weights) # linear predictor eta = link.link(mu) # h'(eta) @@ -1365,16 +1379,9 @@ def fit(self, X, y, sample_weight=None): # solve A*coef = b # A = X' W X + l2 P2, b = X' W z coef = _irls_step(Xnew, W, P2, z) - else: - # with L1 penalty, start with coef = 0 - # TODO: Are there better options? - coef = np.zeros(n_features) - if self.fit_intercept: - coef[0] = link.link(np.mean(y)) - elif isinstance(self.start_params, str): - if self.start_params == 'zero': - coef = np.zeros(n_features) - elif self.start_params == 'least_squares': + elif start_params == 'least_squares': + # less restrictive tolerance for finding start values + tol = np.max([self.tol, np.sqrt(self.tol)]) if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) reg.fit(Xnew, link.link(y)) @@ -1384,18 +1391,21 @@ def fit(self, X, y, sample_weight=None): # => use Ridge # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha*n_samples, - tol=np.max([self.tol, np.sqrt(self.tol)])) + alpha=self.alpha*n_samples, tol=tol) reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: # TODO: Does this make sense at all? 
reg = ElasticNet(copy_X=True, fit_intercept=False, alpha=self.alpha, l1_ratio=self.l1_ratio, - tol=np.max([self.tol, np.sqrt(self.tol)])) + tol=tol) reg.fit(Xnew, link.link(y)) coef = reg.coef_ - else: + else: # start_params == 'zero' + coef = np.zeros(n_features) + if self.fit_intercept: + coef[0] = link.link(np.average(y, weights=weights)) + else: # assign given array as start values coef = start_params ####################################################################### @@ -1560,6 +1570,8 @@ def Hs(s): # some precalculations eta, mu, score, fisher = family._eta_mu_score_fisher( coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) + # set up space for search direction d for inner loop + d = np.zeros_like(coef) # initial stopping tolerance of inner loop # use L1-norm of minimum-norm of subgradient of F # fp_wP2 = f'(w) + w*P2 @@ -1574,8 +1586,8 @@ def Hs(s): # outer loop while self.n_iter_ < self.max_iter: self.n_iter_ += 1 - # initialize search direction d (to be optimized) - d = np.zeros_like(coef) + # initialize search direction d (to be optimized) with zero + d.fill(0) # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) # => active set of features for featurelist, see paper From 4071a8a54de0112fd1afd6d8fc5b5585708c84ea Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 20 Feb 2019 21:15:04 +0100 Subject: [PATCH 045/209] Improve argument handling of P1 and P2 * P2 also accepts 1d array and interprets it as diagonal matrix * improved input checks for P1 and P2 --- sklearn/linear_model/glm.py | 103 ++++++++++++++++--------- sklearn/linear_model/tests/test_glm.py | 31 ++++---- 2 files changed, 81 insertions(+), 53 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index fad7492acc2fb..33e0d75730e3a 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -45,7 +45,10 @@ from abc import ABCMeta, abstractmethod, abstractproperty import numbers import numpy as np -from scipy import linalg, optimize, sparse, special +from scipy import linalg, sparse +import scipy.sparse.linalg as splinalg +from scipy.optimize import fmin_l_bfgs_b +from scipy.special import xlogy import warnings from .base import LinearRegression from .coordinate_descent import ElasticNet @@ -727,7 +730,7 @@ def unit_deviance(self, y, mu): if p == 1: # PoissonDistribution # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 - return 2 * (special.xlogy(y, y/mu) - y + mu) + return 2 * (xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) @@ -840,7 +843,7 @@ def _irls_step(X, W, P2, z): XtW = X.transpose() * W A = XtW * X + L2 b = XtW * z - coef = sparse.linalg.spsolve(A, b) + coef = splinalg.spsolve(A, b) else: XtW = (X.T * W) A = XtW.dot(X) @@ -867,7 +870,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): with inverse link function `h` and s=`sample_weight` (for ``sample_weight=None``, one has s=1 and sum(s)=`n_samples`). - For `P1=P2=identity` (``P1=None``, ``P2=None``), the penalty is the + For ``P1=P2='identity'`` (``P1=None``, ``P2=None``), the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 @@ -904,21 +907,24 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. 
- P1 : {None, array-like}, shape (n_features,), optional \ - (default=None) + P1 : {'identity', array-like}, shape (n_features,), optional \ + (default='identity') With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The - default value ``None`` is the same as a 1d array of ones. + default value ``'identity'`` is the same as a 1d array of ones. Note that n_features = X.shape[1]. - P2 : {None, array-like, sparse matrix}, shape \ - (n_features, n_features), optional (default=None) - With this square matrix the L2 penalty is calculated as `w P2 w`. - This gives a fine control over this penalty (Tikhonov - regularization). The diagonal zeros of a diagonal P2, for example, - exclude all corresponding coefficients from the L2 penalty. - The default value ``None`` is the same as the identity matrix. - Note that n_features = X.shape[1]. P2 must be positive semi-definite. + P2 : {'identity', array-like, sparse matrix}, shape \ + (n_features,) or (n_features, n_features), optional \ + (default='identity') + With this option, you can set the P2 matrix in the L2 penalty `w*P2*w`. + This gives a fine control over this penalty (Tikhonov regularization). + A 2d array is directly used as the square matrix P2. A 1d array is + interpreted as diagonal (square) matrix. The default 'identity' sets + the identity matrix, which gives the usual squared L2-norm. If you just + want to exclude certain coefficients, pass a 1d array filled with 1, + and 0 for the coefficients to be excluded. + Note that P2 must be positive semi-definite. fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be @@ -1074,7 +1080,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Journal of Machine Learning Research 13 (2012) 1999-2030 https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ - def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, + def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', fit_intercept=True, family='normal', link='identity', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params='irls', @@ -1240,20 +1246,23 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument check_input must be bool; got " "(check_input={0})".format(self.check_input)) - if self.P1 is None: + if isinstance(self.P1, str) and self.P1 == 'identity': P1 = np.ones(X.shape[1]) else: - P1 = np.copy(np.atleast_1d(self.P1)) - if P1.dtype.kind not in ['b', 'i', 'u', 'f']: - raise ValueError("P1 must be a numeric value; " - "got (dtype={0}).".format(P1.dtype)) + P1 = np.atleast_1d(self.P1) + try: + P1 = P1.astype(np.float64, casting='safe', copy=True) + except TypeError: + raise TypeError("The given P1 cannot be converted to a numeric" + "array; got (P1.dtype={0})." + .format(P1.dtype)) if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): - raise ValueError("P1 must be either None or a 1d array with " - "the length of X.shape[1]; " + raise ValueError("P1 must be either 'identity' or a 1d array " + "with the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." 
.format(P1.shape[0], X.shape[1])) - if self.P2 is None: + if isinstance(self.P2, str) and self.P2 == 'identity': if not sparse.issparse(X): P2 = np.ones(X.shape[1]) else: @@ -1262,8 +1271,15 @@ def fit(self, X, y, sample_weight=None): else: P2 = check_array(self.P2, copy=True, accept_sparse=['csr', 'csc', 'coo'], - dtype="numeric", ensure_2d=True) - if ((P2.ndim != 2) or + dtype=_dtype, ensure_2d=False) + if P2.ndim == 1: + if P2.shape[0] != X.shape[1]: + raise ValueError("P2 should be a 1d array of shape " + "(n_features,) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0},)), needed ({1},)" + .format(P2.shape[0], X.shape[1])) + elif ((P2.ndim != 2) or (P2.shape[0] != P2.shape[1]) or (P2.shape[0] != X.shape[1])): raise ValueError("P2 must be either None or an array of shape " @@ -1319,21 +1335,32 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Sample weights must be non-negative.") # check if P1 has only non-negative values, negative values might # indicate group lasso in the future. - if self.P1 is not None: + if self.P1 != 'identity': if not np.all(P1 >= 0): raise ValueError("P1 must not have negative values.") # check if P2 is positive semidefinite # np.linalg.cholesky(P2) 'only' asserts positive definite - if self.P2 is not None: - if sparse.issparse(P2): - # TODO: check sparse P2 for non-negativeness - # raise NotImplementedError("Check sparse P2 for " - # "non-negativeness is not yet " - # "implemented.") - pass - elif P2.ndim == 2: - if not np.all(np.linalg.eigvals(P2) >= -1e-15): - raise ValueError("P2 must be positive definite.") + if self.P2 != 'identity': + # due to numerical precision, we allow eigenvalues to be a + # tiny bit negative + epsneg = 10 * np.finfo(P2.dtype).epsneg + if P2.ndim == 1 or P2.shape[0] == 1: + if not np.all(P2 >= 0): + raise ValueError("1d array P2 must not have negative " + "values.") + elif sparse.issparse(P2): + # for sparse matrices, not all eigenvals can be computed + # efficiently, use only half of n_features + # k = how many eigenvals to compute + k = np.min([10, n_features // 10 + 1]) + sigma = 0 # start searching near this value + which = 'SA' # find smallest algebraic eigenvalues first + if not np.all(splinalg.eigsh(P2, k=k, sigma=sigma, + which=which) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") + else: + if not np.all(linalg.eigvalsh(P2) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? 
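# Illustrative sketch (not part of the patched code): the hunk above accepts a
# 1d P2 (interpreted as a diagonal matrix) or a 2d P2 and rejects matrices
# that are not positive semi-definite. A minimal, hypothetical example of
# building such a Tikhonov penalty and checking it in a similar way; all
# names below are made up for illustration only.
import numpy as np

n_features = 5
# 1d form: non-negative entries act as per-coefficient L2 weights; zeros
# exclude the corresponding coefficients from the L2 penalty.
p2_diag = np.array([1.0, 1.0, 0.0, 1.0, 1.0])
assert np.all(p2_diag >= 0)

# 2d form: a first-difference smoothness penalty D.T @ D is symmetric and
# positive semi-definite by construction.
D = np.diff(np.eye(n_features), axis=0)   # shape (n_features - 1, n_features)
P2 = D.T @ D
# eigenvalue check similar to the hunk above, allowing tiny negative round-off
assert np.all(np.linalg.eigvalsh(P2) >= -10 * np.finfo(P2.dtype).eps)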
@@ -1520,7 +1547,7 @@ def Hs(s): args = (Xnew, y, weights, link) if solver == 'lbfgs': - coef, loss, info = optimize.fmin_l_bfgs_b( + coef, loss, info = fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, maxiter=self.max_iter) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 1ac5ccd4d3d5c..fde1604ad16e3 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -188,25 +188,26 @@ def test_glm_l1_ratio_argument(): assert_raises(ValueError, glm.fit, X, y) -def test_glm_P1_argument(): - """Test GLM P1 arguments - """ +@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3]]) +def test_glm_P1_argument(P1): + """Test GLM P1 arguments.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for P1 in [['a string', 'a string'], [1, [2]], [1, 2, 3]]: - glm = GeneralizedLinearRegressor(P1=P1) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(P1=P1) + with pytest.raises((ValueError, TypeError)): + glm.fit(X, y) -# def test_glm_P2_argument(): -# """Test GLM P2 arguments -# """ -# y = np.array([1, 2]) -# X = np.array([[1], [1]]) -# for P2 in [np.full((2, 2), 'a string', dtype=np.dtype(' Date: Wed, 20 Feb 2019 22:48:14 +0100 Subject: [PATCH 046/209] Fix doctest, test_poisson_enet, change IRLS to use lstsq, fix input checks * adapt examples of GeneralizedLinearModel to new defaults for P1, P2 and selection * fix precision/decimal issue in test_poisson_enet * use more robust least squares instead of solve in IRLS * fix sign error in input checks --- doc/modules/linear_model.rst | 16 +++++++++------- sklearn/linear_model/glm.py | 25 ++++++++++++++++--------- sklearn/linear_model/tests/test_glm.py | 24 ++++++++++++++++++++---- 3 files changed, 45 insertions(+), 20 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 174d1e4eddae4..e60e9e84a4747 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -924,15 +924,17 @@ follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE - GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, - copy_X=True, family='poisson', fit_dispersion=None, - fit_intercept=True, l1_ratio=0, link='log', max_iter=100, - random_state=None, selection='random', solver='auto', - start_params=None, tol=0.0001, verbose=0, warm_start=False) + GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, + check_input=True, copy_X=True, family='poisson', + fit_dispersion=None, fit_intercept=True, l1_ratio=0, + link='log', max_iter=100, random_state=None, + selection='cyclic', solver='auto', + start_params='irls', tol=0.0001, verbose=0, + warm_start=False) >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE - array([0.24630255, 0.43373521]) + array([0.24630169, 0.43373464]) >>> reg.intercept_ #doctest: +ELLIPSIS - -0.76383575... + -0.76383633... .. topic:: Examples: diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 33e0d75730e3a..bc1a0434fa3b0 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -830,8 +830,9 @@ def _irls_step(X, W, P2, z): ------- coef: array, shape (X.shape[1]) """ - # TODO: scipy.linalg.solve seems faster, but ordinary least squares uses - # scipy.linalg.lstsq. 
What is more appropriate? + # Note: solve vs least squares, what is more appropriate? + # scipy.linalg.solve seems faster, but scipy.linalg.lstsq + # is more robust. n_samples, n_features = X.shape if sparse.issparse(X): W = sparse.dia_matrix((W, 0), shape=(n_samples, n_samples)).tocsr() @@ -843,7 +844,8 @@ def _irls_step(X, W, P2, z): XtW = X.transpose() * W A = XtW * X + L2 b = XtW * z - coef = splinalg.spsolve(A, b) + # coef = splinalg.spsolve(A, b) + coef, *_ = splinalg.lsmr(A, b) else: XtW = (X.T * W) A = XtW.dot(X) @@ -852,7 +854,8 @@ def _irls_step(X, W, P2, z): else: A += P2 b = XtW.dot(z) - coef = linalg.solve(A, b) + # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) + coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) return coef @@ -1340,12 +1343,15 @@ def fit(self, X, y, sample_weight=None): raise ValueError("P1 must not have negative values.") # check if P2 is positive semidefinite # np.linalg.cholesky(P2) 'only' asserts positive definite - if self.P2 != 'identity': + if not isinstance(self.P2, str): # self.P2 != 'identity' # due to numerical precision, we allow eigenvalues to be a # tiny bit negative - epsneg = 10 * np.finfo(P2.dtype).epsneg + epsneg = -10 * np.finfo(P2.dtype).epsneg if P2.ndim == 1 or P2.shape[0] == 1: - if not np.all(P2 >= 0): + p2 = P2 + if sparse.issparse(P2): + p2 = P2.toarray() + if not np.all(p2 >= 0): raise ValueError("1d array P2 must not have negative " "values.") elif sparse.issparse(P2): @@ -1360,6 +1366,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("P2 must be positive semi-definite.") else: if not np.all(linalg.eigvalsh(P2) >= epsneg): + return P2 raise ValueError("P2 must be positive semi-definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? @@ -1689,7 +1696,7 @@ def Hs(s): mn_subgrad = (np.where(coef + d == 0, np.sign(A)*np.maximum(np.abs(A)-P1, 0), A+np.sign(coef+d)*P1)) - mn_subgrad = np.sum(np.abs(mn_subgrad)) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= inner_tol: if inner_iter == 1: inner_tol = inner_tol/4. 
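# Illustrative sketch (not part of the patched code): the stopping criterion
# in the hunk above is the L1 norm of the minimum-norm subgradient of
# F(w) = f(w) + ||P1*w||_1, where f is the smooth part and P1_j are the
# per-coefficient L1 weights. Elementwise it is
#   f'_j(w) + P1_j * sign(w_j)                  if w_j != 0
#   sign(f'_j(w)) * max(|f'_j(w)| - P1_j, 0)    if w_j == 0
# which mirrors the np.where expression used above. A tiny standalone sketch
# with made-up numbers:
import numpy as np

def min_norm_subgradient(grad_f, w, P1):
    """Smallest-magnitude element of the subdifferential of f + ||P1*w||_1."""
    return np.where(w == 0,
                    np.sign(grad_f) * np.maximum(np.abs(grad_f) - P1, 0),
                    grad_f + np.sign(w) * P1)

grad_f = np.array([0.05, -2.0, 0.3])   # hypothetical gradient of f at w
w = np.array([0.0, 1.5, 0.0])
P1 = np.array([0.1, 0.1, 0.1])
subgrad = min_norm_subgradient(grad_f, w, P1)
print(np.linalg.norm(subgrad, ord=1))  # compared against the tolerance to stop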
@@ -1740,7 +1747,7 @@ def Hs(s): mn_subgrad = (np.where(coef == 0, np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), fp_wP2+np.sign(coef)*P1)) - mn_subgrad = np.sum(np.abs(mn_subgrad)) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= self.tol: converged = True break diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index fde1604ad16e3..8893028d0176a 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -2,7 +2,7 @@ from numpy.testing import assert_allclose import pytest import scipy as sp -from scipy import sparse +from scipy import sparse, optimize from sklearn.linear_model.glm import ( Link, @@ -199,8 +199,7 @@ def test_glm_P1_argument(P1): @pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], - sparse.csr_matrix([1, 2, 3]), - sparse.lil_matrix([[1]])]) + sparse.csr_matrix([1, 2, 3])]) def test_glm_P2_argument(P2): """Test GLM P2 arguments.""" y = np.array([1, 2]) @@ -515,12 +514,29 @@ def test_poisson_enet(): X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', solver='cd', tol=1e-7, + link='log', solver='cd', tol=1e-8, selection='random', random_state=42) glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + # test results with general optimization procedure + def obj(coef): + pd = PoissonDistribution() + link = LogLink() + N = y.shape[0] + mu = link.inverse(X @ coef[1:]+coef[0]) + alpha, l1_ratio = (1, 0.5) + return 1./(2.*N) * pd.deviance(y, mu) \ + + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ + + alpha * l1_ratio * np.sum(np.abs(coef[1:])) + res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, + options={'maxiter': 1000, 'disp': False}) + assert_almost_equal(glm.intercept_, res.x[0], decimal=5) + assert_almost_equal(glm.coef_, res.x[1:], decimal=5) + assert_almost_equal(obj(np.concatenate(([glm.intercept_], glm.coef_))), + res.fun, decimal=8) + # same for start_params='zero' and selection='cyclic' # with reduced precision glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', From ed8e74f97d2b1921af4b8c2907c9e30629788bdc Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 23 Feb 2019 14:13:22 +0100 Subject: [PATCH 047/209] Use pytest decorators and pytest.raises --- sklearn/linear_model/glm.py | 7 +- sklearn/linear_model/tests/test_glm.py | 510 ++++++++++++------------- 2 files changed, 257 insertions(+), 260 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index bc1a0434fa3b0..f583e17433ee3 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -765,8 +765,8 @@ def __init__(self): super(InverseGaussianDistribution, self).__init__(power=3) -class GeneralizedHyperbolicSecand(ExponentialDispersionModel): - """A class for the Generalized Hyperbolic Secand (GHS) distribution. +class GeneralizedHyperbolicSecant(ExponentialDispersionModel): + """A class for the Generalized Hyperbolic Secant (GHS) distribution. The GHS distribution is for tagets y in (-inf, inf). """ @@ -1338,7 +1338,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Sample weights must be non-negative.") # check if P1 has only non-negative values, negative values might # indicate group lasso in the future. 
- if self.P1 != 'identity': + if not isinstance(self.P1, str): # if self.P1 != 'identity': if not np.all(P1 >= 0): raise ValueError("P1 must not have negative values.") # check if P2 is positive semidefinite @@ -1366,7 +1366,6 @@ def fit(self, X, y, sample_weight=None): raise ValueError("P2 must be positive semi-definite.") else: if not np.all(linalg.eigvalsh(P2) >= epsneg): - return P2 raise ValueError("P2 must be positive semi-definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 8893028d0176a..361a237f2cc9f 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -2,7 +2,7 @@ from numpy.testing import assert_allclose import pytest import scipy as sp -from scipy import sparse, optimize +from scipy import linalg, optimize, sparse from sklearn.linear_model.glm import ( Link, @@ -11,354 +11,355 @@ TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, - GeneralizedHyperbolicSecand, + GeneralizedHyperbolicSecant, GeneralizedLinearRegressor) from sklearn.linear_model import ElasticNet, Ridge from sklearn.utils.testing import ( assert_equal, assert_almost_equal, - assert_array_equal, assert_array_almost_equal, - assert_raises) + assert_array_equal, assert_array_almost_equal) -def test_link_properties(): - """Test link inverse and derivative - """ +@pytest.mark.parametrize('link', Link.__subclasses__()) +def test_link_properties(link): + """Test link inverse and derivative.""" rng = np.random.RandomState(0) x = rng.rand(100)*100 - # from sklearn.linear_model.glm import Link - # for link in vars()['Link'].__subclasses__(): - for link in Link.__subclasses__(): - link = link() - assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) - assert_almost_equal(link.inverse_derivative(link.link(x)), - 1/link.derivative(x), decimal=10) - - -def test_family_bounds(): - """Test the valid range of distributions - """ - family = NormalDistribution() - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [True, True, True]) - - family = PoissonDistribution() + link = link() # instatiate object + assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) + assert_almost_equal(link.inverse_derivative(link.link(x)), + 1/link.derivative(x), decimal=10) + + +@pytest.mark.parametrize( + 'family, expected', + [(NormalDistribution(), [True, True, True]), + (PoissonDistribution(), [False, True, True]), + (TweedieDistribution(power=1.5), [False, True, True]), + (GammaDistribution(), [False, False, True]), + (InverseGaussianDistribution(), [False, False, True]), + (TweedieDistribution(power=4.5), [False, False, True])]) +def test_family_bounds(family, expected): + """Test the valid range of distributions at -1, 0, 1.""" result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, True, True]) - - family = TweedieDistribution(power=1.5) - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, True, True]) - - family = GammaDistribution() - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, False, True]) - - family = InverseGaussianDistribution() - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, False, True]) - - family = TweedieDistribution(power=4.5) - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, False, True]) - - -def test_deviance_zero(): 
- """Test deviance(y,y) = 0 for different families - """ - for family in [NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=-2.5), - TweedieDistribution(power=-1), - TweedieDistribution(power=1.5), - TweedieDistribution(power=2.5), - TweedieDistribution(power=4), - GeneralizedHyperbolicSecand()]: - assert_almost_equal(family.deviance(0.1, 0.1), 0, decimal=10) - assert_almost_equal(family.deviance(1.5, 1.5), 0, decimal=10) - - -def test_fisher_matrix(): + assert_array_equal(result, expected) + + +@pytest.mark.parametrize( + 'family, chk_values', + [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), + (PoissonDistribution(), [0.1, 1.5]), + (GammaDistribution(), [0.1, 1.5]), + (InverseGaussianDistribution(), [0.1, 1.5]), + (TweedieDistribution(power=-2.5), [0.1, 1.5]), + (TweedieDistribution(power=-1), [0.1, 1.5]), + (TweedieDistribution(power=1.5), [0.1, 1.5]), + (TweedieDistribution(power=2.5), [0.1, 1.5]), + (TweedieDistribution(power=-4), [0.1, 1.5]), + (GeneralizedHyperbolicSecant(), [0.1, 1.5])]) +def test_deviance_zero(family, chk_values): + """Test deviance(y,y) = 0 for different families.""" + for x in chk_values: + assert_almost_equal(family.deviance(x, x), 0, decimal=10) + + +@pytest.mark.parametrize( + 'family, link', + [(NormalDistribution(), IdentityLink()), + (PoissonDistribution(), LogLink()), + (GammaDistribution(), LogLink()), + (InverseGaussianDistribution(), LogLink()), + (TweedieDistribution(power=1.5), LogLink()), + (TweedieDistribution(power=4.5), LogLink())]) +def test_fisher_matrix(family, link): """Test the Fisher matrix numerically. Trick: Use numerical differentiation with y = mu""" - for family in [NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution()]: - link = LogLink() - rng = np.random.RandomState(0) - coef = np.array([-2, 1, 0, 1, 2.5]) - phi = 0.5 - X = rng.randn(10, 5) - lin_pred = np.dot(X, coef) - mu = link.inverse(lin_pred) - weights = rng.randn(10)**2 + 1 - fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link) - approx = np.array([]).reshape(0, coef.shape[0]) - for i in range(coef.shape[0]): - def f(coef): - return -family._score(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link)[i] - approx = np.vstack( - [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)] - ) - assert_allclose(fisher, approx, rtol=1e-3) + rng = np.random.RandomState(0) + coef = np.array([-2, 1, 0, 1, 2.5]) + phi = 0.5 + X = rng.randn(10, 5) + lin_pred = np.dot(X, coef) + mu = link.inverse(lin_pred) + weights = rng.randn(10)**2 + 1 + fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + approx = np.array([]).reshape(0, coef.shape[0]) + for i in range(coef.shape[0]): + def f(coef): + return -family._score(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link)[i] + approx = np.vstack( + [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) + assert_allclose(fisher, approx, rtol=1e-3) def test_sample_weights_validation(): - """Test the raised errors in the validation of sample_weight""" + """Test the raised errors in the validation of sample_weight.""" # 1. scalar value but not positive X = [[1]] y = [1] weights = 0 glm = GeneralizedLinearRegressor(fit_intercept=False) - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 2. 
2d array weights = [[0]] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 3. 1d but wrong length weights = [1, 0] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 4. 1d but only zeros (sum not greater than 0) weights = [0, 0] X = [[0], [1]] y = [1, 2] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 5. 1d but weith a negative value weights = [2, -1] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) def test_glm_family_argument(): - """Test GLM family argument set as string - """ + """Test GLM family argument set as string.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) + X = np.array([[1], [2]]) for (f, fam) in [('normal', NormalDistribution()), ('poisson', PoissonDistribution()), ('gamma', GammaDistribution()), ('inverse.gaussian', InverseGaussianDistribution())]: - glm = GeneralizedLinearRegressor(family=f, fit_intercept=False, - alpha=0).fit(X, y) + glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) assert_equal(type(glm._family_instance), type(fam)) glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) - assert_raises(ValueError, glm.fit, X, y) + with pytest.raises(ValueError): + glm.fit(X, y) def test_glm_link_argument(): - """Test GLM link argument set as string - """ + """Test GLM link argument set as string.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) + X = np.array([[1], [2]]) for (l, link) in [('identity', IdentityLink()), ('log', LogLink())]: - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - link=l).fit(X, y) + glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) assert_equal(type(glm._link_instance), type(link)) - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - link='not a link') - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(family='normal', link='not a link') + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_alpha_argument(): - """Test GLM alpha argument - """ +@pytest.mark.parametrize('alpha', ['not a number', -4.2]) +def test_glm_alpha_argument(alpha): + """Test GLM for invalid alpha argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for alpha in ['not a number', -4.2]: - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - alpha=alpha) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', alpha=alpha) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_l1_ratio_argument(): - """Test GLM l1_ratio argument - """ +@pytest.mark.parametrize('l1_ratio', ['not a number', -4.2, 1.1, [1]]) +def test_glm_l1_ratio_argument(l1_ratio): + """Test GLM for invalid l1_ratio argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for l1_ratio in ['not a number', -4.2, 1.1, [1]]: - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - l1_ratio=l1_ratio) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) -@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3]]) +@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3], + [-1]]) def test_glm_P1_argument(P1): - """Test GLM P1 
arguments.""" + """Test GLM for invalid P1 argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(P1=P1) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(P1=P1, l1_ratio=0.5, check_input=True) with pytest.raises((ValueError, TypeError)): glm.fit(X, y) @pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], - sparse.csr_matrix([1, 2, 3])]) + sparse.csr_matrix([1, 2, 3]), [-1]]) def test_glm_P2_argument(P2): - """Test GLM P2 arguments.""" + """Test GLM for invalid P2 argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False) - with pytest.raises((ValueError, TypeError)): + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(P2=P2, check_input=True) + with pytest.raises(ValueError): glm.fit(X, y) -def test_glm_fit_intercept_argument(): - """Test GLM fit_intercept argument - """ - y = np.array([1, 2]) - X = np.array([[1], [1]]) - for fit_intercept in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) - assert_raises(ValueError, glm.fit, X, y) +def test_glm_P2_positive_semidefinite(): + """Test GLM for a positive semi-definite P2 argument.""" + n_samples, n_features = 10, 5 + rng = np.random.RandomState(42) + y = np.arange(n_samples) + X = np.zeros((n_samples, n_features)) + P2 = np.diag([100, 10, 5, 0, -1E-5]) + # construct random orthogonal matrix Q + Q, R = linalg.qr(rng.randn(n_features, n_features)) + P2 = Q.T @ P2 @ Q + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_solver_argument(): - """Test GLM solver argument - """ +@pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) +def test_glm_fit_intercept_argument(fit_intercept): + """Test GLM for invalid fit_intercept argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for solver in ['not a solver', 1, [1]]: - glm = GeneralizedLinearRegressor(solver=solver) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + with pytest.raises(ValueError): + glm.fit(X, y) - # solver not suitable for L1 penalty - for solver in ['irls', 'lbfgs', 'newton-cg']: - glm = GeneralizedLinearRegressor(solver=solver, alpha=1, l1_ratio=0.1) - assert_raises(ValueError, glm.fit, X, y) +@pytest.mark.parametrize('solver, l1_ratio', + [('not a solver', 0), (1, 0), ([1], 0), + ('irls', 0.5), ('lbfgs', 0.5), ('newton-cg', 0.5)]) +def test_glm_solver_argument(solver, l1_ratio): + """Test GLM for invalid solver argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(solver=solver, l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_max_iter_argument(): - """Test GLM max_iter argument - """ + +@pytest.mark.parametrize('max_iter', ['not a number', 0, -1, 5.5, [1]]) +def test_glm_max_iter_argument(max_iter): + """Test GLM for invalid max_iter argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for max_iter in ['not a number', 0, -1, 5.5, [1]]: - glm = GeneralizedLinearRegressor(max_iter=max_iter) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(max_iter=max_iter) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_tol_argument(): - """Test GLM tol argument - """ +@pytest.mark.parametrize('tol', ['not a number', 0, -1.0, [1e-3]]) +def test_glm_tol_argument(tol): + """Test GLM for 
invalid tol argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for tol in ['not a number', 0, -1.0, [1e-3]]: - glm = GeneralizedLinearRegressor(tol=tol) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(tol=tol) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_warm_start_argument(): - """Test GLM warm_start argument - """ +@pytest.mark.parametrize('warm_start', ['not bool', 1, 0, [True]]) +def test_glm_warm_start_argument(warm_start): + """Test GLM for invalid warm_start argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for warm_start in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(warm_start=warm_start) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(warm_start=warm_start) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_start_params_argument(): - """Test GLM start_params argument - """ +@pytest.mark.parametrize('start_params', + ['not a start_params', ['zero'], [0, 0, 0], + [[0, 0]], ['a', 'b']]) +def test_glm_start_params_argument(start_params): + """Test GLM for invalid start_params argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for start_params in ['not a start_params', ['zero'], [0, 0, 0], - [[0, 0]], ['a', 'b']]: - glm = GeneralizedLinearRegressor(start_params=start_params) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(start_params=start_params) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_selection_argument(): - """Test GLM selection argument - """ +@pytest.mark.parametrize('selection', ['not a selection', 1, 0, ['cyclic']]) +def test_glm_selection_argument(selection): + """Test GLM for invalid selection argument""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for selection in ['not a selection', 1, 0, ['cyclic']]: - glm = GeneralizedLinearRegressor(selection=selection) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(selection=selection) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_random_state_argument(): - """Test GLM random_state argument - """ +@pytest.mark.parametrize('random_state', ['a string', 0.5, [0]]) +def test_glm_random_state_argument(random_state): + """Test GLM for invalid random_state argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for random_state in ['a string', 0.5, [0]]: - glm = GeneralizedLinearRegressor(random_state=random_state) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(random_state=random_state) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_copy_X_argument(): - """Test GLM copy_X arguments - """ +@pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) +def test_glm_copy_X_argument(copy_X): + """Test GLM for invalid copy_X arguments.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for copy_X in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(copy_X=copy_X) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(copy_X=copy_X) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_check_input_argument(): - """Test GLM check_input argument - """ +@pytest.mark.parametrize('check_input', ['not bool', 1, 0, [True]]) +def test_glm_check_input_argument(check_input): + """Test GLM for invalid check_input argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for check_input in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(check_input=check_input) - 
assert_raises(ValueError, glm.fit, X, y) - + glm = GeneralizedLinearRegressor(check_input=check_input) + with pytest.raises(ValueError): + glm.fit(X, y) -# TODO: check additional validations if check_input == True -def test_glm_identiy_regression(): - """Test GLM regression with identity link on a simple dataset - """ +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecant()]) +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_glm_identiy_regression(family, solver): + """Test GLM regression with identity link on a simple dataset.""" coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) - families = ( - NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), - GeneralizedHyperbolicSecand()) - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - for family in families: - glm = GeneralizedLinearRegressor( - alpha=0, family=family, fit_intercept=False, solver=solver) - res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) - - -def test_glm_log_regression(): - """Test GLM regression with log link on a simple dataset - """ + glm = GeneralizedLinearRegressor(alpha=0, family=family, + fit_intercept=False, solver=solver) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + + +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecant()]) +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_glm_log_regression(family, solver): + """Test GLM regression with log link on a simple dataset.""" coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) - families = ( - NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), - GeneralizedHyperbolicSecand()) - for solver in ['irls', 'lbfgs', 'newton-cg']: - for family in families: - glm = GeneralizedLinearRegressor( + glm = GeneralizedLinearRegressor( alpha=0, family=family, link=LogLink(), fit_intercept=False, solver=solver, start_params='least_squares') - res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) @pytest.mark.filterwarnings('ignore::DeprecationWarning') -def test_normal_ridge(): - """Test ridge regression for Normal distributions +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_normal_ridge(solver): + """Test ridge regression for Normal distributions. Compare to test_ridge in test_ridge.py. 
""" @@ -375,25 +376,23 @@ def test_normal_ridge(): ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, solver='svd', normalize=False) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, - family='normal', link='identity', - fit_intercept=True, tol=1e-6, - max_iter=100, solver=solver, - random_state=42) - glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=1e-6, max_iter=100, solver=solver, + random_state=42) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, - family='normal', link='identity', - fit_intercept=False, solver='irls', - fit_dispersion='chisqr') + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=1e-6, max_iter=100, solver=solver, + random_state=42, fit_dispersion='chisqr') glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) @@ -413,23 +412,22 @@ def test_normal_ridge(): ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, - family='normal', link='identity', - fit_intercept=True, solver=solver, - max_iter=300, random_state=42) - glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, + family='normal', link='identity', + fit_intercept=True, solver=solver, + max_iter=300, random_state=42) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, family='normal', link='identity', - fit_intercept=False, solver='irls') + fit_intercept=False, solver=solver) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) @@ -438,7 +436,7 @@ def test_normal_ridge(): def test_poisson_ridge(): - """Test ridge regression with poisson family and LogLink + """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" # library("glmnet") @@ -470,7 +468,7 @@ def test_poisson_ridge(): def test_normal_enet(): - """Tet elastic net regression with normal/gaussian family""" + """Test elastic net regression with normal/gaussian family.""" rng = np.random.RandomState(0) alpha, l1_ratio = 0.3, 0.7 n_samples, n_features = 20, 2 @@ -495,7 +493,7 @@ def test_normal_enet(): def test_poisson_enet(): - """Test elastic net regression with poisson family and LogLink + """Test elastic net regression with poisson family and LogLink. Compare to R's glmnet""" # library("glmnet") From fe876da908a7d5aefe8fa9ac56f4c5130ccf83df Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 24 Feb 2019 12:45:55 +0100 Subject: [PATCH 048/209] Add Logistic regression=Binomial + Logit * add Binomial distribution * add Logit link * tests for binomial against LogisticRegression * option 'auto' for link * reduce code duplication by replacing @abstractproperty by @property --- sklearn/linear_model/glm.py | 160 +++++++++++++++---------- sklearn/linear_model/tests/test_glm.py | 57 +++++++-- 2 files changed, 149 insertions(+), 68 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index f583e17433ee3..01e40b322946c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -42,13 +42,12 @@ from __future__ import division -from abc import ABCMeta, abstractmethod, abstractproperty +from abc import ABCMeta, abstractmethod import numbers import numpy as np -from scipy import linalg, sparse +from scipy import linalg, sparse, special import scipy.sparse.linalg as splinalg from scipy.optimize import fmin_l_bfgs_b -from scipy.special import xlogy import warnings from .base import LinearRegression from .coordinate_descent import ElasticNet @@ -191,6 +190,28 @@ def inverse_derivative2(self, lin_pred): return np.exp(lin_pred) +class LogitLink(Link): + """The logit link function g(x)=logit(x).""" + + def link(self, mu): + return special.logit(mu) + + def derivative(self, mu): + return 1. / (mu * (1 - mu)) + + def inverse(self, lin_pred): + return special.expit(lin_pred) + + def inverse_derivative(self, lin_pred): + ep = special.expit(lin_pred) + return ep * (1. - ep) + + def inverse_derivative2(self, lin_pred): + ep = special.expit(lin_pred) + ep = special.expit(lin_pred) + return ep * (1. - ep) * (1. - 2 * ep) + + class ExponentialDispersionModel(metaclass=ABCMeta): r"""Base class for reproductive Exponential Dispersion Models (EDM). @@ -238,26 +259,25 @@ class ExponentialDispersionModel(metaclass=ABCMeta): https://en.wikipedia.org/wiki/Exponential_dispersion_model. 
""" - - @abstractproperty + @property def lower_bound(self): - """The lower bound of values of Y~EDM.""" - raise NotImplementedError() + """Get the lower bound of values for Y~EDM.""" + return self._lower_bound - @abstractproperty + @property def upper_bound(self): - """The upper bound of values of Y~EDM.""" - raise NotImplementedError() + """Get the upper bound of values for Y~EDM.""" + return self._upper_bound - @abstractproperty + @property def include_lower_bound(self): - """If True, values of y may equal lower bound: y >= lower_bound.""" - raise NotImplementedError() + """Get True if lower bound for y is included: y >= lower_bound.""" + return self._include_lower_bound - @abstractproperty + @property def include_upper_bound(self): - """If True, values of y may equal upper bound: y <= upper_bound.""" - raise NotImplementedError() + """Get True if upper bound for y is includede: y <= upper_bound.""" + return self._include_upper_bound def in_y_range(self, x): """Returns true if `x` is in the valid range of Y~EDM. @@ -685,22 +705,6 @@ def power(self, power): .format(power)) self._power = power - @property - def lower_bound(self): - return self._lower_bound - - @property - def upper_bound(self): - return self._upper_bound - - @property - def include_lower_bound(self): - return self._include_lower_bound - - @property - def include_upper_bound(self): - return self._include_upper_bound - def unit_variance(self, mu): """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. @@ -730,7 +734,7 @@ def unit_deviance(self, y, mu): if p == 1: # PoissonDistribution # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 - return 2 * (xlogy(y, y/mu) - y + mu) + return 2 * (special.xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) @@ -776,22 +780,6 @@ def __init__(self): self._include_lower_bound = False self._include_upper_bound = False - @property - def lower_bound(self): - return self._lower_bound - - @property - def upper_bound(self): - return self._upper_bound - - @property - def include_lower_bound(self): - return self._include_lower_bound - - @property - def include_upper_bound(self): - return self._include_upper_bound - def unit_variance(self, mu): return 1 + mu**2 @@ -803,6 +791,27 @@ def unit_deviance(self, y, mu): np.log((1+mu**2)/(1+y**2))) +class BinomialDistribution(ExponentialDispersionModel): + """A class for the Binomial distribution. + + The Binomial distribution is for tagets y in [0, 1]. + """ + def __init__(self): + self._lower_bound = 0 + self._upper_bound = 1 + self._include_lower_bound = True + self._include_upper_bound = True + + def unit_variance(self, mu): + return mu * (1 - mu) + + def unit_variance_derivative(self, mu): + return 1 - 2 * mu + + def unit_deviance(self, y, mu): + return 2*(special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) + + def _irls_step(X, W, P2, z): """Compute one step in iteratively reweighted least squares. @@ -933,15 +942,23 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ - of class ExponentialDispersionModel, optional(default='normal') + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian', 'binomial'} \ + or an instance of class ExponentialDispersionModel, \ + optional(default='normal') The distributional assumption of the GLM, i.e. 
which distribution from the EDM, specifies the loss function to be minimized. - link : {'identity', 'log'} or an instance of class Link, - optional (default='identity') + link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, + optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor - (X*coef) to expectation (mu). + (X*coef) to expectation (mu). Option 'auto' sets the link depending on + the chosen family as follows: + + - 'identity' for family 'normal' + + - 'log' for families 'poisson', 'gamma', 'inverse.gaussian' + + - 'logit' for family 'binomial' fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) Method for estimation of the dispersion parameter phi. Whether to use @@ -1084,7 +1101,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', - fit_intercept=True, family='normal', link='identity', + fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params='irls', selection='cyclic', random_state=None, copy_X=True, @@ -1159,27 +1176,48 @@ def fit(self, X, y, sample_weight=None): self._family_instance = GammaDistribution() elif self.family == 'inverse.gaussian': self._family_instance = InverseGaussianDistribution() + elif self.family == 'binomial': + self._family_instance = BinomialDistribution() else: raise ValueError( "The family must be an instance of class" " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian'];" - " got (family={0})".format(self.family)) + " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " + "'binomial']; got (family={0})".format(self.family)) # Guarantee that self._link_instance is set to an instance of # class Link if isinstance(self.link, Link): self._link_instance = self.link else: - if self.link == 'identity': + if self.link == 'auto': + if isinstance(self._family_instance, TweedieDistribution): + if self._family_instance.power <= 0: + self._link_instance = IdentityLink() + if self._family_instance.power >= 1: + self._link_instance = LogLink() + elif isinstance(self._family_instance, + GeneralizedHyperbolicSecant): + self._link_instance = IdentityLink() + elif isinstance(self._family_instance, BinomialDistribution): + self._link_instance = LogitLink() + else: + raise ValueError("No default link known for the " + "specified distribution family. Please " + "set link manually, i.e. 
not to 'auto'; " + "got (link='auto', family={}" + .format(self.family)) + elif self.link == 'identity': self._link_instance = IdentityLink() elif self.link == 'log': self._link_instance = LogLink() + elif self.link == 'logit': + self._link_instance = LogitLink() else: raise ValueError( - "The link must be an instance of class Link or" - " an element of ['identity', 'log']; got (link={0})" - .format(self.link)) + "The link must be an instance of class Link or " + "an element of ['auto', 'identity', 'log', 'logit']; " + "got (link={0})".format(self.link)) if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: raise ValueError("Penalty term must be a non-negative number;" diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 361a237f2cc9f..de0857a34fe3a 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -4,16 +4,18 @@ import scipy as sp from scipy import linalg, optimize, sparse +from sklearn.datasets import make_classification from sklearn.linear_model.glm import ( Link, IdentityLink, LogLink, + LogitLink, TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, - GeneralizedHyperbolicSecant, + GeneralizedHyperbolicSecant, BinomialDistribution, GeneralizedLinearRegressor) -from sklearn.linear_model import ElasticNet, Ridge +from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge from sklearn.utils.testing import ( assert_equal, assert_almost_equal, @@ -26,9 +28,19 @@ def test_link_properties(link): rng = np.random.RandomState(0) x = rng.rand(100)*100 link = link() # instatiate object - assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) - assert_almost_equal(link.inverse_derivative(link.link(x)), - 1/link.derivative(x), decimal=10) + decimal = 10 + if isinstance(link, LogitLink): + # careful for large x, note expit(36) = 1 + # limit max eta to 15 + x = x / 100 * 15 + decimal = 8 + assert_almost_equal(link.link(link.inverse(x)), x, decimal=decimal) + # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) + assert_almost_equal(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x), decimal=decimal) + # for LogitLink, in the following x should be between 0 and 1. 
+ # assert_almost_equal(link.inverse_derivative(link.link(x)), + # 1./link.derivative(x), decimal=decimal) @pytest.mark.parametrize( @@ -214,6 +226,12 @@ def test_glm_P2_positive_semidefinite(): with pytest.raises(ValueError): glm.fit(X, y) + P2 = sparse.csr_matrix(P2) + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + @pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) def test_glm_fit_intercept_argument(fit_intercept): @@ -331,7 +349,7 @@ def test_glm_identiy_regression(family, solver): coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) - glm = GeneralizedLinearRegressor(alpha=0, family=family, + glm = GeneralizedLinearRegressor(alpha=0, family=family, link='identity', fit_intercept=False, solver=solver) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) @@ -350,7 +368,7 @@ def test_glm_log_regression(family, solver): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) glm = GeneralizedLinearRegressor( - alpha=0, family=family, link=LogLink(), fit_intercept=False, + alpha=0, family=family, link='log', fit_intercept=False, solver=solver, start_params='least_squares') res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) @@ -557,3 +575,28 @@ def obj(coef): glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + +@pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) +def test_binomial_enet(alpha): + """Test elastic net regression with binomial family and LogitLink. + + Compare to LogisticRegression. + """ + l1_ratio = 0.5 + n_samples = 500 + X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, + n_informative=5, n_redundant=0, n_repeated=0, + random_state=0) + log = LogisticRegression( + penalty='elasticnet', random_state=0, fit_intercept=False, tol=1e-6, + max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), + solver='saga') + log.fit(X, y) + glm = GeneralizedLinearRegressor( + family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, + alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', + tol=1e-7) + glm.fit(X, y) + assert_almost_equal(log.intercept_[0], glm.intercept_, decimal=6) + assert_array_almost_equal(log.coef_[0, :], glm.coef_, decimal=6) From 2993e03dbfc89b068373718c82f65957639767ac Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 7 Apr 2019 15:33:27 +0200 Subject: [PATCH 049/209] More efficient sparse matrices and refactor of irls and cd solver * refactor into function _irls_solver * refactor into function _cd_solver * replace of safe_sparse_dot by matmul operator @ * more efficient handling of fisher matrix * sparse coo matrices are converted to csc or csr * sample weights don't except sparse matrices * minor doc changes --- sklearn/linear_model/glm.py | 1101 ++++++++++++++---------- sklearn/linear_model/tests/test_glm.py | 85 +- 2 files changed, 717 insertions(+), 469 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 01e40b322946c..b2de866a4b69d 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -55,13 +55,12 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y -from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg from ..utils.validation import check_is_fitted, 
check_random_state def _check_weights(sample_weight, n_samples): - """Check that weights are non-negative and have the right shape.""" + """Check that sample weights are non-negative and have the right shape.""" if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): @@ -70,7 +69,7 @@ def _check_weights(sample_weight, n_samples): weights = sample_weight * np.ones(n_samples) else: _dtype = [np.float64, np.float32] - weights = check_array(sample_weight, accept_sparse='csr', + weights = check_array(sample_weight, accept_sparse=False, force_all_finite=True, ensure_2d=False, dtype=_dtype) if weights.ndim > 1: @@ -247,12 +246,11 @@ class ExponentialDispersionModel(metaclass=ABCMeta): deviance_derivative starting_mu + _mu_deviance_derivative _score _fisher_matrix _observed_information - _deviance - _deviance_derivative - _deviance_hessian + _eta_mu_score_fisher References ---------- @@ -280,7 +278,7 @@ def include_upper_bound(self): return self._include_upper_bound def in_y_range(self, x): - """Returns true if `x` is in the valid range of Y~EDM. + """Returns ``True`` if x is in the valid range of Y~EDM. Parameters ---------- @@ -411,7 +409,7 @@ def unit_deviance_derivative(self, y, mu): mu : array, shape (n_samples,) Predicted mean. """ - return -2*(y-mu)/self.unit_variance(mu) + return -2 * (y - mu) / self.unit_variance(mu) def deviance(self, y, mu, weights=1): r"""Compute the deviance. @@ -434,13 +432,7 @@ def deviance(self, y, mu, weights=1): weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. """ - return np.sum(weights*self.unit_deviance(y, mu)) - - def _deviance(self, coef, X, y, weights, link): - """Compute the deviance as a function of the coefficients and data.""" - lin_pred = safe_sparse_dot(X, coef, dense_output=True) - mu = link.inverse(lin_pred) - return self.deviance(y, mu, weights) + return np.sum(weights * self.unit_deviance(y, mu)) def deviance_derivative(self, y, mu, weights=1): """Compute the derivative of the deviance w.r.t. mu. @@ -458,7 +450,36 @@ def deviance_derivative(self, y, mu, weights=1): weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. """ - return weights*self.unit_deviance_derivative(y, mu) + return weights * self.unit_deviance_derivative(y, mu) + + def starting_mu(self, y, weights=1, ind_weight=0.5): + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + + ind_weight : float (default=0.5) + Must be between 0 and 1. Specifies how much weight is given to the + individual observations instead of the mean of y. + """ + return (ind_weight * y + + (1. - ind_weight) * np.average(y, weights=weights)) + + def _mu_deviance_derivative(self, coef, X, y, weights, link): + """Compute mu, the deviance and it's derivative w.r.t coef.""" + lin_pred = X @ coef + mu = link.inverse(lin_pred) + dev = self.deviance(y, mu, weights) + d1 = link.inverse_derivative(lin_pred) + devp = X.T @ (d1 * self.deviance_derivative(y, mu, weights)) + return mu, dev, devp def _score(self, coef, phi, X, y, weights, link): r"""Compute the score function. 
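The chain rule behind ``_mu_deviance_derivative`` (with mu = h(X @ coef), the derivative of the weighted deviance w.r.t. coef is X.T @ (h'(eta) * weights * unit_deviance_derivative(y, mu))) can be checked against finite differences. A self-contained sketch for the Poisson deviance with log link; the helpers ``deviance`` and ``deviance_grad`` are illustrative only, not part of this module::

    import numpy as np
    from scipy import special

    rng = np.random.RandomState(0)
    X = rng.rand(20, 3)
    y = rng.poisson(lam=2.0, size=20).astype(float)
    weights = np.ones(20)

    def deviance(coef):
        mu = np.exp(X @ coef)                 # log link: h(eta) = exp(eta)
        return np.sum(weights * 2 * (special.xlogy(y, y / mu) - y + mu))

    def deviance_grad(coef):
        eta = X @ coef
        mu = np.exp(eta)
        d1 = np.exp(eta)                      # h'(eta) for the log link
        # Poisson unit_deviance_derivative: -2 * (y - mu) / v(mu), with v(mu) = mu
        return X.T @ (d1 * weights * (-2) * (y - mu) / mu)

    coef = rng.rand(3)
    eps = 1e-6
    num = np.array([(deviance(coef + eps * e) - deviance(coef - eps * e)) / (2 * eps)
                    for e in np.eye(3)])
    assert np.allclose(num, deviance_grad(coef), rtol=1e-5, atol=1e-8)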
@@ -476,16 +497,14 @@ def _score(self, coef, phi, X, y, weights, link): with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. + Note: The derivative of the deviance w.r.t. coef equals -2 * score. """ - n_samples = X.shape[0] - lin_pred = safe_sparse_dot(X, coef, dense_output=True) + lin_pred = X @ coef mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d = link.inverse_derivative(lin_pred) - d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), - shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=True) - score = safe_sparse_dot(X.T, temp, dense_output=True) + temp = sigma_inv * d * (y - mu) + score = X.T @ temp return score def _fisher_matrix(self, coef, phi, X, y, weights, link): @@ -508,14 +527,13 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): see func:`_score`. """ n_samples = X.shape[0] - lin_pred = safe_sparse_dot(X, coef, dense_output=True) + lin_pred = X @ coef mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d2 = link.inverse_derivative(lin_pred)**2 d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) - fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) + fisher_matrix = X.T @ d2_sigma_inv @ X return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): @@ -542,7 +560,7 @@ def _observed_information(self, coef, phi, X, y, weights, link): see :func:`score_` function and :func:`_fisher_matrix`. """ n_samples = X.shape[0] - lin_pred = safe_sparse_dot(X, coef, dense_output=True) + lin_pred = X @ coef mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) dp = link.inverse_derivative2(lin_pred) @@ -551,80 +569,59 @@ def _observed_information(self, coef, phi, X, y, weights, link): r = y - mu temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), shape=(n_samples, n_samples)) - temp = safe_sparse_dot(temp, X, dense_output=False) - observed_information = safe_sparse_dot(X.T, temp, dense_output=False) + observed_information = X.T @ temp @ X return observed_information - def _deviance_derivative(self, coef, X, y, weights, link): - r"""Compute the derivative of the deviance w.r.t. coef. - - The derivative of the deviance w.r.t. `coef` (:math:`w`) as a - function of the coefficients `coef` and the data. - This is equivalent to :math:`-2\phi` times the score function - :func:`_score` (derivative of the log-likelihood). - """ - score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, - link=link) - return -2*score - - def _deviance_hessian(self, coef, X, y, weights, link): - r"""Compute the hessian matrix of the deviance w.r.t. coef. - - The hessian of the deviance w.r.t. `coef` (:math:`w`) is evaluated as - a function of the coefficients `coef` and the data. - It is equivalent to :math:`+2\phi` times the observed information - matrix. - """ - info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, - weights=weights, link=link) - return 2*info_matrix - - def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): + def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, + diag_fisher=False): """Compute linear predictor, mean, score function and fisher matrix. 
It calculates the linear predictor, the mean, score function (derivative of log-likelihood) and Fisher information matrix all in one go as function of `coef` (:math:`w`) and the data. + + Parameters + ---------- + diag_fisher : boolean, optional (default=False) + If ``True``, returns only an array d such that + fisher = X.T @ np.diag(d) @ X. + + Returns + ------- + (eta, mu, score, fisher) : tuple with 4 elements + The 4 elements are: + + * eta: ndarray, shape (X.shape[0],) + * mu: ndarray, shape (X.shape[0],) + * score: ndarray, shape (X.shape[0],) + * fisher: + + * If diag_fisher is ``False``, the full fisher matrix, + an array of shape (X.shape[1], X.shape[1]) + * If diag_fisher is ``True`, an array of shape (X.shape[0]) """ n_samples, n_features = X.shape # eta = linear predictor - eta = safe_sparse_dot(X, coef, dense_output=True) + eta = X @ coef mu = link.inverse(eta) sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) d1 = link.inverse_derivative(eta) # = h'(eta) # Alternatively: # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g # d1 = 1./link.derivative(mu) - d1_sigma_inv = sparse.dia_matrix((sigma_inv*d1, 0), - shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d1_sigma_inv, (y-mu), dense_output=True) - score = safe_sparse_dot(X.T, temp, dense_output=True) + score = X.T @ (sigma_inv * d1 * (y - mu)) # - d2_sigma_inv = sparse.dia_matrix((sigma_inv*(d1**2), 0), - shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) - fisher = safe_sparse_dot(X.T, temp, dense_output=False) - return eta, mu, score, fisher - - def starting_mu(self, y, weights=1, ind_weight=0.5): - """Set starting values for the mean mu. - - These may be good starting points for the (unpenalized) IRLS solver. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - - ind_weight : float (default=0.5) - Must be between 0 and 1. Specifies how much weight is given to the - individual observations instead of the mean of y. - """ - return (ind_weight * y + - (1. - ind_weight) * np.average(y, weights=weights)) + d2_sigma_inv = sigma_inv * (d1**2) + if diag_fisher: + return eta, mu, score, d2_sigma_inv + else: + if sparse.issparse(X): + d2_sigma_inv = sparse.dia_matrix((d2_sigma_inv, 0), + shape=(n_samples, n_samples)) + fisher = (X.T @ d2_sigma_inv @ X).toarray() + else: + fisher = (X.T * d2_sigma_inv) @ X + return eta, mu, score, fisher class TweedieDistribution(ExponentialDispersionModel): @@ -724,20 +721,20 @@ def unit_variance_derivative(self, mu): mu : array, shape (n_samples,) Predicted mean. 
""" - return self.power*np.power(mu, self.power-1) + return self.power * np.power(mu, self.power - 1) def unit_deviance(self, y, mu): p = self.power if p == 0: # NormalDistribution - return (y-mu)**2 + return (y - mu)**2 if p == 1: # PoissonDistribution # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 return 2 * (special.xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution - return 2 * (np.log(mu/y)+y/mu-1) + return 2 * (np.log(mu/y) + y/mu - 1) else: # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) @@ -784,11 +781,11 @@ def unit_variance(self, mu): return 1 + mu**2 def unit_variance_derivative(self, mu): - return 2*mu + return 2 * mu def unit_deviance(self, y, mu): - return (2*y*(np.arctan(y) - np.arctan(mu)) + - np.log((1+mu**2)/(1+y**2))) + return (2 * y * (np.arctan(y) - np.arctan(mu)) + + np.log((1 + mu**2)/(1 + y**2))) class BinomialDistribution(ExponentialDispersionModel): @@ -809,7 +806,7 @@ def unit_variance_derivative(self, mu): return 1 - 2 * mu def unit_deviance(self, y, mu): - return 2*(special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) + return 2 * (special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) def _irls_step(X, W, P2, z): @@ -824,20 +821,20 @@ def _irls_step(X, W, P2, z): Parameters ---------- - X : {numpy array, sparse matrix}, shape (n_samples, n_features) + X : {ndarray, sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present) - W : numpy array, shape (n_samples,) + W : ndarray, shape (n_samples,) - P2 : {numpy array, sparse matrix}, shape (n_features, n_features) + P2 : {ndarray, sparse matrix}, shape (n_features, n_features) The L2-penalty matrix or vector (=diagonal matrix) - z : numpy array, shape (n_samples,) + z : ndarray, shape (n_samples,) Working observations Returns ------- - coef: array, shape (X.shape[1]) + coef: ndarray, shape (X.shape[1]) """ # Note: solve vs least squares, what is more appropriate? # scipy.linalg.solve seems faster, but scipy.linalg.lstsq @@ -868,27 +865,422 @@ def _irls_step(X, W, P2, z): return coef +def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): + """Solve GLM with L2 penalty by IRLS algorithm. + + Note: If X is sparse, P2 must also be sparse. + """ + # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' + # Obj = objective function = 1/2 Dev + l2/2 w P2 w + # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 + # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) + # D2 = link.inverse_derivative(eta)^2 = D^2 + # W = D2/V(mu) + # l2 = alpha * (1 - l1_ratio) + # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w + # = -X' D (y-mu)/V(mu) + l2 P2 w + # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 + # Use Fisher matrix instead of full info matrix -X'(...) X, + # i.e. E[Dev''] with E[y-mu]=0: + # Obj'' ~ X' W X + l2 P2 + # (1): w = (X' W X + l2 P2)^-1 X' W z, + # with z = eta + D^-1 (y-mu) + # Note: P2 must be symmetrized + # Note: ' denotes derivative, but also transpose for matrices + + # eta = linear predictor + eta = X @ coef + mu = link.inverse(eta) + # D = h'(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + n_iter = 0 + while n_iter < max_iter: + n_iter += 1 + # coef_old not used so far. 
+ # coef_old = coef + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = hp**2 / V + # working observations + z = eta + (y - mu) / hp + # solve A*coef = b + # A = X' W X + P2, b = X' W z + coef = _irls_step(X, W, P2, z) + # updated linear predictor + # do it here for updated values for tolerance + eta = X @ coef + mu = link.inverse(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + + # which tolerace? |coef - coef_old| or gradient? + # use gradient for compliance with newton-cg and lbfgs + # gradient = -X' D (y-mu)/V(mu) + l2 P2 w + gradient = -(X.T @ (hp*(y-mu)/V)) + if P2.ndim == 1: + gradient += P2*coef + else: + gradient += P2 @ coef + if (np.max(np.abs(gradient)) <= tol): + converged = True + break + + if not converged: + warnings.warn("irls failed to converge. Increase the number " + "of iterations (currently {0})" + .format(max_iter), ConvergenceWarning) + + return coef, n_iter + + +def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=1000, selection='cyclic', + random_state=None, diag_fisher=False): + """Compute inner loop of coordinate descent = cycles through features. + + Minimization of 1-d subproblems:: + + min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 + + A = f'(w) + d*H(w) + (w+d)*P2 + B = H+P2 + Note: f'=-score and H=fisher are updated at the end of outer iteration. + """ + # TODO: use sparsity (coefficient already 0 due to L1 penalty) + # => active set of features for featurelist, see paper + # of Improved GLMNET or Gap Safe Screening Rules + # https://arxiv.org/abs/1611.05780 + n_samples, n_features = X.shape + B = fisher + if P2.ndim == 1: + coef_P2 = coef * P2 + if not diag_fisher: + B[np.diag_indices_from(B)] += P2 + else: + coef_P2 = P2 @ coef # P2 is symmetric, mat @ vec is usually faster + if not diag_fisher: + if sparse.issparse(P2): + B += P2.toarray() + else: + B += P2 + A = -score + coef_P2 # + d @ (H+P2) but d=0 so far + # inner loop + inner_iter = 0 + while inner_iter < max_inner_iter: + inner_iter += 1 + n_cycles += 1 + if selection == 'random': + featurelist = random_state.permutation(n_features) + else: + featurelist = np.arange(n_features) + for j in featurelist: + # minimize_z: a z + 1/2 b z^2 + c |d+z| + # a = A_j + # b = B_jj > 0 + # c = |P1_j| = P1_j > 0, see 1.3 + # d = w_j + d_j + # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) + # with beta = z+d, beta_hat = d-a/b and gamma = c/b + # z = 1/b * S(bd-a,c) - d + # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding + a = A[j] + if diag_fisher: + if sparse.issparse(X): + xj = X[:, j] + b = xj.transpose() @ xj.multiply(fisher[:, np.newaxis]) + b = b[0, 0] + else: + b = X[:, j] @ (fisher * X[:, j]) + + if P2.ndim == 1: + b += P2[j] + else: + b += P2[j, j] + else: + b = B[j, j] + + if b <= 0: + z = 0 + elif P1[j] == 0: + z = -a/b + elif a + P1[j] < b * (coef[j] + d[j]): + z = -(a + P1[j])/b + elif a - P1[j] > b * (coef[j] + d[j]): + z = -(a - P1[j])/b + else: + z = -(coef[j] + d[j]) + + # update direction d + d[j] += z + # update A because d_j is now d_j+z + # A = f'(w) + d*H(w) + (w+d)*P2 + # => A += (H+P2)*e_j z = B_j * z + # Note: B is symmetric B = B.transpose + if diag_fisher: + if sparse.issparse(X): + A += (X.transpose() @ + X[:, j].multiply(fisher[:, np.newaxis]) + ).toarray().ravel() * z + else: + # A += (X.T @ (fisher * X[:, j])) * z + # same without transpose of X + A += ((fisher * X[:, j]) @ X) * z + + if P2.ndim == 1: + A[j] += P2[j] * z + elif sparse.issparse(P2): + # slice columns as P2 is csc + A += P2[:, j].toarray().ravel() * z + else: + A += P2[:, j] * z + else: + # B is symmetric, C- or F-contiguous, but never sparse + if B.flags['F_CONTIGUOUS']: + # slice columns like for sparse csc + A += B[:, j] * z + else: # B.flags['C_CONTIGUOUS'] might be true + # slice rows + A += B[j, :] * z + # end of cycle + # stopping criterion for inner loop + # sum_i(|minimum of norm of subgrad of q(d)_i|) + mn_subgrad = np.where(coef + d == 0, + np.sign(A) * np.maximum(np.abs(A) - P1, 0), + A + np.sign(coef + d) * P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= inner_tol: + if inner_iter == 1: + inner_tol = inner_tol/4. + break + # end of inner loop + return d, coef_P2, n_cycles, inner_tol + + +def _cd_solver(coef, X, y, weights, P1, P2, family, link, + max_iter=100, max_inner_iter=1000, tol=1e-4, + selection='cyclic ', random_state=None, + diag_fisher=False, copy_X=True): + """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. + + The objective beeing minimized in the coefficients w=coef is:: + + F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 + + An Improved GLMNET for L1-regularized Logistic Regression: + + 1. Find optimal descent direction d by minimizing + min_d F(w+d) = min_d F(w+d) - F(w) + 2. Quadrdatic approximation of F(w+d)-F(w) = q(d): + using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: + q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + + ||P1*(w+d)||_1 - ||P1*w||_1 + Then minimize q(d): min_d q(d) + 3. Coordinate descent by updating coordinate j (d -> d+z*e_j): + min_z q(d+z*e_j) + = min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 + A = f'(w) + d*H(w) + (w+d)*P2 + B = H+P2 + + Repeat steps 1-3 until convergence. + Note: Use Fisher matrix instead of Hessian for H. + Note: f' = -score, H = Fisher matrix + + Parameters + ---------- + coef: ndarray, shape (n_features,) + + X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) + Training data (with intercept included if present). If not sparse, + pass directly as Fortran-contiguous data to avoid + unnecessary memory duplication. + + y : ndarray, shape (n_samples,) + Target values. + + weights: ndarray, shape (n_samples,) + Sample weights with which the deviance is weighted. The weights must + bee normalized and sum to 1. 
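The coordinate update above relies on the closed form z = S(b*d - a, c) / b - d with the soft-thresholding operator S(x, c) = sign(x) * max(|x| - c, 0). It can be checked against a brute-force grid search of the one-dimensional objective a*z + 1/2*b*z^2 + c*|d + z| (standalone sketch; ``soft_threshold`` is not a helper of this module)::

    import numpy as np

    def soft_threshold(x, c):
        return np.sign(x) * np.maximum(np.abs(x) - c, 0)

    rng = np.random.RandomState(0)
    for _ in range(20):
        a, d = rng.randn(2)
        b = 0.5 + rng.rand()       # b > 0
        c = rng.rand()             # c >= 0
        z_closed = soft_threshold(b * d - a, c) / b - d
        z_grid = np.linspace(-50, 50, 200001)
        obj = a * z_grid + 0.5 * b * z_grid ** 2 + c * np.abs(d + z_grid)
        assert abs(z_closed - z_grid[np.argmin(obj)]) < 1e-3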
+ + P1 : {ndarray}, shape (n_features,) + The L1-penalty vector (=diagonal matrix) + + P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix). If a matrix is + passed, it must be symmetric. If X is sparse, P2 must also be sparse. + + family : ExponentialDispersionModel + + link : Link + + max_iter : int, optional (default=100) + Maximum numer of outer (Newton) iterations. + + max_inner_iter : int, optional (default=1000) + Maximum number of iterations, i.e. cycles over all features, in inner + loop. + + tol : float, optional (default=1e-4) + Covergence criterion is + sum_i(|minimum of norm of subgrad of objective_i|)<=tol. + + selection : str, optional (default='cyclic') + If 'random', randomly chose features in inner loop. + + random_state : {int, RandomState instance, None}, optional (default=None) + + diag_fisher : boolean, optional (default=False) + 'False' calculates full fisher matrix, 'True' only diagonal matrix s.t. + fisher = X.T @ diag @ X. This saves storage but needs more + matrix-vector multiplications. + + copy_X : boolean, optional (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + Returns + ------- + coef : ndarray, shape (n_features,) + + n_iter : numer of outer iterations = newton iterations + + n_cycles : number of cycles over features + + References + ---------- + Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + X = check_array(X, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if P2.ndim == 2: + P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if sparse.issparse(X): + if not sparse.isspmatrix_csc(X): + raise ValueError("If X is sparse, it must be in csc format" + "; got (format={})".format(X.format)) + if not sparse.isspmatrix_csc(P2): + raise ValueError("If X is sparse, P2 must also be sparse csc" + "format. Got P2 not sparse.") + random_state = check_random_state(random_state) + # Note: we already set P2 = l2*P2, P1 = l1*P1 + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + n_iter = 0 # number of outer iterations + n_cycles = 0 # number of (complete) cycles over features + converged = False + n_samples, n_features = X.shape + # line search parameters + (beta, sigma) = (0.5, 0.01) + # some precalculations + # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a + # 1d array representing a diagonal matrix. + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # set up space for search direction d for inner loop + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + # fp_wP2 = f'(w) + w*P2 + if P2.ndim == 1: + fp_wP2 = -score + coef * P2 + else: + # Note: P2 is symmetric and matrix @ vector is faster for sparse + # matrices. 
+ fp_wP2 = -score + P2 @ coef + inner_tol = np.where(coef == 0, + np.sign(fp_wP2) * np.maximum(np.abs(fp_wP2) - P1, 0), + fp_wP2 + np.sign(coef) * P1) + inner_tol = linalg.norm(inner_tol, ord=1) + # outer loop + while n_iter < max_iter: + n_iter += 1 + # initialize search direction d (to be optimized) with zero + d.fill(0) + # inner loop = _cd_cycle + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=max_inner_iter, selection=selection, + random_state=random_state, diag_fisher=diag_fisher) + # line search by sequence beta^k, k=0, 1, .. + # F(w + lambda d) - F(w) <= lambda * bound + # bound = sigma * (f'(w)*d + w*P2*d + # +||P1 (w+d)||_1 - ||P1 w||_1) + P1w_1 = linalg.norm(P1 * coef, ord=1) + # Note: coef_P2 already calculated and still valid + bound = sigma * (-(score @ d) + coef_P2 @ d + + linalg.norm(P1 * (coef + d), ord=1) - P1w_1) + Fw = (0.5 * family.deviance(y, mu, weights) + + 0.5 * (coef_P2 @ coef) + P1w_1) + la = 1./beta + for k in range(20): + la *= beta # starts with la=1 + coef_wd = coef + la * d + mu_wd = link.inverse(X @ coef_wd) + Fwd = (0.5 * family.deviance(y, mu_wd, weights) + + linalg.norm(P1 * coef_wd, ord=1)) + if P2.ndim == 1: + Fwd += 0.5 * ((coef_wd * P2) @ coef_wd) + else: + Fwd += 0.5 * (coef_wd @ (P2 @ coef_wd)) + if Fwd - Fw <= sigma * la * bound: + break + # update coefficients + # coef_old = coef.copy() + coef += la * d + # calculate eta, mu, score, Fisher matrix for next iteration + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # stopping criterion for outer loop + # sum_i(|minimum of norm of subgrad of F(w)_i|) + # fp_wP2 = f'(w) + w*P2 + # Note: eta, mu and score are already updated + if P2.ndim == 1: + fp_wP2 = -score + coef * P2 + else: + fp_wP2 = -score + P2 @ coef # P2 is symmetric, mat @ vec is faster + mn_subgrad = np.where(coef == 0, + np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), + fp_wP2 + np.sign(coef) * P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= tol: + converged = True + break + # end of outer loop + if not converged: + warnings.warn("Coordinate descent failed to converge. Increase" + " the number of iterations (currently {0})" + .format(max_iter), ConvergenceWarning) + + return coef, n_iter, n_cycles + + class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean `mu=h(X*w)`. Therefore the fit minimizes - the following objective function with combined L1 and L2 priors as - regularizer:: + fitting and predicting the mean of the target y as mu=h(X*w). Therefore, + the fit minimizes the following objective function with combined L1 and L2 + priors as regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + alpha * l1_ratio * ||P1*w||_1 + 1/2 * alpha * (1 - l1_ratio) * w*P2*w - with inverse link function `h` and s=`sample_weight` (for - ``sample_weight=None``, one has s=1 and sum(s)=`n_samples`). - For ``P1=P2='identity'`` (``P1=None``, ``P2=None``), the penalty is the - elastic net:: + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). 
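For the normal family with identity link, unit sample weights and ``P1=P2='identity'``, this objective is exactly the elastic net objective minimized by ``sklearn.linear_model.ElasticNet`` (compare ``test_normal_enet`` in the tests). A small numerical check; the helper ``objective`` and the toy data are illustrative only::

    import numpy as np
    from sklearn.linear_model import ElasticNet

    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)
    y = X @ np.array([1.0, 0.0, -2.0, 0.0, 3.0]) + 0.1 * rng.randn(100)
    alpha, l1_ratio = 0.1, 0.7

    def objective(w):
        # normal deviance = sum((y - X w)^2), s_i = 1, P1 = P2 = identity
        return (0.5 * np.mean((y - X @ w) ** 2)
                + alpha * l1_ratio * np.sum(np.abs(w))
                + 0.5 * alpha * (1 - l1_ratio) * (w @ w))

    enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=False,
                      tol=1e-10).fit(X, y)
    w_opt = enet.coef_
    perturbed = min(objective(w_opt + 1e-3 * rng.randn(5)) for _ in range(20))
    assert objective(w_opt) <= perturbed + 1e-8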
+ For ``P1=P2='identity'``, the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 - If you are interested in controlling the L1 and L2 penalty + If you are interested in controlling the L1 and L2 penalties separately, keep in mind that this is equivalent to:: a * L1 + b * L2 @@ -897,9 +1289,9 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): alpha = a + b and l1_ratio = a / (a + b) - The parameter `l1_ratio` corresponds to alpha in the glmnet R package while - 'alpha' corresponds to the lambda parameter in glmnet. Specifically, - l1_ratio = 1 is the lasso penalty. + The parameter ``l1_ratio`` corresponds to alpha in the R package glmnet, + while ``alpha`` corresponds to the lambda parameter in glmnet. + Specifically, l1_ratio = 1 is the lasso penalty. Read more in the :ref:`User Guide `. @@ -948,8 +1340,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The distributional assumption of the GLM, i.e. which distribution from the EDM, specifies the loss function to be minimized. - link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, - optional (default='auto') + link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, \ + optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). Option 'auto' sets the link depending on the chosen family as follows: @@ -982,7 +1374,10 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): 'cd' Coordinate descent algorithm. It can deal with L1 as well as L2 - penalties. + penalties. Note that in order to avoid unnecessary memory + duplication of the X argument in the ``fit`` method, X should be + directly passed as a Fortran-contiguous numpy array or sparse csc + matrix. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. @@ -990,8 +1385,10 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` - where ``g_i`` is the i-th component of the gradient (derivative of - the objective function). + where g_i is the i-th component of the gradient (derivative) of + the objective function. For the cd solver, covergence is reached + when ``sum_i(|minimum of norm of g_i|)``, where g_i is the + subgradient of the objective. warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` @@ -1007,7 +1404,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): 'irls' Start values of mu are calculated by family.starting_mu(..). Then, - one step of irls obtains start values for ``coef_`. This gives + one step of irls obtains start values for ``coef_``. This gives usually good results. 'least_squares' @@ -1042,6 +1439,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): RandomState instance used by `np.random`. Used when ``selection`` == 'random'. + diag_fisher : boolean, (default=False) + Only relevant for solver 'cd'. If ``False``, the full Fisher matrix + (expected Hessian) is computed in each outer iteretion (Newton + iteration). If ``True``, only a diagonal matrix (stored as 1d array) is + computed, such that fisher = X.T @ diag @ X. This saves memory and + matrix-matrix multiplications, but needs more matrix-vector + multiplications. 
If you use large sparse X or if you have many + features, i.e. n_features >> n_samples, you might set this option to + ``True``. + copy_X : boolean, optional, default True If ``True``, X will be copied; else, it may be overwritten. @@ -1056,40 +1463,43 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Attributes ---------- coef_ : array, shape (n_features,) - Estimated coefficients for the linear predictor (X*coef_) in the GLM. + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. intercept_ : float Intercept (a.k.a. bias) added to linear predictor. dispersion_ : float - The dispersion parameter :math:`\\phi` if fit_dispersion is set. + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. n_iter_ : int - Actual number of iterations of the solver. + Actual number of iterations used in solver. Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + :ref:`User Guide `. The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - (penalized) maximum likelihood which is equivalent to minimizing the - deviance. + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. - For `alpha` > 0, the feature matrix `X` should be standardized in order to + For alpha > 0, the feature matrix X should be standardized in order to penalize features equally strong. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - If the target `y` is a ratio, appropriate sample weights `s` should be + If the target y is a ratio, appropriate sample weights s should be provided. - As an example, consider Poission distributed counts `z` (integers) and - weights `s=exposure` (time, money, persons years, ...). Then you fit - `y = z/s`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + As an example, consider Poission distributed counts z (integers) and + weights s=exposure (time, money, persons years, ...). Then you fit + y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, - in this case one might say that `y` has a 'scaled' Poisson distributions. + in this case one might say that y has a 'scaled' Poisson distributions. The same holds for other distributions. 
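The weighted mean mentioned here is easy to verify numerically: with rates y = z/s and weights s, the weighted average equals total counts over total exposure, the natural finite-sample estimate of the underlying rate (toy data, assumed true rate 1.7)::

    import numpy as np

    rng = np.random.RandomState(0)
    exposure = rng.uniform(0.5, 3.0, size=1000)   # s = exposure
    z = rng.poisson(lam=1.7 * exposure)           # observed counts
    y = z / exposure                              # rates to be modelled

    assert np.isclose(np.average(y, weights=exposure), z.sum() / exposure.sum())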
References @@ -1104,8 +1514,8 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params='irls', - selection='cyclic', random_state=None, copy_X=True, - check_input=True, verbose=0): + selection='cyclic', random_state=None, diag_fisher=False, + copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio self.P1 = P1 @@ -1121,6 +1531,7 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', self.start_params = start_params self.selection = selection self.random_state = random_state + self.diag_fisher = diag_fisher self.copy_X = copy_X self.check_input = check_input self.verbose = verbose @@ -1154,11 +1565,13 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # 1.1 validate arguments of fit ####################################### _dtype = [np.float64, np.float32] - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - dtype=_dtype, y_numeric=True, multi_output=False) + X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], + dtype=_dtype, y_numeric=True, multi_output=False, + copy=self.copy_X) # Without converting y to float, deviance might raise # ValueError: Integers to negative integer powers are not allowed. - y = y.astype(np.float64) + # Also, y must not be sparse. + y = np.asarray(y, dtype=np.float64) weights = _check_weights(sample_weight, y.shape[0]) @@ -1263,7 +1676,7 @@ def fit(self, X, y, sample_weight=None): " correct length;" " got(start_params={0})".format(start_params)) else: - start_params = check_array(start_params, accept_sparse='csr', + start_params = check_array(start_params, accept_sparse=False, force_all_finite=True, ensure_2d=False, dtype=_dtype, copy=True) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or @@ -1274,12 +1687,14 @@ def fit(self, X, y, sample_weight=None): .format(X.shape[1] + self.fit_intercept, start_params.shape[0], start_params.ndim)) - if self.selection not in ['cyclic', 'random']: raise ValueError("The argument selection must be 'cyclic' or " "'random'; got (selection={0})" .format(self.selection)) random_state = check_random_state(self.random_state) + if not isinstance(self.diag_fisher, bool): + raise ValueError("The argument diag_fisher must be bool;" + " got {0}".format(self.diag_fisher)) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) @@ -1303,26 +1718,34 @@ def fit(self, X, y, sample_weight=None): "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." .format(P1.shape[0], X.shape[1])) + # If X is sparse, make P2 sparse, too. 
if isinstance(self.P2, str) and self.P2 == 'identity': - if not sparse.issparse(X): - P2 = np.ones(X.shape[1]) - else: + if sparse.issparse(X): P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), shape=(X.shape[1], X.shape[1]))).tocsr() + else: + P2 = np.ones(X.shape[1]) else: P2 = check_array(self.P2, copy=True, - accept_sparse=['csr', 'csc', 'coo'], + accept_sparse=['csr', 'csc'], dtype=_dtype, ensure_2d=False) if P2.ndim == 1: + P2 = np.asarray(P2) if P2.shape[0] != X.shape[1]: raise ValueError("P2 should be a 1d array of shape " "(n_features,) with " "n_features=X.shape[1]; " "got (P2.shape=({0},)), needed ({1},)" .format(P2.shape[0], X.shape[1])) - elif ((P2.ndim != 2) or - (P2.shape[0] != P2.shape[1]) or - (P2.shape[0] != X.shape[1])): + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(X.shape[1], X.shape[1]))).tocsr() + elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and + P2.shape[0] == X.shape[1]): + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(X.shape[1], X.shape[1]))).tocsr() + else: raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " "n_features=X.shape[1]; " @@ -1335,7 +1758,8 @@ def fit(self, X, y, sample_weight=None): if self.fit_intercept: # Note: intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): - Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) + Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X], + format=X.format) else: Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) P1 = np.concatenate((np.array([0]), P1)) @@ -1343,6 +1767,7 @@ def fit(self, X, y, sample_weight=None): P2 = np.concatenate((np.array([0]), P2)) elif sparse.issparse(P2): P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), + format=P2.format, dtype=P2.dtype).tocsr() else: # as of numpy 1.13 this would work: @@ -1355,7 +1780,7 @@ def fit(self, X, y, sample_weight=None): n_samples, n_features = Xnew.shape l1 = self.alpha * self.l1_ratio - l2 = self.alpha * (1-self.l1_ratio) + l2 = self.alpha * (1 - self.l1_ratio) P1 *= l1 P2 *= l2 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') @@ -1484,315 +1909,114 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # algorithms for optimiation # TODO: Parallelize it? - self.n_iter_ = 0 - converged = False + # 4.1 IRLS ############################################################ - # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' - # Obj = objective function = 1/2 Dev + l2/2 w P2 w - # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 - # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) - # D2 = link.inverse_derivative(eta)^2 = D^2 - # W = D2/V(mu) - # l2 = alpha * (1 - l1_ratio) - # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w - # = -X' D (y-mu)/V(mu) + l2 P2 w - # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 - # Use Fisher matrix instead of full info matrix -X'(...) X, - # i.e. 
E[Dev''] with E[y-mu]=0: - # Obj'' ~ X' W X + l2 P2 - # (1): w = (X' W X + l2 P2)^-1 X' W z, - # with z = eta + D^-1 (y-mu) # Note: we already set P2 = l2*P2, see above # Note: we already symmetriezed P2 = 1/2 (P2 + P2') - # Note: ' denotes derivative, but also transpose for matrices if solver == 'irls': - # eta = linear predictor - eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = link.inverse(eta) - # D = h'(eta) - hp = link.inverse_derivative(eta) - V = family.variance(mu, phi=1, weights=weights) - while self.n_iter_ < self.max_iter: - self.n_iter_ += 1 - # coef_old not used so far. - # coef_old = coef - # working weights W, in principle a diagonal matrix - # therefore here just as 1d array - W = (hp**2 / V) - # working observations - z = eta + (y-mu)/hp - # solve A*coef = b - # A = X' W X + P2, b = X' W z - coef = _irls_step(Xnew, W, P2, z) - # updated linear predictor - # do it here for updated values for tolerance - eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = link.inverse(eta) - hp = link.inverse_derivative(eta) - V = family.variance(mu, phi=1, weights=weights) - - # which tolerace? |coef - coef_old| or gradient? - # use gradient for compliance with newton-cg and lbfgs - # gradient = family._deviance_derivative( - # coef=coef, X=Xnew, y=y, weights=weights, link=link) - # gradient = -X' D (y-mu)/V(mu) + l2 P2 w - gradient = -safe_sparse_dot(Xnew.T, hp*(y-mu)/V) + coef, self.n_iter_ = \ + _irls_solver(coef=coef, X=Xnew, y=y, weights=weights, P2=P2, + family=family, link=link, max_iter=self.max_iter, + tol=self.tol) + + # 4.2 L-BFGS ########################################################## + elif solver == 'lbfgs': + def func(coef, X, y, weights, P2, family, link): + mu, dev, devp = \ + family._mu_deviance_derivative(coef, X, y, weights, link) if P2.ndim == 1: - gradient += P2*coef + L2 = P2 * coef else: - gradient += safe_sparse_dot(P2, coef) - if (np.max(np.abs(gradient)) <= self.tol): - converged = True - break - - if not converged: - warnings.warn("irls failed to converge. Increase the number " - "of iterations (currently {0})" - .format(self.max_iter), ConvergenceWarning) - - # 4.2 L-BFGS and Newton-CG ############################################ - # TODO: performance: make one function return both deviance and - # gradient of deviance - elif solver in ['lbfgs', 'newton-cg']: - def func(coef, *args): + L2 = P2 @ coef + obj = 0.5 * dev + 0.5 * (coef @ L2) + objp = 0.5 * devp + L2 + return obj, objp + + args = (Xnew, y, weights, P2, family, link) + coef, loss, info = fmin_l_bfgs_b( + func, coef, fprime=None, args=args, + iprint=(self.verbose > 0) - 1, pgtol=self.tol, + maxiter=self.max_iter) + if self.verbose > 0: + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." + " Increase the number of iterations.", + ConvergenceWarning) + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}" + .format(info["task"])) + self.n_iter_ = info['nit'] + + # 4.3 Newton-CG ####################################################### + # We use again the fisher matrix instead of the hessian. More + # precisely, expected hessian of deviance. 
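Both the lbfgs branch above and the newton-cg branch below minimize 0.5 * deviance + 0.5 * w @ P2 @ w. The following standalone sketch mirrors the lbfgs ``func`` (objective and gradient returned together) for a toy Poisson problem with log link; the data and the penalty are assumptions, not the estimator's code::

    import numpy as np
    from scipy import special
    from scipy.optimize import fmin_l_bfgs_b

    rng = np.random.RandomState(0)
    X = rng.rand(60, 3)
    y = rng.poisson(lam=np.exp(X @ np.array([0.2, -0.3, 0.5]))).astype(float)
    P2 = 0.5 * np.ones(3)                 # diagonal L2 penalty, already scaled by l2

    def func(coef, X, y, P2):
        # 0.5 * Poisson deviance (log link) + 0.5 * coef @ (P2 * coef), and gradient
        mu = np.exp(X @ coef)
        dev = 2 * np.sum(special.xlogy(y, y / mu) - y + mu)
        devp = -2 * (X.T @ (y - mu))      # chain rule for the log link
        L2 = P2 * coef
        return 0.5 * dev + 0.5 * (coef @ L2), 0.5 * devp + L2

    coef, obj, info = fmin_l_bfgs_b(func, np.zeros(3), args=(X, y, P2),
                                    pgtol=1e-10, factr=10)
    assert np.max(np.abs(func(coef, X, y, P2)[1])) < 1e-4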
+ elif solver == 'newton-cg': + def func(coef, X, y, weights, P2, family, link): if P2.ndim == 1: - L2 = safe_sparse_dot(coef.T, P2*coef) + L2 = coef @ (P2 * coef) else: - L2 = safe_sparse_dot(coef.T, safe_sparse_dot(P2, coef)) - # A[np.diag_indices_from(A)] += P2 - return 0.5*family._deviance(coef, *args) + 0.5*L2 + L2 = coef @ (P2 @ coef) + mu = link.inverse(X @ coef) + return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 - def fprime(coef, *args): + def grad(coef, X, y, weights, P2, family, link): if P2.ndim == 1: - L2 = P2*coef + L2 = P2 * coef else: - L2 = safe_sparse_dot(P2, coef) - return 0.5*family._deviance_derivative(coef, *args) + L2 + L2 = P2 @ coef + eta = X @ coef + mu = link.inverse(eta) + d1 = link.inverse_derivative(eta) + grad = X.T @ (d1 * family.deviance_derivative(y, mu, weights)) + return 0.5 * grad + L2 - def grad_hess(coef, X, y, weights, link): + def grad_hess(coef, X, y, weights, P2, family, link): if P2.ndim == 1: - L2 = P2*coef + L2 = P2 * coef else: - L2 = safe_sparse_dot(P2, coef) - grad = 0.5*family._deviance_derivative( - coef, X, y, weights, link) + L2 - hessian = 0.5*family._deviance_hessian( - coef, X, y, weights, link) - if P2.ndim == 1: - hessian[np.diag_indices_from(hessian)] += P2 - else: - hessian = hessian + P2 + L2 = P2 @ coef + eta = X @ coef + mu = link.inverse(eta) + d1 = link.inverse_derivative(eta) + grad = 0.5 * \ + (X.T @ (d1 * family.deviance_derivative(y, mu, weights))) \ + + L2 + # expected hessian = X.T @ diag_matrix @ X + # calculate only diag_matrix + diag = d1**2 / family.variance(mu, phi=1, weights=weights) def Hs(s): - ret = safe_sparse_dot(hessian, s) + ret = 0.5 * (X.T @ (diag * (X @ s))) + if P2.ndim == 1: + ret += P2 * s + else: + ret += P2 @ s return ret + return grad, Hs - args = (Xnew, y, weights, link) - - if solver == 'lbfgs': - coef, loss, info = fmin_l_bfgs_b( - func, coef, fprime=fprime, args=args, - iprint=(self.verbose > 0) - 1, pgtol=self.tol, - maxiter=self.max_iter) - if self.verbose > 0: - if info["warnflag"] == 1: - warnings.warn("lbfgs failed to converge." - " Increase the number of iterations.", - ConvergenceWarning) - elif info["warnflag"] == 2: - warnings.warn("lbfgs failed for the reason: {0}" - .format(info["task"])) - self.n_iter_ = info['nit'] - elif solver == 'newton-cg': - coef, n_iter_i = newton_cg(grad_hess, func, fprime, coef, - args=args, maxiter=self.max_iter, - tol=self.tol) - - # 4.3 coordinate descent ############################################## - # Reference: Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - # An Improved GLMNET for L1-regularized Logistic Regression, - # Journal of Machine Learning Research 13 (2012) 1999-2030 - # Note: Use Fisher matrix instead of Hessian for H - # - # 1. find optimal descent direction d by minimizing - # min_d F(w+d) = min_d F(w+d) - F(w) - # F = f + g, f(w) = 1/2 deviance, g(w) = 1/2 w*P2*w + ||P1*w||_1 - # 2. quadrdatic approximation of F(w+d)-F(w) = q(d): - # using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives - # q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d - # + ||P1*(w+d)||_1 - ||P1*w||_1 - # min_d q(d) - # 3. 
coordinate descent by updating coordinate j (d -> d+z*e_j): - # min_z q(d+z*e_j) - # = min_z q(d+z*e_j) - q(d) - # = min_z A_j z + 1/2 B_jj z^2 - # + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 - # A = f'(w) + d*H(w) + (w+d)*P2 - # B = H+P2 - # Note: we already set P2 = l2*P2, P1 = l1*P1, see above + args = (Xnew, y, weights, P2, family, link) + coef, n_iter_i = newton_cg(grad_hess, func, grad, coef, + args=args, maxiter=self.max_iter, + tol=self.tol) + + # 4.4 coordinate descent ############################################## + # Note: we already set P1 = l1*P1, see above + # Note: we already set P2 = l2*P2, see above # Note: we already symmetriezed P2 = 1/2 (P2 + P2') - # Note: f' = -score, H = Fisher matrix elif solver == 'cd': - # line search parameters - (beta, sigma) = (0.5, 0.01) - # max inner loops (cycles through all features) - max_inner_iter = 1000 - # some precalculations - eta, mu, score, fisher = family._eta_mu_score_fisher( - coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) - # set up space for search direction d for inner loop - d = np.zeros_like(coef) - # initial stopping tolerance of inner loop - # use L1-norm of minimum-norm of subgradient of F - # fp_wP2 = f'(w) + w*P2 - if P2.ndim == 1: - fp_wP2 = -score + coef*P2 - else: - fp_wP2 = -score + safe_sparse_dot(coef, P2) - inner_tol = (np.where(coef == 0, - np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), - fp_wP2+np.sign(coef)*P1)) - inner_tol = linalg.norm(inner_tol, ord=1) - # outer loop - while self.n_iter_ < self.max_iter: - self.n_iter_ += 1 - # initialize search direction d (to be optimized) with zero - d.fill(0) - # inner loop - # TODO: use sparsity (coefficient already 0 due to L1 penalty) - # => active set of features for featurelist, see paper - # of Improved GLMNET or Gap Safe Screening Rules - # https://arxiv.org/abs/1611.05780 - # A = f'(w) + d*H(w) + (w+d)*P2 - # B = H+P2 - # Note: f'=-score and H=fisher are updated at the end of outer - # iteration - B = fisher - if P2.ndim == 1: - coef_P2 = coef * P2 - B[np.diag_indices_from(B)] += P2 - else: - coef_P2 = safe_sparse_dot(coef, P2) - B = B + P2 - A = -score + coef_P2 # + d*(H+P2) but d=0 so far - inner_iter = 0 - while inner_iter < max_inner_iter: - inner_iter += 1 - if self.selection == 'random': - featurelist = random_state.permutation(n_features) - else: - featurelist = np.arange(n_features) - for j in featurelist: - # minimize_z: a z + 1/2 b z^2 + c |d+z| - # a = A_j - # b = B_jj > 0 - # c = |P1_j| = P1_j > 0, see 1.3 - # d = w_j + d_j - # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) - # with beta = z+d, beta_hat = d-a/b and gamma = c/b - # z = 1/b * S(bd-a,c) - d - # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding - a = A[j] - b = B[j, j] - if P1[j] == 0: - if b == 0: - z = 0 - else: - z = -a/b - elif a + P1[j] < b * (coef[j]+d[j]): - if b == 0: - z = 0 - else: - z = -(a + P1[j])/b - elif a - P1[j] > b * (coef[j]+d[j]): - if b == 0: - z = 0 - else: - z = -(a - P1[j])/b - else: - z = -(coef[j] + d[j]) - # update direction d - d[j] += z - # update A because d_j is now d_j+z - # A = f'(w) + d*H(w) + (w+d)*P2 - # => A += (H+P2)*e_j z = B_j * z - # Note: B is symmetric B = B.transpose - if sparse.issparse(B): - if sparse.isspmatrix_csc(B): - # slice columns - A += B[:, j].toarray().ravel() * z - else: - # slice rows - A += B[j, :].toarray().ravel() * z - else: - A += B[j, :] * z - # end of cycle - # stopping criterion for inner loop - # sum_i(|minimum-norm subgrad of q(d)_i|) - mn_subgrad = (np.where(coef + d == 0, - np.sign(A)*np.maximum(np.abs(A)-P1, 0), - A+np.sign(coef+d)*P1)) - mn_subgrad = linalg.norm(mn_subgrad, ord=1) - if mn_subgrad <= inner_tol: - if inner_iter == 1: - inner_tol = inner_tol/4. - break - # end of inner loop - # line search by sequence beta^k, k=0, 1, .. - # F(w + lambda d) - F(w) <= lambda * bound - # bound = sigma * (f'(w)*d + w*P2*d - # +||P1 (w+d)||_1 - ||P1 w||_1) - P1w_1 = linalg.norm(P1*coef, ord=1) - # Note: coef_P2 already calculated and still valid - bound = sigma * ( - safe_sparse_dot(-score, d) + - safe_sparse_dot(coef_P2, d) + - linalg.norm(P1*(coef+d), ord=1) - - P1w_1) - Fw = (0.5 * family.deviance(y, mu, weights) + - 0.5 * safe_sparse_dot(coef_P2, coef) + - P1w_1) - la = 1./beta - for k in range(20): - la *= beta # starts with la=1 - mu_wd = link.inverse(safe_sparse_dot(Xnew, coef+la*d, - dense_output=True)) - Fwd = (0.5 * family.deviance(y, mu_wd, weights) + - linalg.norm(P1*(coef+la*d), ord=1)) - if P2.ndim == 1: - Fwd += 0.5 * safe_sparse_dot((coef+la*d)*P2, coef+la*d) - else: - Fwd += 0.5 * (safe_sparse_dot(coef+la*d, - safe_sparse_dot(P2, coef+la*d))) - if Fwd-Fw <= sigma*la*bound: - break - # update coefficients - # coef_old = coef.copy() - coef += la * d - # calculate eta, mu, score, Fisher matrix for next iteration - eta, mu, score, fisher = family._eta_mu_score_fisher( - coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) - # stopping criterion for outer loop - # sum_i(|minimum-norm subgrad of F(w)_i|) - # fp_wP2 = f'(w) + w*P2 - # Note: eta, mu and score are already updated - if P2.ndim == 1: - fp_wP2 = -score + coef*P2 - else: - fp_wP2 = -score + safe_sparse_dot(coef, P2) - mn_subgrad = (np.where(coef == 0, - np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), - fp_wP2+np.sign(coef)*P1)) - mn_subgrad = linalg.norm(mn_subgrad, ord=1) - if mn_subgrad <= self.tol: - converged = True - break - # end of outer loop - if not converged: - warnings.warn("Coordinate descent failed to converge. 
Increase" - " the number of iterations (currently {0})" - .format(self.max_iter), ConvergenceWarning) + # For coordinate descent, if X is sparse, it should be csc format + # If X is sparse, P2 must also be csc + if sparse.issparse(Xnew): + Xnew = Xnew.tocsc(copy=self.copy_X) + P2 = sparse.csc_matrix(P2) + + coef, self.n_iter_, self._n_cycles = \ + _cd_solver(coef=coef, X=Xnew, y=y, weights=weights, P1=P1, + P2=P2, family=family, link=link, + max_iter=self.max_iter, tol=self.tol, + selection=self.selection, random_state=random_state, + diag_fisher=self.diag_fisher, copy_X=self.copy_X) ####################################################################### # 5. postprocessing # @@ -1828,8 +2052,7 @@ def linear_predictor(self, X): X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', copy=True, ensure_2d=True, allow_nd=False) - return safe_sparse_dot(X, self.coef_, - dense_output=True) + self.intercept_ + return X @ self.coef_ + self.intercept_ def predict(self, X, sample_weight=None): """Predict uing GLM with feature matrix X. @@ -1884,7 +2107,7 @@ def estimate_phi(self, X, y, sample_weight=None): dtype=_dtype, y_numeric=True, multi_output=False) n_samples, n_features = X.shape weights = _check_weights(sample_weight, n_samples) - eta = safe_sparse_dot(X, self.coef_, dense_output=True) + eta = X @ self.coef_ if self.fit_intercept is True: eta += self.intercept_ n_features += 1 @@ -1907,19 +2130,19 @@ def estimate_phi(self, X, y, sample_weight=None): # "AssertionError: -0.28014056555724598 not greater than 0.5" # unless GeneralizedLinearRegressor has a score which passes the test. def score(self, X, y, sample_weight=None): - r"""Compute D^2, the percentage of deviance explained. + """Compute D^2, the percentage of deviance explained. D^2 is a generalization of the coefficient of determination R^2. R^2 uses squared error and D^2 deviance. Note that those two are equal for family='normal'. D^2 is defined as - :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` - is the null deviance, i.e. the deviance of a model with intercept - alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean - :math:`\bar{y}` is averaged by sample_weight. - Best possible score is 1.0 and it can be negative (because the - model can be arbitrarily worse). + :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, + :math:`D_{null}` is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. + The mean :math:`\\bar{y}` is averaged by sample_weight. + Best possible score is 1.0 and it can be negative (because the model + can be arbitrarily worse). 
Parameters ---------- diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index de0857a34fe3a..ae8a5f4cfc5e7 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -138,16 +138,18 @@ def test_sample_weights_validation(): glm.fit(X, y, weights) -def test_glm_family_argument(): +@pytest.mark.parametrize('f, fam', + [('normal', NormalDistribution()), + ('poisson', PoissonDistribution()), + ('gamma', GammaDistribution()), + ('inverse.gaussian', InverseGaussianDistribution()), + ('binomial', BinomialDistribution())]) +def test_glm_family_argument(f, fam): """Test GLM family argument set as string.""" - y = np.array([1, 2]) + y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) - for (f, fam) in [('normal', NormalDistribution()), - ('poisson', PoissonDistribution()), - ('gamma', GammaDistribution()), - ('inverse.gaussian', InverseGaussianDistribution())]: - glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) - assert_equal(type(glm._family_instance), type(fam)) + glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) + assert_equal(type(glm._family_instance), type(fam)) glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) @@ -155,14 +157,16 @@ def test_glm_family_argument(): glm.fit(X, y) -def test_glm_link_argument(): +@pytest.mark.parametrize('l, link', + [('identity', IdentityLink()), + ('log', LogLink()), + ('logit', LogitLink())]) +def test_glm_link_argument(l, link): """Test GLM link argument set as string.""" - y = np.array([1, 2]) + y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) - for (l, link) in [('identity', IdentityLink()), - ('log', LogLink())]: - glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) - assert_equal(type(glm._link_instance), type(link)) + glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) + assert_equal(type(glm._link_instance), type(link)) glm = GeneralizedLinearRegressor(family='normal', link='not a link') with pytest.raises(ValueError): @@ -317,6 +321,16 @@ def test_glm_random_state_argument(random_state): glm.fit(X, y) +@pytest.mark.parametrize('diag_fisher', ['not bool', 1, 0, [True]]) +def test_glm_diag_fisher_argument(diag_fisher): + """Test GLM for invalid diag_fisher arguments.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) + with pytest.raises(ValueError): + glm.fit(X, y) + + @pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) def test_glm_copy_X_argument(copy_X): """Test GLM for invalid copy_X arguments.""" @@ -453,7 +467,12 @@ def test_normal_ridge(solver): assert_array_almost_equal(glm.predict(T), ridge.predict(T)) -def test_poisson_ridge(): +@pytest.mark.parametrize('solver, decimal, tol', + [('irls', 7, 1e-8), + ('lbfgs', 5, 1e-7), + ('newton-cg', 5, 1e-7), + ('cd', 7, 1e-8)]) +def test_poisson_ridge(solver, decimal, tol): """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" @@ -470,22 +489,20 @@ def test_poisson_ridge(): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 5, 'cd': 7} - s_tol = {'irls': 1e-8, 'lbfgs': 1e-7, 'newton-cg': 1e-7, 'cd': 1e-8} - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, - fit_intercept=True, family='poisson', - link='log', tol=s_tol[solver], - solver=solver, max_iter=300, - random_state=42) - glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.12889386979, - decimal=s_dec[solver]) - assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], - decimal=s_dec[solver]) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, + fit_intercept=True, family='poisson', + link='log', tol=tol, + solver=solver, max_iter=300, + random_state=42) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, + decimal=decimal) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=decimal) -def test_normal_enet(): +@pytest.mark.parametrize('diag_fisher', [False, True]) +def test_normal_enet(diag_fisher): """Test elastic net regression with normal/gaussian family.""" rng = np.random.RandomState(0) alpha, l1_ratio = 0.3, 0.7 @@ -494,12 +511,14 @@ def test_normal_enet(): beta = rng.randn(n_features) y = 2 + np.dot(X, beta) + rng.randn(n_samples) + # 1. test normal enet on dense data glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, family='normal', link='identity', fit_intercept=True, tol=1e-8, max_iter=100, selection='cyclic', solver='cd', start_params='zero', - check_input=False) + check_input=False, + diag_fisher=diag_fisher) glm.fit(X, y) enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, @@ -509,6 +528,12 @@ def test_normal_enet(): assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + # 2. test normal enet on sparse data + X = sparse.csc_matrix(X) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + def test_poisson_enet(): """Test elastic net regression with poisson family and LogLink. From a6f9f13db9dd11d01d1dcab93819e67d5ca18b9d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 20 Apr 2019 15:59:39 +0200 Subject: [PATCH 050/209] Treat the intercept separately, i.e. X, P1, P2 never include intercept --- sklearn/linear_model/glm.py | 637 +++++++++++++++---------- sklearn/linear_model/tests/test_glm.py | 6 +- 2 files changed, 386 insertions(+), 257 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index b2de866a4b69d..2afd1ddf8c79c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -7,8 +7,6 @@ # License: BSD 3 clause # TODO: Write more examples. -# TODO: Make option self.copy_X more meaningful. -# So far, fit uses Xnew instead of X. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support? @@ -39,6 +37,7 @@ # sklearn.linear_models uses w for coefficients, standard literature on # GLMs use beta for coefficients and w for (sample) weights. # So far, coefficients=w and sample weights=s. +# - The intercept term is the first index, i.e. 
coef[0] from __future__ import division @@ -86,6 +85,83 @@ def _check_weights(sample_weight, n_samples): return weights +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +def _safe_sandwich_dot(X, d, intercept=False): + """Compute sandwich product X.T @ diag(d) @ X. + + With ``intercept=True``, X is treated as if a column of 1 were appended as + first column of X. + X can be sparse, d must be an ndarray. Always returns a ndarray.""" + if sparse.issparse(X): + temp = (X.transpose().multiply(d) @ X).toarray() + else: + temp = (X.T * d) @ X + if intercept: + dim = X.shape[1] + 1 + if sparse.issparse(X): + order = 'F' if sparse.isspmatrix_csc(X) else 'C' + else: + order = 'F' if X.flags['F_CONTIGUOUS'] else 'C' + res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order) + res[0, 0] = d.sum() + res[1:, 0] = d @ X + res[0, 1:] = res[1:, 0] + res[1:, 1:] = temp + else: + res = temp + return res + + +def _min_norm_sugrad(coef, grad, P2, P1): + """Compute the gradient of all subgradients with minimal L2-norm. + + subgrad = grad + P2 * coef + P1 * subgrad(|coef|_1) + + g_i = grad_i + (P2*coef)_i + + if coef_i > 0: g_i + P1_i + if coef_i < 0: g_i - P1_i + if coef_i = 0: sign(g_i) * max(|g_i|-P1_i, 0) + + Parameters + ---------- + coef : ndarray + coef[0] may be intercept. + + grad : ndarray, shape=coef.shape + + P2 : {1d or 2d array, None} + always without intercept, ``None`` means P2 = 0 + + P1 : ndarray + always without intercept + """ + intercept = (coef.size == P1.size + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + # compute grad + coef @ P2 without intercept + grad_wP2 = grad[idx:].copy() + if P2 is None: + pass + elif P2.ndim == 1: + grad_wP2 += coef[idx:] * P2 + else: + grad_wP2 += coef[idx:] @ P2 + res = np.where(coef[idx:] == 0, + np.sign(grad_wP2) * np.maximum(np.abs(grad_wP2) - P1, 0), + grad_wP2 + np.sign(coef[idx:]) * P1) + if intercept: + return np.concatenate(([grad[0]], res)) + else: + return res + + class Link(metaclass=ABCMeta): """Abstract base class for Link funtions.""" @@ -473,13 +549,16 @@ def starting_mu(self, y, weights=1, ind_weight=0.5): (1. - ind_weight) * np.average(y, weights=weights)) def _mu_deviance_derivative(self, coef, X, y, weights, link): - """Compute mu, the deviance and it's derivative w.r.t coef.""" - lin_pred = X @ coef + """Compute mu and the derivative of the deviance w.r.t coef.""" + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) - dev = self.deviance(y, mu, weights) d1 = link.inverse_derivative(lin_pred) - devp = X.T @ (d1 * self.deviance_derivative(y, mu, weights)) - return mu, dev, devp + temp = d1 * self.deviance_derivative(y, mu, weights) + if coef.size == X.shape[1] + 1: + devp = np.concatenate(([temp.sum()], temp @ X)) + else: + devp = temp @ X # sampe as X.T @ temp + return mu, devp def _score(self, coef, phi, X, y, weights, link): r"""Compute the score function. @@ -499,12 +578,15 @@ def _score(self, coef, phi, X, y, weights, link): :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. Note: The derivative of the deviance w.r.t. coef equals -2 * score. 
""" - lin_pred = X @ coef + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d = link.inverse_derivative(lin_pred) temp = sigma_inv * d * (y - mu) - score = X.T @ temp + if coef.size == X.shape[1] + 1: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as X.T @ temp return score def _fisher_matrix(self, coef, phi, X, y, weights, link): @@ -526,14 +608,14 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, see func:`_score`. """ - n_samples = X.shape[0] - lin_pred = X @ coef + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) - d2 = link.inverse_derivative(lin_pred)**2 - d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), - shape=(n_samples, n_samples)) - fisher_matrix = X.T @ d2_sigma_inv @ X + d = link.inverse_derivative(lin_pred) + d2_sigma_inv = sigma_inv * d * d + intercept = (coef.size == X.shape[1] + 1) + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): @@ -559,17 +641,17 @@ def _observed_information(self, coef, phi, X, y, weights, link): \right)`, see :func:`score_` function and :func:`_fisher_matrix`. """ - n_samples = X.shape[0] - lin_pred = X @ coef + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) dp = link.inverse_derivative2(lin_pred) d2 = link.inverse_derivative(lin_pred)**2 v = self.unit_variance_derivative(mu)/self.unit_variance(mu) r = y - mu - temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), - shape=(n_samples, n_samples)) - observed_information = X.T @ temp @ X + temp = sigma_inv * (-dp * r + d2 * v * r + d2) + intercept = (coef.size == X.shape[1] + 1) + observed_information = _safe_sandwich_dot(X, temp, + intercept=intercept) return observed_information def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, @@ -600,28 +682,29 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, an array of shape (X.shape[1], X.shape[1]) * If diag_fisher is ``True`, an array of shape (X.shape[0]) """ - n_samples, n_features = X.shape + intercept = (coef.size == X.shape[1] + 1) # eta = linear predictor - eta = X @ coef + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) d1 = link.inverse_derivative(eta) # = h'(eta) # Alternatively: # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g # d1 = 1./link.derivative(mu) - score = X.T @ (sigma_inv * d1 * (y - mu)) - # - d2_sigma_inv = sigma_inv * (d1**2) + d1_sigma_inv = d1 * sigma_inv + temp = d1_sigma_inv * (y - mu) + if intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X + + d2_sigma_inv = d1 * d1_sigma_inv if diag_fisher: - return eta, mu, score, d2_sigma_inv + fisher_matrix = d2_sigma_inv else: - if sparse.issparse(X): - d2_sigma_inv = sparse.dia_matrix((d2_sigma_inv, 0), - shape=(n_samples, n_samples)) - fisher = (X.T @ d2_sigma_inv @ X).toarray() - else: - fisher = (X.T * d2_sigma_inv) @ X - return eta, mu, score, fisher + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) + return eta, mu, score, fisher_matrix class TweedieDistribution(ExponentialDispersionModel): @@ -809,7 +892,7 @@ def unit_deviance(self, y, mu): return 2 * (special.xlogy(y, y/mu) 
+ special.xlogy(1-y, (1-y)/(1-mu))) -def _irls_step(X, W, P2, z): +def _irls_step(X, W, P2, z, fit_intercept=True): """Compute one step in iteratively reweighted least squares. Solve A w = b for w with @@ -829,43 +912,57 @@ def _irls_step(X, W, P2, z): P2 : {ndarray, sparse matrix}, shape (n_features, n_features) The L2-penalty matrix or vector (=diagonal matrix) - z : ndarray, shape (n_samples,) + z : ndarray, shape (n_samples,) Working observations + fit_intercept : boolean, optional (default=True) + Returns ------- - coef: ndarray, shape (X.shape[1]) + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. """ # Note: solve vs least squares, what is more appropriate? # scipy.linalg.solve seems faster, but scipy.linalg.lstsq # is more robust. - n_samples, n_features = X.shape - if sparse.issparse(X): - W = sparse.dia_matrix((W, 0), shape=(n_samples, n_samples)).tocsr() + # Note: X.T @ W @ X is not sparse, even when X is sparse. + # Sparse solver would splinalg.spsolve(A, b) or splinalg.lsmr(A, b) + if fit_intercept: + Wz = W * z + if sparse.issparse(X): + b = np.concatenate(([Wz.sum()], X.transpose() @ Wz)) + else: + b = np.concatenate(([Wz.sum()], X.T @ Wz)) + A = _safe_sandwich_dot(X, W, intercept=fit_intercept) if P2.ndim == 1: - L2 = (sparse.dia_matrix((P2, 0), shape=(n_features, n_features)) - ).tocsr() + idx = np.arange(start=1, stop=A.shape[0]) + A[(idx, idx)] += P2 # add to diag elements without intercept + elif sparse.issparse(P2): + A[1:, 1:] += P2.toarray() else: - L2 = sparse.csr_matrix(P2) - XtW = X.transpose() * W - A = XtW * X + L2 - b = XtW * z - # coef = splinalg.spsolve(A, b) - coef, *_ = splinalg.lsmr(A, b) + A[1:, 1:] += P2 else: - XtW = (X.T * W) - A = XtW.dot(X) + if sparse.issparse(X): + XtW = X.transpose().multiply(W) + A = (XtW @ X).toarray() + else: + XtW = (X.T * W) + A = XtW @ X + b = XtW @ z if P2.ndim == 1: A[np.diag_indices_from(A)] += P2 + elif sparse.issparse(P2): + A += P2.toarray() else: A += P2 - b = XtW.dot(z) - # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) - coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) + # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) + coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) return coef -def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): +def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, + max_iter, tol): """Solve GLM with L2 penalty by IRLS algorithm. Note: If X is sparse, P2 must also be sparse. @@ -889,7 +986,7 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): # Note: ' denotes derivative, but also transpose for matrices # eta = linear predictor - eta = X @ coef + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) # D = h'(eta) hp = link.inverse_derivative(eta) @@ -906,10 +1003,10 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): z = eta + (y - mu) / hp # solve A*coef = b # A = X' W X + P2, b = X' W z - coef = _irls_step(X, W, P2, z) + coef = _irls_step(X, W, P2, z, fit_intercept=fit_intercept) # updated linear predictor # do it here for updated values for tolerance - eta = X @ coef + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) hp = link.inverse_derivative(eta) V = family.variance(mu, phi=1, weights=weights) @@ -917,11 +1014,18 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): # which tolerace? |coef - coef_old| or gradient? 
# use gradient for compliance with newton-cg and lbfgs # gradient = -X' D (y-mu)/V(mu) + l2 P2 w - gradient = -(X.T @ (hp*(y-mu)/V)) + temp = hp * (y - mu) / V + if sparse.issparse(X): + gradient = -(X.transpose() @ temp) + else: + gradient = -(X.T @ temp) + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - gradient += P2*coef + gradient += P2 * coef[idx:] else: - gradient += P2 @ coef + gradient += P2 @ coef[idx:] + if fit_intercept: + gradient = np.concatenate(([-temp.sum()], gradient)) if (np.max(np.abs(gradient)) <= tol): converged = True break @@ -937,7 +1041,7 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, max_inner_iter=1000, selection='cyclic', random_state=None, diag_fisher=False): - """Compute inner loop of coordinate descent = cycles through features. + """Compute inner loop of coordinate descent, i.e. cycles through features. Minimization of 1-d subproblems:: @@ -953,24 +1057,31 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, # of Improved GLMNET or Gap Safe Screening Rules # https://arxiv.org/abs/1611.05780 n_samples, n_features = X.shape + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept B = fisher if P2.ndim == 1: - coef_P2 = coef * P2 + coef_P2 = coef[idx:] * P2 if not diag_fisher: - B[np.diag_indices_from(B)] += P2 + idiag = np.arange(start=idx, stop=B.shape[0]) + # B[np.diag_indices_from(B)] += P2 + B[(idiag, idiag)] += P2 else: - coef_P2 = P2 @ coef # P2 is symmetric, mat @ vec is usually faster + coef_P2 = coef[idx:] @ P2 if not diag_fisher: if sparse.issparse(P2): - B += P2.toarray() + B[idx:, idx:] += P2.toarray() else: - B += P2 - A = -score + coef_P2 # + d @ (H+P2) but d=0 so far + B[idx:, idx:] += P2 + # A = -score + coef_P2 + A = -score + A[idx:] += coef_P2 + # A += d @ (H+P2) but so far d=0 # inner loop - inner_iter = 0 - while inner_iter < max_inner_iter: + for inner_iter in range(1, max_inner_iter+1): inner_iter += 1 n_cycles += 1 + # cycle through features, update intercept separately at the end if selection == 'random': featurelist = random_state.permutation(n_features) else: @@ -985,70 +1096,85 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, # with beta = z+d, beta_hat = d-a/b and gamma = c/b # z = 1/b * S(bd-a,c) - d # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding - a = A[j] + jdx = j+idx # index for arrays containing entries for intercept + a = A[jdx] if diag_fisher: + # Note: fisher is ndarray of shape (n_samples,) => no idx + # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway + Bj = np.zeros_like(A) + if intercept: + Bj[0] = fisher.sum() if sparse.issparse(X): - xj = X[:, j] - b = xj.transpose() @ xj.multiply(fisher[:, np.newaxis]) - b = b[0, 0] + Bj[idx:] = (X[:, j].transpose().multiply(fisher) @ X + ).toarray().ravel() else: - b = X[:, j] @ (fisher * X[:, j]) + Bj[idx:] = (fisher * X[:, j]) @ X if P2.ndim == 1: - b += P2[j] + Bj[idx:] += P2[j] else: - b += P2[j, j] + if sparse.issparse(P2): + # slice columns as P2 is csc + Bj[idx:] += P2[:, j].toarray().ravel() + else: + Bj[idx:] += P2[:, j] + b = Bj[jdx] else: - b = B[j, j] + b = B[jdx, jdx] + # those ten lines aree what it is all about if b <= 0: z = 0 elif P1[j] == 0: z = -a/b - elif a + P1[j] < b * (coef[j] + d[j]): + elif a + P1[j] < b * (coef[jdx] + d[jdx]): z = -(a + P1[j])/b - elif a - P1[j] > b * (coef[j] + d[j]): + elif a - P1[j] > b * 
(coef[jdx] + d[jdx]): z = -(a - P1[j])/b else: - z = -(coef[j] + d[j]) + z = -(coef[jdx] + d[jdx]) # update direction d - d[j] += z + d[jdx] += z # update A because d_j is now d_j+z # A = f'(w) + d*H(w) + (w+d)*P2 # => A += (H+P2)*e_j z = B_j * z # Note: B is symmetric B = B.transpose if diag_fisher: - if sparse.issparse(X): - A += (X.transpose() @ - X[:, j].multiply(fisher[:, np.newaxis]) - ).toarray().ravel() * z - else: - # A += (X.T @ (fisher * X[:, j])) * z - # same without transpose of X - A += ((fisher * X[:, j]) @ X) * z - - if P2.ndim == 1: - A[j] += P2[j] * z - elif sparse.issparse(P2): - # slice columns as P2 is csc - A += P2[:, j].toarray().ravel() * z - else: - A += P2[:, j] * z + # Bj = B[:, j] calculated above, still valid + A += Bj * z else: # B is symmetric, C- or F-contiguous, but never sparse if B.flags['F_CONTIGUOUS']: # slice columns like for sparse csc - A += B[:, j] * z + A += B[:, jdx] * z else: # B.flags['C_CONTIGUOUS'] might be true # slice rows - A += B[j, :] * z - # end of cycle + A += B[jdx, :] * z + # end of cycle over features + # update intercept + if intercept: + if diag_fisher: + Bj = np.zeros_like(A) + Bj[0] = fisher.sum() + Bj[1:] = fisher @ X + b = Bj[0] + else: + b = B[0, 0] + z = 0 if b <= 0 else -A[0]/b + d[0] += z + if diag_fisher: + A += Bj * z + else: + if B.flags['F_CONTIGUOUS']: + A += B[:, 0] * z + else: + A += B[0, :] * z + # end of complete cycle # stopping criterion for inner loop # sum_i(|minimum of norm of subgrad of q(d)_i|) - mn_subgrad = np.where(coef + d == 0, - np.sign(A) * np.maximum(np.abs(A) - P1, 0), - A + np.sign(coef + d) * P1) + # subgrad q(d) = A + subgrad ||P1*(w+d)||_1 + mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1) mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= inner_tol: if inner_iter == 1: @@ -1058,7 +1184,7 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, return d, coef_P2, n_cycles, inner_tol -def _cd_solver(coef, X, y, weights, P1, P2, family, link, +def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, max_iter=100, max_inner_iter=1000, tol=1e-4, selection='cyclic ', random_state=None, diag_fisher=False, copy_X=True): @@ -1083,7 +1209,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 A = f'(w) + d*H(w) + (w+d)*P2 - B = H+P2 + B = H + P2 Repeat steps 1-3 until convergence. Note: Use Fisher matrix instead of Hessian for H. @@ -1091,7 +1217,9 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, Parameters ---------- - coef: ndarray, shape (n_features,) + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present). If not sparse, @@ -1112,6 +1240,10 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, The L2-penalty matrix or vector (=diagonal matrix). If a matrix is passed, it must be symmetric. If X is sparse, P2 must also be sparse. + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + family : ExponentialDispersionModel link : Link @@ -1120,8 +1252,8 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, Maximum numer of outer (Newton) iterations. max_inner_iter : int, optional (default=1000) - Maximum number of iterations, i.e. 
cycles over all features, in inner - loop. + Maximum number of iterations in each inner loop, i.e. max number of + cycles over all features per inner loop. tol : float, optional (default=1e-4) Covergence criterion is @@ -1133,8 +1265,8 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, random_state : {int, RandomState instance, None}, optional (default=None) diag_fisher : boolean, optional (default=False) - 'False' calculates full fisher matrix, 'True' only diagonal matrix s.t. - fisher = X.T @ diag @ X. This saves storage but needs more + ``False`` calculates full fisher matrix, ``True`` only diagonal matrix + s.t. fisher = X.T @ diag @ X. This saves storage but needs more matrix-vector multiplications. copy_X : boolean, optional (default=True) @@ -1142,7 +1274,9 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, Returns ------- - coef : ndarray, shape (n_features,) + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. n_iter : numer of outer iterations = newton iterations @@ -1174,6 +1308,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, n_cycles = 0 # number of (complete) cycles over features converged = False n_samples, n_features = X.shape + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept # line search parameters (beta, sigma) = (0.5, 0.01) # some precalculations @@ -1186,16 +1321,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, d = np.zeros_like(coef) # initial stopping tolerance of inner loop # use L1-norm of minimum of norm of subgradient of F - # fp_wP2 = f'(w) + w*P2 - if P2.ndim == 1: - fp_wP2 = -score + coef * P2 - else: - # Note: P2 is symmetric and matrix @ vector is faster for sparse - # matrices. - fp_wP2 = -score + P2 @ coef - inner_tol = np.where(coef == 0, - np.sign(fp_wP2) * np.maximum(np.abs(fp_wP2) - P1, 0), - fp_wP2 + np.sign(coef) * P1) + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) inner_tol = linalg.norm(inner_tol, ord=1) # outer loop while n_iter < max_iter: @@ -1211,23 +1337,23 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, # F(w + lambda d) - F(w) <= lambda * bound # bound = sigma * (f'(w)*d + w*P2*d # +||P1 (w+d)||_1 - ||P1 w||_1) - P1w_1 = linalg.norm(P1 * coef, ord=1) + P1w_1 = linalg.norm(P1 * coef[idx:], ord=1) + P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1) # Note: coef_P2 already calculated and still valid - bound = sigma * (-(score @ d) + coef_P2 @ d + - linalg.norm(P1 * (coef + d), ord=1) - P1w_1) + bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1) Fw = (0.5 * family.deviance(y, mu, weights) + - 0.5 * (coef_P2 @ coef) + P1w_1) + 0.5 * (coef_P2 @ coef[idx:]) + P1w_1) la = 1./beta for k in range(20): la *= beta # starts with la=1 coef_wd = coef + la * d - mu_wd = link.inverse(X @ coef_wd) + mu_wd = link.inverse(_safe_lin_pred(X, coef_wd)) Fwd = (0.5 * family.deviance(y, mu_wd, weights) + - linalg.norm(P1 * coef_wd, ord=1)) + linalg.norm(P1 * coef_wd[idx:], ord=1)) if P2.ndim == 1: - Fwd += 0.5 * ((coef_wd * P2) @ coef_wd) + Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:]) else: - Fwd += 0.5 * (coef_wd @ (P2 @ coef_wd)) + Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:])) if Fwd - Fw <= sigma * la * bound: break # update coefficients @@ -1238,16 +1364,10 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, coef=coef, phi=1, X=X, y=y, weights=weights, link=link, diag_fisher=diag_fisher) # stopping criterion for outer loop - # sum_i(|minimum of norm of 
subgrad of F(w)_i|) + # sum_i(|minimum-norm of subgrad of F(w)_i|) # fp_wP2 = f'(w) + w*P2 # Note: eta, mu and score are already updated - if P2.ndim == 1: - fp_wP2 = -score + coef * P2 - else: - fp_wP2 = -score + P2 @ coef # P2 is symmetric, mat @ vec is faster - mn_subgrad = np.where(coef == 0, - np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), - fp_wP2 + np.sign(coef) * P1) + mn_subgrad = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= tol: converged = True @@ -1255,8 +1375,8 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, # end of outer loop if not converged: warnings.warn("Coordinate descent failed to converge. Increase" - " the number of iterations (currently {0})" - .format(max_iter), ConvergenceWarning) + " the maximum number of iterations max_iter" + " (currently {0})".format(max_iter), ConvergenceWarning) return coef, n_iter, n_cycles @@ -1387,8 +1507,9 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where g_i is the i-th component of the gradient (derivative) of the objective function. For the cd solver, covergence is reached - when ``sum_i(|minimum of norm of g_i|)``, where g_i is the - subgradient of the objective. + when ``sum_i(|minimum-norm of g_i|)``, where g_i is the + subgradient of the objective and minimum-norm of g_i is the element of + the subgradient g_i with the smallest L2-norm. warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` @@ -1563,19 +1684,7 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # 1. input validation # ####################################################################### - # 1.1 validate arguments of fit ####################################### - _dtype = [np.float64, np.float32] - X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], - dtype=_dtype, y_numeric=True, multi_output=False, - copy=self.copy_X) - # Without converting y to float, deviance might raise - # ValueError: Integers to negative integer powers are not allowed. - # Also, y must not be sparse. - y = np.asarray(y, dtype=np.float64) - - weights = _check_weights(sample_weight, y.shape[0]) - - # 1.2 validate arguments of __init__ ################################## + # 1.1 validate arguments of __init__ ################################## # Guarantee that self._family_instance is an instance of class # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): @@ -1668,25 +1777,6 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.warm_start, bool): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) - start_params = self.start_params - if isinstance(start_params, str): - if start_params not in ['irls', 'least_squares', 'zero']: - raise ValueError("The argument start_params must be 'irls', " - "'least-squares', 'zero' or an array of " - " correct length;" - " got(start_params={0})".format(start_params)) - else: - start_params = check_array(start_params, accept_sparse=False, - force_all_finite=True, ensure_2d=False, - dtype=_dtype, copy=True) - if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or - (start_params.ndim != 1)): - raise ValueError("Start values for parameters must have the" - "right length and dimension; required (length" - "={0}, ndim=1); got (length={1}, ndim={2})." 
- .format(X.shape[1] + self.fit_intercept, - start_params.shape[0], - start_params.ndim)) if self.selection not in ['cyclic', 'random']: raise ValueError("The argument selection must be 'cyclic' or " "'random'; got (selection={0})" @@ -1702,36 +1792,59 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument check_input must be bool; got " "(check_input={0})".format(self.check_input)) + family = self._family_instance + link = self._link_instance + + # 1.2 validate arguments of fit ####################################### + _dtype = [np.float64, np.float32] + if solver == 'cd': + _stype = ['csc'] + else: + _stype = ['csc', 'csr'] + X, y = check_X_y(X, y, accept_sparse=_stype, + dtype=_dtype, y_numeric=True, multi_output=False, + copy=self.copy_X) + # Without converting y to float, deviance might raise + # ValueError: Integers to negative integer powers are not allowed. + # Also, y must not be sparse. + y = np.asarray(y, dtype=np.float64) + + weights = _check_weights(sample_weight, y.shape[0]) + + n_samples, n_features = X.shape + + # 1.3 arguments to take special care ################################## + # P1, P2, start_params if isinstance(self.P1, str) and self.P1 == 'identity': - P1 = np.ones(X.shape[1]) + P1 = np.ones(n_features) else: P1 = np.atleast_1d(self.P1) try: - P1 = P1.astype(np.float64, casting='safe', copy=True) + P1 = P1.astype(np.float64, casting='safe', copy=False) except TypeError: raise TypeError("The given P1 cannot be converted to a numeric" "array; got (P1.dtype={0})." .format(P1.dtype)) - if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): + if (P1.ndim != 1) or (P1.shape[0] != n_features): raise ValueError("P1 must be either 'identity' or a 1d array " "with the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." - .format(P1.shape[0], X.shape[1])) + .format(P1.shape[0], n_features)) # If X is sparse, make P2 sparse, too. 
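        # [editor's note, illustrative sketch -- not part of this patch]
        # P1 is a per-feature vector of L1 weights and P2 an L2 penalty
        # matrix (a 1d P2 acts as a diagonal matrix), both defined without
        # the intercept.  After the rescaling by alpha and l1_ratio further
        # below, the penalty term of the objective looks like (cf. the obj()
        # helper in the poisson enet test):
        #
        #     import numpy as np
        #     def penalty(w, P1, P2):
        #         """Hypothetical helper: L1/L2 penalty of coefficients w."""
        #         l1 = np.sum(P1 * np.abs(w))
        #         l2 = 0.5 * (w @ (P2 @ w) if P2.ndim == 2 else w @ (P2 * w))
        #         return l1 + l2
        #
        # With P1='identity' and P2='identity' this reduces to the usual
        # elastic net penalty
        #     alpha * l1_ratio * ||w||_1 + 0.5 * alpha * (1-l1_ratio) * ||w||_2^2.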
if isinstance(self.P2, str) and self.P2 == 'identity': if sparse.issparse(X): - P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), - shape=(X.shape[1], X.shape[1]))).tocsr() + P2 = (sparse.dia_matrix((np.ones(n_features), 0), + shape=(n_features, n_features))).tocsc() else: - P2 = np.ones(X.shape[1]) + P2 = np.ones(n_features) else: P2 = check_array(self.P2, copy=True, - accept_sparse=['csr', 'csc'], + accept_sparse=_stype, dtype=_dtype, ensure_2d=False) if P2.ndim == 1: P2 = np.asarray(P2) - if P2.shape[0] != X.shape[1]: + if P2.shape[0] != n_features: raise ValueError("P2 should be a 1d array of shape " "(n_features,) with " "n_features=X.shape[1]; " @@ -1739,12 +1852,12 @@ def fit(self, X, y, sample_weight=None): .format(P2.shape[0], X.shape[1])) if sparse.issparse(X): P2 = (sparse.dia_matrix((P2, 0), - shape=(X.shape[1], X.shape[1]))).tocsr() + shape=(n_features, n_features))).tocsc() elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and P2.shape[0] == X.shape[1]): if sparse.issparse(X): P2 = (sparse.dia_matrix((P2, 0), - shape=(X.shape[1], X.shape[1]))).tocsr() + shape=(n_features, n_features))).tocsc() else: raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " @@ -1752,37 +1865,31 @@ def fit(self, X, y, sample_weight=None): "got (P2.shape=({0}, {1})), needed ({2}, {2})" .format(P2.shape[0], P2.shape[1], X.shape[1])) - family = self._family_instance - link = self._link_instance - - if self.fit_intercept: - # Note: intercept is first column <=> coef[0] is for intecept - if sparse.issparse(X): - Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X], - format=X.format) - else: - Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) - P1 = np.concatenate((np.array([0]), P1)) - if P2.ndim == 1: - P2 = np.concatenate((np.array([0]), P2)) - elif sparse.issparse(P2): - P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), - format=P2.format, - dtype=P2.dtype).tocsr() - else: - # as of numpy 1.13 this would work: - # P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], - # [np.zeros((X.shape[1], 1)), P2]]) - P2 = np.hstack((np.zeros((X.shape[1], 1)), P2)) - P2 = np.vstack((np.zeros((1, X.shape[1]+1)), P2)) + start_params = self.start_params + if isinstance(start_params, str): + if start_params not in ['irls', 'least_squares', 'zero']: + raise ValueError("The argument start_params must be 'irls', " + "'least-squares', 'zero' or an array of " + " correct length;" + " got(start_params={0})".format(start_params)) else: - Xnew = X + start_params = check_array(start_params, accept_sparse=False, + force_all_finite=True, ensure_2d=False, + dtype=_dtype, copy=True) + if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or + (start_params.ndim != 1)): + raise ValueError("Start values for parameters must have the" + "right length and dimension; required (length" + "={0}, ndim=1); got (length={1}, ndim={2})." + .format(X.shape[1] + self.fit_intercept, + start_params.shape[0], + start_params.ndim)) - n_samples, n_features = Xnew.shape l1 = self.alpha * self.l1_ratio l2 = self.alpha * (1 - self.l1_ratio) - P1 *= l1 - P2 *= l2 + # P1 and P2 are now for sure copies + P1 = l1 * P1 + P2 = l2 * P2 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') # reason: w' P2 w = (w' P2 w)', i.e. 
it is symmetric if P2.ndim == 2: @@ -1791,14 +1898,12 @@ def fit(self, X, y, sample_weight=None): else: P2 = 0.5 * (P2 + P2.T) - # 1.3 additional validations ########################################## + # 1.4 additional validations ########################################## if self.check_input: if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" .format(family.__class__.__name__)) - if not np.all(weights >= 0): - raise ValueError("Sample weights must be non-negative.") # check if P1 has only non-negative values, negative values might # indicate group lasso in the future. if not isinstance(self.P1, str): # if self.P1 != 'identity': @@ -1830,7 +1935,7 @@ def fit(self, X, y, sample_weight=None): else: if not np.all(linalg.eigvalsh(P2) >= epsneg): raise ValueError("P2 must be positive semi-definite.") - # TODO: if alpha=0 check that Xnew is not rank deficient + # TODO: if alpha=0 check that X is not rank deficient # TODO: what else to check? ####################################################################### @@ -1874,13 +1979,14 @@ def fit(self, X, y, sample_weight=None): z = eta + (y-mu)/hp # solve A*coef = b # A = X' W X + l2 P2, b = X' W z - coef = _irls_step(Xnew, W, P2, z) + coef = _irls_step(X, W, P2, z, + fit_intercept=self.fit_intercept) elif start_params == 'least_squares': # less restrictive tolerance for finding start values tol = np.max([self.tol, np.sqrt(self.tol)]) if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) - reg.fit(Xnew, link.link(y)) + reg.fit(X, link.link(y)) coef = reg.coef_ elif self.l1_ratio <= 0.01: # ElasticNet says l1_ratio <= 0.01 is not reliable @@ -1888,19 +1994,21 @@ def fit(self, X, y, sample_weight=None): # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 reg = Ridge(copy_X=True, fit_intercept=False, alpha=self.alpha*n_samples, tol=tol) - reg.fit(Xnew, link.link(y)) + reg.fit(X, link.link(y)) coef = reg.coef_ else: # TODO: Does this make sense at all? 
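                    # [editor's note] This branch fits a plain elastic net on
                    # the link-transformed targets link.link(y), i.e. a rough
                    # least-squares-type fit in link space.  It ignores the
                    # GLM variance structure, hence the doubt in the TODO.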
reg = ElasticNet(copy_X=True, fit_intercept=False, alpha=self.alpha, l1_ratio=self.l1_ratio, tol=tol) - reg.fit(Xnew, link.link(y)) + reg.fit(X, link.link(y)) coef = reg.coef_ else: # start_params == 'zero' - coef = np.zeros(n_features) if self.fit_intercept: + coef = np.zeros(n_features+1) coef[0] = link.link(np.average(y, weights=weights)) + else: + coef = np.zeros(n_features) else: # assign given array as start values coef = start_params @@ -1915,24 +2023,28 @@ def fit(self, X, y, sample_weight=None): # Note: we already symmetriezed P2 = 1/2 (P2 + P2') if solver == 'irls': coef, self.n_iter_ = \ - _irls_solver(coef=coef, X=Xnew, y=y, weights=weights, P2=P2, - family=family, link=link, max_iter=self.max_iter, - tol=self.tol) + _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2, + fit_intercept=self.fit_intercept, family=family, + link=link, max_iter=self.max_iter, tol=self.tol) # 4.2 L-BFGS ########################################################## elif solver == 'lbfgs': def func(coef, X, y, weights, P2, family, link): - mu, dev, devp = \ + mu, devp = \ family._mu_deviance_derivative(coef, X, y, weights, link) + dev = family.deviance(y, mu, weights) + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = P2 * coef + L2 = P2 * coef[idx:] else: - L2 = P2 @ coef - obj = 0.5 * dev + 0.5 * (coef @ L2) - objp = 0.5 * devp + L2 + L2 = P2 @ coef[idx:] + obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2) + objp = 0.5 * devp + objp[idx:] += L2 return obj, objp - args = (Xnew, y, weights, P2, family, link) + args = (X, y, weights, P2, family, link) coef, loss, info = fmin_l_bfgs_b( func, coef, fprime=None, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, @@ -1952,50 +2064,66 @@ def func(coef, X, y, weights, P2, family, link): # precisely, expected hessian of deviance. 
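        # [editor's sketch, not part of this patch] The Fisher matrix used in
        # place of the Hessian has the sandwich form
        #     fisher = X.T @ diag(d1**2 / V(mu)) @ X,
        # with d1 = h'(eta) and V the unit variance function; this is exactly
        # the `diag` vector built in grad_hess below.  A minimal stand-alone
        # version, assuming the helpers defined in this module:
        #
        #     def expected_hessian(X, eta, mu, family, link, weights=1):
        #         d1 = link.inverse_derivative(eta)
        #         d2_sigma_inv = d1**2 / family.variance(mu, phi=1,
        #                                                weights=weights)
        #         return _safe_sandwich_dot(X, d2_sigma_inv, intercept=False)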
elif solver == 'newton-cg': def func(coef, X, y, weights, P2, family, link): + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = coef @ (P2 * coef) + L2 = coef[idx:] @ (P2 * coef[idx:]) else: - L2 = coef @ (P2 @ coef) - mu = link.inverse(X @ coef) + L2 = coef[idx:] @ (P2 @ coef[idx:]) + mu = link.inverse(_safe_lin_pred(X, coef)) return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 def grad(coef, X, y, weights, P2, family, link): + mu, devp = \ + family._mu_deviance_derivative(coef, X, y, weights, link) + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = P2 * coef + L2 = P2 * coef[idx:] else: - L2 = P2 @ coef - eta = X @ coef - mu = link.inverse(eta) - d1 = link.inverse_derivative(eta) - grad = X.T @ (d1 * family.deviance_derivative(y, mu, weights)) - return 0.5 * grad + L2 + L2 = P2 @ coef[idx:] + objp = 0.5 * devp + objp[idx:] += L2 + return objp def grad_hess(coef, X, y, weights, P2, family, link): + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = P2 * coef + L2 = P2 * coef[idx:] else: - L2 = P2 @ coef - eta = X @ coef + L2 = P2 @ coef[idx:] + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) d1 = link.inverse_derivative(eta) - grad = 0.5 * \ - (X.T @ (d1 * family.deviance_derivative(y, mu, weights))) \ - + L2 - # expected hessian = X.T @ diag_matrix @ X + temp = d1 * family.deviance_derivative(y, mu, weights) + if intercept: + grad = np.concatenate(([0.5 * temp.sum()], + 0.5 * temp @ X + L2)) + else: + grad = 0.5 * temp @ X + L2 # sampe as 0.5* X.T @ temp + L2 + + # expected hessian = fisher = X.T @ diag_matrix @ X # calculate only diag_matrix diag = d1**2 / family.variance(mu, phi=1, weights=weights) - def Hs(s): - ret = 0.5 * (X.T @ (diag * (X @ s))) + def Hs(coef): + # return (0.5 * fisher + P2) @ coef + # ret = 0.5 * (X.T @ (diag * (X @ coef))) + ret = 0.5 * ((diag * (X @ coef[idx:])) @ X) if P2.ndim == 1: - ret += P2 * s + ret += P2 * coef[idx:] else: - ret += P2 @ s + ret += P2 @ coef[idx:] + if intercept: + h0i = np.concatenate(([diag.sum()], diag @ X)) + ret = np.concatenate(([0.5 * (h0i @ coef)], + ret + 0.5 * coef[0] * h0i[1:])) return ret return grad, Hs - args = (Xnew, y, weights, P2, family, link) + args = (X, y, weights, P2, family, link) coef, n_iter_i = newton_cg(grad_hess, func, grad, coef, args=args, maxiter=self.max_iter, tol=self.tol) @@ -2007,13 +2135,14 @@ def Hs(s): elif solver == 'cd': # For coordinate descent, if X is sparse, it should be csc format # If X is sparse, P2 must also be csc - if sparse.issparse(Xnew): - Xnew = Xnew.tocsc(copy=self.copy_X) + if sparse.issparse(X): + X = X.tocsc(copy=self.copy_X) P2 = sparse.csc_matrix(P2) coef, self.n_iter_, self._n_cycles = \ - _cd_solver(coef=coef, X=Xnew, y=y, weights=weights, P1=P1, - P2=P2, family=family, link=link, + _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, + P2=P2, fit_intercept=self.fit_intercept, + family=family, link=link, max_iter=self.max_iter, tol=self.tol, selection=self.selection, random_state=random_state, diag_fisher=self.diag_fisher, copy_X=self.copy_X) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index ae8a5f4cfc5e7..cdac151b77de6 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -566,7 +566,7 @@ def obj(coef): pd = PoissonDistribution() link = 
LogLink() N = y.shape[0] - mu = link.inverse(X @ coef[1:]+coef[0]) + mu = link.inverse(X @ coef[1:] + coef[0]) alpha, l1_ratio = (1, 0.5) return 1./(2.*N) * pd.deviance(y, mu) \ + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ @@ -587,9 +587,9 @@ def obj(coef): assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) - # start_params='least_squares' with different alpha + # check warm_start, therefore start with different alpha glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, - family='poisson', + family='poisson', max_iter=300, link='log', solver='cd', tol=1e-5, start_params='zero') glm.fit(X, y) From c9a7a95e89deaadc40b92edd8e1208d550998a72 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 21 Apr 2019 17:03:07 +0200 Subject: [PATCH 051/209] Revised option start_params * renamed option irls into guess * removed option least_squares * updated tests --- doc/modules/linear_model.rst | 12 +- sklearn/linear_model/glm.py | 223 ++++++++++++++----------- sklearn/linear_model/tests/test_glm.py | 140 ++++++++-------- 3 files changed, 207 insertions(+), 168 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index e60e9e84a4747..4bede17af581a 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -925,12 +925,12 @@ follows: >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, - check_input=True, copy_X=True, family='poisson', - fit_dispersion=None, fit_intercept=True, l1_ratio=0, - link='log', max_iter=100, random_state=None, - selection='cyclic', solver='auto', - start_params='irls', tol=0.0001, verbose=0, - warm_start=False) + check_input=True, copy_X=True, diag_fisher=False, + family='poisson', fit_dispersion=None, + fit_intercept=True, l1_ratio=0, link='log', + max_iter=100, random_state=None, selection='cyclic', + solver='auto', start_params='guess', tol=0.0001, + verbose=0, warm_start=False) >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE array([0.24630169, 0.43373464]) >>> reg.intercept_ #doctest: +ELLIPSIS diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 2afd1ddf8c79c..a53cc39ecd307 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -48,9 +48,6 @@ import scipy.sparse.linalg as splinalg from scipy.optimize import fmin_l_bfgs_b import warnings -from .base import LinearRegression -from .coordinate_descent import ElasticNet -from .ridge import Ridge from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y @@ -93,6 +90,14 @@ def _safe_lin_pred(X, coef): return X @ coef +def _safe_toarray(X): + """Returns a numpy array.""" + if sparse.issparse(X): + return X.toarray() + else: + return np.asarray(X) + + def _safe_sandwich_dot(X, d, intercept=False): """Compute sandwich product X.T @ diag(d) @ X. @@ -100,7 +105,9 @@ def _safe_sandwich_dot(X, d, intercept=False): first column of X. X can be sparse, d must be an ndarray. 
Always returns a ndarray.""" if sparse.issparse(X): - temp = (X.transpose().multiply(d) @ X).toarray() + temp = (X.transpose() @ X.multiply(d[:, np.newaxis])) + # for older versions of numpy and scipy, temp may be a np.matrix + temp = _safe_toarray(temp) else: temp = (X.T * d) @ X if intercept: @@ -945,7 +952,8 @@ def _irls_step(X, W, P2, z, fit_intercept=True): else: if sparse.issparse(X): XtW = X.transpose().multiply(W) - A = (XtW @ X).toarray() + # for older versions of numpy and scipy, A may be a np.matrix + A = _safe_toarray(XtW @ X) else: XtW = (X.T * W) A = XtW @ X @@ -1105,8 +1113,9 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, if intercept: Bj[0] = fisher.sum() if sparse.issparse(X): - Bj[idx:] = (X[:, j].transpose().multiply(fisher) @ X - ).toarray().ravel() + Bj[idx:] = _safe_toarray(X[:, j].transpose() @ + X.multiply(fisher[:, np.newaxis]) + ).ravel() else: Bj[idx:] = (fisher * X[:, j]) @ X @@ -1477,27 +1486,32 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ + solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ optional (default='auto') Algorithm to use in the optimization problem: 'auto' Sets 'irls' if l1_ratio equals 0, else 'cd'. + 'cd' + Coordinate descent algorithm. It can deal with L1 as well as L2 + penalties. Note that in order to avoid unnecessary memory + duplication of X in the ``fit`` method, X should be directly passed + as a Fortran-contiguous numpy array or sparse csc matrix. + 'irls' - Iterated reweighted least squares (with Fisher scoring). + Iterated reweighted least squares. It is the standard algorithm for GLMs. It cannot deal with L1 penalties. + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. It cannot deal with L1 penalties. + 'newton-cg', 'lbfgs' - Cannot deal with L1 penalties. + Newton conjugate gradient algorithm cannot deal with L1 penalties. - 'cd' - Coordinate descent algorithm. It can deal with L1 as well as L2 - penalties. Note that in order to avoid unnecessary memory - duplication of the X argument in the ``fit`` method, X should be - directly passed as a Fortran-contiguous numpy array or sparse csc - matrix. + Note that all solvers except lbfgs use the fisher matrix, i.e. the + expected Hessian instead of the Hessian matrix. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. @@ -1505,11 +1519,11 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` - where g_i is the i-th component of the gradient (derivative) of + where ``g_i`` is the i-th component of the gradient (derivative) of the objective function. For the cd solver, covergence is reached - when ``sum_i(|minimum-norm of g_i|)``, where g_i is the - subgradient of the objective and minimum-norm of g_i is the element of - the subgradient g_i with the smallest L2-norm. + when ``sum_i(|minimum-norm of g_i|)``, where ``g_i`` is the + subgradient of the objective and minimum-norm of ``g_i`` is the element + of the subgradient ``g_i`` with the smallest L2-norm. 
warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` @@ -1518,23 +1532,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): does not exit (first call to ``fit``), option ``start_params`` sets the start values for ``coef_`` and ``intercept_``. - start_params : {'irls', 'least_squares', 'zero', array of shape \ - (n_features*, )}, optional (default='irls') + start_params : {'guess', 'zero', array of shape (n_features*, )}, \ + optional (default='guess') Relevant only if ``warm_start=False`` or if fit is called the first time (``self.coef_`` does not yet exist). - 'irls' + 'guess' Start values of mu are calculated by family.starting_mu(..). Then, - one step of irls obtains start values for ``coef_``. This gives - usually good results. - - 'least_squares' - Start values for ``coef_`` are obtained by a least squares fit in the - link space (y is transformed to the space of the linear predictor). + one Newton step obtains start values for ``coef_``. If + ``solver='irls'``, it uses one irls step, else the Newton step is + calculated by the cd solver. + This gives usually good starting values. 'zero' All coefficients are set to zero. If ``fit_intercept=True``, the - start value for the intercept is obtained by the average of y. + start value for the intercept is obtained by the weighted average of y. array The array of size n_features* is directly used as start values @@ -1560,17 +1572,17 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): RandomState instance used by `np.random`. Used when ``selection`` == 'random'. - diag_fisher : boolean, (default=False) - Only relevant for solver 'cd'. If ``False``, the full Fisher matrix - (expected Hessian) is computed in each outer iteretion (Newton - iteration). If ``True``, only a diagonal matrix (stored as 1d array) is - computed, such that fisher = X.T @ diag @ X. This saves memory and - matrix-matrix multiplications, but needs more matrix-vector - multiplications. If you use large sparse X or if you have many - features, i.e. n_features >> n_samples, you might set this option to - ``True``. - - copy_X : boolean, optional, default True + diag_fisher : boolean, optional, (default=False) + Only relevant for solver 'cd' (see also ``start_params='guess'``). + If ``False``, the full Fisher matrix (expected Hessian) is computed in + each outer iteration (Newton iteration). If ``True``, only a diagonal + matrix (stored as 1d array) is computed, such that + fisher = X.T @ diag @ X. This saves memory and matrix-matrix + multiplications, but needs more matrix-vector multiplications. If you + use large sparse X or if you have many features, + i.e. n_features >> n_samples, you might set this option to ``True``. + + copy_X : boolean, optional, (default=True) If ``True``, X will be copied; else, it may be overwritten. 
check_input : boolean, optional (default=True) @@ -1634,7 +1646,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, - tol=1e-4, warm_start=False, start_params='irls', + tol=1e-4, warm_start=False, start_params='guess', selection='cyclic', random_state=None, diag_fisher=False, copy_X=True, check_input=True, verbose=0): self.alpha = alpha @@ -1867,11 +1879,10 @@ def fit(self, X, y, sample_weight=None): start_params = self.start_params if isinstance(start_params, str): - if start_params not in ['irls', 'least_squares', 'zero']: - raise ValueError("The argument start_params must be 'irls', " - "'least-squares', 'zero' or an array of " - " correct length;" - " got(start_params={0})".format(start_params)) + if start_params not in ['guess', 'zero']: + raise ValueError("The argument start_params must be 'guess', " + "'zero' or an array of correct length; " + "got(start_params={0})".format(start_params)) else: start_params = check_array(start_params, accept_sparse=False, force_all_finite=True, ensure_2d=False, @@ -1894,10 +1905,17 @@ def fit(self, X, y, sample_weight=None): # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric if P2.ndim == 2: if sparse.issparse(P2): - P2 = 0.5 * (P2 + P2.transpose()) + if sparse.isspmatrix_csc(P2): + P2 = 0.5 * (P2 + P2.transpose()).tocsc() + else: + P2 = 0.5 * (P2 + P2.transpose()).tocsr() else: P2 = 0.5 * (P2 + P2.T) + # For coordinate descent, if X is sparse, P2 must also be csc + if solver == 'cd' and sparse.issparse(X): + P2 = sparse.csc_matrix(P2) + # 1.4 additional validations ########################################## if self.check_input: if not np.all(family.in_y_range(y)): @@ -1964,45 +1982,63 @@ def fit(self, X, y, sample_weight=None): else: coef = self.coef_ elif isinstance(start_params, str): - if start_params == 'irls': - # See 3.1 IRLS - # Use mu_start and apply one irls step to calculate coef + if start_params == 'guess': + # Set mu=starting_mu of the family and do one Newton step + # If solver=cd use cd, else irls mu = family.starting_mu(y, weights=weights) - # linear predictor - eta = link.link(mu) - # h'(eta) - hp = link.inverse_derivative(eta) - # working weights W, in principle a diagonal matrix - # therefore here just as 1d array - W = (hp**2 / family.variance(mu, phi=1, weights=weights)) - # working observations - z = eta + (y-mu)/hp - # solve A*coef = b - # A = X' W X + l2 P2, b = X' W z - coef = _irls_step(X, W, P2, z, - fit_intercept=self.fit_intercept) - elif start_params == 'least_squares': - # less restrictive tolerance for finding start values - tol = np.max([self.tol, np.sqrt(self.tol)]) - if self.alpha == 0: - reg = LinearRegression(copy_X=True, fit_intercept=False) - reg.fit(X, link.link(y)) - coef = reg.coef_ - elif self.l1_ratio <= 0.01: - # ElasticNet says l1_ratio <= 0.01 is not reliable - # => use Ridge - # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha*n_samples, tol=tol) - reg.fit(X, link.link(y)) - coef = reg.coef_ + eta = link.link(mu) # linear predictor + if solver in ['cd', 'lbfgs', 'newton-cg']: + # see function _cd_solver + sigma_inv = 1/family.variance(mu, phi=1, weights=weights) + d1 = link.inverse_derivative(eta) + temp = sigma_inv * d1 * (y - mu) + if self.fit_intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as 
X.T @ temp + + d2_sigma_inv = d1 * d1 * sigma_inv + diag_fisher = self.diag_fisher + if diag_fisher: + fisher = d2_sigma_inv + else: + fisher = \ + _safe_sandwich_dot(X, d2_sigma_inv, + intercept=self.fit_intercept) + # set up space for search direction d for inner loop + if self.fit_intercept: + coef = np.zeros(n_features+1) + else: + coef = np.zeros(n_features) + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + # use less restrictive tolerance for initial guess + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, + P1=P1) + inner_tol = 4 * linalg.norm(inner_tol, ord=1) + # just one outer loop = Newton step + n_cycles = 0 + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, + inner_tol, max_inner_iter=1000, + selection=self.selection, + random_state=self.random_state, + diag_fisher=self.diag_fisher) + coef += d # for simplicity no line search here else: - # TODO: Does this make sense at all? - reg = ElasticNet(copy_X=True, fit_intercept=False, - alpha=self.alpha, l1_ratio=self.l1_ratio, - tol=tol) - reg.fit(X, link.link(y)) - coef = reg.coef_ + # See _irls_solver + # h'(eta) + hp = link.inverse_derivative(eta) + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = (hp**2 / family.variance(mu, phi=1, weights=weights)) + # working observations + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(X, W, P2, z, + fit_intercept=self.fit_intercept) else: # start_params == 'zero' if self.fit_intercept: coef = np.zeros(n_features+1) @@ -2048,7 +2084,7 @@ def func(coef, X, y, weights, P2, family, link): coef, loss, info = fmin_l_bfgs_b( func, coef, fprime=None, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, - maxiter=self.max_iter) + maxiter=self.max_iter, factr=1e3) if self.verbose > 0: if info["warnflag"] == 1: warnings.warn("lbfgs failed to converge." 
@@ -2106,6 +2142,8 @@ def grad_hess(coef, X, y, weights, P2, family, link): # expected hessian = fisher = X.T @ diag_matrix @ X # calculate only diag_matrix diag = d1**2 / family.variance(mu, phi=1, weights=weights) + if intercept: + h0i = np.concatenate(([diag.sum()], diag @ X)) def Hs(coef): # return (0.5 * fisher + P2) @ coef @@ -2116,7 +2154,6 @@ def Hs(coef): else: ret += P2 @ coef[idx:] if intercept: - h0i = np.concatenate(([diag.sum()], diag @ X)) ret = np.concatenate(([0.5 * (h0i @ coef)], ret + 0.5 * coef[0] * h0i[1:])) return ret @@ -2124,21 +2161,15 @@ def Hs(coef): return grad, Hs args = (X, y, weights, P2, family, link) - coef, n_iter_i = newton_cg(grad_hess, func, grad, coef, - args=args, maxiter=self.max_iter, - tol=self.tol) + coef, self.n_iter_ = newton_cg(grad_hess, func, grad, coef, + args=args, maxiter=self.max_iter, + tol=self.tol) # 4.4 coordinate descent ############################################## # Note: we already set P1 = l1*P1, see above # Note: we already set P2 = l2*P2, see above # Note: we already symmetriezed P2 = 1/2 (P2 + P2') elif solver == 'cd': - # For coordinate descent, if X is sparse, it should be csc format - # If X is sparse, P2 must also be csc - if sparse.issparse(X): - X = X.tocsc(copy=self.copy_X) - P2 = sparse.csc_matrix(P2) - coef, self.n_iter_, self._n_cycles = \ _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, P2=P2, fit_intercept=self.fit_intercept, diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index cdac151b77de6..17535c067bbb6 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -4,7 +4,7 @@ import scipy as sp from scipy import linalg, optimize, sparse -from sklearn.datasets import make_classification +from sklearn.datasets import make_classification, make_regression from sklearn.linear_model.glm import ( Link, IdentityLink, @@ -22,10 +22,12 @@ assert_array_equal, assert_array_almost_equal) +rng = np.random.RandomState(42) + + @pytest.mark.parametrize('link', Link.__subclasses__()) def test_link_properties(link): """Test link inverse and derivative.""" - rng = np.random.RandomState(0) x = rng.rand(100)*100 link = link() # instatiate object decimal = 10 @@ -86,7 +88,6 @@ def test_deviance_zero(family, chk_values): def test_fisher_matrix(family, link): """Test the Fisher matrix numerically. 
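    # [editor's sketch, assumed reconstruction of the "trick" -- not part of
    # this patch] Setting y = mu makes the observed information equal to the
    # Fisher matrix (all (y - mu) terms vanish), so each column of the Fisher
    # matrix can be checked against a central difference of the score with y
    # held fixed at mu (coef, phi, X, weights, family, link as set up in this
    # test):
    #
    #     mu0 = link.inverse(X @ coef)
    #     def fisher_column(j, eps=1e-6):
    #         e = np.zeros_like(coef)
    #         e[j] = eps
    #         s_p = family._score(coef + e, phi, X, mu0, weights, link)
    #         s_m = family._score(coef - e, phi, X, mu0, weights, link)
    #         return -(s_p - s_m) / (2 * eps)   # approx. fisher[:, j]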
Trick: Use numerical differentiation with y = mu""" - rng = np.random.RandomState(0) coef = np.array([-2, 1, 0, 1, 2.5]) phi = 0.5 X = rng.randn(10, 5) @@ -218,7 +219,6 @@ def test_glm_P2_argument(P2): def test_glm_P2_positive_semidefinite(): """Test GLM for a positive semi-definite P2 argument.""" n_samples, n_features = 10, 5 - rng = np.random.RandomState(42) y = np.arange(n_samples) X = np.zeros((n_samples, n_features)) P2 = np.diag([100, 10, 5, 0, -1E-5]) @@ -351,20 +351,15 @@ def test_glm_check_input_argument(check_input): glm.fit(X, y) -@pytest.mark.parametrize( - 'family', - [NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), - GeneralizedHyperbolicSecant()]) @pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_glm_identiy_regression(family, solver): +def test_glm_identiy_regression(solver): """Test GLM regression with identity link on a simple dataset.""" coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) - glm = GeneralizedLinearRegressor(alpha=0, family=family, link='identity', - fit_intercept=False, solver=solver) + glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', + fit_intercept=False, solver=solver, + start_params='zero', tol=1e-7) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) @@ -375,34 +370,42 @@ def test_glm_identiy_regression(family, solver): GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), GeneralizedHyperbolicSecant()]) -@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_glm_log_regression(family, solver): +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), + ('lbfgs', 1e-6, 6), + ('newton-cg', 1e-7, 6), + ('cd', 1e-7, 6)]) +def test_glm_log_regression(family, solver, tol, dec): """Test GLM regression with log link on a simple dataset.""" - coef = [1, 2] + coef = [0.2, -0.1] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) glm = GeneralizedLinearRegressor( alpha=0, family=family, link='log', fit_intercept=False, - solver=solver, start_params='least_squares') + solver=solver, start_params='guess', tol=tol) res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) + assert_array_almost_equal(res.coef_, coef, decimal=dec) @pytest.mark.filterwarnings('ignore::DeprecationWarning') -@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_normal_ridge(solver): +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), + ('lbfgs', 1e-6, 5), + ('newton-cg', 1e-6, 5), + ('cd', 1e-6, 6)]) +def test_normal_ridge(solver, tol, dec): """Test ridge regression for Normal distributions. Compare to test_ridge in test_ridge.py. """ - rng = np.random.RandomState(0) alpha = 1.0 # 1. 
With more samples than features - n_samples, n_features, n_predict = 10, 5, 10 - y = rng.randn(n_samples) - X = rng.randn(n_samples, n_features) - T = rng.randn(n_predict, n_features) + n_samples, n_features, n_predict = 100, 7, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, @@ -410,69 +413,74 @@ def test_normal_ridge(solver): ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, - tol=1e-6, max_iter=100, solver=solver, - random_state=42) + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=False, - tol=1e-6, max_iter=100, solver=solver, - random_state=42, fit_dispersion='chisqr') + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng, + fit_dispersion='chisqr') glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) mu = glm.predict(X) assert_almost_equal(glm.dispersion_, np.sum((y-mu)**2/(n_samples-n_features))) # 2. 
With more features than samples and sparse - n_samples, n_features, n_predict = 5, 10, 10 - y = rng.randn(n_samples) - X = sparse.csr_matrix(rng.randn(n_samples, n_features)) - T = sparse.csr_matrix(rng.randn(n_predict, n_features)) + n_samples, n_features, n_predict = 10, 100, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, - family='normal', link='identity', - fit_intercept=True, solver=solver, - max_iter=300, random_state=42) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=tol, max_iter=300, solver=solver, + check_input=False, random_state=rng) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, - family='normal', link='identity', - fit_intercept=False, solver=solver) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=tol*2, max_iter=300, solver=solver, + check_input=False, random_state=rng) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) -@pytest.mark.parametrize('solver, decimal, tol', - [('irls', 7, 1e-8), - ('lbfgs', 5, 1e-7), - ('newton-cg', 5, 1e-7), - ('cd', 7, 1e-8)]) -def test_poisson_ridge(solver, decimal, tol): +@pytest.mark.parametrize('solver, tol, dec', + [('irls', 1e-7, 6), + ('lbfgs', 1e-7, 5), + ('newton-cg', 1e-7, 5), + ('cd', 1e-7, 7)]) +def test_poisson_ridge(solver, tol, dec): """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" @@ -493,18 +501,17 @@ def test_poisson_ridge(solver, decimal, tol): fit_intercept=True, family='poisson', link='log', tol=tol, solver=solver, max_iter=300, - random_state=42) + random_state=rng) glm.fit(X, y) assert_almost_equal(glm.intercept_, -0.12889386979, - decimal=decimal) + decimal=dec) assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], - decimal=decimal) + decimal=dec) @pytest.mark.parametrize('diag_fisher', [False, True]) def test_normal_enet(diag_fisher): """Test elastic net regression with normal/gaussian family.""" - rng = np.random.RandomState(0) alpha, l1_ratio = 0.3, 0.7 n_samples, n_features = 20, 2 X = rng.randn(n_samples, n_features).copy(order='F') @@ -556,7 +563,8 @@ def test_poisson_enet(): y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', link='log', solver='cd', tol=1e-8, - selection='random', random_state=42) + selection='random', random_state=rng, + start_params='guess') glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) @@ -591,7 +599,7 @@ def obj(coef): glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, family='poisson', max_iter=300, link='log', solver='cd', tol=1e-5, - start_params='zero') + selection='cyclic', start_params='zero') glm.fit(X, y) # warm start with original alpha and use of sparse matrices glm.warm_start = True @@ -612,9 +620,9 @@ def test_binomial_enet(alpha): n_samples = 500 X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, n_informative=5, n_redundant=0, n_repeated=0, - random_state=0) + random_state=rng) log = LogisticRegression( - penalty='elasticnet', random_state=0, fit_intercept=False, tol=1e-6, + penalty='elasticnet', random_state=rng, fit_intercept=False, tol=1e-6, max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), solver='saga') log.fit(X, y) From a7755de2cdc5022b6d41285730ed12bd41628e66 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 4 Jun 2019 17:34:11 +0200 Subject: [PATCH 052/209] Fix a few typos --- doc/modules/linear_model.rst | 2 +- .../plot_poisson_spline_regression.py | 2 +- sklearn/linear_model/glm.py | 53 ++++++++++--------- sklearn/linear_model/tests/test_glm.py | 4 +- 4 files changed, 31 insertions(+), 30 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 4bede17af581a..8f17c67d950de 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -887,7 +887,7 @@ combination of the input variables :math:`X` via an inverse link function .. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [11]_. The objective function beeing minimized +exponential dispersion model (EDM) [11]_. The objective function being minimized becomes .. 
math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1 diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py index fce85fae1ea8c..30b5881bba1f5 100644 --- a/examples/linear_model/plot_poisson_spline_regression.py +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -5,7 +5,7 @@ As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` example, a Poisson regression with penalized B-splines (P-splines) [1]_ is -fitted on slightly different sinusodial, Poisson distributed data and +fitted on slightly different sinusoidal, Poisson distributed data and compared to an AdaBoost model with decision trees. One can see, that this is a hard problem for both estimators. diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index a53cc39ecd307..a35c68828de81 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -29,7 +29,7 @@ # L2: w*P2*w with P2 a (semi-) positive definite matrix, e.g. P2 could be # a 1st or 2nd order difference matrix (compare B-spline penalties and # Tikhonov regularization). -# - The link funtion (instance of class Link) is necessary for the evaluation +# - The link function (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. # Solution: link as argument in those functions @@ -170,7 +170,7 @@ def _min_norm_sugrad(coef, grad, P2, P1): class Link(metaclass=ABCMeta): - """Abstract base class for Link funtions.""" + """Abstract base class for Link functions.""" @abstractmethod def link(self, mu): @@ -201,7 +201,7 @@ def derivative(self, mu): def inverse(self, lin_pred): """Compute the inverse link function h(lin_pred). - Gives the inverse relationship between linkear predictor and the mean + Gives the inverse relationship between linker predictor and the mean mu=E[Y], i.e. h(linear predictor) = mu. Parameters @@ -357,7 +357,7 @@ def include_lower_bound(self): @property def include_upper_bound(self): - """Get True if upper bound for y is includede: y <= upper_bound.""" + """Get True if upper bound for y is included: y <= upper_bound.""" return self._include_upper_bound def in_y_range(self, x): @@ -859,7 +859,7 @@ def __init__(self): class GeneralizedHyperbolicSecant(ExponentialDispersionModel): """A class for the Generalized Hyperbolic Secant (GHS) distribution. - The GHS distribution is for tagets y in (-inf, inf). + The GHS distribution is for targets y in (-inf, inf). """ def __init__(self): self._lower_bound = -np.Inf @@ -881,7 +881,7 @@ def unit_deviance(self, y, mu): class BinomialDistribution(ExponentialDispersionModel): """A class for the Binomial distribution. - The Binomial distribution is for tagets y in [0, 1]. + The Binomial distribution is for targets y in [0, 1]. """ def __init__(self): self._lower_bound = 0 @@ -1131,7 +1131,7 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, else: b = B[jdx, jdx] - # those ten lines aree what it is all about + # those ten lines are what it is all about if b <= 0: z = 0 elif P1[j] == 0: @@ -1199,7 +1199,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, diag_fisher=False, copy_X=True): """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. 
- The objective beeing minimized in the coefficients w=coef is:: + The objective being minimized in the coefficients w=coef is:: F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 @@ -1207,7 +1207,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, 1. Find optimal descent direction d by minimizing min_d F(w+d) = min_d F(w+d) - F(w) - 2. Quadrdatic approximation of F(w+d)-F(w) = q(d): + 2. Quadratic approximation of F(w+d)-F(w) = q(d): using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + ||P1*(w+d)||_1 - ||P1*w||_1 @@ -1228,7 +1228,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, ---------- coef : ndarray, shape (c,) If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shapee[1] + 1. + If fit_intercept=True, then c=X.shape[1] + 1. X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present). If not sparse, @@ -1265,7 +1265,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, cycles over all features per inner loop. tol : float, optional (default=1e-4) - Covergence criterion is + Convergence criterion is sum_i(|minimum of norm of subgrad of objective_i|)<=tol. selection : str, optional (default='cyclic') @@ -1285,9 +1285,9 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, ------- coef : ndarray, shape (c,) If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shapee[1] + 1. + If fit_intercept=True, then c=X.shape[1] + 1. - n_iter : numer of outer iterations = newton iterations + n_iter : number of outer iterations = newton iterations n_cycles : number of cycles over features @@ -1312,7 +1312,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, "format. Got P2 not sparse.") random_state = check_random_state(random_state) # Note: we already set P2 = l2*P2, P1 = l1*P1 - # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: we already symmetrized P2 = 1/2 (P2 + P2') n_iter = 0 # number of outer iterations n_cycles = 0 # number of (complete) cycles over features converged = False @@ -1427,7 +1427,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Parameters ---------- alpha : float, optional (default=1) - Constant that multiplies the penalty terms und thus determines the + Constant that multiplies the penalty terms and thus determines the regularization strength. See the notes for the exact mathematical meaning of this parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this @@ -1481,9 +1481,9 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'logit' for family 'binomial' - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) Method for estimation of the dispersion parameter phi. Whether to use - the chi squared statisic or the deviance statistic. If None, the + the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ @@ -1520,7 +1520,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of - the objective function. 
For the cd solver, covergence is reached + the objective function. For the cd solver, convergence is reached when ``sum_i(|minimum-norm of g_i|)``, where ``g_i`` is the subgradient of the objective and minimum-norm of ``g_i`` is the element of the subgradient ``g_i`` with the smallest L2-norm. @@ -1626,7 +1626,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If the target y is a ratio, appropriate sample weights s should be provided. - As an example, consider Poission distributed counts z (integers) and + As an example, consider Poisson distributed counts z (integers) and weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite @@ -1681,7 +1681,7 @@ def fit(self, X, y, sample_weight=None): Target values. sample_weight : {None, array-like}, shape (n_samples,),\ - optinal (default=None) + optional (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has Var[Y_i]=phi/w_i * v(mu). @@ -1995,7 +1995,7 @@ def fit(self, X, y, sample_weight=None): if self.fit_intercept: score = np.concatenate(([temp.sum()], temp @ X)) else: - score = temp @ X # sampe as X.T @ temp + score = temp @ X # same as X.T @ temp d2_sigma_inv = d1 * d1 * sigma_inv diag_fisher = self.diag_fisher @@ -2051,12 +2051,12 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # 4. fit # ####################################################################### - # algorithms for optimiation + # algorithms for optimization # TODO: Parallelize it? # 4.1 IRLS ############################################################ # Note: we already set P2 = l2*P2, see above - # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: we already symmetrized P2 = 1/2 (P2 + P2') if solver == 'irls': coef, self.n_iter_ = \ _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2, @@ -2137,7 +2137,7 @@ def grad_hess(coef, X, y, weights, P2, family, link): grad = np.concatenate(([0.5 * temp.sum()], 0.5 * temp @ X + L2)) else: - grad = 0.5 * temp @ X + L2 # sampe as 0.5* X.T @ temp + L2 + grad = 0.5 * temp @ X + L2 # same as 0.5* X.T @ temp + L2 # expected hessian = fisher = X.T @ diag_matrix @ X # calculate only diag_matrix @@ -2168,7 +2168,7 @@ def Hs(coef): # 4.4 coordinate descent ############################################## # Note: we already set P1 = l1*P1, see above # Note: we already set P2 = l2*P2, see above - # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: we already symmetrized P2 = 1/2 (P2 + P2') elif solver == 'cd': coef, self.n_iter_, self._n_cycles = \ _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, @@ -2215,7 +2215,8 @@ def linear_predictor(self, X): return X @ self.coef_ + self.intercept_ def predict(self, X, sample_weight=None): - """Predict uing GLM with feature matrix X. + """Predict using GLM with feature matrix X. + If sample_weight is given, returns prediction*sample_weight. 
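        For example, for a Poisson model fitted on frequencies
        y = counts/exposure with ``sample_weight=exposure``, the expected
        counts on new data can be obtained as (a usage sketch; ``glm`` and
        ``exposure`` are placeholder names)::

            expected_counts = glm.predict(X, sample_weight=exposure)
            # identical to exposure * glm.predict(X), because the prediction
            # is multiplied by the given sample_weight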
Parameters diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 17535c067bbb6..6172824cf1b79 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -29,7 +29,7 @@ def test_link_properties(link): """Test link inverse and derivative.""" x = rng.rand(100)*100 - link = link() # instatiate object + link = link() # instantiate object decimal = 10 if isinstance(link, LogitLink): # careful for large x, note expit(36) = 1 @@ -133,7 +133,7 @@ def test_sample_weights_validation(): with pytest.raises(ValueError): glm.fit(X, y, weights) - # 5. 1d but weith a negative value + # 5. 1d but with a negative value weights = [2, -1] with pytest.raises(ValueError): glm.fit(X, y, weights) From 9aa1fc41d726e5ac9007d9c867546aa70ea79ba7 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 4 Jun 2019 17:40:08 +0200 Subject: [PATCH 053/209] Make module private --- sklearn/linear_model/__init__.py | 4 ++-- sklearn/linear_model/{glm.py => _glm.py} | 0 sklearn/linear_model/tests/test_glm.py | 9 +++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) rename sklearn/linear_model/{glm.py => _glm.py} (100%) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 0c5840f343a3a..0f7856fcc2046 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,8 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from .glm import (TweedieDistribution, - GeneralizedLinearRegressor) +from ._glm import (TweedieDistribution, + GeneralizedLinearRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/_glm.py similarity index 100% rename from sklearn/linear_model/glm.py rename to sklearn/linear_model/_glm.py diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 6172824cf1b79..a148d11cb2632 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,3 +1,7 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause + import numpy as np from numpy.testing import assert_allclose import pytest @@ -5,7 +9,8 @@ from scipy import linalg, optimize, sparse from sklearn.datasets import make_classification, make_regression -from sklearn.linear_model.glm import ( +from sklearn.linear_model import GeneralizedLinearRegressor +from sklearn.linear_model._glm import ( Link, IdentityLink, LogLink, @@ -14,7 +19,7 @@ NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, GeneralizedHyperbolicSecant, BinomialDistribution, - GeneralizedLinearRegressor) +) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge from sklearn.utils.testing import ( From ca3eae24b4db9931eeac9fb925dba4f48199976c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 4 Jun 2019 21:35:11 +0200 Subject: [PATCH 054/209] Working on tests --- sklearn/linear_model/tests/test_glm.py | 52 +++++++++++++------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index a148d11cb2632..2c8a9c3d2c72c 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -23,7 +23,7 @@ from sklearn.linear_model import ElasticNet, 
LogisticRegression, Ridge from sklearn.utils.testing import ( - assert_equal, assert_almost_equal, + assert_almost_equal, assert_array_equal, assert_array_almost_equal) @@ -35,16 +35,14 @@ def test_link_properties(link): """Test link inverse and derivative.""" x = rng.rand(100)*100 link = link() # instantiate object - decimal = 10 if isinstance(link, LogitLink): # careful for large x, note expit(36) = 1 # limit max eta to 15 x = x / 100 * 15 - decimal = 8 - assert_almost_equal(link.link(link.inverse(x)), x, decimal=decimal) + assert_allclose(link.link(link.inverse(x)), x) # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) - assert_almost_equal(link.derivative(link.inverse(x)), - 1./link.inverse_derivative(x), decimal=decimal) + assert_allclose(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x)) # for LogitLink, in the following x should be between 0 and 1. # assert_almost_equal(link.inverse_derivative(link.link(x)), # 1./link.derivative(x), decimal=decimal) @@ -79,7 +77,7 @@ def test_family_bounds(family, expected): def test_deviance_zero(family, chk_values): """Test deviance(y,y) = 0 for different families.""" for x in chk_values: - assert_almost_equal(family.deviance(x, x), 0, decimal=10) + assert_allclose(family.deviance(x, x), 0, atol=1e-9) @pytest.mark.parametrize( @@ -155,7 +153,7 @@ def test_glm_family_argument(f, fam): y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) - assert_equal(type(glm._family_instance), type(fam)) + assert isinstance(glm._family_instance, fam.__class__) glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) @@ -172,7 +170,7 @@ def test_glm_link_argument(l, link): y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) - assert_equal(type(glm._link_instance), type(link)) + assert isinstance(glm._link_instance, link.__class__) glm = GeneralizedLinearRegressor(family='normal', link='not a link') with pytest.raises(ValueError): @@ -366,7 +364,7 @@ def test_glm_identiy_regression(solver): fit_intercept=False, solver=solver, start_params='zero', tol=1e-7) res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) + assert_allclose(res.coef_, coef) @pytest.mark.parametrize( @@ -375,11 +373,11 @@ def test_glm_identiy_regression(solver): GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), GeneralizedHyperbolicSecant()]) -@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), - ('lbfgs', 1e-6, 6), - ('newton-cg', 1e-7, 6), - ('cd', 1e-7, 6)]) -def test_glm_log_regression(family, solver, tol, dec): +@pytest.mark.parametrize('solver, tol', [('irls', 1e-6), + ('lbfgs', 1e-6), + ('newton-cg', 1e-7), + ('cd', 1e-7)]) +def test_glm_log_regression(family, solver, tol): """Test GLM regression with log link on a simple dataset.""" coef = [0.2, -0.1] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T @@ -388,7 +386,7 @@ def test_glm_log_regression(family, solver, tol, dec): alpha=0, family=family, link='log', fit_intercept=False, solver=solver, start_params='guess', tol=tol) res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef, decimal=dec) + assert_allclose(res.coef_, coef) @pytest.mark.filterwarnings('ignore::DeprecationWarning') @@ -421,10 +419,10 @@ def test_normal_ridge(solver, tol, dec): tol=tol, max_iter=100, solver=solver, check_input=False, random_state=rng) 
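# A standalone sketch (numpy only; the function names below are illustrative
# and the unpenalized intercept is ignored) of why Ridge above is given
# alpha*n_samples: with family='normal', link='identity' and l1_ratio=0 the
# GLM minimizes 1/(2n)*||y - Xw||^2 + (alpha/2)*||w||^2, while Ridge
# minimizes ||y - Xw||^2 + alpha_ridge*||w||^2; multiplying the GLM
# objective by 2n shows both share the same minimizer when
# alpha_ridge = alpha * n_samples.
import numpy as np

def glm_ridge_objective(w, X, y, alpha):
    n_samples = X.shape[0]
    return 0.5 / n_samples * np.sum((y - X @ w) ** 2) + 0.5 * alpha * w @ w

def sklearn_ridge_objective(w, X, y, alpha_ridge):
    return np.sum((y - X @ w) ** 2) + alpha_ridge * w @ w

# For every w: sklearn_ridge_objective(w, X, y, alpha * X.shape[0])
#              == 2 * X.shape[0] * glm_ridge_objective(w, X, y, alpha)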
glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + assert glm.coef_.shape == (X.shape[1], ) + assert_allclose(glm.coef_, ridge.coef_) + assert glm.intercept_ == pytest.approx(ridge.intercept_) + assert_allclose(glm.predict(T), ridge.predict(T)) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) @@ -435,13 +433,13 @@ def test_normal_ridge(solver, tol, dec): check_input=False, random_state=rng, fit_dispersion='chisqr') glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert glm.coef_.shape == (X.shape[1], ) + assert_allclose(glm.coef_, ridge.coef_) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + assert_allclose(glm.predict(T), ridge.predict(T)) mu = glm.predict(X) - assert_almost_equal(glm.dispersion_, - np.sum((y-mu)**2/(n_samples-n_features))) + assert_allclose(glm.dispersion_, + np.sum((y-mu)**2/(n_samples-n_features))) # 2. With more features than samples and sparse n_samples, n_features, n_predict = 10, 100, 10 @@ -461,7 +459,7 @@ def test_normal_ridge(solver, tol, dec): tol=tol, max_iter=300, solver=solver, check_input=False, random_state=rng) glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) + assert glm.coef_.shape == (X.shape[1], ) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) @@ -474,7 +472,7 @@ def test_normal_ridge(solver, tol, dec): tol=tol*2, max_iter=300, solver=solver, check_input=False, random_state=rng) glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) + assert glm.coef_.shape == (X.shape[1], ) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) From 61bc6b8e2b1e227f539656744e9b4a4fa9f514f2 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 5 Jun 2019 18:45:20 +0200 Subject: [PATCH 055/209] Improve tests --- sklearn/linear_model/tests/test_glm.py | 82 +++++++++++++------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 2c8a9c3d2c72c..7d747c23ae441 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -22,9 +22,7 @@ ) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge -from sklearn.utils.testing import ( - assert_almost_equal, - assert_array_equal, assert_array_almost_equal) +from sklearn.utils.testing import assert_array_equal rng = np.random.RandomState(42) @@ -355,16 +353,16 @@ def test_glm_check_input_argument(check_input): @pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_glm_identiy_regression(solver): +def test_glm_identity_regression(solver): """Test GLM regression with identity link on a simple dataset.""" - coef = [1, 2] + coef = [1., 2.] 
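# Reference sketch (numpy only; the *_ref names are illustrative): with
# alpha=0, family='normal' and link='identity', the fit below is ordinary
# least squares, so the generating coefficients [1., 2.] should be
# recovered up to solver tolerance.
X_ref = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]], dtype=float).T
coef_ref = np.linalg.lstsq(X_ref, X_ref @ np.array([1., 2.]), rcond=None)[0]
# coef_ref equals [1., 2.] up to floating point error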
X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', fit_intercept=False, solver=solver, start_params='zero', tol=1e-7) res = glm.fit(X, y) - assert_allclose(res.coef_, coef) + assert_allclose(res.coef_, coef, rtol=1e-6) @pytest.mark.parametrize( @@ -386,7 +384,7 @@ def test_glm_log_regression(family, solver, tol): alpha=0, family=family, link='log', fit_intercept=False, solver=solver, start_params='guess', tol=tol) res = glm.fit(X, y) - assert_allclose(res.coef_, coef) + assert_allclose(res.coef_, coef, rtol=5e-6) @pytest.mark.filterwarnings('ignore::DeprecationWarning') @@ -420,9 +418,9 @@ def test_normal_ridge(solver, tol, dec): check_input=False, random_state=rng) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_) - assert glm.intercept_ == pytest.approx(ridge.intercept_) - assert_allclose(glm.predict(T), ridge.predict(T)) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-6) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) @@ -434,9 +432,9 @@ def test_normal_ridge(solver, tol, dec): fit_dispersion='chisqr') glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_allclose(glm.predict(T), ridge.predict(T)) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-5) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) mu = glm.predict(X) assert_allclose(glm.dispersion_, np.sum((y-mu)**2/(n_samples-n_features))) @@ -452,7 +450,8 @@ def test_normal_ridge(solver, tol, dec): # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, - solver='sag', normalize=False, max_iter=100000) + solver='sag', normalize=False, max_iter=100000, + random_state=42) ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, @@ -460,22 +459,24 @@ def test_normal_ridge(solver, tol, dec): check_input=False, random_state=rng) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-6) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, - solver='sag', normalize=False, max_iter=1000) + solver='sag', normalize=False, max_iter=1000, + random_state=42) ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=False, tol=tol*2, max_iter=300, solver=solver, check_input=False, random_state=rng) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-4) + assert_allclose(glm.intercept_, 
ridge.intercept_, rtol=1e-5) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) @pytest.mark.parametrize('solver, tol, dec', @@ -506,10 +507,8 @@ def test_poisson_ridge(solver, tol, dec): solver=solver, max_iter=300, random_state=rng) glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.12889386979, - decimal=dec) - assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], - decimal=dec) + assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5) + assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-6) @pytest.mark.parametrize('diag_fisher', [False, True]) @@ -535,14 +534,14 @@ def test_normal_enet(diag_fisher): normalize=False, tol=1e-8, copy_X=True) enet.fit(X, y) - assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) - assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + assert_allclose(glm.intercept_, enet.intercept_, rtol=2e-7) + assert_allclose(glm.coef_, enet.coef_, rtol=5e-5) # 2. test normal enet on sparse data X = sparse.csc_matrix(X) glm.fit(X, y) - assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) - assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + assert_allclose(glm.intercept_, enet.intercept_, rtol=2e-7) + assert_allclose(glm.coef_, enet.coef_, rtol=5e-5) def test_poisson_enet(): @@ -569,8 +568,8 @@ def test_poisson_enet(): selection='random', random_state=rng, start_params='guess') glm.fit(X, y) - assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) - assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + assert_allclose(glm.intercept_, glmnet_intercept, rtol=2e-6) + assert_allclose(glm.coef_, glmnet_coef, rtol=2e-7) # test results with general optimization procedure def obj(coef): @@ -584,10 +583,10 @@ def obj(coef): + alpha * l1_ratio * np.sum(np.abs(coef[1:])) res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, options={'maxiter': 1000, 'disp': False}) - assert_almost_equal(glm.intercept_, res.x[0], decimal=5) - assert_almost_equal(glm.coef_, res.x[1:], decimal=5) - assert_almost_equal(obj(np.concatenate(([glm.intercept_], glm.coef_))), - res.fun, decimal=8) + assert_allclose(glm.intercept_, res.x[0], rtol=1e-5) + assert_allclose(glm.coef_, res.x[1:], rtol=1e-5, atol=1e-9) + assert_allclose(obj(np.concatenate(([glm.intercept_], glm.coef_))), + res.fun, rtol=1e-8) # same for start_params='zero' and selection='cyclic' # with reduced precision @@ -595,8 +594,8 @@ def obj(coef): link='log', solver='cd', tol=1e-5, selection='cyclic', start_params='zero') glm.fit(X, y) - assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) - assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-4) + assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) # check warm_start, therefore start with different alpha glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, @@ -609,8 +608,8 @@ def obj(coef): glm.alpha = 1 X = sparse.csr_matrix(X) glm.fit(X, y) - assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) - assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-4) + assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) @pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) @@ -629,10 +628,11 @@ def test_binomial_enet(alpha): max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), solver='saga') log.fit(X, y) + glm = GeneralizedLinearRegressor( family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, 
alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', tol=1e-7) glm.fit(X, y) - assert_almost_equal(log.intercept_[0], glm.intercept_, decimal=6) - assert_array_almost_equal(log.coef_[0, :], glm.coef_, decimal=6) + assert_allclose(log.intercept_[0], glm.intercept_, rtol=1e-6) + assert_allclose(log.coef_[0, :], glm.coef_, rtol=2e-6) From b24a7cab9fc7d9f5dcec5b9d7657fee2d0a94283 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 5 Jun 2019 18:53:58 +0200 Subject: [PATCH 056/209] Remove unused dec parameter in tests --- sklearn/linear_model/tests/test_glm.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 7d747c23ae441..a3e943403a7a7 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -388,11 +388,11 @@ def test_glm_log_regression(family, solver, tol): @pytest.mark.filterwarnings('ignore::DeprecationWarning') -@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), - ('lbfgs', 1e-6, 5), - ('newton-cg', 1e-6, 5), - ('cd', 1e-6, 6)]) -def test_normal_ridge(solver, tol, dec): +@pytest.mark.parametrize('solver, tol', [('irls', 1e-6), + ('lbfgs', 1e-6), + ('newton-cg', 1e-6), + ('cd', 1e-6)]) +def test_normal_ridge(solver, tol): """Test ridge regression for Normal distributions. Compare to test_ridge in test_ridge.py. @@ -479,12 +479,12 @@ def test_normal_ridge(solver, tol, dec): assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) -@pytest.mark.parametrize('solver, tol, dec', - [('irls', 1e-7, 6), - ('lbfgs', 1e-7, 5), - ('newton-cg', 1e-7, 5), - ('cd', 1e-7, 7)]) -def test_poisson_ridge(solver, tol, dec): +@pytest.mark.parametrize('solver, tol', + [('irls', 1e-7), + ('lbfgs', 1e-7), + ('newton-cg', 1e-7), + ('cd', 1e-7)]) +def test_poisson_ridge(solver, tol): """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" From f95b390c04fd4c0a333f08a649d9450ec44f0395 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 18 Jul 2017 21:50:10 +0200 Subject: [PATCH 057/209] ENH: add Generalized Linear Models, issue #5975 * new estimator GeneralizedLinearRegressor * loss functions for Tweedie family and Binomial * elasitc net penalties * control of penalties by matrix P2 and vector P1 * new solvers: coordinate descent, irls * tests * documentation * example for Poisson regression --- doc/modules/classes.rst | 1 + doc/modules/linear_model.rst | 129 + .../plot_poisson_spline_regression.py | 85 + sklearn/linear_model/__init__.py | 6 +- sklearn/linear_model/glm.py | 2331 +++++++++++++++++ sklearn/linear_model/tests/test_glm.py | 640 +++++ 6 files changed, 3191 insertions(+), 1 deletion(-) create mode 100644 examples/linear_model/plot_poisson_spline_regression.py create mode 100644 sklearn/linear_model/glm.py create mode 100644 sklearn/linear_model/tests/test_glm.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 2dcf582a6ab39..4158e34e8bb8c 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -727,6 +727,7 @@ Kernels: linear_model.BayesianRidge linear_model.ElasticNet linear_model.ElasticNetCV + linear_model.GeneralizedLinearRegressor linear_model.HuberRegressor linear_model.Lars linear_model.LarsCV diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 02f406f629e04..888566fab3601 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -893,6 +893,135 @@ to warm-starting (see :term:`Glossary `). .. [9] `"Performance Evaluation of Lbfgs vs other solvers" `_ +.. _Generalized_linear_regression: + +Generalized Linear Regression +============================= + +:class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two +ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear +combination of the input variables :math:`X` via an inverse link function +:math:`h` as + +.. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). + +Secondly, the squared loss function is replaced by the deviance :math:`D` of an +exponential dispersion model (EDM) [11]_. The objective function beeing minimized +becomes + +.. math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1 + +\frac{\alpha(1-\rho)}{2} w^T P_2 w + +with sample weights :math:`s`. +:math:`P_1` (diagonal matrix) can be used to exclude some of the coefficients in +the L1 penalty, the matrix :math:`P_2` (must be positive semi-definite) allows +for a more versatile L2 penalty. + +Use cases, where a loss different from the squared loss might be appropriate, +are the following: + + * If the target values :math:`y` are counts (non-negative integer valued) or + frequencies (non-negative), you might use a Poisson deviance with log-link. + + * If the target values are positive valued and skewed, you might try a + Gamma deviance with log-link. + + * If the target values seem to be heavier tailed than a Gamma distribution, + you might try an Inverse Gaussian deviance (or even higher variance powers + of the Tweedie family). + +Since the linear predictor :math:`Xw` can be negative and +Poisson, Gamma and Inverse Gaussian distributions don't support negative values, +it is convenient to apply a link function different from the identity link +:math:`h(Xw)=Xw` that guarantees the non-negativeness, e.g. the log-link with +:math:`h(Xw)=\exp(Xw)`. 
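The Poisson, Gamma and Inverse Gaussian deviances are the Tweedie family
members with power :math:`p=1, 2, 3` and unit variance
:math:`v(\mu)=\mu^p`; intermediate powers such as :math:`1<p<2` (compound
Poisson-Gamma) can be requested as well. As a short sketch, the ``family``
parameter also accepts a distribution instance instead of a string:

    >>> from sklearn.linear_model import GeneralizedLinearRegressor
    >>> from sklearn.linear_model import TweedieDistribution
    >>> reg = GeneralizedLinearRegressor(alpha=0.5,
    ...                                  family=TweedieDistribution(power=1.5),
    ...                                  link='log')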
+ +Note that the feature matrix `X` should be standardized before fitting. This +ensures that the penalty treats features equally. The estimator can be used as +follows: + + >>> from sklearn.linear_model import GeneralizedLinearRegressor + >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE + GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, + check_input=True, copy_X=True, diag_fisher=False, + family='poisson', fit_dispersion=None, + fit_intercept=True, l1_ratio=0, link='log', + max_iter=100, random_state=None, selection='cyclic', + solver='auto', start_params='guess', tol=0.0001, + verbose=0, warm_start=False) + >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE + array([0.24630169, 0.43373464]) + >>> reg.intercept_ #doctest: +ELLIPSIS + -0.76383633... + + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_spline_regression.py` + +Mathematical formulation +------------------------ + +In the unpenalized case, the assumptions are the following: + + * The target values :math:`y_i` are realizations of random variables + :math:`Y_i \overset{i.i.d}{\sim} \mathrm{EDM}(\mu_i, \frac{\phi}{s_i})` + with expectation :math:`\mu_i=\mathrm{E}[Y]`, dispersion parameter + :math:`\phi` and sample weights :math:`s_i`. + * The aim is to predict the expectation :math:`\mu_i` with + :math:`\hat{y_i} = h(\eta_i)`, linear predictor + :math:`\eta_i=(Xw)_i` and inverse link function :math:`h(\eta)`. + +Note that the first assumption implies +:math:`\mathrm{Var}[Y_i]=\frac{\phi}{s_i} v(\mu_i)` with unit variance +function :math:`v(\mu)`. Specifying a particular distribution of an EDM is the +same as specifying a unit variance function (they are one-to-one). + +Including penalties helps to avoid overfitting or, in case of L1 penalty, to +obtain sparse solutions. But there are also other motivations to include them, +e.g. accounting for the dependence structure of :math:`y`. + +The objective function, which is independent of :math:`\phi`, is minimized with +respect to the coefficients :math:`w`. + +The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` +likelihood as + +.. math:: d(y, \mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) + - loglike(y,y,\phi)\right) \\ + D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) + +===================================== =============================== ================================= ============================================ +Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` +===================================== =============================== ================================= ============================================ +Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` +Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +===================================== =============================== ================================= ============================================ + +Two remarks: + +* The deviances for at least Normal, Poisson and Gamma distributions are + strictly consistent scoring functions for the mean :math:`\mu`, see Eq. + (19)-(20) in [12]_. 
+ +* If you want to model a frequency, i.e. counts per exposure (time, volume, ...) + you can do so by a Poisson distribution and passing + :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together + with :math:`s=\mathrm{exposure}` as sample weights. + + +.. topic:: References: + + .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. + See also `Exponential dispersion model. `_ + + .. [12] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ Stochastic Gradient Descent - SGD ================================= diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py new file mode 100644 index 0000000000000..fce85fae1ea8c --- /dev/null +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -0,0 +1,85 @@ +""" +================================= +Poisson Regression with B-Splines +================================= + +As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` +example, a Poisson regression with penalized B-splines (P-splines) [1]_ is +fitted on slightly different sinusodial, Poisson distributed data and +compared to an AdaBoost model with decision trees. +One can see, that this is a hard problem for both estimators. + +.. [1] Eilers, Paul H. C.; Marx, Brian D. "Flexible smoothing with B -splines + and penalties". Statist. Sci. 11 (1996), no. 2, 89--121. + `doi:10.1214/ss/1038425655 + `_ + +""" +print(__doc__) + +# Author: Christian Lorentzen +# based on the AdaBoost regression example from Noel Dawe +# License: BSD 3 clause + +# importing necessary libraries +import numpy as np +from scipy.linalg import toeplitz +# from scipy.interpolate import BSpline +from scipy.interpolate import splev +import matplotlib.pyplot as plt +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import AdaBoostRegressor +from sklearn.linear_model import GeneralizedLinearRegressor + + +# Create the dataset +xmin, xmax = 0, 6 +rng = np.random.RandomState(1) +X = np.linspace(xmin, xmax, 500)[:, np.newaxis] +y_true = 0.5 * (2.1 + np.sin(X).ravel() + np.sin(6 * X).ravel()) +y = rng.poisson(y_true, X.shape[0]) + +# b-spline basis +nknots, degree = 40, 3 +ns = nknots - degree - 1 # number of base spline functions +dx = (xmax - xmin) / (nknots - 1 - 2 * degree) +knots = np.linspace(xmin - degree * dx, 6 + degree * dx, nknots) +coef = np.zeros(ns) +splineBasis = np.empty((X.shape[0], ns), dtype=float) +for i in range(ns): + coef[i] = 1 +# splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ +# .ravel() + splineBasis[:, i] = splev(X, (knots, coef, degree)).ravel() + coef[i] = 0 + +# second order difference matrix +P2 = toeplitz([2, -1] + [0] * (ns - 2)).astype(float) +P2[0, 0] = P2[-1, -1] = 1 + +# Fit regression model +regr_1 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), + n_estimators=10, random_state=rng) + +regr_2 = GeneralizedLinearRegressor(family='poisson', link='log', + fit_intercept=True, alpha=0.02, + l1_ratio=0.1, P2=P2) + +regr_1.fit(X, y) +regr_2.fit(splineBasis, y) + +# Predict +y_1 = regr_1.predict(X) +y_2 = regr_2.predict(splineBasis) + +# Plot the results +plt.figure() +plt.plot(X, y_true, c="b", label="true mean") +plt.scatter(X, y, c="k", marker='.', label="training samples") +plt.plot(X, y_1, 
c="g", label="AdaBoost n_estimator=10", linewidth=2) +plt.plot(X, y_2, c="r", label="Poisson GLM with B-splines", linewidth=2) +plt.xlabel("data") +plt.ylabel("target") +plt.title("Regression Comparison") +plt.legend() +plt.show() diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 770a1a49b600e..cbb2ad8826358 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,6 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) +from .glm import (TweedieDistribution, + GeneralizedLinearRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -78,4 +80,6 @@ 'orthogonal_mp', 'orthogonal_mp_gram', 'ridge_regression', - 'RANSACRegressor'] + 'RANSACRegressor', + 'GeneralizedLinearRegressor', + 'TweedieDistribution'] diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py new file mode 100644 index 0000000000000..ac0007c1789a8 --- /dev/null +++ b/sklearn/linear_model/glm.py @@ -0,0 +1,2331 @@ +""" +Generalized Linear Models with Exponential Dispersion Family +""" + +# Author: Christian Lorentzen +# some parts and tricks stolen from other sklearn files. +# License: BSD 3 clause + +# TODO: Should the option `normalize` be included (like other linear models)? +# So far, it is not included. User must pass a normalized X. +# TODO: Add cross validation support, e.g. GCV? +# TODO: Should GeneralizedLinearRegressor inherit from LinearModel? +# So far, it does not. +# TODO: Include further classes in class.rst? ExponentialDispersionModel? +# TweedieDistribution? +# TODO: Negative values in P1 are not allowed so far. They could be used +# for group lasso. + +# Design Decisions: +# - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. +# Estimators in sklearn are either regressors or classifiers. A GLM can do +# both depending on the distr (Normal => regressor, Binomial => classifier). +# Solution: GeneralizedLinearRegressor since this is the focus. +# - Allow for finer control of penalty terms: +# L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude +# factors from the L1 penalty. +# L2: w*P2*w with P2 a positive (semi-) definite matrix, e.g. P2 could be +# a 1st or 2nd order difference matrix (compare B-spline penalties and +# Tikhonov regularization). +# - The link funtion (instance of class Link) is necessary for the evaluation +# of deviance, score, Fisher and Hessian matrix as a functions of the +# coefficients, which is needed by optimizers. +# Solution: link as argument in those functions +# - Which name/symbol for sample_weight in docu? +# sklearn.linear_models uses w for coefficients, standard literature on +# GLMs use beta for coefficients and w for (sample) weights. +# So far, coefficients=w and sample weights=s. +# - The intercept term is the first index, i.e. 
coef[0] + + +from __future__ import division +from abc import ABCMeta, abstractmethod +import numbers +import numpy as np +from scipy import linalg, sparse, special +import scipy.sparse.linalg as splinalg +from scipy.optimize import fmin_l_bfgs_b +import warnings +from ..base import BaseEstimator, RegressorMixin +from ..exceptions import ConvergenceWarning +from ..utils import check_array, check_X_y +from ..utils.optimize import newton_cg +from ..utils.validation import check_is_fitted, check_random_state + + +def _check_weights(sample_weight, n_samples): + """Check that sample weights are non-negative and have the right shape.""" + if sample_weight is None: + weights = np.ones(n_samples) + elif np.isscalar(sample_weight): + if sample_weight <= 0: + raise ValueError("Sample weights must be non-negative.") + weights = sample_weight * np.ones(n_samples) + else: + _dtype = [np.float64, np.float32] + weights = check_array(sample_weight, accept_sparse=False, + force_all_finite=True, ensure_2d=False, + dtype=_dtype) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != n_samples: + raise ValueError("Sample weights must have the same length as " + "y") + if not np.all(weights >= 0): + raise ValueError("Sample weights must be non-negative.") + elif not np.sum(weights) > 0: + raise ValueError("Sample weights must have at least one positive " + "element.") + + return weights + + +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +def _safe_toarray(X): + """Returns a numpy array.""" + if sparse.issparse(X): + return X.toarray() + else: + return np.asarray(X) + + +def _safe_sandwich_dot(X, d, intercept=False): + """Compute sandwich product X.T @ diag(d) @ X. + + With ``intercept=True``, X is treated as if a column of 1 were appended as + first column of X. + X can be sparse, d must be an ndarray. Always returns a ndarray.""" + if sparse.issparse(X): + temp = (X.transpose() @ X.multiply(d[:, np.newaxis])) + # for older versions of numpy and scipy, temp may be a np.matrix + temp = _safe_toarray(temp) + else: + temp = (X.T * d) @ X + if intercept: + dim = X.shape[1] + 1 + if sparse.issparse(X): + order = 'F' if sparse.isspmatrix_csc(X) else 'C' + else: + order = 'F' if X.flags['F_CONTIGUOUS'] else 'C' + res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order) + res[0, 0] = d.sum() + res[1:, 0] = d @ X + res[0, 1:] = res[1:, 0] + res[1:, 1:] = temp + else: + res = temp + return res + + +def _min_norm_sugrad(coef, grad, P2, P1): + """Compute the gradient of all subgradients with minimal L2-norm. + + subgrad = grad + P2 * coef + P1 * subgrad(|coef|_1) + + g_i = grad_i + (P2*coef)_i + + if coef_i > 0: g_i + P1_i + if coef_i < 0: g_i - P1_i + if coef_i = 0: sign(g_i) * max(|g_i|-P1_i, 0) + + Parameters + ---------- + coef : ndarray + coef[0] may be intercept. 
+ + grad : ndarray, shape=coef.shape + + P2 : {1d or 2d array, None} + always without intercept, ``None`` means P2 = 0 + + P1 : ndarray + always without intercept + """ + intercept = (coef.size == P1.size + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + # compute grad + coef @ P2 without intercept + grad_wP2 = grad[idx:].copy() + if P2 is None: + pass + elif P2.ndim == 1: + grad_wP2 += coef[idx:] * P2 + else: + grad_wP2 += coef[idx:] @ P2 + res = np.where(coef[idx:] == 0, + np.sign(grad_wP2) * np.maximum(np.abs(grad_wP2) - P1, 0), + grad_wP2 + np.sign(coef[idx:]) * P1) + if intercept: + return np.concatenate(([grad[0]], res)) + else: + return res + + +class Link(metaclass=ABCMeta): + """Abstract base class for Link funtions.""" + + @abstractmethod + def link(self, mu): + """Compute the link function g(mu). + + The link function links the mean mu=E[Y] to the so called linear + predictor (X*w), i.e. g(mu) = linear predictor. + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the (predicted) mean. + """ + raise NotImplementedError + + @abstractmethod + def derivative(self, mu): + """Compute the derivative of the link g'(mu). + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the (predicted) mean. + """ + raise NotImplementedError + + @abstractmethod + def inverse(self, lin_pred): + """Compute the inverse link function h(lin_pred). + + Gives the inverse relationship between linkear predictor and the mean + mu=E[Y], i.e. h(linear predictor) = mu. + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative(self, lin_pred): + """Compute the derivative of the inverse link function h'(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative2(self, lin_pred): + """Compute 2nd derivative of the inverse link function h''(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + raise NotImplementedError + + +class IdentityLink(Link): + """The identity link function g(x)=x.""" + + def link(self, mu): + return mu + + def derivative(self, mu): + return np.ones_like(mu) + + def inverse(self, lin_pred): + return lin_pred + + def inverse_derivative(self, lin_pred): + return np.ones_like(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.zeros_like(lin_pred) + + +class LogLink(Link): + """The log link function g(x)=log(x).""" + + def link(self, mu): + return np.log(mu) + + def derivative(self, mu): + return 1./mu + + def inverse(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.exp(lin_pred) + + +class LogitLink(Link): + """The logit link function g(x)=logit(x).""" + + def link(self, mu): + return special.logit(mu) + + def derivative(self, mu): + return 1. / (mu * (1 - mu)) + + def inverse(self, lin_pred): + return special.expit(lin_pred) + + def inverse_derivative(self, lin_pred): + ep = special.expit(lin_pred) + return ep * (1. - ep) + + def inverse_derivative2(self, lin_pred): + ep = special.expit(lin_pred) + ep = special.expit(lin_pred) + return ep * (1. - ep) * (1. 
- 2 * ep) + + +class ExponentialDispersionModel(metaclass=ABCMeta): + r"""Base class for reproductive Exponential Dispersion Models (EDM). + + The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + + with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, + unit variance :math:`v(\mu)` and + unit deviance :math:`d(y,\mu)`. + + Attributes + ---------- + lower_bound + upper_bound + include_lower_bound + include_upper_bound + + Methods + ------- + in_y_range + unit_variance + unit_variance_derivative + variance + variance_derivative + unit_deviance + unit_deviance_derivative + deviance + deviance_derivative + starting_mu + + _mu_deviance_derivative + _score + _fisher_matrix + _observed_information + _eta_mu_score_fisher + + References + ---------- + + https://en.wikipedia.org/wiki/Exponential_dispersion_model. + """ + @property + def lower_bound(self): + """Get the lower bound of values for Y~EDM.""" + return self._lower_bound + + @property + def upper_bound(self): + """Get the upper bound of values for Y~EDM.""" + return self._upper_bound + + @property + def include_lower_bound(self): + """Get True if lower bound for y is included: y >= lower_bound.""" + return self._include_lower_bound + + @property + def include_upper_bound(self): + """Get True if upper bound for y is includede: y <= upper_bound.""" + return self._include_upper_bound + + def in_y_range(self, x): + """Returns ``True`` if x is in the valid range of Y~EDM. + + Parameters + ---------- + x : array, shape (n_samples,) + Target values. + """ + if self.include_lower_bound: + if self.include_upper_bound: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less(x, self.upper_bound)) + else: + if self.include_upper_bound: + return np.logical_and(np.greater(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater(x, self.lower_bound), + np.less(x, self.upper_bound)) + + @abstractmethod + def unit_variance(self, mu): + r"""Compute the unit variance function. + + The unit variance :math:`v(\mu)` determines the variance as + a function of the mean :math:`\mu` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. + It can also be derived from the unit deviance :math:`d(y,\mu)` as + + .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ + \partial\mu^2}}\big|_{y=\mu} + + See also :func:`variance`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + raise NotImplementedError() + + @abstractmethod + def unit_variance_derivative(self, mu): + r"""Compute the derivative of the unit variance w.r.t. mu. + + Return :math:`v'(\mu)`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Target values. + """ + raise NotImplementedError() + + def variance(self, mu, phi=1, weights=1): + r"""Compute the variance function. + + The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, + with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. 
+ + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return phi/weights * self.unit_variance(mu) + + def variance_derivative(self, mu, phi=1, weights=1): + r"""Compute the derivative of the variance w.r.t. mu. + + Returns + :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] + =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` + and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return phi/weights * self.unit_variance_derivative(mu) + + @abstractmethod + def unit_deviance(self, y, mu): + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + """ + raise NotImplementedError() + + def unit_deviance_derivative(self, y, mu): + r"""Compute the derivative of the unit deviance w.r.t. mu. + + The derivative of the unit deviance is given by + :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + with unit variance :math:`v(\mu)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + """ + return -2 * (y - mu) / self.unit_variance(mu) + + def deviance(self, y, mu, weights=1): + r"""Compute the deviance. + + The deviance is a weighted sum of the per sample unit deviances, + :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` + with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. + In terms of the log-likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,\mu,\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return np.sum(weights * self.unit_deviance(y, mu)) + + def deviance_derivative(self, y, mu, weights=1): + """Compute the derivative of the deviance w.r.t. mu. + + It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return weights * self.unit_deviance_derivative(y, mu) + + def starting_mu(self, y, weights=1, ind_weight=0.5): + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + + ind_weight : float (default=0.5) + Must be between 0 and 1. Specifies how much weight is given to the + individual observations instead of the mean of y. + """ + return (ind_weight * y + + (1. 
- ind_weight) * np.average(y, weights=weights)) + + def _mu_deviance_derivative(self, coef, X, y, weights, link): + """Compute mu and the derivative of the deviance w.r.t coef.""" + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + d1 = link.inverse_derivative(lin_pred) + temp = d1 * self.deviance_derivative(y, mu, weights) + if coef.size == X.shape[1] + 1: + devp = np.concatenate(([temp.sum()], temp @ X)) + else: + devp = temp @ X # sampe as X.T @ temp + return mu, devp + + def _score(self, coef, phi, X, y, weights, link): + r"""Compute the score function. + + The score function is the derivative of the + log-likelihood w.r.t. `coef` (:math:`w`). + It is given by + + .. math: + + \mathbf{score}(\boldsymbol{w}) + = \frac{\partial loglike}{\partial\boldsymbol{w}} + = \mathbf{X}^T \mathbf{D} + \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, + + with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and + :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. + Note: The derivative of the deviance w.r.t. coef equals -2 * score. + """ + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) + d = link.inverse_derivative(lin_pred) + temp = sigma_inv * d * (y - mu) + if coef.size == X.shape[1] + 1: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as X.T @ temp + return score + + def _fisher_matrix(self, coef, phi, X, y, weights, link): + r"""Compute the Fisher information matrix. + + The Fisher information matrix, also known as expected information + matrix is given by + + .. math: + + \mathbf{F}(\boldsymbol{w}) = + \mathrm{E}\left[-\frac{\partial\mathbf{score}}{\partial + \boldsymbol{w}} \right] + = \mathrm{E}\left[ + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T}\right] + = \mathbf{X}^T W \mathbf{X} \,, + + with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, + see func:`_score`. + """ + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) + d = link.inverse_derivative(lin_pred) + d2_sigma_inv = sigma_inv * d * d + intercept = (coef.size == X.shape[1] + 1) + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) + return fisher_matrix + + def _observed_information(self, coef, phi, X, y, weights, link): + r"""Compute the observed information matrix. + + The observed information matrix, also known as the negative of + the Hessian matrix of the log-likelihood, is given by + + .. math: + + \mathbf{H}(\boldsymbol{w}) = + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T} + = \mathbf{X}^T \left[ + - \mathbf{D}' \mathbf{R} + + \mathbf{D}^2 \mathbf{V} \mathbf{R} + + \mathbf{D}^2 + \right] \boldsymbol{\Sigma}^{-1} \mathbf{X} \,, + + with :math:`\mathbf{R} = \mathrm{diag}(y_i - \mu_i)`, + :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{ + v(\mu_i)} + \right)`, + see :func:`score_` function and :func:`_fisher_matrix`. 
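+
+        Note: For the canonical link of a family (e.g. the log link for the
+        Poisson distribution), the term
+        :math:`-\mathbf{D}' \mathbf{R} + \mathbf{D}^2 \mathbf{V} \mathbf{R}`
+        vanishes, so the observed information coincides with the Fisher
+        matrix of :func:`_fisher_matrix`.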
+ """ + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) + dp = link.inverse_derivative2(lin_pred) + d2 = link.inverse_derivative(lin_pred)**2 + v = self.unit_variance_derivative(mu)/self.unit_variance(mu) + r = y - mu + temp = sigma_inv * (-dp * r + d2 * v * r + d2) + intercept = (coef.size == X.shape[1] + 1) + observed_information = _safe_sandwich_dot(X, temp, + intercept=intercept) + return observed_information + + def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, + diag_fisher=False): + """Compute linear predictor, mean, score function and fisher matrix. + + It calculates the linear predictor, the mean, score function + (derivative of log-likelihood) and Fisher information matrix + all in one go as function of `coef` (:math:`w`) and the data. + + Parameters + ---------- + diag_fisher : boolean, optional (default=False) + If ``True``, returns only an array d such that + fisher = X.T @ np.diag(d) @ X. + + Returns + ------- + (eta, mu, score, fisher) : tuple with 4 elements + The 4 elements are: + + * eta: ndarray, shape (X.shape[0],) + * mu: ndarray, shape (X.shape[0],) + * score: ndarray, shape (X.shape[0],) + * fisher: + + * If diag_fisher is ``False``, the full fisher matrix, + an array of shape (X.shape[1], X.shape[1]) + * If diag_fisher is ``True`, an array of shape (X.shape[0]) + """ + intercept = (coef.size == X.shape[1] + 1) + # eta = linear predictor + eta = _safe_lin_pred(X, coef) + mu = link.inverse(eta) + sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) + d1 = link.inverse_derivative(eta) # = h'(eta) + # Alternatively: + # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g + # d1 = 1./link.derivative(mu) + d1_sigma_inv = d1 * sigma_inv + temp = d1_sigma_inv * (y - mu) + if intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X + + d2_sigma_inv = d1 * d1_sigma_inv + if diag_fisher: + fisher_matrix = d2_sigma_inv + else: + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) + return eta, mu, score, fisher_matrix + + +class TweedieDistribution(ExponentialDispersionModel): + r"""A class for the Tweedie distribution. + + A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely + defined by it's mean-variance relationship + :math:`\mathrm{Var}[Y] \propto \mu^power`. + + Special cases are: + + ===== ================ + Power Distribution + ===== ================ + 0 Normal + 1 Poisson + (0,1) Compound Poisson + 2 Gamma + 3 Inverse Gaussian + + Parameters + ---------- + power : float (default=0) + The variance power of the `unit_variance` + :math:`v(\mu) = \mu^{power}`. + For ``0 0) and (power < 1): + raise ValueError('For 0 1) and (power < 2): + # Compound Poisson + self._lower_bound = 0 + self._include_lower_bound = True + elif power == 2: + # GammaDistribution + self._lower_bound = 0 + self._include_lower_bound = False + elif (power > 2) and (power < 3): + # Positive Stable + self._lower_bound = 0 + self._include_lower_bound = False + elif power == 3: + # InverseGaussianDistribution + self._lower_bound = 0 + self._include_lower_bound = False + elif power > 3: + # Positive Stable + self._lower_bound = 0 + self._include_lower_bound = False + else: + raise ValueError('The power must be a float, i.e. 
real number, ' + 'got (power={})'.format(power)) + + @property + def power(self): + return self._power + + @power.setter + def power(self, power): + if not isinstance(power, numbers.Real): + raise TypeError('power must be a real number, input was {0}' + .format(power)) + self._power = power + + def unit_variance(self, mu): + """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + return np.power(mu, self.power) + + def unit_variance_derivative(self, mu): + """Compute the derivative of the unit variance of a Tweedie + distribution v(mu)=power*mu**(power-1). + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + return self.power * np.power(mu, self.power - 1) + + def unit_deviance(self, y, mu): + p = self.power + if p == 0: + # NormalDistribution + return (y - mu)**2 + if p == 1: + # PoissonDistribution + # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 + return 2 * (special.xlogy(y, y/mu) - y + mu) + elif p == 2: + # GammaDistribution + return 2 * (np.log(mu/y) + y/mu - 1) + else: + # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) + # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) + return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - + y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + + +class NormalDistribution(TweedieDistribution): + """Class for the Normal (aka Gaussian) distribution""" + def __init__(self): + super(NormalDistribution, self).__init__(power=0) + + +class PoissonDistribution(TweedieDistribution): + """Class for the scaled Poisson distribution""" + def __init__(self): + super(PoissonDistribution, self).__init__(power=1) + + +class GammaDistribution(TweedieDistribution): + """Class for the Gamma distribution""" + def __init__(self): + super(GammaDistribution, self).__init__(power=2) + + +class InverseGaussianDistribution(TweedieDistribution): + """Class for the scaled InverseGaussianDistribution distribution""" + def __init__(self): + super(InverseGaussianDistribution, self).__init__(power=3) + + +class GeneralizedHyperbolicSecant(ExponentialDispersionModel): + """A class for the Generalized Hyperbolic Secant (GHS) distribution. + + The GHS distribution is for tagets y in (-inf, inf). + """ + def __init__(self): + self._lower_bound = -np.Inf + self._upper_bound = np.Inf + self._include_lower_bound = False + self._include_upper_bound = False + + def unit_variance(self, mu): + return 1 + mu**2 + + def unit_variance_derivative(self, mu): + return 2 * mu + + def unit_deviance(self, y, mu): + return (2 * y * (np.arctan(y) - np.arctan(mu)) + + np.log((1 + mu**2)/(1 + y**2))) + + +class BinomialDistribution(ExponentialDispersionModel): + """A class for the Binomial distribution. + + The Binomial distribution is for tagets y in [0, 1]. + """ + def __init__(self): + self._lower_bound = 0 + self._upper_bound = 1 + self._include_lower_bound = True + self._include_upper_bound = True + + def unit_variance(self, mu): + return mu * (1 - mu) + + def unit_variance_derivative(self, mu): + return 1 - 2 * mu + + def unit_deviance(self, y, mu): + return 2 * (special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) + + +def _irls_step(X, W, P2, z, fit_intercept=True): + """Compute one step in iteratively reweighted least squares. + + Solve A w = b for w with + A = (X' W X + P2) + b = X' W z + z = eta + D^-1 (y-mu) + + See also fit method of :class:`GeneralizedLinearRegressor`. 
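+
+    Note: For the normal distribution with identity link, W equals the
+    sample weights and z = y, so a single step already solves the penalized
+    weighted least squares problem; for other families and links the step
+    is iterated with updated W and z.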
+ + Parameters + ---------- + X : {ndarray, sparse matrix}, shape (n_samples, n_features) + Training data (with intercept included if present) + + W : ndarray, shape (n_samples,) + + P2 : {ndarray, sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix) + + z : ndarray, shape (n_samples,) + Working observations + + fit_intercept : boolean, optional (default=True) + + Returns + ------- + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. + """ + # Note: solve vs least squares, what is more appropriate? + # scipy.linalg.solve seems faster, but scipy.linalg.lstsq + # is more robust. + # Note: X.T @ W @ X is not sparse, even when X is sparse. + # Sparse solver would splinalg.spsolve(A, b) or splinalg.lsmr(A, b) + if fit_intercept: + Wz = W * z + if sparse.issparse(X): + b = np.concatenate(([Wz.sum()], X.transpose() @ Wz)) + else: + b = np.concatenate(([Wz.sum()], X.T @ Wz)) + A = _safe_sandwich_dot(X, W, intercept=fit_intercept) + if P2.ndim == 1: + idx = np.arange(start=1, stop=A.shape[0]) + A[(idx, idx)] += P2 # add to diag elements without intercept + elif sparse.issparse(P2): + A[1:, 1:] += P2.toarray() + else: + A[1:, 1:] += P2 + else: + if sparse.issparse(X): + XtW = X.transpose().multiply(W) + # for older versions of numpy and scipy, A may be a np.matrix + A = _safe_toarray(XtW @ X) + else: + XtW = (X.T * W) + A = XtW @ X + b = XtW @ z + if P2.ndim == 1: + A[np.diag_indices_from(A)] += P2 + elif sparse.issparse(P2): + A += P2.toarray() + else: + A += P2 + # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) + coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) + return coef + + +def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, + max_iter, tol): + """Solve GLM with L2 penalty by IRLS algorithm. + + Note: If X is sparse, P2 must also be sparse. + """ + # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' + # Obj = objective function = 1/2 Dev + l2/2 w P2 w + # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 + # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) + # D2 = link.inverse_derivative(eta)^2 = D^2 + # W = D2/V(mu) + # l2 = alpha * (1 - l1_ratio) + # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w + # = -X' D (y-mu)/V(mu) + l2 P2 w + # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 + # Use Fisher matrix instead of full info matrix -X'(...) X, + # i.e. E[Dev''] with E[y-mu]=0: + # Obj'' ~ X' W X + l2 P2 + # (1): w = (X' W X + l2 P2)^-1 X' W z, + # with z = eta + D^-1 (y-mu) + # Note: P2 must be symmetrized + # Note: ' denotes derivative, but also transpose for matrices + + # eta = linear predictor + eta = _safe_lin_pred(X, coef) + mu = link.inverse(eta) + # D = h'(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + n_iter = 0 + while n_iter < max_iter: + n_iter += 1 + # coef_old not used so far. + # coef_old = coef + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = hp**2 / V + # working observations + z = eta + (y - mu) / hp + # solve A*coef = b + # A = X' W X + P2, b = X' W z + coef = _irls_step(X, W, P2, z, fit_intercept=fit_intercept) + # updated linear predictor + # do it here for updated values for tolerance + eta = _safe_lin_pred(X, coef) + mu = link.inverse(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + + # which tolerace? |coef - coef_old| or gradient? 
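+        # (Note: the objective is 1/2 * deviance + 1/2 * w' P2 w with P2
+        # already scaled by l2 = alpha * (1 - l1_ratio); its gradient is
+        # -X' D (y-mu)/V(mu) + P2 w, and the unpenalized intercept
+        # component -sum(temp) is prepended separately below.)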
+ # use gradient for compliance with newton-cg and lbfgs + # gradient = -X' D (y-mu)/V(mu) + l2 P2 w + temp = hp * (y - mu) / V + if sparse.issparse(X): + gradient = -(X.transpose() @ temp) + else: + gradient = -(X.T @ temp) + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept + if P2.ndim == 1: + gradient += P2 * coef[idx:] + else: + gradient += P2 @ coef[idx:] + if fit_intercept: + gradient = np.concatenate(([-temp.sum()], gradient)) + if (np.max(np.abs(gradient)) <= tol): + converged = True + break + + if not converged: + warnings.warn("irls failed to converge. Increase the number " + "of iterations (currently {0})" + .format(max_iter), ConvergenceWarning) + + return coef, n_iter + + +def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=1000, selection='cyclic', + random_state=None, diag_fisher=False): + """Compute inner loop of coordinate descent, i.e. cycles through features. + + Minimization of 1-d subproblems:: + + min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 + + A = f'(w) + d*H(w) + (w+d)*P2 + B = H+P2 + Note: f'=-score and H=fisher are updated at the end of outer iteration. + """ + # TODO: use sparsity (coefficient already 0 due to L1 penalty) + # => active set of features for featurelist, see paper + # of Improved GLMNET or Gap Safe Screening Rules + # https://arxiv.org/abs/1611.05780 + n_samples, n_features = X.shape + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + B = fisher + if P2.ndim == 1: + coef_P2 = coef[idx:] * P2 + if not diag_fisher: + idiag = np.arange(start=idx, stop=B.shape[0]) + # B[np.diag_indices_from(B)] += P2 + B[(idiag, idiag)] += P2 + else: + coef_P2 = coef[idx:] @ P2 + if not diag_fisher: + if sparse.issparse(P2): + B[idx:, idx:] += P2.toarray() + else: + B[idx:, idx:] += P2 + # A = -score + coef_P2 + A = -score + A[idx:] += coef_P2 + # A += d @ (H+P2) but so far d=0 + # inner loop + for inner_iter in range(1, max_inner_iter+1): + inner_iter += 1 + n_cycles += 1 + # cycle through features, update intercept separately at the end + if selection == 'random': + featurelist = random_state.permutation(n_features) + else: + featurelist = np.arange(n_features) + for j in featurelist: + # minimize_z: a z + 1/2 b z^2 + c |d+z| + # a = A_j + # b = B_jj > 0 + # c = |P1_j| = P1_j > 0, see 1.3 + # d = w_j + d_j + # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) + # with beta = z+d, beta_hat = d-a/b and gamma = c/b + # z = 1/b * S(bd-a,c) - d + # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding + jdx = j+idx # index for arrays containing entries for intercept + a = A[jdx] + if diag_fisher: + # Note: fisher is ndarray of shape (n_samples,) => no idx + # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway + Bj = np.zeros_like(A) + if intercept: + Bj[0] = fisher.sum() + if sparse.issparse(X): + Bj[idx:] = _safe_toarray(X[:, j].transpose() @ + X.multiply(fisher[:, np.newaxis]) + ).ravel() + else: + Bj[idx:] = (fisher * X[:, j]) @ X + + if P2.ndim == 1: + Bj[idx:] += P2[j] + else: + if sparse.issparse(P2): + # slice columns as P2 is csc + Bj[idx:] += P2[:, j].toarray().ravel() + else: + Bj[idx:] += P2[:, j] + b = Bj[jdx] + else: + b = B[jdx, jdx] + + # those ten lines aree what it is all about + if b <= 0: + z = 0 + elif P1[j] == 0: + z = -a/b + elif a + P1[j] < b * (coef[jdx] + d[jdx]): + z = -(a + P1[j])/b + elif a - P1[j] > b * (coef[jdx] + d[jdx]): + z = -(a - P1[j])/b + else: + z = -(coef[jdx] + d[jdx]) + + # update direction d + d[jdx] += z + # update A because d_j is now d_j+z + # A = f'(w) + d*H(w) + (w+d)*P2 + # => A += (H+P2)*e_j z = B_j * z + # Note: B is symmetric B = B.transpose + if diag_fisher: + # Bj = B[:, j] calculated above, still valid + A += Bj * z + else: + # B is symmetric, C- or F-contiguous, but never sparse + if B.flags['F_CONTIGUOUS']: + # slice columns like for sparse csc + A += B[:, jdx] * z + else: # B.flags['C_CONTIGUOUS'] might be true + # slice rows + A += B[jdx, :] * z + # end of cycle over features + # update intercept + if intercept: + if diag_fisher: + Bj = np.zeros_like(A) + Bj[0] = fisher.sum() + Bj[1:] = fisher @ X + b = Bj[0] + else: + b = B[0, 0] + z = 0 if b <= 0 else -A[0]/b + d[0] += z + if diag_fisher: + A += Bj * z + else: + if B.flags['F_CONTIGUOUS']: + A += B[:, 0] * z + else: + A += B[0, :] * z + # end of complete cycle + # stopping criterion for inner loop + # sum_i(|minimum of norm of subgrad of q(d)_i|) + # subgrad q(d) = A + subgrad ||P1*(w+d)||_1 + mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= inner_tol: + if inner_iter == 1: + inner_tol = inner_tol/4. + break + # end of inner loop + return d, coef_P2, n_cycles, inner_tol + + +def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, + max_iter=100, max_inner_iter=1000, tol=1e-4, + selection='cyclic ', random_state=None, + diag_fisher=False, copy_X=True): + """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. + + The objective beeing minimized in the coefficients w=coef is:: + + F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 + + An Improved GLMNET for L1-regularized Logistic Regression: + + 1. Find optimal descent direction d by minimizing + min_d F(w+d) = min_d F(w+d) - F(w) + 2. Quadrdatic approximation of F(w+d)-F(w) = q(d): + using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: + q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + + ||P1*(w+d)||_1 - ||P1*w||_1 + Then minimize q(d): min_d q(d) + 3. Coordinate descent by updating coordinate j (d -> d+z*e_j): + min_z q(d+z*e_j) + = min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 + A = f'(w) + d*H(w) + (w+d)*P2 + B = H + P2 + + Repeat steps 1-3 until convergence. + Note: Use Fisher matrix instead of Hessian for H. 
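+    Note: After step 3, an Armijo-type backtracking line search with
+    parameters beta=0.5 and sigma=0.01 safeguards the update w -> w + la*d.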
+ Note: f' = -score, H = Fisher matrix + + Parameters + ---------- + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. + + X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) + Training data (with intercept included if present). If not sparse, + pass directly as Fortran-contiguous data to avoid + unnecessary memory duplication. + + y : ndarray, shape (n_samples,) + Target values. + + weights: ndarray, shape (n_samples,) + Sample weights with which the deviance is weighted. The weights must + bee normalized and sum to 1. + + P1 : {ndarray}, shape (n_features,) + The L1-penalty vector (=diagonal matrix) + + P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix). If a matrix is + passed, it must be symmetric. If X is sparse, P2 must also be sparse. + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + family : ExponentialDispersionModel + + link : Link + + max_iter : int, optional (default=100) + Maximum numer of outer (Newton) iterations. + + max_inner_iter : int, optional (default=1000) + Maximum number of iterations in each inner loop, i.e. max number of + cycles over all features per inner loop. + + tol : float, optional (default=1e-4) + Covergence criterion is + sum_i(|minimum of norm of subgrad of objective_i|)<=tol. + + selection : str, optional (default='cyclic') + If 'random', randomly chose features in inner loop. + + random_state : {int, RandomState instance, None}, optional (default=None) + + diag_fisher : boolean, optional (default=False) + ``False`` calculates full fisher matrix, ``True`` only diagonal matrix + s.t. fisher = X.T @ diag @ X. This saves storage but needs more + matrix-vector multiplications. + + copy_X : boolean, optional (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + Returns + ------- + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. + + n_iter : numer of outer iterations = newton iterations + + n_cycles : number of cycles over features + + References + ---------- + Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + X = check_array(X, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if P2.ndim == 2: + P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if sparse.issparse(X): + if not sparse.isspmatrix_csc(X): + raise ValueError("If X is sparse, it must be in csc format" + "; got (format={})".format(X.format)) + if not sparse.isspmatrix_csc(P2): + raise ValueError("If X is sparse, P2 must also be sparse csc" + "format. 
Got P2 not sparse.") + random_state = check_random_state(random_state) + # Note: we already set P2 = l2*P2, P1 = l1*P1 + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + n_iter = 0 # number of outer iterations + n_cycles = 0 # number of (complete) cycles over features + converged = False + n_samples, n_features = X.shape + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept + # line search parameters + (beta, sigma) = (0.5, 0.01) + # some precalculations + # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a + # 1d array representing a diagonal matrix. + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # set up space for search direction d for inner loop + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) + inner_tol = linalg.norm(inner_tol, ord=1) + # outer loop + while n_iter < max_iter: + n_iter += 1 + # initialize search direction d (to be optimized) with zero + d.fill(0) + # inner loop = _cd_cycle + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=max_inner_iter, selection=selection, + random_state=random_state, diag_fisher=diag_fisher) + # line search by sequence beta^k, k=0, 1, .. + # F(w + lambda d) - F(w) <= lambda * bound + # bound = sigma * (f'(w)*d + w*P2*d + # +||P1 (w+d)||_1 - ||P1 w||_1) + P1w_1 = linalg.norm(P1 * coef[idx:], ord=1) + P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1) + # Note: coef_P2 already calculated and still valid + bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1) + Fw = (0.5 * family.deviance(y, mu, weights) + + 0.5 * (coef_P2 @ coef[idx:]) + P1w_1) + la = 1./beta + for k in range(20): + la *= beta # starts with la=1 + coef_wd = coef + la * d + mu_wd = link.inverse(_safe_lin_pred(X, coef_wd)) + Fwd = (0.5 * family.deviance(y, mu_wd, weights) + + linalg.norm(P1 * coef_wd[idx:], ord=1)) + if P2.ndim == 1: + Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:]) + else: + Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:])) + if Fwd - Fw <= sigma * la * bound: + break + # update coefficients + # coef_old = coef.copy() + coef += la * d + # calculate eta, mu, score, Fisher matrix for next iteration + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # stopping criterion for outer loop + # sum_i(|minimum-norm of subgrad of F(w)_i|) + # fp_wP2 = f'(w) + w*P2 + # Note: eta, mu and score are already updated + mn_subgrad = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= tol: + converged = True + break + # end of outer loop + if not converged: + warnings.warn("Coordinate descent failed to converge. Increase" + " the maximum number of iterations max_iter" + " (currently {0})".format(max_iter), ConvergenceWarning) + + return coef, n_iter, n_cycles + + +class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): + """Regression via a Generalized Linear Model (GLM) with penalties. + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as mu=h(X*w). 
Therefore, + the fit minimizes the following objective function with combined L1 and L2 + priors as regularizer:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + + alpha * l1_ratio * ||P1*w||_1 + + 1/2 * alpha * (1 - l1_ratio) * w*P2*w + + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + For ``P1=P2='identity'``, the penalty is the elastic net:: + + alpha * l1_ratio * ||w||_1 + + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 + + If you are interested in controlling the L1 and L2 penalties + separately, keep in mind that this is equivalent to:: + + a * L1 + b * L2 + + where:: + + alpha = a + b and l1_ratio = a / (a + b) + + The parameter ``l1_ratio`` corresponds to alpha in the R package glmnet, + while ``alpha`` corresponds to the lambda parameter in glmnet. + Specifically, l1_ratio = 1 is the lasso penalty. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms und thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + l1_ratio : float, optional (default=0) + The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For + ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it + is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a + combination of L1 and L2. + + P1 : {'identity', array-like}, shape (n_features,), optional \ + (default='identity') + With this array, you can exclude coefficients from the L1 penalty. + Set the corresponding value to 1 (include) or 0 (exclude). The + default value ``'identity'`` is the same as a 1d array of ones. + Note that n_features = X.shape[1]. + + P2 : {'identity', array-like, sparse matrix}, shape \ + (n_features,) or (n_features, n_features), optional \ + (default='identity') + With this option, you can set the P2 matrix in the L2 penalty `w*P2*w`. + This gives a fine control over this penalty (Tikhonov regularization). + A 2d array is directly used as the square matrix P2. A 1d array is + interpreted as diagonal (square) matrix. The default 'identity' sets + the identity matrix, which gives the usual squared L2-norm. If you just + want to exclude certain coefficients, pass a 1d array filled with 1, + and 0 for the coefficients to be excluded. + Note that P2 must be positive semi-definite. + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian', 'binomial'} \ + or an instance of class ExponentialDispersionModel, \ + optional(default='normal') + The distributional assumption of the GLM, i.e. which distribution from + the EDM, specifies the loss function to be minimized. + + link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, \ + optional (default='auto') + The link function of the GLM, i.e. mapping from linear predictor + (X*coef) to expectation (mu). 
Option 'auto' sets the link depending on + the chosen family as follows: + + - 'identity' for family 'normal' + + - 'log' for families 'poisson', 'gamma', 'inverse.gaussian' + + - 'logit' for family 'binomial' + + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) + Method for estimation of the dispersion parameter phi. Whether to use + the chi squared statisic or the deviance statistic. If None, the + dispersion is not estimated. + + solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ + optional (default='auto') + Algorithm to use in the optimization problem: + + 'auto' + Sets 'irls' if l1_ratio equals 0, else 'cd'. + + 'cd' + Coordinate descent algorithm. It can deal with L1 as well as L2 + penalties. Note that in order to avoid unnecessary memory + duplication of X in the ``fit`` method, X should be directly passed + as a Fortran-contiguous numpy array or sparse csc matrix. + + 'irls' + Iterated reweighted least squares. + It is the standard algorithm for GLMs. It cannot deal with + L1 penalties. + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. It cannot deal with L1 penalties. + + 'newton-cg', 'lbfgs' + Newton conjugate gradient algorithm cannot deal with L1 penalties. + + Note that all solvers except lbfgs use the fisher matrix, i.e. the + expected Hessian instead of the Hessian matrix. + + max_iter : int, optional (default=100) + The maximal number of iterations for solver algorithms. + + tol : float, optional (default=1e-4) + Stopping criterion. For the irls, newton-cg and lbfgs solvers, + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative) of + the objective function. For the cd solver, covergence is reached + when ``sum_i(|minimum-norm of g_i|)``, where ``g_i`` is the + subgradient of the objective and minimum-norm of ``g_i`` is the element + of the subgradient ``g_i`` with the smallest L2-norm. + + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` (supersedes option + ``start_params``). If set to ``True`` or if the attribute ``coef_`` + does not exit (first call to ``fit``), option ``start_params`` sets the + start values for ``coef_`` and ``intercept_``. + + start_params : {'guess', 'zero', array of shape (n_features*, )}, \ + optional (default='guess') + Relevant only if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). + + 'guess' + Start values of mu are calculated by family.starting_mu(..). Then, + one Newton step obtains start values for ``coef_``. If + ``solver='irls'``, it uses one irls step, else the Newton step is + calculated by the cd solver. + This gives usually good starting values. + + 'zero' + All coefficients are set to zero. If ``fit_intercept=True``, the + start value for the intercept is obtained by the weighted average of y. + + array + The array of size n_features* is directly used as start values + for ``coef_``. If ``fit_intercept=True``, the first element + is assumed to be the start value for the ``intercept_``. + Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes + the intercept in counting. + + selection : str, optional (default='cyclic') + For the solver 'cd' (coordinate descent), the coordinates (features) + can be updated in either cyclic or random order. 
+ If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially in the same order. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + random_state : {int, RandomState instance, None}, optional (default=None) + The seed of the pseudo random number generator that selects a random + feature to be updated for solver 'cd' (coordinate descent). + If int, random_state is the seed used by the random + number generator; if RandomState instance, random_state is the random + number generator; if None, the random number generator is the + RandomState instance used by `np.random`. Used when ``selection`` == + 'random'. + + diag_fisher : boolean, optional, (default=False) + Only relevant for solver 'cd' (see also ``start_params='guess'``). + If ``False``, the full Fisher matrix (expected Hessian) is computed in + each outer iteration (Newton iteration). If ``True``, only a diagonal + matrix (stored as 1d array) is computed, such that + fisher = X.T @ diag @ X. This saves memory and matrix-matrix + multiplications, but needs more matrix-vector multiplications. If you + use large sparse X or if you have many features, + i.e. n_features >> n_samples, you might set this option to ``True``. + + copy_X : boolean, optional, (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + check_input : boolean, optional (default=True) + Allow to bypass several checks on input: y values in range of family, + sample_weight non-negative, P2 positive semi-definite. + Don't use this parameter unless you know what you do. + + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array, shape (n_features,) + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. + + n_iter_ : int + Actual number of iterations used in solver. + + Notes + ----- + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + :ref:`User Guide `. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. + + For alpha > 0, the feature matrix X should be standardized in order to + penalize features equally strong. Call + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. + + If the target y is a ratio, appropriate sample weights s should be + provided. + As an example, consider Poission distributed counts z (integers) and + weights s=exposure (time, money, persons years, ...). Then you fit + y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + sample_weight=s)``. The weights are necessary for the right (finite + sample) mean. + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + in this case one might say that y has a 'scaled' Poisson distributions. + The same holds for other distributions. 
+ + References + ---------- + For the coordinate descent implementation: + * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', + fit_intercept=True, family='normal', link='auto', + fit_dispersion=None, solver='auto', max_iter=100, + tol=1e-4, warm_start=False, start_params='guess', + selection='cyclic', random_state=None, diag_fisher=False, + copy_X=True, check_input=True, verbose=0): + self.alpha = alpha + self.l1_ratio = l1_ratio + self.P1 = P1 + self.P2 = P2 + self.fit_intercept = fit_intercept + self.family = family + self.link = link + self.fit_dispersion = fit_dispersion + self.solver = solver + self.max_iter = max_iter + self.tol = tol + self.warm_start = warm_start + self.start_params = start_params + self.selection = selection + self.random_state = random_state + self.diag_fisher = diag_fisher + self.copy_X = copy_X + self.check_input = check_input + self.verbose = verbose + + def fit(self, X, y, sample_weight=None): + """Fit a Generalized Linear Model. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. + + y : array-like, shape (n_samples,) + Target values. + + sample_weight : {None, array-like}, shape (n_samples,),\ + optinal (default=None) + Individual weights w_i for each sample. Note that for an + Exponential Dispersion Model (EDM), one has + Var[Y_i]=phi/w_i * v(mu). + If Y_i ~ EDM(mu, phi/w_i), then + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a + weighted average with weights=sample_weight. + + Returns + ------- + self : returns an instance of self. + """ + ####################################################################### + # 1. 
input validation # + ####################################################################### + # 1.1 validate arguments of __init__ ################################## + # Guarantee that self._family_instance is an instance of class + # ExponentialDispersionModel + if isinstance(self.family, ExponentialDispersionModel): + self._family_instance = self.family + else: + if self.family == 'normal': + self._family_instance = NormalDistribution() + elif self.family == 'poisson': + self._family_instance = PoissonDistribution() + elif self.family == 'gamma': + self._family_instance = GammaDistribution() + elif self.family == 'inverse.gaussian': + self._family_instance = InverseGaussianDistribution() + elif self.family == 'binomial': + self._family_instance = BinomialDistribution() + else: + raise ValueError( + "The family must be an instance of class" + " ExponentialDispersionModel or an element of" + " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " + "'binomial']; got (family={0})".format(self.family)) + + # Guarantee that self._link_instance is set to an instance of + # class Link + if isinstance(self.link, Link): + self._link_instance = self.link + else: + if self.link == 'auto': + if isinstance(self._family_instance, TweedieDistribution): + if self._family_instance.power <= 0: + self._link_instance = IdentityLink() + if self._family_instance.power >= 1: + self._link_instance = LogLink() + elif isinstance(self._family_instance, + GeneralizedHyperbolicSecant): + self._link_instance = IdentityLink() + elif isinstance(self._family_instance, BinomialDistribution): + self._link_instance = LogitLink() + else: + raise ValueError("No default link known for the " + "specified distribution family. Please " + "set link manually, i.e. not to 'auto'; " + "got (link='auto', family={}" + .format(self.family)) + elif self.link == 'identity': + self._link_instance = IdentityLink() + elif self.link == 'log': + self._link_instance = LogLink() + elif self.link == 'logit': + self._link_instance = LogitLink() + else: + raise ValueError( + "The link must be an instance of class Link or " + "an element of ['auto', 'identity', 'log', 'logit']; " + "got (link={0})".format(self.link)) + + # validate further arguments + if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: + raise ValueError("Penalty term must be a non-negative number;" + " got (alpha={0})".format(self.alpha)) + if (not isinstance(self.l1_ratio, numbers.Number) or + self.l1_ratio < 0 or self.l1_ratio > 1): + raise ValueError("l1_ratio must be a number in interval [0, 1];" + " got (l1_ratio={0})".format(self.l1_ratio)) + if not isinstance(self.fit_intercept, bool): + raise ValueError("The argument fit_intercept must be bool;" + " got {0}".format(self.fit_intercept)) + if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: + raise ValueError("GeneralizedLinearRegressor supports only solvers" + " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';" + " got {0}".format(self.solver)) + solver = self.solver + if self.solver == 'auto': + if self.l1_ratio == 0: + solver = 'irls' + else: + solver = 'cd' + if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): + raise ValueError("The chosen solver (solver={0}) can't deal " + "with L1 penalties, which are included with " + "(alpha={1}) and (l1_ratio={2})." 
+ .format(solver, self.alpha, self.l1_ratio)) + if (not isinstance(self.max_iter, int) + or self.max_iter <= 0): + raise ValueError("Maximum number of iteration must be a positive " + "integer;" + " got (max_iter={0!r})".format(self.max_iter)) + if not isinstance(self.tol, numbers.Number) or self.tol <= 0: + raise ValueError("Tolerance for stopping criteria must be " + "positive; got (tol={0!r})".format(self.tol)) + if not isinstance(self.warm_start, bool): + raise ValueError("The argument warm_start must be bool;" + " got {0}".format(self.warm_start)) + if self.selection not in ['cyclic', 'random']: + raise ValueError("The argument selection must be 'cyclic' or " + "'random'; got (selection={0})" + .format(self.selection)) + random_state = check_random_state(self.random_state) + if not isinstance(self.diag_fisher, bool): + raise ValueError("The argument diag_fisher must be bool;" + " got {0}".format(self.diag_fisher)) + if not isinstance(self.copy_X, bool): + raise ValueError("The argument copy_X must be bool;" + " got {0}".format(self.copy_X)) + if not isinstance(self.check_input, bool): + raise ValueError("The argument check_input must be bool; got " + "(check_input={0})".format(self.check_input)) + + family = self._family_instance + link = self._link_instance + + # 1.2 validate arguments of fit ####################################### + _dtype = [np.float64, np.float32] + if solver == 'cd': + _stype = ['csc'] + else: + _stype = ['csc', 'csr'] + X, y = check_X_y(X, y, accept_sparse=_stype, + dtype=_dtype, y_numeric=True, multi_output=False, + copy=self.copy_X) + # Without converting y to float, deviance might raise + # ValueError: Integers to negative integer powers are not allowed. + # Also, y must not be sparse. + y = np.asarray(y, dtype=np.float64) + + weights = _check_weights(sample_weight, y.shape[0]) + + n_samples, n_features = X.shape + + # 1.3 arguments to take special care ################################## + # P1, P2, start_params + if isinstance(self.P1, str) and self.P1 == 'identity': + P1 = np.ones(n_features) + else: + P1 = np.atleast_1d(self.P1) + try: + P1 = P1.astype(np.float64, casting='safe', copy=False) + except TypeError: + raise TypeError("The given P1 cannot be converted to a numeric" + "array; got (P1.dtype={0})." + .format(P1.dtype)) + if (P1.ndim != 1) or (P1.shape[0] != n_features): + raise ValueError("P1 must be either 'identity' or a 1d array " + "with the length of X.shape[1]; " + "got (P1.shape[0]={0}), " + "needed (X.shape[1]={1})." + .format(P1.shape[0], n_features)) + # If X is sparse, make P2 sparse, too. 
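+        # Example of a non-trivial P2 (illustration only): a first-order
+        # difference penalty sum_j (w_{j+1} - w_j)**2 over ordered features
+        # corresponds to passing
+        #     D = np.diff(np.eye(n_features), axis=0)
+        #     P2 = D.T @ D
+        # Any symmetric positive semi-definite matrix is a valid choice.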
+ if isinstance(self.P2, str) and self.P2 == 'identity': + if sparse.issparse(X): + P2 = (sparse.dia_matrix((np.ones(n_features), 0), + shape=(n_features, n_features))).tocsc() + else: + P2 = np.ones(n_features) + else: + P2 = check_array(self.P2, copy=True, + accept_sparse=_stype, + dtype=_dtype, ensure_2d=False) + if P2.ndim == 1: + P2 = np.asarray(P2) + if P2.shape[0] != n_features: + raise ValueError("P2 should be a 1d array of shape " + "(n_features,) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0},)), needed ({1},)" + .format(P2.shape[0], X.shape[1])) + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(n_features, n_features))).tocsc() + elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and + P2.shape[0] == X.shape[1]): + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(n_features, n_features))).tocsc() + else: + raise ValueError("P2 must be either None or an array of shape " + "(n_features, n_features) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0}, {1})), needed ({2}, {2})" + .format(P2.shape[0], P2.shape[1], X.shape[1])) + + start_params = self.start_params + if isinstance(start_params, str): + if start_params not in ['guess', 'zero']: + raise ValueError("The argument start_params must be 'guess', " + "'zero' or an array of correct length; " + "got(start_params={0})".format(start_params)) + else: + start_params = check_array(start_params, accept_sparse=False, + force_all_finite=True, ensure_2d=False, + dtype=_dtype, copy=True) + if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or + (start_params.ndim != 1)): + raise ValueError("Start values for parameters must have the" + "right length and dimension; required (length" + "={0}, ndim=1); got (length={1}, ndim={2})." + .format(X.shape[1] + self.fit_intercept, + start_params.shape[0], + start_params.ndim)) + + l1 = self.alpha * self.l1_ratio + l2 = self.alpha * (1 - self.l1_ratio) + # P1 and P2 are now for sure copies + P1 = l1 * P1 + P2 = l2 * P2 + # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') + # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric + if P2.ndim == 2: + if sparse.issparse(P2): + if sparse.isspmatrix_csc(P2): + P2 = 0.5 * (P2 + P2.transpose()).tocsc() + else: + P2 = 0.5 * (P2 + P2.transpose()).tocsr() + else: + P2 = 0.5 * (P2 + P2.T) + + # For coordinate descent, if X is sparse, P2 must also be csc + if solver == 'cd' and sparse.issparse(X): + P2 = sparse.csc_matrix(P2) + + # 1.4 additional validations ########################################## + if self.check_input: + if not np.all(family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}" + .format(family.__class__.__name__)) + # check if P1 has only non-negative values, negative values might + # indicate group lasso in the future. 
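+            # Note: P1 was already multiplied by l1 = alpha * l1_ratio >= 0
+            # above, which does not change signs, so this check still
+            # reflects the user-supplied P1 (e.g. P1=[1., 1., 0.] applies
+            # the L1 penalty to the first two coefficients only).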
+ if not isinstance(self.P1, str): # if self.P1 != 'identity': + if not np.all(P1 >= 0): + raise ValueError("P1 must not have negative values.") + # check if P2 is positive semidefinite + # np.linalg.cholesky(P2) 'only' asserts positive definite + if not isinstance(self.P2, str): # self.P2 != 'identity' + # due to numerical precision, we allow eigenvalues to be a + # tiny bit negative + epsneg = -10 * np.finfo(P2.dtype).epsneg + if P2.ndim == 1 or P2.shape[0] == 1: + p2 = P2 + if sparse.issparse(P2): + p2 = P2.toarray() + if not np.all(p2 >= 0): + raise ValueError("1d array P2 must not have negative " + "values.") + elif sparse.issparse(P2): + # for sparse matrices, not all eigenvals can be computed + # efficiently, use only half of n_features + # k = how many eigenvals to compute + k = np.min([10, n_features // 10 + 1]) + sigma = 0 # start searching near this value + which = 'SA' # find smallest algebraic eigenvalues first + if not np.all(splinalg.eigsh(P2, k=k, sigma=sigma, + which=which) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") + else: + if not np.all(linalg.eigvalsh(P2) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") + # TODO: if alpha=0 check that X is not rank deficient + # TODO: what else to check? + + ####################################################################### + # 2. rescaling of weights (sample_weight) # + ####################################################################### + # IMPORTANT NOTE: Since we want to minimize + # 1/(2*sum(sample_weight)) * deviance + L1 + L2, + # deviance = sum(sample_weight * unit_deviance), + # we rescale weights such that sum(weights) = 1 and this becomes + # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights_sum = np.sum(weights) + weights = weights/weights_sum + + ####################################################################### + # 3. initialization of coef = (intercept_, coef_) # + ####################################################################### + # Note: Since phi=self.dispersion_ does not enter the estimation + # of mu_i=E[y_i], set it to 1. 
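+        # Possible initializations of coef, in this order of precedence:
+        #   1. warm_start=True with a previous fit: reuse coef_/intercept_.
+        #   2. start_params='guess': mu = family.starting_mu(y), then one
+        #      Newton step (a cd cycle or an irls step, depending on solver).
+        #   3. start_params='zero': coef = 0 and, if fit_intercept=True,
+        #      intercept = link(weighted mean of y).
+        #   4. start_params given as array: use these values directly.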
+ + # set start values for coef + coef = None + if self.warm_start and hasattr(self, 'coef_'): + if self.fit_intercept: + coef = np.concatenate((np.array([self.intercept_]), + self.coef_)) + else: + coef = self.coef_ + elif isinstance(start_params, str): + if start_params == 'guess': + # Set mu=starting_mu of the family and do one Newton step + # If solver=cd use cd, else irls + mu = family.starting_mu(y, weights=weights) + eta = link.link(mu) # linear predictor + if solver in ['cd', 'lbfgs', 'newton-cg']: + # see function _cd_solver + sigma_inv = 1/family.variance(mu, phi=1, weights=weights) + d1 = link.inverse_derivative(eta) + temp = sigma_inv * d1 * (y - mu) + if self.fit_intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as X.T @ temp + + d2_sigma_inv = d1 * d1 * sigma_inv + diag_fisher = self.diag_fisher + if diag_fisher: + fisher = d2_sigma_inv + else: + fisher = \ + _safe_sandwich_dot(X, d2_sigma_inv, + intercept=self.fit_intercept) + # set up space for search direction d for inner loop + if self.fit_intercept: + coef = np.zeros(n_features+1) + else: + coef = np.zeros(n_features) + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + # use less restrictive tolerance for initial guess + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, + P1=P1) + inner_tol = 4 * linalg.norm(inner_tol, ord=1) + # just one outer loop = Newton step + n_cycles = 0 + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, + inner_tol, max_inner_iter=1000, + selection=self.selection, + random_state=self.random_state, + diag_fisher=self.diag_fisher) + coef += d # for simplicity no line search here + else: + # See _irls_solver + # h'(eta) + hp = link.inverse_derivative(eta) + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = (hp**2 / family.variance(mu, phi=1, weights=weights)) + # working observations + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(X, W, P2, z, + fit_intercept=self.fit_intercept) + else: # start_params == 'zero' + if self.fit_intercept: + coef = np.zeros(n_features+1) + coef[0] = link.link(np.average(y, weights=weights)) + else: + coef = np.zeros(n_features) + else: # assign given array as start values + coef = start_params + + ####################################################################### + # 4. fit # + ####################################################################### + # algorithms for optimiation + # TODO: Parallelize it? 
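+        # All solver branches below minimize the same objective,
+        #     1/2 * deviance(y, mu; weights) + 1/2 * w' P2 w (+ ||P1 w||_1
+        # for the cd solver); weights were rescaled to sum to 1 and P1, P2
+        # already carry the alpha and l1_ratio factors.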
+
+        # 4.1 IRLS ############################################################
+        # Note: we already set P2 = l2*P2, see above
+        # Note: we already symmetrized P2 = 1/2 (P2 + P2')
+        if solver == 'irls':
+            coef, self.n_iter_ = \
+                _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2,
+                             fit_intercept=self.fit_intercept, family=family,
+                             link=link, max_iter=self.max_iter, tol=self.tol)
+
+        # 4.2 L-BFGS ##########################################################
+        elif solver == 'lbfgs':
+            def func(coef, X, y, weights, P2, family, link):
+                mu, devp = \
+                    family._mu_deviance_derivative(coef, X, y, weights, link)
+                dev = family.deviance(y, mu, weights)
+                intercept = (coef.size == X.shape[1] + 1)
+                idx = 1 if intercept else 0  # offset if coef[0] is intercept
+                if P2.ndim == 1:
+                    L2 = P2 * coef[idx:]
+                else:
+                    L2 = P2 @ coef[idx:]
+                obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2)
+                objp = 0.5 * devp
+                objp[idx:] += L2
+                return obj, objp
+
+            args = (X, y, weights, P2, family, link)
+            coef, loss, info = fmin_l_bfgs_b(
+                func, coef, fprime=None, args=args,
+                iprint=(self.verbose > 0) - 1, pgtol=self.tol,
+                maxiter=self.max_iter, factr=1e3)
+            if self.verbose > 0:
+                if info["warnflag"] == 1:
+                    warnings.warn("lbfgs failed to converge."
+                                  " Increase the number of iterations.",
+                                  ConvergenceWarning)
+                elif info["warnflag"] == 2:
+                    warnings.warn("lbfgs failed for the reason: {0}"
+                                  .format(info["task"]))
+            self.n_iter_ = info['nit']
+
+        # 4.3 Newton-CG #######################################################
+        # We again use the fisher matrix instead of the hessian, i.e. the
+        # expected hessian of the deviance.
+        elif solver == 'newton-cg':
+            def func(coef, X, y, weights, P2, family, link):
+                intercept = (coef.size == X.shape[1] + 1)
+                idx = 1 if intercept else 0  # offset if coef[0] is intercept
+                if P2.ndim == 1:
+                    L2 = coef[idx:] @ (P2 * coef[idx:])
+                else:
+                    L2 = coef[idx:] @ (P2 @ coef[idx:])
+                mu = link.inverse(_safe_lin_pred(X, coef))
+                return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2
+
+            def grad(coef, X, y, weights, P2, family, link):
+                mu, devp = \
+                    family._mu_deviance_derivative(coef, X, y, weights, link)
+                intercept = (coef.size == X.shape[1] + 1)
+                idx = 1 if intercept else 0  # offset if coef[0] is intercept
+                if P2.ndim == 1:
+                    L2 = P2 * coef[idx:]
+                else:
+                    L2 = P2 @ coef[idx:]
+                objp = 0.5 * devp
+                objp[idx:] += L2
+                return objp
+
+            def grad_hess(coef, X, y, weights, P2, family, link):
+                intercept = (coef.size == X.shape[1] + 1)
+                idx = 1 if intercept else 0  # offset if coef[0] is intercept
+                if P2.ndim == 1:
+                    L2 = P2 * coef[idx:]
+                else:
+                    L2 = P2 @ coef[idx:]
+                eta = _safe_lin_pred(X, coef)
+                mu = link.inverse(eta)
+                d1 = link.inverse_derivative(eta)
+                temp = d1 * family.deviance_derivative(y, mu, weights)
+                if intercept:
+                    grad = np.concatenate(([0.5 * temp.sum()],
+                                           0.5 * temp @ X + L2))
+                else:
+                    grad = 0.5 * temp @ X + L2  # same as 0.5 * X.T @ temp + L2
+
+                # expected hessian = fisher = X.T @ diag_matrix @ X
+                # calculate only diag_matrix
+                diag = d1**2 / family.variance(mu, phi=1, weights=weights)
+                if intercept:
+                    h0i = np.concatenate(([diag.sum()], diag @ X))
+
+                def Hs(coef):
+                    # return (0.5 * fisher + P2) @ coef
+                    # ret = 0.5 * (X.T @ (diag * (X @ coef)))
+                    ret = 0.5 * ((diag * (X @ coef[idx:])) @ X)
+                    if P2.ndim == 1:
+                        ret += P2 * coef[idx:]
+                    else:
+                        ret += P2 @ coef[idx:]
+                    if intercept:
+                        ret = np.concatenate(([0.5 * (h0i @ coef)],
+                                              ret + 0.5 * coef[0] * h0i[1:]))
+                    return ret
+
+                return grad, Hs
+
+            args = (X, y, weights, P2, family, link)
+            coef, self.n_iter_ = \
+                newton_cg(grad_hess, func, grad, coef, args=args,
+                          maxiter=self.max_iter, tol=self.tol)
+
+        # 4.4 coordinate descent ##############################################
+        # Note: we already set P1 = l1*P1, see above
+        # Note: we already set P2 = l2*P2, see above
+        # Note: we already symmetrized P2 = 1/2 (P2 + P2')
+        elif solver == 'cd':
+            coef, self.n_iter_, self._n_cycles = \
+                _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1,
+                           P2=P2, fit_intercept=self.fit_intercept,
+                           family=family, link=link,
+                           max_iter=self.max_iter, tol=self.tol,
+                           selection=self.selection, random_state=random_state,
+                           diag_fisher=self.diag_fisher, copy_X=self.copy_X)
+
+        #######################################################################
+        # 5. postprocessing                                                   #
+        #######################################################################
+        if self.fit_intercept:
+            self.intercept_ = coef[0]
+            self.coef_ = coef[1:]
+        else:
+            # set intercept to zero as the other linear models do
+            self.intercept_ = 0.
+            self.coef_ = coef
+
+        if self.fit_dispersion in ['chisqr', 'deviance']:
+            # attention: correct for the earlier rescaling of the weights
+            self.dispersion_ = self.estimate_phi(X, y, weights)*weights_sum
+
+        return self
+
+    def linear_predictor(self, X):
+        """Compute the linear_predictor = X*coef_ + intercept_.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        C : array, shape (n_samples,)
+            Returns predicted values of linear predictor.
+        """
+        check_is_fitted(self, "coef_")
+        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
+                        dtype='numeric', copy=True, ensure_2d=True,
+                        allow_nd=False)
+        return X @ self.coef_ + self.intercept_
+
+    def predict(self, X, sample_weight=None):
+        """Predict using GLM with feature matrix X.
+
+        If sample_weight is given, returns prediction*sample_weight.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            Samples.
+
+        sample_weight : {None, array-like}, shape (n_samples,), optional \
+                (default=None)
+
+        Returns
+        -------
+        C : array, shape (n_samples,)
+            Returns predicted values times sample_weight.
+        """
+        # TODO: Is copy=True necessary?
+        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
+                        dtype='numeric', copy=True, ensure_2d=True,
+                        allow_nd=False)
+        eta = self.linear_predictor(X)
+        mu = self._link_instance.inverse(eta)
+        weights = _check_weights(sample_weight, X.shape[0])
+
+        return mu*weights
+
+    def estimate_phi(self, X, y, sample_weight=None):
+        """Estimate/fit the dispersion parameter phi.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            Training data.
+
+        y : array-like, shape (n_samples,)
+            Target values.
+
+        sample_weight : {None, array-like}, shape (n_samples,), optional \
+                (default=None)
+            Sample weights.
+
+        Returns
+        -------
+        phi : float
+            Dispersion parameter.
+        """
+        check_is_fitted(self, "coef_")
+        _dtype = [np.float64, np.float32]
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
+                         dtype=_dtype, y_numeric=True, multi_output=False)
+        n_samples, n_features = X.shape
+        weights = _check_weights(sample_weight, n_samples)
+        eta = X @ self.coef_
+        if self.fit_intercept is True:
+            eta += self.intercept_
+            n_features += 1
+        if n_samples <= n_features:
+            raise ValueError("Estimation of dispersion parameter phi requires"
+                             " more samples than features, got"
+                             " samples=X.shape[0]={0} and"
+                             " n_features=X.shape[1]+fit_intercept={1}."
+ .format(n_samples, n_features)) + mu = self._link_instance.inverse(eta) + if self.fit_dispersion == 'chisqr': + chisq = np.sum(weights*(y-mu)**2 / + self._family_instance.unit_variance(mu)) + return chisq/(n_samples - n_features) + elif self.fit_dispersion == 'deviance': + dev = self._family_instance.deviance(y, mu, weights) + return dev/(n_samples - n_features) + + # Note: check_estimator(GeneralizedLinearRegressor) might raise + # "AssertionError: -0.28014056555724598 not greater than 0.5" + # unless GeneralizedLinearRegressor has a score which passes the test. + def score(self, X, y, sample_weight=None): + """Compute D^2, the percentage of deviance explained. + + D^2 is a generalization of the coefficient of determination R^2. + R^2 uses squared error and D^2 deviance. Note that those two are equal + for family='normal'. + + D^2 is defined as + :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, + :math:`D_{null}` is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. + The mean :math:`\\bar{y}` is averaged by sample_weight. + Best possible score is 1.0 and it can be negative (because the model + can be arbitrarily worse). + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Test samples. + + y : array-like, shape (n_samples,) + True values of target. + + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) + Sample weights. + + Returns + ------- + score : float + D^2 of self.predict(X) w.r.t. y. + """ + # Note, default score defined in RegressorMixin is R^2 score. + # TODO: make D^2 a score function in module metrics (and thereby get + # input validation and so on) + weights = _check_weights(sample_weight, y.shape[0]) + mu = self.predict(X) + dev = self._family_instance.deviance(y, mu, weights=weights) + y_mean = np.average(y, weights=weights) + dev_null = self._family_instance.deviance(y, y_mean, weights=weights) + return 1. 
- dev / dev_null diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py new file mode 100644 index 0000000000000..7cb3f4a5b5969 --- /dev/null +++ b/sklearn/linear_model/tests/test_glm.py @@ -0,0 +1,640 @@ +import numpy as np +from numpy.testing import assert_allclose +import pytest +import scipy as sp +from scipy import linalg, optimize, sparse + +from sklearn.datasets import make_classification, make_regression +from sklearn.linear_model.glm import ( + Link, + IdentityLink, + LogLink, + LogitLink, + TweedieDistribution, + NormalDistribution, PoissonDistribution, + GammaDistribution, InverseGaussianDistribution, + GeneralizedHyperbolicSecant, BinomialDistribution, + GeneralizedLinearRegressor) +from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge + +from sklearn.utils.testing import ( + assert_equal, assert_almost_equal, + assert_array_equal, assert_array_almost_equal) + + +@pytest.mark.parametrize('link', Link.__subclasses__()) +def test_link_properties(link): + """Test link inverse and derivative.""" + rng = np.random.RandomState(42) + x = rng.rand(100)*100 + link = link() # instatiate object + decimal = 10 + if isinstance(link, LogitLink): + # careful for large x, note expit(36) = 1 + # limit max eta to 15 + x = x / 100 * 15 + decimal = 8 + assert_almost_equal(link.link(link.inverse(x)), x, decimal=decimal) + # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) + assert_almost_equal(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x), decimal=decimal) + # for LogitLink, in the following x should be between 0 and 1. + # assert_almost_equal(link.inverse_derivative(link.link(x)), + # 1./link.derivative(x), decimal=decimal) + + +@pytest.mark.parametrize( + 'family, expected', + [(NormalDistribution(), [True, True, True]), + (PoissonDistribution(), [False, True, True]), + (TweedieDistribution(power=1.5), [False, True, True]), + (GammaDistribution(), [False, False, True]), + (InverseGaussianDistribution(), [False, False, True]), + (TweedieDistribution(power=4.5), [False, False, True])]) +def test_family_bounds(family, expected): + """Test the valid range of distributions at -1, 0, 1.""" + result = family.in_y_range([-1, 0, 1]) + assert_array_equal(result, expected) + + +@pytest.mark.parametrize( + 'family, chk_values', + [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), + (PoissonDistribution(), [0.1, 1.5]), + (GammaDistribution(), [0.1, 1.5]), + (InverseGaussianDistribution(), [0.1, 1.5]), + (TweedieDistribution(power=-2.5), [0.1, 1.5]), + (TweedieDistribution(power=-1), [0.1, 1.5]), + (TweedieDistribution(power=1.5), [0.1, 1.5]), + (TweedieDistribution(power=2.5), [0.1, 1.5]), + (TweedieDistribution(power=-4), [0.1, 1.5]), + (GeneralizedHyperbolicSecant(), [0.1, 1.5])]) +def test_deviance_zero(family, chk_values): + """Test deviance(y,y) = 0 for different families.""" + for x in chk_values: + assert_almost_equal(family.deviance(x, x), 0, decimal=10) + + +@pytest.mark.parametrize( + 'family, link', + [(NormalDistribution(), IdentityLink()), + (PoissonDistribution(), LogLink()), + (GammaDistribution(), LogLink()), + (InverseGaussianDistribution(), LogLink()), + (TweedieDistribution(power=1.5), LogLink()), + (TweedieDistribution(power=4.5), LogLink())]) +def test_fisher_matrix(family, link): + """Test the Fisher matrix numerically. 
+ Trick: Use numerical differentiation with y = mu""" + coef = np.array([-2, 1, 0, 1, 2.5]) + phi = 0.5 + rng = np.random.RandomState(42) + X = rng.randn(10, 5) + lin_pred = np.dot(X, coef) + mu = link.inverse(lin_pred) + weights = rng.randn(10)**2 + 1 + fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + approx = np.array([]).reshape(0, coef.shape[0]) + for i in range(coef.shape[0]): + def f(coef): + return -family._score(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link)[i] + approx = np.vstack( + [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) + assert_allclose(fisher, approx, rtol=1e-3) + + +def test_sample_weights_validation(): + """Test the raised errors in the validation of sample_weight.""" + # 1. scalar value but not positive + X = [[1]] + y = [1] + weights = 0 + glm = GeneralizedLinearRegressor(fit_intercept=False) + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 2. 2d array + weights = [[0]] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 3. 1d but wrong length + weights = [1, 0] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 4. 1d but only zeros (sum not greater than 0) + weights = [0, 0] + X = [[0], [1]] + y = [1, 2] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 5. 1d but weith a negative value + weights = [2, -1] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + +@pytest.mark.parametrize('f, fam', + [('normal', NormalDistribution()), + ('poisson', PoissonDistribution()), + ('gamma', GammaDistribution()), + ('inverse.gaussian', InverseGaussianDistribution()), + ('binomial', BinomialDistribution())]) +def test_glm_family_argument(f, fam): + """Test GLM family argument set as string.""" + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) + assert_equal(type(glm._family_instance), type(fam)) + + glm = GeneralizedLinearRegressor(family='not a family', + fit_intercept=False) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('l, link', + [('identity', IdentityLink()), + ('log', LogLink()), + ('logit', LogitLink())]) +def test_glm_link_argument(l, link): + """Test GLM link argument set as string.""" + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) + assert_equal(type(glm._link_instance), type(link)) + + glm = GeneralizedLinearRegressor(family='normal', link='not a link') + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('alpha', ['not a number', -4.2]) +def test_glm_alpha_argument(alpha): + """Test GLM for invalid alpha argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', alpha=alpha) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('l1_ratio', ['not a number', -4.2, 1.1, [1]]) +def test_glm_l1_ratio_argument(l1_ratio): + """Test GLM for invalid l1_ratio argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3], + [-1]]) +def test_glm_P1_argument(P1): + """Test GLM for invalid P1 argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = 
GeneralizedLinearRegressor(P1=P1, l1_ratio=0.5, check_input=True) + with pytest.raises((ValueError, TypeError)): + glm.fit(X, y) + + +@pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], + sparse.csr_matrix([1, 2, 3]), [-1]]) +def test_glm_P2_argument(P2): + """Test GLM for invalid P2 argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(P2=P2, check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + + +def test_glm_P2_positive_semidefinite(): + """Test GLM for a positive semi-definite P2 argument.""" + n_samples, n_features = 10, 5 + y = np.arange(n_samples) + X = np.zeros((n_samples, n_features)) + P2 = np.diag([100, 10, 5, 0, -1E-5]) + rng = np.random.RandomState(42) + # construct random orthogonal matrix Q + Q, R = linalg.qr(rng.randn(n_features, n_features)) + P2 = Q.T @ P2 @ Q + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + + P2 = sparse.csr_matrix(P2) + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) +def test_glm_fit_intercept_argument(fit_intercept): + """Test GLM for invalid fit_intercept argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('solver, l1_ratio', + [('not a solver', 0), (1, 0), ([1], 0), + ('irls', 0.5), ('lbfgs', 0.5), ('newton-cg', 0.5)]) +def test_glm_solver_argument(solver, l1_ratio): + """Test GLM for invalid solver argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(solver=solver, l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('max_iter', ['not a number', 0, -1, 5.5, [1]]) +def test_glm_max_iter_argument(max_iter): + """Test GLM for invalid max_iter argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(max_iter=max_iter) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('tol', ['not a number', 0, -1.0, [1e-3]]) +def test_glm_tol_argument(tol): + """Test GLM for invalid tol argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(tol=tol) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('warm_start', ['not bool', 1, 0, [True]]) +def test_glm_warm_start_argument(warm_start): + """Test GLM for invalid warm_start argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(warm_start=warm_start) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('start_params', + ['not a start_params', ['zero'], [0, 0, 0], + [[0, 0]], ['a', 'b']]) +def test_glm_start_params_argument(start_params): + """Test GLM for invalid start_params argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(start_params=start_params) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('selection', ['not a selection', 1, 0, ['cyclic']]) +def test_glm_selection_argument(selection): + """Test GLM for invalid selection argument""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(selection=selection) + with pytest.raises(ValueError): + 
glm.fit(X, y) + + +@pytest.mark.parametrize('random_state', ['a string', 0.5, [0]]) +def test_glm_random_state_argument(random_state): + """Test GLM for invalid random_state argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(random_state=random_state) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('diag_fisher', ['not bool', 1, 0, [True]]) +def test_glm_diag_fisher_argument(diag_fisher): + """Test GLM for invalid diag_fisher arguments.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) +def test_glm_copy_X_argument(copy_X): + """Test GLM for invalid copy_X arguments.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(copy_X=copy_X) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('check_input', ['not bool', 1, 0, [True]]) +def test_glm_check_input_argument(check_input): + """Test GLM for invalid check_input argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(check_input=check_input) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_glm_identiy_regression(solver): + """Test GLM regression with identity link on a simple dataset.""" + coef = [1, 2] + X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T + y = np.dot(X, coef) + glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', + fit_intercept=False, solver=solver, + start_params='zero', tol=1e-7) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + + +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecant()]) +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), + ('lbfgs', 1e-6, 6), + ('newton-cg', 1e-7, 6), + ('cd', 1e-7, 6)]) +def test_glm_log_regression(family, solver, tol, dec): + """Test GLM regression with log link on a simple dataset.""" + coef = [0.2, -0.1] + X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T + y = np.exp(np.dot(X, coef)) + glm = GeneralizedLinearRegressor( + alpha=0, family=family, link='log', fit_intercept=False, + solver=solver, start_params='guess', tol=tol) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef, decimal=dec) + + +@pytest.mark.filterwarnings('ignore::DeprecationWarning') +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 5), + ('lbfgs', 1e-6, 5), + ('newton-cg', 1e-5, 5), + ('cd', 1e-6, 6)]) +def test_normal_ridge(solver, tol, dec): + """Test ridge regression for Normal distributions. + + Compare to test_ridge in test_ridge.py. + """ + rng = np.random.RandomState(42) + alpha = 1.0 + + # 1. 
With more samples than features + n_samples, n_features, n_predict = 100, 7, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, + solver='svd', normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + solver='svd', normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng, + fit_dispersion='chisqr') + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + mu = glm.predict(X) + assert_almost_equal(glm.dispersion_, + np.sum((y-mu)**2/(n_samples-n_features))) + + # 2. With more features than samples and sparse + n_samples, n_features, n_predict = 10, 100, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, + solver='sag', normalize=False, max_iter=100000) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=tol, max_iter=300, solver=solver, + check_input=False, random_state=rng) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, + solver='sag', normalize=False, max_iter=1000) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=tol*2, max_iter=300, solver=solver, + check_input=False, random_state=rng) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) + + +@pytest.mark.parametrize('solver, tol, dec', + [('irls', 1e-7, 6), + ('lbfgs', 1e-7, 5), + ('newton-cg', 1e-7, 5), + ('cd', 1e-7, 7)]) +def test_poisson_ridge(solver, tol, dec): + """Test ridge regression with poisson family and LogLink. 
+ + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.12889386979 + # a 0.29019207995 + # b 0.03741173122 + X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + rng = np.random.RandomState(42) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, + fit_intercept=True, family='poisson', + link='log', tol=tol, + solver=solver, max_iter=300, + random_state=rng) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, + decimal=dec) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=dec) + + +@pytest.mark.parametrize('diag_fisher', [False, True]) +def test_normal_enet(diag_fisher): + """Test elastic net regression with normal/gaussian family.""" + alpha, l1_ratio = 0.3, 0.7 + n_samples, n_features = 20, 2 + rng = np.random.RandomState(42) + X = rng.randn(n_samples, n_features).copy(order='F') + beta = rng.randn(n_features) + y = 2 + np.dot(X, beta) + rng.randn(n_samples) + + # 1. test normal enet on dense data + glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, + family='normal', link='identity', + fit_intercept=True, tol=1e-8, + max_iter=100, selection='cyclic', + solver='cd', start_params='zero', + check_input=False, + diag_fisher=diag_fisher) + glm.fit(X, y) + + enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, + normalize=False, tol=1e-8, copy_X=True) + enet.fit(X, y) + + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + + # 2. test normal enet on sparse data + X = sparse.csc_matrix(X) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + + +def test_poisson_enet(): + """Test elastic net regression with poisson family and LogLink. + + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0.5, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.03550978409 + # a 0.16936423283 + # b . + glmnet_intercept = -0.03550978409 + glmnet_coef = [0.16936423283, 0.] 
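# ---------------------------------------------------------------------
# [Editor's note] Illustrative sketch, not part of the patch: the
# Poisson unit deviance that these glmnet comparisons rely on, written
# out directly; scipy's xlogy handles the y=0 case (0*log(0) -> 0).
import numpy as np
from scipy.special import xlogy


def poisson_deviance(y, mu, weights=1.0):
    # weighted sum of unit deviances 2*(y*log(y/mu) - y + mu)
    return np.sum(weights * 2 * (xlogy(y, y / mu) - y + mu))


y = np.array([0., 1., 1., 2.])              # same toy target as in the test below
mu = np.full_like(y, y.mean())              # intercept-only prediction
null_deviance = poisson_deviance(y, mu)
# ---------------------------------------------------------------------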
+ X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + rng = np.random.RandomState(42) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', solver='cd', tol=1e-8, + selection='random', random_state=rng, + start_params='guess') + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + + # test results with general optimization procedure + def obj(coef): + pd = PoissonDistribution() + link = LogLink() + N = y.shape[0] + mu = link.inverse(X @ coef[1:] + coef[0]) + alpha, l1_ratio = (1, 0.5) + return 1./(2.*N) * pd.deviance(y, mu) \ + + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ + + alpha * l1_ratio * np.sum(np.abs(coef[1:])) + res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, + options={'maxiter': 1000, 'disp': False}) + assert_almost_equal(glm.intercept_, res.x[0], decimal=5) + assert_almost_equal(glm.coef_, res.x[1:], decimal=5) + assert_almost_equal(obj(np.concatenate(([glm.intercept_], glm.coef_))), + res.fun, decimal=8) + + # same for start_params='zero' and selection='cyclic' + # with reduced precision + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', solver='cd', tol=1e-5, + selection='cyclic', start_params='zero') + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + # check warm_start, therefore start with different alpha + glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, + family='poisson', max_iter=300, + link='log', solver='cd', tol=1e-5, + selection='cyclic', start_params='zero') + glm.fit(X, y) + # warm start with original alpha and use of sparse matrices + glm.warm_start = True + glm.alpha = 1 + X = sparse.csr_matrix(X) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + +@pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) +def test_binomial_enet(alpha): + """Test elastic net regression with binomial family and LogitLink. + + Compare to LogisticRegression. 
+ """ + l1_ratio = 0.5 + n_samples = 500 + rng = np.random.RandomState(42) + X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, + n_informative=5, n_redundant=0, n_repeated=0, + random_state=rng) + log = LogisticRegression( + penalty='elasticnet', random_state=rng, fit_intercept=False, tol=1e-6, + max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), + solver='saga') + log.fit(X, y) + glm = GeneralizedLinearRegressor( + family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, + alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', + tol=1e-7) + glm.fit(X, y) + assert_almost_equal(log.intercept_[0], glm.intercept_, decimal=6) + assert_array_almost_equal(log.coef_[0, :], glm.coef_, decimal=6) From 9b574bdc0b3a4a1081f73563d0468c864fcbbb22 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 11 Jun 2019 07:30:52 -0500 Subject: [PATCH 058/209] Fix docstrings for the new print_changed_only=True by default --- doc/modules/linear_model.rst | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 52f28346cc047..b9f0e96734ae5 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -942,17 +942,11 @@ follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') - >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE - GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, - check_input=True, copy_X=True, diag_fisher=False, - family='poisson', fit_dispersion=None, - fit_intercept=True, l1_ratio=0, link='log', - max_iter=100, random_state=None, selection='cyclic', - solver='auto', start_params='guess', tol=0.0001, - verbose=0, warm_start=False) - >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) + GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> reg.coef_ array([0.24630169, 0.43373464]) - >>> reg.intercept_ #doctest: +ELLIPSIS + >>> reg.intercept_ -0.76383633... From 90299fdd8a4745965eb8f9dbe26ddb685f1531f1 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 12 Jun 2019 04:31:50 -0500 Subject: [PATCH 059/209] Increase coverage --- sklearn/linear_model/_glm.py | 22 +++----- sklearn/linear_model/tests/test_glm.py | 78 ++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 17 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 8152e84ac7253..055aacf26d747 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -182,7 +182,7 @@ def link(self, mu): mu : array, shape (n_samples,) Usually the (predicted) mean. """ - raise NotImplementedError + pass @abstractmethod def derivative(self, mu): @@ -193,7 +193,7 @@ def derivative(self, mu): mu : array, shape (n_samples,) Usually the (predicted) mean. """ - raise NotImplementedError + pass @abstractmethod def inverse(self, lin_pred): @@ -207,7 +207,7 @@ def inverse(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. """ - raise NotImplementedError + pass @abstractmethod def inverse_derivative(self, lin_pred): @@ -218,7 +218,7 @@ def inverse_derivative(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. 
""" - raise NotImplementedError + pass @abstractmethod def inverse_derivative2(self, lin_pred): @@ -229,7 +229,7 @@ def inverse_derivative2(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. """ - raise NotImplementedError + pass class IdentityLink(Link): @@ -400,7 +400,7 @@ def unit_variance(self, mu): mu : array, shape (n_samples,) Predicted mean. """ - raise NotImplementedError() + pass @abstractmethod def unit_variance_derivative(self, mu): @@ -413,7 +413,7 @@ def unit_variance_derivative(self, mu): mu : array, shape (n_samples,) Target values. """ - raise NotImplementedError() + pass def variance(self, mu, phi=1, weights=1): r"""Compute the variance function. @@ -473,7 +473,7 @@ def unit_deviance(self, y, mu): mu : array, shape (n_samples,) Predicted mean. """ - raise NotImplementedError() + pass def unit_deviance_derivative(self, y, mu): r"""Compute the derivative of the unit deviance w.r.t. mu. @@ -1079,7 +1079,6 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, B[idx:, idx:] += P2.toarray() else: B[idx:, idx:] += P2 - # A = -score + coef_P2 A = -score A[idx:] += coef_P2 # A += d @ (H+P2) but so far d=0 @@ -1302,9 +1301,6 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32], order='F', copy=copy_X) if sparse.issparse(X): - if not sparse.isspmatrix_csc(X): - raise ValueError("If X is sparse, it must be in csc format" - "; got (format={})".format(X.format)) if not sparse.isspmatrix_csc(P2): raise ValueError("If X is sparse, P2 must also be sparse csc" "format. Got P2 not sparse.") @@ -2021,7 +2017,7 @@ def fit(self, X, y, sample_weight=None): _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, max_inner_iter=1000, selection=self.selection, - random_state=self.random_state, + random_state=random_state, diag_fisher=self.diag_fisher) coef += d # for simplicity no line search here else: diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index d42a8739f6aa0..230bbdabae201 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -21,10 +21,20 @@ GeneralizedHyperbolicSecant, BinomialDistribution, ) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge +from sklearn.metrics import mean_absolute_error from sklearn.utils.testing import assert_array_equal +@pytest.fixture(scope="module") +def regression_data(): + X, y = make_regression(n_samples=107, + n_features=10, + n_informative=80, noise=0.5, + random_state=2) + return X, y + + @pytest.mark.parametrize('link', Link.__subclasses__()) def test_link_properties(link): """Test link inverse and derivative.""" @@ -39,6 +49,10 @@ def test_link_properties(link): # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) assert_allclose(link.derivative(link.inverse(x)), 1./link.inverse_derivative(x)) + + assert ( + link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) + # for LogitLink, in the following x should be between 0 and 1. # assert_almost_equal(link.inverse_derivative(link.link(x)), # 1./link.derivative(x), decimal=decimal) @@ -108,7 +122,7 @@ def f(coef): def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" - # 1. scalar value but not positive + # scalar value but not positive X = [[1]] y = [1] weights = 0 @@ -116,17 +130,20 @@ def test_sample_weights_validation(): with pytest.raises(ValueError): glm.fit(X, y, weights) - # 2. 
2d array + # Positive weights are accepted + glm.fit(X, y, sample_weight=1) + + # 2d array weights = [[0]] with pytest.raises(ValueError): glm.fit(X, y, weights) - # 3. 1d but wrong length + # 1d but wrong length weights = [1, 0] with pytest.raises(ValueError): glm.fit(X, y, weights) - # 4. 1d but only zeros (sum not greater than 0) + # 1d but only zeros (sum not greater than 0) weights = [0, 0] X = [[0], [1]] y = [1, 2] @@ -643,3 +660,56 @@ def test_binomial_enet(alpha): glm.fit(X, y) assert_allclose(log.intercept_[0], glm.intercept_, rtol=1e-6) assert_allclose(log.coef_[0, :], glm.coef_, rtol=5e-6) + + +@pytest.mark.parametrize( + "params", + [ + {"solver": "irls", "start_params": "guess"}, + {"solver": "irls", "start_params": "zero"}, + {"solver": "lbfgs", "start_params": "guess"}, + {"solver": "lbfgs", "start_params": "zero"}, + {"solver": "newton-cg"}, + {"solver": "cd", "selection": "cyclic", "diag_fisher": False}, + {"solver": "cd", "selection": "cyclic", "diag_fisher": True}, + {"solver": "cd", "selection": "random", "diag_fisher": False}, + ], + ids=lambda params: ', '.join("%s=%s" % (key, val) + for key, val in params.items()) +) +def test_solver_equivalence(params, regression_data): + X, y = regression_data + est_ref = GeneralizedLinearRegressor(random_state=2) + est_ref.fit(X, y) + + estimator = GeneralizedLinearRegressor(**params) + estimator.set_params(random_state=2) + + estimator.fit(X, y) + + assert_allclose(estimator.intercept_, est_ref.intercept_, rtol=1e-4) + assert_allclose(estimator.coef_, est_ref.coef_, rtol=1e-4) + assert_allclose( + mean_absolute_error(estimator.predict(X), y), + mean_absolute_error(est_ref.predict(X), y), + rtol=1e-4 + ) + + +def test_fit_dispersion(regression_data): + X, y = regression_data + + est1 = GeneralizedLinearRegressor(random_state=2) + est1.fit(X, y) + assert not hasattr(est1, "dispersion_") + + est2 = GeneralizedLinearRegressor(random_state=2, fit_dispersion="chisqr") + est2.fit(X, y) + assert isinstance(est2.dispersion_, float) + + est3 = GeneralizedLinearRegressor( + random_state=2, fit_dispersion="deviance") + est3.fit(X, y) + assert isinstance(est3.dispersion_, float) + + assert_allclose(est2.dispersion_, est3.dispersion_) From e3a5a9aa6855928d89bdc5dad7f3a46ba934fb8e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 12 Jun 2019 12:00:28 -0500 Subject: [PATCH 060/209] More tests and addressing some review comments --- sklearn/linear_model/_glm.py | 51 +++++++++++++------------- sklearn/linear_model/tests/test_glm.py | 43 +++++++++++++++++++++- 2 files changed, 67 insertions(+), 27 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 055aacf26d747..acce438b2b5e2 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -6,8 +6,6 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -# TODO: Should the option `normalize` be included (like other linear models)? -# So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support, e.g. GCV? # TODO: Should GeneralizedLinearRegressor inherit from LinearModel? # So far, it does not. @@ -287,7 +285,6 @@ def inverse_derivative(self, lin_pred): return ep * (1. - ep) def inverse_derivative2(self, lin_pred): - ep = special.expit(lin_pred) ep = special.expit(lin_pred) return ep * (1. - ep) * (1. 
- 2 * ep) @@ -738,7 +735,19 @@ class TweedieDistribution(ExponentialDispersionModel): For ``0 0) - 1, pgtol=self.tol, maxiter=self.max_iter, factr=1e3) - if self.verbose > 0: - if info["warnflag"] == 1: - warnings.warn("lbfgs failed to converge." - " Increase the number of iterations.", - ConvergenceWarning) - elif info["warnflag"] == 2: - warnings.warn("lbfgs failed for the reason: {0}" - .format(info["task"])) + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." + " Increase the number of iterations.", + ConvergenceWarning) + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}" + .format(info["task"])) self.n_iter_ = info['nit'] # 4.3 Newton-CG ####################################################### diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 230bbdabae201..7229f21840829 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -22,6 +22,7 @@ ) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge from sklearn.metrics import mean_absolute_error +from sklearn.exceptions import ConvergenceWarning from sklearn.utils.testing import assert_array_equal @@ -72,6 +73,23 @@ def test_family_bounds(family, expected): assert_array_equal(result, expected) +def test_tweedie_distribution_power(): + with pytest.raises(ValueError, match="no distribution exists"): + TweedieDistribution(power=0.5) + + with pytest.raises(TypeError, match="must be a real number"): + TweedieDistribution(power=1j) + + with pytest.raises(TypeError, match="must be a real number"): + dist = TweedieDistribution() + dist.power = 1j + + dist = TweedieDistribution() + assert dist._include_lower_bound is False + dist.power = 1 + assert dist._include_lower_bound is True + + @pytest.mark.parametrize( 'family, chk_values', [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), @@ -97,7 +115,8 @@ def test_deviance_zero(family, chk_values): (GammaDistribution(), LogLink()), (InverseGaussianDistribution(), LogLink()), (TweedieDistribution(power=1.5), LogLink()), - (TweedieDistribution(power=4.5), LogLink())]) + (TweedieDistribution(power=4.5), LogLink())], + ids=lambda args: args.__class__.__name__) def test_fisher_matrix(family, link): """Test the Fisher matrix numerically. 
Trick: Use numerical differentiation with y = mu""" @@ -110,6 +129,11 @@ def test_fisher_matrix(family, link): weights = rng.randn(10)**2 + 1 fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, weights=weights, link=link) + # check that the Fisher matrix is square and positive definite + assert fisher.ndim == 2 + assert fisher.shape[0] == fisher.shape[1] + assert np.all(np.linalg.eigvals(fisher) >= 0) + approx = np.array([]).reshape(0, coef.shape[0]) for i in range(coef.shape[0]): def f(coef): @@ -119,6 +143,13 @@ def f(coef): [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) assert_allclose(fisher, approx, rtol=1e-3) + # check the observed information matrix + oim = family._observed_information(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + assert oim.ndim == 2 + assert oim.shape == fisher.shape + assert_allclose(oim, fisher) + def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" @@ -713,3 +744,13 @@ def test_fit_dispersion(regression_data): assert isinstance(est3.dispersion_, float) assert_allclose(est2.dispersion_, est3.dispersion_) + + +@pytest.mark.parametrize("solver", ["irls", "lbfgs", "newton-cg", "cd"]) +def test_convergence_warning(solver, regression_data): + X, y = regression_data + + est = GeneralizedLinearRegressor(solver=solver, random_state=2, + max_iter=1, tol=1e-20) + with pytest.warns(ConvergenceWarning): + est.fit(X, y) From 54b80b8db780fbd3b438c6d5dc0001cdafe7a47e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 13 Jun 2019 09:50:53 -0500 Subject: [PATCH 061/209] TST More specific checks of error messages in tests --- sklearn/linear_model/_glm.py | 6 ++-- sklearn/linear_model/tests/test_glm.py | 44 ++++++++++++++------------ 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index acce438b2b5e2..c4f8cf7a975d3 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -1941,8 +1941,10 @@ def fit(self, X, y, sample_weight=None): k = np.min([10, n_features // 10 + 1]) sigma = 0 # start searching near this value which = 'SA' # find smallest algebraic eigenvalues first - if not np.all(splinalg.eigsh(P2, k=k, sigma=sigma, - which=which) >= epsneg): + eigenvalues = splinalg.eigsh(P2, k=k, sigma=sigma, + which=which, + return_eigenvectors=False) + if not np.all(eigenvalues >= epsneg): raise ValueError("P2 must be positive semi-definite.") else: if not np.all(linalg.eigvalsh(P2) >= epsneg): diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 7229f21840829..de1a5262b36ce 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -158,7 +158,7 @@ def test_sample_weights_validation(): y = [1] weights = 0 glm = GeneralizedLinearRegressor(fit_intercept=False) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="weights must be non-negative"): glm.fit(X, y, weights) # Positive weights are accepted @@ -166,24 +166,26 @@ def test_sample_weights_validation(): # 2d array weights = [[0]] - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="must be 1D array or scalar"): glm.fit(X, y, weights) # 1d but wrong length weights = [1, 0] - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="weights must have the same length as y"): glm.fit(X, y, weights) # 1d but only zeros (sum not greater than 0) weights = [0, 0] X = [[0], [1]] y = [1, 2] - with 
pytest.raises(ValueError): + with pytest.raises(ValueError, + match="must have at least one positive element"): glm.fit(X, y, weights) # 5. 1d but with a negative value weights = [2, -1] - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="weights must be non-negative"): glm.fit(X, y, weights) @@ -202,7 +204,7 @@ def test_glm_family_argument(f, fam): glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="family must be"): glm.fit(X, y) @@ -218,7 +220,7 @@ def test_glm_link_argument(l, link): assert isinstance(glm._link_instance, link.__class__) glm = GeneralizedLinearRegressor(family='normal', link='not a link') - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="link must be"): glm.fit(X, y) @@ -228,7 +230,8 @@ def test_glm_alpha_argument(alpha): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family='normal', alpha=alpha) - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="Penalty term must be a non-negative"): glm.fit(X, y) @@ -238,7 +241,8 @@ def test_glm_l1_ratio_argument(l1_ratio): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="l1_ratio must be a number in interval.*0, 1"): glm.fit(X, y) @@ -276,13 +280,13 @@ def test_glm_P2_positive_semidefinite(): P2 = Q.T @ P2 @ Q glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, check_input=True) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="P2 must be positive semi-definite"): glm.fit(X, y) P2 = sparse.csr_matrix(P2) glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, check_input=True) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="P2 must be positive semi-definite"): glm.fit(X, y) @@ -292,7 +296,7 @@ def test_glm_fit_intercept_argument(fit_intercept): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="fit_intercept must be bool"): glm.fit(X, y) @@ -314,7 +318,7 @@ def test_glm_max_iter_argument(max_iter): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(max_iter=max_iter) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="must be a positive integer"): glm.fit(X, y) @@ -324,7 +328,7 @@ def test_glm_tol_argument(tol): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(tol=tol) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="stopping criteria must be positive"): glm.fit(X, y) @@ -334,7 +338,7 @@ def test_glm_warm_start_argument(warm_start): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(warm_start=warm_start) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="warm_start must be bool"): glm.fit(X, y) @@ -356,7 +360,7 @@ def test_glm_selection_argument(selection): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(selection=selection) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="argument selection must be"): glm.fit(X, y) @@ -366,7 +370,7 @@ def test_glm_random_state_argument(random_state): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(random_state=random_state) - with 
pytest.raises(ValueError): + with pytest.raises(ValueError, match="cannot be used to seed"): glm.fit(X, y) @@ -376,7 +380,7 @@ def test_glm_diag_fisher_argument(diag_fisher): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="diag_fisher must be bool"): glm.fit(X, y) @@ -386,7 +390,7 @@ def test_glm_copy_X_argument(copy_X): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(copy_X=copy_X) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="copy_X must be bool"): glm.fit(X, y) @@ -396,7 +400,7 @@ def test_glm_check_input_argument(check_input): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(check_input=check_input) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="check_input must be bool"): glm.fit(X, y) From 7db0320f460676d8bed8e01bf64657a89532e2cb Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 14 Jun 2019 10:44:16 -0500 Subject: [PATCH 062/209] Add PoissonRegressor alias --- sklearn/linear_model/__init__.py | 5 +- sklearn/linear_model/_glm.py | 163 +++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index e5ede64413eb5..121418f901a1a 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -19,7 +19,7 @@ MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) from ._glm import (TweedieDistribution, - GeneralizedLinearRegressor) + GeneralizedLinearRegressor, PoissonRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -82,4 +82,5 @@ 'ridge_regression', 'RANSACRegressor', 'GeneralizedLinearRegressor', - 'TweedieDistribution'] + 'TweedieDistribution', + 'PoissonRegressor'] diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index c4f8cf7a975d3..b18731e73f328 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -2326,3 +2326,166 @@ def score(self, X, y, sample_weight=None): y_mean = np.average(y, weights=weights) dev_null = self._family_instance.deviance(y, y_mean, weights=weights) return 1. - dev / dev_null + + def _more_tags(self): + return {"requires_positive_y": True} + + +class PoissonRegressor(GeneralizedLinearRegressor): + """Regression with the response variable y following a Poisson distribution + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as mu=h(X*w). + The fit minimizes the following objective function with L2 regularization:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 + + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms and thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. 
bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) + Method for estimation of the dispersion parameter phi. Whether to use + the chi squared statistic or the deviance statistic. If None, the + dispersion is not estimated. + + solver : {'irls', 'lbfgs', 'newton-cg'}, optional (default='irls') + Algorithm to use in the optimization problem: + + 'irls' + Iterated reweighted least squares. It is the standard algorithm + for GLMs. + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + 'newton-cg' + Newton conjugate gradient algorithm. + + Note that all solvers except lbfgs use the fisher matrix, i.e. the + expected Hessian instead of the Hessian matrix. + + max_iter : int, optional (default=100) + The maximal number of iterations for solver algorithms. + + tol : float, optional (default=1e-4) + Stopping criterion. For the irls, newton-cg and lbfgs solvers, + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative) of + the objective function. + + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` (supersedes option + ``start_params``). If set to ``True`` or if the attribute ``coef_`` + does not exit (first call to ``fit``), option ``start_params`` sets the + start values for ``coef_`` and ``intercept_``. + + start_params : {'guess', 'zero', array of shape (n_features*, )}, \ + optional (default='guess') + Relevant only if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). + + 'guess' + Start values of mu are calculated by family.starting_mu(..). Then, + one Newton step obtains start values for ``coef_``. If + ``solver='irls'``, it uses one irls step. This gives usually good + starting values. + + 'zero' + All coefficients are set to zero. If ``fit_intercept=True``, the + start value for the intercept is obtained by the weighted average of y. + + array + The array of size n_features* is directly used as start values + for ``coef_``. If ``fit_intercept=True``, the first element + is assumed to be the start value for the ``intercept_``. + Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes + the intercept in counting. + + random_state : {int, RandomState instance, None}, optional (default=None) + If int, random_state is the seed used by the random + number generator; if RandomState instance, random_state is the random + number generator; if None, the random number generator is the + RandomState instance used by `np.random`. Used when ``selection`` == + 'random'. + + copy_X : boolean, optional, (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array, shape (n_features,) + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. + + n_iter_ : int + Actual number of iterations used in solver. 
+ + Notes + ----- + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + :ref:`User Guide `. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. + + For alpha > 0, the feature matrix X should be standardized in order to + penalize features equally strong. + + If the target y is a ratio, appropriate sample weights s should be + provided. + As an example, consider Poisson distributed counts z (integers) and + weights s=exposure (time, money, persons years, ...). Then you fit + y = z/s, i.e. ``PoissonRegressor().fit(X, y, sample_weight=s)``. + The weights are necessary for the right (finite sample) mean. + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + in this case one might say that y has a 'scaled' Poisson distributions. + + References + ---------- + For the coordinate descent implementation: + * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + def __init__(self, alpha=1.0, fit_intercept=True, fit_dispersion=None, + solver='irls', max_iter=100, + tol=1e-4, warm_start=False, start_params='guess', + random_state=None, copy_X=True, check_input=True, verbose=0): + + super().__init__(alpha=alpha, fit_intercept=fit_intercept, + family="poisson", link='log', + fit_dispersion=fit_dispersion, solver=solver, + max_iter=max_iter, tol=tol, warm_start=warm_start, + start_params=start_params, random_state=random_state, + copy_X=copy_X, verbose=verbose) From dcfe9edaf83509d2b6d98ee840c28f674ea4f496 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 27 Jun 2019 08:41:04 -0500 Subject: [PATCH 063/209] TST Simplify comparison with ridge --- sklearn/linear_model/tests/test_glm.py | 94 ++++++-------------------- 1 file changed, 22 insertions(+), 72 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index de1a5262b36ce..1416bdcfad680 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -26,6 +26,8 @@ from sklearn.utils.testing import assert_array_equal +GLM_SOLVERS = ['irls', 'lbfgs', 'newton-cg', 'cd'] + @pytest.fixture(scope="module") def regression_data(): @@ -404,7 +406,7 @@ def test_glm_check_input_argument(check_input): glm.fit(X, y) -@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +@pytest.mark.parametrize('solver', GLM_SOLVERS) def test_glm_identity_regression(solver): """Test GLM regression with identity link on a simple dataset.""" coef = [1., 2.] 
@@ -442,97 +444,45 @@ def test_glm_log_regression(family, solver, tol): # newton-cg may issue a LineSearchWarning, which we filter out @pytest.mark.filterwarnings('ignore:The line search algorithm') @pytest.mark.filterwarnings('ignore:Line Search failed') -@pytest.mark.parametrize('solver, tol', [('irls', 1e-6), - ('lbfgs', 1e-6), - ('newton-cg', 1e-6), - ('cd', 1e-6)]) -def test_normal_ridge(solver, tol): +@pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) +@pytest.mark.parametrize('fit_intercept', [True, False]) +@pytest.mark.parametrize('solver', GLM_SOLVERS) +def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): """Test ridge regression for Normal distributions. + Case n_samples >> n_features + Compare to test_ridge in test_ridge.py. """ - rng = np.random.RandomState(42) alpha = 1.0 - - # 1. With more samples than features - n_samples, n_features, n_predict = 100, 7, 10 + n_predict = 10 X, y, coef = make_regression(n_samples=n_samples+n_predict, n_features=n_features, n_informative=n_features-2, noise=0.5, - coef=True, random_state=rng) + coef=True, random_state=42) y = y[0:n_samples] X, T = X[0:n_samples], X[n_samples:] - # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, - solver='svd', normalize=False) - ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', - link='identity', fit_intercept=True, - tol=tol, max_iter=100, solver=solver, - check_input=False, random_state=rng) - glm.fit(X, y) - assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_, rtol=1e-6) - assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) - assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) - - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, - solver='svd', normalize=False) - ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', - link='identity', fit_intercept=False, - tol=tol, max_iter=100, solver=solver, - check_input=False, random_state=rng, - fit_dispersion='chisqr') - glm.fit(X, y) - assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_, rtol=1e-5) - assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) - assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) - mu = glm.predict(X) - assert_allclose(glm.dispersion_, - np.sum((y-mu)**2/(n_samples-n_features))) - - # 2. 
With more features than samples and sparse - n_samples, n_features, n_predict = 10, 100, 10 - X, y, coef = make_regression(n_samples=n_samples+n_predict, - n_features=n_features, - n_informative=n_features-2, noise=0.5, - coef=True, random_state=rng) - y = y[0:n_samples] - X, T = X[0:n_samples], X[n_samples:] + if n_samples > n_features: + ridge_params = {"solver": "svd"} + else: + ridge_params = {"solver": "sag", "max_iter": 10000, "tol": 1e-9} # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, - solver='sag', normalize=False, max_iter=100000, - random_state=42) + ridge = Ridge(alpha=alpha*n_samples, normalize=False, + random_state=42, **ridge_params) ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, - tol=tol, max_iter=300, solver=solver, - check_input=False, random_state=rng) + max_iter=300, solver=solver, tol=1e-6, + check_input=False, random_state=42) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, rtol=5e-6) assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, - solver='sag', normalize=False, max_iter=1000, - random_state=42) - ridge.fit(X, y) - - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', - link='identity', fit_intercept=False, - tol=tol*2, max_iter=300, solver=solver, - check_input=False, random_state=rng) - glm.fit(X, y) - assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_, rtol=1e-4) - assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) - assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) - @pytest.mark.parametrize('solver, tol', [('irls', 1e-7), @@ -559,7 +509,7 @@ def test_poisson_ridge(solver, tol): rng = np.random.RandomState(42) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', - link='log', tol=tol, + link='log', tol=1e-7, solver=solver, max_iter=300, random_state=rng) glm.fit(X, y) @@ -750,7 +700,7 @@ def test_fit_dispersion(regression_data): assert_allclose(est2.dispersion_, est3.dispersion_) -@pytest.mark.parametrize("solver", ["irls", "lbfgs", "newton-cg", "cd"]) +@pytest.mark.parametrize("solver", GLM_SOLVERS) def test_convergence_warning(solver, regression_data): X, y = regression_data From 4879bb6e057a95c38ee4950d13ef2dbd98da0a19 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 28 Jun 2019 09:54:59 -0500 Subject: [PATCH 064/209] EXA Add plot_tweedie_regression_insurance_claims.py --- ...lot_tweedie_regression_insurance_claims.py | 500 ++++++++++++++++++ 1 file changed, 500 insertions(+) create mode 100644 examples/linear_model/plot_tweedie_regression_insurance_claims.py diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py new file mode 100644 index 0000000000000..cb2ff667e8379 --- /dev/null +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -0,0 +1,500 @@ +""" +====================================== +Tweedie regression on insurance claims +====================================== + +This example illustrate the use Poisson, Gamma and Tweedie regression +on the French Motor Third-Party Liability Claims dataset, and is inspired +by an R tutorial [1]. 
+ +Insurance claims data consist of the number of claims and the total claim +amount. Often, the final goal is to predict the expected value, i.e. the mean, +of the total claim amount. There are several possibilities to do that, two of +which are: + +1. Model the number of claims with a Poisson distribution, the average + claim amount as a Gamma distribution and multiply the predictions, to get + the total claim amount. +2. Model total claim amount directly, typically with a Tweedie distribution. + +In this example we will illustrate both approaches. We start by defining a few +helper functions for loading the data and visualizing results. + + +.. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor + Third-Party Liability Claims (November 8, 2018). + `doi:10.2139/ssrn.3164764 `_ + +""" +print(__doc__) + +# Authors: Christian Lorentzen +# Roman Yurchak +# License: BSD 3 clause +from functools import partial + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +from sklearn.compose import ColumnTransformer +from sklearn.linear_model import GeneralizedLinearRegressor +from sklearn.linear_model._glm import TweedieDistribution +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer + +from sklearn.metrics import mean_absolute_error + + +def load_mtpl2(n_samples=100000): + """Fetcher for French Motor Third-Party Liability Claims dataset + + Parameters + ---------- + n_samples: int, default=100000 + number of samples to select (for faster run time). + """ + + # Note: this should use the OpenML DataFrame fetcher in the future + df_freq = pd.read_csv( + "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.csv", + dtype={"IDpol": np.int}, + index_col=0, + ) + + df_sev = pd.read_csv( + "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", + index_col=0, + ) + + # sum ClaimAmount over identical IDs + df_sev = df_sev.groupby(level=0).sum() + + df = df_freq.join(df_sev, how="left") + df["ClaimAmount"].fillna(0, inplace=True) + + # unquote string fields + for column_name in df.columns[df.dtypes.values == np.object]: + df[column_name] = df[column_name].str.strip("'") + return df.iloc[:n_samples] + + +def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, + title=None, kind_weight=None, ax=None): + """Plot observed and predicted - aggregated per feature level. 
+ + Parameters + ---------- + df : DataFrame with at least one column named feature + observed : str + a column name of the observed target + predicted : frame + a dataframe, with the same index as df, with the predicted target + weight : str + column name with the values of weights/exposure + """ + # aggregate observed and predicted variables by feature level + df_ = df.loc[:, [feature, weight]].copy() + df_["observed"] = df[observed] * df[weight] + df_["predicted"] = y_predicted * df[weight] + df_ = ( + df_.groupby([feature])[weight, "observed", "predicted"] + .sum() + .assign(observed=lambda x: x["observed"] / x[weight]) + .assign(predicted=lambda x: x["predicted"] / x[weight]) + ) + + ax = df_.loc[:, ["observed", "predicted"]].plot(style=".", ax=ax) + y_max = df_.loc[:, ["observed", "predicted"]].values.max() * 0.8 + ax.fill_between( + df_.index, + 0, + y_max * df_[weight] / df_[weight].values.max(), + color="g", + alpha=0.1, + ) + ax.set( + ylabel=y_label if y_label is not None else None, + title=title if title is not None else "Train: Observed vs Predicted", + ) + + +############################################################################## +# +# 1. Loading datasets and pre-processing +# -------------------------------------- +# +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# containing the number of claims (``ClaimNb``) with the freMTPL2sev table +# containing the claim amount (``ClaimAmount``) for the same user ids. + +df = load_mtpl2(n_samples=100000) + +# Note: filter out claims with zero amount, as the severity model +# requires a strictly positive target values. +df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 + +# correct for unreasonable observations (that might be data error) +df["ClaimNb"].clip(upper=4, inplace=True) +df["Exposure"].clip(upper=1, inplace=True) + +column_trans = ColumnTransformer( + [ + ("Veh_Driv_Age", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ( + "Veh_Brand_Gas_Region", + OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"], + ), + ("BonusMalus", "passthrough", ["BonusMalus"]), + ( + "Density_log", + make_pipeline( + FunctionTransformer(np.log, validate=False), StandardScaler() + ), + ["Density"], + ), + ], + remainder="drop", +) +X = column_trans.fit_transform(df) + + +df["Frequency"] = df.ClaimNb / df.Exposure +df["AvgClaimAmount"] = df.ClaimAmount / np.fmax(df.ClaimNb, 1) + +print(df[df.ClaimAmount > 0].head()) + +############################################################################## +# +# 2. Frequency model -- Poisson distribution +# ------------------------------------------- +# +# The number of claims (``ClaimNb``) is a positive integer that can be modeled +# as a Poisson distribution. It is then assumed to be the number of discrete +# events occuring with a constant rate in a given time interval (``Exposure``). +# Here we model the frequency ``y = ClaimNb / Exposure``, +# which is still a (scaled) Poisson distribution. +# +# A very important property of the Poisson distribution is its mean-variance +# relation: The variance is proportional to the mean. 
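A quick numerical illustration of this mean-variance relation, and of why ``Exposure`` enters as a sample weight for the scaled Poisson frequency, can help; the following is an illustrative sketch only (not part of the patch), assuming nothing beyond numpy::

    import numpy as np

    rng = np.random.RandomState(0)
    exposure = rng.uniform(0.1, 1.0, size=100000)   # plays the role of df["Exposure"]
    rate = 0.1                                      # true claim frequency per unit of exposure
    claim_nb = rng.poisson(rate * exposure)         # counts ~ Poisson(rate * exposure)
    freq = claim_nb / exposure                      # the modeled target y = ClaimNb / Exposure

    # Poisson counts have Var[ClaimNb_i] = E[ClaimNb_i] = rate * exposure_i, hence the
    # frequency has Var[freq_i] = rate / exposure_i: rows with larger exposure are less
    # noisy, which is exactly what sample_weight=Exposure encodes in the fit below.
    print(claim_nb.mean(), claim_nb.var())          # both close to rate * mean(exposure)
    print(freq.mean())                              # close to the true rate 0.1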
+ +df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) + +glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0) +glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) + + +def mean_deviance(estimator, y, y_pred, weights): + if hasattr(estimator, "_family_instance"): + return estimator._family_instance.deviance(y, y_pred, weights) / len(y) + else: + return np.nan + + +def score_estimator( + estimator, X_train, X_test, df_train, df_test, target, weights +): + res = [] + + for subset_label, X, df in [ + ("train", X_train, df_train), + ("test", X_test, df_test), + ]: + y, _weights = df[target], df[weights] + + for score_label, metric in [ + ("D² explaned", None), + ("mean deviance", partial(mean_deviance, estimator)), + ("mean abs. error", mean_absolute_error), + ]: + if estimator.__class__.__name__ == "ClaimProdEstimator": + # ClaimProdEstimator is the product of the frequency and + # severity models, together with a denormalized by the exposure + # values. It does not fully follow the scikit-learn API and we + # must handle it separately. + y_pred = estimator.predict(X, exposure=df.Exposure.values) + else: + y_pred = estimator.predict(X) + if metric is None: + if not hasattr(estimator, "score"): + continue + score = estimator.score(X, y, _weights) + else: + score = metric(y, y_pred, _weights) + + res.append( + {"subset": subset_label, "metric": score_label, "score": score} + ) + + res = ( + pd.DataFrame(res) + .set_index(["metric", "subset"]) + .score.unstack(-1) + .round(3) + ) + return res + + +scores = score_estimator( + glm_freq, + X_train, + X_test, + df_train, + df_test, + target="Frequency", + weights="Exposure", +) +print(scores) + +############################################################################## +# +# We can visually compare observed and predicted values, aggregated by +# the drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance +# bonus/penalty (``BonusMalus``), + +fig, ax = plt.subplots(2, 2, figsize=(16, 8)) +fig.subplots_adjust(hspace=0.3, wspace=0.2) + +plot_obs_pred( + df_train, + "DrivAge", + "Frequency", + glm_freq.predict(X_train), + weight="Exposure", + y_label="Claim Frequency", + title="train data", + ax=ax[0, 0], +) + +plot_obs_pred( + df_test, + "DrivAge", + "Frequency", + glm_freq.predict(X_test), + weight="Exposure", + y_label="Claim Frequency", + title="test data", + ax=ax[0, 1], +) + +plot_obs_pred( + df_test, + "VehAge", + "Frequency", + glm_freq.predict(X_test), + weight="Exposure", + y_label="Claim Frequency", + title="test data", + ax=ax[1, 0], +) + +plot_obs_pred( + df_test, + "BonusMalus", + "Frequency", + glm_freq.predict(X_test), + weight="Exposure", + y_label="Claim Frequency", + title="test data", + ax=ax[1, 1], +) + + +############################################################################## +# +# 3. Severity model - Gamma Distribution +# --------------------------------------- +# The mean claim amount or severity (`AvgClaimAmount`) can be empirically +# shown to follow a Gamma distribution. We fit a GLM model for the severity +# with the same features as the frequency model. +# +# Note: +# - We filter out ``ClaimAmount == 0``` as the Gamma distribution as support +# on :math:`(0, \infty)` not :math:`[0, \infty)`. +# - We use ``ClaimNb`` as sample weights. 
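The effect of using ``ClaimNb`` as sample weights can be checked directly: weighting ``AvgClaimAmount`` by the same (clipped) claim counts used in its definition reduces the weighted mean to the total claim amount divided by the total weight, i.e. a per-claim rather than per-policy average. A small check, assuming the ``df`` prepared above (illustrative only)::

    import numpy as np

    mask = df["ClaimAmount"] > 0
    w = np.fmax(df.loc[mask, "ClaimNb"], 1)   # same weights used to define AvgClaimAmount
    print(np.average(df.loc[mask, "AvgClaimAmount"], weights=w))
    print(df.loc[mask, "ClaimAmount"].sum() / w.sum())   # identical by construction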
+ +mask_train = df_train["ClaimAmount"] > 0 +mask_test = df_test["ClaimAmount"] > 0 + +glm_sev = GeneralizedLinearRegressor(family="gamma", alpha=1) + +glm_sev.fit( + X_train[mask_train.values], + df_train.loc[mask_train, "AvgClaimAmount"], + sample_weight=df_train.loc[mask_train, "ClaimNb"], +) + + +scores = score_estimator( + glm_sev, + X_train[mask_train.values], + X_test[mask_test.values], + df_train[mask_train], + df_test[mask_test], + target="AvgClaimAmount", + weights="ClaimNb", +) +print(scores) + +############################################################################## +# +# Note that the resulting model is conditional on having at least one claim, +# and cannot be used to predict the average claim amount in general, + +print( + "Mean AvgClaim Amount: %.2f " + % df_train.AvgClaimAmount.mean() +) +print( + "Mean AvgClaim Amount | NbClaim > 0: %.2f" + % df_train.AvgClaimAmount[df_train.AvgClaimAmount > 0].mean() +) +print( + "Predicted Mean AvgClaim Amount: %.2f" + % glm_sev.predict(X_train).mean() +) + + +############################################################################## +# +# We can visually compare observed and predicted values, aggregated for +# the drivers age (``Driv Age``), + +fig, ax = plt.subplots(1, 2, figsize=(16, 4)) + +# plot DivAge +plot_obs_pred( + df_train.loc[mask_train], + "DrivAge", + "AvgClaimAmount", + glm_sev.predict(X_train[mask_train.values]), + weight="Exposure", + y_label="Average Claim Severity", + title="train data", + ax=ax[0], +) + +plot_obs_pred( + df_test.loc[mask_test], + "DrivAge", + "AvgClaimAmount", + glm_sev.predict(X_test[mask_test.values]), + weight="Exposure", + y_label="Average Claim Severity", + title="test data", + ax=ax[1], +) + + +############################################################################## +# +# 3. Total Claims Amount -- Compound Poisson distribution +# ------------------------------------------------------- +# +# As mentionned in the introduction, the total claim amount can be modeled +# either as the product of the frequency model by the severity model. + + +class ClaimProdEstimator: + """Total claim amount estimator + + Computed as the product of the frequency model by the serverity model, + denormalized by exposure. + """ + + def __init__(self, est_freq, est_sev): + self.est_freq = est_freq + self.est_sev = est_sev + + def predict(self, X, exposure): + """Predict the total claim amount + + The predict method is not compatible with the scikit-learn API. + """ + return exposure * self.est_freq.predict(X) * self.est_sev.predict(X) + + +est_prod = ClaimProdEstimator(glm_freq, glm_sev) + +scores = score_estimator( + est_prod, + X_train, + X_test, + df_train, + df_test, + target="ClaimAmount", + weights="Exposure", +) +print(scores) + + +############################################################################## +# +# or as a unique Compound Poisson model, also corresponding to a Tweedie model +# with a power :math:`p \in (1, 2)`. 
We determine the optimal hyperparameter +# ``p`` with a grid search, + +from sklearn.model_selection import GridSearchCV + +# this takes a while +params = { + "family": [ + TweedieDistribution(power=power) for power in np.linspace(1, 2, 8) + ] +} + +glm_total = GridSearchCV( + GeneralizedLinearRegressor(), cv=3, param_grid=params, n_jobs=-1 +) +glm_total.fit( + X_train, df_train["ClaimAmount"], sample_weight=df_train["Exposure"] +) + + +print( + "Best hyperparameters: power=%.2f\n" + % glm_total.best_estimator_.family.power +) + +scores = score_estimator( + glm_total.best_estimator_, + X_train, + X_test, + df_train, + df_test, + target="ClaimAmount", + weights="Exposure", +) +print(scores) + +############################################################################## +# +# In this example, the mean absolute error is lower for the Compound Poisson +# model than when using separate models for frequency and severity. +# +# We can additionally validate these models by comparing observed and predicted +# total claim amount over the test and train subsets. + +res = [] +for subset_label, X, df in [ + ("train", X_train, df_train), + ("test", X_test, df_test), +]: + res.append( + { + "subset": subset_label, + "observed": df.ClaimAmount.values.sum(), + "predicted, frequency*severity model": np.sum( + est_prod.predict(X, exposure=df.Exposure.values) + ), + "predicted, tweedie, p=%.2f" + % glm_total.best_estimator_.family.power: np.sum( + glm_total.best_estimator_.predict(X) + ), + } + ) + +print(pd.DataFrame(res).set_index("subset").T) From 56069e5b3f5b453d9e8a487c9d27e20900ce4d63 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 28 Jun 2019 10:11:37 -0500 Subject: [PATCH 065/209] EXA Fix issues with older pandas versions in example --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index cb2ff667e8379..063d12e6e291b 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -137,8 +137,8 @@ def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 # correct for unreasonable observations (that might be data error) -df["ClaimNb"].clip(upper=4, inplace=True) -df["Exposure"].clip(upper=1, inplace=True) +df["ClaimNb"] = df["ClaimNb"].clip(upper=4) +df["Exposure"] = df["Exposure"].clip(upper=1) column_trans = ColumnTransformer( [ From 53f3c5f6670bbdefcd5ddf6d63e405401909cf06 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 09:14:30 -0500 Subject: [PATCH 066/209] DOC Add second poisson regression example --- ...plot_poisson_regression_non_normal_loss.py | 257 ++++++++++++++++++ 1 file changed, 257 insertions(+) create mode 100644 examples/linear_model/plot_poisson_regression_non_normal_loss.py diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py new file mode 100644 index 0000000000000..b06adcb787560 --- /dev/null +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -0,0 +1,257 @@ +""" +====================================== +Poisson regression and non normal loss +====================================== + +This example illustrate the use linear Poisson regression +on the French 
Motor Third-Party Liability Claims dataset [1] and compare +it with learning models with least squared error. + + +We start by defining a few helper functions for loading the data and +visualizing results. + + +.. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor + Third-Party Liability Claims (November 8, 2018). + `doi:10.2139/ssrn.3164764 `_ + +""" +print(__doc__) + +# Authors: Christian Lorentzen +# Roman Yurchak +# License: BSD 3 clause +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd +from scipy.special import xlogy + +from sklearn.compose import ColumnTransformer +from sklearn.linear_model import GeneralizedLinearRegressor, LinearRegression +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer +from sklearn.ensemble import GradientBoostingRegressor + +from sklearn.metrics import mean_squared_error, mean_absolute_error + + +def load_mtpl2(n_samples=100000): + """Fetcher for French Motor Third-Party Liability Claims dataset + + Parameters + ---------- + n_samples: int, default=100000 + number of samples to select (for faster run time). + """ + + # Note: this should use the OpenML DataFrame fetcher in the future + df_freq = pd.read_csv( + "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.csv", + dtype={"IDpol": np.int}, + index_col=0, + ) + + df_sev = pd.read_csv( + "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", + index_col=0, + ) + + # sum ClaimAmount over identical IDs + df_sev = df_sev.groupby(level=0).sum() + + df = df_freq.join(df_sev, how="left") + df["ClaimAmount"].fillna(0, inplace=True) + + # unquote string fields + for column_name in df.columns[df.dtypes.values == np.object]: + df[column_name] = df[column_name].str.strip("'") + return df.iloc[:n_samples] + + +############################################################################## +# +# 1. Loading datasets and pre-processing +# -------------------------------------- +# +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# containing the number of claims (``ClaimNb``) with the freMTPL2sev table +# containing the claim amount (``ClaimAmount``) for the same user ids. + +df = load_mtpl2(n_samples=100000) + +# Note: filter out claims with zero amount, as the severity model +# requires a strictly positive target values. +df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 + +# correct for unreasonable observations (that might be data error) +df["ClaimNb"] = df["ClaimNb"].clip(upper=4) +df["Exposure"] = df["Exposure"].clip(upper=1) + +column_trans = ColumnTransformer( + [ + ("Veh_Driv_Age", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ( + "Veh_Brand_Gas_Region", + OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"], + ), + ("BonusMalus", "passthrough", ["BonusMalus"]), + ( + "Density_log", + make_pipeline( + FunctionTransformer(np.log, validate=False), StandardScaler() + ), + ["Density"], + ), + ], + remainder="drop", +) +X = column_trans.fit_transform(df) + +############################################################################## +# +# The number of claims (``ClaimNb``) is a positive integer that can be modeled +# as a Poisson distribution. It is then assumed to be the number of discrete +# events occurring with a constant rate in a given time interval +# (``Exposure``). 
Here we model the frequency ``y = ClaimNb / Exposure``, +# which is still a (scaled) Poisson distribution. +# +# A very important property of the Poisson distribution is its mean-variance +# relation: The variance is proportional to the mean. + +df["Frequency"] = df.ClaimNb / df.Exposure + +print( + pd.cut(df.Frequency, [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts() +) + +############################################################################## +# +# It worth noting that 96 % of users have 0 claims, and if we were to convert +# this problem into a binary classification task, it would be significantly +# imbalanced. +# +# To evaluate the pertinence of the used metrics, we will consider as a +# baseline an estimator that returns 0 for any input. + +df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) + + +def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None): + y_true = np.atleast_1d(y_true) + y_pred = np.atleast_1d(y_pred) + dev = 2 * (xlogy(y_true, y_true/y_pred) - y_true + y_pred) + return np.average(dev, weights=sample_weights) + + +eps = 1e-5 +print("MSE: %.3f" % mean_squared_error( + df_test.Frequency.values, np.zeros(len(df_test)), + df_test.Exposure.values)) +print("MAE: %.3f" % mean_absolute_error( + df_test.Frequency.values, np.zeros(len(df_test)), + df_test.Exposure.values)) +print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( + df_test.Frequency.values, eps + np.zeros(len(df_test)), + df_test.Exposure.values)) + + +############################################################################## +# +# We start by modeling the target variable with the least squares linear +# regression model, + + +linregr = LinearRegression() +linregr.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) + +print("LinearRegression") +print("MSE: %.3f" % mean_squared_error( + df_test.Frequency.values, linregr.predict(X_test), + df_test.Exposure.values)) +print("MSE: %.3f" % mean_absolute_error( + df_test.Frequency.values, linregr.predict(X_test), + df_test.Exposure.values)) +print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( + df_test.Frequency.values, np.fmax(linregr.predict(X_test), eps), + df_test.Exposure.values)) + +############################################################################## +# +# The Poisson deviance cannot be computed because negative values are +# predicted by the model, + +print('Number Negatives: %s / total: %s' % ( + (linregr.predict(X_test) < 0).sum(), X_test.shape[0])) + +############################################################################## +# +# Next we fit the Poisson regressor on the target variable, + +glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0) +glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) + +print("PoissonRegressor") +print("MSE: %.3f" % mean_squared_error( + df_test.Frequency.values, glm_freq.predict(X_test), + df_test.Exposure.values)) +print("MAE: %.3f" % mean_absolute_error( + df_test.Frequency.values, glm_freq.predict(X_test), + df_test.Exposure.values)) +print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( + df_test.Frequency.values, glm_freq.predict(X_test), + df_test.Exposure.values)) + +############################################################################## +# +# Finally we will consider a non linear model with Gradient boosting that +# still minimizes the least square error. 
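Before turning to the gradient boosting fit, a quick sanity check of the ``mean_poisson_deviance_score`` helper defined above can be reassuring; this is an illustrative aside (not part of the example script) that exercises its formula ``2*(y*log(y/mu) - y + mu)`` on toy data::

    import numpy as np

    y_true = np.array([0., 1., 2.])
    # near-perfect predictions give (almost) zero deviance; xlogy handles the y=0 term
    print(mean_poisson_deviance_score(y_true, np.maximum(y_true, 1e-12)))
    # predicting the constant mean gives a strictly positive baseline value
    print(mean_poisson_deviance_score(y_true, np.full_like(y_true, y_true.mean())))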
+ + +gbr = GradientBoostingRegressor(max_depth=3) +gbr.fit(X_train, df_train.Frequency.values, + sample_weight=df_train.Exposure.values) + + +print("GradientBoostingRegressor") +print("MSE: %.3f" % mean_squared_error( + df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) +print("MAE: %.3f" % mean_absolute_error( + df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) +print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( + df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) + +############################################################################## +# +# In this example, although Gradient boosting minimizes the least square error, +# because of a higher predictive power it also results in a smaller Poisson +# deviance than the Poisson regression model. +# +# Evaluating models with a single train / test split is prone to numerical +# errors, we can verify that we would also get equivalent resuts with the +# cross-validation score. +# +# The difference between these models can also be visualized by comparing the +# histogram of observed target values with that of predicted values, + + +fig, ax = plt.subplots(1, 4, figsize=(16, 3)) + +df_train.Frequency.hist(bins=np.linspace(-1, 10, 50), ax=ax[0]) + +ax[0].set_title('Experimental data') + +for idx, model in enumerate([linregr, glm_freq, gbr]): + y_pred = model.predict(X_train) + + pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=ax[idx+1]) + ax[idx+1].set_title(model.__class__.__name__) + +for axi in ax: + axi.set( + yscale='log', + xlabel="y (Frequency)" + ) From be5a3c485684ae45835258a71a1870a59549fbda Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 09:55:57 -0500 Subject: [PATCH 067/209] Add GeneralizedHyperbolicSecant and BinomialDistributions --- sklearn/linear_model/_glm.py | 109 ++++--------------------- sklearn/linear_model/tests/test_glm.py | 34 +------- 2 files changed, 18 insertions(+), 125 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index b18731e73f328..736e50960dcda 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -6,36 +6,6 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -# TODO: Add cross validation support, e.g. GCV? -# TODO: Should GeneralizedLinearRegressor inherit from LinearModel? -# So far, it does not. -# TODO: Include further classes in class.rst? ExponentialDispersionModel? -# TweedieDistribution? -# TODO: Negative values in P1 are not allowed so far. They could be used -# for group lasso. - -# Design Decisions: -# - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. -# Estimators in sklearn are either regressors or classifiers. A GLM can do -# both depending on the distr (Normal => regressor, Binomial => classifier). -# Solution: GeneralizedLinearRegressor since this is the focus. -# - Allow for finer control of penalty terms: -# L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude -# factors from the L1 penalty. -# L2: w*P2*w with P2 a positive (semi-) definite matrix, e.g. P2 could be -# a 1st or 2nd order difference matrix (compare B-spline penalties and -# Tikhonov regularization). -# - The link function (instance of class Link) is necessary for the evaluation -# of deviance, score, Fisher and Hessian matrix as functions of the -# coefficients, which is needed by optimizers. 
-# Solution: link as argument in those functions -# - Which name/symbol for sample_weight in docu? -# sklearn.linear_models uses w for coefficients, standard literature on -# GLMs use beta for coefficients and w for (sample) weights. -# So far, coefficients=w and sample weights=s. -# - The intercept term is the first index, i.e. coef[0] - - from __future__ import division from abc import ABCMeta, abstractmethod import numbers @@ -51,6 +21,7 @@ from ..utils.validation import check_is_fitted, check_random_state + def _check_weights(sample_weight, n_samples): """Check that sample weights are non-negative and have the right shape.""" if sample_weight is None: @@ -854,47 +825,12 @@ def __init__(self): super(InverseGaussianDistribution, self).__init__(power=3) -class GeneralizedHyperbolicSecant(ExponentialDispersionModel): - """A class for the Generalized Hyperbolic Secant (GHS) distribution. - - The GHS distribution is for targets y in (-inf, inf). - """ - def __init__(self): - self._lower_bound = -np.Inf - self._upper_bound = np.Inf - self._include_lower_bound = False - self._include_upper_bound = False - - def unit_variance(self, mu): - return 1 + mu**2 - - def unit_variance_derivative(self, mu): - return 2 * mu - - def unit_deviance(self, y, mu): - return (2 * y * (np.arctan(y) - np.arctan(mu)) + - np.log((1 + mu**2)/(1 + y**2))) - - -class BinomialDistribution(ExponentialDispersionModel): - """A class for the Binomial distribution. - - The Binomial distribution is for targets y in [0, 1]. - """ - def __init__(self): - self._lower_bound = 0 - self._upper_bound = 1 - self._include_lower_bound = True - self._include_upper_bound = True - - def unit_variance(self, mu): - return mu * (1 - mu) - - def unit_variance_derivative(self, mu): - return 1 - 2 * mu - - def unit_deviance(self, y, mu): - return 2 * (special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) +EDM_DISTRIBUTIONS = { + 'normal': NormalDistribution, + 'poisson': PoissonDistribution, + 'gamma': GammaDistribution, + 'inverse.gaussian': InverseGaussianDistribution, +} def _irls_step(X, W, P2, z, fit_intercept=True): @@ -1690,28 +1626,19 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # 1. 
input validation # ####################################################################### - # 1.1 validate arguments of __init__ ################################## + # 1.1 validate arguments of __init__ # Guarantee that self._family_instance is an instance of class # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): self._family_instance = self.family + elif self.family in EDM_DISTRIBUTIONS: + self._family_instance = EDM_DISTRIBUTIONS[self.family]() else: - if self.family == 'normal': - self._family_instance = NormalDistribution() - elif self.family == 'poisson': - self._family_instance = PoissonDistribution() - elif self.family == 'gamma': - self._family_instance = GammaDistribution() - elif self.family == 'inverse.gaussian': - self._family_instance = InverseGaussianDistribution() - elif self.family == 'binomial': - self._family_instance = BinomialDistribution() - else: - raise ValueError( - "The family must be an instance of class" - " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " - "'binomial']; got (family={0})".format(self.family)) + raise ValueError( + "The family must be an instance of class" + " ExponentialDispersionModel or an element of" + " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " + "'binomial']; got (family={0})".format(self.family)) # Guarantee that self._link_instance is set to an instance of # class Link @@ -1724,11 +1651,6 @@ def fit(self, X, y, sample_weight=None): self._link_instance = IdentityLink() if self._family_instance.power >= 1: self._link_instance = LogLink() - elif isinstance(self._family_instance, - GeneralizedHyperbolicSecant): - self._link_instance = IdentityLink() - elif isinstance(self._family_instance, BinomialDistribution): - self._link_instance = LogitLink() else: raise ValueError("No default link known for the " "specified distribution family. Please " @@ -2048,7 +1970,6 @@ def fit(self, X, y, sample_weight=None): # 4. fit # ####################################################################### # algorithms for optimization - # TODO: Parallelize it? 
# 4.1 IRLS ############################################################ # Note: we already set P2 = l2*P2, see above diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 1416bdcfad680..f51f630ebae7e 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -18,7 +18,6 @@ TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, - GeneralizedHyperbolicSecant, BinomialDistribution, ) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge from sklearn.metrics import mean_absolute_error @@ -103,7 +102,7 @@ def test_tweedie_distribution_power(): (TweedieDistribution(power=1.5), [0.1, 1.5]), (TweedieDistribution(power=2.5), [0.1, 1.5]), (TweedieDistribution(power=-4), [0.1, 1.5]), - (GeneralizedHyperbolicSecant(), [0.1, 1.5])]) +]) def test_deviance_zero(family, chk_values): """Test deviance(y,y) = 0 for different families.""" for x in chk_values: @@ -196,7 +195,7 @@ def test_sample_weights_validation(): ('poisson', PoissonDistribution()), ('gamma', GammaDistribution()), ('inverse.gaussian', InverseGaussianDistribution()), - ('binomial', BinomialDistribution())]) +]) def test_glm_family_argument(f, fam): """Test GLM family argument set as string.""" y = np.array([0.1, 0.5]) # in range of all distributions @@ -424,7 +423,7 @@ def test_glm_identity_regression(solver): [NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), - GeneralizedHyperbolicSecant()]) +]) @pytest.mark.parametrize('solver, tol', [('irls', 1e-6), ('lbfgs', 1e-6), ('newton-cg', 1e-7), @@ -620,33 +619,6 @@ def obj(coef): assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) -@pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) -def test_binomial_enet(alpha): - """Test elastic net regression with binomial family and LogitLink. - - Compare to LogisticRegression. 
- """ - l1_ratio = 0.5 - n_samples = 500 - rng = np.random.RandomState(42) - X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, - n_informative=5, n_redundant=0, n_repeated=0, - random_state=rng) - log = LogisticRegression( - penalty='elasticnet', random_state=rng, fit_intercept=False, tol=1e-6, - max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), - solver='saga') - log.fit(X, y) - - glm = GeneralizedLinearRegressor( - family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, - alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', - tol=1e-7) - glm.fit(X, y) - assert_allclose(log.intercept_[0], glm.intercept_, rtol=1e-6) - assert_allclose(log.coef_[0, :], glm.coef_, rtol=5e-6) - - @pytest.mark.parametrize( "params", [ From e67fecb9bc1ee056ad7934803818fd46a0a1f8b3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 10:00:24 -0500 Subject: [PATCH 068/209] Remove start params option --- sklearn/linear_model/_glm.py | 157 +++-------------------------------- 1 file changed, 13 insertions(+), 144 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 736e50960dcda..18eba80080670 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -1457,33 +1457,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` - as initialization for ``coef_`` and ``intercept_`` (supersedes option - ``start_params``). If set to ``True`` or if the attribute ``coef_`` - does not exit (first call to ``fit``), option ``start_params`` sets the - start values for ``coef_`` and ``intercept_``. - - start_params : {'guess', 'zero', array of shape (n_features*, )}, \ - optional (default='guess') - Relevant only if ``warm_start=False`` or if fit is called - the first time (``self.coef_`` does not yet exist). - - 'guess' - Start values of mu are calculated by family.starting_mu(..). Then, - one Newton step obtains start values for ``coef_``. If - ``solver='irls'``, it uses one irls step, else the Newton step is - calculated by the cd solver. - This gives usually good starting values. - - 'zero' - All coefficients are set to zero. If ``fit_intercept=True``, the - start value for the intercept is obtained by the weighted average of y. - - array - The array of size n_features* is directly used as start values - for ``coef_``. If ``fit_intercept=True``, the first element - is assumed to be the start value for the ``intercept_``. - Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes - the intercept in counting. + as initialization for ``coef_`` and ``intercept_``. selection : str, optional (default='cyclic') For the solver 'cd' (coordinate descent), the coordinates (features) @@ -1503,7 +1477,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): 'random'. diag_fisher : boolean, optional, (default=False) - Only relevant for solver 'cd' (see also ``start_params='guess'``). + Only relevant for solver 'cd'. If ``False``, the full Fisher matrix (expected Hessian) is computed in each outer iteration (Newton iteration). 
If ``True``, only a diagonal matrix (stored as 1d array) is computed, such that @@ -1576,7 +1550,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, - tol=1e-4, warm_start=False, start_params='guess', + tol=1e-4, warm_start=False, selection='cyclic', random_state=None, diag_fisher=False, copy_X=True, check_input=True, verbose=0): self.alpha = alpha @@ -1591,7 +1565,6 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', self.max_iter = max_iter self.tol = tol self.warm_start = warm_start - self.start_params = start_params self.selection = selection self.random_state = random_state self.diag_fisher = diag_fisher @@ -1742,7 +1715,7 @@ def fit(self, X, y, sample_weight=None): n_samples, n_features = X.shape # 1.3 arguments to take special care ################################## - # P1, P2, start_params + # P1, P2 if isinstance(self.P1, str) and self.P1 == 'identity': P1 = np.ones(n_features) else: @@ -1793,25 +1766,6 @@ def fit(self, X, y, sample_weight=None): "got (P2.shape=({0}, {1})), needed ({2}, {2})" .format(P2.shape[0], P2.shape[1], X.shape[1])) - start_params = self.start_params - if isinstance(start_params, str): - if start_params not in ['guess', 'zero']: - raise ValueError("The argument start_params must be 'guess', " - "'zero' or an array of correct length; " - "got(start_params={0})".format(start_params)) - else: - start_params = check_array(start_params, accept_sparse=False, - force_all_finite=True, ensure_2d=False, - dtype=_dtype, copy=True) - if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or - (start_params.ndim != 1)): - raise ValueError("Start values for parameters must have the" - "right length and dimension; required (length" - "={0}, ndim=1); got (length={1}, ndim={2})." 
- .format(X.shape[1] + self.fit_intercept, - start_params.shape[0], - start_params.ndim)) - l1 = self.alpha * self.l1_ratio l2 = self.alpha * (1 - self.l1_ratio) # P1 and P2 are now for sure copies @@ -1899,72 +1853,12 @@ def fit(self, X, y, sample_weight=None): self.coef_)) else: coef = self.coef_ - elif isinstance(start_params, str): - if start_params == 'guess': - # Set mu=starting_mu of the family and do one Newton step - # If solver=cd use cd, else irls - mu = family.starting_mu(y, weights=weights) - eta = link.link(mu) # linear predictor - if solver in ['cd', 'lbfgs', 'newton-cg']: - # see function _cd_solver - sigma_inv = 1/family.variance(mu, phi=1, weights=weights) - d1 = link.inverse_derivative(eta) - temp = sigma_inv * d1 * (y - mu) - if self.fit_intercept: - score = np.concatenate(([temp.sum()], temp @ X)) - else: - score = temp @ X # same as X.T @ temp - - d2_sigma_inv = d1 * d1 * sigma_inv - diag_fisher = self.diag_fisher - if diag_fisher: - fisher = d2_sigma_inv - else: - fisher = \ - _safe_sandwich_dot(X, d2_sigma_inv, - intercept=self.fit_intercept) - # set up space for search direction d for inner loop - if self.fit_intercept: - coef = np.zeros(n_features+1) - else: - coef = np.zeros(n_features) - d = np.zeros_like(coef) - # initial stopping tolerance of inner loop - # use L1-norm of minimum of norm of subgradient of F - # use less restrictive tolerance for initial guess - inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, - P1=P1) - inner_tol = 4 * linalg.norm(inner_tol, ord=1) - # just one outer loop = Newton step - n_cycles = 0 - d, coef_P2, n_cycles, inner_tol = \ - _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, - inner_tol, max_inner_iter=1000, - selection=self.selection, - random_state=random_state, - diag_fisher=self.diag_fisher) - coef += d # for simplicity no line search here - else: - # See _irls_solver - # h'(eta) - hp = link.inverse_derivative(eta) - # working weights W, in principle a diagonal matrix - # therefore here just as 1d array - W = (hp**2 / family.variance(mu, phi=1, weights=weights)) - # working observations - z = eta + (y-mu)/hp - # solve A*coef = b - # A = X' W X + l2 P2, b = X' W z - coef = _irls_step(X, W, P2, z, - fit_intercept=self.fit_intercept) - else: # start_params == 'zero' - if self.fit_intercept: - coef = np.zeros(n_features+1) - coef[0] = link.link(np.average(y, weights=weights)) - else: - coef = np.zeros(n_features) - else: # assign given array as start values - coef = start_params + else: + if self.fit_intercept: + coef = np.zeros(n_features+1) + coef[0] = link.link(np.average(y, weights=weights)) + else: + coef = np.zeros(n_features) ####################################################################### # 4. fit # @@ -2312,32 +2206,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` - as initialization for ``coef_`` and ``intercept_`` (supersedes option - ``start_params``). If set to ``True`` or if the attribute ``coef_`` - does not exit (first call to ``fit``), option ``start_params`` sets the - start values for ``coef_`` and ``intercept_``. - - start_params : {'guess', 'zero', array of shape (n_features*, )}, \ - optional (default='guess') - Relevant only if ``warm_start=False`` or if fit is called - the first time (``self.coef_`` does not yet exist). - - 'guess' - Start values of mu are calculated by family.starting_mu(..). Then, - one Newton step obtains start values for ``coef_``. 
If - ``solver='irls'``, it uses one irls step. This gives usually good - starting values. - - 'zero' - All coefficients are set to zero. If ``fit_intercept=True``, the - start value for the intercept is obtained by the weighted average of y. - - array - The array of size n_features* is directly used as start values - for ``coef_``. If ``fit_intercept=True``, the first element - is assumed to be the start value for the ``intercept_``. - Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes - the intercept in counting. + as initialization for ``coef_`` and ``intercept_`` . random_state : {int, RandomState instance, None}, optional (default=None) If int, random_state is the seed used by the random @@ -2401,12 +2270,12 @@ class PoissonRegressor(GeneralizedLinearRegressor): """ def __init__(self, alpha=1.0, fit_intercept=True, fit_dispersion=None, solver='irls', max_iter=100, - tol=1e-4, warm_start=False, start_params='guess', + tol=1e-4, warm_start=False, random_state=None, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="poisson", link='log', fit_dispersion=fit_dispersion, solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, - start_params=start_params, random_state=random_state, + random_state=random_state, copy_X=copy_X, verbose=verbose) From 62f4448101c16a797d74119fe1df5b45b93136b0 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 10:31:42 -0500 Subject: [PATCH 069/209] Remove L1 penalty and CD solver --- sklearn/linear_model/_glm.py | 536 +------------------------ sklearn/linear_model/tests/test_glm.py | 185 +-------- 2 files changed, 36 insertions(+), 685 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 18eba80080670..15211c6038007 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -93,49 +93,6 @@ def _safe_sandwich_dot(X, d, intercept=False): return res -def _min_norm_sugrad(coef, grad, P2, P1): - """Compute the gradient of all subgradients with minimal L2-norm. - - subgrad = grad + P2 * coef + P1 * subgrad(|coef|_1) - - g_i = grad_i + (P2*coef)_i - - if coef_i > 0: g_i + P1_i - if coef_i < 0: g_i - P1_i - if coef_i = 0: sign(g_i) * max(|g_i|-P1_i, 0) - - Parameters - ---------- - coef : ndarray - coef[0] may be intercept. - - grad : ndarray, shape=coef.shape - - P2 : {1d or 2d array, None} - always without intercept, ``None`` means P2 = 0 - - P1 : ndarray - always without intercept - """ - intercept = (coef.size == P1.size + 1) - idx = 1 if intercept else 0 # offset if coef[0] is intercept - # compute grad + coef @ P2 without intercept - grad_wP2 = grad[idx:].copy() - if P2 is None: - pass - elif P2.ndim == 1: - grad_wP2 += coef[idx:] * P2 - else: - grad_wP2 += coef[idx:] @ P2 - res = np.where(coef[idx:] == 0, - np.sign(grad_wP2) * np.maximum(np.abs(grad_wP2) - P1, 0), - grad_wP2 + np.sign(coef[idx:]) * P1) - if intercept: - return np.concatenate(([grad[0]], res)) - else: - return res - - class Link(metaclass=ABCMeta): """Abstract base class for Link functions.""" @@ -915,7 +872,7 @@ def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) # D2 = link.inverse_derivative(eta)^2 = D^2 # W = D2/V(mu) - # l2 = alpha * (1 - l1_ratio) + # l2 = alpha # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w # = -X' D (y-mu)/V(mu) + l2 P2 w # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) 
X + l2 P2 @@ -981,345 +938,6 @@ def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, return coef, n_iter -def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, - max_inner_iter=1000, selection='cyclic', - random_state=None, diag_fisher=False): - """Compute inner loop of coordinate descent, i.e. cycles through features. - - Minimization of 1-d subproblems:: - - min_z q(d+z*e_j) - q(d) - = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 - - A = f'(w) + d*H(w) + (w+d)*P2 - B = H+P2 - Note: f'=-score and H=fisher are updated at the end of outer iteration. - """ - # TODO: use sparsity (coefficient already 0 due to L1 penalty) - # => active set of features for featurelist, see paper - # of Improved GLMNET or Gap Safe Screening Rules - # https://arxiv.org/abs/1611.05780 - n_samples, n_features = X.shape - intercept = (coef.size == X.shape[1] + 1) - idx = 1 if intercept else 0 # offset if coef[0] is intercept - B = fisher - if P2.ndim == 1: - coef_P2 = coef[idx:] * P2 - if not diag_fisher: - idiag = np.arange(start=idx, stop=B.shape[0]) - # B[np.diag_indices_from(B)] += P2 - B[(idiag, idiag)] += P2 - else: - coef_P2 = coef[idx:] @ P2 - if not diag_fisher: - if sparse.issparse(P2): - B[idx:, idx:] += P2.toarray() - else: - B[idx:, idx:] += P2 - A = -score - A[idx:] += coef_P2 - # A += d @ (H+P2) but so far d=0 - # inner loop - for inner_iter in range(1, max_inner_iter+1): - inner_iter += 1 - n_cycles += 1 - # cycle through features, update intercept separately at the end - if selection == 'random': - featurelist = random_state.permutation(n_features) - else: - featurelist = np.arange(n_features) - for j in featurelist: - # minimize_z: a z + 1/2 b z^2 + c |d+z| - # a = A_j - # b = B_jj > 0 - # c = |P1_j| = P1_j > 0, see 1.3 - # d = w_j + d_j - # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) - # with beta = z+d, beta_hat = d-a/b and gamma = c/b - # z = 1/b * S(bd-a,c) - d - # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding - jdx = j+idx # index for arrays containing entries for intercept - a = A[jdx] - if diag_fisher: - # Note: fisher is ndarray of shape (n_samples,) => no idx - # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway - Bj = np.zeros_like(A) - if intercept: - Bj[0] = fisher.sum() - if sparse.issparse(X): - Bj[idx:] = _safe_toarray(X[:, j].transpose() @ - X.multiply(fisher[:, np.newaxis]) - ).ravel() - else: - Bj[idx:] = (fisher * X[:, j]) @ X - - if P2.ndim == 1: - Bj[idx:] += P2[j] - else: - if sparse.issparse(P2): - # slice columns as P2 is csc - Bj[idx:] += P2[:, j].toarray().ravel() - else: - Bj[idx:] += P2[:, j] - b = Bj[jdx] - else: - b = B[jdx, jdx] - - # those ten lines are what it is all about - if b <= 0: - z = 0 - elif P1[j] == 0: - z = -a/b - elif a + P1[j] < b * (coef[jdx] + d[jdx]): - z = -(a + P1[j])/b - elif a - P1[j] > b * (coef[jdx] + d[jdx]): - z = -(a - P1[j])/b - else: - z = -(coef[jdx] + d[jdx]) - - # update direction d - d[jdx] += z - # update A because d_j is now d_j+z - # A = f'(w) + d*H(w) + (w+d)*P2 - # => A += (H+P2)*e_j z = B_j * z - # Note: B is symmetric B = B.transpose - if diag_fisher: - # Bj = B[:, j] calculated above, still valid - A += Bj * z - else: - # B is symmetric, C- or F-contiguous, but never sparse - if B.flags['F_CONTIGUOUS']: - # slice columns like for sparse csc - A += B[:, jdx] * z - else: # B.flags['C_CONTIGUOUS'] might be true - # slice rows - A += B[jdx, :] * z - # end of cycle over features - # update intercept - if intercept: - if diag_fisher: - Bj = np.zeros_like(A) - Bj[0] = fisher.sum() - Bj[1:] = fisher @ X - b = Bj[0] - else: - b = B[0, 0] - z = 0 if b <= 0 else -A[0]/b - d[0] += z - if diag_fisher: - A += Bj * z - else: - if B.flags['F_CONTIGUOUS']: - A += B[:, 0] * z - else: - A += B[0, :] * z - # end of complete cycle - # stopping criterion for inner loop - # sum_i(|minimum of norm of subgrad of q(d)_i|) - # subgrad q(d) = A + subgrad ||P1*(w+d)||_1 - mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1) - mn_subgrad = linalg.norm(mn_subgrad, ord=1) - if mn_subgrad <= inner_tol: - if inner_iter == 1: - inner_tol = inner_tol/4. - break - # end of inner loop - return d, coef_P2, n_cycles, inner_tol - - -def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, - max_iter=100, max_inner_iter=1000, tol=1e-4, - selection='cyclic ', random_state=None, - diag_fisher=False, copy_X=True): - """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. - - The objective being minimized in the coefficients w=coef is:: - - F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 - - An Improved GLMNET for L1-regularized Logistic Regression: - - 1. Find optimal descent direction d by minimizing - min_d F(w+d) = min_d F(w+d) - F(w) - 2. Quadratic approximation of F(w+d)-F(w) = q(d): - using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: - q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d - + ||P1*(w+d)||_1 - ||P1*w||_1 - Then minimize q(d): min_d q(d) - 3. Coordinate descent by updating coordinate j (d -> d+z*e_j): - min_z q(d+z*e_j) - = min_z q(d+z*e_j) - q(d) - = min_z A_j z + 1/2 B_jj z^2 - + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 - A = f'(w) + d*H(w) + (w+d)*P2 - B = H + P2 - - Repeat steps 1-3 until convergence. - Note: Use Fisher matrix instead of Hessian for H. 
- Note: f' = -score, H = Fisher matrix - - Parameters - ---------- - coef : ndarray, shape (c,) - If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shape[1] + 1. - - X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) - Training data (with intercept included if present). If not sparse, - pass directly as Fortran-contiguous data to avoid - unnecessary memory duplication. - - y : ndarray, shape (n_samples,) - Target values. - - weights: ndarray, shape (n_samples,) - Sample weights with which the deviance is weighted. The weights must - bee normalized and sum to 1. - - P1 : {ndarray}, shape (n_features,) - The L1-penalty vector (=diagonal matrix) - - P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features) - The L2-penalty matrix or vector (=diagonal matrix). If a matrix is - passed, it must be symmetric. If X is sparse, P2 must also be sparse. - - fit_intercept : boolean, optional (default=True) - Specifies if a constant (a.k.a. bias or intercept) should be - added to the linear predictor (X*coef+intercept). - - family : ExponentialDispersionModel - - link : Link - - max_iter : int, optional (default=100) - Maximum numer of outer (Newton) iterations. - - max_inner_iter : int, optional (default=1000) - Maximum number of iterations in each inner loop, i.e. max number of - cycles over all features per inner loop. - - tol : float, optional (default=1e-4) - Convergence criterion is - sum_i(|minimum of norm of subgrad of objective_i|)<=tol. - - selection : str, optional (default='cyclic') - If 'random', randomly chose features in inner loop. - - random_state : {int, RandomState instance, None}, optional (default=None) - - diag_fisher : boolean, optional (default=False) - ``False`` calculates full fisher matrix, ``True`` only diagonal matrix - s.t. fisher = X.T @ diag @ X. This saves storage but needs more - matrix-vector multiplications. - - copy_X : boolean, optional (default=True) - If ``True``, X will be copied; else, it may be overwritten. - - Returns - ------- - coef : ndarray, shape (c,) - If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shape[1] + 1. - - n_iter : number of outer iterations = newton iterations - - n_cycles : number of cycles over features - - References - ---------- - Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - An Improved GLMNET for L1-regularized Logistic Regression, - Journal of Machine Learning Research 13 (2012) 1999-2030 - https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf - """ - X = check_array(X, 'csc', dtype=[np.float64, np.float32], - order='F', copy=copy_X) - if P2.ndim == 2: - P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32], - order='F', copy=copy_X) - if sparse.issparse(X): - if not sparse.isspmatrix_csc(P2): - raise ValueError("If X is sparse, P2 must also be sparse csc" - "format. Got P2 not sparse.") - random_state = check_random_state(random_state) - # Note: we already set P2 = l2*P2, P1 = l1*P1 - # Note: we already symmetrized P2 = 1/2 (P2 + P2') - n_iter = 0 # number of outer iterations - n_cycles = 0 # number of (complete) cycles over features - converged = False - n_samples, n_features = X.shape - idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept - # line search parameters - (beta, sigma) = (0.5, 0.01) - # some precalculations - # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a - # 1d array representing a diagonal matrix. 
- eta, mu, score, fisher = family._eta_mu_score_fisher( - coef=coef, phi=1, X=X, y=y, weights=weights, link=link, - diag_fisher=diag_fisher) - # set up space for search direction d for inner loop - d = np.zeros_like(coef) - # initial stopping tolerance of inner loop - # use L1-norm of minimum of norm of subgradient of F - inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) - inner_tol = linalg.norm(inner_tol, ord=1) - # outer loop - while n_iter < max_iter: - n_iter += 1 - # initialize search direction d (to be optimized) with zero - d.fill(0) - # inner loop = _cd_cycle - d, coef_P2, n_cycles, inner_tol = \ - _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, - max_inner_iter=max_inner_iter, selection=selection, - random_state=random_state, diag_fisher=diag_fisher) - # line search by sequence beta^k, k=0, 1, .. - # F(w + lambda d) - F(w) <= lambda * bound - # bound = sigma * (f'(w)*d + w*P2*d - # +||P1 (w+d)||_1 - ||P1 w||_1) - P1w_1 = linalg.norm(P1 * coef[idx:], ord=1) - P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1) - # Note: coef_P2 already calculated and still valid - bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1) - Fw = (0.5 * family.deviance(y, mu, weights) + - 0.5 * (coef_P2 @ coef[idx:]) + P1w_1) - la = 1./beta - for k in range(20): - la *= beta # starts with la=1 - coef_wd = coef + la * d - mu_wd = link.inverse(_safe_lin_pred(X, coef_wd)) - Fwd = (0.5 * family.deviance(y, mu_wd, weights) + - linalg.norm(P1 * coef_wd[idx:], ord=1)) - if P2.ndim == 1: - Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:]) - else: - Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:])) - if Fwd - Fw <= sigma * la * bound: - break - # update coefficients - coef += la * d - # calculate eta, mu, score, Fisher matrix for next iteration - eta, mu, score, fisher = family._eta_mu_score_fisher( - coef=coef, phi=1, X=X, y=y, weights=weights, link=link, - diag_fisher=diag_fisher) - # stopping criterion for outer loop - # sum_i(|minimum-norm of subgrad of F(w)_i|) - # fp_wP2 = f'(w) + w*P2 - # Note: eta, mu and score are already updated - mn_subgrad = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) - mn_subgrad = linalg.norm(mn_subgrad, ord=1) - if mn_subgrad <= tol: - converged = True - break - # end of outer loop - if not converged: - warnings.warn("Coordinate descent failed to converge. Increase" - " the maximum number of iterations max_iter" - " (currently {0})".format(max_iter), ConvergenceWarning) - - return coef, n_iter, n_cycles - - class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. @@ -1329,28 +947,10 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): priors as regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) - + alpha * l1_ratio * ||P1*w||_1 - + 1/2 * alpha * (1 - l1_ratio) * w*P2*w - - with inverse link function h and s=sample_weight. Note that for - ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). - For ``P1=P2='identity'``, the penalty is the elastic net:: - - alpha * l1_ratio * ||w||_1 - + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 - - If you are interested in controlling the L1 and L2 penalties - separately, keep in mind that this is equivalent to:: + + 1/2 * alpha * w*P2*w - a * L1 + b * L2 - - where:: - - alpha = a + b and l1_ratio = a / (a + b) - - The parameter ``l1_ratio`` corresponds to alpha in the R package glmnet, - while ``alpha`` corresponds to the lambda parameter in glmnet. 
- Specifically, l1_ratio = 1 is the lasso penalty. + with inverse link function h and s=sample_weight. + The parameter ``alpha`` corresponds to the lambda parameter in glmnet. Read more in the :ref:`User Guide `. @@ -1364,19 +964,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): case, the design matrix X must have full column rank (no collinearities). - l1_ratio : float, optional (default=0) - The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For - ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it - is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a - combination of L1 and L2. - - P1 : {'identity', array-like}, shape (n_features,), optional \ - (default='identity') - With this array, you can exclude coefficients from the L1 penalty. - Set the corresponding value to 1 (include) or 0 (exclude). The - default value ``'identity'`` is the same as a 1d array of ones. - Note that n_features = X.shape[1]. - P2 : {'identity', array-like, sparse matrix}, shape \ (n_features,) or (n_features, n_features), optional \ (default='identity') @@ -1416,18 +1003,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ + solver : {'auto', 'irls', 'lbfgs', 'newton-cg'}, \ optional (default='auto') Algorithm to use in the optimization problem: 'auto' - Sets 'irls' if l1_ratio equals 0, else 'cd'. - - 'cd' - Coordinate descent algorithm. It can deal with L1 as well as L2 - penalties. Note that in order to avoid unnecessary memory - duplication of X in the ``fit`` method, X should be directly passed - as a Fortran-contiguous numpy array or sparse csc matrix. + Sets 'irls' 'irls' Iterated reweighted least squares. @@ -1450,31 +1031,17 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of - the objective function. For the cd solver, convergence is reached - when ``sum_i(|minimum-norm of g_i|)``, where ``g_i`` is the - subgradient of the objective and minimum-norm of ``g_i`` is the element - of the subgradient ``g_i`` with the smallest L2-norm. + the objective function. warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` as initialization for ``coef_`` and ``intercept_``. - selection : str, optional (default='cyclic') - For the solver 'cd' (coordinate descent), the coordinates (features) - can be updated in either cyclic or random order. - If set to 'random', a random coefficient is updated every iteration - rather than looping over features sequentially in the same order. This - (setting to 'random') often leads to significantly faster convergence - especially when tol is higher than 1e-4. - random_state : {int, RandomState instance, None}, optional (default=None) - The seed of the pseudo random number generator that selects a random - feature to be updated for solver 'cd' (coordinate descent). If int, random_state is the seed used by the random number generator; if RandomState instance, random_state is the random number generator; if None, the random number generator is the - RandomState instance used by `np.random`. Used when ``selection`` == - 'random'. + RandomState instance used by `np.random`. 
diag_fisher : boolean, optional, (default=False) Only relevant for solver 'cd'. @@ -1547,15 +1114,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Journal of Machine Learning Research 13 (2012) 1999-2030 https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ - def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', + def __init__(self, alpha=1.0, P2='identity', fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, - selection='cyclic', random_state=None, diag_fisher=False, + random_state=None, diag_fisher=False, copy_X=True, check_input=True, verbose=0): self.alpha = alpha - self.l1_ratio = l1_ratio - self.P1 = P1 self.P2 = P2 self.fit_intercept = fit_intercept self.family = family @@ -1565,7 +1130,6 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', self.max_iter = max_iter self.tol = tol self.warm_start = warm_start - self.selection = selection self.random_state = random_state self.diag_fisher = diag_fisher self.copy_X = copy_X @@ -1645,28 +1209,16 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: raise ValueError("Penalty term must be a non-negative number;" " got (alpha={0})".format(self.alpha)) - if (not isinstance(self.l1_ratio, numbers.Number) or - self.l1_ratio < 0 or self.l1_ratio > 1): - raise ValueError("l1_ratio must be a number in interval [0, 1];" - " got (l1_ratio={0})".format(self.l1_ratio)) if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: + if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg']: raise ValueError("GeneralizedLinearRegressor supports only solvers" - " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';" + " 'auto', 'irls', 'lbfgs', 'newton-cg';" " got {0}".format(self.solver)) solver = self.solver if self.solver == 'auto': - if self.l1_ratio == 0: - solver = 'irls' - else: - solver = 'cd' - if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): - raise ValueError("The chosen solver (solver={0}) can't deal " - "with L1 penalties, which are included with " - "(alpha={1}) and (l1_ratio={2})." 
- .format(solver, self.alpha, self.l1_ratio)) + solver = 'irls' if (not isinstance(self.max_iter, int) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " @@ -1678,10 +1230,6 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.warm_start, bool): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) - if self.selection not in ['cyclic', 'random']: - raise ValueError("The argument selection must be 'cyclic' or " - "'random'; got (selection={0})" - .format(self.selection)) random_state = check_random_state(self.random_state) if not isinstance(self.diag_fisher, bool): raise ValueError("The argument diag_fisher must be bool;" @@ -1698,16 +1246,10 @@ def fit(self, X, y, sample_weight=None): # 1.2 validate arguments of fit ####################################### _dtype = [np.float64, np.float32] - if solver == 'cd': - _stype = ['csc'] - else: - _stype = ['csc', 'csr'] + _stype = ['csc', 'csr'] X, y = check_X_y(X, y, accept_sparse=_stype, dtype=_dtype, y_numeric=True, multi_output=False, copy=self.copy_X) - # Without converting y to float, deviance might raise - # ValueError: Integers to negative integer powers are not allowed. - # Also, y must not be sparse. y = np.asarray(y, dtype=np.float64) weights = _check_weights(sample_weight, y.shape[0]) @@ -1715,23 +1257,8 @@ def fit(self, X, y, sample_weight=None): n_samples, n_features = X.shape # 1.3 arguments to take special care ################################## - # P1, P2 - if isinstance(self.P1, str) and self.P1 == 'identity': - P1 = np.ones(n_features) - else: - P1 = np.atleast_1d(self.P1) - try: - P1 = P1.astype(np.float64, casting='safe', copy=False) - except TypeError: - raise TypeError("The given P1 cannot be converted to a numeric" - "array; got (P1.dtype={0})." - .format(P1.dtype)) - if (P1.ndim != 1) or (P1.shape[0] != n_features): - raise ValueError("P1 must be either 'identity' or a 1d array " - "with the length of X.shape[1]; " - "got (P1.shape[0]={0}), " - "needed (X.shape[1]={1})." - .format(P1.shape[0], n_features)) + # P2 + # If X is sparse, make P2 sparse, too. if isinstance(self.P2, str) and self.P2 == 'identity': if sparse.issparse(X): @@ -1766,10 +1293,8 @@ def fit(self, X, y, sample_weight=None): "got (P2.shape=({0}, {1})), needed ({2}, {2})" .format(P2.shape[0], P2.shape[1], X.shape[1])) - l1 = self.alpha * self.l1_ratio - l2 = self.alpha * (1 - self.l1_ratio) - # P1 and P2 are now for sure copies - P1 = l1 * P1 + l2 = self.alpha + # P2 is now for sure a copy P2 = l2 * P2 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric @@ -1792,11 +1317,6 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" .format(family.__class__.__name__)) - # check if P1 has only non-negative values, negative values might - # indicate group lasso in the future. - if not isinstance(self.P1, str): # if self.P1 != 'identity': - if not np.all(P1 >= 0): - raise ValueError("P1 must not have negative values.") # check if P2 is positive semidefinite # np.linalg.cholesky(P2) 'only' asserts positive definite if not isinstance(self.P2, str): # self.P2 != 'identity' @@ -1845,8 +1365,6 @@ def fit(self, X, y, sample_weight=None): # Note: Since phi=self.dispersion_ does not enter the estimation # of mu_i=E[y_i], set it to 1. 
- # set start values for coef - coef = None if self.warm_start and hasattr(self, 'coef_'): if self.fit_intercept: coef = np.concatenate((np.array([self.intercept_]), @@ -1975,18 +1493,6 @@ def Hs(coef): args=args, maxiter=self.max_iter, tol=self.tol) - # 4.4 coordinate descent ############################################## - # Note: we already set P1 = l1*P1, see above - # Note: we already set P2 = l2*P2, see above - # Note: we already symmetrized P2 = 1/2 (P2 + P2') - elif solver == 'cd': - coef, self.n_iter_, self._n_cycles = \ - _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, - P2=P2, fit_intercept=self.fit_intercept, - family=family, link=link, - max_iter=self.max_iter, tol=self.tol, - selection=self.selection, random_state=random_state, - diag_fisher=self.diag_fisher, copy_X=self.copy_X) ####################################################################### # 5. postprocessing # @@ -2097,9 +1603,6 @@ def estimate_phi(self, X, y, sample_weight=None): dev = self._family_instance.deviance(y, mu, weights) return dev/(n_samples - n_features) - # Note: check_estimator(GeneralizedLinearRegressor) might raise - # "AssertionError: -0.28014056555724598 not greater than 0.5" - # unless GeneralizedLinearRegressor has a score which passes the test. def score(self, X, y, sample_weight=None): """Compute D^2, the percentage of deviance explained. @@ -2212,8 +1715,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): If int, random_state is the seed used by the random number generator; if RandomState instance, random_state is the random number generator; if None, the random number generator is the - RandomState instance used by `np.random`. Used when ``selection`` == - 'random'. + RandomState instance used by `np.random`. copy_X : boolean, optional, (default=True) If ``True``, X will be copied; else, it may be overwritten. 
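For reference, the objective that this patch leaves in place is 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * w*P2*w, as stated in the updated GeneralizedLinearRegressor docstring above. The following minimal sketch is not part of the patch; it assumes the normal family with identity link (so the deviance is the weighted squared error) and a dense P2, and simply evaluates that objective for a given coefficient vector:

    import numpy as np

    def l2_penalized_objective(coef, X, y, sample_weight, alpha, P2):
        # deviance of the normal family with identity link:
        # weighted sum of squared residuals
        mu = X @ coef
        dev = np.sum(sample_weight * (y - mu) ** 2)
        # Tikhonov (L2) penalty 1/2 * alpha * w' P2 w
        penalty = 0.5 * alpha * coef @ (P2 @ coef)
        return dev / (2 * np.sum(sample_weight)) + penalty

    rng = np.random.RandomState(0)
    X = rng.randn(20, 3)
    y = X @ np.array([1.0, -2.0, 0.5]) + rng.randn(20)
    print(l2_penalized_objective(np.zeros(3), X, y,
                                 np.ones(20), alpha=1.0, P2=np.eye(3)))

With P2 equal to the identity matrix this reduces to the usual squared L2 (ridge) penalty, which is why 'identity' is the default.
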
diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index f51f630ebae7e..8fc1241e1da7a 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -25,7 +25,7 @@ from sklearn.utils.testing import assert_array_equal -GLM_SOLVERS = ['irls', 'lbfgs', 'newton-cg', 'cd'] +GLM_SOLVERS = ['irls', 'lbfgs', 'newton-cg'] @pytest.fixture(scope="module") @@ -236,28 +236,6 @@ def test_glm_alpha_argument(alpha): glm.fit(X, y) -@pytest.mark.parametrize('l1_ratio', ['not a number', -4.2, 1.1, [1]]) -def test_glm_l1_ratio_argument(l1_ratio): - """Test GLM for invalid l1_ratio argument.""" - y = np.array([1, 2]) - X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) - with pytest.raises(ValueError, - match="l1_ratio must be a number in interval.*0, 1"): - glm.fit(X, y) - - -@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3], - [-1]]) -def test_glm_P1_argument(P1): - """Test GLM for invalid P1 argument.""" - y = np.array([1, 2]) - X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(P1=P1, l1_ratio=0.5, check_input=True) - with pytest.raises((ValueError, TypeError)): - glm.fit(X, y) - - @pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], sparse.csr_matrix([1, 2, 3]), [-1]]) def test_glm_P2_argument(P2): @@ -301,14 +279,13 @@ def test_glm_fit_intercept_argument(fit_intercept): glm.fit(X, y) -@pytest.mark.parametrize('solver, l1_ratio', - [('not a solver', 0), (1, 0), ([1], 0), - ('irls', 0.5), ('lbfgs', 0.5), ('newton-cg', 0.5)]) -def test_glm_solver_argument(solver, l1_ratio): +@pytest.mark.parametrize('solver', + ['not a solver', 1, [1]]) +def test_glm_solver_argument(solver): """Test GLM for invalid solver argument.""" y = np.array([1, 2]) X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(solver=solver, l1_ratio=l1_ratio) + glm = GeneralizedLinearRegressor(solver=solver) with pytest.raises(ValueError): glm.fit(X, y) @@ -343,28 +320,6 @@ def test_glm_warm_start_argument(warm_start): glm.fit(X, y) -@pytest.mark.parametrize('start_params', - ['not a start_params', ['zero'], [0, 0, 0], - [[0, 0]], ['a', 'b']]) -def test_glm_start_params_argument(start_params): - """Test GLM for invalid start_params argument.""" - y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(start_params=start_params) - with pytest.raises(ValueError): - glm.fit(X, y) - - -@pytest.mark.parametrize('selection', ['not a selection', 1, 0, ['cyclic']]) -def test_glm_selection_argument(selection): - """Test GLM for invalid selection argument""" - y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(selection=selection) - with pytest.raises(ValueError, match="argument selection must be"): - glm.fit(X, y) - - @pytest.mark.parametrize('random_state', ['a string', 0.5, [0]]) def test_glm_random_state_argument(random_state): """Test GLM for invalid random_state argument.""" @@ -413,7 +368,7 @@ def test_glm_identity_regression(solver): y = np.dot(X, coef) glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', fit_intercept=False, solver=solver, - start_params='zero', tol=1e-7) + tol=1e-7) res = glm.fit(X, y) assert_allclose(res.coef_, coef, rtol=1e-6) @@ -427,7 +382,7 @@ def test_glm_identity_regression(solver): @pytest.mark.parametrize('solver, tol', [('irls', 1e-6), ('lbfgs', 1e-6), ('newton-cg', 1e-7), - ('cd', 1e-7)]) +]) def test_glm_log_regression(family, solver, tol): """Test GLM 
regression with log link on a simple dataset.""" coef = [0.2, -0.1] @@ -435,7 +390,7 @@ def test_glm_log_regression(family, solver, tol): y = np.exp(np.dot(X, coef)) glm = GeneralizedLinearRegressor( alpha=0, family=family, link='log', fit_intercept=False, - solver=solver, start_params='guess', tol=tol) + solver=solver, tol=tol) res = glm.fit(X, y) assert_allclose(res.coef_, coef, rtol=5e-6) @@ -472,14 +427,14 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): random_state=42, **ridge_params) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + glm = GeneralizedLinearRegressor(alpha=1.0, family='normal', link='identity', fit_intercept=True, max_iter=300, solver=solver, tol=1e-6, check_input=False, random_state=42) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, rtol=5e-6) - assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) @@ -487,7 +442,7 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): [('irls', 1e-7), ('lbfgs', 1e-7), ('newton-cg', 1e-7), - ('cd', 1e-7)]) +]) def test_poisson_ridge(solver, tol): """Test ridge regression with poisson family and LogLink. @@ -506,130 +461,24 @@ def test_poisson_ridge(solver, tol): X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) rng = np.random.RandomState(42) - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, + glm = GeneralizedLinearRegressor(alpha=1, fit_intercept=True, family='poisson', link='log', tol=1e-7, solver=solver, max_iter=300, random_state=rng) glm.fit(X, y) assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5) - assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-6) - - -@pytest.mark.parametrize('diag_fisher', [False, True]) -def test_normal_enet(diag_fisher): - """Test elastic net regression with normal/gaussian family.""" - alpha, l1_ratio = 0.3, 0.7 - n_samples, n_features = 20, 2 - rng = np.random.RandomState(42) - X = rng.randn(n_samples, n_features).copy(order='F') - beta = rng.randn(n_features) - y = 2 + np.dot(X, beta) + rng.randn(n_samples) - - # 1. test normal enet on dense data - glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, - family='normal', link='identity', - fit_intercept=True, tol=1e-8, - max_iter=100, selection='cyclic', - solver='cd', start_params='zero', - check_input=False, - diag_fisher=diag_fisher) - glm.fit(X, y) - - enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, - normalize=False, tol=1e-8, copy_X=True) - enet.fit(X, y) - - assert_allclose(glm.intercept_, enet.intercept_, rtol=2e-7) - assert_allclose(glm.coef_, enet.coef_, rtol=5e-5) - - # 2. test normal enet on sparse data - X = sparse.csc_matrix(X) - glm.fit(X, y) - assert_allclose(glm.intercept_, enet.intercept_, rtol=2e-7) - assert_allclose(glm.coef_, enet.coef_, rtol=5e-5) - - -def test_poisson_enet(): - """Test elastic net regression with poisson family and LogLink. - - Compare to R's glmnet""" - # library("glmnet") - # options(digits=10) - # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) - # x <- data.matrix(df[,c("a", "b")]) - # y <- df$y - # fit <- glmnet(x=x, y=y, alpha=0.5, intercept=T, family="poisson", - # standardize=F, thresh=1e-10, nlambda=10000) - # coef(fit, s=1) - # (Intercept) -0.03550978409 - # a 0.16936423283 - # b . 
- glmnet_intercept = -0.03550978409 - glmnet_coef = [0.16936423283, 0.] - X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T - y = np.array([0, 1, 1, 2]) - rng = np.random.RandomState(42) - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', solver='cd', tol=1e-8, - selection='random', random_state=rng, - start_params='guess') - glm.fit(X, y) - assert_allclose(glm.intercept_, glmnet_intercept, rtol=2e-6) - assert_allclose(glm.coef_, glmnet_coef, rtol=2e-7) - - # test results with general optimization procedure - def obj(coef): - pd = PoissonDistribution() - link = LogLink() - N = y.shape[0] - mu = link.inverse(X @ coef[1:] + coef[0]) - alpha, l1_ratio = (1, 0.5) - return 1./(2.*N) * pd.deviance(y, mu) \ - + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ - + alpha * l1_ratio * np.sum(np.abs(coef[1:])) - res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, - options={'maxiter': 1000, 'disp': False}) - assert_allclose(glm.intercept_, res.x[0], rtol=5e-5) - assert_allclose(glm.coef_, res.x[1:], rtol=1e-5, atol=1e-9) - assert_allclose(obj(np.concatenate(([glm.intercept_], glm.coef_))), - res.fun, rtol=1e-8) - - # same for start_params='zero' and selection='cyclic' - # with reduced precision - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', solver='cd', tol=1e-5, - selection='cyclic', start_params='zero') - glm.fit(X, y) - assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-4) - assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) - - # check warm_start, therefore start with different alpha - glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, - family='poisson', max_iter=300, - link='log', solver='cd', tol=1e-5, - selection='cyclic', start_params='zero') - glm.fit(X, y) - # warm start with original alpha and use of sparse matrices - glm.warm_start = True - glm.alpha = 1 - X = sparse.csr_matrix(X) - glm.fit(X, y) - assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-4) - assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) + assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5) @pytest.mark.parametrize( "params", [ - {"solver": "irls", "start_params": "guess"}, - {"solver": "irls", "start_params": "zero"}, - {"solver": "lbfgs", "start_params": "guess"}, - {"solver": "lbfgs", "start_params": "zero"}, + {"solver": "irls" }, + {"solver": "irls" }, + {"solver": "lbfgs" }, + {"solver": "lbfgs"}, {"solver": "newton-cg"}, - {"solver": "cd", "selection": "cyclic", "diag_fisher": False}, - {"solver": "cd", "selection": "cyclic", "diag_fisher": True}, - {"solver": "cd", "selection": "random", "diag_fisher": False}, ], ids=lambda params: ', '.join("%s=%s" % (key, val) for key, val in params.items()) From d25042e23c595b34cd5382a60bfd39613ac3a2ae Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 10:50:27 -0500 Subject: [PATCH 070/209] Remove newton CG algorithm --- sklearn/linear_model/_glm.py | 100 +++---------------------- sklearn/linear_model/tests/test_glm.py | 8 +- 2 files changed, 13 insertions(+), 95 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 15211c6038007..e11d7c08064ce 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -17,7 +17,6 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y -from ..utils.optimize import newton_cg from ..utils.validation import check_is_fitted, check_random_state @@ 
-1003,7 +1002,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'irls', 'lbfgs', 'newton-cg'}, \ + solver : {'auto', 'irls', 'lbfgs'}, \ optional (default='auto') Algorithm to use in the optimization problem: @@ -1016,10 +1015,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): L1 penalties. 'lbfgs' - Calls scipy's L-BFGS-B optimizer. It cannot deal with L1 penalties. + Calls scipy's L-BFGS-B optimizer. - 'newton-cg', 'lbfgs' - Newton conjugate gradient algorithm cannot deal with L1 penalties. Note that all solvers except lbfgs use the fisher matrix, i.e. the expected Hessian instead of the Hessian matrix. @@ -1028,7 +1025,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The maximal number of iterations for solver algorithms. tol : float, optional (default=1e-4) - Stopping criterion. For the irls, newton-cg and lbfgs solvers, + Stopping criterion. For the irls, and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of the objective function. @@ -1212,9 +1209,9 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg']: + if self.solver not in ['auto', 'irls', 'lbfgs']: raise ValueError("GeneralizedLinearRegressor supports only solvers" - " 'auto', 'irls', 'lbfgs', 'newton-cg';" + "'auto', 'irls', 'lbfgs';" " got {0}".format(self.solver)) solver = self.solver if self.solver == 'auto': @@ -1410,6 +1407,9 @@ def func(coef, X, y, weights, P2, family, link): return obj, objp args = (X, y, weights, P2, family, link) + # TODO: refactor this once + # https://github.com/scikit-learn/scikit-learn/pull/14250 + # is merged. coef, loss, info = fmin_l_bfgs_b( func, coef, fprime=None, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, @@ -1423,76 +1423,6 @@ def func(coef, X, y, weights, P2, family, link): .format(info["task"])) self.n_iter_ = info['nit'] - # 4.3 Newton-CG ####################################################### - # We use again the fisher matrix instead of the hessian. More - # precisely, expected hessian of deviance. 
- elif solver == 'newton-cg': - def func(coef, X, y, weights, P2, family, link): - intercept = (coef.size == X.shape[1] + 1) - idx = 1 if intercept else 0 # offset if coef[0] is intercept - if P2.ndim == 1: - L2 = coef[idx:] @ (P2 * coef[idx:]) - else: - L2 = coef[idx:] @ (P2 @ coef[idx:]) - mu = link.inverse(_safe_lin_pred(X, coef)) - return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 - - def grad(coef, X, y, weights, P2, family, link): - mu, devp = \ - family._mu_deviance_derivative(coef, X, y, weights, link) - intercept = (coef.size == X.shape[1] + 1) - idx = 1 if intercept else 0 # offset if coef[0] is intercept - if P2.ndim == 1: - L2 = P2 * coef[idx:] - else: - L2 = P2 @ coef[idx:] - objp = 0.5 * devp - objp[idx:] += L2 - return objp - - def grad_hess(coef, X, y, weights, P2, family, link): - intercept = (coef.size == X.shape[1] + 1) - idx = 1 if intercept else 0 # offset if coef[0] is intercept - if P2.ndim == 1: - L2 = P2 * coef[idx:] - else: - L2 = P2 @ coef[idx:] - eta = _safe_lin_pred(X, coef) - mu = link.inverse(eta) - d1 = link.inverse_derivative(eta) - temp = d1 * family.deviance_derivative(y, mu, weights) - if intercept: - grad = np.concatenate(([0.5 * temp.sum()], - 0.5 * temp @ X + L2)) - else: - grad = 0.5 * temp @ X + L2 # same as 0.5* X.T @ temp + L2 - - # expected hessian = fisher = X.T @ diag_matrix @ X - # calculate only diag_matrix - diag = d1**2 / family.variance(mu, phi=1, weights=weights) - if intercept: - h0i = np.concatenate(([diag.sum()], diag @ X)) - - def Hs(coef): - # return (0.5 * fisher + P2) @ coef - # ret = 0.5 * (X.T @ (diag * (X @ coef))) - ret = 0.5 * ((diag * (X @ coef[idx:])) @ X) - if P2.ndim == 1: - ret += P2 * coef[idx:] - else: - ret += P2 @ coef[idx:] - if intercept: - ret = np.concatenate(([0.5 * (h0i @ coef)], - ret + 0.5 * coef[0] * h0i[1:])) - return ret - - return grad, Hs - - args = (X, y, weights, P2, family, link) - coef, self.n_iter_ = newton_cg(grad_hess, func, grad, coef, - args=args, maxiter=self.max_iter, - tol=self.tol) - ####################################################################### # 5. postprocessing # @@ -1511,7 +1441,7 @@ def Hs(coef): return self - def linear_predictor(self, X): + def _linear_predictor(self, X): """Compute the linear_predictor = X*coef_ + intercept_. Parameters @@ -1552,7 +1482,7 @@ def predict(self, X, sample_weight=None): X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', copy=True, ensure_2d=True, allow_nd=False) - eta = self.linear_predictor(X) + eta = self._linear_predictor(X) mu = self._link_instance.inverse(eta) weights = _check_weights(sample_weight, X.shape[0]) @@ -1682,7 +1612,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'irls', 'lbfgs', 'newton-cg'}, optional (default='irls') + solver : {'irls', 'lbfgs'}, optional (default='irls') Algorithm to use in the optimization problem: 'irls' @@ -1692,17 +1622,11 @@ class PoissonRegressor(GeneralizedLinearRegressor): 'lbfgs' Calls scipy's L-BFGS-B optimizer. - 'newton-cg' - Newton conjugate gradient algorithm. - - Note that all solvers except lbfgs use the fisher matrix, i.e. the - expected Hessian instead of the Hessian matrix. - max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. tol : float, optional (default=1e-4) - Stopping criterion. For the irls, newton-cg and lbfgs solvers, + Stopping criterion. 
For the irls, and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of the objective function. diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 8fc1241e1da7a..0cee56afb3042 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -25,7 +25,7 @@ from sklearn.utils.testing import assert_array_equal -GLM_SOLVERS = ['irls', 'lbfgs', 'newton-cg'] +GLM_SOLVERS = ['irls', 'lbfgs'] @pytest.fixture(scope="module") @@ -381,7 +381,6 @@ def test_glm_identity_regression(solver): ]) @pytest.mark.parametrize('solver, tol', [('irls', 1e-6), ('lbfgs', 1e-6), - ('newton-cg', 1e-7), ]) def test_glm_log_regression(family, solver, tol): """Test GLM regression with log link on a simple dataset.""" @@ -395,9 +394,6 @@ def test_glm_log_regression(family, solver, tol): assert_allclose(res.coef_, coef, rtol=5e-6) -# newton-cg may issue a LineSearchWarning, which we filter out -@pytest.mark.filterwarnings('ignore:The line search algorithm') -@pytest.mark.filterwarnings('ignore:Line Search failed') @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) @pytest.mark.parametrize('fit_intercept', [True, False]) @pytest.mark.parametrize('solver', GLM_SOLVERS) @@ -441,7 +437,6 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): @pytest.mark.parametrize('solver, tol', [('irls', 1e-7), ('lbfgs', 1e-7), - ('newton-cg', 1e-7), ]) def test_poisson_ridge(solver, tol): """Test ridge regression with poisson family and LogLink. @@ -478,7 +473,6 @@ def test_poisson_ridge(solver, tol): {"solver": "irls" }, {"solver": "lbfgs" }, {"solver": "lbfgs"}, - {"solver": "newton-cg"}, ], ids=lambda params: ', '.join("%s=%s" % (key, val) for key, val in params.items()) From 07ee4954ef118227832d9ac2ad562a5aec7af38b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 10:54:20 -0500 Subject: [PATCH 071/209] Remove fisher_matrix, _observed_information and _eta_mu_score_fisher --- sklearn/linear_model/_glm.py | 120 ------------------------- sklearn/linear_model/tests/test_glm.py | 43 --------- 2 files changed, 163 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index e11d7c08064ce..fa434821bb80e 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -253,9 +253,6 @@ class ExponentialDispersionModel(metaclass=ABCMeta): _mu_deviance_derivative _score - _fisher_matrix - _observed_information - _eta_mu_score_fisher References ---------- @@ -518,123 +515,6 @@ def _score(self, coef, phi, X, y, weights, link): score = temp @ X # sampe as X.T @ temp return score - def _fisher_matrix(self, coef, phi, X, y, weights, link): - r"""Compute the Fisher information matrix. - - The Fisher information matrix, also known as expected information - matrix is given by - - .. math: - - \mathbf{F}(\boldsymbol{w}) = - \mathrm{E}\left[-\frac{\partial\mathbf{score}}{\partial - \boldsymbol{w}} \right] - = \mathrm{E}\left[ - -\frac{\partial^2 loglike}{\partial\boldsymbol{w} - \partial\boldsymbol{w}^T}\right] - = \mathbf{X}^T W \mathbf{X} \,, - - with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, - see func:`_score`. 
- """ - lin_pred = _safe_lin_pred(X, coef) - mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) - d = link.inverse_derivative(lin_pred) - d2_sigma_inv = sigma_inv * d * d - intercept = (coef.size == X.shape[1] + 1) - fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, - intercept=intercept) - return fisher_matrix - - def _observed_information(self, coef, phi, X, y, weights, link): - r"""Compute the observed information matrix. - - The observed information matrix, also known as the negative of - the Hessian matrix of the log-likelihood, is given by - - .. math: - - \mathbf{H}(\boldsymbol{w}) = - -\frac{\partial^2 loglike}{\partial\boldsymbol{w} - \partial\boldsymbol{w}^T} - = \mathbf{X}^T \left[ - - \mathbf{D}' \mathbf{R} - + \mathbf{D}^2 \mathbf{V} \mathbf{R} - + \mathbf{D}^2 - \right] \boldsymbol{\Sigma}^{-1} \mathbf{X} \,, - - with :math:`\mathbf{R} = \mathrm{diag}(y_i - \mu_i)`, - :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{ - v(\mu_i)} - \right)`, - see :func:`score_` function and :func:`_fisher_matrix`. - """ - lin_pred = _safe_lin_pred(X, coef) - mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) - dp = link.inverse_derivative2(lin_pred) - d2 = link.inverse_derivative(lin_pred)**2 - v = self.unit_variance_derivative(mu)/self.unit_variance(mu) - r = y - mu - temp = sigma_inv * (-dp * r + d2 * v * r + d2) - intercept = (coef.size == X.shape[1] + 1) - observed_information = _safe_sandwich_dot(X, temp, - intercept=intercept) - return observed_information - - def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, - diag_fisher=False): - """Compute linear predictor, mean, score function and fisher matrix. - - It calculates the linear predictor, the mean, score function - (derivative of log-likelihood) and Fisher information matrix - all in one go as function of `coef` (:math:`w`) and the data. - - Parameters - ---------- - diag_fisher : boolean, optional (default=False) - If ``True``, returns only an array d such that - fisher = X.T @ np.diag(d) @ X. - - Returns - ------- - (eta, mu, score, fisher) : tuple with 4 elements - The 4 elements are: - - * eta: ndarray, shape (X.shape[0],) - * mu: ndarray, shape (X.shape[0],) - * score: ndarray, shape (X.shape[0],) - * fisher: - - * If diag_fisher is ``False``, the full fisher matrix, - an array of shape (X.shape[1], X.shape[1]) - * If diag_fisher is ``True`, an array of shape (X.shape[0]) - """ - intercept = (coef.size == X.shape[1] + 1) - # eta = linear predictor - eta = _safe_lin_pred(X, coef) - mu = link.inverse(eta) - sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) - d1 = link.inverse_derivative(eta) # = h'(eta) - # Alternatively: - # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g - # d1 = 1./link.derivative(mu) - d1_sigma_inv = d1 * sigma_inv - temp = d1_sigma_inv * (y - mu) - if intercept: - score = np.concatenate(([temp.sum()], temp @ X)) - else: - score = temp @ X - - d2_sigma_inv = d1 * d1_sigma_inv - if diag_fisher: - fisher_matrix = d2_sigma_inv - else: - fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, - intercept=intercept) - return eta, mu, score, fisher_matrix - class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. 
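All three removed helpers were built around the expected information matrix F = X' W X with W = diag(h'(eta_i)^2 / Var[y_i]), as described in their docstrings. The sketch below is not part of the patch; it hard-codes the Poisson family with log link, where h'(eta) = mu and Var[y_i] = phi*mu_i/s_i, so the diagonal weight reduces to s_i*mu_i/phi, and it rebuilds the same matrix directly:

    import numpy as np

    def fisher_poisson_log(coef, X, sample_weight, phi=1.0):
        # eta = linear predictor, mu = inverse link h(eta) = exp(eta)
        eta = X @ coef
        mu = np.exp(eta)
        # W_ii = h'(eta_i)^2 / Var[y_i] = mu_i^2 / (phi * mu_i / s_i)
        w = sample_weight * mu / phi
        return (X * w[:, np.newaxis]).T @ X     # X' diag(w) X

    rng = np.random.RandomState(42)
    X = rng.randn(10, 5)
    coef = np.array([-2.0, 1.0, 0.0, 1.0, 2.5])
    F = fisher_poisson_log(coef, X, np.ones(10))
    # the expected information is symmetric positive semi-definite
    assert np.allclose(F, F.T)
    assert np.all(np.linalg.eigvalsh(F) >= -1e-10)

This is the matrix that the removed test_fisher_matrix test (deleted from test_glm.py just below) checked numerically against finite differences of the score.
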
diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 0cee56afb3042..b9716388fb36c 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -109,49 +109,6 @@ def test_deviance_zero(family, chk_values): assert_allclose(family.deviance(x, x), 0, atol=1e-9) -@pytest.mark.parametrize( - 'family, link', - [(NormalDistribution(), IdentityLink()), - (PoissonDistribution(), LogLink()), - (GammaDistribution(), LogLink()), - (InverseGaussianDistribution(), LogLink()), - (TweedieDistribution(power=1.5), LogLink()), - (TweedieDistribution(power=4.5), LogLink())], - ids=lambda args: args.__class__.__name__) -def test_fisher_matrix(family, link): - """Test the Fisher matrix numerically. - Trick: Use numerical differentiation with y = mu""" - coef = np.array([-2, 1, 0, 1, 2.5]) - phi = 0.5 - rng = np.random.RandomState(42) - X = rng.randn(10, 5) - lin_pred = np.dot(X, coef) - mu = link.inverse(lin_pred) - weights = rng.randn(10)**2 + 1 - fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link) - # check that the Fisher matrix is square and positive definite - assert fisher.ndim == 2 - assert fisher.shape[0] == fisher.shape[1] - assert np.all(np.linalg.eigvals(fisher) >= 0) - - approx = np.array([]).reshape(0, coef.shape[0]) - for i in range(coef.shape[0]): - def f(coef): - return -family._score(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link)[i] - approx = np.vstack( - [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) - assert_allclose(fisher, approx, rtol=1e-3) - - # check the observed information matrix - oim = family._observed_information(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link) - assert oim.ndim == 2 - assert oim.shape == fisher.shape - assert_allclose(oim, fisher) - - def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" # scalar value but not positive From d0eb2850b91e74d1e8591f1bbd758cb06d85a3bc Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 11:21:18 -0500 Subject: [PATCH 072/209] Remove matrix L2 penalty and IRLS solver --- sklearn/linear_model/_glm.py | 327 ++----------------------- sklearn/linear_model/tests/test_glm.py | 57 +---- 2 files changed, 26 insertions(+), 358 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index fa434821bb80e..b469b3c4edd17 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -669,164 +669,16 @@ def __init__(self): } -def _irls_step(X, W, P2, z, fit_intercept=True): - """Compute one step in iteratively reweighted least squares. - - Solve A w = b for w with - A = (X' W X + P2) - b = X' W z - z = eta + D^-1 (y-mu) - - See also fit method of :class:`GeneralizedLinearRegressor`. - - Parameters - ---------- - X : {ndarray, sparse matrix}, shape (n_samples, n_features) - Training data (with intercept included if present) - - W : ndarray, shape (n_samples,) - - P2 : {ndarray, sparse matrix}, shape (n_features, n_features) - The L2-penalty matrix or vector (=diagonal matrix) - - z : ndarray, shape (n_samples,) - Working observations - - fit_intercept : boolean, optional (default=True) - - Returns - ------- - coef : ndarray, shape (c,) - If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shapee[1] + 1. - """ - # Note: solve vs least squares, what is more appropriate? - # scipy.linalg.solve seems faster, but scipy.linalg.lstsq - # is more robust. 
- # Note: X.T @ W @ X is not sparse, even when X is sparse. - # Sparse solver would splinalg.spsolve(A, b) or splinalg.lsmr(A, b) - if fit_intercept: - Wz = W * z - if sparse.issparse(X): - b = np.concatenate(([Wz.sum()], X.transpose() @ Wz)) - else: - b = np.concatenate(([Wz.sum()], X.T @ Wz)) - A = _safe_sandwich_dot(X, W, intercept=fit_intercept) - if P2.ndim == 1: - idx = np.arange(start=1, stop=A.shape[0]) - A[(idx, idx)] += P2 # add to diag elements without intercept - elif sparse.issparse(P2): - A[1:, 1:] += P2.toarray() - else: - A[1:, 1:] += P2 - else: - if sparse.issparse(X): - XtW = X.transpose().multiply(W) - # for older versions of numpy and scipy, A may be a np.matrix - A = _safe_toarray(XtW @ X) - else: - XtW = (X.T * W) - A = XtW @ X - b = XtW @ z - if P2.ndim == 1: - A[np.diag_indices_from(A)] += P2 - elif sparse.issparse(P2): - A += P2.toarray() - else: - A += P2 - - coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) - return coef - - -def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, - max_iter, tol): - """Solve GLM with L2 penalty by IRLS algorithm. - - Note: If X is sparse, P2 must also be sparse. - """ - # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' - # Obj = objective function = 1/2 Dev + l2/2 w P2 w - # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 - # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) - # D2 = link.inverse_derivative(eta)^2 = D^2 - # W = D2/V(mu) - # l2 = alpha - # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w - # = -X' D (y-mu)/V(mu) + l2 P2 w - # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 - # Use Fisher matrix instead of full info matrix -X'(...) X, - # i.e. E[Dev''] with E[y-mu]=0: - # Obj'' ~ X' W X + l2 P2 - # (1): w = (X' W X + l2 P2)^-1 X' W z, - # with z = eta + D^-1 (y-mu) - # Note: P2 must be symmetrized - # Note: ' denotes derivative, but also transpose for matrices - - eta = _safe_lin_pred(X, coef) - mu = link.inverse(eta) - # D = h'(eta) - hp = link.inverse_derivative(eta) - V = family.variance(mu, phi=1, weights=weights) - - converged = False - n_iter = 0 - while n_iter < max_iter: - n_iter += 1 - # coef_old not used so far. - # coef_old = coef - # working weights W, in principle a diagonal matrix - # therefore here just as 1d array - W = hp**2 / V - # working observations - z = eta + (y - mu) / hp - # solve A*coef = b - # A = X' W X + P2, b = X' W z - coef = _irls_step(X, W, P2, z, fit_intercept=fit_intercept) - # updated linear predictor - # do it here for updated values for tolerance - eta = _safe_lin_pred(X, coef) - mu = link.inverse(eta) - hp = link.inverse_derivative(eta) - V = family.variance(mu, phi=1, weights=weights) - - # which tolerace? |coef - coef_old| or gradient? - # use gradient for compliance with newton-cg and lbfgs - # gradient = -X' D (y-mu)/V(mu) + l2 P2 w - temp = hp * (y - mu) / V - if sparse.issparse(X): - gradient = -(X.transpose() @ temp) - else: - gradient = -(X.T @ temp) - idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept - if P2.ndim == 1: - gradient += P2 * coef[idx:] - else: - gradient += P2 @ coef[idx:] - if fit_intercept: - gradient = np.concatenate(([-temp.sum()], gradient)) - if (np.max(np.abs(gradient)) <= tol): - converged = True - break - - if not converged: - warnings.warn("irls failed to converge. 
Increase the number " - "of iterations (currently {0})" - .format(max_iter), ConvergenceWarning) - - return coef, n_iter - - class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at fitting and predicting the mean of the target y as mu=h(X*w). Therefore, - the fit minimizes the following objective function with combined L1 and L2 + the fit minimizes the following objective function with L2 priors as regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) - + 1/2 * alpha * w*P2*w + + 1/2 * alpha * |w|_2 with inverse link function h and s=sample_weight. The parameter ``alpha`` corresponds to the lambda parameter in glmnet. @@ -843,18 +695,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): case, the design matrix X must have full column rank (no collinearities). - P2 : {'identity', array-like, sparse matrix}, shape \ - (n_features,) or (n_features, n_features), optional \ - (default='identity') - With this option, you can set the P2 matrix in the L2 penalty `w*P2*w`. - This gives a fine control over this penalty (Tikhonov regularization). - A 2d array is directly used as the square matrix P2. A 1d array is - interpreted as diagonal (square) matrix. The default 'identity' sets - the identity matrix, which gives the usual squared L2-norm. If you just - want to exclude certain coefficients, pass a 1d array filled with 1, - and 0 for the coefficients to be excluded. - Note that P2 must be positive semi-definite. - fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). @@ -882,17 +722,11 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'irls', 'lbfgs'}, \ - optional (default='auto') + solver : {'auto', 'lbfgs'}, optional (default='auto') Algorithm to use in the optimization problem: 'auto' - Sets 'irls' - - 'irls' - Iterated reweighted least squares. - It is the standard algorithm for GLMs. It cannot deal with - L1 penalties. + Sets 'lbfgs' 'lbfgs' Calls scipy's L-BFGS-B optimizer. @@ -905,7 +739,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The maximal number of iterations for solver algorithms. tol : float, optional (default=1e-4) - Stopping criterion. For the irls, and lbfgs solvers, + Stopping criterion. For the lbfgs solver, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of the objective function. @@ -920,22 +754,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): number generator; if None, the random number generator is the RandomState instance used by `np.random`. - diag_fisher : boolean, optional, (default=False) - Only relevant for solver 'cd'. - If ``False``, the full Fisher matrix (expected Hessian) is computed in - each outer iteration (Newton iteration). If ``True``, only a diagonal - matrix (stored as 1d array) is computed, such that - fisher = X.T @ diag @ X. This saves memory and matrix-matrix - multiplications, but needs more matrix-vector multiplications. If you - use large sparse X or if you have many features, - i.e. n_features >> n_samples, you might set this option to ``True``. 
- copy_X : boolean, optional, (default=True) If ``True``, X will be copied; else, it may be overwritten. check_input : boolean, optional (default=True) Allow to bypass several checks on input: y values in range of family, - sample_weight non-negative, P2 positive semi-definite. + sample_weight non-negative. Don't use this parameter unless you know what you do. verbose : int, optional (default=0) @@ -991,14 +815,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Journal of Machine Learning Research 13 (2012) 1999-2030 https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ - def __init__(self, alpha=1.0, P2='identity', + def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, - random_state=None, diag_fisher=False, + random_state=None, copy_X=True, check_input=True, verbose=0): self.alpha = alpha - self.P2 = P2 self.fit_intercept = fit_intercept self.family = family self.link = link @@ -1008,7 +831,6 @@ def __init__(self, alpha=1.0, P2='identity', self.tol = tol self.warm_start = warm_start self.random_state = random_state - self.diag_fisher = diag_fisher self.copy_X = copy_X self.check_input = check_input self.verbose = verbose @@ -1051,8 +873,8 @@ def fit(self, X, y, sample_weight=None): raise ValueError( "The family must be an instance of class" " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " - "'binomial']; got (family={0})".format(self.family)) + " ['normal', 'poisson', 'gamma', 'inverse.gaussian']" + "; got (family={0})".format(self.family)) # Guarantee that self._link_instance is set to an instance of # class Link @@ -1089,13 +911,13 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver not in ['auto', 'irls', 'lbfgs']: + if self.solver not in ['auto', 'lbfgs']: raise ValueError("GeneralizedLinearRegressor supports only solvers" - "'auto', 'irls', 'lbfgs';" + "'auto', 'lbfgs';" " got {0}".format(self.solver)) solver = self.solver if self.solver == 'auto': - solver = 'irls' + solver = 'lbfgs' if (not isinstance(self.max_iter, int) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " @@ -1108,9 +930,6 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) random_state = check_random_state(self.random_state) - if not isinstance(self.diag_fisher, bool): - raise ValueError("The argument diag_fisher must be bool;" - " got {0}".format(self.diag_fisher)) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) @@ -1133,95 +952,12 @@ def fit(self, X, y, sample_weight=None): n_samples, n_features = X.shape - # 1.3 arguments to take special care ################################## - # P2 - - # If X is sparse, make P2 sparse, too. 
- if isinstance(self.P2, str) and self.P2 == 'identity': - if sparse.issparse(X): - P2 = (sparse.dia_matrix((np.ones(n_features), 0), - shape=(n_features, n_features))).tocsc() - else: - P2 = np.ones(n_features) - else: - P2 = check_array(self.P2, copy=True, - accept_sparse=_stype, - dtype=_dtype, ensure_2d=False) - if P2.ndim == 1: - P2 = np.asarray(P2) - if P2.shape[0] != n_features: - raise ValueError("P2 should be a 1d array of shape " - "(n_features,) with " - "n_features=X.shape[1]; " - "got (P2.shape=({0},)), needed ({1},)" - .format(P2.shape[0], X.shape[1])) - if sparse.issparse(X): - P2 = (sparse.dia_matrix((P2, 0), - shape=(n_features, n_features))).tocsc() - elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and - P2.shape[0] == X.shape[1]): - if sparse.issparse(X): - P2 = (sparse.dia_matrix((P2, 0), - shape=(n_features, n_features))).tocsc() - else: - raise ValueError("P2 must be either None or an array of shape " - "(n_features, n_features) with " - "n_features=X.shape[1]; " - "got (P2.shape=({0}, {1})), needed ({2}, {2})" - .format(P2.shape[0], P2.shape[1], X.shape[1])) - - l2 = self.alpha - # P2 is now for sure a copy - P2 = l2 * P2 - # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') - # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric - if P2.ndim == 2: - if sparse.issparse(P2): - if sparse.isspmatrix_csc(P2): - P2 = 0.5 * (P2 + P2.transpose()).tocsc() - else: - P2 = 0.5 * (P2 + P2.transpose()).tocsr() - else: - P2 = 0.5 * (P2 + P2.T) - - # For coordinate descent, if X is sparse, P2 must also be csc - if solver == 'cd' and sparse.issparse(X): - P2 = sparse.csc_matrix(P2) - # 1.4 additional validations ########################################## if self.check_input: if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" .format(family.__class__.__name__)) - # check if P2 is positive semidefinite - # np.linalg.cholesky(P2) 'only' asserts positive definite - if not isinstance(self.P2, str): # self.P2 != 'identity' - # due to numerical precision, we allow eigenvalues to be a - # tiny bit negative - epsneg = -10 * np.finfo(P2.dtype).epsneg - if P2.ndim == 1 or P2.shape[0] == 1: - p2 = P2 - if sparse.issparse(P2): - p2 = P2.toarray() - if not np.all(p2 >= 0): - raise ValueError("1d array P2 must not have negative " - "values.") - elif sparse.issparse(P2): - # for sparse matrices, not all eigenvals can be computed - # efficiently, use only half of n_features - # k = how many eigenvals to compute - k = np.min([10, n_features // 10 + 1]) - sigma = 0 # start searching near this value - which = 'SA' # find smallest algebraic eigenvalues first - eigenvalues = splinalg.eigsh(P2, k=k, sigma=sigma, - which=which, - return_eigenvectors=False) - if not np.all(eigenvalues >= epsneg): - raise ValueError("P2 must be positive semi-definite.") - else: - if not np.all(linalg.eigvalsh(P2) >= epsneg): - raise ValueError("P2 must be positive semi-definite.") # TODO: if alpha=0 check that X is not rank deficient # TODO: what else to check? @@ -1229,10 +965,10 @@ def fit(self, X, y, sample_weight=None): # 2. 
rescaling of weights (sample_weight) # ####################################################################### # IMPORTANT NOTE: Since we want to minimize - # 1/(2*sum(sample_weight)) * deviance + L1 + L2, + # 1/(2*sum(sample_weight)) * deviance + L2, # deviance = sum(sample_weight * unit_deviance), # we rescale weights such that sum(weights) = 1 and this becomes - # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + # 1/2*deviance + L2 with deviance=sum(weights * unit_deviance) weights_sum = np.sum(weights) weights = weights/weights_sum @@ -1260,33 +996,21 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # algorithms for optimization - # 4.1 IRLS ############################################################ - # Note: we already set P2 = l2*P2, see above - # Note: we already symmetrized P2 = 1/2 (P2 + P2') - if solver == 'irls': - coef, self.n_iter_ = \ - _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2, - fit_intercept=self.fit_intercept, family=family, - link=link, max_iter=self.max_iter, tol=self.tol) - - # 4.2 L-BFGS ########################################################## - elif solver == 'lbfgs': - def func(coef, X, y, weights, P2, family, link): + # 4.1 L-BFGS ########################################################## + if solver == 'lbfgs': + def func(coef, X, y, weights, alpha, family, link): mu, devp = \ family._mu_deviance_derivative(coef, X, y, weights, link) dev = family.deviance(y, mu, weights) intercept = (coef.size == X.shape[1] + 1) idx = 1 if intercept else 0 # offset if coef[0] is intercept - if P2.ndim == 1: - L2 = P2 * coef[idx:] - else: - L2 = P2 @ coef[idx:] + L2 = alpha * coef[idx:] obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2) objp = 0.5 * devp objp[idx:] += L2 return obj, objp - args = (X, y, weights, P2, family, link) + args = (X, y, weights, self.alpha, family, link) # TODO: refactor this once # https://github.com/scikit-learn/scikit-learn/pull/14250 # is merged. @@ -1492,13 +1216,9 @@ class PoissonRegressor(GeneralizedLinearRegressor): the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'irls', 'lbfgs'}, optional (default='irls') + solver : {'lbfgs'}, optional (default='lbfgs') Algorithm to use in the optimization problem: - 'irls' - Iterated reweighted least squares. It is the standard algorithm - for GLMs. - 'lbfgs' Calls scipy's L-BFGS-B optimizer. @@ -1506,7 +1226,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): The maximal number of iterations for solver algorithms. tol : float, optional (default=1e-4) - Stopping criterion. For the irls, and lbfgs solvers, + Stopping criterion. For the lbfgs solver, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of the objective function. 
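# A minimal, self-contained sketch of the objective/gradient pair that the
# lbfgs solver minimizes, written out for the simplest case: Normal family,
# identity link, sample weights rescaled to sum to one, and an L2 penalty
# 0.5 * alpha * ||w||_2^2 that leaves the intercept unpenalized.
# All names below are illustrative only.
import numpy as np
import scipy.optimize


def obj_and_grad(coef, X, y, weights, alpha):
    # coef[0] is the intercept, coef[1:] are the feature coefficients
    mu = coef[0] + X @ coef[1:]
    dev = np.sum(weights * (y - mu) ** 2)        # deviance of the Normal family
    obj = 0.5 * dev + 0.5 * alpha * (coef[1:] @ coef[1:])
    devp = -2 * weights * (y - mu)               # d(deviance)/d(linear predictor)
    grad = 0.5 * np.concatenate(([devp.sum()], devp @ X))
    grad[1:] += alpha * coef[1:]
    return obj, grad


rng = np.random.RandomState(0)
X = rng.rand(20, 3)
y = X @ np.array([1., 2., 3.]) + 0.5
weights = np.full(20, 1 / 20)                    # sum(weights) == 1
res = scipy.optimize.minimize(
    obj_and_grad, np.zeros(X.shape[1] + 1), args=(X, y, weights, 0.1),
    method="L-BFGS-B", jac=True,
    options={"maxiter": 100, "gtol": 1e-6})
# res.x[0] is the fitted intercept, res.x[1:] the coefficients; res.jac holds
# the gradient g whose max{|g_i|} the tol stopping criterion refers to.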
@@ -1575,8 +1295,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ def __init__(self, alpha=1.0, fit_intercept=True, fit_dispersion=None, - solver='irls', max_iter=100, - tol=1e-4, warm_start=False, + solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, random_state=None, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index b9716388fb36c..877b37993f587 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -25,7 +25,7 @@ from sklearn.utils.testing import assert_array_equal -GLM_SOLVERS = ['irls', 'lbfgs'] +GLM_SOLVERS = ['lbfgs'] @pytest.fixture(scope="module") @@ -193,39 +193,6 @@ def test_glm_alpha_argument(alpha): glm.fit(X, y) -@pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], - sparse.csr_matrix([1, 2, 3]), [-1]]) -def test_glm_P2_argument(P2): - """Test GLM for invalid P2 argument.""" - y = np.array([1, 2]) - X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(P2=P2, check_input=True) - with pytest.raises(ValueError): - glm.fit(X, y) - - -def test_glm_P2_positive_semidefinite(): - """Test GLM for a positive semi-definite P2 argument.""" - n_samples, n_features = 10, 5 - y = np.arange(n_samples) - X = np.zeros((n_samples, n_features)) - P2 = np.diag([100, 10, 5, 0, -1E-5]) - rng = np.random.RandomState(42) - # construct random orthogonal matrix Q - Q, R = linalg.qr(rng.randn(n_features, n_features)) - P2 = Q.T @ P2 @ Q - glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, - check_input=True) - with pytest.raises(ValueError, match="P2 must be positive semi-definite"): - glm.fit(X, y) - - P2 = sparse.csr_matrix(P2) - glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, - check_input=True) - with pytest.raises(ValueError, match="P2 must be positive semi-definite"): - glm.fit(X, y) - - @pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) def test_glm_fit_intercept_argument(fit_intercept): """Test GLM for invalid fit_intercept argument.""" @@ -287,16 +254,6 @@ def test_glm_random_state_argument(random_state): glm.fit(X, y) -@pytest.mark.parametrize('diag_fisher', ['not bool', 1, 0, [True]]) -def test_glm_diag_fisher_argument(diag_fisher): - """Test GLM for invalid diag_fisher arguments.""" - y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) - with pytest.raises(ValueError, match="diag_fisher must be bool"): - glm.fit(X, y) - - @pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) def test_glm_copy_X_argument(copy_X): """Test GLM for invalid copy_X arguments.""" @@ -336,9 +293,7 @@ def test_glm_identity_regression(solver): GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), ]) -@pytest.mark.parametrize('solver, tol', [('irls', 1e-6), - ('lbfgs', 1e-6), -]) +@pytest.mark.parametrize('solver, tol', [('lbfgs', 1e-6)]) def test_glm_log_regression(family, solver, tol): """Test GLM regression with log link on a simple dataset.""" coef = [0.2, -0.1] @@ -391,10 +346,7 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) -@pytest.mark.parametrize('solver, tol', - [('irls', 1e-7), - ('lbfgs', 1e-7), -]) +@pytest.mark.parametrize('solver, tol', [('lbfgs', 
1e-7)]) def test_poisson_ridge(solver, tol): """Test ridge regression with poisson family and LogLink. @@ -426,9 +378,6 @@ def test_poisson_ridge(solver, tol): @pytest.mark.parametrize( "params", [ - {"solver": "irls" }, - {"solver": "irls" }, - {"solver": "lbfgs" }, {"solver": "lbfgs"}, ], ids=lambda params: ', '.join("%s=%s" % (key, val) From 1e4b5380b7ad699e5d059ef6d9836c37a7ba16e2 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 11:23:31 -0500 Subject: [PATCH 073/209] Remove plot_poisson_spline_regression.py example --- .../plot_poisson_spline_regression.py | 85 ------------------- 1 file changed, 85 deletions(-) delete mode 100644 examples/linear_model/plot_poisson_spline_regression.py diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py deleted file mode 100644 index 30b5881bba1f5..0000000000000 --- a/examples/linear_model/plot_poisson_spline_regression.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -================================= -Poisson Regression with B-Splines -================================= - -As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` -example, a Poisson regression with penalized B-splines (P-splines) [1]_ is -fitted on slightly different sinusoidal, Poisson distributed data and -compared to an AdaBoost model with decision trees. -One can see, that this is a hard problem for both estimators. - -.. [1] Eilers, Paul H. C.; Marx, Brian D. "Flexible smoothing with B -splines - and penalties". Statist. Sci. 11 (1996), no. 2, 89--121. - `doi:10.1214/ss/1038425655 - `_ - -""" -print(__doc__) - -# Author: Christian Lorentzen -# based on the AdaBoost regression example from Noel Dawe -# License: BSD 3 clause - -# importing necessary libraries -import numpy as np -from scipy.linalg import toeplitz -# from scipy.interpolate import BSpline -from scipy.interpolate import splev -import matplotlib.pyplot as plt -from sklearn.tree import DecisionTreeRegressor -from sklearn.ensemble import AdaBoostRegressor -from sklearn.linear_model import GeneralizedLinearRegressor - - -# Create the dataset -xmin, xmax = 0, 6 -rng = np.random.RandomState(1) -X = np.linspace(xmin, xmax, 500)[:, np.newaxis] -y_true = 0.5 * (2.1 + np.sin(X).ravel() + np.sin(6 * X).ravel()) -y = rng.poisson(y_true, X.shape[0]) - -# b-spline basis -nknots, degree = 40, 3 -ns = nknots - degree - 1 # number of base spline functions -dx = (xmax - xmin) / (nknots - 1 - 2 * degree) -knots = np.linspace(xmin - degree * dx, 6 + degree * dx, nknots) -coef = np.zeros(ns) -splineBasis = np.empty((X.shape[0], ns), dtype=float) -for i in range(ns): - coef[i] = 1 -# splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ -# .ravel() - splineBasis[:, i] = splev(X, (knots, coef, degree)).ravel() - coef[i] = 0 - -# second order difference matrix -P2 = toeplitz([2, -1] + [0] * (ns - 2)).astype(float) -P2[0, 0] = P2[-1, -1] = 1 - -# Fit regression model -regr_1 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), - n_estimators=10, random_state=rng) - -regr_2 = GeneralizedLinearRegressor(family='poisson', link='log', - fit_intercept=True, alpha=0.02, - l1_ratio=0.1, P2=P2) - -regr_1.fit(X, y) -regr_2.fit(splineBasis, y) - -# Predict -y_1 = regr_1.predict(X) -y_2 = regr_2.predict(splineBasis) - -# Plot the results -plt.figure() -plt.plot(X, y_true, c="b", label="true mean") -plt.scatter(X, y, c="k", marker='.', label="training samples") -plt.plot(X, y_1, c="g", label="AdaBoost n_estimator=10", linewidth=2) 
-plt.plot(X, y_2, c="r", label="Poisson GLM with B-splines", linewidth=2) -plt.xlabel("data") -plt.ylabel("target") -plt.title("Regression Comparison") -plt.legend() -plt.show() From 3265148aa4df59bd1a4c8ea5eb8e2102bb650006 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 11:38:48 -0500 Subject: [PATCH 074/209] Remove random_state parameter --- sklearn/linear_model/_glm.py | 30 ++++--------------- sklearn/linear_model/tests/test_glm.py | 41 ++++++++------------------ 2 files changed, 17 insertions(+), 54 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index b469b3c4edd17..4a1f7c260e649 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -10,15 +10,13 @@ from abc import ABCMeta, abstractmethod import numbers import numpy as np -from scipy import linalg, sparse, special -import scipy.sparse.linalg as splinalg +from scipy import sparse, special from scipy.optimize import fmin_l_bfgs_b import warnings from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y -from ..utils.validation import check_is_fitted, check_random_state - +from ..utils.validation import check_is_fitted def _check_weights(sample_weight, n_samples): @@ -680,7 +678,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * |w|_2 - with inverse link function h and s=sample_weight. + with inverse link function h and s=sample_weight. The parameter ``alpha`` corresponds to the lambda parameter in glmnet. Read more in the :ref:`User Guide `. @@ -742,18 +740,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Stopping criterion. For the lbfgs solver, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of - the objective function. + the objective function. warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` as initialization for ``coef_`` and ``intercept_``. - random_state : {int, RandomState instance, None}, optional (default=None) - If int, random_state is the seed used by the random - number generator; if RandomState instance, random_state is the random - number generator; if None, the random number generator is the - RandomState instance used by `np.random`. - copy_X : boolean, optional, (default=True) If ``True``, X will be copied; else, it may be overwritten. 
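# A minimal sketch: with family='normal' and link='identity', the objective
# above is ridge regression up to the 1/(2*sum(s)) rescaling of the data term,
# so with unit sample weights ``alpha`` here corresponds to
# ``Ridge(alpha=n_samples * alpha)``.  Tolerances below are illustrative.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import GeneralizedLinearRegressor, Ridge

X, y = make_regression(n_samples=100, n_features=5, random_state=0)
glm = GeneralizedLinearRegressor(alpha=1.0, family='normal',
                                 link='identity', tol=1e-8, max_iter=1000)
ridge = Ridge(alpha=1.0 * X.shape[0], solver='svd')
glm.fit(X, y)
ridge.fit(X, y)
print(np.max(np.abs(glm.coef_ - ridge.coef_)))   # expected to be small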
@@ -819,7 +811,6 @@ def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, - random_state=None, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.fit_intercept = fit_intercept @@ -830,7 +821,6 @@ def __init__(self, alpha=1.0, self.max_iter = max_iter self.tol = tol self.warm_start = warm_start - self.random_state = random_state self.copy_X = copy_X self.check_input = check_input self.verbose = verbose @@ -929,7 +919,6 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.warm_start, bool): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) - random_state = check_random_state(self.random_state) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) @@ -959,7 +948,6 @@ def fit(self, X, y, sample_weight=None): "range for family {0}" .format(family.__class__.__name__)) # TODO: if alpha=0 check that X is not rank deficient - # TODO: what else to check? ####################################################################### # 2. rescaling of weights (sample_weight) # @@ -1027,7 +1015,6 @@ def func(coef, X, y, weights, alpha, family, link): .format(info["task"])) self.n_iter_ = info['nit'] - ####################################################################### # 5. postprocessing # ####################################################################### @@ -1235,12 +1222,6 @@ class PoissonRegressor(GeneralizedLinearRegressor): If set to ``True``, reuse the solution of the previous call to ``fit`` as initialization for ``coef_`` and ``intercept_`` . - random_state : {int, RandomState instance, None}, optional (default=None) - If int, random_state is the seed used by the random - number generator; if RandomState instance, random_state is the random - number generator; if None, the random number generator is the - RandomState instance used by `np.random`. - copy_X : boolean, optional, (default=True) If ``True``, X will be copied; else, it may be overwritten. 
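# A minimal sketch: PoissonRegressor is a convenience class that fixes
# family='poisson' and link='log', so the two fits below are expected to
# coincide.  The toy data are for illustration only.
import numpy as np
from sklearn.linear_model import GeneralizedLinearRegressor, PoissonRegressor

X = np.array([[-2., -1., 1., 2.], [0., 0., 1., 1.]]).T
y = np.array([0, 1, 1, 2])

pois = PoissonRegressor(alpha=1.0, tol=1e-8)
pois.fit(X, y)
glm = GeneralizedLinearRegressor(alpha=1.0, family='poisson', link='log',
                                 tol=1e-8)
glm.fit(X, y)
print(pois.intercept_, glm.intercept_)   # expected to agree
print(pois.coef_, glm.coef_)             # expected to agree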
@@ -1296,11 +1277,10 @@ class PoissonRegressor(GeneralizedLinearRegressor): """ def __init__(self, alpha=1.0, fit_intercept=True, fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, - random_state=None, copy_X=True, check_input=True, verbose=0): + copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="poisson", link='log', fit_dispersion=fit_dispersion, solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, - random_state=random_state, copy_X=copy_X, verbose=verbose) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 877b37993f587..a921c7e065878 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -5,10 +5,8 @@ import numpy as np from numpy.testing import assert_allclose import pytest -import scipy as sp -from scipy import linalg, optimize, sparse -from sklearn.datasets import make_classification, make_regression +from sklearn.datasets import make_regression from sklearn.linear_model import GeneralizedLinearRegressor from sklearn.linear_model._glm import ( Link, @@ -19,7 +17,7 @@ NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, ) -from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge +from sklearn.linear_model import Ridge from sklearn.metrics import mean_absolute_error from sklearn.exceptions import ConvergenceWarning @@ -101,8 +99,7 @@ def test_tweedie_distribution_power(): (TweedieDistribution(power=-1), [0.1, 1.5]), (TweedieDistribution(power=1.5), [0.1, 1.5]), (TweedieDistribution(power=2.5), [0.1, 1.5]), - (TweedieDistribution(power=-4), [0.1, 1.5]), -]) + (TweedieDistribution(power=-4), [0.1, 1.5])]) def test_deviance_zero(family, chk_values): """Test deviance(y,y) = 0 for different families.""" for x in chk_values: @@ -151,8 +148,7 @@ def test_sample_weights_validation(): [('normal', NormalDistribution()), ('poisson', PoissonDistribution()), ('gamma', GammaDistribution()), - ('inverse.gaussian', InverseGaussianDistribution()), -]) + ('inverse.gaussian', InverseGaussianDistribution())]) def test_glm_family_argument(f, fam): """Test GLM family argument set as string.""" y = np.array([0.1, 0.5]) # in range of all distributions @@ -244,16 +240,6 @@ def test_glm_warm_start_argument(warm_start): glm.fit(X, y) -@pytest.mark.parametrize('random_state', ['a string', 0.5, [0]]) -def test_glm_random_state_argument(random_state): - """Test GLM for invalid random_state argument.""" - y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(random_state=random_state) - with pytest.raises(ValueError, match="cannot be used to seed"): - glm.fit(X, y) - - @pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) def test_glm_copy_X_argument(copy_X): """Test GLM for invalid copy_X arguments.""" @@ -291,8 +277,7 @@ def test_glm_identity_regression(solver): 'family', [NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), -]) + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)]) @pytest.mark.parametrize('solver, tol', [('lbfgs', 1e-6)]) def test_glm_log_regression(family, solver, tol): """Test GLM regression with log link on a simple dataset.""" @@ -338,7 +323,7 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): glm = GeneralizedLinearRegressor(alpha=1.0, family='normal', 
link='identity', fit_intercept=True, max_iter=300, solver=solver, tol=1e-6, - check_input=False, random_state=42) + check_input=False) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, rtol=5e-6) @@ -369,7 +354,7 @@ def test_poisson_ridge(solver, tol): fit_intercept=True, family='poisson', link='log', tol=1e-7, solver=solver, max_iter=300, - random_state=rng) + ) glm.fit(X, y) assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5) assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5) @@ -385,11 +370,10 @@ def test_poisson_ridge(solver, tol): ) def test_solver_equivalence(params, regression_data): X, y = regression_data - est_ref = GeneralizedLinearRegressor(random_state=2) + est_ref = GeneralizedLinearRegressor() est_ref.fit(X, y) estimator = GeneralizedLinearRegressor(**params) - estimator.set_params(random_state=2) estimator.fit(X, y) @@ -405,16 +389,15 @@ def test_solver_equivalence(params, regression_data): def test_fit_dispersion(regression_data): X, y = regression_data - est1 = GeneralizedLinearRegressor(random_state=2) + est1 = GeneralizedLinearRegressor() est1.fit(X, y) assert not hasattr(est1, "dispersion_") - est2 = GeneralizedLinearRegressor(random_state=2, fit_dispersion="chisqr") + est2 = GeneralizedLinearRegressor(fit_dispersion="chisqr") est2.fit(X, y) assert isinstance(est2.dispersion_, float) - est3 = GeneralizedLinearRegressor( - random_state=2, fit_dispersion="deviance") + est3 = GeneralizedLinearRegressor(fit_dispersion="deviance") est3.fit(X, y) assert isinstance(est3.dispersion_, float) @@ -425,7 +408,7 @@ def test_fit_dispersion(regression_data): def test_convergence_warning(solver, regression_data): X, y = regression_data - est = GeneralizedLinearRegressor(solver=solver, random_state=2, + est = GeneralizedLinearRegressor(solver=solver, max_iter=1, tol=1e-20) with pytest.warns(ConvergenceWarning): est.fit(X, y) From 1862ab6811db3a6a4ad54719fcd048dc151d0a37 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 11:55:13 -0500 Subject: [PATCH 075/209] Lint --- sklearn/linear_model/tests/test_glm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index a921c7e065878..1712f7b5e1d3d 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -349,7 +349,6 @@ def test_poisson_ridge(solver, tol): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - rng = np.random.RandomState(42) glm = GeneralizedLinearRegressor(alpha=1, fit_intercept=True, family='poisson', link='log', tol=1e-7, From 4154074a7367be310976a8a1bba00ed737cf9e3d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 10 Jul 2019 14:16:05 +0200 Subject: [PATCH 076/209] Fix docstring --- doc/modules/linear_model.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index eba51315d2ae8..681a13cdf9d42 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -940,9 +940,9 @@ follows: >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.coef_ - array([0.24630169, 0.43373464]) + array([0.2463..., 0.4337...]) >>> reg.intercept_ - -0.76383633... + -0.7638... .. 
topic:: Examples: From c5d77d78d78a57b242e401f0f2f3efa572e74105 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 10 Jul 2019 14:20:42 +0200 Subject: [PATCH 077/209] Remove unused core --- sklearn/linear_model/_glm.py | 68 +----------------------------------- 1 file changed, 1 insertion(+), 67 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 4a1f7c260e649..244c781ae8f48 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -10,7 +10,7 @@ from abc import ABCMeta, abstractmethod import numbers import numpy as np -from scipy import sparse, special +from scipy import special from scipy.optimize import fmin_l_bfgs_b import warnings from ..base import BaseEstimator, RegressorMixin @@ -54,42 +54,6 @@ def _safe_lin_pred(X, coef): return X @ coef -def _safe_toarray(X): - """Returns a numpy array.""" - if sparse.issparse(X): - return X.toarray() - else: - return np.asarray(X) - - -def _safe_sandwich_dot(X, d, intercept=False): - """Compute sandwich product X.T @ diag(d) @ X. - - With ``intercept=True``, X is treated as if a column of 1 were appended as - first column of X. - X can be sparse, d must be an ndarray. Always returns a ndarray.""" - if sparse.issparse(X): - temp = (X.transpose() @ X.multiply(d[:, np.newaxis])) - # for older versions of numpy and scipy, temp may be a np.matrix - temp = _safe_toarray(temp) - else: - temp = (X.T * d) @ X - if intercept: - dim = X.shape[1] + 1 - if sparse.issparse(X): - order = 'F' if sparse.isspmatrix_csc(X) else 'C' - else: - order = 'F' if X.flags['F_CONTIGUOUS'] else 'C' - res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order) - res[0, 0] = d.sum() - res[1:, 0] = d @ X - res[0, 1:] = res[1:, 0] - res[1:, 1:] = temp - else: - res = temp - return res - - class Link(metaclass=ABCMeta): """Abstract base class for Link functions.""" @@ -250,7 +214,6 @@ class ExponentialDispersionModel(metaclass=ABCMeta): starting_mu _mu_deviance_derivative - _score References ---------- @@ -484,35 +447,6 @@ def _mu_deviance_derivative(self, coef, X, y, weights, link): devp = temp @ X # sampe as X.T @ temp return mu, devp - def _score(self, coef, phi, X, y, weights, link): - r"""Compute the score function. - - The score function is the derivative of the - log-likelihood w.r.t. `coef` (:math:`w`). - It is given by - - .. math: - - \mathbf{score}(\boldsymbol{w}) - = \frac{\partial loglike}{\partial\boldsymbol{w}} - = \mathbf{X}^T \mathbf{D} - \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, - - with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and - :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. - Note: The derivative of the deviance w.r.t. coef equals -2 * score. - """ - lin_pred = _safe_lin_pred(X, coef) - mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) - d = link.inverse_derivative(lin_pred) - temp = sigma_inv * d * (y - mu) - if coef.size == X.shape[1] + 1: - score = np.concatenate(([temp.sum()], temp @ X)) - else: - score = temp @ X # sampe as X.T @ temp - return score - class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. 
From 9ab5ac2506bd33d84c96f9f848b62cb4ee5b0853 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 13 Jul 2019 15:13:14 +0200 Subject: [PATCH 078/209] Update examples/linear_model/plot_poisson_regression_non_normal_loss.py Co-Authored-By: Alexandre Gramfort --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index b06adcb787560..471c137840e82 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -1,6 +1,6 @@ """ ====================================== -Poisson regression and non normal loss +Poisson regression and non-normal loss ====================================== This example illustrate the use linear Poisson regression From e4d0be190239435e7dcd4d27cd616006909db98b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 13 Jul 2019 15:13:26 +0200 Subject: [PATCH 079/209] Update examples/linear_model/plot_poisson_regression_non_normal_loss.py Co-Authored-By: Alexandre Gramfort --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 471c137840e82..0537704b2cf1f 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -3,7 +3,7 @@ Poisson regression and non-normal loss ====================================== -This example illustrate the use linear Poisson regression +This example illustrates the use of linear Poisson regression on the French Motor Third-Party Liability Claims dataset [1] and compare it with learning models with least squared error. From 6ff4d588f5adae2aed6810ddff64733c5bc595a3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 13 Jul 2019 15:13:50 +0200 Subject: [PATCH 080/209] Update doc/modules/linear_model.rst Co-Authored-By: Alexandre Gramfort --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 681a13cdf9d42..2f750b33623aa 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -929,7 +929,7 @@ Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link :math:`h(Xw)=Xw` that guarantees the non-negativeness, e.g. the log-link with -:math:`h(Xw)=\exp(Xw)`. +:math:`h(x^\top w)=\exp(x^\top w)`. Note that the feature matrix `X` should be standardized before fitting. This ensures that the penalty treats features equally. 
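For instance, the scaling can be handled inside a pipeline so that the same
standardization is applied again at prediction time (a minimal sketch; the toy
data and the penalty strength are illustrative only)::

    import numpy as np
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import GeneralizedLinearRegressor

    X = np.array([[1., 100.], [2., 130.], [3., 70.], [4., 40.]])
    y = np.array([0.5, 1.1, 1.4, 2.3])

    model = make_pipeline(
        StandardScaler(),
        GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log'))
    model.fit(X, y)
    print(model.predict(X))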
The estimator can be used as From 13102d5a2b4538734f162a70979df874a31c7798 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 13 Jul 2019 15:13:59 +0200 Subject: [PATCH 081/209] Update doc/modules/linear_model.rst Co-Authored-By: Alexandre Gramfort --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 2f750b33623aa..49662064827f3 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -928,7 +928,7 @@ are the following: Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link -:math:`h(Xw)=Xw` that guarantees the non-negativeness, e.g. the log-link with +:math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the log-link with :math:`h(x^\top w)=\exp(x^\top w)`. Note that the feature matrix `X` should be standardized before fitting. This From af89e5281b880b19346db625838e1623275b438a Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 13 Jul 2019 15:14:12 +0200 Subject: [PATCH 082/209] Update doc/modules/linear_model.rst Co-Authored-By: Alexandre Gramfort --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 49662064827f3..c8c103288c5ab 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -898,7 +898,7 @@ ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as -.. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). +.. math:: \hat{y}(w, x) = h(x^\top w) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an exponential dispersion model (EDM) [11]_. The objective function being minimized From ddc4b717bcece9c8d0347d46b9f4a89b681aec96 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 13 Jul 2019 15:27:22 +0200 Subject: [PATCH 083/209] Use scipy.optimize.minimize interface for LBFGS optimizer --- sklearn/linear_model/_glm.py | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 244c781ae8f48..939249e42e4f4 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -6,16 +6,16 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -from __future__ import division from abc import ABCMeta, abstractmethod import numbers + import numpy as np from scipy import special -from scipy.optimize import fmin_l_bfgs_b -import warnings +import scipy.optimize + from ..base import BaseEstimator, RegressorMixin -from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y +from ..utils.optimize import _check_optimize_result from ..utils.validation import check_is_fitted @@ -933,21 +933,18 @@ def func(coef, X, y, weights, alpha, family, link): return obj, objp args = (X, y, weights, self.alpha, family, link) - # TODO: refactor this once - # https://github.com/scikit-learn/scikit-learn/pull/14250 - # is merged. 
- coef, loss, info = fmin_l_bfgs_b( - func, coef, fprime=None, args=args, - iprint=(self.verbose > 0) - 1, pgtol=self.tol, - maxiter=self.max_iter, factr=1e3) - if info["warnflag"] == 1: - warnings.warn("lbfgs failed to converge." - " Increase the number of iterations.", - ConvergenceWarning) - elif info["warnflag"] == 2: - warnings.warn("lbfgs failed for the reason: {0}" - .format(info["task"])) - self.n_iter_ = info['nit'] + + opt_res = scipy.optimize.minimize( + func, coef, method="L-BFGS-B", jac=True, + options={ + "maxiter": self.max_iter, + "iprint": (self.verbose > 0) - 1, + "gtol": self.tol, + "ftol": 1e3*np.finfo(float).eps, + }, + args=args) + self.n_iter_ = _check_optimize_result("lbfgs", opt_res) + coef = opt_res.x ####################################################################### # 5. postprocessing # From 426ae1d711c27d14a7dd7d22763fee5d9d20c1f5 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 14 Jul 2019 18:20:09 +0200 Subject: [PATCH 084/209] EXA wording and score in plot_tweedie_regression_insurance_claims.html --- ...lot_tweedie_regression_insurance_claims.py | 137 ++++++++++-------- 1 file changed, 78 insertions(+), 59 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 063d12e6e291b..00111b811f923 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -3,7 +3,7 @@ Tweedie regression on insurance claims ====================================== -This example illustrate the use Poisson, Gamma and Tweedie regression +This example illustrates the use of Poisson, Gamma and Tweedie regression on the French Motor Third-Party Liability Claims dataset, and is inspired by an R tutorial [1]. @@ -13,9 +13,10 @@ which are: 1. Model the number of claims with a Poisson distribution, the average - claim amount as a Gamma distribution and multiply the predictions, to get - the total claim amount. -2. Model total claim amount directly, typically with a Tweedie distribution. + claim amount as a Gamma distribution and multiply the predictions of both in + order to get the total claim amount. +2. Model total claim amount directly, typically with a Tweedie distribution of + Tweedie power :math:`p \\in (1, 2)`. In this example we will illustrate both approaches. We start by defining a few helper functions for loading the data and visualizing results. @@ -49,7 +50,7 @@ def load_mtpl2(n_samples=100000): - """Fetcher for French Motor Third-Party Liability Claims dataset + """Fetch the French Motor Third-Party Liability Claims dataset. Parameters ---------- @@ -81,24 +82,27 @@ def load_mtpl2(n_samples=100000): return df.iloc[:n_samples] -def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, +def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, title=None, kind_weight=None, ax=None): """Plot observed and predicted - aggregated per feature level. 
Parameters ---------- - df : DataFrame with at least one column named feature + df : DataFrame with at least three columns named feature, weight and + observed + feature: str + a column name of df for the feature to be plotted + weight : str + column name of df with the values of weights or exposure observed : str - a column name of the observed target + a column name of df with the observed target predicted : frame a dataframe, with the same index as df, with the predicted target - weight : str - column name with the values of weights/exposure """ # aggregate observed and predicted variables by feature level df_ = df.loc[:, [feature, weight]].copy() df_["observed"] = df[observed] * df[weight] - df_["predicted"] = y_predicted * df[weight] + df_["predicted"] = predicted * df[weight] df_ = ( df_.groupby([feature])[weight, "observed", "predicted"] .sum() @@ -126,9 +130,10 @@ def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, # 1. Loading datasets and pre-processing # -------------------------------------- # -# We construct the freMTPL2 dataset by joining the freMTPL2freq table, -# containing the number of claims (``ClaimNb``) with the freMTPL2sev table -# containing the claim amount (``ClaimAmount``) for the same user ids. +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# containing the number of claims (``ClaimNb``), with the freMTPL2sev table, +# containing the claim amount (``ClaimAmount``) for the same policy ids +# (``IDpol``). df = load_mtpl2(n_samples=100000) @@ -206,14 +211,14 @@ def score_estimator( y, _weights = df[target], df[weights] for score_label, metric in [ - ("D² explaned", None), + ("D² explained", None), ("mean deviance", partial(mean_deviance, estimator)), ("mean abs. error", mean_absolute_error), ]: if estimator.__class__.__name__ == "ClaimProdEstimator": - # ClaimProdEstimator is the product of the frequency and - # severity models, together with a denormalized by the exposure - # values. It does not fully follow the scikit-learn API and we + # ClaimProdEstimator is the product of frequency and severity + # models, denormalized by the exposure values. + # It does not fully follow the scikit-learn API and we # must handle it separately. y_pred = estimator.predict(X, exposure=df.Exposure.values) else: @@ -253,50 +258,50 @@ def score_estimator( # # We can visually compare observed and predicted values, aggregated by # the drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance -# bonus/penalty (``BonusMalus``), +# bonus/malus (``BonusMalus``). 
fig, ax = plt.subplots(2, 2, figsize=(16, 8)) fig.subplots_adjust(hspace=0.3, wspace=0.2) plot_obs_pred( - df_train, - "DrivAge", - "Frequency", - glm_freq.predict(X_train), + df=df_train, + feature="DrivAge", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_train), y_label="Claim Frequency", title="train data", ax=ax[0, 0], ) plot_obs_pred( - df_test, - "DrivAge", - "Frequency", - glm_freq.predict(X_test), + df=df_test, + feature="DrivAge", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), y_label="Claim Frequency", title="test data", ax=ax[0, 1], ) plot_obs_pred( - df_test, - "VehAge", - "Frequency", - glm_freq.predict(X_test), + df=df_test, + feature="VehAge", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), y_label="Claim Frequency", title="test data", ax=ax[1, 0], ) plot_obs_pred( - df_test, - "BonusMalus", - "Frequency", - glm_freq.predict(X_test), + df=df_test, + feature="BonusMalus", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), y_label="Claim Frequency", title="test data", ax=ax[1, 1], @@ -308,12 +313,13 @@ def score_estimator( # 3. Severity model - Gamma Distribution # --------------------------------------- # The mean claim amount or severity (`AvgClaimAmount`) can be empirically -# shown to follow a Gamma distribution. We fit a GLM model for the severity -# with the same features as the frequency model. +# shown to follow approximately a Gamma distribution. We fit a GLM model for +# the severity with the same features as the frequency model. # # Note: -# - We filter out ``ClaimAmount == 0``` as the Gamma distribution as support -# on :math:`(0, \infty)` not :math:`[0, \infty)`. +# +# - We filter out ``ClaimAmount == 0`` as the Gamma distribution has support +# on :math:`(0, \infty)`, not :math:`[0, \infty)`. # - We use ``ClaimNb`` as sample weights. mask_train = df_train["ClaimAmount"] > 0 @@ -341,19 +347,20 @@ def score_estimator( ############################################################################## # -# Note that the resulting model is conditional on having at least one claim, -# and cannot be used to predict the average claim amount in general, +# Note that the resulting model is the average claim amount per claim. As such, +# it is conditional on having at least one claim, and cannot be used to predict +# the average claim amount per policy in general. print( - "Mean AvgClaim Amount: %.2f " + "Mean AvgClaim Amount per policy: %.2f " % df_train.AvgClaimAmount.mean() ) print( - "Mean AvgClaim Amount | NbClaim > 0: %.2f" + "Mean AvgClaim Amount | NbClaim > 0: %.2f" % df_train.AvgClaimAmount[df_train.AvgClaimAmount > 0].mean() ) print( - "Predicted Mean AvgClaim Amount: %.2f" + "Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f" % glm_sev.predict(X_train).mean() ) @@ -361,28 +368,28 @@ def score_estimator( ############################################################################## # # We can visually compare observed and predicted values, aggregated for -# the drivers age (``Driv Age``), +# the drivers age (``DrivAge``). 
fig, ax = plt.subplots(1, 2, figsize=(16, 4)) # plot DivAge plot_obs_pred( - df_train.loc[mask_train], - "DrivAge", - "AvgClaimAmount", - glm_sev.predict(X_train[mask_train.values]), + df=df_train.loc[mask_train], + feature="DrivAge", weight="Exposure", + observed="AvgClaimAmount", + predicted=glm_sev.predict(X_train[mask_train.values]), y_label="Average Claim Severity", title="train data", ax=ax[0], ) plot_obs_pred( - df_test.loc[mask_test], - "DrivAge", - "AvgClaimAmount", - glm_sev.predict(X_test[mask_test.values]), + df=df_test.loc[mask_test], + feature="DrivAge", weight="Exposure", + observed="AvgClaimAmount", + predicted=glm_sev.predict(X_test[mask_test.values]), y_label="Average Claim Severity", title="test data", ax=ax[1], @@ -391,31 +398,41 @@ def score_estimator( ############################################################################## # -# 3. Total Claims Amount -- Compound Poisson distribution +# 4. Total Claims Amount -- Compound Poisson distribution # ------------------------------------------------------- # # As mentionned in the introduction, the total claim amount can be modeled -# either as the product of the frequency model by the severity model. +# either as the product of the frequency model by the severity model, class ClaimProdEstimator: - """Total claim amount estimator + """Total claim amount estimator. Computed as the product of the frequency model by the serverity model, - denormalized by exposure. + denormalized by exposure. Use Tweedie deviance with `p=1.5`. """ def __init__(self, est_freq, est_sev): self.est_freq = est_freq self.est_sev = est_sev + self._family_instance = TweedieDistribution(power=1.5) def predict(self, X, exposure): - """Predict the total claim amount + """Predict the total claim amount. The predict method is not compatible with the scikit-learn API. """ return exposure * self.est_freq.predict(X) * self.est_sev.predict(X) + def score(self, X, y, sample_weight=None): + """Compute D², the percentage of deviance explained.""" + mu = self.predict(X, exposure=sample_weight) + dev = self._family_instance.deviance(y, mu, weights=sample_weight) + y_mean = np.average(y, weights=sample_weight) + dev_null = self._family_instance.deviance(y, y_mean, + weights=sample_weight) + return 1. - dev / dev_null + est_prod = ClaimProdEstimator(glm_freq, glm_sev) @@ -476,7 +493,9 @@ def predict(self, X, exposure): # model than when using separate models for frequency and severity. # # We can additionally validate these models by comparing observed and predicted -# total claim amount over the test and train subsets. +# total claim amount over the test and train subsets. We see that in our case +# the frequency-severity model underestimates the total claim amount, whereas +# the Tweedie model overestimates. 
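# A minimal sketch of the second approach from the introduction: fit a single
# Tweedie GLM (power=1.5) on the total claim amount per unit of exposure.
# The name ``glm_total`` and the penalty strength are illustrative only, and
# ``TweedieDistribution`` is assumed to be importable here, as it already is
# for ``ClaimProdEstimator`` above.
glm_total = GeneralizedLinearRegressor(family=TweedieDistribution(power=1.5),
                                       link='log', alpha=1e-2)
glm_total.fit(X_train, df_train["ClaimAmount"] / df_train["Exposure"],
              sample_weight=df_train["Exposure"])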
res = [] for subset_label, X, df in [ From a4043847d0dfde68bb09dc0d9de3ab10fc07d41b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 15 Jul 2019 14:15:26 +0200 Subject: [PATCH 085/209] Address review comments --- .../plot_tweedie_regression_insurance_claims.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 00111b811f923..1c8dd42df336d 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -46,7 +46,7 @@ from sklearn.preprocessing import FunctionTransformer, OneHotEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer -from sklearn.metrics import mean_absolute_error +from sklearn.metrics import mean_absolute_error, mean_squared_error def load_mtpl2(n_samples=100000): @@ -83,7 +83,7 @@ def load_mtpl2(n_samples=100000): def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, - title=None, kind_weight=None, ax=None): + title=None, ax=None): """Plot observed and predicted - aggregated per feature level. Parameters @@ -141,9 +141,11 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # requires a strictly positive target values. df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 -# correct for unreasonable observations (that might be data error) +# Correct for unreasonable observations (that might be data error) +# and a few exceptionally large claim amounts df["ClaimNb"] = df["ClaimNb"].clip(upper=4) df["Exposure"] = df["Exposure"].clip(upper=1) +df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000) column_trans = ColumnTransformer( [ @@ -188,7 +190,9 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) -glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0) +# Some of the features are colinear, we use a weak penalization to avoid +# numerical issues. +glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=1e-2) glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) @@ -214,6 +218,7 @@ def score_estimator( ("D² explained", None), ("mean deviance", partial(mean_deviance, estimator)), ("mean abs. error", mean_absolute_error), + ("mean squared error", mean_squared_error), ]: if estimator.__class__.__name__ == "ClaimProdEstimator": # ClaimProdEstimator is the product of frequency and severity @@ -325,7 +330,7 @@ def score_estimator( mask_train = df_train["ClaimAmount"] > 0 mask_test = df_test["ClaimAmount"] > 0 -glm_sev = GeneralizedLinearRegressor(family="gamma", alpha=1) +glm_sev = GeneralizedLinearRegressor(family="gamma") glm_sev.fit( X_train[mask_train.values], From 65796a3d3e74aff111c187b6be20e0e9287e0797 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 16 Jul 2019 16:41:02 +0200 Subject: [PATCH 086/209] Review comments on the documentation --- doc/modules/linear_model.rst | 50 +++++++++++++++++------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index c8c103288c5ab..b6f7c2b82c1f5 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -893,27 +893,23 @@ to warm-starting (see :term:`Glossary `). 
Generalized Linear Regression ============================= -:class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two -ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear +:class:`GeneralizedLinearRegressor` generalizes linear models in two ways +[10]_. First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as .. math:: \hat{y}(w, x) = h(x^\top w) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [11]_. The objective function being minimized -becomes +exponential dispersion model (EDM) [11]_. The objective function being +minimized becomes -.. math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1 - +\frac{\alpha(1-\rho)}{2} w^T P_2 w +.. math:: \frac{1}{2 \sum s_i}D(y, \hat{y}; s) +\frac{\alpha}{2} ||w||_2 -with sample weights :math:`s`. -:math:`P_1` (diagonal matrix) can be used to exclude some of the coefficients in -the L1 penalty, the matrix :math:`P_2` (must be positive semi-definite) allows -for a more versatile L2 penalty. +with sample weights :math:`s`, and L2 regularization penalty :math:`\alpha`. -Use cases, where a loss different from the squared loss might be appropriate, -are the following: +In the following use cases, a loss different from the squared loss might be +appropriate, * If the target values :math:`y` are counts (non-negative integer valued) or frequencies (non-negative), you might use a Poisson deviance with log-link. @@ -928,10 +924,10 @@ are the following: Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link -:math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the log-link with -:math:`h(x^\top w)=\exp(x^\top w)`. +:math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the +log-link with :math:`h(x^\top w)=\exp(x^\top w)`. -Note that the feature matrix `X` should be standardized before fitting. This +Note that the feature matrix ``X`` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as follows: @@ -947,7 +943,8 @@ follows: .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_spline_regression.py` + * :ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py` + * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_regression_non_normal_loss.py` Mathematical formulation ------------------------ @@ -967,12 +964,9 @@ Note that the first assumption implies function :math:`v(\mu)`. Specifying a particular distribution of an EDM is the same as specifying a unit variance function (they are one-to-one). -Including penalties helps to avoid overfitting or, in case of L1 penalty, to -obtain sparse solutions. But there are also other motivations to include them, -e.g. accounting for the dependence structure of :math:`y`. - -The objective function, which is independent of :math:`\phi`, is minimized with -respect to the coefficients :math:`w`. +The objective function (the penalized negative log likelihood) is +independent of :math:`\phi` and is minimized with respect to the +coefficients :math:`w`. 
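For instance, for the Poisson distribution the deviance entering this
objective is built from the unit deviance
:math:`d(y, \mu) = 2(y\log(y/\mu) - y + \mu)` (defined below), which can be
evaluated directly (a minimal numerical sketch)::

    import numpy as np
    from scipy.special import xlogy

    y = np.array([0., 1., 4.])
    mu = np.array([0.5, 1.0, 3.0])
    # y*log(y/mu) is taken as 0 for y=0
    unit_deviance = 2 * (xlogy(y, y / mu) - y + mu)
    deviance = np.sum(1.0 * unit_deviance)   # weighted sum, unit sample weights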
The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` likelihood as @@ -1005,12 +999,16 @@ Two remarks: .. topic:: References: - .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, + Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. - See also `Exponential dispersion model. `_ + .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models + and analysis of deviance. Monografias de matemática, no. 51. See also + `Exponential dispersion model. + `_ - .. [12] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ + .. [12] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. + `_ Stochastic Gradient Descent - SGD ================================= From e44afe7b6c9d2c7169ea2d61ecc58c389e67c6de Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 16 Jul 2019 18:08:30 +0200 Subject: [PATCH 087/209] Split the implementation into several files --- sklearn/linear_model/__init__.py | 3 +- sklearn/linear_model/_glm/__init__.py | 5 + sklearn/linear_model/_glm/distribution.py | 443 ++++++++++++++ sklearn/linear_model/{_glm.py => _glm/glm.py} | 576 +----------------- sklearn/linear_model/_glm/link.py | 135 ++++ sklearn/linear_model/_glm/tests/__init__.py | 1 + .../_glm/tests/test_distribution.py | 61 ++ .../linear_model/{ => _glm}/tests/test_glm.py | 78 +-- sklearn/linear_model/_glm/tests/test_link.py | 38 ++ 9 files changed, 702 insertions(+), 638 deletions(-) create mode 100644 sklearn/linear_model/_glm/__init__.py create mode 100644 sklearn/linear_model/_glm/distribution.py rename sklearn/linear_model/{_glm.py => _glm/glm.py} (62%) create mode 100644 sklearn/linear_model/_glm/link.py create mode 100644 sklearn/linear_model/_glm/tests/__init__.py create mode 100644 sklearn/linear_model/_glm/tests/test_distribution.py rename sklearn/linear_model/{ => _glm}/tests/test_glm.py (81%) create mode 100644 sklearn/linear_model/_glm/tests/test_link.py diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 121418f901a1a..1c0df55d27c90 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,8 +18,7 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from ._glm import (TweedieDistribution, - GeneralizedLinearRegressor, PoissonRegressor) +from ._glm.glm import (GeneralizedLinearRegressor, PoissonRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor diff --git a/sklearn/linear_model/_glm/__init__.py b/sklearn/linear_model/_glm/__init__.py new file mode 100644 index 0000000000000..9a88e5604de8a --- /dev/null +++ b/sklearn/linear_model/_glm/__init__.py @@ -0,0 +1,5 @@ +# License: BSD 3 clause + +from . 
import distribution + +__all__ = ['distribution'] diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py new file mode 100644 index 0000000000000..2dc720124b48b --- /dev/null +++ b/sklearn/linear_model/_glm/distribution.py @@ -0,0 +1,443 @@ +""" +Distribution functions used in GLM +""" + +# Author: Christian Lorentzen +# License: BSD 3 clause + +from abc import ABCMeta, abstractmethod +import numbers + +import numpy as np +from scipy.special import xlogy + + +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +class ExponentialDispersionModel(metaclass=ABCMeta): + r"""Base class for reproductive Exponential Dispersion Models (EDM). + + The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + + with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, + unit variance :math:`v(\mu)` and + unit deviance :math:`d(y,\mu)`. + + Attributes + ---------- + lower_bound + upper_bound + include_lower_bound + include_upper_bound + + Methods + ------- + in_y_range + unit_variance + unit_variance_derivative + variance + variance_derivative + unit_deviance + unit_deviance_derivative + deviance + deviance_derivative + starting_mu + + _mu_deviance_derivative + + References + ---------- + + https://en.wikipedia.org/wiki/Exponential_dispersion_model. + """ + @property + def lower_bound(self): + """Get the lower bound of values for Y~EDM.""" + return self._lower_bound + + @property + def upper_bound(self): + """Get the upper bound of values for Y~EDM.""" + return self._upper_bound + + @property + def include_lower_bound(self): + """Get True if lower bound for y is included: y >= lower_bound.""" + return self._include_lower_bound + + @property + def include_upper_bound(self): + """Get True if upper bound for y is included: y <= upper_bound.""" + return self._include_upper_bound + + def in_y_range(self, x): + """Returns ``True`` if x is in the valid range of Y~EDM. + + Parameters + ---------- + x : array, shape (n_samples,) + Target values. + """ + if self.include_lower_bound: + if self.include_upper_bound: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less(x, self.upper_bound)) + else: + if self.include_upper_bound: + return np.logical_and(np.greater(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater(x, self.lower_bound), + np.less(x, self.upper_bound)) + + @abstractmethod + def unit_variance(self, mu): + r"""Compute the unit variance function. + + The unit variance :math:`v(\mu)` determines the variance as + a function of the mean :math:`\mu` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. + It can also be derived from the unit deviance :math:`d(y,\mu)` as + + .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ + \partial\mu^2}}\big|_{y=\mu} + + See also :func:`variance`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + pass + + @abstractmethod + def unit_variance_derivative(self, mu): + r"""Compute the derivative of the unit variance w.r.t. mu. + + Return :math:`v'(\mu)`. 
+ + Parameters + ---------- + mu : array, shape (n_samples,) + Target values. + """ + pass + + def variance(self, mu, phi=1, weights=1): + r"""Compute the variance function. + + The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, + with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return phi/weights * self.unit_variance(mu) + + def variance_derivative(self, mu, phi=1, weights=1): + r"""Compute the derivative of the variance w.r.t. mu. + + Returns + :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] + =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` + and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return phi/weights * self.unit_variance_derivative(mu) + + @abstractmethod + def unit_deviance(self, y, mu): + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + """ + pass + + def unit_deviance_derivative(self, y, mu): + r"""Compute the derivative of the unit deviance w.r.t. mu. + + The derivative of the unit deviance is given by + :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + with unit variance :math:`v(\mu)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + """ + return -2 * (y - mu) / self.unit_variance(mu) + + def deviance(self, y, mu, weights=1): + r"""Compute the deviance. + + The deviance is a weighted sum of the per sample unit deviances, + :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` + with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. + In terms of the log-likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,\mu,\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return np.sum(weights * self.unit_deviance(y, mu)) + + def deviance_derivative(self, y, mu, weights=1): + """Compute the derivative of the deviance w.r.t. mu. + + It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return weights * self.unit_deviance_derivative(y, mu) + + def starting_mu(self, y, weights=1, ind_weight=0.5): + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. 
+
+        weights : array, shape (n_samples,) (default=1)
+            Weights or exposure to which variance is inverse proportional.
+
+        ind_weight : float (default=0.5)
+            Must be between 0 and 1. Specifies how much weight is given to the
+            individual observations instead of the mean of y.
+        """
+        return (ind_weight * y +
+                (1. - ind_weight) * np.average(y, weights=weights))
+
+    def _mu_deviance_derivative(self, coef, X, y, weights, link):
+        """Compute mu and the derivative of the deviance w.r.t coef."""
+        lin_pred = _safe_lin_pred(X, coef)
+        mu = link.inverse(lin_pred)
+        d1 = link.inverse_derivative(lin_pred)
+        temp = d1 * self.deviance_derivative(y, mu, weights)
+        if coef.size == X.shape[1] + 1:
+            devp = np.concatenate(([temp.sum()], temp @ X))
+        else:
+            devp = temp @ X  # same as X.T @ temp
+        return mu, devp
+
+
+class TweedieDistribution(ExponentialDispersionModel):
+    r"""A class for the Tweedie distribution.
+
+    A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely
+    defined by its mean-variance relationship
+    :math:`\mathrm{Var}[Y] \propto \mu^{power}`.
+
+    Special cases are:
+
+    ===== ================
+    Power Distribution
+    ===== ================
+    0     Normal
+    1     Poisson
+    (1,2) Compound Poisson
+    2     Gamma
+    3     Inverse Gaussian
+    ===== ================
+
+    Parameters
+    ----------
+    power : float (default=0)
+            The variance power of the `unit_variance`
+            :math:`v(\mu) = \mu^{power}`.
+            For ``0 < power < 1``, no distribution exists.
+    """
+    def __init__(self, power=0):
+        self.power = power
+
+    @property
+    def power(self):
+        return self._power
+
+    @power.setter
+    def power(self, power):
+        if not isinstance(power, numbers.Real):
+            raise TypeError('power must be a real number, input was {0}'
+                            .format(power))
+
+        self._upper_bound = np.Inf
+        self._include_upper_bound = False
+        if power < 0:
+            # 'Extreme stable'
+            self._lower_bound = -np.Inf
+            self._include_lower_bound = False
+        elif power == 0:
+            # NormalDistribution
+            self._lower_bound = -np.Inf
+            self._include_lower_bound = False
+        elif (power > 0) and (power < 1):
+            raise ValueError('For 0 < power < 1, no distribution exists.')
+        elif power == 1:
+            # PoissonDistribution
+            self._lower_bound = 0
+            self._include_lower_bound = True
+        elif (power > 1) and (power < 2):
+            # Compound Poisson
+            self._lower_bound = 0
+            self._include_lower_bound = True
+        elif power == 2:
+            # GammaDistribution
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        elif (power > 2) and (power < 3):
+            # Positive Stable
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        elif power == 3:
+            # InverseGaussianDistribution
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        elif power > 3:
+            # Positive Stable
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        else:  # pragma: no cover
+            # this branch should be unreachable.
+            raise ValueError
+
+        self._power = power
+
+    def unit_variance(self, mu):
+        """Compute the unit variance of a Tweedie distribution v(mu)=mu**power.
+
+        Parameters
+        ----------
+        mu : array, shape (n_samples,)
+            Predicted mean.
+        """
+        return np.power(mu, self.power)
+
+    def unit_variance_derivative(self, mu):
+        """Compute the derivative of the unit variance of a Tweedie
+        distribution v(mu)=power*mu**(power-1).
+
+        Parameters
+        ----------
+        mu : array, shape (n_samples,)
+            Predicted mean.
+ """ + return self.power * np.power(mu, self.power - 1) + + def unit_deviance(self, y, mu): + p = self.power + if p == 0: + # NormalDistribution + return (y - mu)**2 + if p == 1: + # PoissonDistribution + # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 + return 2 * (xlogy(y, y/mu) - y + mu) + elif p == 2: + # GammaDistribution + return 2 * (np.log(mu/y) + y/mu - 1) + else: + # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) + # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) + return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - + y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + + +class NormalDistribution(TweedieDistribution): + """Class for the Normal (aka Gaussian) distribution""" + def __init__(self): + super(NormalDistribution, self).__init__(power=0) + + +class PoissonDistribution(TweedieDistribution): + """Class for the scaled Poisson distribution""" + def __init__(self): + super(PoissonDistribution, self).__init__(power=1) + + +class GammaDistribution(TweedieDistribution): + """Class for the Gamma distribution""" + def __init__(self): + super(GammaDistribution, self).__init__(power=2) + + +class InverseGaussianDistribution(TweedieDistribution): + """Class for the scaled InverseGaussianDistribution distribution""" + def __init__(self): + super(InverseGaussianDistribution, self).__init__(power=3) + + +EDM_DISTRIBUTIONS = { + 'normal': NormalDistribution, + 'poisson': PoissonDistribution, + 'gamma': GammaDistribution, + 'inverse.gaussian': InverseGaussianDistribution, +} diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm/glm.py similarity index 62% rename from sklearn/linear_model/_glm.py rename to sklearn/linear_model/_glm/glm.py index 939249e42e4f4..bff8ea43fd550 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -6,17 +6,26 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -from abc import ABCMeta, abstractmethod import numbers import numpy as np -from scipy import special import scipy.optimize -from ..base import BaseEstimator, RegressorMixin -from ..utils import check_array, check_X_y -from ..utils.optimize import _check_optimize_result -from ..utils.validation import check_is_fitted +from ...base import BaseEstimator, RegressorMixin +from ...utils import check_array, check_X_y +from ...utils.optimize import _check_optimize_result +from ...utils.validation import check_is_fitted +from .distribution import ( + ExponentialDispersionModel, + TweedieDistribution, + EDM_DISTRIBUTIONS +) +from .link import ( + Link, + IdentityLink, + LogLink, + LogitLink, +) def _check_weights(sample_weight, n_samples): @@ -46,561 +55,6 @@ def _check_weights(sample_weight, n_samples): return weights -def _safe_lin_pred(X, coef): - """Compute the linear predictor taking care if intercept is present.""" - if coef.size == X.shape[1] + 1: - return X @ coef[1:] + coef[0] - else: - return X @ coef - - -class Link(metaclass=ABCMeta): - """Abstract base class for Link functions.""" - - @abstractmethod - def link(self, mu): - """Compute the link function g(mu). - - The link function links the mean mu=E[Y] to the so called linear - predictor (X*w), i.e. g(mu) = linear predictor. - - Parameters - ---------- - mu : array, shape (n_samples,) - Usually the (predicted) mean. - """ - pass - - @abstractmethod - def derivative(self, mu): - """Compute the derivative of the link g'(mu). - - Parameters - ---------- - mu : array, shape (n_samples,) - Usually the (predicted) mean. 
- """ - pass - - @abstractmethod - def inverse(self, lin_pred): - """Compute the inverse link function h(lin_pred). - - Gives the inverse relationship between linear predictor and the mean - mu=E[Y], i.e. h(linear predictor) = mu. - - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - pass - - @abstractmethod - def inverse_derivative(self, lin_pred): - """Compute the derivative of the inverse link function h'(lin_pred). - - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - pass - - @abstractmethod - def inverse_derivative2(self, lin_pred): - """Compute 2nd derivative of the inverse link function h''(lin_pred). - - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - pass - - -class IdentityLink(Link): - """The identity link function g(x)=x.""" - - def link(self, mu): - return mu - - def derivative(self, mu): - return np.ones_like(mu) - - def inverse(self, lin_pred): - return lin_pred - - def inverse_derivative(self, lin_pred): - return np.ones_like(lin_pred) - - def inverse_derivative2(self, lin_pred): - return np.zeros_like(lin_pred) - - -class LogLink(Link): - """The log link function g(x)=log(x).""" - - def link(self, mu): - return np.log(mu) - - def derivative(self, mu): - return 1./mu - - def inverse(self, lin_pred): - return np.exp(lin_pred) - - def inverse_derivative(self, lin_pred): - return np.exp(lin_pred) - - def inverse_derivative2(self, lin_pred): - return np.exp(lin_pred) - - -class LogitLink(Link): - """The logit link function g(x)=logit(x).""" - - def link(self, mu): - return special.logit(mu) - - def derivative(self, mu): - return 1. / (mu * (1 - mu)) - - def inverse(self, lin_pred): - return special.expit(lin_pred) - - def inverse_derivative(self, lin_pred): - ep = special.expit(lin_pred) - return ep * (1. - ep) - - def inverse_derivative2(self, lin_pred): - ep = special.expit(lin_pred) - return ep * (1. - ep) * (1. - 2 * ep) - - -class ExponentialDispersionModel(metaclass=ABCMeta): - r"""Base class for reproductive Exponential Dispersion Models (EDM). - - The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by - - .. math:: p(y| \theta, \phi) = c(y, \phi) - \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) - = \tilde{c}(y, \phi) - \exp\left(-\frac{d(y, \mu)}{2\phi}\right) - - with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, - variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, - unit variance :math:`v(\mu)` and - unit deviance :math:`d(y,\mu)`. - - Attributes - ---------- - lower_bound - upper_bound - include_lower_bound - include_upper_bound - - Methods - ------- - in_y_range - unit_variance - unit_variance_derivative - variance - variance_derivative - unit_deviance - unit_deviance_derivative - deviance - deviance_derivative - starting_mu - - _mu_deviance_derivative - - References - ---------- - - https://en.wikipedia.org/wiki/Exponential_dispersion_model. 
- """ - @property - def lower_bound(self): - """Get the lower bound of values for Y~EDM.""" - return self._lower_bound - - @property - def upper_bound(self): - """Get the upper bound of values for Y~EDM.""" - return self._upper_bound - - @property - def include_lower_bound(self): - """Get True if lower bound for y is included: y >= lower_bound.""" - return self._include_lower_bound - - @property - def include_upper_bound(self): - """Get True if upper bound for y is included: y <= upper_bound.""" - return self._include_upper_bound - - def in_y_range(self, x): - """Returns ``True`` if x is in the valid range of Y~EDM. - - Parameters - ---------- - x : array, shape (n_samples,) - Target values. - """ - if self.include_lower_bound: - if self.include_upper_bound: - return np.logical_and(np.greater_equal(x, self.lower_bound), - np.less_equal(x, self.upper_bound)) - else: - return np.logical_and(np.greater_equal(x, self.lower_bound), - np.less(x, self.upper_bound)) - else: - if self.include_upper_bound: - return np.logical_and(np.greater(x, self.lower_bound), - np.less_equal(x, self.upper_bound)) - else: - return np.logical_and(np.greater(x, self.lower_bound), - np.less(x, self.upper_bound)) - - @abstractmethod - def unit_variance(self, mu): - r"""Compute the unit variance function. - - The unit variance :math:`v(\mu)` determines the variance as - a function of the mean :math:`\mu` by - :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. - It can also be derived from the unit deviance :math:`d(y,\mu)` as - - .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ - \partial\mu^2}}\big|_{y=\mu} - - See also :func:`variance`. - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - """ - pass - - @abstractmethod - def unit_variance_derivative(self, mu): - r"""Compute the derivative of the unit variance w.r.t. mu. - - Return :math:`v'(\mu)`. - - Parameters - ---------- - mu : array, shape (n_samples,) - Target values. - """ - pass - - def variance(self, mu, phi=1, weights=1): - r"""Compute the variance function. - - The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is - :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, - with unit variance :math:`v(\mu)` and weights :math:`s_i`. - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - - phi : float (default=1) - Dispersion parameter. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - """ - return phi/weights * self.unit_variance(mu) - - def variance_derivative(self, mu, phi=1, weights=1): - r"""Compute the derivative of the variance w.r.t. mu. - - Returns - :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] - =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` - and weights :math:`s_i`. - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - - phi : float (default=1) - Dispersion parameter. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - """ - return phi/weights * self.unit_variance_derivative(mu) - - @abstractmethod - def unit_deviance(self, y, mu): - r"""Compute the unit deviance. - - The unit_deviance :math:`d(y,\mu)` can be defined by the - log-likelihood as - :math:`d(y,\mu) = -2\phi\cdot - \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - mu : array, shape (n_samples,) - Predicted mean. 
- """ - pass - - def unit_deviance_derivative(self, y, mu): - r"""Compute the derivative of the unit deviance w.r.t. mu. - - The derivative of the unit deviance is given by - :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` - with unit variance :math:`v(\mu)`. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - mu : array, shape (n_samples,) - Predicted mean. - """ - return -2 * (y - mu) / self.unit_variance(mu) - - def deviance(self, y, mu, weights=1): - r"""Compute the deviance. - - The deviance is a weighted sum of the per sample unit deviances, - :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` - with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. - In terms of the log-likelihood it is :math:`D = -2\phi\cdot - \left(loglike(y,\mu,\frac{phi}{s}) - - loglike(y,y,\frac{phi}{s})\right)`. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - mu : array, shape (n_samples,) - Predicted mean. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - """ - return np.sum(weights * self.unit_deviance(y, mu)) - - def deviance_derivative(self, y, mu, weights=1): - """Compute the derivative of the deviance w.r.t. mu. - - It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - mu : array, shape (n_samples,) - Predicted mean. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - """ - return weights * self.unit_deviance_derivative(y, mu) - - def starting_mu(self, y, weights=1, ind_weight=0.5): - """Set starting values for the mean mu. - - These may be good starting points for the (unpenalized) IRLS solver. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - - ind_weight : float (default=0.5) - Must be between 0 and 1. Specifies how much weight is given to the - individual observations instead of the mean of y. - """ - return (ind_weight * y + - (1. - ind_weight) * np.average(y, weights=weights)) - - def _mu_deviance_derivative(self, coef, X, y, weights, link): - """Compute mu and the derivative of the deviance w.r.t coef.""" - lin_pred = _safe_lin_pred(X, coef) - mu = link.inverse(lin_pred) - d1 = link.inverse_derivative(lin_pred) - temp = d1 * self.deviance_derivative(y, mu, weights) - if coef.size == X.shape[1] + 1: - devp = np.concatenate(([temp.sum()], temp @ X)) - else: - devp = temp @ X # sampe as X.T @ temp - return mu, devp - - -class TweedieDistribution(ExponentialDispersionModel): - r"""A class for the Tweedie distribution. - - A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely - defined by it's mean-variance relationship - :math:`\mathrm{Var}[Y] \propto \mu^power`. - - Special cases are: - - ===== ================ - Power Distribution - ===== ================ - 0 Normal - 1 Poisson - (0,1) Compound Poisson - 2 Gamma - 3 Inverse Gaussian - - Parameters - ---------- - power : float (default=0) - The variance power of the `unit_variance` - :math:`v(\mu) = \mu^{power}`. 
- For ``0 0) and (power < 1): - raise ValueError('For 0 1) and (power < 2): - # Compound Poisson - self._lower_bound = 0 - self._include_lower_bound = True - elif power == 2: - # GammaDistribution - self._lower_bound = 0 - self._include_lower_bound = False - elif (power > 2) and (power < 3): - # Positive Stable - self._lower_bound = 0 - self._include_lower_bound = False - elif power == 3: - # InverseGaussianDistribution - self._lower_bound = 0 - self._include_lower_bound = False - elif power > 3: - # Positive Stable - self._lower_bound = 0 - self._include_lower_bound = False - else: # pragma: no cover - # this branch should be unreachable. - raise ValueError - - self._power = power - - def unit_variance(self, mu): - """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - """ - return np.power(mu, self.power) - - def unit_variance_derivative(self, mu): - """Compute the derivative of the unit variance of a Tweedie - distribution v(mu)=power*mu**(power-1). - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - """ - return self.power * np.power(mu, self.power - 1) - - def unit_deviance(self, y, mu): - p = self.power - if p == 0: - # NormalDistribution - return (y - mu)**2 - if p == 1: - # PoissonDistribution - # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 - return 2 * (special.xlogy(y, y/mu) - y + mu) - elif p == 2: - # GammaDistribution - return 2 * (np.log(mu/y) + y/mu - 1) - else: - # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) - # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) - return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) - - -class NormalDistribution(TweedieDistribution): - """Class for the Normal (aka Gaussian) distribution""" - def __init__(self): - super(NormalDistribution, self).__init__(power=0) - - -class PoissonDistribution(TweedieDistribution): - """Class for the scaled Poisson distribution""" - def __init__(self): - super(PoissonDistribution, self).__init__(power=1) - - -class GammaDistribution(TweedieDistribution): - """Class for the Gamma distribution""" - def __init__(self): - super(GammaDistribution, self).__init__(power=2) - - -class InverseGaussianDistribution(TweedieDistribution): - """Class for the scaled InverseGaussianDistribution distribution""" - def __init__(self): - super(InverseGaussianDistribution, self).__init__(power=3) - - -EDM_DISTRIBUTIONS = { - 'normal': NormalDistribution, - 'poisson': PoissonDistribution, - 'gamma': GammaDistribution, - 'inverse.gaussian': InverseGaussianDistribution, -} - - class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py new file mode 100644 index 0000000000000..f79f6163ada48 --- /dev/null +++ b/sklearn/linear_model/_glm/link.py @@ -0,0 +1,135 @@ +""" +Link functions used in GLM +""" + +# Author: Christian Lorentzen +# License: BSD 3 clause + +from abc import ABCMeta, abstractmethod + +import numpy as np +from scipy.special import expit, logit + + +class Link(metaclass=ABCMeta): + """Abstract base class for Link functions.""" + + @abstractmethod + def link(self, mu): + """Compute the link function g(mu). + + The link function links the mean mu=E[Y] to the so called linear + predictor (X*w), i.e. g(mu) = linear predictor. 
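A small sketch of the round-trip property g(h(x)) = x that these abstract methods are meant to satisfy, using the ``LogLink`` defined below and the module path added by this patch (it mirrors the check in ``test_link.py`` later in this series):

import numpy as np
from sklearn.linear_model._glm.link import LogLink

link = LogLink()
lin_pred = np.linspace(-3.0, 3.0, 7)
mu = link.inverse(lin_pred)                    # h(lin_pred) = exp(lin_pred) = E[Y]
assert np.allclose(link.link(mu), lin_pred)    # g(h(x)) == x
# since g(h(x)) = x, the chain rule gives g'(h(x)) = 1 / h'(x)
assert np.allclose(link.derivative(mu), 1.0 / link.inverse_derivative(lin_pred))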
+ + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the (predicted) mean. + """ + pass + + @abstractmethod + def derivative(self, mu): + """Compute the derivative of the link g'(mu). + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the (predicted) mean. + """ + pass + + @abstractmethod + def inverse(self, lin_pred): + """Compute the inverse link function h(lin_pred). + + Gives the inverse relationship between linear predictor and the mean + mu=E[Y], i.e. h(linear predictor) = mu. + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + pass + + @abstractmethod + def inverse_derivative(self, lin_pred): + """Compute the derivative of the inverse link function h'(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + pass + + @abstractmethod + def inverse_derivative2(self, lin_pred): + """Compute 2nd derivative of the inverse link function h''(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + pass + + +class IdentityLink(Link): + """The identity link function g(x)=x.""" + + def link(self, mu): + return mu + + def derivative(self, mu): + return np.ones_like(mu) + + def inverse(self, lin_pred): + return lin_pred + + def inverse_derivative(self, lin_pred): + return np.ones_like(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.zeros_like(lin_pred) + + +class LogLink(Link): + """The log link function g(x)=log(x).""" + + def link(self, mu): + return np.log(mu) + + def derivative(self, mu): + return 1./mu + + def inverse(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.exp(lin_pred) + + +class LogitLink(Link): + """The logit link function g(x)=logit(x).""" + + def link(self, mu): + return logit(mu) + + def derivative(self, mu): + return 1. / (mu * (1 - mu)) + + def inverse(self, lin_pred): + return expit(lin_pred) + + def inverse_derivative(self, lin_pred): + ep = expit(lin_pred) + return ep * (1. - ep) + + def inverse_derivative2(self, lin_pred): + ep = expit(lin_pred) + return ep * (1. - ep) * (1. 
- 2 * ep) diff --git a/sklearn/linear_model/_glm/tests/__init__.py b/sklearn/linear_model/_glm/tests/__init__.py new file mode 100644 index 0000000000000..588cf7e93eef0 --- /dev/null +++ b/sklearn/linear_model/_glm/tests/__init__.py @@ -0,0 +1 @@ +# License: BSD 3 clause diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/linear_model/_glm/tests/test_distribution.py new file mode 100644 index 0000000000000..1a2efdba0e563 --- /dev/null +++ b/sklearn/linear_model/_glm/tests/test_distribution.py @@ -0,0 +1,61 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause + +from numpy.testing import assert_allclose +from numpy.testing import assert_array_equal +import pytest + +from sklearn.linear_model._glm.distribution import ( + TweedieDistribution, + NormalDistribution, PoissonDistribution, + GammaDistribution, InverseGaussianDistribution, +) + + +@pytest.mark.parametrize( + 'family, expected', + [(NormalDistribution(), [True, True, True]), + (PoissonDistribution(), [False, True, True]), + (TweedieDistribution(power=1.5), [False, True, True]), + (GammaDistribution(), [False, False, True]), + (InverseGaussianDistribution(), [False, False, True]), + (TweedieDistribution(power=4.5), [False, False, True])]) +def test_family_bounds(family, expected): + """Test the valid range of distributions at -1, 0, 1.""" + result = family.in_y_range([-1, 0, 1]) + assert_array_equal(result, expected) + + +def test_tweedie_distribution_power(): + with pytest.raises(ValueError, match="no distribution exists"): + TweedieDistribution(power=0.5) + + with pytest.raises(TypeError, match="must be a real number"): + TweedieDistribution(power=1j) + + with pytest.raises(TypeError, match="must be a real number"): + dist = TweedieDistribution() + dist.power = 1j + + dist = TweedieDistribution() + assert dist._include_lower_bound is False + dist.power = 1 + assert dist._include_lower_bound is True + + +@pytest.mark.parametrize( + 'family, chk_values', + [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), + (PoissonDistribution(), [0.1, 1.5]), + (GammaDistribution(), [0.1, 1.5]), + (InverseGaussianDistribution(), [0.1, 1.5]), + (TweedieDistribution(power=-2.5), [0.1, 1.5]), + (TweedieDistribution(power=-1), [0.1, 1.5]), + (TweedieDistribution(power=1.5), [0.1, 1.5]), + (TweedieDistribution(power=2.5), [0.1, 1.5]), + (TweedieDistribution(power=-4), [0.1, 1.5])]) +def test_deviance_zero(family, chk_values): + """Test deviance(y,y) = 0 for different families.""" + for x in chk_values: + assert_allclose(family.deviance(x, x), 0, atol=1e-9) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py similarity index 81% rename from sklearn/linear_model/tests/test_glm.py rename to sklearn/linear_model/_glm/tests/test_glm.py index 1712f7b5e1d3d..ffac6dd11c243 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -8,11 +8,12 @@ from sklearn.datasets import make_regression from sklearn.linear_model import GeneralizedLinearRegressor -from sklearn.linear_model._glm import ( - Link, +from sklearn.linear_model._glm.link import ( IdentityLink, LogLink, LogitLink, +) +from sklearn.linear_model._glm.distribution import ( TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, @@ -21,8 +22,6 @@ from sklearn.metrics import mean_absolute_error from sklearn.exceptions import ConvergenceWarning -from sklearn.utils.testing import assert_array_equal - GLM_SOLVERS = ['lbfgs'] @@ 
-35,77 +34,6 @@ def regression_data(): return X, y -@pytest.mark.parametrize('link', Link.__subclasses__()) -def test_link_properties(link): - """Test link inverse and derivative.""" - rng = np.random.RandomState(42) - x = rng.rand(100)*100 - link = link() # instantiate object - if isinstance(link, LogitLink): - # careful for large x, note expit(36) = 1 - # limit max eta to 15 - x = x / 100 * 15 - assert_allclose(link.link(link.inverse(x)), x) - # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) - assert_allclose(link.derivative(link.inverse(x)), - 1./link.inverse_derivative(x)) - - assert ( - link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) - - # for LogitLink, in the following x should be between 0 and 1. - # assert_almost_equal(link.inverse_derivative(link.link(x)), - # 1./link.derivative(x), decimal=decimal) - - -@pytest.mark.parametrize( - 'family, expected', - [(NormalDistribution(), [True, True, True]), - (PoissonDistribution(), [False, True, True]), - (TweedieDistribution(power=1.5), [False, True, True]), - (GammaDistribution(), [False, False, True]), - (InverseGaussianDistribution(), [False, False, True]), - (TweedieDistribution(power=4.5), [False, False, True])]) -def test_family_bounds(family, expected): - """Test the valid range of distributions at -1, 0, 1.""" - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, expected) - - -def test_tweedie_distribution_power(): - with pytest.raises(ValueError, match="no distribution exists"): - TweedieDistribution(power=0.5) - - with pytest.raises(TypeError, match="must be a real number"): - TweedieDistribution(power=1j) - - with pytest.raises(TypeError, match="must be a real number"): - dist = TweedieDistribution() - dist.power = 1j - - dist = TweedieDistribution() - assert dist._include_lower_bound is False - dist.power = 1 - assert dist._include_lower_bound is True - - -@pytest.mark.parametrize( - 'family, chk_values', - [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), - (PoissonDistribution(), [0.1, 1.5]), - (GammaDistribution(), [0.1, 1.5]), - (InverseGaussianDistribution(), [0.1, 1.5]), - (TweedieDistribution(power=-2.5), [0.1, 1.5]), - (TweedieDistribution(power=-1), [0.1, 1.5]), - (TweedieDistribution(power=1.5), [0.1, 1.5]), - (TweedieDistribution(power=2.5), [0.1, 1.5]), - (TweedieDistribution(power=-4), [0.1, 1.5])]) -def test_deviance_zero(family, chk_values): - """Test deviance(y,y) = 0 for different families.""" - for x in chk_values: - assert_allclose(family.deviance(x, x), 0, atol=1e-9) - - def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" # scalar value but not positive diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py new file mode 100644 index 0000000000000..3a2a21c4c04e0 --- /dev/null +++ b/sklearn/linear_model/_glm/tests/test_link.py @@ -0,0 +1,38 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause +import numpy as np +from numpy.testing import assert_allclose +import pytest + +from sklearn.linear_model._glm.link import ( + IdentityLink, + LogLink, + LogitLink, +) + + +LINK_FUNCTIONS = [IdentityLink, LogLink, LogitLink] + + +@pytest.mark.parametrize('link', LINK_FUNCTIONS) +def test_link_properties(link): + """Test link inverse and derivative.""" + rng = np.random.RandomState(42) + x = rng.rand(100)*100 + link = link() # instantiate object + if isinstance(link, LogitLink): + # careful for large x, note expit(36) = 1 + # limit max eta to 15 + x = x / 100 * 15 + 
assert_allclose(link.link(link.inverse(x)), x) + # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) + assert_allclose(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x)) + + assert ( + link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) + + # for LogitLink, in the following x should be between 0 and 1. + # assert_almost_equal(link.inverse_derivative(link.link(x)), + # 1./link.derivative(x), decimal=decimal) From 5927379debd8e591d72c5915ad434fc0592f5665 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 16 Jul 2019 18:27:36 +0200 Subject: [PATCH 088/209] Fix CI --- sklearn/linear_model/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 1c0df55d27c90..8fc662a48a6ae 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -81,5 +81,4 @@ 'ridge_regression', 'RANSACRegressor', 'GeneralizedLinearRegressor', - 'TweedieDistribution', 'PoissonRegressor'] From a6df2a788b5a4907b6c3a9b51a758f5b05e95257 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 16 Jul 2019 18:57:53 +0200 Subject: [PATCH 089/209] Add test_deviance_derivative --- sklearn/linear_model/__init__.py | 2 +- sklearn/linear_model/_glm/__init__.py | 4 +- .../_glm/tests/test_distribution.py | 44 +++++++++++++++++-- 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 8fc662a48a6ae..e8f63e95cdfa8 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,7 +18,7 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from ._glm.glm import (GeneralizedLinearRegressor, PoissonRegressor) +from ._glm import (GeneralizedLinearRegressor, PoissonRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor diff --git a/sklearn/linear_model/_glm/__init__.py b/sklearn/linear_model/_glm/__init__.py index 9a88e5604de8a..ac0f2c206ffe0 100644 --- a/sklearn/linear_model/_glm/__init__.py +++ b/sklearn/linear_model/_glm/__init__.py @@ -1,5 +1,5 @@ # License: BSD 3 clause -from . 
import distribution +from .glm import (GeneralizedLinearRegressor, PoissonRegressor) -__all__ = ['distribution'] +__all__ = ['GeneralizedLinearRegressor', 'PoissonRegressor'] diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/linear_model/_glm/tests/test_distribution.py index 1a2efdba0e563..f457fc882a809 100644 --- a/sklearn/linear_model/_glm/tests/test_distribution.py +++ b/sklearn/linear_model/_glm/tests/test_distribution.py @@ -1,9 +1,12 @@ # Authors: Christian Lorentzen # # License: BSD 3 clause - -from numpy.testing import assert_allclose -from numpy.testing import assert_array_equal +import numpy as np +from numpy.testing import ( + assert_allclose, + assert_array_equal, +) +from scipy.optimize import check_grad import pytest from sklearn.linear_model._glm.distribution import ( @@ -59,3 +62,38 @@ def test_deviance_zero(family, chk_values): """Test deviance(y,y) = 0 for different families.""" for x in chk_values: assert_allclose(family.deviance(x, x), 0, atol=1e-9) + + +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), + PoissonDistribution(), + GammaDistribution(), + InverseGaussianDistribution(), + TweedieDistribution(power=-2.5), + TweedieDistribution(power=-1), + TweedieDistribution(power=1.5), + TweedieDistribution(power=2.5), + TweedieDistribution(power=-4)], + ids=lambda x: x.__class__.__name__ +) +def test_deviance_derivative(family): + """Test deviance derivative for different families.""" + rng = np.random.RandomState(0) + y_true = rng.rand(10) + # make data positive + y_true += np.abs(y_true.min()) + 1e-2 + + y_pred = y_true + np.fmax(rng.rand(10), 0.) + + dev = family.deviance(y_true, y_pred) + assert isinstance(dev, float) + dev_derivative = family.deviance_derivative(y_true, y_pred) + assert dev_derivative.shape == y_pred.shape + + err = check_grad( + lambda mu: family.deviance(y_true, mu), + lambda mu: family.deviance_derivative(y_true, mu), + y_pred, + ) / np.linalg.norm(dev_derivative) + assert err < 1e-6 From 5af89a70fd4a168f3ab1d0d1966d6357e6bc4e68 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 16 Jul 2019 19:58:54 +0200 Subject: [PATCH 090/209] Fix sklearn/linear_model/setup.py --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 2 +- sklearn/linear_model/_glm/tests/test_distribution.py | 2 +- sklearn/linear_model/setup.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 1c8dd42df336d..fba1cc42e20a7 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -40,7 +40,7 @@ from sklearn.compose import ColumnTransformer from sklearn.linear_model import GeneralizedLinearRegressor -from sklearn.linear_model._glm import TweedieDistribution +from sklearn.linear_model._glm.distribution import TweedieDistribution from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/linear_model/_glm/tests/test_distribution.py index f457fc882a809..f984077bed5cb 100644 --- a/sklearn/linear_model/_glm/tests/test_distribution.py +++ b/sklearn/linear_model/_glm/tests/test_distribution.py @@ -96,4 +96,4 @@ def test_deviance_derivative(family): lambda mu: family.deviance_derivative(y_true, mu), y_pred, ) 
/ np.linalg.norm(dev_derivative) - assert err < 1e-6 + assert abs(err) < 1e-6 diff --git a/sklearn/linear_model/setup.py b/sklearn/linear_model/setup.py index 8226412fdecbd..5cf7040d4c9d4 100644 --- a/sklearn/linear_model/setup.py +++ b/sklearn/linear_model/setup.py @@ -42,6 +42,7 @@ def configuration(parent_package='', top_path=None): # add other directories config.add_subpackage('tests') + config.add_subpackage('_glm') return config From cd347d4610d00dca9e7a01de2067595a4e4b8a59 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 17 Jul 2019 14:01:44 +0200 Subject: [PATCH 091/209] Remove variance and variance_derivative methods from distributions --- sklearn/linear_model/_glm/distribution.py | 72 ++--------------------- 1 file changed, 4 insertions(+), 68 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 2dc720124b48b..18e7cf0c0a227 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -44,16 +44,13 @@ class ExponentialDispersionModel(metaclass=ABCMeta): Methods ------- + deviance + deviance_derivative in_y_range - unit_variance - unit_variance_derivative - variance - variance_derivative unit_deviance unit_deviance_derivative - deviance - deviance_derivative - starting_mu + unit_variance + unit_variance_derivative _mu_deviance_derivative @@ -139,47 +136,6 @@ def unit_variance_derivative(self, mu): """ pass - def variance(self, mu, phi=1, weights=1): - r"""Compute the variance function. - - The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is - :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, - with unit variance :math:`v(\mu)` and weights :math:`s_i`. - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - - phi : float (default=1) - Dispersion parameter. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - """ - return phi/weights * self.unit_variance(mu) - - def variance_derivative(self, mu, phi=1, weights=1): - r"""Compute the derivative of the variance w.r.t. mu. - - Returns - :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] - =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` - and weights :math:`s_i`. - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - - phi : float (default=1) - Dispersion parameter. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - """ - return phi/weights * self.unit_variance_derivative(mu) - @abstractmethod def unit_deviance(self, y, mu): r"""Compute the unit deviance. @@ -257,26 +213,6 @@ def deviance_derivative(self, y, mu, weights=1): """ return weights * self.unit_deviance_derivative(y, mu) - def starting_mu(self, y, weights=1, ind_weight=0.5): - """Set starting values for the mean mu. - - These may be good starting points for the (unpenalized) IRLS solver. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - - ind_weight : float (default=0.5) - Must be between 0 and 1. Specifies how much weight is given to the - individual observations instead of the mean of y. - """ - return (ind_weight * y + - (1. 
- ind_weight) * np.average(y, weights=weights)) - def _mu_deviance_derivative(self, coef, X, y, weights, link): """Compute mu and the derivative of the deviance w.r.t coef.""" lin_pred = _safe_lin_pred(X, coef) From 0d7f9cd9babe928a01266f43f1f5e4e41829075b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 17 Jul 2019 15:05:49 +0200 Subject: [PATCH 092/209] Improve coverage --- sklearn/linear_model/_glm/distribution.py | 6 ++-- sklearn/linear_model/_glm/link.py | 10 +++---- sklearn/linear_model/_glm/tests/test_glm.py | 31 +++++++++++++++++++++ 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 18e7cf0c0a227..6fffd816ff8f9 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -121,7 +121,7 @@ def unit_variance(self, mu): mu : array, shape (n_samples,) Predicted mean. """ - pass + pass # pragma: no cover @abstractmethod def unit_variance_derivative(self, mu): @@ -134,7 +134,7 @@ def unit_variance_derivative(self, mu): mu : array, shape (n_samples,) Target values. """ - pass + pass # pragma: no cover @abstractmethod def unit_deviance(self, y, mu): @@ -153,7 +153,7 @@ def unit_deviance(self, y, mu): mu : array, shape (n_samples,) Predicted mean. """ - pass + pass # pragma: no cover def unit_deviance_derivative(self, y, mu): r"""Compute the derivative of the unit deviance w.r.t. mu. diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py index f79f6163ada48..ec9a7b7736eb1 100644 --- a/sklearn/linear_model/_glm/link.py +++ b/sklearn/linear_model/_glm/link.py @@ -26,7 +26,7 @@ def link(self, mu): mu : array, shape (n_samples,) Usually the (predicted) mean. """ - pass + pass # pragma: no cover @abstractmethod def derivative(self, mu): @@ -37,7 +37,7 @@ def derivative(self, mu): mu : array, shape (n_samples,) Usually the (predicted) mean. """ - pass + pass # pragma: no cover @abstractmethod def inverse(self, lin_pred): @@ -51,7 +51,7 @@ def inverse(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. """ - pass + pass # pragma: no cover @abstractmethod def inverse_derivative(self, lin_pred): @@ -62,7 +62,7 @@ def inverse_derivative(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. """ - pass + pass # pragma: no cover @abstractmethod def inverse_derivative2(self, lin_pred): @@ -73,7 +73,7 @@ def inverse_derivative2(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. 
""" - pass + pass # pragma: no cover class IdentityLink(Link): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index ffac6dd11c243..a0ac77b65aa9f 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -219,6 +219,37 @@ def test_glm_log_regression(family, solver, tol): assert_allclose(res.coef_, coef, rtol=5e-6) +@pytest.mark.parametrize('fit_intercept', [True, False]) +def test_warm_start(fit_intercept): + n_samples, n_features = 100, 10 + n_predict = 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=42) + + glm1 = GeneralizedLinearRegressor( + warm_start=False, + fit_intercept=fit_intercept, + max_iter=1000 + ) + glm1.fit(X, y) + + glm2 = GeneralizedLinearRegressor( + warm_start=True, + fit_intercept=fit_intercept, + max_iter=1 + ) + glm2.fit(X, y) + assert glm1.score(X, y) > glm2.score(X, y) + glm2.set_params(max_iter=1000) + glm2.fit(X, y) + assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-4, atol=1e-5) + assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) + # TODO: investigate why this doesn't match + # assert glm1.n_iter_ == glm2.n_iter_ + 2 + + @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) @pytest.mark.parametrize('fit_intercept', [True, False]) @pytest.mark.parametrize('solver', GLM_SOLVERS) From dbffad84fcab9521265e85d5e7cbf9da0673b380 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 17 Jul 2019 15:44:44 +0200 Subject: [PATCH 093/209] Remove mentions of the binomial distribution --- sklearn/linear_model/_glm/glm.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index bff8ea43fd550..0115f9d160337 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -85,7 +85,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - family : {'normal', 'poisson', 'gamma', 'inverse.gaussian', 'binomial'} \ + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} \ or an instance of class ExponentialDispersionModel, \ optional(default='normal') The distributional assumption of the GLM, i.e. which distribution from @@ -101,8 +101,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'log' for families 'poisson', 'gamma', 'inverse.gaussian' - - 'logit' for family 'binomial' - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) Method for estimation of the dispersion parameter phi. Whether to use the chi squared statistic or the deviance statistic. 
If None, the From 3187204dc4c1bb41a6131c49262c8380a34b9ec1 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 19 Jul 2019 17:37:11 +0200 Subject: [PATCH 094/209] Use common simple weight validation --- sklearn/linear_model/_glm/glm.py | 37 +++------------------ sklearn/linear_model/_glm/tests/test_glm.py | 19 ++--------- 2 files changed, 7 insertions(+), 49 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 0115f9d160337..546e71c583fa8 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -14,7 +14,7 @@ from ...base import BaseEstimator, RegressorMixin from ...utils import check_array, check_X_y from ...utils.optimize import _check_optimize_result -from ...utils.validation import check_is_fitted +from ...utils.validation import check_is_fitted, _check_sample_weight from .distribution import ( ExponentialDispersionModel, TweedieDistribution, @@ -28,33 +28,6 @@ ) -def _check_weights(sample_weight, n_samples): - """Check that sample weights are non-negative and have the right shape.""" - if sample_weight is None: - weights = np.ones(n_samples) - elif np.isscalar(sample_weight): - if sample_weight <= 0: - raise ValueError("Sample weights must be non-negative.") - weights = sample_weight * np.ones(n_samples) - else: - _dtype = [np.float64, np.float32] - weights = check_array(sample_weight, accept_sparse=False, - force_all_finite=True, ensure_2d=False, - dtype=_dtype) - if weights.ndim > 1: - raise ValueError("Sample weight must be 1D array or scalar") - elif weights.shape[0] != n_samples: - raise ValueError("Sample weights must have the same length as " - "y") - if not np.all(weights >= 0): - raise ValueError("Sample weights must be non-negative.") - elif not np.sum(weights) > 0: - raise ValueError("Sample weights must have at least one positive " - "element.") - - return weights - - class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. @@ -323,7 +296,7 @@ def fit(self, X, y, sample_weight=None): copy=self.copy_X) y = np.asarray(y, dtype=np.float64) - weights = _check_weights(sample_weight, y.shape[0]) + weights = _check_sample_weight(sample_weight, X) n_samples, n_features = X.shape @@ -458,7 +431,7 @@ def predict(self, X, sample_weight=None): allow_nd=False) eta = self._linear_predictor(X) mu = self._link_instance.inverse(eta) - weights = _check_weights(sample_weight, X.shape[0]) + weights = _check_sample_weight(sample_weight, X) return mu*weights @@ -487,7 +460,7 @@ def estimate_phi(self, X, y, sample_weight=None): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=_dtype, y_numeric=True, multi_output=False) n_samples, n_features = X.shape - weights = _check_weights(sample_weight, n_samples) + weights = _check_sample_weight(sample_weight, X) eta = X @ self.coef_ if self.fit_intercept is True: eta += self.intercept_ @@ -542,7 +515,7 @@ def score(self, X, y, sample_weight=None): # Note, default score defined in RegressorMixin is R^2 score. 
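        # D^2 is the fraction of deviance explained,
        #     D^2 = 1 - deviance(y, y_pred) / deviance(y, y_mean),
        # i.e. the deviance analogue of R^2 (the two coincide for the
        # normal distribution with identity link).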
# TODO: make D^2 a score function in module metrics (and thereby get # input validation and so on) - weights = _check_weights(sample_weight, y.shape[0]) + weights = _check_sample_weight(sample_weight, X) mu = self.predict(X) dev = self._family_instance.deviance(y, mu, weights=weights) y_mean = np.average(y, weights=weights) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index a0ac77b65aa9f..2909c85ba8c40 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -41,8 +41,6 @@ def test_sample_weights_validation(): y = [1] weights = 0 glm = GeneralizedLinearRegressor(fit_intercept=False) - with pytest.raises(ValueError, match="weights must be non-negative"): - glm.fit(X, y, weights) # Positive weights are accepted glm.fit(X, y, sample_weight=1) @@ -54,21 +52,8 @@ def test_sample_weights_validation(): # 1d but wrong length weights = [1, 0] - with pytest.raises(ValueError, - match="weights must have the same length as y"): - glm.fit(X, y, weights) - - # 1d but only zeros (sum not greater than 0) - weights = [0, 0] - X = [[0], [1]] - y = [1, 2] - with pytest.raises(ValueError, - match="must have at least one positive element"): - glm.fit(X, y, weights) - - # 5. 1d but with a negative value - weights = [2, -1] - with pytest.raises(ValueError, match="weights must be non-negative"): + msg = r"sample_weight.shape == \(2,\), expected \(1,\)!" + with pytest.raises(ValueError, match=msg): glm.fit(X, y, weights) From cc03c1ad58db4108ea1bfc3d2d47225be8822bd8 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 19 Jul 2019 17:41:01 +0200 Subject: [PATCH 095/209] Simplify comments formatting --- sklearn/linear_model/_glm/glm.py | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 546e71c583fa8..ea3e4964529bf 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -208,12 +208,6 @@ def fit(self, X, y, sample_weight=None): ------- self : returns an instance of self. """ - ####################################################################### - # 1. input validation # - ####################################################################### - # 1.1 validate arguments of __init__ - # Guarantee that self._family_instance is an instance of class - # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): self._family_instance = self.family elif self.family in EDM_DISTRIBUTIONS: @@ -288,7 +282,6 @@ def fit(self, X, y, sample_weight=None): family = self._family_instance link = self._link_instance - # 1.2 validate arguments of fit ####################################### _dtype = [np.float64, np.float32] _stype = ['csc', 'csr'] X, y = check_X_y(X, y, accept_sparse=_stype, @@ -300,7 +293,6 @@ def fit(self, X, y, sample_weight=None): n_samples, n_features = X.shape - # 1.4 additional validations ########################################## if self.check_input: if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " @@ -308,9 +300,8 @@ def fit(self, X, y, sample_weight=None): .format(family.__class__.__name__)) # TODO: if alpha=0 check that X is not rank deficient - ####################################################################### - # 2. 
rescaling of weights (sample_weight) # - ####################################################################### + # rescaling of sample_weight + # # IMPORTANT NOTE: Since we want to minimize # 1/(2*sum(sample_weight)) * deviance + L2, # deviance = sum(sample_weight * unit_deviance), @@ -319,9 +310,7 @@ def fit(self, X, y, sample_weight=None): weights_sum = np.sum(weights) weights = weights/weights_sum - ####################################################################### - # 3. initialization of coef = (intercept_, coef_) # - ####################################################################### + # initialization of coef = (intercept_, coef) # Note: Since phi=self.dispersion_ does not enter the estimation # of mu_i=E[y_i], set it to 1. @@ -338,12 +327,8 @@ def fit(self, X, y, sample_weight=None): else: coef = np.zeros(n_features) - ####################################################################### - # 4. fit # - ####################################################################### # algorithms for optimization - # 4.1 L-BFGS ########################################################## if solver == 'lbfgs': def func(coef, X, y, weights, alpha, family, link): mu, devp = \ @@ -371,9 +356,6 @@ def func(coef, X, y, weights, alpha, family, link): self.n_iter_ = _check_optimize_result("lbfgs", opt_res) coef = opt_res.x - ####################################################################### - # 5. postprocessing # - ####################################################################### if self.fit_intercept: self.intercept_ = coef[0] self.coef_ = coef[1:] @@ -425,9 +407,8 @@ def predict(self, X, sample_weight=None): C : array, shape (n_samples,) Returns predicted values times sample_weight. """ - # TODO: Is copy=True necessary? X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype='numeric', copy=True, ensure_2d=True, + dtype='numeric', ensure_2d=True, allow_nd=False) eta = self._linear_predictor(X) mu = self._link_instance.inverse(eta) From aa52b4a32e216bfdd516776ac6796f8c0b4e5137 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 22 Jul 2019 15:50:47 +0200 Subject: [PATCH 096/209] Refactor to use TweedieDistribition in metrics --- sklearn/linear_model/_glm/distribution.py | 33 ++++++++++++++--------- sklearn/metrics/regression.py | 14 +++------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 6fffd816ff8f9..b99e1b40b2871 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -330,21 +330,28 @@ def unit_variance_derivative(self, mu): def unit_deviance(self, y, mu): p = self.power - if p == 0: - # NormalDistribution - return (y - mu)**2 - if p == 1: - # PoissonDistribution - # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 - return 2 * (xlogy(y, y/mu) - y + mu) + if p < 0: + # 'Extreme stable', y_true any realy number, y_pred > 0 + dev = 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p) * (2-p)) - + y * np.power(mu, 1-p)/(1-p) + + np.power(mu, 2-p)/(2-p)) + elif p == 0: + # Normal distribution, y_true and y_pred any real number + dev = (y - mu)**2 + elif p < 1: + raise ValueError("Tweedie deviance is only defined for p<=0 and " + "p>=1.") + elif p == 1: + # Poisson distribution + dev = 2 * (xlogy(y, y/mu) - y + mu) elif p == 2: - # GammaDistribution - return 2 * (np.log(mu/y) + y/mu - 1) + # Gamma distribution + dev = 2 * (np.log(mu/y) + y/mu - 1) else: - # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) - # - 
y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) - return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + dev = 2 * (np.power(y, 2-p)/((1-p) * (2-p)) - + y * np.power(mu, 1-p)/(1-p) + + np.power(mu, 2-p)/(2-p)) + return dev class NormalDistribution(TweedieDistribution): diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index 2cba3d31ec84a..2f6c442c1f824 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -665,6 +665,7 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): >>> mean_tweedie_deviance(y_true, y_pred, p=1) 1.4260... """ + from ..linear_model._glm.distribution import TweedieDistribution y_type, y_true, y_pred, _ = _check_reg_targets( y_true, y_pred, None, dtype=[np.float64, np.float32]) if y_type == 'continuous-multioutput': @@ -681,12 +682,8 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): # 'Extreme stable', y_true any realy number, y_pred > 0 if (y_pred <= 0).any(): raise ValueError(message + "strictly positive y_pred.") - dev = 2 * (np.power(np.maximum(y_true, 0), 2-p)/((1-p) * (2-p)) - - y_true * np.power(y_pred, 1-p)/(1-p) + - np.power(y_pred, 2-p)/(2-p)) elif p == 0: - # Normal distribution, y_true and y_pred any real number - dev = (y_true - y_pred)**2 + pass elif p < 1: raise ValueError("Tweedie deviance is only defined for p<=0 and " "p>=1.") @@ -695,12 +692,10 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): if (y_true < 0).any() or (y_pred <= 0).any(): raise ValueError(message + "non-negative y_true and strictly " "positive y_pred.") - dev = 2 * (xlogy(y_true, y_true/y_pred) - y_true + y_pred) elif p == 2: # Gamma distribution, y_true and y_pred > 0 if (y_true <= 0).any() or (y_pred <= 0).any(): raise ValueError(message + "strictly positive y_true and y_pred.") - dev = 2 * (np.log(y_pred/y_true) + y_true/y_pred - 1) else: if p < 2: # 1 < p < 2 is Compound Poisson, y_true >= 0, y_pred > 0 @@ -712,9 +707,8 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): raise ValueError(message + "strictly positive y_true and " "y_pred.") - dev = 2 * (np.power(y_true, 2-p)/((1-p) * (2-p)) - - y_true * np.power(y_pred, 1-p)/(1-p) + - np.power(y_pred, 2-p)/(2-p)) + dist = TweedieDistribution(power=p) + dev = dist.unit_deviance(y_true, y_pred) return np.average(dev, weights=sample_weight) From 816aa8f36f3f96a6616a7aac0b4d2583b71f0fc7 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 25 Jul 2019 07:54:28 +0200 Subject: [PATCH 097/209] WIP --- .../plot_poisson_regression_non_normal_loss.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 0537704b2cf1f..6b12370a9433c 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -25,7 +25,6 @@ import numpy as np import matplotlib.pyplot as plt import pandas as pd -from scipy.special import xlogy from sklearn.compose import ColumnTransformer from sklearn.linear_model import GeneralizedLinearRegressor, LinearRegression @@ -36,6 +35,7 @@ from sklearn.ensemble import GradientBoostingRegressor from sklearn.metrics import mean_squared_error, mean_absolute_error +from sklearn.metrics import mean_poisson_deviance def load_mtpl2(n_samples=100000): @@ -140,13 +140,6 @@ def load_mtpl2(n_samples=100000): 
df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) -def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None): - y_true = np.atleast_1d(y_true) - y_pred = np.atleast_1d(y_pred) - dev = 2 * (xlogy(y_true, y_true/y_pred) - y_true + y_pred) - return np.average(dev, weights=sample_weights) - - eps = 1e-5 print("MSE: %.3f" % mean_squared_error( df_test.Frequency.values, np.zeros(len(df_test)), @@ -154,7 +147,7 @@ def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None): print("MAE: %.3f" % mean_absolute_error( df_test.Frequency.values, np.zeros(len(df_test)), df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( +print("mean Poisson deviance: %.3f" % mean_poisson_deviance( df_test.Frequency.values, eps + np.zeros(len(df_test)), df_test.Exposure.values)) @@ -175,7 +168,7 @@ def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None): print("MSE: %.3f" % mean_absolute_error( df_test.Frequency.values, linregr.predict(X_test), df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( +print("mean Poisson deviance: %.3f" % mean_poisson_deviance( df_test.Frequency.values, np.fmax(linregr.predict(X_test), eps), df_test.Exposure.values)) @@ -201,7 +194,7 @@ def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None): print("MAE: %.3f" % mean_absolute_error( df_test.Frequency.values, glm_freq.predict(X_test), df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( +print("mean Poisson deviance: %.3f" % mean_poisson_deviance( df_test.Frequency.values, glm_freq.predict(X_test), df_test.Exposure.values)) @@ -221,7 +214,7 @@ def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None): df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) print("MAE: %.3f" % mean_absolute_error( df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( +print("mean Poisson deviance: %.3f" % mean_poisson_deviance( df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) ############################################################################## From 6500c81a354c13de43ce0956240893b21c3979fb Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 25 Jul 2019 16:47:13 +0200 Subject: [PATCH 098/209] Use Poisson deviance in examples --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 6b12370a9433c..ba9c0c8898fcd 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -184,7 +184,7 @@ def load_mtpl2(n_samples=100000): # # Next we fit the Poisson regressor on the target variable, -glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0) +glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0, max_iter=1000) glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) print("PoissonRegressor") @@ -215,7 +215,8 @@ def load_mtpl2(n_samples=100000): print("MAE: %.3f" % mean_absolute_error( df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, gbr.predict(X_test), 
df_test.Exposure.values)) + df_test.Frequency.values, np.fmax(gbr.predict(X_test), eps), + df_test.Exposure.values)) ############################################################################## # From 59a6d9dd2aeae4901f138f6d1c892c48c427f9a8 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 25 Jul 2019 18:22:26 +0200 Subject: [PATCH 099/209] Use PoissonRegressor and GammaRegressor in examples --- doc/modules/classes.rst | 2 + ...plot_poisson_regression_non_normal_loss.py | 4 +- ...lot_tweedie_regression_insurance_claims.py | 15 +- sklearn/linear_model/__init__.py | 2 +- sklearn/linear_model/_glm/__init__.py | 4 +- sklearn/linear_model/_glm/glm.py | 146 +++++++++++++++--- sklearn/linear_model/_glm/tests/test_glm.py | 3 +- 7 files changed, 145 insertions(+), 31 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 6346c0c65fadc..177cd0780f9be 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -750,6 +750,7 @@ Kernels: linear_model.BayesianRidge linear_model.ElasticNet linear_model.ElasticNetCV + linear_model.GammaRegressor linear_model.GeneralizedLinearRegressor linear_model.HuberRegressor linear_model.Lars @@ -771,6 +772,7 @@ Kernels: linear_model.PassiveAggressiveClassifier linear_model.PassiveAggressiveRegressor linear_model.Perceptron + linear_model.PoissonRegressor linear_model.RANSACRegressor linear_model.Ridge linear_model.RidgeClassifier diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index ba9c0c8898fcd..713866a712aea 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -27,7 +27,7 @@ import pandas as pd from sklearn.compose import ColumnTransformer -from sklearn.linear_model import GeneralizedLinearRegressor, LinearRegression +from sklearn.linear_model import PoissonRegressor, LinearRegression from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder @@ -184,7 +184,7 @@ def load_mtpl2(n_samples=100000): # # Next we fit the Poisson regressor on the target variable, -glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0, max_iter=1000) +glm_freq = PoissonRegressor(alpha=0, max_iter=1000) glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) print("PoissonRegressor") diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index fba1cc42e20a7..55d0ca24ce477 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -39,7 +39,8 @@ import pandas as pd from sklearn.compose import ColumnTransformer -from sklearn.linear_model import GeneralizedLinearRegressor +from sklearn.linear_model import PoissonRegressor, GammaRegressor +from sklearn.linear_model._glm import GeneralizedLinearRegressor from sklearn.linear_model._glm.distribution import TweedieDistribution from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline @@ -192,7 +193,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # Some of the features are colinear, we use a weak penalization to avoid # numerical issues. 
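[Editorial sketch, not part of this diff: the `np.fmax(..., eps)` clipping applied to the least-squares and gradient-boosting predictions above is needed because the Poisson deviance is undefined for non-positive predictions. A minimal example, assuming the `mean_poisson_deviance` metric introduced earlier in this series is importable from `sklearn.metrics`:]

    import numpy as np
    from sklearn.metrics import mean_poisson_deviance

    y_true = np.array([0.0, 1.0, 2.0])
    y_pred = np.array([-0.2, 0.5, 1.5])  # a squared-loss model may predict <= 0
    eps = 1e-5
    # d(y, mu) = 2*(y*log(y/mu) - y + mu) requires mu > 0, so clip first:
    print(mean_poisson_deviance(y_true, np.fmax(y_pred, eps)))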
-glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=1e-2) +glm_freq = PoissonRegressor(alpha=1e-2) glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) @@ -330,7 +331,7 @@ def score_estimator( mask_train = df_train["ClaimAmount"] > 0 mask_test = df_test["ClaimAmount"] > 0 -glm_sev = GeneralizedLinearRegressor(family="gamma") +glm_sev = GammaRegressor() glm_sev.fit( X_train[mask_train.values], @@ -464,12 +465,16 @@ def score(self, X, y, sample_weight=None): # this takes a while params = { "family": [ - TweedieDistribution(power=power) for power in np.linspace(1, 2, 8) + TweedieDistribution(power=power) + # exclude upper bound as power=2 does not support null y samples. + for power in np.linspace(1 + 1e-4, 2 - 1e-4, 8) ] } + glm_total = GridSearchCV( - GeneralizedLinearRegressor(), cv=3, param_grid=params, n_jobs=-1 + GeneralizedLinearRegressor(tol=1e-3, max_iter=500), cv=3, + param_grid=params, n_jobs=-1 ) glm_total.fit( X_train, df_train["ClaimAmount"], sample_weight=df_train["Exposure"] diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index e8f63e95cdfa8..d4227a126e5ec 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,7 +18,7 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from ._glm import (GeneralizedLinearRegressor, PoissonRegressor) +from ._glm import (GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor diff --git a/sklearn/linear_model/_glm/__init__.py b/sklearn/linear_model/_glm/__init__.py index ac0f2c206ffe0..a7d4b67817730 100644 --- a/sklearn/linear_model/_glm/__init__.py +++ b/sklearn/linear_model/_glm/__init__.py @@ -1,5 +1,5 @@ # License: BSD 3 clause -from .glm import (GeneralizedLinearRegressor, PoissonRegressor) +from .glm import GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor -__all__ = ['GeneralizedLinearRegressor', 'PoissonRegressor'] +__all__ = ["GeneralizedLinearRegressor", "PoissonRegressor", "GammaRegressor"] diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index ea3e4964529bf..12486e188951a 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -24,7 +24,6 @@ Link, IdentityLink, LogLink, - LogitLink, ) @@ -64,7 +63,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The distributional assumption of the GLM, i.e. which distribution from the EDM, specifies the loss function to be minimized. - link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, \ + link : {'auto', 'identity', 'log'} or an instance of class Link, \ optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). Option 'auto' sets the link depending on @@ -160,11 +159,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): References ---------- - For the coordinate descent implementation: - * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - An Improved GLMNET for L1-regularized Logistic Regression, - Journal of Machine Learning Research 13 (2012) 1999-2030 - https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, + Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. Jørgensen, B. (1992). 
The theory of exponential dispersion models + and analysis of deviance. Monografias de matemática, no. 51. See also + `Exponential dispersion model. + `_ """ def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', @@ -240,12 +241,10 @@ def fit(self, X, y, sample_weight=None): self._link_instance = IdentityLink() elif self.link == 'log': self._link_instance = LogLink() - elif self.link == 'logit': - self._link_instance = LogitLink() else: raise ValueError( "The link must be an instance of class Link or " - "an element of ['auto', 'identity', 'log', 'logit']; " + "an element of ['auto', 'identity', 'log']; " "got (link={0})".format(self.link)) if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: @@ -606,18 +605,127 @@ class PoissonRegressor(GeneralizedLinearRegressor): References ---------- - For the coordinate descent implementation: - * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - An Improved GLMNET for L1-regularized Logistic Regression, - Journal of Machine Learning Research 13 (2012) 1999-2030 - https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, + Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. Jørgensen, B. (1992). The theory of exponential dispersion models + and analysis of deviance. Monografias de matemática, no. 51. See also + `Exponential dispersion model. + `_ """ - def __init__(self, alpha=1.0, fit_intercept=True, fit_dispersion=None, - solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, - copy_X=True, check_input=True, verbose=0): + def __init__(self, alpha=1.0, fit_intercept=True, link='log', + fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, + warm_start=False, copy_X=True, check_input=True, verbose=0): + + super().__init__(alpha=alpha, fit_intercept=fit_intercept, + family="poisson", link=link, + fit_dispersion=fit_dispersion, solver=solver, + max_iter=max_iter, tol=tol, warm_start=warm_start, + copy_X=copy_X, verbose=verbose) + + +class GammaRegressor(GeneralizedLinearRegressor): + """Regression with the response variable y following a Gamma distribution + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as mu=h(X*w). + The fit minimizes the following objective function with L2 regularization:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 + + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms and thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) + Method for estimation of the dispersion parameter phi. Whether to use + the chi squared statistic or the deviance statistic. If None, the + dispersion is not estimated. 
+ + solver : {'lbfgs'}, optional (default='lbfgs') + Algorithm to use in the optimization problem: + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + max_iter : int, optional (default=100) + The maximal number of iterations for solver algorithms. + + tol : float, optional (default=1e-4) + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative) of + the objective function. + + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` . + + copy_X : boolean, optional, (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array, shape (n_features,) + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. + + n_iter_ : int + Actual number of iterations used in solver. + + Notes + ----- + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + :ref:`User Guide `. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. + + + References + ---------- + .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, + Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. Jørgensen, B. (1992). The theory of exponential dispersion models + and analysis of deviance. Monografias de matemática, no. 51. See also + `Exponential dispersion model. 
+ `_ + """ + def __init__(self, alpha=1.0, fit_intercept=True, link='log', + fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, + warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, - family="poisson", link='log', + family="gamma", link=link, fit_dispersion=fit_dispersion, solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, copy_X=copy_X, verbose=verbose) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 2909c85ba8c40..089f251d77049 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -77,8 +77,7 @@ def test_glm_family_argument(f, fam): @pytest.mark.parametrize('l, link', [('identity', IdentityLink()), - ('log', LogLink()), - ('logit', LogitLink())]) + ('log', LogLink())]) def test_glm_link_argument(l, link): """Test GLM link argument set as string.""" y = np.array([0.1, 0.5]) # in range of all distributions From 03a8a2d06acaf1b2d57bdef51991fecb7482cf75 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 11:12:48 +0200 Subject: [PATCH 100/209] Improve documentation wording --- doc/modules/linear_model.rst | 41 ++++++++++--------- ...plot_poisson_regression_non_normal_loss.py | 5 +-- sklearn/linear_model/__init__.py | 6 ++- sklearn/linear_model/_glm/tests/test_glm.py | 1 - 4 files changed, 27 insertions(+), 26 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index b6f7c2b82c1f5..0eaaab2ecded5 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -893,7 +893,7 @@ to warm-starting (see :term:`Glossary `). Generalized Linear Regression ============================= -:class:`GeneralizedLinearRegressor` generalizes linear models in two ways +Generalized Linear Models (GLM) extend linear models in two ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as @@ -901,13 +901,31 @@ combination of the input variables :math:`X` via an inverse link function .. math:: \hat{y}(w, x) = h(x^\top w) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [11]_. The objective function being -minimized becomes +exponential dispersion model (EDM) [11]_. The minimized objective function is +the penalized negative log likelihood, .. math:: \frac{1}{2 \sum s_i}D(y, \hat{y}; s) +\frac{\alpha}{2} ||w||_2 with sample weights :math:`s`, and L2 regularization penalty :math:`\alpha`. +The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` +likelihood as + +.. 
math:: d(y, \mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) + - loglike(y,y,\phi)\right) \\ + D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) + +===================================== =============================== ================================= ============================================ +Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` +===================================== =============================== ================================= ============================================ +Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` +Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +===================================== =============================== ================================= ============================================ + + In the following use cases, a loss different from the squared loss might be appropriate, @@ -968,23 +986,6 @@ The objective function (the penalized negative log likelihood) is independent of :math:`\phi` and is minimized with respect to the coefficients :math:`w`. -The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` -likelihood as - -.. math:: d(y, \mu) = -2\phi\cdot - \left(loglike(y,\mu,\phi) - - loglike(y,y,\phi)\right) \\ - D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) - -===================================== =============================== ================================= ============================================ -Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` -===================================== =============================== ================================= ============================================ -Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` -Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` -Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` -Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` -===================================== =============================== ================================= ============================================ - Two remarks: * The deviances for at least Normal, Poisson and Gamma distributions are diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 713866a712aea..695a7c7594472 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -4,9 +4,8 @@ ====================================== This example illustrates the use of linear Poisson regression -on the French Motor Third-Party Liability Claims dataset [1] and compare -it with learning models with least squared error. - +on the French Motor Third-Party Liability Claims dataset [1] and compares +it with models learned with least squared error. We start by defining a few helper functions for loading the data and visualizing results. 
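[Editorial sketch, not part of this diff: the closed-form unit deviances tabulated in the linear_model.rst hunk above can be checked numerically against the general Tweedie deviance. This assumes the `mean_tweedie_deviance` metric from this branch, with the `p` keyword used throughout this series:]

    import numpy as np
    from scipy.special import xlogy
    from sklearn.metrics import mean_tweedie_deviance

    y, mu = 2.0, 1.5
    poisson_dev = 2 * (xlogy(y, y / mu) - y + mu)   # table row for the Poisson case (p=1)
    gamma_dev = 2 * (np.log(mu / y) + y / mu - 1)   # table row for the Gamma case (p=2)
    # Both should match the general Tweedie deviance at the corresponding power:
    print(np.isclose(mean_tweedie_deviance([y], [mu], p=1), poisson_dev))  # True
    print(np.isclose(mean_tweedie_deviance([y], [mu], p=2), gamma_dev))    # True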
diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index d4227a126e5ec..63a52a9c9898e 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,7 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from ._glm import (GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor) +from ._glm import (GeneralizedLinearRegressor, PoissonRegressor, + GammaRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -81,4 +82,5 @@ 'ridge_regression', 'RANSACRegressor', 'GeneralizedLinearRegressor', - 'PoissonRegressor'] + 'PoissonRegressor', + 'GammaRegressor'] diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 089f251d77049..542c18b65cad2 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -11,7 +11,6 @@ from sklearn.linear_model._glm.link import ( IdentityLink, LogLink, - LogitLink, ) from sklearn.linear_model._glm.distribution import ( TweedieDistribution, From bbf7f38fa69a9335397fa1cdfd73d153cbc3670d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 11:29:54 +0200 Subject: [PATCH 101/209] Use dataframe OpenML fetcher --- ...plot_poisson_regression_non_normal_loss.py | 19 ++++++++----------- ...lot_tweedie_regression_insurance_claims.py | 19 ++++++++----------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 695a7c7594472..347a424d8f4fe 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -25,6 +25,7 @@ import matplotlib.pyplot as plt import pandas as pd +from sklearn.datasets import fetch_openml from sklearn.compose import ColumnTransformer from sklearn.linear_model import PoissonRegressor, LinearRegression from sklearn.model_selection import train_test_split @@ -46,20 +47,16 @@ def load_mtpl2(n_samples=100000): number of samples to select (for faster run time). 
""" - # Note: this should use the OpenML DataFrame fetcher in the future - df_freq = pd.read_csv( - "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.csv", - dtype={"IDpol": np.int}, - index_col=0, - ) + # freMTPL2freq dataset from https://www.openml.org/d/41214 + df_freq = fetch_openml(data_id=41214, as_frame=True)['data'] + df_freq['IDpol'] = df_freq['IDpol'].astype(np.int) + df_freq.set_index('IDpol', inplace=True) - df_sev = pd.read_csv( - "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", - index_col=0, - ) + # freMTPL2sev dataset from https://www.openml.org/d/41215 + df_sev = fetch_openml(data_id=41215, as_frame=True)['data'] # sum ClaimAmount over identical IDs - df_sev = df_sev.groupby(level=0).sum() + df_sev = df_sev.groupby('IDpol').sum() df = df_freq.join(df_sev, how="left") df["ClaimAmount"].fillna(0, inplace=True) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 55d0ca24ce477..418127699dc1a 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -38,6 +38,7 @@ import matplotlib.pyplot as plt import pandas as pd +from sklearn.datasets import fetch_openml from sklearn.compose import ColumnTransformer from sklearn.linear_model import PoissonRegressor, GammaRegressor from sklearn.linear_model._glm import GeneralizedLinearRegressor @@ -59,20 +60,16 @@ def load_mtpl2(n_samples=100000): number of samples to select (for faster run time). """ - # Note: this should use the OpenML DataFrame fetcher in the future - df_freq = pd.read_csv( - "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.csv", - dtype={"IDpol": np.int}, - index_col=0, - ) + # freMTPL2freq dataset from https://www.openml.org/d/41214 + df_freq = fetch_openml(data_id=41214, as_frame=True)['data'] + df_freq['IDpol'] = df_freq['IDpol'].astype(np.int) + df_freq.set_index('IDpol', inplace=True) - df_sev = pd.read_csv( - "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", - index_col=0, - ) + # freMTPL2sev dataset from https://www.openml.org/d/41215 + df_sev = fetch_openml(data_id=41215, as_frame=True)['data'] # sum ClaimAmount over identical IDs - df_sev = df_sev.groupby(level=0).sum() + df_sev = df_sev.groupby('IDpol').sum() df = df_freq.join(df_sev, how="left") df["ClaimAmount"].fillna(0, inplace=True) From 49a3a8e116322c59f56b9d0b3fdb77028b144dd3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 12:00:26 +0200 Subject: [PATCH 102/209] Refactor distibution bounds --- sklearn/linear_model/_glm/distribution.py | 111 +++++------------- .../_glm/tests/test_distribution.py | 21 +++- 2 files changed, 50 insertions(+), 82 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index b99e1b40b2871..e6635d2003850 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -6,6 +6,7 @@ # License: BSD 3 clause from abc import ABCMeta, abstractmethod +from collections import namedtuple import numbers import numpy as np @@ -20,6 +21,10 @@ def _safe_lin_pred(X, coef): return X @ coef +DistributionBoundary = namedtuple("DistributionBoundary", + ("value", "inclusive")) + + class ExponentialDispersionModel(metaclass=ABCMeta): r"""Base class for reproductive Exponential Dispersion Models (EDM). 
@@ -35,13 +40,6 @@ class ExponentialDispersionModel(metaclass=ABCMeta): unit variance :math:`v(\mu)` and unit deviance :math:`d(y,\mu)`. - Attributes - ---------- - lower_bound - upper_bound - include_lower_bound - include_upper_bound - Methods ------- deviance @@ -52,55 +50,33 @@ class ExponentialDispersionModel(metaclass=ABCMeta): unit_variance unit_variance_derivative - _mu_deviance_derivative - References ---------- - https://en.wikipedia.org/wiki/Exponential_dispersion_model. """ - @property - def lower_bound(self): - """Get the lower bound of values for Y~EDM.""" - return self._lower_bound - @property - def upper_bound(self): - """Get the upper bound of values for Y~EDM.""" - return self._upper_bound - - @property - def include_lower_bound(self): - """Get True if lower bound for y is included: y >= lower_bound.""" - return self._include_lower_bound - - @property - def include_upper_bound(self): - """Get True if upper bound for y is included: y <= upper_bound.""" - return self._include_upper_bound - - def in_y_range(self, x): - """Returns ``True`` if x is in the valid range of Y~EDM. + def in_y_range(self, y): + """Returns ``True`` if y is in the valid range of Y~EDM. Parameters ---------- - x : array, shape (n_samples,) + y : array, shape (n_samples,) Target values. """ - if self.include_lower_bound: - if self.include_upper_bound: - return np.logical_and(np.greater_equal(x, self.lower_bound), - np.less_equal(x, self.upper_bound)) - else: - return np.logical_and(np.greater_equal(x, self.lower_bound), - np.less(x, self.upper_bound)) + if hasattr(self, '_upper_bound'): + # All currently supported distributions have an upper bound at + # +inf, however this may need to be implemented for other + # distributions + raise NotImplementedError + + if not isinstance(self._lower_bound, DistributionBoundary): + raise TypeError('_lower_bound attribute must be of type ' + 'DistributionBoundary') + + if self._lower_bound.inclusive: + return np.greater_equal(y, self._lower_bound.value) else: - if self.include_upper_bound: - return np.logical_and(np.greater(x, self.lower_bound), - np.less_equal(x, self.upper_bound)) - else: - return np.logical_and(np.greater(x, self.lower_bound), - np.less(x, self.upper_bound)) + return np.greater(y, self._lower_bound.value) @abstractmethod def unit_variance(self, mu): @@ -265,42 +241,17 @@ def power(self, power): raise TypeError('power must be a real number, input was {0}' .format(power)) - self._upper_bound = np.Inf - self._include_upper_bound = False - if power < 0: - # Extreme Stable - self._lower_bound = -np.Inf - self._include_lower_bound = False - elif power == 0: - # NormalDistribution - self._lower_bound = -np.Inf - self._include_lower_bound = False - elif (power > 0) and (power < 1): + if power <= 0: + # Extreme Stable or Normal distribution + self._lower_bound = DistributionBoundary(-np.Inf, inclusive=False) + elif 0 < power < 1: raise ValueError('For 0 1) and (power < 2): - # Compound Poisson - self._lower_bound = 0 - self._include_lower_bound = True - elif power == 2: - # GammaDistribution - self._lower_bound = 0 - self._include_lower_bound = False - elif (power > 2) and (power < 3): - # Positive Stable - self._lower_bound = 0 - self._include_lower_bound = False - elif power == 3: - # InverseGaussianDistribution - self._lower_bound = 0 - self._include_lower_bound = False - elif power > 3: - # Positive Stable - self._lower_bound = 0 - self._include_lower_bound = False + elif 1 <= power < 2: + # Poisson or Compound Poisson distribution + self._lower_bound = 
DistributionBoundary(0, inclusive=True) + elif power >= 2: + # Gamma, Positive Stable, Inverse Gaussian distributions + self._lower_bound = DistributionBoundary(0, inclusive=False) else: # pragma: no cover # this branch should be unreachable. raise ValueError diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/linear_model/_glm/tests/test_distribution.py index f984077bed5cb..f9c329f35caa4 100644 --- a/sklearn/linear_model/_glm/tests/test_distribution.py +++ b/sklearn/linear_model/_glm/tests/test_distribution.py @@ -13,6 +13,7 @@ TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, + DistributionBoundary ) @@ -30,6 +31,19 @@ def test_family_bounds(family, expected): assert_array_equal(result, expected) +def test_invalid_distribution_bound(): + dist = TweedieDistribution() + dist._lower_bound = 0 + with pytest.raises(TypeError, + match="must be of type DistributionBoundary"): + dist.in_y_range([-1, 0, 1]) + + dist = TweedieDistribution() + dist._upper_bound = None + with pytest.raises(NotImplementedError): + dist.in_y_range([-1, 0, 1]) + + def test_tweedie_distribution_power(): with pytest.raises(ValueError, match="no distribution exists"): TweedieDistribution(power=0.5) @@ -42,9 +56,12 @@ def test_tweedie_distribution_power(): dist.power = 1j dist = TweedieDistribution() - assert dist._include_lower_bound is False + assert isinstance(dist._lower_bound, DistributionBoundary) + + assert dist._lower_bound.inclusive is False dist.power = 1 - assert dist._include_lower_bound is True + assert dist._lower_bound.value == 0.0 + assert dist._lower_bound.inclusive is True @pytest.mark.parametrize( From 228e8c8633d3bae7bbe9a36fa3fccb97567fd653 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 15:13:58 +0200 Subject: [PATCH 103/209] Move deviance checks under destribution --- sklearn/linear_model/_glm/distribution.py | 70 +++++++++++++++++-- .../_glm/tests/test_distribution.py | 3 +- sklearn/metrics/regression.py | 35 +--------- sklearn/metrics/tests/test_regression.py | 12 ++-- 4 files changed, 74 insertions(+), 46 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index e6635d2003850..c30996662114e 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -113,7 +113,7 @@ def unit_variance_derivative(self, mu): pass # pragma: no cover @abstractmethod - def unit_deviance(self, y, mu): + def unit_deviance(self, y, mu, check_input=False): r"""Compute the unit deviance. The unit_deviance :math:`d(y,\mu)` can be defined by the @@ -128,6 +128,14 @@ def unit_deviance(self, y, mu): mu : array, shape (n_samples,) Predicted mean. + + check_input : bool, default=False + If True raise an exception on invalid y or mu values, otherwise + they will be propagated as NaN. 
+ Returns + ------- + deviance: array, shape (n_samples,) + Computed deviance """ pass # pragma: no cover @@ -245,7 +253,8 @@ def power(self, power): # Extreme Stable or Normal distribution self._lower_bound = DistributionBoundary(-np.Inf, inclusive=False) elif 0 < power < 1: - raise ValueError('For 0=1.') elif 1 <= power < 2: # Poisson or Compound Poisson distribution self._lower_bound = DistributionBoundary(0, inclusive=True) @@ -279,15 +288,66 @@ def unit_variance_derivative(self, mu): """ return self.power * np.power(mu, self.power - 1) - def unit_deviance(self, y, mu): + def unit_deviance(self, y, mu, check_input=False): + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + check_input : bool, default=False + If True raise an exception on invalid y or mu values, otherwise + they will be propagated as NaN. + Returns + ------- + deviance: array, shape (n_samples,) + Computed deviance + """ p = self.power + + if check_input: + message = ("Mean Tweedie deviance error with p={} can only be " + "used on ".format(p)) + if p < 0: + # 'Extreme stable', y any realy number, mu > 0 + if (mu <= 0).any(): + raise ValueError(message + "strictly positive mu.") + elif p == 0: + # Normal, y and mu can be any real number + pass + elif 0 < p < 1: + raise ValueError("Tweedie deviance is only defined for p<=0 " + "and p>=1.") + elif 1 <= p < 2: + # Poisson and Compount poisson distribution, y >= 0, mu > 0 + if (y < 0).any() or (mu <= 0).any(): + raise ValueError(message + "non-negative y and strictly " + "positive mu.") + elif p >= 2: + # Gamma and Extreme stable distribution, y and mu > 0 + if (y <= 0).any() or (mu <= 0).any(): + raise ValueError(message + "strictly positive y and mu.") + else: # pragma: nocover + # Unreachable statement + raise ValueError + if p < 0: - # 'Extreme stable', y_true any realy number, y_pred > 0 + # 'Extreme stable', y any realy number, mu > 0 dev = 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p) * (2-p)) - y * np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + elif p == 0: - # Normal distribution, y_true and y_pred any real number + # Normal distribution, y and mu any real number dev = (y - mu)**2 elif p < 1: raise ValueError("Tweedie deviance is only defined for p<=0 and " diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/linear_model/_glm/tests/test_distribution.py index f9c329f35caa4..82e493b7a2149 100644 --- a/sklearn/linear_model/_glm/tests/test_distribution.py +++ b/sklearn/linear_model/_glm/tests/test_distribution.py @@ -45,7 +45,8 @@ def test_invalid_distribution_bound(): def test_tweedie_distribution_power(): - with pytest.raises(ValueError, match="no distribution exists"): + msg = "distribution is only defined for p<=0 and p>=1" + with pytest.raises(ValueError, match=msg): TweedieDistribution(power=0.5) with pytest.raises(TypeError, match="must be a real number"): diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index 2f6c442c1f824..538fd7eec4631 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -22,9 +22,7 @@ # Christian Lorentzen # License: BSD 3 clause - import numpy as np -from scipy.special import xlogy import warnings from ..utils.validation import (check_array, check_consistent_length, @@ -676,39 
+674,8 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): sample_weight = column_or_1d(sample_weight) sample_weight = sample_weight[:, np.newaxis] - message = ("Mean Tweedie deviance error with p={} can only be used on " - .format(p)) - if p < 0: - # 'Extreme stable', y_true any realy number, y_pred > 0 - if (y_pred <= 0).any(): - raise ValueError(message + "strictly positive y_pred.") - elif p == 0: - pass - elif p < 1: - raise ValueError("Tweedie deviance is only defined for p<=0 and " - "p>=1.") - elif p == 1: - # Poisson distribution, y_true >= 0, y_pred > 0 - if (y_true < 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "non-negative y_true and strictly " - "positive y_pred.") - elif p == 2: - # Gamma distribution, y_true and y_pred > 0 - if (y_true <= 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "strictly positive y_true and y_pred.") - else: - if p < 2: - # 1 < p < 2 is Compound Poisson, y_true >= 0, y_pred > 0 - if (y_true < 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "non-negative y_true and strictly " - "positive y_pred.") - else: - if (y_true <= 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "strictly positive y_true and " - "y_pred.") - dist = TweedieDistribution(power=p) - dev = dist.unit_deviance(y_true, y_pred) + dev = dist.unit_deviance(y_true, y_pred, check_input=True) return np.average(dev, weights=sample_weight) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 526c27f0a036c..d46bca1301b1e 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -101,35 +101,35 @@ def test_regression_metrics_at_limits(): assert_allclose(mean_tweedie_deviance([0], [1.], p=p), 2./(2.-p), rtol=1e-3) with pytest.raises(ValueError, - match="can only be used on strictly positive y_pred."): + match="can only be used on strictly positive mu."): mean_tweedie_deviance([0.], [0.], p=p) assert_almost_equal(mean_tweedie_deviance([0.], [0.], p=0), 0.00, 2) - msg = "only be used on non-negative y_true and strictly positive y_pred." + msg = "only be used on non-negative y and strictly positive mu." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], p=1.0) p = 1.5 assert_allclose(mean_tweedie_deviance([0.], [1.], p=p), 2./(2.-p)) - msg = "only be used on non-negative y_true and strictly positive y_pred." + msg = "only be used on non-negative y and strictly positive mu." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], p=p) p = 2. assert_allclose(mean_tweedie_deviance([1.], [1.], p=p), 0.00, atol=1e-8) - msg = "can only be used on strictly positive y_true and y_pred." + msg = "can only be used on strictly positive y and mu." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], p=p) p = 3. assert_allclose(mean_tweedie_deviance([1.], [1.], p=p), 0.00, atol=1e-8) - msg = "can only be used on strictly positive y_true and y_pred." + msg = "can only be used on strictly positive y and mu." 
with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], p=p) with pytest.raises(ValueError, - match="deviance is only defined for p<=0 and p>=1."): + match="is only defined for p<=0 and p>=1"): mean_tweedie_deviance([0.], [0.], p=0.5) From 09a57c9cfedfebb2452c50921260c310987a0057 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 18:12:55 +0200 Subject: [PATCH 104/209] Expose TweedieRegressor --- doc/modules/linear_model.rst | 6 +- ...lot_tweedie_regression_insurance_claims.py | 26 ++- sklearn/linear_model/__init__.py | 7 +- sklearn/linear_model/_glm/__init__.py | 14 +- sklearn/linear_model/_glm/glm.py | 154 ++++++++++++++++++ sklearn/linear_model/_glm/tests/test_glm.py | 38 ++++- 6 files changed, 221 insertions(+), 24 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 0eaaab2ecded5..cab918b06f3b0 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -949,10 +949,10 @@ Note that the feature matrix ``X`` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as follows: - >>> from sklearn.linear_model import GeneralizedLinearRegressor - >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> from sklearn.linear_model import TweedieRegressor + >>> reg = TweedieRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) - GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + TweedieRegressor(alpha=0.5, family='poisson', link='log') >>> reg.coef_ array([0.2463..., 0.4337...]) >>> reg.intercept_ diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 418127699dc1a..55a21c8d8723d 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -41,8 +41,7 @@ from sklearn.datasets import fetch_openml from sklearn.compose import ColumnTransformer from sklearn.linear_model import PoissonRegressor, GammaRegressor -from sklearn.linear_model._glm import GeneralizedLinearRegressor -from sklearn.linear_model._glm.distribution import TweedieDistribution +from sklearn.linear_model import TweedieRegressor from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder @@ -418,7 +417,6 @@ class ClaimProdEstimator: def __init__(self, est_freq, est_sev): self.est_freq = est_freq self.est_sev = est_sev - self._family_instance = TweedieDistribution(power=1.5) def predict(self, X, exposure): """Predict the total claim amount. @@ -429,11 +427,14 @@ def predict(self, X, exposure): def score(self, X, y, sample_weight=None): """Compute D², the percentage of deviance explained.""" + # TODO: remove this private import once d2_score is available + from sklearn.linear_model._glm.distribution import TweedieDistribution + mu = self.predict(X, exposure=sample_weight) - dev = self._family_instance.deviance(y, mu, weights=sample_weight) + family = TweedieDistribution(power=1.5) + dev = family.deviance(y, mu, weights=sample_weight) y_mean = np.average(y, weights=sample_weight) - dev_null = self._family_instance.deviance(y, y_mean, - weights=sample_weight) + dev_null = family.deviance(y, y_mean, weights=sample_weight) return 1. 
- dev / dev_null @@ -459,18 +460,13 @@ def score(self, X, y, sample_weight=None): from sklearn.model_selection import GridSearchCV -# this takes a while -params = { - "family": [ - TweedieDistribution(power=power) - # exclude upper bound as power=2 does not support null y samples. - for power in np.linspace(1 + 1e-4, 2 - 1e-4, 8) - ] -} +# exclude upper bound as power=2 does not support null y values. +params = {"power": np.linspace(1 + 1e-4, 2 - 1e-4, 8)} +# this takes a while glm_total = GridSearchCV( - GeneralizedLinearRegressor(tol=1e-3, max_iter=500), cv=3, + TweedieRegressor(tol=1e-3, max_iter=500), cv=3, param_grid=params, n_jobs=-1 ) glm_total.fit( diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 63a52a9c9898e..46d1efe63de2e 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,8 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from ._glm import (GeneralizedLinearRegressor, PoissonRegressor, - GammaRegressor) +from ._glm import (PoissonRegressor, + GammaRegressor, TweedieRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -83,4 +83,5 @@ 'RANSACRegressor', 'GeneralizedLinearRegressor', 'PoissonRegressor', - 'GammaRegressor'] + 'GammaRegressor', + 'TweedieRegressor'] diff --git a/sklearn/linear_model/_glm/__init__.py b/sklearn/linear_model/_glm/__init__.py index a7d4b67817730..3b5c0d95d6124 100644 --- a/sklearn/linear_model/_glm/__init__.py +++ b/sklearn/linear_model/_glm/__init__.py @@ -1,5 +1,15 @@ # License: BSD 3 clause -from .glm import GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor +from .glm import ( + GeneralizedLinearRegressor, + PoissonRegressor, + GammaRegressor, + TweedieRegressor +) -__all__ = ["GeneralizedLinearRegressor", "PoissonRegressor", "GammaRegressor"] +__all__ = [ + "GeneralizedLinearRegressor", + "PoissonRegressor", + "GammaRegressor", + "TweedieRegressor" +] diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 12486e188951a..ca33c45c76292 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -623,6 +623,15 @@ def __init__(self, alpha=1.0, fit_intercept=True, link='log', max_iter=max_iter, tol=tol, warm_start=warm_start, copy_X=copy_X, verbose=verbose) + @property + def family(self): + return "poisson" + + @family.setter + def family(self, value): + if value != "poisson": + raise ValueError("PoissonRegressor.family must be 'poisson'!") + class GammaRegressor(GeneralizedLinearRegressor): """Regression with the response variable y following a Gamma distribution @@ -729,3 +738,148 @@ def __init__(self, alpha=1.0, fit_intercept=True, link='log', fit_dispersion=fit_dispersion, solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, copy_X=copy_X, verbose=verbose) + + @property + def family(self): + return "gamma" + + @family.setter + def family(self, value): + if value != "gamma": + raise ValueError("GammaRegressor.family must be 'gamma'!") + + +class TweedieRegressor(GeneralizedLinearRegressor): + """Regression with the response variable y following a Tweedie distribution + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as mu=h(X*w). 
+ The fit minimizes the following objective function with L2 regularization:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 + + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + power : float (default=0) + The variance power: :math:`v(\mu) = \mu^{power}`. + For ``0`. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. + + + References + ---------- + .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, + Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. Jørgensen, B. (1992). The theory of exponential dispersion models + and analysis of deviance. Monografias de matemática, no. 51. See also + `Exponential dispersion model. + `_ + """ + def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, link='log', + fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, + warm_start=False, copy_X=True, check_input=True, verbose=0): + + super().__init__(alpha=alpha, fit_intercept=fit_intercept, + family=TweedieDistribution(power=power), link=link, + fit_dispersion=fit_dispersion, solver=solver, + max_iter=max_iter, tol=tol, warm_start=warm_start, + copy_X=copy_X, verbose=verbose) + + @property + def family(self): + dist = TweedieDistribution(power=self.power) + # TODO: make the returned object immutable + return dist + + @family.setter + def family(self, value): + if isinstance(value, TweedieDistribution): + self.power = value.power + else: + raise TypeError("TweedieRegressor.family must be of type " + "TweedieDistribution!") diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 542c18b65cad2..a56155fe03f22 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -7,7 +7,12 @@ import pytest from sklearn.datasets import make_regression -from sklearn.linear_model import GeneralizedLinearRegressor +from sklearn.linear_model._glm import GeneralizedLinearRegressor +from sklearn.linear_model import ( + TweedieRegressor, + PoissonRegressor, + GammaRegressor +) from sklearn.linear_model._glm.link import ( IdentityLink, LogLink, @@ -353,3 +358,34 @@ def test_convergence_warning(solver, regression_data): max_iter=1, tol=1e-20) with pytest.warns(ConvergenceWarning): est.fit(X, y) + + +def test_poisson_regression_family(regression_data): + est = PoissonRegressor() + est.family == "poisson" + + msg = "PoissonRegressor.family must be 'poisson'!" + with pytest.raises(ValueError, match=msg): + est.family = 0 + + +def test_gamma_regression_family(regression_data): + est = GammaRegressor() + est.family == "gamma" + + msg = "GammaRegressor.family must be 'gamma'!" + with pytest.raises(ValueError, match=msg): + est.family = 0 + + +def test_tweedie_regression_family(regression_data): + power = 2.0 + est = TweedieRegressor(power=power) + assert isinstance(est.family, TweedieDistribution) + assert est.family.power == power + msg = "TweedieRegressor.family must be of type TweedieDistribution!" 
+ with pytest.raises(TypeError, match=msg): + est.family = None + + # TODO: the following should not be allowed + # est.family.power = 2 From 4b485cac821d30ece504a43d0c3316140cec5d33 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 18:32:07 +0200 Subject: [PATCH 105/209] Improve documentation --- doc/modules/classes.rst | 2 +- doc/modules/linear_model.rst | 38 ++++++++++++++++++++++++++---------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 177cd0780f9be..e5b61faf352e4 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -751,7 +751,6 @@ Kernels: linear_model.ElasticNet linear_model.ElasticNetCV linear_model.GammaRegressor - linear_model.GeneralizedLinearRegressor linear_model.HuberRegressor linear_model.Lars linear_model.LarsCV @@ -781,6 +780,7 @@ Kernels: linear_model.SGDClassifier linear_model.SGDRegressor linear_model.TheilSenRegressor + linear_model.TweedieRegressor .. autosummary:: :toctree: generated/ diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index cab918b06f3b0..f4a48fcaf3acd 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -916,14 +916,14 @@ likelihood as - loglike(y,y,\phi)\right) \\ D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) -===================================== =============================== ================================= ============================================ -Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` -===================================== =============================== ================================= ============================================ -Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` -Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` -Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` -Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` -===================================== =============================== ================================= ============================================ +================= =============================== ================================= ============================================ +Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` +================= =============================== ================================= ============================================ +Normal :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` +Gamma :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +================= =============================== ================================= ============================================ In the following use cases, a loss different from the squared loss might be @@ -945,14 +945,32 @@ it is convenient to apply a link function different from the identity link :math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the log-link with :math:`h(x^\top w)=\exp(x^\top w)`. 
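[Editorial sketch, not part of this diff: the `family` properties added for the convenience estimators in this series can be inspected directly. A doctest-style illustration reflecting the behaviour on this branch:]

    >>> from sklearn.linear_model import PoissonRegressor, TweedieRegressor
    >>> PoissonRegressor().family                  # fixed by the subclass
    'poisson'
    >>> TweedieRegressor(power=1.5).family.power   # a TweedieDistribution built from ``power``
    1.5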
+:class:`linear_model.TweedieRegressor` implements a generalized linear model +for the Tweedie distribution, that allows to model any of the above mentionned +distribution using the appropriate power parameter `p`, + + - `p = 0`: Normal distribution. Specialized solvers such as + :class:`linear_model.Ridge`, :class:`linear_model.ElasticNet` are generally + more appropriate in this case. + + - `p = 1`: Poisson distribution. :class:`PoissonRegressor` is exposed for + convinience however it is strictly equivalent to `TweedieRegressor(power=1)`. + + - `p = 2`: Gamma distribution. :class:`GammaRegressor` is exposed for + convinience however it is also strictly equivalent to + `TweedieRegressor(power=2)`. + + - `p = 3`: Inverse Gamma distribution. + + Note that the feature matrix ``X`` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as follows: >>> from sklearn.linear_model import TweedieRegressor - >>> reg = TweedieRegressor(alpha=0.5, family='poisson', link='log') + >>> reg = TweedieRegressor(power=1, alpha=0.5, link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) - TweedieRegressor(alpha=0.5, family='poisson', link='log') + TweedieRegressor(alpha=0.5, power=1) >>> reg.coef_ array([0.2463..., 0.4337...]) >>> reg.intercept_ From aa0adf1e6304d9f8aa3497e2fa406244e8d3405b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 18:34:13 +0200 Subject: [PATCH 106/209] Lint --- sklearn/linear_model/_glm/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index ca33c45c76292..7d7ef099cd04a 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -766,7 +766,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): Parameters ---------- power : float (default=0) - The variance power: :math:`v(\mu) = \mu^{power}`. + The variance power: :math:`v(\\mu) = \\mu^{power}`. For ``0 Date: Tue, 30 Jul 2019 14:13:46 +0200 Subject: [PATCH 107/209] Fix __init__ --- sklearn/linear_model/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 46d1efe63de2e..9f696a14985c3 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -81,7 +81,6 @@ 'orthogonal_mp_gram', 'ridge_regression', 'RANSACRegressor', - 'GeneralizedLinearRegressor', 'PoissonRegressor', 'GammaRegressor', 'TweedieRegressor'] From 7a9d0674174af423b5bfe8290631db10c53a946b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 2 Aug 2019 15:39:19 +0200 Subject: [PATCH 108/209] Update doc/modules/linear_model.rst Co-Authored-By: Joel Nothman --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index f4a48fcaf3acd..8123eac5fc4a2 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -954,7 +954,7 @@ distribution using the appropriate power parameter `p`, more appropriate in this case. - `p = 1`: Poisson distribution. :class:`PoissonRegressor` is exposed for - convinience however it is strictly equivalent to `TweedieRegressor(power=1)`. + convenience however it is strictly equivalent to `TweedieRegressor(power=1)`. - `p = 2`: Gamma distribution. 
:class:`GammaRegressor` is exposed for convinience however it is also strictly equivalent to From 18b45037fa6cb4d172c4e1b3cb92c1c33c0700c4 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 2 Aug 2019 15:39:29 +0200 Subject: [PATCH 109/209] Update doc/modules/linear_model.rst Co-Authored-By: Joel Nothman --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 8123eac5fc4a2..d159a8ad77039 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -957,7 +957,7 @@ distribution using the appropriate power parameter `p`, convenience however it is strictly equivalent to `TweedieRegressor(power=1)`. - `p = 2`: Gamma distribution. :class:`GammaRegressor` is exposed for - convinience however it is also strictly equivalent to + convenience however it is strictly equivalent to `TweedieRegressor(power=2)`. - `p = 3`: Inverse Gamma distribution. From 29658d66ff34eb633f2728d69a956cde760c271e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 2 Aug 2019 15:39:40 +0200 Subject: [PATCH 110/209] Update doc/modules/linear_model.rst Co-Authored-By: Joel Nothman --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d159a8ad77039..02c01b674f467 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -965,7 +965,7 @@ distribution using the appropriate power parameter `p`, Note that the feature matrix ``X`` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as -follows: +follows:: >>> from sklearn.linear_model import TweedieRegressor >>> reg = TweedieRegressor(power=1, alpha=0.5, link='log') From 1ea70d3a0c2da89ff164aef9975080de66a1cebd Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 7 Aug 2019 08:22:56 +0200 Subject: [PATCH 111/209] Fix typos in documentation --- doc/modules/linear_model.rst | 13 +++++++------ sklearn/linear_model/_glm/distribution.py | 2 +- sklearn/linear_model/_glm/glm.py | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 02c01b674f467..ce160d341af5f 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -733,8 +733,8 @@ of a single trial are modeled using a `logistic function `_. Logistic regression is implemented in :class:`LogisticRegression`. -This implementation can fit binary, One-vs-Rest, or multinomial logistic -regression with optional :math:`\ell_1`, :math:`\ell_2` or Elastic-Net +This implementation can fit binary, One-vs-Rest, or multinomial logistic +regression with optional :math:`\ell_1`, :math:`\ell_2` or Elastic-Net regularization. .. note:: @@ -946,18 +946,19 @@ it is convenient to apply a link function different from the identity link log-link with :math:`h(x^\top w)=\exp(x^\top w)`. :class:`linear_model.TweedieRegressor` implements a generalized linear model -for the Tweedie distribution, that allows to model any of the above mentionned -distribution using the appropriate power parameter `p`, +for the Tweedie distribution, that allows to model any of the above mentioned +distributions using the appropriate power parameter `p`, - `p = 0`: Normal distribution. Specialized solvers such as :class:`linear_model.Ridge`, :class:`linear_model.ElasticNet` are generally more appropriate in this case. 
- `p = 1`: Poisson distribution. :class:`PoissonRegressor` is exposed for - convenience however it is strictly equivalent to `TweedieRegressor(power=1)`. + convenience. However, it is strictly equivalent to + `TweedieRegressor(power=1)`. - `p = 2`: Gamma distribution. :class:`GammaRegressor` is exposed for - convenience however it is strictly equivalent to + convenience. However, it is strictly equivalent to `TweedieRegressor(power=2)`. - `p = 3`: Inverse Gamma distribution. diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index c30996662114e..c7b4ec6c7836d 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -224,7 +224,7 @@ class TweedieDistribution(ExponentialDispersionModel): ===== ================ 0 Normal 1 Poisson - (0,1) Compound Poisson + (1,2) Compound Poisson 2 Gamma 3 Inverse Gaussian diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 7d7ef099cd04a..af411c38f87ca 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -776,7 +776,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): ===== ================ 0 Normal 1 Poisson - (0,1) Compound Poisson + (1,2) Compound Poisson 2 Gamma 3 Inverse Gaussian From efdcb5be1ed18d681c9b7c358b6f23adb6d0f795 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 16:58:46 +0300 Subject: [PATCH 112/209] Update doc/modules/linear_model.rst Co-Authored-By: Nicolas Hug --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index ce160d341af5f..67e4fbbde182c 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -964,7 +964,7 @@ distributions using the appropriate power parameter `p`, - `p = 3`: Inverse Gamma distribution. -Note that the feature matrix ``X`` should be standardized before fitting. This +Note that the feature matrix `X` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as follows:: From ef0d063ec406c73aaee715af4cf06e5ca1f3b78c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 16:59:26 +0300 Subject: [PATCH 113/209] Update doc/modules/linear_model.rst Co-Authored-By: Nicolas Hug --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 67e4fbbde182c..9645690d594f6 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -945,7 +945,7 @@ it is convenient to apply a link function different from the identity link :math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the log-link with :math:`h(x^\top w)=\exp(x^\top w)`. 
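
The "strictly equivalent" statements above lend themselves to a quick numerical spot check. The sketch below assumes that ``PoissonRegressor`` and ``TweedieRegressor`` from this series are importable from ``sklearn.linear_model``; both should reach the same coefficients because they minimize the same penalized deviance with the same solver::

    import numpy as np
    from sklearn.linear_model import PoissonRegressor, TweedieRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(50, 2))
    y = rng.poisson(np.exp(X @ np.array([0.5, -0.2])))

    a = PoissonRegressor(alpha=1.0).fit(X, y)
    b = TweedieRegressor(power=1, alpha=1.0, link='log').fit(X, y)
    # Coefficients should agree up to the numerical tolerance of the solver.
    print(np.allclose(a.coef_, b.coef_))
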
-:class:`linear_model.TweedieRegressor` implements a generalized linear model +:class:`TweedieRegressor` implements a generalized linear model for the Tweedie distribution, that allows to model any of the above mentioned distributions using the appropriate power parameter `p`, From 0125e1cbe9105f8d4f4cf08ee652add64deee384 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 16:59:59 +0300 Subject: [PATCH 114/209] Update doc/modules/linear_model.rst Co-Authored-By: Nicolas Hug --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 9645690d594f6..fe52cad236392 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -950,7 +950,7 @@ for the Tweedie distribution, that allows to model any of the above mentioned distributions using the appropriate power parameter `p`, - `p = 0`: Normal distribution. Specialized solvers such as - :class:`linear_model.Ridge`, :class:`linear_model.ElasticNet` are generally + :class:`Ridge`, :class:`ElasticNet` are generally more appropriate in this case. - `p = 1`: Poisson distribution. :class:`PoissonRegressor` is exposed for From 6a8a600258c8be783f02dad7519fc2aa012418eb Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 17:02:46 +0300 Subject: [PATCH 115/209] Update examples/linear_model/plot_poisson_regression_non_normal_loss.py Co-Authored-By: Nicolas Hug --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 347a424d8f4fe..6cee852866018 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -238,7 +238,7 @@ def load_mtpl2(n_samples=100000): y_pred = model.predict(X_train) pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=ax[idx+1]) - ax[idx+1].set_title(model.__class__.__name__) + ax[idx + 1].set_title(model.__class__.__name__) for axi in ax: axi.set( From 73f3bd1f646ec1dc1b9a3808149cc185d3fdea0d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 17:13:00 +0300 Subject: [PATCH 116/209] Rename inverse.gaussian to inverse-gaussian --- sklearn/linear_model/_glm/distribution.py | 2 +- sklearn/linear_model/_glm/glm.py | 6 +++--- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index c7b4ec6c7836d..950fa3fbb03e7 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -393,5 +393,5 @@ def __init__(self): 'normal': NormalDistribution, 'poisson': PoissonDistribution, 'gamma': GammaDistribution, - 'inverse.gaussian': InverseGaussianDistribution, + 'inverse-gaussian': InverseGaussianDistribution, } diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index af411c38f87ca..86f4c544d8f84 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -57,7 +57,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). 
- family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} \ + family : {'normal', 'poisson', 'gamma', 'inverse-gaussian'} \ or an instance of class ExponentialDispersionModel, \ optional(default='normal') The distributional assumption of the GLM, i.e. which distribution from @@ -71,7 +71,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'identity' for family 'normal' - - 'log' for families 'poisson', 'gamma', 'inverse.gaussian' + - 'log' for families 'poisson', 'gamma', 'inverse-gaussian' fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) Method for estimation of the dispersion parameter phi. Whether to use @@ -217,7 +217,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError( "The family must be an instance of class" " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian']" + " ['normal', 'poisson', 'gamma', 'inverse-gaussian']" "; got (family={0})".format(self.family)) # Guarantee that self._link_instance is set to an instance of diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index a56155fe03f22..12edbefbf2833 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -65,7 +65,7 @@ def test_sample_weights_validation(): [('normal', NormalDistribution()), ('poisson', PoissonDistribution()), ('gamma', GammaDistribution()), - ('inverse.gaussian', InverseGaussianDistribution())]) + ('inverse-gaussian', InverseGaussianDistribution())]) def test_glm_family_argument(f, fam): """Test GLM family argument set as string.""" y = np.array([0.1, 0.5]) # in range of all distributions From 11b178fffc7d279596acd122c7b6c33ba4b84a2f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 17:17:42 +0300 Subject: [PATCH 117/209] Remove sample_weight parameter from predict --- sklearn/linear_model/_glm/glm.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 86f4c544d8f84..921317cd0965a 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -388,32 +388,25 @@ def _linear_predictor(self, X): allow_nd=False) return X @ self.coef_ + self.intercept_ - def predict(self, X, sample_weight=None): + def predict(self, X): """Predict using GLM with feature matrix X. - If sample_weight is given, returns prediction*sample_weight. - Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) Samples. - sample_weight : {None, array-like}, shape (n_samples,), optional \ - (default=None) - Returns ------- C : array, shape (n_samples,) - Returns predicted values times sample_weight. + Returns predicted values. """ X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', ensure_2d=True, allow_nd=False) eta = self._linear_predictor(X) mu = self._link_instance.inverse(eta) - weights = _check_sample_weight(sample_weight, X) - - return mu*weights + return mu def estimate_phi(self, X, y, sample_weight=None): """Estimate/fit the dispersion parameter phi. 
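
After this change, ``predict`` only takes ``X`` and returns the estimated mean ``h(X @ coef_ + intercept_)``; sample weights enter the objective during ``fit`` but never rescale predictions. A minimal sketch of the resulting behaviour, assuming ``PoissonRegressor`` from this series::

    import numpy as np
    from sklearn.linear_model import PoissonRegressor

    X = np.array([[1.0], [2.0], [3.0], [4.0]])
    y = np.array([1.0, 2.0, 3.0, 5.0])
    s = np.array([1.0, 2.0, 1.0, 0.5])   # used only while fitting

    reg = PoissonRegressor(alpha=0.1).fit(X, y, sample_weight=s)
    # Predicted means mu; no sample_weight argument and no rescaling by s.
    print(reg.predict(X))
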
From 3806fbe05c164d3fd74aada0b8a068c43d3998cd Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 17:22:20 +0300 Subject: [PATCH 118/209] Remove redundant check_array in predict --- sklearn/linear_model/_glm/glm.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 921317cd0965a..0ab9e8c1db777 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -384,7 +384,7 @@ def _linear_predictor(self, X): """ check_is_fitted(self, "coef_") X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype='numeric', copy=True, ensure_2d=True, + dtype='numeric', ensure_2d=True, allow_nd=False) return X @ self.coef_ + self.intercept_ @@ -401,9 +401,7 @@ def predict(self, X): C : array, shape (n_samples,) Returns predicted values. """ - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype='numeric', ensure_2d=True, - allow_nd=False) + # check_array is done in _linear_predictor eta = self._linear_predictor(X) mu = self._link_instance.inverse(eta) return mu From ae1c6721876fe5e9f1f14cd092fb198067899457 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 11 Aug 2019 23:11:52 +0200 Subject: [PATCH 119/209] Update doc/modules/linear_model.rst --- doc/modules/linear_model.rst | 66 +++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index fe52cad236392..09b657c26b915 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -900,30 +900,31 @@ combination of the input variables :math:`X` via an inverse link function .. math:: \hat{y}(w, x) = h(x^\top w) = h(w_0 + w_1 x_1 + ... + w_p x_p). -Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [11]_. The minimized objective function is -the penalized negative log likelihood, +Secondly, the squared loss function is replaced by the unit deviance :math:`d` +of a reproductive exponential dispersion model (EDM) [11]_. The minimization +problem becomes -.. math:: \frac{1}{2 \sum s_i}D(y, \hat{y}; s) +\frac{\alpha}{2} ||w||_2 +.. math:: \min_{w} \frac{1}{2 \sum_i s_i} \sum_i s_i \cdot d(y_i, \hat{y}(w, x_i)) + \frac{\alpha}{2} ||w||_2 with sample weights :math:`s`, and L2 regularization penalty :math:`\alpha`. - -The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` +The unit deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` likelihood as .. math:: d(y, \mu) = -2\phi\cdot \left(loglike(y,\mu,\phi) - - loglike(y,y,\phi)\right) \\ - D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) + - loglike(y,y,\phi)\right) + +The following table lists some specific EDM distributions—all are Tweedie +distributions—and some properties. 
-================= =============================== ================================= ============================================ -Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` -================= =============================== ================================= ============================================ -Normal :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` -Poisson :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` -Gamma :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` -Inverse Gaussian :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` -================= =============================== ================================= ============================================ +================= =============================== ====================================== ============================================ +Distribution Target Domain Unit Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` +================= =============================== ====================================== ============================================ +Normal :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` +Gamma :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +================= =============================== ====================================== ============================================ In the following use cases, a loss different from the squared loss might be @@ -947,7 +948,8 @@ log-link with :math:`h(x^\top w)=\exp(x^\top w)`. :class:`TweedieRegressor` implements a generalized linear model for the Tweedie distribution, that allows to model any of the above mentioned -distributions using the appropriate power parameter `p`, +distributions using the appropriate power parameter `p`, i.e. the exponent of +the unit variance function, - `p = 0`: Normal distribution. Specialized solvers such as :class:`Ridge`, :class:`ElasticNet` are generally @@ -964,9 +966,16 @@ distributions using the appropriate power parameter `p`, - `p = 3`: Inverse Gamma distribution. -Note that the feature matrix `X` should be standardized before fitting. This -ensures that the penalty treats features equally. The estimator can be used as -follows:: +Note: +* The feature matrix `X` should be standardized before fitting. This + ensures that the penalty treats features equally. +* If you want to model a relative frequency, i.e. counts per exposure (time, + volume, ...) you can do so by a Poisson distribution and passing + :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together + with :math:`s=\mathrm{exposure}` as sample weights. This is done in both + examples linked below. + +The estimator can be used as follows:: >>> from sklearn.linear_model import TweedieRegressor >>> reg = TweedieRegressor(power=1, alpha=0.5, link='log') @@ -993,7 +1002,7 @@ In the unpenalized case, the assumptions are the following: with expectation :math:`\mu_i=\mathrm{E}[Y]`, dispersion parameter :math:`\phi` and sample weights :math:`s_i`. * The aim is to predict the expectation :math:`\mu_i` with - :math:`\hat{y_i} = h(\eta_i)`, linear predictor + :math:`\hat{y}_i = h(\eta_i)`, linear predictor :math:`\eta_i=(Xw)_i` and inverse link function :math:`h(\eta)`. 
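
These assumptions can be made concrete with a small simulation: draw Poisson targets whose mean is ``exp(X @ w)`` and check that the fitted coefficients approximately recover ``w``. The sketch below assumes ``PoissonRegressor`` (log link) from this series and uses an almost vanishing penalty::

    import numpy as np
    from sklearn.linear_model import PoissonRegressor

    rng = np.random.RandomState(42)
    w = np.array([0.5, -0.25])
    X = rng.uniform(-1, 1, size=(10000, 2))
    y = rng.poisson(np.exp(X @ w))   # E[Y_i] = h(eta_i) = exp((X w)_i)

    reg = PoissonRegressor(alpha=1e-6, fit_intercept=False).fit(X, y)
    print(reg.coef_)   # approximately [0.5, -0.25], up to sampling noise
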
Note that the first assumption implies @@ -1001,21 +1010,16 @@ Note that the first assumption implies function :math:`v(\mu)`. Specifying a particular distribution of an EDM is the same as specifying a unit variance function (they are one-to-one). -The objective function (the penalized negative log likelihood) is -independent of :math:`\phi` and is minimized with respect to the -coefficients :math:`w`. - -Two remarks: +A few remarks: +* The deviance is independent of :math:`\phi`. Therefore, also the estimation + of the coefficients :math:`w` is independent of the dispersion parameter of + the EDM. +* The minimization is equivalent to (penalized) maximum likelihood estimation. * The deviances for at least Normal, Poisson and Gamma distributions are strictly consistent scoring functions for the mean :math:`\mu`, see Eq. (19)-(20) in [12]_. -* If you want to model a frequency, i.e. counts per exposure (time, volume, ...) - you can do so by a Poisson distribution and passing - :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together - with :math:`s=\mathrm{exposure}` as sample weights. - .. topic:: References: From f07c831c29bc37ab0b922e1482d9f027f84049c1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 11 Aug 2019 23:32:57 +0200 Subject: [PATCH 120/209] Remove dispersion --- sklearn/linear_model/_glm/glm.py | 116 +++----------------- sklearn/linear_model/_glm/tests/test_glm.py | 18 --- 2 files changed, 15 insertions(+), 119 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 0ab9e8c1db777..cf9b00527c8cf 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -73,11 +73,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'log' for families 'poisson', 'gamma', 'inverse-gaussian' - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) - Method for estimation of the dispersion parameter phi. Whether to use - the chi squared statistic or the deviance statistic. If None, the - dispersion is not estimated. - solver : {'auto', 'lbfgs'}, optional (default='auto') Algorithm to use in the optimization problem: @@ -124,9 +119,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): intercept_ : float Intercept (a.k.a. bias) added to linear predictor. - dispersion_ : float - The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. - n_iter_ : int Actual number of iterations used in solver. @@ -169,14 +161,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """ def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', - fit_dispersion=None, solver='auto', max_iter=100, - tol=1e-4, warm_start=False, + solver='auto', max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.fit_intercept = fit_intercept self.family = family self.link = link - self.fit_dispersion = fit_dispersion self.solver = solver self.max_iter = max_iter self.tol = tol @@ -310,8 +300,8 @@ def fit(self, X, y, sample_weight=None): weights = weights/weights_sum # initialization of coef = (intercept_, coef) - # Note: Since phi=self.dispersion_ does not enter the estimation - # of mu_i=E[y_i], set it to 1. + # Note: The dispersion parameter phi does not enter the estimation + # of mu_i=E[y_i]. if self.warm_start and hasattr(self, 'coef_'): if self.fit_intercept: @@ -363,10 +353,6 @@ def func(coef, X, y, weights, alpha, family, link): self.intercept_ = 0. 
self.coef_ = coef - if self.fit_dispersion in ['chisqr', 'deviance']: - # attention because of rescaling of weights - self.dispersion_ = self.estimate_phi(X, y, weights)*weights_sum - return self def _linear_predictor(self, X): @@ -406,51 +392,6 @@ def predict(self, X): mu = self._link_instance.inverse(eta) return mu - def estimate_phi(self, X, y, sample_weight=None): - """Estimate/fit the dispersion parameter phi. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Training data. - - y : array-like, shape (n_samples,) - Target values. - - sample_weight : {None, array-like}, shape (n_samples,), optional \ - (default=None) - Sample weights. - - Returns - ------- - phi : float - Dispersion parameter. - """ - check_is_fitted(self, "coef_") - _dtype = [np.float64, np.float32] - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - dtype=_dtype, y_numeric=True, multi_output=False) - n_samples, n_features = X.shape - weights = _check_sample_weight(sample_weight, X) - eta = X @ self.coef_ - if self.fit_intercept is True: - eta += self.intercept_ - n_features += 1 - if n_samples <= n_features: - raise ValueError("Estimation of dispersion parameter phi requires" - " more samples than features, got" - " samples=X.shape[0]={0} and" - " n_features=X.shape[1]+fit_intercept={1}." - .format(n_samples, n_features)) - mu = self._link_instance.inverse(eta) - if self.fit_dispersion == 'chisqr': - chisq = np.sum(weights*(y-mu)**2 / - self._family_instance.unit_variance(mu)) - return chisq/(n_samples - n_features) - elif self.fit_dispersion == 'deviance': - dev = self._family_instance.deviance(y, mu, weights) - return dev/(n_samples - n_features) - def score(self, X, y, sample_weight=None): """Compute D^2, the percentage of deviance explained. @@ -525,11 +466,6 @@ class PoissonRegressor(GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) - Method for estimation of the dispersion parameter phi. Whether to use - the chi squared statistic or the deviance statistic. If None, the - dispersion is not estimated. - solver : {'lbfgs'}, optional (default='lbfgs') Algorithm to use in the optimization problem: @@ -564,9 +500,6 @@ class PoissonRegressor(GeneralizedLinearRegressor): intercept_ : float Intercept (a.k.a. bias) added to linear predictor. - dispersion_ : float - The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. - n_iter_ : int Actual number of iterations used in solver. @@ -605,14 +538,13 @@ class PoissonRegressor(GeneralizedLinearRegressor): `_ """ def __init__(self, alpha=1.0, fit_intercept=True, link='log', - fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, check_input=True, verbose=0): + solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, + copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="poisson", link=link, - fit_dispersion=fit_dispersion, solver=solver, - max_iter=max_iter, tol=tol, warm_start=warm_start, - copy_X=copy_X, verbose=verbose) + solver=solver, max_iter=max_iter, tol=tol, + warm_start=warm_start, copy_X=copy_X, verbose=verbose) @property def family(self): @@ -652,11 +584,6 @@ class GammaRegressor(GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). 
- fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) - Method for estimation of the dispersion parameter phi. Whether to use - the chi squared statistic or the deviance statistic. If None, the - dispersion is not estimated. - solver : {'lbfgs'}, optional (default='lbfgs') Algorithm to use in the optimization problem: @@ -691,9 +618,6 @@ class GammaRegressor(GeneralizedLinearRegressor): intercept_ : float Intercept (a.k.a. bias) added to linear predictor. - dispersion_ : float - The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. - n_iter_ : int Actual number of iterations used in solver. @@ -721,14 +645,13 @@ class GammaRegressor(GeneralizedLinearRegressor): `_ """ def __init__(self, alpha=1.0, fit_intercept=True, link='log', - fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, check_input=True, verbose=0): + solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, + copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="gamma", link=link, - fit_dispersion=fit_dispersion, solver=solver, - max_iter=max_iter, tol=tol, warm_start=warm_start, - copy_X=copy_X, verbose=verbose) + solver=solver, max_iter=max_iter, tol=tol, + warm_start=warm_start, copy_X=copy_X, verbose=verbose) @property def family(self): @@ -783,11 +706,6 @@ class TweedieRegressor(GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) - Method for estimation of the dispersion parameter phi. Whether to use - the chi squared statistic or the deviance statistic. If None, the - dispersion is not estimated. - solver : {'lbfgs'}, optional (default='lbfgs') Algorithm to use in the optimization problem: @@ -822,9 +740,6 @@ class TweedieRegressor(GeneralizedLinearRegressor): intercept_ : float Intercept (a.k.a. bias) added to linear predictor. - dispersion_ : float - The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. - n_iter_ : int Actual number of iterations used in solver. 
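
With ``fit_dispersion`` and ``dispersion_`` removed, a Pearson-type estimate of the dispersion can still be computed by hand after fitting. The sketch below mirrors the chi-squared formula that is deleted in this patch, written out for the Poisson case where ``v(mu) = mu``; it is an illustration only, not functionality of the estimator::

    import numpy as np
    from sklearn.linear_model import PoissonRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(200, 3))
    y = rng.poisson(np.exp(X @ np.array([0.3, -0.5, 0.2])))

    reg = PoissonRegressor(alpha=1e-4).fit(X, y)
    mu = reg.predict(X)

    # Pearson chi-square estimate of phi:
    # sum((y - mu)**2 / v(mu)) / (n_samples - n_parameters), with v(mu) = mu here.
    dof = X.shape[0] - (X.shape[1] + 1)   # +1 for the fitted intercept
    phi_hat = np.sum((y - mu) ** 2 / mu) / dof
    print(phi_hat)   # close to 1 for a well specified Poisson model
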
@@ -852,14 +767,13 @@ class TweedieRegressor(GeneralizedLinearRegressor): `_ """ def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, link='log', - fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, check_input=True, verbose=0): + solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, + copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family=TweedieDistribution(power=power), link=link, - fit_dispersion=fit_dispersion, solver=solver, - max_iter=max_iter, tol=tol, warm_start=warm_start, - copy_X=copy_X, verbose=verbose) + solver=solver, max_iter=max_iter, tol=tol, + warm_start=warm_start, copy_X=copy_X, verbose=verbose) @property def family(self): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 12edbefbf2833..5bd80cbf76fcf 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -332,24 +332,6 @@ def test_solver_equivalence(params, regression_data): ) -def test_fit_dispersion(regression_data): - X, y = regression_data - - est1 = GeneralizedLinearRegressor() - est1.fit(X, y) - assert not hasattr(est1, "dispersion_") - - est2 = GeneralizedLinearRegressor(fit_dispersion="chisqr") - est2.fit(X, y) - assert isinstance(est2.dispersion_, float) - - est3 = GeneralizedLinearRegressor(fit_dispersion="deviance") - est3.fit(X, y) - assert isinstance(est3.dispersion_, float) - - assert_allclose(est2.dispersion_, est3.dispersion_) - - @pytest.mark.parametrize("solver", GLM_SOLVERS) def test_convergence_warning(solver, regression_data): X, y = regression_data From ebbbe9cd1c670e15aca835b92730f570c49e6e4c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 13 Aug 2019 08:44:50 +0200 Subject: [PATCH 121/209] Update doc/modules/linear_model.rst Co-Authored-By: Nicolas Hug --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 09b657c26b915..f352860b6826f 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -1003,7 +1003,7 @@ In the unpenalized case, the assumptions are the following: :math:`\phi` and sample weights :math:`s_i`. * The aim is to predict the expectation :math:`\mu_i` with :math:`\hat{y}_i = h(\eta_i)`, linear predictor - :math:`\eta_i=(Xw)_i` and inverse link function :math:`h(\eta)`. + :math:`\eta_i=(Xw)_i` and inverse link function :math:`h`. 
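
The sample weights :math:`s_i` in the assumptions above are also what makes the "counts per exposure" recipe from this section work: pass the observed frequency as target and the exposure as ``sample_weight``. A minimal sketch, assuming ``PoissonRegressor`` from this series::

    import numpy as np
    from sklearn.linear_model import PoissonRegressor

    counts = np.array([0.0, 1.0, 3.0, 2.0, 5.0])
    exposure = np.array([0.5, 1.0, 2.0, 1.0, 3.0])   # e.g. policy-years observed
    X = np.array([[0.0], [1.0], [2.0], [3.0], [4.0]])

    freq = counts / exposure                 # y = counts / exposure
    reg = PoissonRegressor(alpha=1.0).fit(X, freq, sample_weight=exposure)
    print(reg.predict(X))                    # predicted frequency per unit exposure
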
Note that the first assumption implies :math:`\mathrm{Var}[Y_i]=\frac{\phi}{s_i} v(\mu_i)` with unit variance From 918e2574ad99724d4d4a437af5283f32442d5752 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 13 Aug 2019 08:53:57 +0200 Subject: [PATCH 122/209] Update doc/modules/linear_model.rst --- doc/modules/linear_model.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 09b657c26b915..32b7494ba7c00 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -927,6 +927,9 @@ Inverse Gaussian :math:`y \in (0, \infty)` :math:`\mu^3` ================= =============================== ====================================== ============================================ +Usage +----- + In the following use cases, a loss different from the squared loss might be appropriate, @@ -944,7 +947,7 @@ Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link :math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the -log-link with :math:`h(x^\top w)=\exp(x^\top w)`. +log-link `link='log'` with :math:`h(x^\top w)=\exp(x^\top w)`. :class:`TweedieRegressor` implements a generalized linear model for the Tweedie distribution, that allows to model any of the above mentioned @@ -1018,7 +1021,9 @@ A few remarks: * The minimization is equivalent to (penalized) maximum likelihood estimation. * The deviances for at least Normal, Poisson and Gamma distributions are strictly consistent scoring functions for the mean :math:`\mu`, see Eq. - (19)-(20) in [12]_. + (19)-(20) in [12]_. This means that, given an appropriate feature matrix `X`, + you get good (asymptotic) estimators for the expectation when using these + deviances. .. topic:: References: From 37d0f47bac45c27d01f949a835ecddad471b8d42 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:14:20 +0300 Subject: [PATCH 123/209] Use double `` when necessary --- doc/modules/linear_model.rst | 10 +++++----- sklearn/linear_model/_glm/glm.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d0e3f9542a641..7bb684e8bcf87 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -951,22 +951,22 @@ log-link `link='log'` with :math:`h(x^\top w)=\exp(x^\top w)`. :class:`TweedieRegressor` implements a generalized linear model for the Tweedie distribution, that allows to model any of the above mentioned -distributions using the appropriate power parameter `p`, i.e. the exponent of +distributions using the appropriate power parameter ``p``, i.e. the exponent of the unit variance function, - - `p = 0`: Normal distribution. Specialized solvers such as + - ``p = 0``: Normal distribution. Specialized solvers such as :class:`Ridge`, :class:`ElasticNet` are generally more appropriate in this case. - - `p = 1`: Poisson distribution. :class:`PoissonRegressor` is exposed for + - ``p = 1``: Poisson distribution. :class:`PoissonRegressor` is exposed for convenience. However, it is strictly equivalent to `TweedieRegressor(power=1)`. - - `p = 2`: Gamma distribution. :class:`GammaRegressor` is exposed for + - ``p = 2``: Gamma distribution. :class:`GammaRegressor` is exposed for convenience. However, it is strictly equivalent to `TweedieRegressor(power=2)`. 
- - `p = 3`: Inverse Gamma distribution. + - ``p = 3``: Inverse Gamma distribution. Note: diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index cf9b00527c8cf..baed5fecb2b97 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -397,7 +397,7 @@ def score(self, X, y, sample_weight=None): D^2 is a generalization of the coefficient of determination R^2. R^2 uses squared error and D^2 deviance. Note that those two are equal - for family='normal'. + for ``family='normal'``. D^2 is defined as :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, From 9c337f25ea389c15e32ce943292fbe7b953a95b3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:16:59 +0300 Subject: [PATCH 124/209] ax -> axes in plot_poisson_regression_non_normal_loss.py --- .../plot_poisson_regression_non_normal_loss.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 6cee852866018..5c0b64faea255 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -228,19 +228,19 @@ def load_mtpl2(n_samples=100000): # histogram of observed target values with that of predicted values, -fig, ax = plt.subplots(1, 4, figsize=(16, 3)) +fig, axes = plt.subplots(1, 4, figsize=(16, 3)) -df_train.Frequency.hist(bins=np.linspace(-1, 10, 50), ax=ax[0]) +df_train.Frequency.hist(bins=np.linspace(-1, 10, 50), ax=axes[0]) -ax[0].set_title('Experimental data') +axes[0].set_title('Experimental data') for idx, model in enumerate([linregr, glm_freq, gbr]): y_pred = model.predict(X_train) - pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=ax[idx+1]) - ax[idx + 1].set_title(model.__class__.__name__) + pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=axes[idx+1]) + axes[idx + 1].set_title(model.__class__.__name__) -for axi in ax: +for axi in axes: axi.set( yscale='log', xlabel="y (Frequency)" From 5e05935caf29ff6bffb25ab5c83a1e4c855b83b0 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:21:13 +0300 Subject: [PATCH 125/209] Update sklearn/linear_model/_glm/distribution.py Co-Authored-By: Nicolas Hug --- sklearn/linear_model/_glm/distribution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 950fa3fbb03e7..b1845eb13e921 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -206,7 +206,7 @@ def _mu_deviance_derivative(self, coef, X, y, weights, link): if coef.size == X.shape[1] + 1: devp = np.concatenate(([temp.sum()], temp @ X)) else: - devp = temp @ X # sampe as X.T @ temp + devp = temp @ X # same as X.T @ temp return mu, devp From 4a6821393715e42150fb8673f3d45840002d0114 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:26:47 +0300 Subject: [PATCH 126/209] Remove solver=auto --- sklearn/linear_model/_glm/glm.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index baed5fecb2b97..4786d231336b6 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -73,19 +73,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'log' for families 'poisson', 'gamma', 
'inverse-gaussian' - solver : {'auto', 'lbfgs'}, optional (default='auto') + solver : 'lbfgs', optional (default='lbfgs') Algorithm to use in the optimization problem: - 'auto' - Sets 'lbfgs' - 'lbfgs' Calls scipy's L-BFGS-B optimizer. - - Note that all solvers except lbfgs use the fisher matrix, i.e. the - expected Hessian instead of the Hessian matrix. - max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. @@ -161,7 +154,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """ def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', - solver='auto', max_iter=100, tol=1e-4, warm_start=False, + solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.fit_intercept = fit_intercept @@ -243,14 +236,11 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver not in ['auto', 'lbfgs']: + if self.solver not in ['lbfgs']: raise ValueError("GeneralizedLinearRegressor supports only solvers" - "'auto', 'lbfgs';" - " got {0}".format(self.solver)) + "'lbfgs'; got {0}".format(self.solver)) solver = self.solver - if self.solver == 'auto': - solver = 'lbfgs' - if (not isinstance(self.max_iter, int) + if (not isinstance(self.max_iter, numbers.Integral) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " "integer;" From 8ee5c85a0140175e354ca9f8dd4db08f23606b9f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:38:56 +0300 Subject: [PATCH 127/209] Update sklearn/linear_model/_glm/glm.py Co-Authored-By: Nicolas Hug --- sklearn/linear_model/_glm/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index baed5fecb2b97..f770337e40e62 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -384,7 +384,7 @@ def predict(self, X): Returns ------- - C : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Returns predicted values. 
""" # check_array is done in _linear_predictor From a1f8aabb6042aac39929a65faaac78e3b6b68e32 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:39:53 +0300 Subject: [PATCH 128/209] More review comments --- sklearn/linear_model/_glm/glm.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index f81408a83a5cf..25f018c3e0eb4 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -261,16 +261,14 @@ def fit(self, X, y, sample_weight=None): family = self._family_instance link = self._link_instance - _dtype = [np.float64, np.float32] - _stype = ['csc', 'csr'] - X, y = check_X_y(X, y, accept_sparse=_stype, - dtype=_dtype, y_numeric=True, multi_output=False, - copy=self.copy_X) + X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], + dtype=[np.float64, np.float32], + y_numeric=True, multi_output=False, copy=self.copy_X) y = np.asarray(y, dtype=np.float64) weights = _check_sample_weight(sample_weight, X) - n_samples, n_features = X.shape + _, n_features = X.shape if self.check_input: if not np.all(family.in_y_range(y)): @@ -287,7 +285,7 @@ def fit(self, X, y, sample_weight=None): # we rescale weights such that sum(weights) = 1 and this becomes # 1/2*deviance + L2 with deviance=sum(weights * unit_deviance) weights_sum = np.sum(weights) - weights = weights/weights_sum + weights = weights / weights_sum # initialization of coef = (intercept_, coef) # Note: The dispersion parameter phi does not enter the estimation @@ -355,7 +353,7 @@ def _linear_predictor(self, X): Returns ------- - C : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") From c0999ead8e8e834456f4a0f61ec77cec94790f4d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:44:36 +0300 Subject: [PATCH 129/209] Addressing reviews in tests --- sklearn/linear_model/_glm/glm.py | 4 ---- sklearn/linear_model/_glm/tests/test_glm.py | 16 ++++++++-------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 25f018c3e0eb4..c1f7aecd32e4f 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -287,10 +287,6 @@ def fit(self, X, y, sample_weight=None): weights_sum = np.sum(weights) weights = weights / weights_sum - # initialization of coef = (intercept_, coef) - # Note: The dispersion parameter phi does not enter the estimation - # of mu_i=E[y_i]. 
- if self.warm_start and hasattr(self, 'coef_'): if self.fit_intercept: coef = np.concatenate((np.array([self.intercept_]), diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 5bd80cbf76fcf..c4e8c883ff379 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -61,17 +61,17 @@ def test_sample_weights_validation(): glm.fit(X, y, weights) -@pytest.mark.parametrize('f, fam', +@pytest.mark.parametrize('name, instance', [('normal', NormalDistribution()), ('poisson', PoissonDistribution()), ('gamma', GammaDistribution()), ('inverse-gaussian', InverseGaussianDistribution())]) -def test_glm_family_argument(f, fam): +def test_glm_family_argument(name, instance): """Test GLM family argument set as string.""" y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) - assert isinstance(glm._family_instance, fam.__class__) + glm = GeneralizedLinearRegressor(family=name, alpha=0).fit(X, y) + assert isinstance(glm._family_instance, instance.__class__) glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) @@ -79,15 +79,15 @@ def test_glm_family_argument(f, fam): glm.fit(X, y) -@pytest.mark.parametrize('l, link', +@pytest.mark.parametrize('name, instance', [('identity', IdentityLink()), ('log', LogLink())]) -def test_glm_link_argument(l, link): +def test_glm_link_argument(name, instance): """Test GLM link argument set as string.""" y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) - assert isinstance(glm._link_instance, link.__class__) + glm = GeneralizedLinearRegressor(family='normal', link=name).fit(X, y) + assert isinstance(glm._link_instance, instance.__class__) glm = GeneralizedLinearRegressor(family='normal', link='not a link') with pytest.raises(ValueError, match="link must be"): From e09e3368d5e3c173569dba1669cbf97334d20db0 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:50:37 +0300 Subject: [PATCH 130/209] More comments in tests --- sklearn/linear_model/_glm/tests/test_glm.py | 52 ++++----------------- 1 file changed, 9 insertions(+), 43 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index c4e8c883ff379..ebcab6395e5b4 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -23,7 +23,6 @@ GammaDistribution, InverseGaussianDistribution, ) from sklearn.linear_model import Ridge -from sklearn.metrics import mean_absolute_error from sklearn.exceptions import ConvergenceWarning GLM_SOLVERS = ['lbfgs'] @@ -183,8 +182,7 @@ def test_glm_identity_regression(solver): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', - fit_intercept=False, solver=solver, - tol=1e-7) + fit_intercept=False, solver=solver) res = glm.fit(X, y) assert_allclose(res.coef_, coef, rtol=1e-6) @@ -242,18 +240,13 @@ def test_warm_start(fit_intercept): @pytest.mark.parametrize('fit_intercept', [True, False]) @pytest.mark.parametrize('solver', GLM_SOLVERS) def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): - """Test ridge regression for Normal distributions. - - Case n_samples >> n_features - - Compare to test_ridge in test_ridge.py. 
- """ + """Compare with Ridge regression for Normal distributions.""" alpha = 1.0 n_predict = 10 - X, y, coef = make_regression(n_samples=n_samples+n_predict, - n_features=n_features, - n_informative=n_features-2, noise=0.5, - coef=True, random_state=42) + X, y, _ = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=42) y = y[0:n_samples] X, T = X[0:n_samples], X[n_samples:] @@ -279,10 +272,9 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): @pytest.mark.parametrize('solver, tol', [('lbfgs', 1e-7)]) -def test_poisson_ridge(solver, tol): - """Test ridge regression with poisson family and LogLink. - - Compare to R's glmnet""" +def test_poisson_glmnet(solver, tol): + """Compare Poisson regression with L2 regularization and LogLink to glmnet + """ # library("glmnet") # options(digits=10) # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) @@ -306,32 +298,6 @@ def test_poisson_ridge(solver, tol): assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5) -@pytest.mark.parametrize( - "params", - [ - {"solver": "lbfgs"}, - ], - ids=lambda params: ', '.join("%s=%s" % (key, val) - for key, val in params.items()) -) -def test_solver_equivalence(params, regression_data): - X, y = regression_data - est_ref = GeneralizedLinearRegressor() - est_ref.fit(X, y) - - estimator = GeneralizedLinearRegressor(**params) - - estimator.fit(X, y) - - assert_allclose(estimator.intercept_, est_ref.intercept_, rtol=1e-4) - assert_allclose(estimator.coef_, est_ref.coef_, rtol=1e-4) - assert_allclose( - mean_absolute_error(estimator.predict(X), y), - mean_absolute_error(est_ref.predict(X), y), - rtol=1e-4 - ) - - @pytest.mark.parametrize("solver", GLM_SOLVERS) def test_convergence_warning(solver, regression_data): X, y = regression_data From 6601d3049d3637c1823ff709d5416b306386d46a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 17 Aug 2019 15:49:25 +0200 Subject: [PATCH 131/209] Update linear_model.rst --- doc/modules/linear_model.rst | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 7bb684e8bcf87..8b3764e7cefd0 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -914,8 +914,8 @@ likelihood as \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right) -The following table lists some specific EDM distributions—all are Tweedie -distributions—and some properties. +The following table lists some specific EDM distributions—all are Tweedie +distributions—and some of their properties. ================= =============================== ====================================== ============================================ Distribution Target Domain Unit Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` @@ -943,7 +943,7 @@ appropriate, you might try an Inverse Gaussian deviance (or even higher variance powers of the Tweedie family). -Since the linear predictor :math:`Xw` can be negative and +Since the linear predictor :math:`x^\top w` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link :math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the @@ -951,32 +951,33 @@ log-link `link='log'` with :math:`h(x^\top w)=\exp(x^\top w)`. 
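
As a concrete instance of the use cases listed in this section (positive, right-skewed targets), a Gamma GLM with the default log link can be sketched as follows; ``GammaRegressor`` from this series is assumed, and the coefficients are only expected to be recovered roughly, since the simulated targets are noisy::

    import numpy as np
    from sklearn.linear_model import GammaRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(1000, 2))
    # Positive, right-skewed targets with mean exp(X @ w), e.g. claim amounts.
    mu = np.exp(X @ np.array([0.4, -0.3]))
    y = rng.gamma(shape=2.0, scale=mu / 2.0)   # E[y] = shape * scale = mu

    reg = GammaRegressor(alpha=1e-3).fit(X, y)
    print(reg.coef_)   # roughly [0.4, -0.3], up to noise and the small penalty
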
:class:`TweedieRegressor` implements a generalized linear model for the Tweedie distribution, that allows to model any of the above mentioned -distributions using the appropriate power parameter ``p``, i.e. the exponent of -the unit variance function, +distributions using the appropriate ``power`` parameter, i.e. the exponent +of the unit variance function, - - ``p = 0``: Normal distribution. Specialized solvers such as + - ``power = 0``: Normal distribution. Specialized solvers such as :class:`Ridge`, :class:`ElasticNet` are generally more appropriate in this case. - - ``p = 1``: Poisson distribution. :class:`PoissonRegressor` is exposed for + - ``power = 1``: Poisson distribution. :class:`PoissonRegressor` is exposed for convenience. However, it is strictly equivalent to `TweedieRegressor(power=1)`. - - ``p = 2``: Gamma distribution. :class:`GammaRegressor` is exposed for + - ``power = 2``: Gamma distribution. :class:`GammaRegressor` is exposed for convenience. However, it is strictly equivalent to `TweedieRegressor(power=2)`. - - ``p = 3``: Inverse Gamma distribution. + - ``power = 3``: Inverse Gamma distribution. -Note: -* The feature matrix `X` should be standardized before fitting. This - ensures that the penalty treats features equally. -* If you want to model a relative frequency, i.e. counts per exposure (time, - volume, ...) you can do so by a Poisson distribution and passing - :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together - with :math:`s=\mathrm{exposure}` as sample weights. This is done in both - examples linked below. +.. note:: + + * The feature matrix `X` should be standardized before fitting. This + ensures that the penalty treats features equally. + * If you want to model a relative frequency, i.e. counts per exposure (time, + volume, ...) you can do so by a Poisson distribution and passing + :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values + together with :math:`s=\mathrm{exposure}` as sample weights. This is done + in both examples linked below. The estimator can be used as follows:: From 5174dae1cbc7ea9442243f15f3887f8252520f46 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 17 Aug 2019 18:55:24 +0200 Subject: [PATCH 132/209] Address check_is_fitted deprication of attributes --- sklearn/linear_model/_glm/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index c1f7aecd32e4f..13ca3673c8b3d 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -352,7 +352,7 @@ def _linear_predictor(self, X): y_pred : array, shape (n_samples,) Returns predicted values of linear predictor. 
""" - check_is_fitted(self, "coef_") + check_is_fitted(self) X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', ensure_2d=True, allow_nd=False) From 61dc13fdd5b71fe0e1d30e5a9b58934441a9dcb1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 17 Aug 2019 20:18:21 +0200 Subject: [PATCH 133/209] No LaTeX in docstrings --- sklearn/linear_model/_glm/distribution.py | 89 ++++++++++------------- sklearn/linear_model/_glm/glm.py | 66 ++++++++--------- 2 files changed, 70 insertions(+), 85 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index b1845eb13e921..1baf99a465326 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -26,19 +26,15 @@ def _safe_lin_pred(X, coef): class ExponentialDispersionModel(metaclass=ABCMeta): - r"""Base class for reproductive Exponential Dispersion Models (EDM). + """Base class for reproductive Exponential Dispersion Models (EDM). - The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + The pdf of Y∼EDM(μ, φ) is given by:: - .. math:: p(y| \theta, \phi) = c(y, \phi) - \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) - = \tilde{c}(y, \phi) - \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + p(y| θ, φ) = c1(y, φ) * exp((θy-A(θ))/φ) + = c2(y, φ) * exp(-d(y, μ)/(2φ)) - with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, - variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, - unit variance :math:`v(\mu)` and - unit deviance :math:`d(y,\mu)`. + with mean E[Y] = A'(θ) = μ, variance Var[Y] = φ * v(μ), + unit variance v(μ), unit deviance d(y,μ) and dispersion parameter φ. Methods ------- @@ -56,7 +52,7 @@ class ExponentialDispersionModel(metaclass=ABCMeta): """ def in_y_range(self, y): - """Returns ``True`` if y is in the valid range of Y~EDM. + """Returns ``True`` if y is in the valid range of Y∼EDM. Parameters ---------- @@ -80,17 +76,13 @@ def in_y_range(self, y): @abstractmethod def unit_variance(self, mu): - r"""Compute the unit variance function. + """Compute the unit variance function. - The unit variance :math:`v(\mu)` determines the variance as - a function of the mean :math:`\mu` by - :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. - It can also be derived from the unit deviance :math:`d(y,\mu)` as + The unit variance v(μ) determines the variance as a function of the + mean μ by Var[Y_i] = φ/s_i * v(μ_i). + It can also be derived from the unit deviance d(y,μ) as:: - .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ - \partial\mu^2}}\big|_{y=\mu} - - See also :func:`variance`. + v(μ) = 2/(∂^2 d(y,μ)/(∂ μ^2))|_{y=μ} Parameters ---------- @@ -101,9 +93,9 @@ def unit_variance(self, mu): @abstractmethod def unit_variance_derivative(self, mu): - r"""Compute the derivative of the unit variance w.r.t. mu. + """Compute the derivative of the unit variance w.r.t. mu. - Return :math:`v'(\mu)`. + Return v'(μ). Parameters ---------- @@ -114,12 +106,11 @@ def unit_variance_derivative(self, mu): @abstractmethod def unit_deviance(self, y, mu, check_input=False): - r"""Compute the unit deviance. + """Compute the unit deviance. 
+ + The unit_deviance d(y,μ) can be defined by the log-likelihood as:: - The unit_deviance :math:`d(y,\mu)` can be defined by the - log-likelihood as - :math:`d(y,\mu) = -2\phi\cdot - \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + d(y,μ) = -2φ * (loglike(y,μ,φ) - loglike(y,y,φ)) Parameters ---------- @@ -140,11 +131,10 @@ def unit_deviance(self, y, mu, check_input=False): pass # pragma: no cover def unit_deviance_derivative(self, y, mu): - r"""Compute the derivative of the unit deviance w.r.t. mu. + """Compute the derivative of the unit deviance w.r.t. mu. The derivative of the unit deviance is given by - :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` - with unit variance :math:`v(\mu)`. + ∂ d(y,μ)/(∂ μ) = -2(y-μ)/v(μ) with unit variance v(μ). Parameters ---------- @@ -157,14 +147,13 @@ def unit_deviance_derivative(self, y, mu): return -2 * (y - mu) / self.unit_variance(mu) def deviance(self, y, mu, weights=1): - r"""Compute the deviance. + """Compute the deviance. The deviance is a weighted sum of the per sample unit deviances, - :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` - with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. - In terms of the log-likelihood it is :math:`D = -2\phi\cdot - \left(loglike(y,\mu,\frac{phi}{s}) - - loglike(y,y,\frac{phi}{s})\right)`. + D = sum_i s_i * d(y_i,μ_i) + with weights s_i and unit deviance d(y,μ). + In terms of the log-likelihood it is + D = -2φ * (loglike(y,μ,φ/s) - loglike(y,y,φ/s)). Parameters ---------- @@ -182,7 +171,7 @@ def deviance(self, y, mu, weights=1): def deviance_derivative(self, y, mu, weights=1): """Compute the derivative of the deviance w.r.t. mu. - It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. + It gives ∂ D(y, μ; weights)/(∂ μ). Parameters ---------- @@ -211,11 +200,10 @@ def _mu_deviance_derivative(self, coef, X, y, weights, link): class TweedieDistribution(ExponentialDispersionModel): - r"""A class for the Tweedie distribution. + """A class for the Tweedie distribution. - A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely - defined by it's mean-variance relationship - :math:`\mathrm{Var}[Y] \propto \mu^power`. + A Tweedie distribution with mean μ=E[Y] is uniquely defined by it's + mean-variance relationship Var[Y] ∝ μ^power. Special cases are: @@ -231,8 +219,7 @@ class TweedieDistribution(ExponentialDispersionModel): Parameters ---------- power : float (default=0) - The variance power of the `unit_variance` - :math:`v(\mu) = \mu^{power}`. + The variance power of the unit variance v(μ) = μ^power. For ``0=1.') + raise ValueError('Tweedie distribution is only defined for ' + 'power<=0 and p>=1.') elif 1 <= power < 2: # Poisson or Compound Poisson distribution self._lower_bound = DistributionBoundary(0, inclusive=True) @@ -279,7 +266,7 @@ def unit_variance(self, mu): def unit_variance_derivative(self, mu): """Compute the derivative of the unit variance of a Tweedie - distribution v(mu)=power*mu**(power-1). + distribution v(mu)=power * mu**(power-1). Parameters ---------- @@ -289,12 +276,10 @@ def unit_variance_derivative(self, mu): return self.power * np.power(mu, self.power - 1) def unit_deviance(self, y, mu, check_input=False): - r"""Compute the unit deviance. + """Compute the unit deviance. 
- The unit_deviance :math:`d(y,\mu)` can be defined by the - log-likelihood as - :math:`d(y,\mu) = -2\phi\cdot - \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + The unit deviance d(y,μ) can be defined by the log-likelihood as + d(y,μ) = -2φ * (loglike(y,μ,φ) - loglike(y,y,φ)). Parameters ---------- @@ -328,7 +313,7 @@ def unit_deviance(self, y, mu, check_input=False): raise ValueError("Tweedie deviance is only defined for p<=0 " "and p>=1.") elif 1 <= p < 2: - # Poisson and Compount poisson distribution, y >= 0, mu > 0 + # Poisson and Compound poisson distribution, y >= 0, mu > 0 if (y < 0).any() or (mu <= 0).any(): raise ValueError(message + "non-negative y and strictly " "positive mu.") diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 13ca3673c8b3d..fb653b5e4361f 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -31,7 +31,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as mu=h(X*w). Therefore, + fitting and predicting the mean of the target y as μ=h(X*w). Therefore, the fit minimizes the following objective function with L2 priors as regularizer:: @@ -118,16 +118,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + the first two moments to be E[Y_i]=μ_i=h((Xw)_i) and + Var[Y_i]=φ/s_i * v(μ_i). The unit variance function v(μ_i) is a property of + and given by the specific EDM, see :ref:`User Guide `. - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + The parameters w (``coef_`` and ``intercept_``) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. - For alpha > 0, the feature matrix X should be standardized in order to + For ``alpha > 0``, the feature matrix X should be standardized in order to penalize features equally strong. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. @@ -138,7 +138,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + Consider ȳ = (sum_i s_i y_i)(sum_i s_i), in this case one might say that y has a 'scaled' Poisson distributions. The same holds for other distributions. @@ -183,9 +183,9 @@ def fit(self, X, y, sample_weight=None): optional (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has - Var[Y_i]=phi/w_i * v(mu). - If Y_i ~ EDM(mu, phi/w_i), then - sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a + Var[Y_i]=φ/w_i * v(mu). + If Y_i ~ EDM(mu, φ/w_i), then + sum(w*Y)/sum(w) ~ EDM(mu, φ/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. Returns @@ -218,7 +218,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("No default link known for the " "specified distribution family. 
Please " "set link manually, i.e. not to 'auto'; " - "got (link='auto', family={}" + "got (link='auto', family={})" .format(self.family)) elif self.link == 'identity': self._link_instance = IdentityLink() @@ -383,11 +383,10 @@ def score(self, X, y, sample_weight=None): R^2 uses squared error and D^2 deviance. Note that those two are equal for ``family='normal'``. - D^2 is defined as - :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, - :math:`D_{null}` is the null deviance, i.e. the deviance of a model - with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. - The mean :math:`\\bar{y}` is averaged by sample_weight. + D^2 is defined as D^2 = 1 - D(y_true,y_pred) / D_null, + D_null is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to y_pred = ȳ. + The mean ȳ is averaged by sample_weight. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). @@ -490,17 +489,18 @@ class PoissonRegressor(GeneralizedLinearRegressor): Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + the first two moments to be E[Y_i]=μ_i=h((Xw)_i) and + Var[Y_i]=φ/s_i * v(μ_i). The unit variance function v(μ_i) is a property of + and given by the specific EDM, see :ref:`User Guide `. - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + The parameters w (``coef_`` and ``intercept_``) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. - For alpha > 0, the feature matrix X should be standardized in order to - penalize features equally strong. + For ``alpha > 0``, the feature matrix X should be standardized in order to + penalize features equally strong. Call + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. If the target y is a ratio, appropriate sample weights s should be provided. @@ -508,7 +508,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``PoissonRegressor().fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + Consider ȳ = (sum_i s_i y_i)(sum_i s_i), in this case one might say that y has a 'scaled' Poisson distributions. References @@ -608,12 +608,12 @@ class GammaRegressor(GeneralizedLinearRegressor): Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + the first two moments to be E[Y_i]=μ_i=h((Xw)_i) and + Var[Y_i]=φ/s_i * v(μ_i). The unit variance function v(μ_i) is a property of + and given by the specific EDM, see :ref:`User Guide `. - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + The parameters w (``coef_`` and ``intercept_``) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. 
@@ -664,7 +664,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): Parameters ---------- power : float (default=0) - The variance power: :math:`v(\\mu) = \\mu^{power}`. + The variance power: v(μ) = μ^{power}. For ``0`. - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + The parameters w (``coef_`` and ``intercept_``) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. From 44524cafc0aac0289267f1099b2ec2f8d53fe6c7 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 17 Aug 2019 20:28:54 +0200 Subject: [PATCH 134/209] Replace Tweedie p->power --- sklearn/linear_model/_glm/distribution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 1baf99a465326..e82f787d7e710 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -241,7 +241,7 @@ def power(self, power): self._lower_bound = DistributionBoundary(-np.Inf, inclusive=False) elif 0 < power < 1: raise ValueError('Tweedie distribution is only defined for ' - 'power<=0 and p>=1.') + 'power<=0 and power>=1.') elif 1 <= power < 2: # Poisson or Compound Poisson distribution self._lower_bound = DistributionBoundary(0, inclusive=True) From 58d240973a175ee705d4e8cfbd4e9b6495eca609 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 17 Aug 2019 20:34:11 +0200 Subject: [PATCH 135/209] Replace Tweedie p->power --- sklearn/linear_model/_glm/distribution.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index e82f787d7e710..e0b3511734cee 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -300,7 +300,7 @@ def unit_deviance(self, y, mu, check_input=False): p = self.power if check_input: - message = ("Mean Tweedie deviance error with p={} can only be " + message = ("Mean Tweedie deviance error with power={} can only be " "used on ".format(p)) if p < 0: # 'Extreme stable', y any realy number, mu > 0 @@ -310,8 +310,8 @@ def unit_deviance(self, y, mu, check_input=False): # Normal, y and mu can be any real number pass elif 0 < p < 1: - raise ValueError("Tweedie deviance is only defined for p<=0 " - "and p>=1.") + raise ValueError("Tweedie deviance is only defined for " + "power<=0 and power>=1.") elif 1 <= p < 2: # Poisson and Compound poisson distribution, y >= 0, mu > 0 if (y < 0).any() or (mu <= 0).any(): @@ -335,8 +335,8 @@ def unit_deviance(self, y, mu, check_input=False): # Normal distribution, y and mu any real number dev = (y - mu)**2 elif p < 1: - raise ValueError("Tweedie deviance is only defined for p<=0 and " - "p>=1.") + raise ValueError("Tweedie deviance is only defined for power<=0 " + "and power>=1.") elif p == 1: # Poisson distribution dev = 2 * (xlogy(y, y/mu) - y + mu) From ee351e1f7137bc225d8f964558009612303b37db Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Aug 2019 00:11:57 +0200 Subject: [PATCH 136/209] Fix tests due to Tweedie p->power --- sklearn/linear_model/_glm/tests/test_distribution.py | 2 +- sklearn/metrics/tests/test_regression.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/linear_model/_glm/tests/test_distribution.py index 82e493b7a2149..ed81c8328c87a 100644 --- 
a/sklearn/linear_model/_glm/tests/test_distribution.py +++ b/sklearn/linear_model/_glm/tests/test_distribution.py @@ -45,7 +45,7 @@ def test_invalid_distribution_bound(): def test_tweedie_distribution_power(): - msg = "distribution is only defined for p<=0 and p>=1" + msg = "distribution is only defined for power<=0 and power>=1" with pytest.raises(ValueError, match=msg): TweedieDistribution(power=0.5) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 8e18321f80303..8febf9064e58e 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -133,7 +133,7 @@ def test_regression_metrics_at_limits(): mean_tweedie_deviance([0.], [0.], p=p) with pytest.raises(ValueError, - match="is only defined for p<=0 and p>=1"): + match="is only defined for power<=0 and power>=1"): mean_tweedie_deviance([0.], [0.], p=0.5) From 33fe9be737001045eb9fad5616f678fb27fb8d79 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Aug 2019 12:09:24 +0200 Subject: [PATCH 137/209] Simplify super(...) --- sklearn/linear_model/_glm/distribution.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index e0b3511734cee..7ce51deb0921d 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -353,25 +353,25 @@ def unit_deviance(self, y, mu, check_input=False): class NormalDistribution(TweedieDistribution): """Class for the Normal (aka Gaussian) distribution""" def __init__(self): - super(NormalDistribution, self).__init__(power=0) + super().__init__(power=0) class PoissonDistribution(TweedieDistribution): """Class for the scaled Poisson distribution""" def __init__(self): - super(PoissonDistribution, self).__init__(power=1) + super().__init__(power=1) class GammaDistribution(TweedieDistribution): """Class for the Gamma distribution""" def __init__(self): - super(GammaDistribution, self).__init__(power=2) + super().__init__(power=2) class InverseGaussianDistribution(TweedieDistribution): """Class for the scaled InverseGaussianDistribution distribution""" def __init__(self): - super(InverseGaussianDistribution, self).__init__(power=3) + super().__init__(power=3) EDM_DISTRIBUTIONS = { From 94272e79d565672416fbc6772aa8c0eb9e7ce519 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Aug 2019 12:26:12 +0200 Subject: [PATCH 138/209] Replace Link.link(..) by __call__(..) --- sklearn/linear_model/_glm/glm.py | 8 ++++---- sklearn/linear_model/_glm/link.py | 8 ++++---- sklearn/linear_model/_glm/tests/test_link.py | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index fb653b5e4361f..4d7a05095cb27 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -49,7 +49,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Constant that multiplies the penalty terms and thus determines the regularization strength. See the notes for the exact mathematical meaning of this - parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + parameter. ``alpha = 0`` is equivalent to unpenalized GLMs. In this case, the design matrix X must have full column rank (no collinearities). 
@@ -296,7 +296,7 @@ def fit(self, X, y, sample_weight=None): else: if self.fit_intercept: coef = np.zeros(n_features+1) - coef[0] = link.link(np.average(y, weights=weights)) + coef[0] = link(np.average(y, weights=weights)) else: coef = np.zeros(n_features) @@ -441,7 +441,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): Constant that multiplies the penalty terms and thus determines the regularization strength. See the notes for the exact mathematical meaning of this - parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + parameter. ``alpha = 0`` is equivalent to unpenalized GLMs. In this case, the design matrix X must have full column rank (no collinearities). @@ -560,7 +560,7 @@ class GammaRegressor(GeneralizedLinearRegressor): Constant that multiplies the penalty terms and thus determines the regularization strength. See the notes for the exact mathematical meaning of this - parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + parameter. ``alpha = 0`` is equivalent to unpenalized GLMs. In this case, the design matrix X must have full column rank (no collinearities). diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py index ec9a7b7736eb1..1ef485cc85a55 100644 --- a/sklearn/linear_model/_glm/link.py +++ b/sklearn/linear_model/_glm/link.py @@ -15,7 +15,7 @@ class Link(metaclass=ABCMeta): """Abstract base class for Link functions.""" @abstractmethod - def link(self, mu): + def __call__(self, mu): """Compute the link function g(mu). The link function links the mean mu=E[Y] to the so called linear @@ -79,7 +79,7 @@ def inverse_derivative2(self, lin_pred): class IdentityLink(Link): """The identity link function g(x)=x.""" - def link(self, mu): + def __call__(self, mu): return mu def derivative(self, mu): @@ -98,7 +98,7 @@ def inverse_derivative2(self, lin_pred): class LogLink(Link): """The log link function g(x)=log(x).""" - def link(self, mu): + def __call__(self, mu): return np.log(mu) def derivative(self, mu): @@ -117,7 +117,7 @@ def inverse_derivative2(self, lin_pred): class LogitLink(Link): """The logit link function g(x)=logit(x).""" - def link(self, mu): + def __call__(self, mu): return logit(mu) def derivative(self, mu): diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py index 3a2a21c4c04e0..feafb151e14a2 100644 --- a/sklearn/linear_model/_glm/tests/test_link.py +++ b/sklearn/linear_model/_glm/tests/test_link.py @@ -25,7 +25,7 @@ def test_link_properties(link): # careful for large x, note expit(36) = 1 # limit max eta to 15 x = x / 100 * 15 - assert_allclose(link.link(link.inverse(x)), x) + assert_allclose(link(link.inverse(x)), x) # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) assert_allclose(link.derivative(link.inverse(x)), 1./link.inverse_derivative(x)) @@ -34,5 +34,5 @@ def test_link_properties(link): link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) # for LogitLink, in the following x should be between 0 and 1. - # assert_almost_equal(link.inverse_derivative(link.link(x)), + # assert_almost_equal(link.inverse_derivative(link(x)), # 1./link.derivative(x), decimal=decimal) From 2457039f82ea308e634e4853484df17029f311ac Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Aug 2019 12:45:35 +0200 Subject: [PATCH 139/209] Replace 1. 
-> 1 --- sklearn/linear_model/_glm/glm.py | 2 +- sklearn/linear_model/_glm/link.py | 8 ++++---- sklearn/linear_model/_glm/tests/test_link.py | 11 ++++++----- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 4d7a05095cb27..b8d87799f8a93 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -415,7 +415,7 @@ def score(self, X, y, sample_weight=None): dev = self._family_instance.deviance(y, mu, weights=weights) y_mean = np.average(y, weights=weights) dev_null = self._family_instance.deviance(y, y_mean, weights=weights) - return 1. - dev / dev_null + return 1 - dev / dev_null def _more_tags(self): return {"requires_positive_y": True} diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py index 1ef485cc85a55..04b485c067cca 100644 --- a/sklearn/linear_model/_glm/link.py +++ b/sklearn/linear_model/_glm/link.py @@ -102,7 +102,7 @@ def __call__(self, mu): return np.log(mu) def derivative(self, mu): - return 1./mu + return 1 / mu def inverse(self, lin_pred): return np.exp(lin_pred) @@ -121,15 +121,15 @@ def __call__(self, mu): return logit(mu) def derivative(self, mu): - return 1. / (mu * (1 - mu)) + return 1 / (mu * (1 - mu)) def inverse(self, lin_pred): return expit(lin_pred) def inverse_derivative(self, lin_pred): ep = expit(lin_pred) - return ep * (1. - ep) + return ep * (1 - ep) def inverse_derivative2(self, lin_pred): ep = expit(lin_pred) - return ep * (1. - ep) * (1. - 2 * ep) + return ep * (1 - ep) * (1 - 2 * ep) diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py index feafb151e14a2..a631509baca79 100644 --- a/sklearn/linear_model/_glm/tests/test_link.py +++ b/sklearn/linear_model/_glm/tests/test_link.py @@ -19,20 +19,21 @@ def test_link_properties(link): """Test link inverse and derivative.""" rng = np.random.RandomState(42) - x = rng.rand(100)*100 + x = rng.rand(100) * 100 link = link() # instantiate object if isinstance(link, LogitLink): # careful for large x, note expit(36) = 1 # limit max eta to 15 x = x / 100 * 15 assert_allclose(link(link.inverse(x)), x) - # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) + # if g(h(x)) = x, then g'(h(x)) = 1/h'(x) + # g = link, h = link.inverse assert_allclose(link.derivative(link.inverse(x)), - 1./link.inverse_derivative(x)) + 1 / link.inverse_derivative(x)) assert ( link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) - # for LogitLink, in the following x should be between 0 and 1. + # for LogitLink, in the following x should be between 0 and 1 # assert_almost_equal(link.inverse_derivative(link(x)), - # 1./link.derivative(x), decimal=decimal) + # 1 / link.derivative(x), decimal=decimal) From 6396d2c4495681d7d774f7c7de7054b336ba2709 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Aug 2019 13:46:45 +0200 Subject: [PATCH 140/209] Fix table in TweedieRegressor --- sklearn/linear_model/_glm/glm.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index b8d87799f8a93..f538aa6121747 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -664,19 +664,24 @@ class TweedieRegressor(GeneralizedLinearRegressor): Parameters ---------- power : float (default=0) - The variance power: v(μ) = μ^{power}. + The variance power: v(μ) = μ^power. 
For ``0 Date: Thu, 22 Aug 2019 12:02:28 +0300 Subject: [PATCH 141/209] Improve docstring in plot_tweedie_regression_insurance_claims.py --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 55a21c8d8723d..24b3afcadf120 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -85,8 +85,8 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, Parameters ---------- - df : DataFrame with at least three columns named feature, weight and - observed + df : DataFrame + input data feature: str a column name of df for the feature to be plotted weight : str From da66fd5dc0fb8fc89825e5cf75057eed70c09aed Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 22 Aug 2019 12:23:40 +0300 Subject: [PATCH 142/209] Use train_test_split in tests --- sklearn/linear_model/_glm/tests/test_glm.py | 36 +++++++++++---------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index ebcab6395e5b4..d6c7c7c3f767e 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -24,6 +24,7 @@ ) from sklearn.linear_model import Ridge from sklearn.exceptions import ConvergenceWarning +from sklearn.model_selection import train_test_split GLM_SOLVERS = ['lbfgs'] @@ -207,9 +208,8 @@ def test_glm_log_regression(family, solver, tol): @pytest.mark.parametrize('fit_intercept', [True, False]) def test_warm_start(fit_intercept): - n_samples, n_features = 100, 10 - n_predict = 10 - X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_samples, n_features = 110, 10 + X, y, coef = make_regression(n_samples=n_samples, n_features=n_features, n_informative=n_features-2, noise=0.5, coef=True, random_state=42) @@ -230,7 +230,7 @@ def test_warm_start(fit_intercept): assert glm1.score(X, y) > glm2.score(X, y) glm2.set_params(max_iter=1000) glm2.fit(X, y) - assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-4, atol=1e-5) + assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5) assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) # TODO: investigate why this doesn't match # assert glm1.n_iter_ == glm2.n_iter_ + 2 @@ -242,13 +242,14 @@ def test_warm_start(fit_intercept): def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): """Compare with Ridge regression for Normal distributions.""" alpha = 1.0 - n_predict = 10 - X, y, _ = make_regression(n_samples=n_samples+n_predict, - n_features=n_features, - n_informative=n_features-2, noise=0.5, - coef=True, random_state=42) - y = y[0:n_samples] - X, T = X[0:n_samples], X[n_samples:] + test_size = 10 + X, y = make_regression(n_samples=n_samples + test_size, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + random_state=42) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_size, random_state=0 + ) if n_samples > n_features: ridge_params = {"solver": "svd"} @@ -258,17 +259,18 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, normalize=False, random_state=42, **ridge_params) - ridge.fit(X, y) + ridge.fit(X_train, y_train) 
glm = GeneralizedLinearRegressor(alpha=1.0, family='normal', link='identity', fit_intercept=True, - max_iter=300, solver=solver, tol=1e-6, - check_input=False) - glm.fit(X, y) + solver=solver, check_input=False, + max_iter=300) + glm.fit(X_train, y_train) assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_, rtol=5e-6) + assert_allclose(glm.coef_, ridge.coef_, atol=5e-5) assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) - assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) + assert_allclose(glm.predict(X_train), ridge.predict(X_train), rtol=5e-5) + assert_allclose(glm.predict(X_test), ridge.predict(X_test), rtol=5e-5) @pytest.mark.parametrize('solver, tol', [('lbfgs', 1e-7)]) From b9bc170b2ef7673d6b40103a6583cd9a56d7c517 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 22 Aug 2019 14:07:32 +0300 Subject: [PATCH 143/209] Fix TODO in test_warm_start --- sklearn/linear_model/_glm/tests/test_glm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index d6c7c7c3f767e..4975d1454b922 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -232,8 +232,7 @@ def test_warm_start(fit_intercept): glm2.fit(X, y) assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5) assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) - # TODO: investigate why this doesn't match - # assert glm1.n_iter_ == glm2.n_iter_ + 2 + assert glm1.n_iter_ == glm2.n_iter_ @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) From ab6c5d82ea6149f8acc20834abcc735c630a516c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 22 Aug 2019 14:31:06 +0300 Subject: [PATCH 144/209] Revert "No LaTeX in docstrings" This reverts commit 61dc13fdd5b71fe0e1d30e5a9b58934441a9dcb1. --- sklearn/linear_model/_glm/distribution.py | 85 +++++++++++++---------- sklearn/linear_model/_glm/glm.py | 66 +++++++++--------- 2 files changed, 83 insertions(+), 68 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 7ce51deb0921d..5754cb391ec61 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -26,15 +26,19 @@ def _safe_lin_pred(X, coef): class ExponentialDispersionModel(metaclass=ABCMeta): - """Base class for reproductive Exponential Dispersion Models (EDM). + r"""Base class for reproductive Exponential Dispersion Models (EDM). - The pdf of Y∼EDM(μ, φ) is given by:: + The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by - p(y| θ, φ) = c1(y, φ) * exp((θy-A(θ))/φ) - = c2(y, φ) * exp(-d(y, μ)/(2φ)) + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, \mu)}{2\phi}\right) - with mean E[Y] = A'(θ) = μ, variance Var[Y] = φ * v(μ), - unit variance v(μ), unit deviance d(y,μ) and dispersion parameter φ. + with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, + unit variance :math:`v(\mu)` and + unit deviance :math:`d(y,\mu)`. Methods ------- @@ -52,7 +56,7 @@ class ExponentialDispersionModel(metaclass=ABCMeta): """ def in_y_range(self, y): - """Returns ``True`` if y is in the valid range of Y∼EDM. + """Returns ``True`` if y is in the valid range of Y~EDM. 
Parameters ---------- @@ -76,13 +80,17 @@ def in_y_range(self, y): @abstractmethod def unit_variance(self, mu): - """Compute the unit variance function. + r"""Compute the unit variance function. - The unit variance v(μ) determines the variance as a function of the - mean μ by Var[Y_i] = φ/s_i * v(μ_i). - It can also be derived from the unit deviance d(y,μ) as:: + The unit variance :math:`v(\mu)` determines the variance as + a function of the mean :math:`\mu` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. + It can also be derived from the unit deviance :math:`d(y,\mu)` as - v(μ) = 2/(∂^2 d(y,μ)/(∂ μ^2))|_{y=μ} + .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ + \partial\mu^2}}\big|_{y=\mu} + + See also :func:`variance`. Parameters ---------- @@ -93,9 +101,9 @@ def unit_variance(self, mu): @abstractmethod def unit_variance_derivative(self, mu): - """Compute the derivative of the unit variance w.r.t. mu. + r"""Compute the derivative of the unit variance w.r.t. mu. - Return v'(μ). + Return :math:`v'(\mu)`. Parameters ---------- @@ -106,11 +114,12 @@ def unit_variance_derivative(self, mu): @abstractmethod def unit_deviance(self, y, mu, check_input=False): - """Compute the unit deviance. - - The unit_deviance d(y,μ) can be defined by the log-likelihood as:: + r"""Compute the unit deviance. - d(y,μ) = -2φ * (loglike(y,μ,φ) - loglike(y,y,φ)) + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` Parameters ---------- @@ -131,10 +140,11 @@ def unit_deviance(self, y, mu, check_input=False): pass # pragma: no cover def unit_deviance_derivative(self, y, mu): - """Compute the derivative of the unit deviance w.r.t. mu. + r"""Compute the derivative of the unit deviance w.r.t. mu. The derivative of the unit deviance is given by - ∂ d(y,μ)/(∂ μ) = -2(y-μ)/v(μ) with unit variance v(μ). + :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + with unit variance :math:`v(\mu)`. Parameters ---------- @@ -147,13 +157,14 @@ def unit_deviance_derivative(self, y, mu): return -2 * (y - mu) / self.unit_variance(mu) def deviance(self, y, mu, weights=1): - """Compute the deviance. + r"""Compute the deviance. The deviance is a weighted sum of the per sample unit deviances, - D = sum_i s_i * d(y_i,μ_i) - with weights s_i and unit deviance d(y,μ). - In terms of the log-likelihood it is - D = -2φ * (loglike(y,μ,φ/s) - loglike(y,y,φ/s)). + :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` + with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. + In terms of the log-likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,\mu,\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. Parameters ---------- @@ -171,7 +182,7 @@ def deviance(self, y, mu, weights=1): def deviance_derivative(self, y, mu, weights=1): """Compute the derivative of the deviance w.r.t. mu. - It gives ∂ D(y, μ; weights)/(∂ μ). + It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. Parameters ---------- @@ -200,10 +211,11 @@ def _mu_deviance_derivative(self, coef, X, y, weights, link): class TweedieDistribution(ExponentialDispersionModel): - """A class for the Tweedie distribution. + r"""A class for the Tweedie distribution. - A Tweedie distribution with mean μ=E[Y] is uniquely defined by it's - mean-variance relationship Var[Y] ∝ μ^power. 
+ A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely + defined by it's mean-variance relationship + :math:`\mathrm{Var}[Y] \propto \mu^power`. Special cases are: @@ -219,7 +231,8 @@ class TweedieDistribution(ExponentialDispersionModel): Parameters ---------- power : float (default=0) - The variance power of the unit variance v(μ) = μ^power. + The variance power of the `unit_variance` + :math:`v(\mu) = \mu^{power}`. For ``0=1.") elif 1 <= p < 2: - # Poisson and Compound poisson distribution, y >= 0, mu > 0 + # Poisson and Compount poisson distribution, y >= 0, mu > 0 if (y < 0).any() or (mu <= 0).any(): raise ValueError(message + "non-negative y and strictly " "positive mu.") diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index f538aa6121747..69e68c1eda0f8 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -31,7 +31,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as μ=h(X*w). Therefore, + fitting and predicting the mean of the target y as mu=h(X*w). Therefore, the fit minimizes the following objective function with L2 priors as regularizer:: @@ -118,16 +118,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be E[Y_i]=μ_i=h((Xw)_i) and - Var[Y_i]=φ/s_i * v(μ_i). The unit variance function v(μ_i) is a property of - and given by the specific EDM, see + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see :ref:`User Guide `. - The parameters w (``coef_`` and ``intercept_``) are estimated by + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. - For ``alpha > 0``, the feature matrix X should be standardized in order to + For alpha > 0, the feature matrix X should be standardized in order to penalize features equally strong. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. @@ -138,7 +138,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Consider ȳ = (sum_i s_i y_i)(sum_i s_i), + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, in this case one might say that y has a 'scaled' Poisson distributions. The same holds for other distributions. @@ -183,9 +183,9 @@ def fit(self, X, y, sample_weight=None): optional (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has - Var[Y_i]=φ/w_i * v(mu). - If Y_i ~ EDM(mu, φ/w_i), then - sum(w*Y)/sum(w) ~ EDM(mu, φ/sum(w)), i.e. the mean of y is a + Var[Y_i]=phi/w_i * v(mu). + If Y_i ~ EDM(mu, phi/w_i), then + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. Returns @@ -218,7 +218,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("No default link known for the " "specified distribution family. Please " "set link manually, i.e. 
not to 'auto'; " - "got (link='auto', family={})" + "got (link='auto', family={}" .format(self.family)) elif self.link == 'identity': self._link_instance = IdentityLink() @@ -383,10 +383,11 @@ def score(self, X, y, sample_weight=None): R^2 uses squared error and D^2 deviance. Note that those two are equal for ``family='normal'``. - D^2 is defined as D^2 = 1 - D(y_true,y_pred) / D_null, - D_null is the null deviance, i.e. the deviance of a model - with intercept alone, which corresponds to y_pred = ȳ. - The mean ȳ is averaged by sample_weight. + D^2 is defined as + :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, + :math:`D_{null}` is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. + The mean :math:`\\bar{y}` is averaged by sample_weight. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). @@ -489,18 +490,17 @@ class PoissonRegressor(GeneralizedLinearRegressor): Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be E[Y_i]=μ_i=h((Xw)_i) and - Var[Y_i]=φ/s_i * v(μ_i). The unit variance function v(μ_i) is a property of - and given by the specific EDM, see + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see :ref:`User Guide `. - The parameters w (``coef_`` and ``intercept_``) are estimated by + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. - For ``alpha > 0``, the feature matrix X should be standardized in order to - penalize features equally strong. Call - :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. + For alpha > 0, the feature matrix X should be standardized in order to + penalize features equally strong. If the target y is a ratio, appropriate sample weights s should be provided. @@ -508,7 +508,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``PoissonRegressor().fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Consider ȳ = (sum_i s_i y_i)(sum_i s_i), + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, in this case one might say that y has a 'scaled' Poisson distributions. References @@ -608,12 +608,12 @@ class GammaRegressor(GeneralizedLinearRegressor): Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be E[Y_i]=μ_i=h((Xw)_i) and - Var[Y_i]=φ/s_i * v(μ_i). The unit variance function v(μ_i) is a property of - and given by the specific EDM, see + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see :ref:`User Guide `. - The parameters w (``coef_`` and ``intercept_``) are estimated by + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. @@ -664,7 +664,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): Parameters ---------- power : float (default=0) - The variance power: v(μ) = μ^power. 
+ The variance power: :math:`v(\\mu) = \\mu^{power}`. For ``0`. - The parameters w (``coef_`` and ``intercept_``) are estimated by + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. From b424a070f696262174959c1927c921d301fd8ba5 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 22 Aug 2019 14:42:39 +0300 Subject: [PATCH 145/209] Remove n_iter_ check when warm start. --- sklearn/linear_model/_glm/glm.py | 2 +- sklearn/linear_model/_glm/tests/test_glm.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 69e68c1eda0f8..38f27eec397e8 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -218,7 +218,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("No default link known for the " "specified distribution family. Please " "set link manually, i.e. not to 'auto'; " - "got (link='auto', family={}" + "got (link='auto', family={})" .format(self.family)) elif self.link == 'identity': self._link_instance = IdentityLink() diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 4975d1454b922..879d382ce28dc 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -232,7 +232,6 @@ def test_warm_start(fit_intercept): glm2.fit(X, y) assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5) assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) - assert glm1.n_iter_ == glm2.n_iter_ @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) From 95a90580c9d034e7bcb9c488cc19d09ac692deca Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 22 Aug 2019 15:02:00 +0300 Subject: [PATCH 146/209] Rename variable L2 -> coef_scaled --- sklearn/linear_model/_glm/glm.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 38f27eec397e8..a7128eb78df07 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -304,15 +304,16 @@ def fit(self, X, y, sample_weight=None): if solver == 'lbfgs': def func(coef, X, y, weights, alpha, family, link): - mu, devp = \ - family._mu_deviance_derivative(coef, X, y, weights, link) + mu, devp = family._mu_deviance_derivative( + coef, X, y, weights, link + ) dev = family.deviance(y, mu, weights) intercept = (coef.size == X.shape[1] + 1) idx = 1 if intercept else 0 # offset if coef[0] is intercept - L2 = alpha * coef[idx:] - obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2) + coef_scaled = alpha * coef[idx:] + obj = 0.5 * dev + 0.5 * (coef[idx:] @ coef_scaled) objp = 0.5 * devp - objp[idx:] += L2 + objp[idx:] += coef_scaled return obj, objp args = (X, y, weights, self.alpha, family, link) From 59eceb42d874d94d0bb7fdced4f3e05fd030dc62 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 22 Aug 2019 15:59:54 +0300 Subject: [PATCH 147/209] Minor fixes --- .../plot_tweedie_regression_insurance_claims.py | 3 --- sklearn/linear_model/_glm/distribution.py | 12 ++++++------ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 24b3afcadf120..24451069eeb68 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ 
b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -181,9 +181,6 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # events occuring with a constant rate in a given time interval (``Exposure``). # Here we model the frequency ``y = ClaimNb / Exposure``, # which is still a (scaled) Poisson distribution. -# -# A very important property of the Poisson distribution is its mean-variance -# relation: The variance is proportional to the mean. df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 5754cb391ec61..8779cd616cc5a 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -342,9 +342,9 @@ def unit_deviance(self, y, mu, check_input=False): if p < 0: # 'Extreme stable', y any realy number, mu > 0 - dev = 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p) * (2-p)) - - y * np.power(mu, 1-p)/(1-p) + - np.power(mu, 2-p)/(2-p)) + dev = 2 * (np.power(np.maximum(y, 0), 2-p) / ((1-p) * (2-p)) + - y * np.power(mu, 1-p) / (1-p) + + np.power(mu, 2-p) / (2-p)) elif p == 0: # Normal distribution, y and mu any real number @@ -359,9 +359,9 @@ def unit_deviance(self, y, mu, check_input=False): # Gamma distribution dev = 2 * (np.log(mu/y) + y/mu - 1) else: - dev = 2 * (np.power(y, 2-p)/((1-p) * (2-p)) - - y * np.power(mu, 1-p)/(1-p) + - np.power(mu, 2-p)/(2-p)) + dev = 2 * (np.power(y, 2-p) / ((1-p) * (2-p)) + - y * np.power(mu, 1-p) / (1-p) + + np.power(mu, 2-p) / (2-p)) return dev From 04f30f40e572d6e919fef8a4c2d4bcb27d562b6b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 10:13:27 +0300 Subject: [PATCH 148/209] Better wording in example --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 5c0b64faea255..d7f9b65660453 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -5,7 +5,9 @@ This example illustrates the use of linear Poisson regression on the French Motor Third-Party Liability Claims dataset [1] and compares -it with models learned with least squared error. +it with models learned with least squared error. The goal is to predict the +number of insurance claims (or frequency) following car accidents for a user +given historical data over a population of users. We start by defining a few helper functions for loading the data and visualizing results. 
From 3630b5277f68cd9805ec5f3af21b7fe9336cd46d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 11:25:19 +0300 Subject: [PATCH 149/209] Improvements in plot_poisson_regression_non_normal_loss.py --- ...plot_poisson_regression_non_normal_loss.py | 95 ++++++++++--------- 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index d7f9b65660453..6a98030817245 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -28,6 +28,7 @@ import pandas as pd from sklearn.datasets import fetch_openml +from sklearn.dummy import DummyRegressor from sklearn.compose import ColumnTransformer from sklearn.linear_model import PoissonRegressor, LinearRegression from sklearn.model_selection import train_test_split @@ -78,7 +79,7 @@ def load_mtpl2(n_samples=100000): # containing the number of claims (``ClaimNb``) with the freMTPL2sev table # containing the claim amount (``ClaimAmount``) for the same user ids. -df = load_mtpl2(n_samples=100000) +df = load_mtpl2(n_samples=50000) # Note: filter out claims with zero amount, as the severity model # requires a strictly positive target values. @@ -117,8 +118,6 @@ def load_mtpl2(n_samples=100000): # (``Exposure``). Here we model the frequency ``y = ClaimNb / Exposure``, # which is still a (scaled) Poisson distribution. # -# A very important property of the Poisson distribution is its mean-variance -# relation: The variance is proportional to the mean. df["Frequency"] = df.ClaimNb / df.Exposure @@ -135,49 +134,50 @@ def load_mtpl2(n_samples=100000): # To evaluate the pertinence of the used metrics, we will consider as a # baseline an estimator that returns 0 for any input. 
-df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) +df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) +dummy = DummyRegressor(strategy='constant', constant=0) +dummy.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) -eps = 1e-5 -print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, np.zeros(len(df_test)), - df_test.Exposure.values)) -print("MAE: %.3f" % mean_absolute_error( - df_test.Frequency.values, np.zeros(len(df_test)), - df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, eps + np.zeros(len(df_test)), - df_test.Exposure.values)) +############################################################################## +# +# The Poisson deviance cannot be computed on negative values predicted by the +# model, so we set the minimum predicted value to eps, + + +def score_estimator(estimator, df_test, eps=1e-5): + """Score an estimatr on the test set""" + + print("MSE: %.3f" % mean_squared_error( + df_test.Frequency.values, estimator.predict(X_test), + df_test.Exposure.values)) + print("MAE: %.3f" % mean_absolute_error( + df_test.Frequency.values, estimator.predict(X_test), + df_test.Exposure.values)) + print("mean Poisson deviance: %.3f" % mean_poisson_deviance( + df_test.Frequency.values, np.fmax(estimator.predict(X_test), eps), + df_test.Exposure.values)) + + +print("DummyRegressor") +score_estimator(dummy, df_test) ############################################################################## # # We start by modeling the target variable with the least squares linear # regression model, - linregr = LinearRegression() linregr.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) -print("LinearRegression") -print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, linregr.predict(X_test), - df_test.Exposure.values)) -print("MSE: %.3f" % mean_absolute_error( - df_test.Frequency.values, linregr.predict(X_test), - df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, np.fmax(linregr.predict(X_test), eps), - df_test.Exposure.values)) - -############################################################################## -# -# The Poisson deviance cannot be computed because negative values are -# predicted by the model, print('Number Negatives: %s / total: %s' % ( (linregr.predict(X_test) < 0).sum(), X_test.shape[0])) +print("LinearRegression") +score_estimator(linregr, df_test) + ############################################################################## # # Next we fit the Poisson regressor on the target variable, @@ -186,15 +186,7 @@ def load_mtpl2(n_samples=100000): glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) print("PoissonRegressor") -print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, glm_freq.predict(X_test), - df_test.Exposure.values)) -print("MAE: %.3f" % mean_absolute_error( - df_test.Frequency.values, glm_freq.predict(X_test), - df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, glm_freq.predict(X_test), - df_test.Exposure.values)) +score_estimator(glm_freq, df_test) ############################################################################## # @@ -202,19 +194,13 @@ def load_mtpl2(n_samples=100000): # still minimizes the least square error. 
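As background for the baseline comparison in this example, here is a self-contained sketch of the same kind of scoring on synthetic data (not the MTPL dataset). It assumes the PoissonRegressor estimator and the mean_poisson_deviance metric introduced by this pull request, and the eps clamping mirrors the one used in score_estimator above.

import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_poisson_deviance

rng = np.random.RandomState(0)
X = rng.rand(1000, 2)
y = rng.poisson(lam=np.exp(1 + 2 * X[:, 0]))  # Poisson-distributed counts

# Constant-zero baseline, as in the example, and a Poisson GLM with log link.
dummy = DummyRegressor(strategy='constant', constant=0).fit(X, y)
glm = PoissonRegressor(alpha=1e-4).fit(X, y)

# The Poisson deviance needs strictly positive predictions, hence the clamping.
eps = 1e-5
for name, est in [('constant zero baseline', dummy), ('Poisson GLM', glm)]:
    y_pred = np.fmax(est.predict(X), eps)
    print(name, mean_poisson_deviance(y, y_pred))

A much lower mean deviance for the GLM than for the trivial baseline is the kind of gap the score tables in this example are meant to surface.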
-gbr = GradientBoostingRegressor(max_depth=3) +gbr = GradientBoostingRegressor() gbr.fit(X_train, df_train.Frequency.values, sample_weight=df_train.Exposure.values) print("GradientBoostingRegressor") -print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) -print("MAE: %.3f" % mean_absolute_error( - df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, np.fmax(gbr.predict(X_test), eps), - df_test.Exposure.values)) +score_estimator(gbr, df_test) ############################################################################## # @@ -231,6 +217,7 @@ def load_mtpl2(n_samples=100000): fig, axes = plt.subplots(1, 4, figsize=(16, 3)) +fig.subplots_adjust(bottom=0.2) df_train.Frequency.hist(bins=np.linspace(-1, 10, 50), ax=axes[0]) @@ -247,3 +234,17 @@ def load_mtpl2(n_samples=100000): yscale='log', xlabel="y (Frequency)" ) + +############################################################################## +# +# The experimental data presents a long tail distribution for ``y``. In all +# models we predict the mean expected value, so we will have necessairily fewer +# extreme values. Additionally normal distribution used in ``Ridge`` and +# ``GradientBoostingRegressor`` has a constant variance, while for the Poisson +# distribution used in ``PoissonRegressor``, the variance is proportional to +# the mean predicted value. +# +# Thus, among the considered estimators, +# ``PoissonRegressor`` and ``GradientBoostingRegressor`` are better suited for +# modeling the long tail distribution of the data as compared to the ``Ridge`` +# estimator. From 516eadba0cbddd9c016f66fc6dce41773f0350e7 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 12:28:06 +0300 Subject: [PATCH 150/209] Improvements in plot_tweedie_regression_insurance_claims.py --- ...plot_poisson_regression_non_normal_loss.py | 4 +-- ...lot_tweedie_regression_insurance_claims.py | 30 +++++++++++++++---- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 6a98030817245..9deca6c25032a 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -116,8 +116,8 @@ def load_mtpl2(n_samples=100000): # as a Poisson distribution. It is then assumed to be the number of discrete # events occurring with a constant rate in a given time interval # (``Exposure``). Here we model the frequency ``y = ClaimNb / Exposure``, -# which is still a (scaled) Poisson distribution. -# +# which is still a (scaled) Poisson distribution, and use ``Exposure`` as +# `sample_weight`. df["Frequency"] = df.ClaimNb / df.Exposure diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 24451069eeb68..f866518b69db8 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -80,7 +80,7 @@ def load_mtpl2(n_samples=100000): def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, - title=None, ax=None): + title=None, ax=None, fill_legend=False): """Plot observed and predicted - aggregated per feature level. 
Parameters @@ -95,6 +95,8 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, a column name of df with the observed target predicted : frame a dataframe, with the same index as df, with the predicted target + fill_legend : bool, default=False + wgether to show fill_between legend """ # aggregate observed and predicted variables by feature level df_ = df.loc[:, [feature, weight]].copy() @@ -109,13 +111,15 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, ax = df_.loc[:, ["observed", "predicted"]].plot(style=".", ax=ax) y_max = df_.loc[:, ["observed", "predicted"]].values.max() * 0.8 - ax.fill_between( + p2 = ax.fill_between( df_.index, 0, y_max * df_[weight] / df_[weight].values.max(), color="g", alpha=0.1, ) + if fill_legend: + ax.legend([p2], ["{} distribution".format(feature)]) ax.set( ylabel=y_label if y_label is not None else None, title=title if title is not None else "Train: Observed vs Predicted", @@ -132,7 +136,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # containing the claim amount (``ClaimAmount``) for the same policy ids # (``IDpol``). -df = load_mtpl2(n_samples=100000) +df = load_mtpl2(n_samples=60000) # Note: filter out claims with zero amount, as the severity model # requires a strictly positive target values. @@ -180,9 +184,10 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # as a Poisson distribution. It is then assumed to be the number of discrete # events occuring with a constant rate in a given time interval (``Exposure``). # Here we model the frequency ``y = ClaimNb / Exposure``, -# which is still a (scaled) Poisson distribution. +# which is still a (scaled) Poisson distribution, and use ``Exposure`` as +# `sample_weight`. -df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) +df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) # Some of the features are colinear, we use a weak penalization to avoid # numerical issues. @@ -200,6 +205,7 @@ def mean_deviance(estimator, y, y_pred, weights): def score_estimator( estimator, X_train, X_test, df_train, df_test, target, weights ): + """Evaluate an estimator on train and test sets with different metrics""" res = [] for subset_label, X, df in [ @@ -282,6 +288,7 @@ def score_estimator( y_label="Claim Frequency", title="test data", ax=ax[0, 1], + fill_legend=True ) plot_obs_pred( @@ -293,6 +300,7 @@ def score_estimator( y_label="Claim Frequency", title="test data", ax=ax[1, 0], + fill_legend=True ) plot_obs_pred( @@ -304,11 +312,17 @@ def score_estimator( y_label="Claim Frequency", title="test data", ax=ax[1, 1], + fill_legend=True ) ############################################################################## # +# According to the observed data, the frequency of accidents is higher for +# drivers younger than 30 years old, and it positively correlated with the +# `BonusMalus` variable. Out model is able to mostly correctly model +# this behaviour. +# # 3. Severity model - Gamma Distribution # --------------------------------------- # The mean claim amount or severity (`AvgClaimAmount`) can be empirically @@ -392,11 +406,15 @@ def score_estimator( y_label="Average Claim Severity", title="test data", ax=ax[1], + fill_legend=True ) ############################################################################## # +# Overall the drivers age (``DrivAge``) has a weak impact on the claim +# severity, both in observed and predicted data. +# # 4. 
Total Claims Amount -- Compound Poisson distribution # ------------------------------------------------------- # @@ -517,3 +535,5 @@ def score(self, X, y, sample_weight=None): ) print(pd.DataFrame(res).set_index("subset").T) + +plt.show() From 5e14928507f04893d131942c57e566c5a5789517 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 12:35:28 +0300 Subject: [PATCH 151/209] Drop unused ExponentialDispersionModel._upper_bound --- sklearn/linear_model/_glm/distribution.py | 7 +------ sklearn/linear_model/_glm/tests/test_distribution.py | 5 ----- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 8779cd616cc5a..e3dfd1fcd3cf5 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -63,11 +63,7 @@ def in_y_range(self, y): y : array, shape (n_samples,) Target values. """ - if hasattr(self, '_upper_bound'): - # All currently supported distributions have an upper bound at - # +inf, however this may need to be implemented for other - # distributions - raise NotImplementedError + # Note that currently supported distributions have +inf upper bound if not isinstance(self._lower_bound, DistributionBoundary): raise TypeError('_lower_bound attribute must be of type ' @@ -236,7 +232,6 @@ class TweedieDistribution(ExponentialDispersionModel): For ``0 Date: Wed, 28 Aug 2019 13:10:55 +0300 Subject: [PATCH 152/209] Move notes and references from docstrings to user manual --- doc/modules/linear_model.rst | 12 ++++ sklearn/linear_model/_glm/glm.py | 117 ------------------------------- 2 files changed, 12 insertions(+), 117 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index fc92cd2002948..b127a2c345a36 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -986,6 +986,18 @@ of the unit variance function, :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together with :math:`s=\mathrm{exposure}` as sample weights. This is done in both examples linked below. + * The fit itself does not need Y to be from an EDM, but only assumes + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. + * If the target y is a ratio, appropriate sample weights s should be + provided. + As an example, consider Poisson distributed counts z (integers) and + weights s=exposure (time, money, persons years, ...). Then you fit + y = z/s, i.e. ``PoissonRegressor.fit(X, y, sample_weight=s)``. + The weights are necessary for the right (finite sample) mean. + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + in this case one might say that y has a 'scaled' Poisson distributions. + The same holds for other distributions. The estimator can be used as follows:: diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index a7128eb78df07..826f5924776b3 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -114,43 +114,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): n_iter_ : int Actual number of iterations used in solver. - - Notes - ----- - The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see - :ref:`User Guide `. 
- - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - minimizing the deviance plus penalty term, which is equivalent to - (penalized) maximum likelihood estimation. - - For alpha > 0, the feature matrix X should be standardized in order to - penalize features equally strong. Call - :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - - If the target y is a ratio, appropriate sample weights s should be - provided. - As an example, consider Poisson distributed counts z (integers) and - weights s=exposure (time, money, persons years, ...). Then you fit - y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, - sample_weight=s)``. The weights are necessary for the right (finite - sample) mean. - Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, - in this case one might say that y has a 'scaled' Poisson distributions. - The same holds for other distributions. - - References - ---------- - .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, - Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - - .. Jørgensen, B. (1992). The theory of exponential dispersion models - and analysis of deviance. Monografias de matemática, no. 51. See also - `Exponential dispersion model. - `_ """ def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', @@ -487,40 +450,6 @@ class PoissonRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in solver. - - Notes - ----- - The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see - :ref:`User Guide `. - - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - minimizing the deviance plus penalty term, which is equivalent to - (penalized) maximum likelihood estimation. - - For alpha > 0, the feature matrix X should be standardized in order to - penalize features equally strong. - - If the target y is a ratio, appropriate sample weights s should be - provided. - As an example, consider Poisson distributed counts z (integers) and - weights s=exposure (time, money, persons years, ...). Then you fit - y = z/s, i.e. ``PoissonRegressor().fit(X, y, sample_weight=s)``. - The weights are necessary for the right (finite sample) mean. - Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, - in this case one might say that y has a 'scaled' Poisson distributions. - - References - ---------- - .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, - Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - - .. Jørgensen, B. (1992). The theory of exponential dispersion models - and analysis of deviance. Monografias de matemática, no. 51. See also - `Exponential dispersion model. - `_ """ def __init__(self, alpha=1.0, fit_intercept=True, link='log', solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, @@ -605,29 +534,6 @@ class GammaRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in solver. - - Notes - ----- - The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see - :ref:`User Guide `. 
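# The standardization advice in the notes above ("for alpha > 0, call
# StandardScaler before fit") amounts to wrapping the estimator in a pipeline.
# A minimal sketch on synthetic data; only StandardScaler, make_pipeline and
# PoissonRegressor are assumed, nothing specific to this patch.
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import PoissonRegressor

rng = np.random.RandomState(42)
X = rng.normal(scale=[1.0, 100.0], size=(50, 2))  # features on very different scales
y = rng.poisson(np.exp(0.5 * X[:, 0]))

# With standardization the L2 penalty (alpha > 0) weighs both features equally.
model = make_pipeline(StandardScaler(), PoissonRegressor(alpha=0.1))
model.fit(X, y)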
- - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - minimizing the deviance plus penalty term, which is equivalent to - (penalized) maximum likelihood estimation. - - - References - ---------- - .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, - Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - - .. Jørgensen, B. (1992). The theory of exponential dispersion models - and analysis of deviance. Monografias de matemática, no. 51. See also - `Exponential dispersion model. - `_ """ def __init__(self, alpha=1.0, fit_intercept=True, link='log', solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, @@ -732,29 +638,6 @@ class TweedieRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in solver. - - Notes - ----- - The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see - :ref:`User Guide `. - - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - minimizing the deviance plus penalty term, which is equivalent to - (penalized) maximum likelihood estimation. - - - References - ---------- - .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, - Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - - .. Jørgensen, B. (1992). The theory of exponential dispersion models - and analysis of deviance. Monografias de matemática, no. 51. See also - `Exponential dispersion model. - `_ """ def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, link='log', solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, From 752d6aa90bc034426b0348a4fd12f0fc3421027d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 14:01:04 +0300 Subject: [PATCH 153/209] More explanatory comments in the code --- sklearn/linear_model/_glm/glm.py | 22 +++++++++++++++++---- sklearn/linear_model/_glm/tests/test_glm.py | 3 +++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 826f5924776b3..54560dbae0867 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -462,6 +462,10 @@ def __init__(self, alpha=1.0, fit_intercept=True, link='log', @property def family(self): + # We use a property with a setter, since the GLM solver relies + # on self.family attribute, but we can't set it in __init__ according + # to scikit-learn API constraints. This attribute is made read-only + # to disallow changing distribution to other than Poisson. return "poisson" @family.setter @@ -546,6 +550,10 @@ def __init__(self, alpha=1.0, fit_intercept=True, link='log', @property def family(self): + # We use a property with a setter, since the GLM solver relies + # on self.family attribute, but we can't set it in __init__ according + # to scikit-learn API constraints. This attribute is made read-only + # to disallow changing distribution to other than Gamma. return "gamma" @family.setter @@ -571,7 +579,10 @@ class TweedieRegressor(GeneralizedLinearRegressor): Parameters ---------- power : float (default=0) - The variance power: :math:`v(\\mu) = \\mu^{power}`. + The power determines the underlying target distribution. By + definition it links distribution variance (:math:`v`) and + mean (:math:`\\mu`): :math:`v(\\mu) = \\mu^{power}`. 
+ For ``0 glm2.score(X, y) glm2.set_params(max_iter=1000) glm2.fit(X, y) + # The two model are not exactly identical since the lbfgs solver + # computes the approximate hessian from previous iterations, which + # will not be strictly identical in the case of a warm start. assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5) assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) From 38a4ad4e44ed4c4b27c48306afc74b85710dd889 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 14:18:43 +0300 Subject: [PATCH 154/209] Fix requires_positive_y tag --- sklearn/linear_model/_glm/glm.py | 11 ++++++++++- sklearn/linear_model/_glm/tests/test_glm.py | 13 +++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 54560dbae0867..7c518bb3f8149 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -383,7 +383,16 @@ def score(self, X, y, sample_weight=None): return 1 - dev / dev_null def _more_tags(self): - return {"requires_positive_y": True} + # create the _family_instance if fit wasn't called yet. + if hasattr(self, '_family_instance'): + _family_instance = self._family_instance + elif isinstance(self.family, ExponentialDispersionModel): + _family_instance = self.family + elif self.family in EDM_DISTRIBUTIONS: + _family_instance = EDM_DISTRIBUTIONS[self.family]() + else: + raise ValueError + return {"requires_positive_y": not _family_instance.in_y_range(-1.0)} class PoissonRegressor(GeneralizedLinearRegressor): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 93d7ee1f08324..898d3c4edf9c0 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -340,3 +340,16 @@ def test_tweedie_regression_family(regression_data): # TODO: the following should not be allowed # est.family.power = 2 + + +@pytest.mark.parametrize( + 'estimator, value', + [ + (PoissonRegressor(), True), + (GammaRegressor(), True), + (TweedieRegressor(power=1.5), True), + (TweedieRegressor(power=0), False) + ], +) +def test_tags(estimator, value): + assert estimator._get_tags()['requires_positive_y'] is value From c15a1cc573a555cc1a2e35c0063f6795cc2f7d84 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 14:56:06 +0300 Subject: [PATCH 155/209] Remove Link.inverse_derivative2 --- doc/modules/linear_model.rst | 4 ++-- sklearn/linear_model/_glm/link.py | 21 -------------------- sklearn/linear_model/_glm/tests/test_link.py | 7 ------- 3 files changed, 2 insertions(+), 30 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index b127a2c345a36..622ab335059ab 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -919,8 +919,8 @@ The unit deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` likelihood as .. math:: d(y, \mu) = -2\phi\cdot - \left(loglike(y,\mu,\phi) - - loglike(y,y,\phi)\right) + \left( log P(y|\mu,\phi) + - log P(y|y,\phi)\right) The following table lists some specific EDM distributions—all are Tweedie distributions—and some of their properties. 
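This definition can be checked directly for the normal and Poisson cases; the
short sketch below is an added illustration (it relies on ``scipy.stats``
log-densities, nothing from this patch) and recovers the corresponding unit
deviances::

    import numpy as np
    from scipy.stats import norm, poisson

    y, mu, phi = 2.0, 0.5, 1.3

    # Normal EDM: unit deviance (y - mu)**2, dispersion phi = variance.
    d_normal = -2 * phi * (norm.logpdf(y, loc=mu, scale=np.sqrt(phi))
                           - norm.logpdf(y, loc=y, scale=np.sqrt(phi)))
    assert np.isclose(d_normal, (y - mu) ** 2)

    # Poisson EDM (phi = 1): unit deviance 2 * (y * log(y / mu) - y + mu).
    k = 3
    d_poisson = -2 * (poisson.logpmf(k, mu) - poisson.logpmf(k, k))
    assert np.isclose(d_poisson, 2 * (k * np.log(k / mu) - k + mu))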
diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py index 04b485c067cca..b257036ac4727 100644 --- a/sklearn/linear_model/_glm/link.py +++ b/sklearn/linear_model/_glm/link.py @@ -64,17 +64,6 @@ def inverse_derivative(self, lin_pred): """ pass # pragma: no cover - @abstractmethod - def inverse_derivative2(self, lin_pred): - """Compute 2nd derivative of the inverse link function h''(lin_pred). - - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - pass # pragma: no cover - class IdentityLink(Link): """The identity link function g(x)=x.""" @@ -91,9 +80,6 @@ def inverse(self, lin_pred): def inverse_derivative(self, lin_pred): return np.ones_like(lin_pred) - def inverse_derivative2(self, lin_pred): - return np.zeros_like(lin_pred) - class LogLink(Link): """The log link function g(x)=log(x).""" @@ -110,9 +96,6 @@ def inverse(self, lin_pred): def inverse_derivative(self, lin_pred): return np.exp(lin_pred) - def inverse_derivative2(self, lin_pred): - return np.exp(lin_pred) - class LogitLink(Link): """The logit link function g(x)=logit(x).""" @@ -129,7 +112,3 @@ def inverse(self, lin_pred): def inverse_derivative(self, lin_pred): ep = expit(lin_pred) return ep * (1 - ep) - - def inverse_derivative2(self, lin_pred): - ep = expit(lin_pred) - return ep * (1 - ep) * (1 - 2 * ep) diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py index a631509baca79..36219e09b58e3 100644 --- a/sklearn/linear_model/_glm/tests/test_link.py +++ b/sklearn/linear_model/_glm/tests/test_link.py @@ -30,10 +30,3 @@ def test_link_properties(link): # g = link, h = link.inverse assert_allclose(link.derivative(link.inverse(x)), 1 / link.inverse_derivative(x)) - - assert ( - link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) - - # for LogitLink, in the following x should be between 0 and 1 - # assert_almost_equal(link.inverse_derivative(link(x)), - # 1 / link.derivative(x), decimal=decimal) From 37de07b7b78fe7ced78da6ed068fde3e4a08425b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 30 Aug 2019 12:41:44 +0300 Subject: [PATCH 156/209] Rename p to power parameter in mean_tweedie_deviance --- doc/modules/model_evaluation.rst | 45 ++++++++--------- sklearn/metrics/regression.py | 25 +++++----- sklearn/metrics/tests/test_regression.py | 61 ++++++++++++------------ 3 files changed, 67 insertions(+), 64 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 2db5053e08cce..1fe8ece51df67 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -2028,14 +2028,14 @@ Mean Poisson, Gamma, and Tweedie deviances The :func:`mean_tweedie_deviance` function computes the `mean Tweedie deviance error `_ -with power parameter `p`. This is a metric that elicits predicted expectation +with a ``power`` parameter. This is a metric that elicits predicted expectation values of regression targets. Following special cases exist, -- when `p=0` it is equivalent to :func:`mean_squared_error`. -- when `p=1` it is equivalent to :func:`mean_poisson_deviance`. -- when `p=2` it is equivalent to :func:`mean_gamma_deviance`. +- when ``power=0`` it is equivalent to :func:`mean_squared_error`. +- when ``power=1`` it is equivalent to :func:`mean_poisson_deviance`. +- when ``power=2`` it is equivalent to :func:`mean_gamma_deviance`. 
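These special cases can be verified numerically; a small added check on
arbitrary strictly positive values, using only the four metrics named above::

  >>> import numpy as np
  >>> from sklearn.metrics import (mean_tweedie_deviance, mean_squared_error,
  ...                              mean_poisson_deviance, mean_gamma_deviance)
  >>> y_true, y_pred = [3.0, 0.5, 2.0], [2.0, 1.0, 2.5]
  >>> np.isclose(mean_tweedie_deviance(y_true, y_pred, power=0),
  ...            mean_squared_error(y_true, y_pred))
  True
  >>> np.isclose(mean_tweedie_deviance(y_true, y_pred, power=1),
  ...            mean_poisson_deviance(y_true, y_pred))
  True
  >>> np.isclose(mean_tweedie_deviance(y_true, y_pred, power=2),
  ...            mean_gamma_deviance(y_true, y_pred))
  True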
If :math:`\hat{y}_i` is the predicted value of the :math:`i`-th sample, and :math:`y_i` is the corresponding true value, then the mean Tweedie @@ -2046,48 +2046,49 @@ deviance error (D) estimated over :math:`n_{\text{samples}}` is defined as \text{D}(y, \hat{y}) = \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples} - 1} \begin{cases} - (y_i-\hat{y}_i)^2, & \text{for }p=0\text{ (Normal)}\\ - 2(y_i \log(y/\hat{y}_i) + \hat{y}_i - y_i), & \text{for }p=1\text{ (Poisson)}\\ - 2(\log(\hat{y}_i/y_i) + y_i/\hat{y}_i - 1), & \text{for }p=2\text{ (Gamma)}\\ + (y_i-\hat{y}_i)^2, & \text{for }\text{power}=0\text{ (Normal)}\\ + 2(y_i \log(y/\hat{y}_i) + \hat{y}_i - y_i), & \text{for power}=1\text{ (Poisson)}\\ + 2(\log(\hat{y}_i/y_i) + y_i/\hat{y}_i - 1), & \text{for power}=2\text{ (Gamma)}\\ 2\left(\frac{\max(y_i,0)^{2-p}}{(1-p)(2-p)}- \frac{y\,\hat{y}^{1-p}_i}{1-p}+\frac{\hat{y}^{2-p}_i}{2-p}\right), & \text{otherwise} \end{cases} -Tweedie deviance is a homogeneous function of degree ``2-p``. -Thus, Gamma distribution with `p=2` means that simultaneously scaling `y_true` -and `y_pred` has no effect on the deviance. For Poisson distribution `p=1` -the deviance scales linearly, and for Normal distribution (`p=0`), -quadratically. In general, the higher `p` the less weight is given to extreme -deviations between true and predicted targets. +Tweedie deviance is a homogeneous function of degree ``2-power``. +Thus, Gamma distribution with ``power=2`` means that simultaneously scaling +``y_true`` and ``y_pred`` has no effect on the deviance. For Poisson +distribution ``power=1`` the deviance scales linearly, and for Normal +distribution (``power=0``), quadratically. In general, the higher +``power`` the less weight is given to extreme deviations between true +and predicted targets. For instance, let's compare the two predictions 1.0 and 100 that are both 50% of their corresponding true value. -The mean squared error (``p=0``) is very sensitive to the +The mean squared error (``power=0``) is very sensitive to the prediction difference of the second point,:: >>> from sklearn.metrics import mean_tweedie_deviance - >>> mean_tweedie_deviance([1.0], [1.5], p=0) + >>> mean_tweedie_deviance([1.0], [1.5], power=0) 0.25 - >>> mean_tweedie_deviance([100.], [150.], p=0) + >>> mean_tweedie_deviance([100.], [150.], power=0) 2500.0 If we increase ``p`` to 1,:: - >>> mean_tweedie_deviance([1.0], [1.5], p=1) + >>> mean_tweedie_deviance([1.0], [1.5], power=1) 0.18... - >>> mean_tweedie_deviance([100.], [150.], p=1) + >>> mean_tweedie_deviance([100.], [150.], power=1) 18.9... -the difference in errors decreases. Finally, by setting, ``p=2``:: +the difference in errors decreases. Finally, by setting, ``power=2``:: - >>> mean_tweedie_deviance([1.0], [1.5], p=2) + >>> mean_tweedie_deviance([1.0], [1.5], power=2) 0.14... - >>> mean_tweedie_deviance([100.], [150.], p=2) + >>> mean_tweedie_deviance([100.], [150.], power=2) 0.14... -we would get identical errors. The deviance when `p=2` is thus only +we would get identical errors. The deviance when ``power=2`` is thus only sensitive to relative errors. .. 
_clustering_metrics: diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index a1ae25fec278c..73db0acc945e6 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -624,7 +624,7 @@ def max_error(y_true, y_pred): return np.max(np.abs(y_true - y_pred)) -def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): +def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0): """Mean Tweedie deviance regression loss. Read more in the :ref:`User Guide `. @@ -640,20 +640,21 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): sample_weight : array-like, shape (n_samples,), optional Sample weights. - p : float, optional - Tweedie power parameter. Either p <= 0 or p >= 1. + power : float, default=0 + Tweedie power parameter. Either power <= 0 or power >= 1. The higher `p` the less weight is given to extreme deviations between true and predicted targets. - - p < 0: Extreme stable distribution. Requires: y_pred > 0. - - p = 0 : Normal distribution, output corresponds to + - power < 0: Extreme stable distribution. Requires: y_pred > 0. + - power = 0 : Normal distribution, output corresponds to mean_squared_error. y_true and y_pred can be any real numbers. - - p = 1 : Poisson distribution. Requires: y_true >= 0 and y_pred > 0. + - power = 1 : Poisson distribution. Requires: y_true >= 0 and + y_pred > 0. - 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0 and y_pred > 0. - - p = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0. - - p = 3 : Inverse Gaussian distribution. Requires: y_true > 0 + - power = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0. + - power = 3 : Inverse Gaussian distribution. Requires: y_true > 0 and y_pred > 0. - otherwise : Positive stable distribution. Requires: y_true > 0 and y_pred > 0. @@ -668,7 +669,7 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): >>> from sklearn.metrics import mean_tweedie_deviance >>> y_true = [2, 0, 1, 4] >>> y_pred = [0.5, 0.5, 2., 2.] - >>> mean_tweedie_deviance(y_true, y_pred, p=1) + >>> mean_tweedie_deviance(y_true, y_pred, power=1) 1.4260... """ from ..linear_model._glm.distribution import TweedieDistribution @@ -682,7 +683,7 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): sample_weight = column_or_1d(sample_weight) sample_weight = sample_weight[:, np.newaxis] - dist = TweedieDistribution(power=p) + dist = TweedieDistribution(power=power) dev = dist.unit_deviance(y_true, y_pred, check_input=True) return np.average(dev, weights=sample_weight) @@ -721,7 +722,7 @@ def mean_poisson_deviance(y_true, y_pred, sample_weight=None): 1.4260... """ return mean_tweedie_deviance( - y_true, y_pred, sample_weight=sample_weight, p=1 + y_true, y_pred, sample_weight=sample_weight, power=1 ) @@ -759,5 +760,5 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None): 1.0568... """ return mean_tweedie_deviance( - y_true, y_pred, sample_weight=sample_weight, p=2 + y_true, y_pred, sample_weight=sample_weight, power=2 ) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 6dc31676357f5..0f987a088bb84 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -36,7 +36,7 @@ def test_regression_metrics(n_samples=50): assert_almost_equal(max_error(y_true, y_pred), 1.) assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2) assert_almost_equal(explained_variance_score(y_true, y_pred), 1.) 
- assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=0), + assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=0), mean_squared_error(y_true, y_pred)) # Tweedie deviance needs positive y_pred, except for p=0, @@ -45,15 +45,15 @@ def test_regression_metrics(n_samples=50): y_true = np.arange(1, 1 + n_samples) y_pred = 2 * y_true n = n_samples - assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=-1), + assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=-1), 5/12 * n * (n**2 + 2 * n + 1)) - assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=1), + assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=1), (n + 1) * (1 - np.log(2))) - assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=2), + assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=2), 2 * np.log(2) - 1) - assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=3/2), + assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=3/2), ((6 * np.sqrt(2) - 8) / n) * np.sqrt(y_true).sum()) - assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=3), + assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=3), np.sum(1 / y_true) / (4 * n)) @@ -101,40 +101,41 @@ def test_regression_metrics_at_limits(): mean_squared_log_error, [1., -2., 3.], [1., 2., 3.]) # Tweedie deviance error - p = -1.2 - assert_allclose(mean_tweedie_deviance([0], [1.], p=p), - 2./(2.-p), rtol=1e-3) + power = -1.2 + assert_allclose(mean_tweedie_deviance([0], [1.], power=power), + 2 / (2 - power), rtol=1e-3) with pytest.raises(ValueError, match="can only be used on strictly positive mu."): - mean_tweedie_deviance([0.], [0.], p=p) - assert_almost_equal(mean_tweedie_deviance([0.], [0.], p=0), 0.00, 2) + mean_tweedie_deviance([0.], [0.], power=power) + assert_almost_equal(mean_tweedie_deviance([0.], [0.], power=0), 0.00, 2) msg = "only be used on non-negative y and strictly positive mu." with pytest.raises(ValueError, match=msg): - mean_tweedie_deviance([0.], [0.], p=1.0) + mean_tweedie_deviance([0.], [0.], power=1.0) - p = 1.5 - assert_allclose(mean_tweedie_deviance([0.], [1.], p=p), 2./(2.-p)) + power = 1.5 + assert_allclose(mean_tweedie_deviance([0.], [1.], power=power), + 2 / (2 - power)) msg = "only be used on non-negative y and strictly positive mu." with pytest.raises(ValueError, match=msg): - mean_tweedie_deviance([0.], [0.], p=p) - p = 2. - assert_allclose(mean_tweedie_deviance([1.], [1.], p=p), 0.00, + mean_tweedie_deviance([0.], [0.], power=power) + power = 2. + assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) msg = "can only be used on strictly positive y and mu." with pytest.raises(ValueError, match=msg): - mean_tweedie_deviance([0.], [0.], p=p) - p = 3. - assert_allclose(mean_tweedie_deviance([1.], [1.], p=p), + mean_tweedie_deviance([0.], [0.], power=power) + power = 3. + assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) msg = "can only be used on strictly positive y and mu." 
with pytest.raises(ValueError, match=msg): - mean_tweedie_deviance([0.], [0.], p=p) + mean_tweedie_deviance([0.], [0.], power=power) with pytest.raises(ValueError, match="is only defined for power<=0 and power>=1"): - mean_tweedie_deviance([0.], [0.], p=0.5) + mean_tweedie_deviance([0.], [0.], power=0.5) def test__check_reg_targets(): @@ -274,21 +275,21 @@ def test_tweedie_deviance_continuity(): y_true = np.random.RandomState(0).rand(n_samples) + 0.1 y_pred = np.random.RandomState(1).rand(n_samples) + 0.1 - assert_allclose(mean_tweedie_deviance(y_true, y_pred, p=0 - 1e-10), - mean_tweedie_deviance(y_true, y_pred, p=0)) + assert_allclose(mean_tweedie_deviance(y_true, y_pred, power=0 - 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=0)) # Ws we get closer to the limit, with 1e-12 difference the absolute # tolerance to pass the below check increases. There are likely # numerical precision issues on the edges of different definition # regions. - assert_allclose(mean_tweedie_deviance(y_true, y_pred, p=1 + 1e-10), - mean_tweedie_deviance(y_true, y_pred, p=1), + assert_allclose(mean_tweedie_deviance(y_true, y_pred, power=1 + 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=1), atol=1e-6) - assert_allclose(mean_tweedie_deviance(y_true, y_pred, p=2 - 1e-10), - mean_tweedie_deviance(y_true, y_pred, p=2), + assert_allclose(mean_tweedie_deviance(y_true, y_pred, power=2 - 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=2), atol=1e-6) - assert_allclose(mean_tweedie_deviance(y_true, y_pred, p=2 + 1e-10), - mean_tweedie_deviance(y_true, y_pred, p=2), + assert_allclose(mean_tweedie_deviance(y_true, y_pred, power=2 + 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=2), atol=1e-6) From adbf997ba53d68e53a8d5997ffecd98ce4e13863 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 30 Aug 2019 12:07:41 +0200 Subject: [PATCH 157/209] Rename predicted mean mu to y_pred --- sklearn/linear_model/_glm/distribution.py | 173 +++++++++--------- sklearn/linear_model/_glm/glm.py | 37 ++-- sklearn/linear_model/_glm/link.py | 42 ++--- .../_glm/tests/test_distribution.py | 4 +- 4 files changed, 132 insertions(+), 124 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index e3dfd1fcd3cf5..a5e42bcee5d1c 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -28,17 +28,17 @@ def _safe_lin_pred(X, coef): class ExponentialDispersionModel(metaclass=ABCMeta): r"""Base class for reproductive Exponential Dispersion Models (EDM). - The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + The pdf of :math:`Y\sim \mathrm{EDM}(y_\textrm{pred}, \phi)` is given by .. math:: p(y| \theta, \phi) = c(y, \phi) \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) = \tilde{c}(y, \phi) - \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + \exp\left(-\frac{d(y, y_\textrm{pred})}{2\phi}\right) - with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, - variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, - unit variance :math:`v(\mu)` and - unit deviance :math:`d(y,\mu)`. + with mean :math:`\mathrm{E}[Y] = A'(\theta) = y_\textrm{pred}`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(y_\textrm{pred})`, + unit variance :math:`v(y_\textrm{pred})` and + unit deviance :math:`d(y,y_\textrm{pred})`. Methods ------- @@ -75,58 +75,60 @@ def in_y_range(self, y): return np.greater(y, self._lower_bound.value) @abstractmethod - def unit_variance(self, mu): + def unit_variance(self, y_pred): r"""Compute the unit variance function. 
- The unit variance :math:`v(\mu)` determines the variance as - a function of the mean :math:`\mu` by - :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. - It can also be derived from the unit deviance :math:`d(y,\mu)` as + The unit variance :math:`v(y_\textrm{pred})` determines the variance as + a function of the mean :math:`y_\textrm{pred}` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(y_\textrm{pred}_i)`. + It can also be derived from the unit deviance + :math:`d(y,y_\textrm{pred})` as - .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ - \partial\mu^2}}\big|_{y=\mu} + .. math:: v(y_\textrm{pred}) = \frac{2}{ + \frac{\partial^2 d(y,y_\textrm{pred})}{ + \partialy_\textrm{pred}^2}}\big|_{y=y_\textrm{pred}} See also :func:`variance`. Parameters ---------- - mu : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Predicted mean. """ pass # pragma: no cover @abstractmethod - def unit_variance_derivative(self, mu): - r"""Compute the derivative of the unit variance w.r.t. mu. + def unit_variance_derivative(self, y_pred): + r"""Compute the derivative of the unit variance w.r.t. y_pred. - Return :math:`v'(\mu)`. + Return :math:`v'(y_\textrm{pred})`. Parameters ---------- - mu : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Target values. """ pass # pragma: no cover @abstractmethod - def unit_deviance(self, y, mu, check_input=False): + def unit_deviance(self, y, y_pred, check_input=False): r"""Compute the unit deviance. - The unit_deviance :math:`d(y,\mu)` can be defined by the + The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the log-likelihood as - :math:`d(y,\mu) = -2\phi\cdot - \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + :math:`d(y,y_\textrm{pred}) = -2\phi\cdot + \left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).` Parameters ---------- y : array, shape (n_samples,) Target values. - mu : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Predicted mean. check_input : bool, default=False - If True raise an exception on invalid y or mu values, otherwise + If True raise an exception on invalid y or y_pred values, otherwise they will be propagated as NaN. Returns ------- @@ -135,31 +137,33 @@ def unit_deviance(self, y, mu, check_input=False): """ pass # pragma: no cover - def unit_deviance_derivative(self, y, mu): - r"""Compute the derivative of the unit deviance w.r.t. mu. + def unit_deviance_derivative(self, y, y_pred): + r"""Compute the derivative of the unit deviance w.r.t. y_pred. The derivative of the unit deviance is given by - :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` - with unit variance :math:`v(\mu)`. + :math:`\frac{\partial}{\partialy_\textrm{pred}}d(y,y_\textrm{pred}) + = -2\frac{y-y_\textrm{pred}}{v(y_\textrm{pred})}` + with unit variance :math:`v(y_\textrm{pred})`. Parameters ---------- y : array, shape (n_samples,) Target values. - mu : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Predicted mean. """ - return -2 * (y - mu) / self.unit_variance(mu) + return -2 * (y - y_pred) / self.unit_variance(y_pred) - def deviance(self, y, mu, weights=1): + def deviance(self, y, y_pred, weights=1): r"""Compute the deviance. The deviance is a weighted sum of the per sample unit deviances, - :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` - with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. + :math:`D = \sum_i s_i \cdot d(y_i, y_\textrm{pred}_i)` + with weights :math:`s_i` and unit deviance + :math:`d(y,y_\textrm{pred})`. 
In terms of the log-likelihood it is :math:`D = -2\phi\cdot - \left(loglike(y,\mu,\frac{phi}{s}) + \left(loglike(y,y_\textrm{pred},\frac{phi}{s}) - loglike(y,y,\frac{phi}{s})\right)`. Parameters @@ -167,51 +171,52 @@ def deviance(self, y, mu, weights=1): y : array, shape (n_samples,) Target values. - mu : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Predicted mean. weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. """ - return np.sum(weights * self.unit_deviance(y, mu)) + return np.sum(weights * self.unit_deviance(y, y_pred)) - def deviance_derivative(self, y, mu, weights=1): - """Compute the derivative of the deviance w.r.t. mu. + def deviance_derivative(self, y, y_pred, weights=1): + r"""Compute the derivative of the deviance w.r.t. y_pred. - It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. + It gives :math:`\frac{\partial}{\partial y_\textrm{pred}} + D(y, \y_\textrm{pred}; weights)`. Parameters ---------- y : array, shape (n_samples,) Target values. - mu : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Predicted mean. weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. """ - return weights * self.unit_deviance_derivative(y, mu) + return weights * self.unit_deviance_derivative(y, y_pred) - def _mu_deviance_derivative(self, coef, X, y, weights, link): - """Compute mu and the derivative of the deviance w.r.t coef.""" + def _y_pred_deviance_derivative(self, coef, X, y, weights, link): + """Compute y_pred and the derivative of the deviance w.r.t coef.""" lin_pred = _safe_lin_pred(X, coef) - mu = link.inverse(lin_pred) + y_pred = link.inverse(lin_pred) d1 = link.inverse_derivative(lin_pred) - temp = d1 * self.deviance_derivative(y, mu, weights) + temp = d1 * self.deviance_derivative(y, y_pred, weights) if coef.size == X.shape[1] + 1: devp = np.concatenate(([temp.sum()], temp @ X)) else: devp = temp @ X # same as X.T @ temp - return mu, devp + return y_pred, devp class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. - A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely - defined by it's mean-variance relationship - :math:`\mathrm{Var}[Y] \propto \mu^power`. + A Tweedie distribution with mean :math:`y_\textrm{pred}=\mathrm{E}[Y]` + is uniquely defined by it's mean-variance relationship + :math:`\mathrm{Var}[Y] \propto y_\textrm{pred}^power`. Special cases are: @@ -228,7 +233,7 @@ class TweedieDistribution(ExponentialDispersionModel): ---------- power : float (default=0) The variance power of the `unit_variance` - :math:`v(\mu) = \mu^{power}`. + :math:`v(y_\textrm{pred}) = y_\textrm{pred}^{power}`. 
For ``0 0 - if (mu <= 0).any(): - raise ValueError(message + "strictly positive mu.") + # 'Extreme stable', y any realy number, y_pred > 0 + if (y_pred <= 0).any(): + raise ValueError(message + "strictly positive y_pred.") elif p == 0: - # Normal, y and mu can be any real number + # Normal, y and y_pred can be any real number pass elif 0 < p < 1: raise ValueError("Tweedie deviance is only defined for " "power<=0 and power>=1.") elif 1 <= p < 2: - # Poisson and Compount poisson distribution, y >= 0, mu > 0 - if (y < 0).any() or (mu <= 0).any(): + # Poisson and Compount poisson distribution, y >= 0, y_pred > 0 + if (y < 0).any() or (y_pred <= 0).any(): raise ValueError(message + "non-negative y and strictly " - "positive mu.") + "positive y_pred.") elif p >= 2: - # Gamma and Extreme stable distribution, y and mu > 0 - if (y <= 0).any() or (mu <= 0).any(): - raise ValueError(message + "strictly positive y and mu.") + # Gamma and Extreme stable distribution, y and y_pred > 0 + if (y <= 0).any() or (y_pred <= 0).any(): + raise ValueError(message + + "strictly positive y and y_pred.") else: # pragma: nocover # Unreachable statement raise ValueError if p < 0: - # 'Extreme stable', y any realy number, mu > 0 + # 'Extreme stable', y any realy number, y_pred > 0 dev = 2 * (np.power(np.maximum(y, 0), 2-p) / ((1-p) * (2-p)) - - y * np.power(mu, 1-p) / (1-p) - + np.power(mu, 2-p) / (2-p)) + - y * np.power(y_pred, 1-p) / (1-p) + + np.power(y_pred, 2-p) / (2-p)) elif p == 0: - # Normal distribution, y and mu any real number - dev = (y - mu)**2 + # Normal distribution, y and y_pred any real number + dev = (y - y_pred)**2 elif p < 1: raise ValueError("Tweedie deviance is only defined for power<=0 " "and power>=1.") elif p == 1: # Poisson distribution - dev = 2 * (xlogy(y, y/mu) - y + mu) + dev = 2 * (xlogy(y, y/y_pred) - y + y_pred) elif p == 2: # Gamma distribution - dev = 2 * (np.log(mu/y) + y/mu - 1) + dev = 2 * (np.log(y_pred/y) + y/y_pred - 1) else: dev = 2 * (np.power(y, 2-p) / ((1-p) * (2-p)) - - y * np.power(mu, 1-p) / (1-p) - + np.power(mu, 2-p) / (2-p)) + - y * np.power(y_pred, 1-p) / (1-p) + + np.power(y_pred, 2-p) / (2-p)) return dev diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 7c518bb3f8149..cab89766c0b92 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -31,8 +31,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as mu=h(X*w). Therefore, - the fit minimizes the following objective function with L2 + fitting and predicting the mean of the target y as y_pred=h(X*w). + Therefore, the fit minimizes the following objective function with L2 priors as regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) @@ -66,8 +66,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): link : {'auto', 'identity', 'log'} or an instance of class Link, \ optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor - (X*coef) to expectation (mu). Option 'auto' sets the link depending on - the chosen family as follows: + (X*coef) to expectation (y_pred). Option 'auto' sets the link + depending on the chosen family as follows: - 'identity' for family 'normal' @@ -146,9 +146,9 @@ def fit(self, X, y, sample_weight=None): optional (default=None) Individual weights w_i for each sample. 
Note that for an Exponential Dispersion Model (EDM), one has - Var[Y_i]=phi/w_i * v(mu). - If Y_i ~ EDM(mu, phi/w_i), then - sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a + Var[Y_i]=phi/w_i * v(y_pred). + If Y_i ~ EDM(y_pred, phi/w_i), then + sum(w*Y)/sum(w) ~ EDM(y_pred, phi/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. Returns @@ -267,10 +267,10 @@ def fit(self, X, y, sample_weight=None): if solver == 'lbfgs': def func(coef, X, y, weights, alpha, family, link): - mu, devp = family._mu_deviance_derivative( + y_pred, devp = family._y_pred_deviance_derivative( coef, X, y, weights, link ) - dev = family.deviance(y, mu, weights) + dev = family.deviance(y, y_pred, weights) intercept = (coef.size == X.shape[1] + 1) idx = 1 if intercept else 0 # offset if coef[0] is intercept coef_scaled = alpha * coef[idx:] @@ -337,8 +337,8 @@ def predict(self, X): """ # check_array is done in _linear_predictor eta = self._linear_predictor(X) - mu = self._link_instance.inverse(eta) - return mu + y_pred = self._link_instance.inverse(eta) + return y_pred def score(self, X, y, sample_weight=None): """Compute D^2, the percentage of deviance explained. @@ -376,8 +376,8 @@ def score(self, X, y, sample_weight=None): # TODO: make D^2 a score function in module metrics (and thereby get # input validation and so on) weights = _check_sample_weight(sample_weight, X) - mu = self.predict(X) - dev = self._family_instance.deviance(y, mu, weights=weights) + y_pred = self.predict(X) + dev = self._family_instance.deviance(y, y_pred, weights=weights) y_mean = np.average(y, weights=weights) dev_null = self._family_instance.deviance(y, y_mean, weights=weights) return 1 - dev / dev_null @@ -399,7 +399,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): """Regression with the response variable y following a Poisson distribution GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as mu=h(X*w). + fitting and predicting the mean of the target y as y_pred=h(X*w). The fit minimizes the following objective function with L2 regularization:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 @@ -487,7 +487,7 @@ class GammaRegressor(GeneralizedLinearRegressor): """Regression with the response variable y following a Gamma distribution GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as mu=h(X*w). + fitting and predicting the mean of the target y as y_pred=h(X*w). The fit minimizes the following objective function with L2 regularization:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 @@ -572,10 +572,10 @@ def family(self, value): class TweedieRegressor(GeneralizedLinearRegressor): - """Regression with the response variable y following a Tweedie distribution + r"""Regression with the response variable y following a Tweedie distribution GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as mu=h(X*w). + fitting and predicting the mean of the target y as y_pred=h(X*w). The fit minimizes the following objective function with L2 regularization:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 @@ -590,7 +590,8 @@ class TweedieRegressor(GeneralizedLinearRegressor): power : float (default=0) The power determines the underlying target distribution. 
By definition it links distribution variance (:math:`v`) and - mean (:math:`\\mu`): :math:`v(\\mu) = \\mu^{power}`. + mean (:math:`\y_\textrm{pred}`): + :math:`v(\y_\textrm{pred}) = \y_\textrm{pred}^{power}`. For ``0 Date: Fri, 30 Aug 2019 12:13:43 +0200 Subject: [PATCH 158/209] Fix link parameter documentation in TweedieRegression --- sklearn/linear_model/_glm/glm.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index cab89766c0b92..d6da8b8b80949 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -460,12 +460,12 @@ class PoissonRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, link='log', + def __init__(self, alpha=1.0, fit_intercept=True, solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, - family="poisson", link=link, + family="poisson", link='log', solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, copy_X=copy_X, verbose=verbose) @@ -548,12 +548,12 @@ class GammaRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, link='log', - solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, + def __init__(self, alpha=1.0, fit_intercept=True, solver='lbfgs', + max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, - family="gamma", link=link, + family="gamma", link='log', solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, copy_X=copy_X, verbose=verbose) @@ -619,6 +619,15 @@ class TweedieRegressor(GeneralizedLinearRegressor): case, the design matrix X must have full column rank (no collinearities). + link : {'auto', 'identity', 'log'}, default='auto' + The link function of the GLM, i.e. mapping from linear predictor + (X*coef) to expectation (y_pred). Option 'auto' sets the link + depending on the chosen family as follows: + + - 'identity' for Normal distribution + + - 'log' for Poisson, Gamma or Inverse Gaussian distributions + fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). @@ -659,9 +668,9 @@ class TweedieRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in solver. 
""" - def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, link='log', - solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, - copy_X=True, check_input=True, verbose=0): + def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, + link='auto', solver='lbfgs', max_iter=100, tol=1e-4, + warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family=TweedieDistribution(power=power), link=link, From 3b526e98ce11514090cbdad890f3c2d13664cc83 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 30 Aug 2019 12:44:05 +0200 Subject: [PATCH 159/209] EXA Use a simpler pipeline for GBDT in poisson regression example --- ...plot_poisson_regression_non_normal_loss.py | 66 ++++++++++++++----- 1 file changed, 48 insertions(+), 18 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 9deca6c25032a..7ab772fb59920 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -34,6 +34,7 @@ from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import OrdinalEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer from sklearn.ensemble import GradientBoostingRegressor @@ -134,10 +135,14 @@ def load_mtpl2(n_samples=100000): # To evaluate the pertinence of the used metrics, we will consider as a # baseline an estimator that returns 0 for any input. -df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) +df_train, df_test = train_test_split(df, random_state=0) -dummy = DummyRegressor(strategy='constant', constant=0) -dummy.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) +dummy = make_pipeline( + column_trans, + DummyRegressor(strategy='constant', constant=0) +) +dummy.fit(df_train, df_train.Frequency, + dummyregressor__sample_weight=df_train.Exposure) ############################################################################## # @@ -149,14 +154,14 @@ def score_estimator(estimator, df_test, eps=1e-5): """Score an estimatr on the test set""" print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, estimator.predict(X_test), + df_test.Frequency.values, estimator.predict(df_test), df_test.Exposure.values)) print("MAE: %.3f" % mean_absolute_error( - df_test.Frequency.values, estimator.predict(X_test), + df_test.Frequency.values, estimator.predict(df_test), df_test.Exposure.values)) print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, np.fmax(estimator.predict(X_test), eps), + df_test.Frequency.values, np.fmax(estimator.predict(df_test), eps), df_test.Exposure.values)) @@ -168,12 +173,14 @@ def score_estimator(estimator, df_test, eps=1e-5): # We start by modeling the target variable with the least squares linear # regression model, -linregr = LinearRegression() -linregr.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) +linregr = make_pipeline(column_trans, LinearRegression()) +linregr.fit(df_train, df_train.Frequency, + linearregression__sample_weight=df_train.Exposure) print('Number Negatives: %s / total: %s' % ( - (linregr.predict(X_test) < 0).sum(), X_test.shape[0])) + (linregr.predict(df_train) < 0).sum(), + df_train.shape[0])) print("LinearRegression") score_estimator(linregr, 
df_test) @@ -182,8 +189,12 @@ def score_estimator(estimator, df_test, eps=1e-5): # # Next we fit the Poisson regressor on the target variable, -glm_freq = PoissonRegressor(alpha=0, max_iter=1000) -glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) +glm_freq = make_pipeline( + column_trans, + PoissonRegressor(alpha=0, max_iter=1000) +) +glm_freq.fit(df_train, df_train.Frequency, + poissonregressor__sample_weight=df_train.Exposure) print("PoissonRegressor") score_estimator(glm_freq, df_test) @@ -191,12 +202,31 @@ def score_estimator(estimator, df_test, eps=1e-5): ############################################################################## # # Finally we will consider a non linear model with Gradient boosting that -# still minimizes the least square error. - - -gbr = GradientBoostingRegressor() -gbr.fit(X_train, df_train.Frequency.values, - sample_weight=df_train.Exposure.values) +# still minimizes the least square error. Gradient Boostring Decision Trees do +# not require for categorical data to be one hot encoded, therefore here we use +# a simpler pre-processing pipeline without ``KBinsDiscretizer`` and with +# ``OrdinalEncoder`` instead of ``OneHotEncoder``. + + +gbr = make_pipeline( + ColumnTransformer( + [ + ( + "Veh_Brand_Gas_Region", + OrdinalEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"], + ), + ("Continious", "passthrough", ["VehAge", "DrivAge", "BonusMalus"]), + ("Density_log", make_pipeline( + FunctionTransformer(np.log, validate=False), StandardScaler()), + ["Density"]), + ], + remainder="drop", + ), + GradientBoostingRegressor() +) +gbr.fit(df_train, df_train.Frequency.values, + gradientboostingregressor__sample_weight=df_train.Exposure.values) print("GradientBoostingRegressor") @@ -224,7 +254,7 @@ def score_estimator(estimator, df_test, eps=1e-5): axes[0].set_title('Experimental data') for idx, model in enumerate([linregr, glm_freq, gbr]): - y_pred = model.predict(X_train) + y_pred = model.predict(df_train) pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=axes[idx+1]) axes[idx + 1].set_title(model.__class__.__name__) From b1eb611f1d7364af25f1d8f4222ff0a9cb494efa Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Sep 2019 16:49:52 +0200 Subject: [PATCH 160/209] Minor fixes for user guide --- doc/modules/linear_model.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 622ab335059ab..29c329f6f333a 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -874,7 +874,7 @@ with 'log' loss, which might be even faster but requires more tuning. It is possible to obtain the p-values and confidence intervals for coefficients in cases of regression without penalization. The `statsmodels package ` natively supports this. - Within sklearn, one could use bootstrapping instead as well. + Within sklearn, one could use bootstrapping instead as well. :class:`LogisticRegressionCV` implements Logistic Regression with built-in @@ -919,8 +919,8 @@ The unit deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` likelihood as .. math:: d(y, \mu) = -2\phi\cdot - \left( log P(y|\mu,\phi) - - log P(y|y,\phi)\right) + \left( \log p(y|\mu,\phi) + - \log p(y|y,\phi)\right). The following table lists some specific EDM distributions—all are Tweedie distributions—and some of their properties. 
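Because all of these are Tweedie distributions, the dedicated Poisson and
Gamma regressors correspond to fixed choices of ``power``; for instance a
Poisson fit can also be expressed with :class:`TweedieRegressor` directly. A
small added sketch on synthetic data (agreement is only expected up to the
solver tolerance)::

    import numpy as np
    from sklearn.linear_model import PoissonRegressor, TweedieRegressor

    rng = np.random.RandomState(0)
    X = rng.normal(size=(200, 2))
    y = rng.poisson(np.exp(0.3 * X[:, 0]))

    pois = PoissonRegressor().fit(X, y)
    tweedie = TweedieRegressor(power=1, link='log').fit(X, y)

    # Same family, link and default penalty, hence essentially the same fit.
    assert np.allclose(pois.coef_, tweedie.coef_, rtol=1e-4)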
@@ -939,7 +939,7 @@ Usage ----- In the following use cases, a loss different from the squared loss might be -appropriate, +appropriate: * If the target values :math:`y` are counts (non-negative integer valued) or frequencies (non-negative), you might use a Poisson deviance with log-link. @@ -960,7 +960,7 @@ log-link `link='log'` with :math:`h(x^\top w)=\exp(x^\top w)`. :class:`TweedieRegressor` implements a generalized linear model for the Tweedie distribution, that allows to model any of the above mentioned distributions using the appropriate ``power`` parameter, i.e. the exponent -of the unit variance function, +of the unit variance function: - ``power = 0``: Normal distribution. Specialized solvers such as :class:`Ridge`, :class:`ElasticNet` are generally @@ -987,15 +987,15 @@ of the unit variance function, together with :math:`s=\mathrm{exposure}` as sample weights. This is done in both examples linked below. * The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. - * If the target y is a ratio, appropriate sample weights s should be + the first two moments to be :math:`E[Y_i]=\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\frac{\phi}{s_i} v(\mu_i)`. + * If the target `y` is a ratio, appropriate sample weights ``s`` should be provided. As an example, consider Poisson distributed counts z (integers) and weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``PoissonRegressor.fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + Consider :math:`\bar{y} = \frac{\\sum_i s_i y_i}{\sum_i s_i}`, in this case one might say that y has a 'scaled' Poisson distributions. The same holds for other distributions. From d964c01324a9945253549fbfb2ec1d3b3b18b79f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Sep 2019 19:01:08 +0200 Subject: [PATCH 161/209] EXA Poisson: minor changes --- ...plot_poisson_regression_non_normal_loss.py | 37 +++++++++++-------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 7ab772fb59920..b71fba9236ef5 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -3,11 +3,11 @@ Poisson regression and non-normal loss ====================================== -This example illustrates the use of linear Poisson regression +This example illustrates the use of log-linear Poisson regression on the French Motor Third-Party Liability Claims dataset [1] and compares it with models learned with least squared error. The goal is to predict the -number of insurance claims (or frequency) following car accidents for a user -given historical data over a population of users. +number of insurance claims (or frequency) following car accidents for a +policyholder given historical data over a population of policyholders. We start by defining a few helper functions for loading the data and visualizing results. @@ -48,7 +48,8 @@ def load_mtpl2(n_samples=100000): Parameters ---------- n_samples: int, default=100000 - number of samples to select (for faster run time). + number of samples to select (for faster run time). Full dataset has + 678013 samples. 
""" # freMTPL2freq dataset from https://www.openml.org/d/41214 @@ -76,14 +77,15 @@ def load_mtpl2(n_samples=100000): # 1. Loading datasets and pre-processing # -------------------------------------- # -# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, # containing the number of claims (``ClaimNb``) with the freMTPL2sev table -# containing the claim amount (``ClaimAmount``) for the same user ids. +# containing the claim amount (``ClaimAmount``) for the same policy ids +# (``IDpol``). df = load_mtpl2(n_samples=50000) # Note: filter out claims with zero amount, as the severity model -# requires a strictly positive target values. +# requires strictly positive target values. df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 # correct for unreasonable observations (that might be data error) @@ -116,9 +118,9 @@ def load_mtpl2(n_samples=100000): # The number of claims (``ClaimNb``) is a positive integer that can be modeled # as a Poisson distribution. It is then assumed to be the number of discrete # events occurring with a constant rate in a given time interval -# (``Exposure``). Here we model the frequency ``y = ClaimNb / Exposure``, -# which is still a (scaled) Poisson distribution, and use ``Exposure`` as -# `sample_weight`. +# (``Exposure``, in units of years). Here we model the frequency +# ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, +# and use ``Exposure`` as `sample_weight`. df["Frequency"] = df.ClaimNb / df.Exposure @@ -126,20 +128,23 @@ def load_mtpl2(n_samples=100000): pd.cut(df.Frequency, [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts() ) +print("Average Frequency = {}" + .format(np.average(df.Frequency, weights=df.Exposure))) + ############################################################################## # -# It worth noting that 96 % of users have 0 claims, and if we were to convert -# this problem into a binary classification task, it would be significantly -# imbalanced. +# It worth noting that 96 % of policyholders have zero claims, and if we were +# to convert this problem into a binary classification task, it would be +# significantly imbalanced. # # To evaluate the pertinence of the used metrics, we will consider as a -# baseline an estimator that returns 0 for any input. +# baseline an estimator that returns the mean of the training sample. 
df_train, df_test = train_test_split(df, random_state=0) dummy = make_pipeline( column_trans, - DummyRegressor(strategy='constant', constant=0) + DummyRegressor(strategy='mean') ) dummy.fit(df_train, df_train.Frequency, dummyregressor__sample_weight=df_train.Exposure) @@ -257,7 +262,7 @@ def score_estimator(estimator, df_test, eps=1e-5): y_pred = model.predict(df_train) pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=axes[idx+1]) - axes[idx + 1].set_title(model.__class__.__name__) + axes[idx + 1].set_title(model[-1].__class__.__name__) for axi in axes: axi.set( From a1844b8543a9a43fadfc0523e48cd0135e58ba37 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 2 Sep 2019 20:29:56 +0200 Subject: [PATCH 162/209] Fix mu->y_pred and p->power --- doc/whats_new/v0.22.rst | 16 ++++++++-------- .../plot_tweedie_regression_insurance_claims.py | 4 ++-- sklearn/metrics/regression.py | 4 ++-- sklearn/metrics/scorer.py | 4 ++-- sklearn/metrics/tests/test_common.py | 4 ++-- sklearn/metrics/tests/test_regression.py | 10 +++++----- 6 files changed, 21 insertions(+), 21 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 6e3d4822b261d..d6d52732ba714 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -241,10 +241,10 @@ Changelog :user:`Mohamed Maskani `, and :user:`Thomas Fan `. - |Feature| Add :class:`metrics.mean_tweedie_deviance` measuring the - Tweedie deviance for a power parameter ``p``. Also add mean Poisson deviance - :class:`metrics.mean_poisson_deviance` and mean Gamma deviance + Tweedie deviance for a power parameter ``power``. Also add mean Poisson + deviance :class:`metrics.mean_poisson_deviance` and mean Gamma deviance :class:`metrics.mean_gamma_deviance` that are special cases of the Tweedie - deviance for `p=1` and `p=2` respectively. + deviance for `power=1` and `power=2` respectively. :pr:`13938` by :user:`Christian Lorentzen ` and `Roman Yurchak`_. @@ -306,19 +306,19 @@ Changelog - |Enhancement| SVM now throws more specific error when fit on non-square data and kernel = precomputed. :class:`svm.BaseLibSVM` :pr:`14336` by :user:`Gregory Dexter `. - + :mod:`sklearn.tree` ................... - |Feature| Adds minimal cost complexity pruning, controlled by ``ccp_alpha``, to :class:`tree.DecisionTreeClassifier`, :class:`tree.DecisionTreeRegressor`, :class:`tree.ExtraTreeClassifier`, :class:`tree.ExtraTreeRegressor`, - :class:`ensemble.RandomForestClassifier`, + :class:`ensemble.RandomForestClassifier`, :class:`ensemble.RandomForestRegressor`, - :class:`ensemble.ExtraTreesClassifier`, + :class:`ensemble.ExtraTreesClassifier`, :class:`ensemble.ExtraTreesRegressor`, - :class:`ensemble.RandomTreesEmbedding`, - :class:`ensemble.GradientBoostingClassifier`, + :class:`ensemble.RandomTreesEmbedding`, + :class:`ensemble.GradientBoostingClassifier`, and :class:`ensemble.GradientBoostingRegressor`. :pr:`12887` by `Thomas Fan`_. diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index f866518b69db8..eb9769814ade5 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -426,7 +426,7 @@ class ClaimProdEstimator: """Total claim amount estimator. Computed as the product of the frequency model by the serverity model, - denormalized by exposure. Use Tweedie deviance with `p=1.5`. + denormalized by exposure. Use Tweedie deviance with `power=1.5`. 
""" def __init__(self, est_freq, est_sev): @@ -527,7 +527,7 @@ def score(self, X, y, sample_weight=None): "predicted, frequency*severity model": np.sum( est_prod.predict(X, exposure=df.Exposure.values) ), - "predicted, tweedie, p=%.2f" + "predicted, tweedie, power=%.2f" % glm_total.best_estimator_.family.power: np.sum( glm_total.best_estimator_.predict(X) ), diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index 73db0acc945e6..706c484334d21 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -693,7 +693,7 @@ def mean_poisson_deviance(y_true, y_pred, sample_weight=None): """Mean Poisson deviance regression loss. Poisson deviance is equivalent to the Tweedie deviance with - the power parameter `p=1`. + the power parameter `power=1`. Read more in the :ref:`User Guide `. @@ -730,7 +730,7 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None): """Mean Gamma deviance regression loss. Gamma deviance is equivalent to the Tweedie deviance with - the power parameter `p=2`. It is invariant to scaling of + the power parameter `power=2`. It is invariant to scaling of the target variable, and mesures relative errors. Read more in the :ref:`User Guide `. diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index bf2892bdf83a2..e2496c83b666d 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -499,11 +499,11 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, greater_is_better=False, squared=False) neg_mean_poisson_deviance_scorer = make_scorer( - mean_tweedie_deviance, p=1., greater_is_better=False + mean_tweedie_deviance, power=1., greater_is_better=False ) neg_mean_gamma_deviance_scorer = make_scorer( - mean_tweedie_deviance, p=2., greater_is_better=False + mean_tweedie_deviance, power=2., greater_is_better=False ) # Standard Classification Scores diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 6459f93c68449..a8cabe984e563 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -102,11 +102,11 @@ "median_absolute_error": median_absolute_error, "explained_variance_score": explained_variance_score, "r2_score": partial(r2_score, multioutput='variance_weighted'), - "mean_normal_deviance": partial(mean_tweedie_deviance, p=0), + "mean_normal_deviance": partial(mean_tweedie_deviance, power=0), "mean_poisson_deviance": mean_poisson_deviance, "mean_gamma_deviance": mean_gamma_deviance, "mean_compound_poisson_deviance": - partial(mean_tweedie_deviance, p=1.4), + partial(mean_tweedie_deviance, power=1.4), } CLASSIFICATION_METRICS = { diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 0f987a088bb84..c3947db5ed857 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -105,31 +105,31 @@ def test_regression_metrics_at_limits(): assert_allclose(mean_tweedie_deviance([0], [1.], power=power), 2 / (2 - power), rtol=1e-3) with pytest.raises(ValueError, - match="can only be used on strictly positive mu."): + match="can only be used on strictly positive y_pred."): mean_tweedie_deviance([0.], [0.], power=power) assert_almost_equal(mean_tweedie_deviance([0.], [0.], power=0), 0.00, 2) - msg = "only be used on non-negative y and strictly positive mu." + msg = "only be used on non-negative y and strictly positive y_pred." 
with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=1.0) power = 1.5 assert_allclose(mean_tweedie_deviance([0.], [1.], power=power), 2 / (2 - power)) - msg = "only be used on non-negative y and strictly positive mu." + msg = "only be used on non-negative y and strictly positive y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) power = 2. assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) - msg = "can only be used on strictly positive y and mu." + msg = "can only be used on strictly positive y and y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) power = 3. assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) - msg = "can only be used on strictly positive y and mu." + msg = "can only be used on strictly positive y and y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) From f5133920b47070f5b252dc4d8015745195130c44 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 3 Sep 2019 18:34:08 +0200 Subject: [PATCH 163/209] EXA Tweedie: some improvements --- ...lot_tweedie_regression_insurance_claims.py | 56 +++++++++++-------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index eb9769814ade5..22a26d880a869 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -13,8 +13,8 @@ which are: 1. Model the number of claims with a Poisson distribution, the average - claim amount as a Gamma distribution and multiply the predictions of both in - order to get the total claim amount. + claim amount per claim, also known as severity, as a Gamma distribution and + multiply the predictions of both in order to get the total claim amount. 2. Model total claim amount directly, typically with a Tweedie distribution of Tweedie power :math:`p \\in (1, 2)`. @@ -42,6 +42,7 @@ from sklearn.compose import ColumnTransformer from sklearn.linear_model import PoissonRegressor, GammaRegressor from sklearn.linear_model import TweedieRegressor +from sklearn.metrics import mean_tweedie_deviance from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder @@ -56,7 +57,8 @@ def load_mtpl2(n_samples=100000): Parameters ---------- n_samples: int, default=100000 - number of samples to select (for faster run time). + number of samples to select (for faster run time). Full dataset has + 678013 samples. """ # freMTPL2freq dataset from https://www.openml.org/d/41214 @@ -139,7 +141,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, df = load_mtpl2(n_samples=60000) # Note: filter out claims with zero amount, as the severity model -# requires a strictly positive target values. +# requires strictly positive target values. df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 # Correct for unreasonable observations (that might be data error) @@ -182,10 +184,10 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # # The number of claims (``ClaimNb``) is a positive integer that can be modeled # as a Poisson distribution. 
It is then assumed to be the number of discrete -# events occuring with a constant rate in a given time interval (``Exposure``). -# Here we model the frequency ``y = ClaimNb / Exposure``, -# which is still a (scaled) Poisson distribution, and use ``Exposure`` as -# `sample_weight`. +# events occuring with a constant rate in a given time interval +# (``Exposure``, in units of years). Here we model the frequency +# ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, +# and use ``Exposure`` as `sample_weight`. df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) @@ -197,7 +199,10 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, def mean_deviance(estimator, y, y_pred, weights): if hasattr(estimator, "_family_instance"): - return estimator._family_instance.deviance(y, y_pred, weights) / len(y) + if weights is None: + weights = np.ones_like(y) + return (estimator._family_instance.deviance(y, y_pred, weights) + / np.sum(weights)) else: return np.nan @@ -320,10 +325,10 @@ def score_estimator( # # According to the observed data, the frequency of accidents is higher for # drivers younger than 30 years old, and it positively correlated with the -# `BonusMalus` variable. Out model is able to mostly correctly model +# `BonusMalus` variable. Our model is able to mostly correctly model # this behaviour. # -# 3. Severity model - Gamma Distribution +# 3. Severity model - Gamma distribution # --------------------------------------- # The mean claim amount or severity (`AvgClaimAmount`) can be empirically # shown to follow approximately a Gamma distribution. We fit a GLM model for @@ -333,7 +338,7 @@ def score_estimator( # # - We filter out ``ClaimAmount == 0`` as the Gamma distribution has support # on :math:`(0, \infty)`, not :math:`[0, \infty)`. -# - We use ``ClaimNb`` as sample weights. +# - We use ``ClaimNb`` as `sample_weight`. mask_train = df_train["ClaimAmount"] > 0 mask_test = df_test["ClaimAmount"] > 0 @@ -360,6 +365,8 @@ def score_estimator( ############################################################################## # +# Here, the scores for the test data call for caution as they are significantly +# worse than for the training data indicating an overfit. # Note that the resulting model is the average claim amount per claim. As such, # it is conditional on having at least one claim, and cannot be used to predict # the average claim amount per policy in general. @@ -412,10 +419,10 @@ def score_estimator( ############################################################################## # -# Overall the drivers age (``DrivAge``) has a weak impact on the claim +# Overall, the drivers age (``DrivAge``) has a weak impact on the claim # severity, both in observed and predicted data. # -# 4. Total Claims Amount -- Compound Poisson distribution +# 4. Total claim amount -- Compound Poisson distribution # ------------------------------------------------------- # # As mentionned in the introduction, the total claim amount can be modeled @@ -426,12 +433,16 @@ class ClaimProdEstimator: """Total claim amount estimator. Computed as the product of the frequency model by the serverity model, - denormalized by exposure. Use Tweedie deviance with `power=1.5`. + denormalized by exposure. For scores, use Tweedie deviance with + `power=1.5`. 
""" def __init__(self, est_freq, est_sev): + from sklearn.linear_model._glm.distribution import TweedieDistribution + self.est_freq = est_freq self.est_sev = est_sev + self._family_instance = TweedieDistribution(power=1.5) def predict(self, X, exposure): """Predict the total claim amount. @@ -442,14 +453,13 @@ def predict(self, X, exposure): def score(self, X, y, sample_weight=None): """Compute D², the percentage of deviance explained.""" - # TODO: remove this private import once d2_score is available - from sklearn.linear_model._glm.distribution import TweedieDistribution - + # TODO: use d2_score directly once it is available mu = self.predict(X, exposure=sample_weight) - family = TweedieDistribution(power=1.5) - dev = family.deviance(y, mu, weights=sample_weight) - y_mean = np.average(y, weights=sample_weight) - dev_null = family.deviance(y, y_mean, weights=sample_weight) + dev = mean_tweedie_deviance( + y, mu, sample_weight=sample_weight, power=1.5) + y_mean = np.average(y, weights=sample_weight) * np.ones_like(y) + dev_null = mean_tweedie_deviance( + y, y_mean, sample_weight=sample_weight, power=1.5) return 1. - dev / dev_null @@ -475,7 +485,7 @@ def score(self, X, y, sample_weight=None): from sklearn.model_selection import GridSearchCV -# exclude upper bound as power=2 does not support null y values. +# exclude upper bound as power>=2 does not support y=0. params = {"power": np.linspace(1 + 1e-4, 2 - 1e-4, 8)} From 84229a6d5fbe6cf9964f573496c66fe8c88bd2ab Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 3 Sep 2019 21:19:44 +0200 Subject: [PATCH 164/209] Fix doc test --- doc/modules/linear_model.rst | 2 +- .../linear_model/plot_poisson_regression_non_normal_loss.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 29c329f6f333a..ee418af1d414b 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -1004,7 +1004,7 @@ The estimator can be used as follows:: >>> from sklearn.linear_model import TweedieRegressor >>> reg = TweedieRegressor(power=1, alpha=0.5, link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) - TweedieRegressor(alpha=0.5, power=1) + TweedieRegressor(alpha=0.5, link='log', power=1) >>> reg.coef_ array([0.2463..., 0.4337...]) >>> reg.intercept_ diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index b71fba9236ef5..d739c37d2bb60 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -206,8 +206,8 @@ def score_estimator(estimator, df_test, eps=1e-5): ############################################################################## # -# Finally we will consider a non linear model with Gradient boosting that -# still minimizes the least square error. Gradient Boostring Decision Trees do +# Finally, we will consider a non linear model with Gradient boosting that +# still minimizes the least square error. Gradient Boosting Decision Trees do # not require for categorical data to be one hot encoded, therefore here we use # a simpler pre-processing pipeline without ``KBinsDiscretizer`` and with # ``OrdinalEncoder`` instead of ``OneHotEncoder``. 
From 8c6c255cbad9cae5c82b5154f94f9a6a14cc6b3a Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 11 Sep 2019 15:20:10 +0200 Subject: [PATCH 165/209] Fix test --- sklearn/metrics/tests/test_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 17bf7f828948c..f29e7d2ad1c13 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -118,7 +118,7 @@ def test_regression_metrics_at_limits(): power = 1.5 assert_allclose(mean_tweedie_deviance([0.], [1.], power=power), 2 / (2 - power)) - msg = "only be used on non-negative y_true and strictly positive y_pred." + msg = "only be used on non-negative y and strictly positive y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) power = 2. From 0a2331385daca833429b82a3493471cb45329ac4 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 12 Sep 2019 17:17:41 +0200 Subject: [PATCH 166/209] EXA Use Ridge and remove eps --- ...plot_poisson_regression_non_normal_loss.py | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index d739c37d2bb60..4fb16f6419209 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -30,7 +30,7 @@ from sklearn.datasets import fetch_openml from sklearn.dummy import DummyRegressor from sklearn.compose import ColumnTransformer -from sklearn.linear_model import PoissonRegressor, LinearRegression +from sklearn.linear_model import Ridge, PoissonRegressor from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder @@ -152,22 +152,28 @@ def load_mtpl2(n_samples=100000): ############################################################################## # # The Poisson deviance cannot be computed on negative values predicted by the -# model, so we set the minimum predicted value to eps, +# model, so all models need to return positive preditions if we intend to +# use this metric, -def score_estimator(estimator, df_test, eps=1e-5): +def score_estimator(estimator, df_test): """Score an estimatr on the test set""" + y_pred = estimator.predict(df_test) + print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, estimator.predict(df_test), + df_test.Frequency.values, y_pred, df_test.Exposure.values)) print("MAE: %.3f" % mean_absolute_error( - df_test.Frequency.values, estimator.predict(df_test), + df_test.Frequency.values, y_pred, df_test.Exposure.values)) + # ignore negative predictions + mask = y_pred > 0 + print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, np.fmax(estimator.predict(df_test), eps), - df_test.Exposure.values)) + df_test.Frequency.values[mask], y_pred[mask], + df_test.Exposure.values[mask])) print("DummyRegressor") @@ -178,16 +184,16 @@ def score_estimator(estimator, df_test, eps=1e-5): # We start by modeling the target variable with the least squares linear # regression model, -linregr = make_pipeline(column_trans, LinearRegression()) +linregr = make_pipeline(column_trans, Ridge(alpha=1.0)) linregr.fit(df_train, df_train.Frequency, - linearregression__sample_weight=df_train.Exposure) + 
ridge__sample_weight=df_train.Exposure) print('Number Negatives: %s / total: %s' % ( (linregr.predict(df_train) < 0).sum(), df_train.shape[0])) -print("LinearRegression") +print("Ridge") score_estimator(linregr, df_test) ############################################################################## @@ -196,7 +202,7 @@ def score_estimator(estimator, df_test, eps=1e-5): glm_freq = make_pipeline( column_trans, - PoissonRegressor(alpha=0, max_iter=1000) + PoissonRegressor(alpha=1/df_train.shape[0], max_iter=1000) ) glm_freq.fit(df_train, df_train.Frequency, poissonregressor__sample_weight=df_train.Exposure) From 976b436ebbeff0f6d21f9ae06352cb6bb25c174e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 16 Sep 2019 13:26:16 +0200 Subject: [PATCH 167/209] Address comments in plot_poisson_regression_non_normal_loss.py --- ...plot_poisson_regression_non_normal_loss.py | 73 +++++++++++-------- 1 file changed, 41 insertions(+), 32 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 4fb16f6419209..fa1bc09a76285 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -23,6 +23,8 @@ # Authors: Christian Lorentzen # Roman Yurchak # License: BSD 3 clause +import warnings + import numpy as np import matplotlib.pyplot as plt import pandas as pd @@ -36,7 +38,7 @@ from sklearn.preprocessing import FunctionTransformer, OneHotEncoder from sklearn.preprocessing import OrdinalEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer -from sklearn.ensemble import GradientBoostingRegressor +from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_squared_error, mean_absolute_error from sklearn.metrics import mean_poisson_deviance @@ -149,11 +151,6 @@ def load_mtpl2(n_samples=100000): dummy.fit(df_train, df_train.Frequency, dummyregressor__sample_weight=df_train.Exposure) -############################################################################## -# -# The Poisson deviance cannot be computed on negative values predicted by the -# model, so all models need to return positive preditions if we intend to -# use this metric, def score_estimator(estimator, df_test): @@ -168,11 +165,17 @@ def score_estimator(estimator, df_test): df_test.Frequency.values, y_pred, df_test.Exposure.values)) - # ignore negative predictions + # ignore negative predictions, as they are invalid for + # the Poisson deviance mask = y_pred > 0 + if (~mask).any(): + warnings.warn("estimator yields negative predictions for {} samples " + "out of {}. 
These will be ignored while computing the " + "poisson deviance".format((~mask).sum(), mask.shape[0])) print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values[mask], y_pred[mask], + df_test.Frequency.values[mask], + y_pred[mask], df_test.Exposure.values[mask])) @@ -184,14 +187,21 @@ def score_estimator(estimator, df_test): # We start by modeling the target variable with the least squares linear # regression model, -linregr = make_pipeline(column_trans, Ridge(alpha=1.0)) +linregr = make_pipeline( + column_trans, + Ridge(alpha=1.0) +) linregr.fit(df_train, df_train.Frequency, ridge__sample_weight=df_train.Exposure) - -print('Number Negatives: %s / total: %s' % ( - (linregr.predict(df_train) < 0).sum(), - df_train.shape[0])) +############################################################################## +# +# The Poisson deviance cannot be computed on negative values predicted by the +# model. For models that do return a few negative predictions +# (e.g. :class:`linear_model.Ridge`) we ignore the corresponding samples, +# meaning that the obtained Poisson deviance is approximate. An alternative +# apporach could be to use class:`compose.TransformedTargetRegressor` +# meta-estimator to map ``y_pred`` to strictly positive domain. print("Ridge") score_estimator(linregr, df_test) @@ -212,40 +222,38 @@ def score_estimator(estimator, df_test): ############################################################################## # -# Finally, we will consider a non linear model with Gradient boosting that -# still minimizes the least square error. Gradient Boosting Decision Trees do +# Finally, we will consider a non linear model with a random forest that +# still minimizes the least square error. Random forest does # not require for categorical data to be one hot encoded, therefore here we use -# a simpler pre-processing pipeline without ``KBinsDiscretizer`` and with -# ``OrdinalEncoder`` instead of ``OneHotEncoder``. +# a simpler pre-processing pipeline with :class:`preprocessing.OrdinalEncoder`, gbr = make_pipeline( ColumnTransformer( [ ( - "Veh_Brand_Gas_Region", - OrdinalEncoder(), + "Veh_Brand_Gas_Region", OrdinalEncoder(), ["VehBrand", "VehPower", "VehGas", "Region", "Area"], ), - ("Continious", "passthrough", ["VehAge", "DrivAge", "BonusMalus"]), - ("Density_log", make_pipeline( - FunctionTransformer(np.log, validate=False), StandardScaler()), - ["Density"]), + ( + "Continious", "passthrough", + ["VehAge", "DrivAge", "BonusMalus", "Density"] + ), ], remainder="drop", ), - GradientBoostingRegressor() + RandomForestRegressor(min_weight_fraction_leaf=1e-2) ) gbr.fit(df_train, df_train.Frequency.values, - gradientboostingregressor__sample_weight=df_train.Exposure.values) + randomforestregressor__sample_weight=df_train.Exposure.values) -print("GradientBoostingRegressor") +print("RandomForestRegressor") score_estimator(gbr, df_test) ############################################################################## # -# In this example, although Gradient boosting minimizes the least square error, +# In this example, although random forest minimizes the least square error, # because of a higher predictive power it also results in a smaller Poisson # deviance than the Poisson regression model. # @@ -281,11 +289,12 @@ def score_estimator(estimator, df_test): # The experimental data presents a long tail distribution for ``y``. In all # models we predict the mean expected value, so we will have necessairily fewer # extreme values. 
Additionally normal distribution used in ``Ridge`` and -# ``GradientBoostingRegressor`` has a constant variance, while for the Poisson +# ``RandomForestRegressor`` has a constant variance, while for the Poisson # distribution used in ``PoissonRegressor``, the variance is proportional to # the mean predicted value. # -# Thus, among the considered estimators, -# ``PoissonRegressor`` and ``GradientBoostingRegressor`` are better suited for -# modeling the long tail distribution of the data as compared to the ``Ridge`` -# estimator. +# Thus, among the considered estimators, ``PoissonRegressor`` is better suited +# for modeling the long tail distribution of the data as compared to the +# ``Ridge`` and ``RandomForestRegressor`` estimators. + +plt.show() From 7c850d1a7c9a5f468ee0136ee91e2d662aefa4da Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 16 Sep 2019 13:28:06 +0200 Subject: [PATCH 168/209] Lint --- examples/linear_model/plot_poisson_regression_non_normal_loss.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index fa1bc09a76285..5098016f22913 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -152,7 +152,6 @@ def load_mtpl2(n_samples=100000): dummyregressor__sample_weight=df_train.Exposure) - def score_estimator(estimator, df_test): """Score an estimatr on the test set""" From f64dc4a4c51128c6d1017911e23760a866ca4007 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 16 Sep 2019 15:24:30 +0200 Subject: [PATCH 169/209] Simplify plot_tweedie_regression_insurance_claims.py example --- ...plot_poisson_regression_non_normal_loss.py | 2 - ...lot_tweedie_regression_insurance_claims.py | 85 ++++++------------- 2 files changed, 25 insertions(+), 62 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 5098016f22913..769f321ff1562 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -295,5 +295,3 @@ def score_estimator(estimator, df_test): # Thus, among the considered estimators, ``PoissonRegressor`` is better suited # for modeling the long tail distribution of the data as compared to the # ``Ridge`` and ``RandomForestRegressor`` estimators. 
- -plt.show() diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 22a26d880a869..5a312f656d9ce 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -197,16 +197,6 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) -def mean_deviance(estimator, y, y_pred, weights): - if hasattr(estimator, "_family_instance"): - if weights is None: - weights = np.ones_like(y) - return (estimator._family_instance.deviance(y, y_pred, weights) - / np.sum(weights)) - else: - return np.nan - - def score_estimator( estimator, X_train, X_test, df_train, df_test, target, weights ): @@ -221,18 +211,25 @@ def score_estimator( for score_label, metric in [ ("D² explained", None), - ("mean deviance", partial(mean_deviance, estimator)), + ("mean deviance", mean_tweedie_deviance), ("mean abs. error", mean_absolute_error), ("mean squared error", mean_squared_error), ]: - if estimator.__class__.__name__ == "ClaimProdEstimator": - # ClaimProdEstimator is the product of frequency and severity - # models, denormalized by the exposure values. - # It does not fully follow the scikit-learn API and we - # must handle it separately. - y_pred = estimator.predict(X, exposure=df.Exposure.values) + if isinstance(estimator, tuple) and len(estimator) == 2: + # Score the model consisting of the product of frequency and + # severity models, denormalized by the exposure values. + est_freq, est_sev = estimator + y_pred = (df.Exposure.values * est_freq.predict(X) + * est_sev.predict(X)) + power = 1.5 else: y_pred = estimator.predict(X) + power = getattr(getattr(estimator, "_family_instance"), + "power") + + if score_label == "mean deviance": + metric = partial(mean_tweedie_deviance, power=power) + if metric is None: if not hasattr(estimator, "score"): continue @@ -248,7 +245,8 @@ def score_estimator( pd.DataFrame(res) .set_index(["metric", "subset"]) .score.unstack(-1) - .round(3) + .round(2) + .loc[:, ['train', 'test']] ) return res @@ -425,48 +423,16 @@ def score_estimator( # 4. Total claim amount -- Compound Poisson distribution # ------------------------------------------------------- # -# As mentionned in the introduction, the total claim amount can be modeled +# As mentioned in the introduction, the total claim amount can be modeled # either as the product of the frequency model by the severity model, +# denormalized by exposure. In the following code sample, the +# ``score_estimator`` is extended to score such a model. The mean deviance +# is computed assuming a Tweedie distribution with ``power=1.5`` to be +# comparable with the model from the following section, -class ClaimProdEstimator: - """Total claim amount estimator. - - Computed as the product of the frequency model by the serverity model, - denormalized by exposure. For scores, use Tweedie deviance with - `power=1.5`. - """ - - def __init__(self, est_freq, est_sev): - from sklearn.linear_model._glm.distribution import TweedieDistribution - - self.est_freq = est_freq - self.est_sev = est_sev - self._family_instance = TweedieDistribution(power=1.5) - - def predict(self, X, exposure): - """Predict the total claim amount. - - The predict method is not compatible with the scikit-learn API. 
- """ - return exposure * self.est_freq.predict(X) * self.est_sev.predict(X) - - def score(self, X, y, sample_weight=None): - """Compute D², the percentage of deviance explained.""" - # TODO: use d2_score directly once it is available - mu = self.predict(X, exposure=sample_weight) - dev = mean_tweedie_deviance( - y, mu, sample_weight=sample_weight, power=1.5) - y_mean = np.average(y, weights=sample_weight) * np.ones_like(y) - dev_null = mean_tweedie_deviance( - y, y_mean, sample_weight=sample_weight, power=1.5) - return 1. - dev / dev_null - - -est_prod = ClaimProdEstimator(glm_freq, glm_sev) - scores = score_estimator( - est_prod, + (glm_freq, glm_sev), X_train, X_test, df_train, @@ -479,7 +445,8 @@ def score(self, X, y, sample_weight=None): ############################################################################## # -# or as a unique Compound Poisson model, also corresponding to a Tweedie model +# Indeed, an alternative approach for modeling the total loss is with a unique +# Compound Poisson model, also corresponding to a Tweedie model # with a power :math:`p \in (1, 2)`. We determine the optimal hyperparameter # ``p`` with a grid search, @@ -535,7 +502,7 @@ def score(self, X, y, sample_weight=None): "subset": subset_label, "observed": df.ClaimAmount.values.sum(), "predicted, frequency*severity model": np.sum( - est_prod.predict(X, exposure=df.Exposure.values) + df.Exposure.values*glm_freq.predict(X)*glm_sev.predict(X) ), "predicted, tweedie, power=%.2f" % glm_total.best_estimator_.family.power: np.sum( @@ -545,5 +512,3 @@ def score(self, X, y, sample_weight=None): ) print(pd.DataFrame(res).set_index("subset").T) - -plt.show() From b1f5bde2ea12f7195d5efcf30d3a28fa9db1ef7f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 18 Sep 2019 16:57:48 +0200 Subject: [PATCH 170/209] Add "lift curve" for model validation in Poisson example --- ...plot_poisson_regression_non_normal_loss.py | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 769f321ff1562..3cdbc7cc1a789 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -39,6 +39,7 @@ from sklearn.preprocessing import OrdinalEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer from sklearn.ensemble import RandomForestRegressor +from sklearn.utils import gen_batches from sklearn.metrics import mean_squared_error, mean_absolute_error from sklearn.metrics import mean_poisson_deviance @@ -295,3 +296,93 @@ def score_estimator(estimator, df_test): # Thus, among the considered estimators, ``PoissonRegressor`` is better suited # for modeling the long tail distribution of the data as compared to the # ``Ridge`` and ``RandomForestRegressor`` estimators. +# +# To ensure that estimators yield reasonable predictions for different +# policyholder types, we can bin test samples according to `y_pred` returned by +# each model. Then for each bin, compare the mean predicted `y_pred`, with +# the mean observed target. + + +def _lift_curve(y_true, y_pred, sample_weights=None, n_bins=100): + """Compare predictions and observations for bins + ordered by y_pred + + We order the samples by ``y_pred`` and split it in bins. + In each bin the observed mean is compared with the predicted + mean. 
+
+    Parameters
+    ----------
+    y_true: array-like of shape (n_samples,)
+        Ground truth (correct) target values.
+    y_pred: array-like of shape (n_samples,)
+        Estimated target values.
+    sample_weight : array-like of shape (n_samples,)
+        Sample weights.
+    n_bins: int
+        number of bins to use
+
+    Returns
+    -------
+    bin_centers: ndarray of shape (n_bins,)
+        bin centers
+    y_true_bin: ndarray of shape (n_bins,)
+        average y_true for each bin
+    y_pred_bin: ndarray of shape (n_bins,)
+        average y_pred for each bin
+    """
+    idx_sort = np.argsort(y_pred)
+
+    bin_centers = np.arange(0, 1, 1/n_bins) + 0.5/n_bins
+
+    y_pred_bin = np.zeros(n_bins)
+    y_true_bin = np.zeros(n_bins)
+    bin_size = len(y_true) // n_bins
+    for n, sl in enumerate(gen_batches(len(y_true), bin_size)):
+        weights = sample_weights[idx_sort][sl]
+        y_pred_bin[n] = np.average(
+            y_pred[idx_sort][sl], weights=weights
+        )
+        y_true_bin[n] = np.average(
+            y_true[idx_sort][sl],
+            weights=weights
+        )
+    return bin_centers, y_true_bin, y_pred_bin
+
+
+fig, ax = plt.subplots(1, 3, figsize=(12, 3.2))
+plt.subplots_adjust(wspace=0.3)
+
+
+for axi, (label, model, color) in zip(ax, [
+    ('Ridge', linregr, 'b'),
+    ('PoissonRegressor', glm_freq, 'k'),
+    ('Random Forest', gbr, 'r')
+]):
+    y_pred = model.predict(df_test)
+
+    q, y_true_seg, y_pred_seg = _lift_curve(
+        df_test.Frequency.values,
+        y_pred,
+        sample_weights=df_test.Exposure.values,
+        n_bins=10)
+
+    axi.plot(q, y_pred_seg, 'o'+color, label="predictions", ms=5)
+    axi.step(q, y_true_seg, '--'+color, label="observations",
+             where='mid')
+    axi.set_xlim(0, 1.0)
+    axi.set(
+        title=label,
+        xlabel='Fraction of samples sorted by y_pred',
+        ylabel='Mean Frequency (y_pred)'
+
+    )
+
+    axi.legend()
+
+
+##############################################################################
+#
+# On the above figure, ``PoissonRegressor`` is the model which presents the
+# best consistency between predicted and observed targets, both for low
+# and high target values.
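
The binning logic introduced by ``_lift_curve`` above can be exercised on synthetic data to see
what a well-calibrated model looks like. The following is a minimal, self-contained sketch (purely
illustrative, not part of the patch series): order the samples by ``y_pred``, split them into bins
of roughly equal size, and compare the exposure-weighted mean prediction with the exposure-weighted
mean observation in each bin.

import numpy as np

rng = np.random.RandomState(0)
n_samples, n_bins = 1000, 5

# hypothetical predicted rates, exposures and Poisson-distributed observations
y_pred = rng.gamma(shape=1.0, scale=0.1, size=n_samples)
exposure = rng.uniform(0.5, 1.0, size=n_samples)
y_true = rng.poisson(y_pred * exposure) / exposure

# bins ordered by y_pred; for a calibrated model both columns roughly agree
for sl in np.array_split(np.argsort(y_pred), n_bins):
    print("mean y_pred: {:.3f}  mean y_true: {:.3f}".format(
        np.average(y_pred[sl], weights=exposure[sl]),
        np.average(y_true[sl], weights=exposure[sl])))
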
From a9ab4e4975d3b8b1bf27e73561bbadc921aac5bf Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 25 Sep 2019 19:24:11 +0200 Subject: [PATCH 171/209] Various improvements to the model comparison example --- ...plot_poisson_regression_non_normal_loss.py | 153 +++++++++--------- 1 file changed, 79 insertions(+), 74 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 3cdbc7cc1a789..5c044d5530bd8 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -39,7 +39,7 @@ from sklearn.preprocessing import OrdinalEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer from sklearn.ensemble import RandomForestRegressor -from sklearn.utils import gen_batches +from sklearn.utils import gen_even_slices from sklearn.metrics import mean_squared_error, mean_absolute_error from sklearn.metrics import mean_poisson_deviance @@ -149,8 +149,8 @@ def load_mtpl2(n_samples=100000): column_trans, DummyRegressor(strategy='mean') ) -dummy.fit(df_train, df_train.Frequency, - dummyregressor__sample_weight=df_train.Exposure) +dummy.fit(df_train, df_train["Frequency"], + dummyregressor__sample_weight=df_train["Exposure"]) def score_estimator(estimator, df_test): @@ -159,11 +159,11 @@ def score_estimator(estimator, df_test): y_pred = estimator.predict(df_test) print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, y_pred, - df_test.Exposure.values)) + df_test["Frequency"], y_pred, + df_test["Exposure"])) print("MAE: %.3f" % mean_absolute_error( - df_test.Frequency.values, y_pred, - df_test.Exposure.values)) + df_test["Frequency"], y_pred, + df_test["Exposure"])) # ignore negative predictions, as they are invalid for # the Poisson deviance @@ -174,12 +174,12 @@ def score_estimator(estimator, df_test): "poisson deviance".format((~mask).sum(), mask.shape[0])) print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values[mask], + df_test["Frequency"][mask], y_pred[mask], - df_test.Exposure.values[mask])) + df_test["Exposure"][mask])) -print("DummyRegressor") +print("Constant mean frequency evaluation:") score_estimator(dummy, df_test) ############################################################################## @@ -187,12 +187,12 @@ def score_estimator(estimator, df_test): # We start by modeling the target variable with the least squares linear # regression model, -linregr = make_pipeline( +ridge = make_pipeline( column_trans, Ridge(alpha=1.0) ) -linregr.fit(df_train, df_train.Frequency, - ridge__sample_weight=df_train.Exposure) +ridge.fit(df_train, df_train["Frequency"], + ridge__sample_weight=df_train["Exposure"]) ############################################################################## # @@ -203,32 +203,33 @@ def score_estimator(estimator, df_test): # apporach could be to use class:`compose.TransformedTargetRegressor` # meta-estimator to map ``y_pred`` to strictly positive domain. 
-print("Ridge") -score_estimator(linregr, df_test) +print("Ridge evaluation:") +score_estimator(ridge, df_test) ############################################################################## # # Next we fit the Poisson regressor on the target variable, -glm_freq = make_pipeline( +poisson = make_pipeline( column_trans, PoissonRegressor(alpha=1/df_train.shape[0], max_iter=1000) ) -glm_freq.fit(df_train, df_train.Frequency, - poissonregressor__sample_weight=df_train.Exposure) +poisson.fit(df_train, df_train["Frequency"], + poissonregressor__sample_weight=df_train["Exposure"]) -print("PoissonRegressor") -score_estimator(glm_freq, df_test) +print("PoissonRegressor evaluation:") +score_estimator(poisson, df_test) ############################################################################## # -# Finally, we will consider a non linear model with a random forest that -# still minimizes the least square error. Random forest does -# not require for categorical data to be one hot encoded, therefore here we use -# a simpler pre-processing pipeline with :class:`preprocessing.OrdinalEncoder`, - - -gbr = make_pipeline( +# Finally, we will consider a non-linear model, namely a random forest. Random +# forests do not require the categorical data to be one-hot encoded, instead +# we encode each category label with an arbirtrary integer using +# :class:`preprocessing.OrdinalEncoder` to make the model faster to train (the +# same information is encoded with a small number of features than with +# one-hot encoding). + +rf = make_pipeline( ColumnTransformer( [ ( @@ -242,53 +243,55 @@ def score_estimator(estimator, df_test): ], remainder="drop", ), - RandomForestRegressor(min_weight_fraction_leaf=1e-2) + RandomForestRegressor(min_weight_fraction_leaf=0.01, n_jobs=2) ) -gbr.fit(df_train, df_train.Frequency.values, - randomforestregressor__sample_weight=df_train.Exposure.values) +rf.fit(df_train, df_train["Frequency"].values, + randomforestregressor__sample_weight=df_train["Exposure"].values) + +print("RandomForestRegressor evaluation:") +score_estimator(rf, df_test) -print("RandomForestRegressor") -score_estimator(gbr, df_test) ############################################################################## # -# In this example, although random forest minimizes the least square error, -# because of a higher predictive power it also results in a smaller Poisson -# deviance than the Poisson regression model. +# The random forest model also minimizes the conditional least square error. +# However because of a higher predictive power it also results in a smaller +# Poisson deviance than the Poisson regression model. # -# Evaluating models with a single train / test split is prone to numerical -# errors, we can verify that we would also get equivalent resuts with the -# cross-validation score. +# Not that Evaluating models with a single train / test split is prone to +# random fluctuations. We can verify that we would also get equivalent +# conclusions with cross-validated performance metrics. 
# -# The difference between these models can also be visualized by comparing the +# The qualitative difference between these models can also be visualized by comparing the # histogram of observed target values with that of predicted values, fig, axes = plt.subplots(1, 4, figsize=(16, 3)) fig.subplots_adjust(bottom=0.2) +n_bins = 20 +df_train["Frequency"].hist(bins=np.linspace(-1, 10, n_bins), ax=axes[0]) -df_train.Frequency.hist(bins=np.linspace(-1, 10, 50), ax=axes[0]) +axes[0].set_title("Data") +axes[0].set_xlabel("y (observed Frequency)") -axes[0].set_title('Experimental data') - -for idx, model in enumerate([linregr, glm_freq, gbr]): +for idx, model in enumerate([ridge, poisson, rf]): y_pred = model.predict(df_train) - pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=axes[idx+1]) + pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), ax=axes[idx+1]) axes[idx + 1].set_title(model[-1].__class__.__name__) for axi in axes: axi.set( yscale='log', - xlabel="y (Frequency)" + xlabel="y_pred (predicted expected Frequency)" ) ############################################################################## # # The experimental data presents a long tail distribution for ``y``. In all -# models we predict the mean expected value, so we will have necessairily fewer -# extreme values. Additionally normal distribution used in ``Ridge`` and +# models we predict the mean expected value, so we will have necessairily +# fewer extreme values. Additionally normal distribution used in ``Ridge`` and # ``RandomForestRegressor`` has a constant variance, while for the Poisson # distribution used in ``PoissonRegressor``, the variance is proportional to # the mean predicted value. @@ -298,14 +301,13 @@ def score_estimator(estimator, df_test): # ``Ridge`` and ``RandomForestRegressor`` estimators. # # To ensure that estimators yield reasonable predictions for different -# policyholder types, we can bin test samples according to `y_pred` returned by -# each model. Then for each bin, compare the mean predicted `y_pred`, with -# the mean observed target. +# policyholder types, we can bin test samples according to `y_pred` returned +# by each model. Then for each bin, compare the mean predicted `y_pred`, with +# the mean observed target: -def _lift_curve(y_true, y_pred, sample_weights=None, n_bins=100): - """Compare predictions and observations for bins - ordered by y_pred +def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100): + """Compare predictions and observations for bins ordered by y_pred We order the samples by ``y_pred`` and split it in bins. 
In each bin the observed mean is compared with the predicted @@ -332,14 +334,12 @@ def _lift_curve(y_true, y_pred, sample_weights=None, n_bins=100): average y_pred for each bin """ idx_sort = np.argsort(y_pred) - bin_centers = np.arange(0, 1, 1/n_bins) + 0.5/n_bins - y_pred_bin = np.zeros(n_bins) y_true_bin = np.zeros(n_bins) - bin_size = len(y_true) // n_bins - for n, sl in enumerate(gen_batches(len(y_true), bin_size)): - weights = sample_weights[idx_sort][sl] + + for n, sl in enumerate(gen_even_slices(len(y_true), n_bins)): + weights = sample_weight[idx_sort][sl] y_pred_bin[n] = np.average( y_pred[idx_sort][sl], weights=weights ) @@ -350,39 +350,44 @@ def _lift_curve(y_true, y_pred, sample_weights=None, n_bins=100): return bin_centers, y_true_bin, y_pred_bin -fig, ax = plt.subplots(1, 3, figsize=(12, 3.2)) +fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.2)) plt.subplots_adjust(wspace=0.3) - -for axi, (label, model, color) in zip(ax, [ - ('Ridge', linregr, 'b'), - ('PoissonRegressor', glm_freq, 'k'), - ('Random Forest', gbr, 'r') +for axi, (label, model) in zip(ax, [ + ('Ridge', ridge), + ('PoissonRegressor', poisson), + ('Random Forest', rf) ]): y_pred = model.predict(df_test) - q, y_true_seg, y_pred_seg = _lift_curve( - df_test.Frequency.values, + q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( + df_test["Frequency"].values, y_pred, - sample_weights=df_test.Exposure.values, - n_bins=10) + sample_weights=df_test["Exposure"].values, + n_bins=5) - axi.plot(q, y_pred_seg, 'o'+color, label="predictions", ms=5) - axi.step(q, y_true_seg, '--'+color, label="observations", - where='mid') + axi.plot(q, y_pred_seg, marker='o', linestyle="-", label="predictions") + axi.plot(q, y_true_seg, marker='x', linestyle="--", label="observations") axi.set_xlim(0, 1.0) + axi.set_ylim(0, 0.3) axi.set( title=label, xlabel='Fraction of samples sorted by y_pred', ylabel='Mean Frequency (y_pred)' ) - axi.legend() ############################################################################## # # On the above figure, ``PoissonRegressor`` is the model which presents the -# best consistency between predicted and observed targets, both for low -# and high target values. +# best consistency between predicted and observed targets, both for low and +# high target values. +# +# The ridge regression model tends to predict very low expected frequencies +# that do not match the data. +# +# The random forest regression model also tends to exaggerate low predicted +# frequencies although to a lower extent than ridge. It also tends to +# exaggerate high frequencies on the other hand. From be7bb67e2859b6c74854aeb942a008c3947757e2 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 25 Sep 2019 19:31:51 +0200 Subject: [PATCH 172/209] Add cumulated claims plot --- ...plot_poisson_regression_non_normal_loss.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 5c044d5530bd8..cbfbfc45dd703 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -391,3 +391,48 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100 # The random forest regression model also tends to exaggerate low predicted # frequencies although to a lower extent than ridge. It also tends to # exaggerate high frequencies on the other hand. 
+ + + +def _cumulated_claims(y_true, y_pred, exposure): + idx_sort = np.argsort(y_pred)[::-1] + sorted_exposure = exposure[idx_sort] + sorted_frequencies = y_true[idx_sort] + cumulated_exposure = np.cumsum(sorted_exposure) + cumulated_exposure /= cumulated_exposure[-1] + cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies) + cumulated_claims /= cumulated_claims[-1] + return cumulated_exposure, cumulated_claims + + +fig, ax = plt.subplots(figsize=(8, 8)) +plt.subplots_adjust(wspace=0.3) + +for (label, model) in [ + ('Ridge', ridge), + ('PoissonRegressor', poisson), + ('Random Forest', rf) +]: + y_pred = model.predict(df_test) + cum_exposure, cum_claims = _cumulated_claims( + df_test["Frequency"].values, + y_pred, + df_test["Exposure"].values) + ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) + +# Oracle model +cum_exposure, cum_claims = _cumulated_claims( + df_test["Frequency"].values, + df_test["Frequency"].values, + df_test["Exposure"].values) +ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label="Oracle") + +# Random Baseline +ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline") +ax.set( + title="Cumulated claims by model", + xlabel='Fraction of cumulated exposure (from riskiest to safest)', + ylabel='Fraction of cumulated number of claims' + +) +ax.legend() From 4125c20c9a5d06a0d3ed8241cceb6aaf582b350f Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 26 Sep 2019 09:05:17 +0200 Subject: [PATCH 173/209] Improve the cumulated nb claims plot --- ...plot_poisson_regression_non_normal_loss.py | 44 +++++++++++++++---- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index cbfbfc45dd703..22a4b419cf483 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -263,8 +263,9 @@ def score_estimator(estimator, df_test): # random fluctuations. We can verify that we would also get equivalent # conclusions with cross-validated performance metrics. # -# The qualitative difference between these models can also be visualized by comparing the -# histogram of observed target values with that of predicted values, +# The qualitative difference between these models can also be visualized by +# comparing the histogram of observed target values with that of predicted +# values, fig, axes = plt.subplots(1, 4, figsize=(16, 3)) @@ -306,7 +307,8 @@ def score_estimator(estimator, df_test): # the mean observed target: -def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100): +def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, + n_bins=100): """Compare predictions and observations for bins ordered by y_pred We order the samples by ``y_pred`` and split it in bins. @@ -363,7 +365,7 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100 q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( df_test["Frequency"].values, y_pred, - sample_weights=df_test["Exposure"].values, + sample_weight=df_test["Exposure"].values, n_bins=5) axi.plot(q, y_pred_seg, marker='o', linestyle="-", label="predictions") @@ -391,11 +393,21 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100 # The random forest regression model also tends to exaggerate low predicted # frequencies although to a lower extent than ridge. 
It also tends to # exaggerate high frequencies on the other hand. - +# +# However for some business applications we are not necessarily interested in +# the the ability of the model in predicting the expected frequency value but +# instead in predicting which customer profiles are the riskiest and which are +# the safest. In this case the model evaluation would cast the problem as a +# ranking problem rather than a regression problem. +# +# To compare the 3 models under this light on, one can plot the fraction +# of cumulated number of claims vs the fraction of cumulated of exposure +# for test samples ordered by the model predictions, from riskiest to safest +# according to each model: def _cumulated_claims(y_true, y_pred, exposure): - idx_sort = np.argsort(y_pred)[::-1] + idx_sort = np.argsort(y_pred)[::-1] # from riskiest to safest sorted_exposure = exposure[idx_sort] sorted_frequencies = y_true[idx_sort] cumulated_exposure = np.cumsum(sorted_exposure) @@ -420,7 +432,7 @@ def _cumulated_claims(y_true, y_pred, exposure): df_test["Exposure"].values) ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) -# Oracle model +# Oracle model: y_pred == y_test cum_exposure, cum_claims = _cumulated_claims( df_test["Frequency"].values, df_test["Frequency"].values, @@ -433,6 +445,22 @@ def _cumulated_claims(y_true, y_pred, exposure): title="Cumulated claims by model", xlabel='Fraction of cumulated exposure (from riskiest to safest)', ylabel='Fraction of cumulated number of claims' - ) ax.legend() + +############################################################################## +# +# This plot reveals that the random forest model is almost uniformly the best +# at sorting customers by risk profiles even if the absolute value of the +# predicted expected frequencies are less well calibrated than for the linear +# Poisson model. +# +# +# All three models are significantly better than chance but also very far from +# making perfect predictions. +# +# This last point is expected due to the nature of the problem: the occurence +# of accidents is mostly dominated by environmental causes that are not +# captured in the columns of the dataset. 
+ +plt.show() From 0070d527e7b3399cae9a6bd6447b4de636191481 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 26 Sep 2019 10:41:02 +0200 Subject: [PATCH 174/209] Fix wrong xlabel in histogram plot --- .../plot_poisson_regression_non_normal_loss.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 22a4b419cf483..47e08b618758c 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -274,16 +274,15 @@ def score_estimator(estimator, df_test): df_train["Frequency"].hist(bins=np.linspace(-1, 10, n_bins), ax=axes[0]) axes[0].set_title("Data") +axes[0].set_yscale('log') axes[0].set_xlabel("y (observed Frequency)") for idx, model in enumerate([ridge, poisson, rf]): y_pred = model.predict(df_train) pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), ax=axes[idx+1]) - axes[idx + 1].set_title(model[-1].__class__.__name__) - -for axi in axes: - axi.set( + axes[idx + 1].set( + title=model[-1].__class__.__name__, yscale='log', xlabel="y_pred (predicted expected Frequency)" ) From 9d6bb5258ac604bc523beb1c8b85357344a9929e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 26 Sep 2019 12:03:51 +0200 Subject: [PATCH 175/209] More example improvements (preprocessors + plots) --- ...plot_poisson_regression_non_normal_loss.py | 130 +++++++++--------- 1 file changed, 63 insertions(+), 67 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 47e08b618758c..76f957c57b6da 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -40,6 +40,7 @@ from sklearn.preprocessing import StandardScaler, KBinsDiscretizer from sklearn.ensemble import RandomForestRegressor from sklearn.utils import gen_even_slices +from sklearn.metrics import auc from sklearn.metrics import mean_squared_error, mean_absolute_error from sklearn.metrics import mean_poisson_deviance @@ -95,26 +96,33 @@ def load_mtpl2(n_samples=100000): df["ClaimNb"] = df["ClaimNb"].clip(upper=4) df["Exposure"] = df["Exposure"].clip(upper=1) -column_trans = ColumnTransformer( +############################################################################## +# +# The remaining columns can be used to predict the frequency of claim events. +# Those columns are very heterogeneous with a mix of categorical and numeric +# variables with different scales, possibly with heavy tails. 
+# +# In order to fit linear models with those predictors it is therefore +# necessary to perform standard feature transformation as follows: + +log_scale_transformer = make_pipeline( + FunctionTransformer(np.log, validate=False), + StandardScaler() +) + +linear_model_preprocessor = ColumnTransformer( [ - ("Veh_Driv_Age", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), - ( - "Veh_Brand_Gas_Region", - OneHotEncoder(), - ["VehBrand", "VehPower", "VehGas", "Region", "Area"], - ), - ("BonusMalus", "passthrough", ["BonusMalus"]), - ( - "Density_log", - make_pipeline( - FunctionTransformer(np.log, validate=False), StandardScaler() - ), - ["Density"], - ), + ("passthrough_numeric", "passthrough", + ["BonusMalus"]), + ("binned_numeric", KBinsDiscretizer(n_bins=10), + ["VehAge", "DrivAge"]), + ("log_scaled_numeric", log_scale_transformer, + ["Density"]), + ("onehot_categorical", OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), ], remainder="drop", ) -X = column_trans.fit_transform(df) ############################################################################## # @@ -141,12 +149,13 @@ def load_mtpl2(n_samples=100000): # significantly imbalanced. # # To evaluate the pertinence of the used metrics, we will consider as a -# baseline an estimator that returns the mean of the training sample. +# baseline an estimator that constantly predicts the mean frequency of the +# training sample. df_train, df_test = train_test_split(df, random_state=0) dummy = make_pipeline( - column_trans, + linear_model_preprocessor, DummyRegressor(strategy='mean') ) dummy.fit(df_train, df_train["Frequency"], @@ -187,10 +196,7 @@ def score_estimator(estimator, df_test): # We start by modeling the target variable with the least squares linear # regression model, -ridge = make_pipeline( - column_trans, - Ridge(alpha=1.0) -) +ridge = make_pipeline(linear_model_preprocessor, Ridge(alpha=1.0)) ridge.fit(df_train, df_train["Frequency"], ridge__sample_weight=df_train["Exposure"]) @@ -211,7 +217,7 @@ def score_estimator(estimator, df_test): # Next we fit the Poisson regressor on the target variable, poisson = make_pipeline( - column_trans, + linear_model_preprocessor, PoissonRegressor(alpha=1/df_train.shape[0], max_iter=1000) ) poisson.fit(df_train, df_train["Frequency"], @@ -229,20 +235,17 @@ def score_estimator(estimator, df_test): # same information is encoded with a small number of features than with # one-hot encoding). 
+rf_preprocessor = ColumnTransformer( + [ + ("categorical", OrdinalEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), + ("numeric", "passthrough", + ["VehAge", "DrivAge", "BonusMalus", "Density"]), + ], + remainder="drop", +) rf = make_pipeline( - ColumnTransformer( - [ - ( - "Veh_Brand_Gas_Region", OrdinalEncoder(), - ["VehBrand", "VehPower", "VehGas", "Region", "Area"], - ), - ( - "Continious", "passthrough", - ["VehAge", "DrivAge", "BonusMalus", "Density"] - ), - ], - remainder="drop", - ), + rf_preprocessor, RandomForestRegressor(min_weight_fraction_leaf=0.01, n_jobs=2) ) rf.fit(df_train, df_train["Frequency"].values, @@ -351,14 +354,10 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, return bin_centers, y_true_bin, y_pred_bin -fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.2)) +fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.5)) plt.subplots_adjust(wspace=0.3) -for axi, (label, model) in zip(ax, [ - ('Ridge', ridge), - ('PoissonRegressor', poisson), - ('Random Forest', rf) -]): +for axi, model in zip(ax, [ridge, poisson, rf]): y_pred = model.predict(df_test) q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( @@ -372,19 +371,19 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, axi.set_xlim(0, 1.0) axi.set_ylim(0, 0.3) axi.set( - title=label, + title=model[-1].__class__.__name__, xlabel='Fraction of samples sorted by y_pred', ylabel='Mean Frequency (y_pred)' ) axi.legend() - +plt.tight_layout() ############################################################################## # # On the above figure, ``PoissonRegressor`` is the model which presents the # best consistency between predicted and observed targets, both for low and -# high target values. +# high predicted target values. # # The ridge regression model tends to predict very low expected frequencies # that do not match the data. @@ -393,16 +392,16 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, # frequencies although to a lower extent than ridge. It also tends to # exaggerate high frequencies on the other hand. # -# However for some business applications we are not necessarily interested in -# the the ability of the model in predicting the expected frequency value but -# instead in predicting which customer profiles are the riskiest and which are -# the safest. In this case the model evaluation would cast the problem as a -# ranking problem rather than a regression problem. +# However, for some business applications, we are not necessarily interested +# in the the ability of the model in predicting the expected frequency value +# but instead in predicting which policyholder groups are the riskiest and +# which are the safest. In this case the model evaluation would cast the +# problem as a ranking problem rather than a regression problem. 
# -# To compare the 3 models under this light on, one can plot the fraction -# of cumulated number of claims vs the fraction of cumulated of exposure -# for test samples ordered by the model predictions, from riskiest to safest -# according to each model: +# To compare the 3 models under this light on, one can plot the fraction of +# cumulated number of claims vs the fraction of cumulated of exposure for test +# samples ordered by the model predictions, from riskiest to safest according +# to each model: def _cumulated_claims(y_true, y_pred, exposure): @@ -417,18 +416,16 @@ def _cumulated_claims(y_true, y_pred, exposure): fig, ax = plt.subplots(figsize=(8, 8)) -plt.subplots_adjust(wspace=0.3) -for (label, model) in [ - ('Ridge', ridge), - ('PoissonRegressor', poisson), - ('Random Forest', rf) -]: +for model in [ridge, poisson, rf]: y_pred = model.predict(df_test) cum_exposure, cum_claims = _cumulated_claims( df_test["Frequency"].values, y_pred, df_test["Exposure"].values) + area = auc(cum_exposure, cum_claims) + label = "{} (area under curve: {:.3f})".format( + model[-1].__class__.__name__, area) ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) # Oracle model: y_pred == y_test @@ -449,17 +446,16 @@ def _cumulated_claims(y_true, y_pred, exposure): ############################################################################## # -# This plot reveals that the random forest model is almost uniformly the best -# at sorting customers by risk profiles even if the absolute value of the -# predicted expected frequencies are less well calibrated than for the linear -# Poisson model. -# +# This plot reveals that the random forest model is slightly better at ranking +# policyholders by risk profiles even if the absolute value of the predicted +# expected frequencies are less well calibrated than for the linear Poisson +# model. # # All three models are significantly better than chance but also very far from # making perfect predictions. # -# This last point is expected due to the nature of the problem: the occurence -# of accidents is mostly dominated by environmental causes that are not +# This last point is expected due to the nature of the problem: the occurrence +# of accidents is mostly dominated by circumstantial causes that are not # captured in the columns of the dataset. plt.show() From b353b2dcdb25bd2e55dffaacda1b80856f4b5b78 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 26 Sep 2019 13:14:09 +0200 Subject: [PATCH 176/209] Simplify dataset + use more data --- ...plot_poisson_regression_non_normal_loss.py | 99 ++++++++----------- 1 file changed, 39 insertions(+), 60 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 76f957c57b6da..da2a4bd2dccf8 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -9,10 +9,6 @@ number of insurance claims (or frequency) following car accidents for a policyholder given historical data over a population of policyholders. -We start by defining a few helper functions for loading the data and -visualizing results. - - .. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor Third-Party Liability Claims (November 8, 2018). 
`doi:10.2139/ssrn.3164764 `_ @@ -46,7 +42,7 @@ from sklearn.metrics import mean_poisson_deviance -def load_mtpl2(n_samples=100000): +def load_mtpl2(n_samples=None): """Fetcher for French Motor Third-Party Liability Claims dataset Parameters @@ -57,43 +53,27 @@ def load_mtpl2(n_samples=100000): """ # freMTPL2freq dataset from https://www.openml.org/d/41214 - df_freq = fetch_openml(data_id=41214, as_frame=True)['data'] - df_freq['IDpol'] = df_freq['IDpol'].astype(np.int) - df_freq.set_index('IDpol', inplace=True) - - # freMTPL2sev dataset from https://www.openml.org/d/41215 - df_sev = fetch_openml(data_id=41215, as_frame=True)['data'] - - # sum ClaimAmount over identical IDs - df_sev = df_sev.groupby('IDpol').sum() - - df = df_freq.join(df_sev, how="left") - df["ClaimAmount"].fillna(0, inplace=True) + df = fetch_openml(data_id=41214, as_frame=True)['data'] # unquote string fields for column_name in df.columns[df.dtypes.values == np.object]: df[column_name] = df[column_name].str.strip("'") - return df.iloc[:n_samples] + if n_samples is not None: + return df.iloc[:n_samples] + return df ############################################################################## # -# 1. Loading datasets and pre-processing -# -------------------------------------- +# Let's load the motor claim dataset. We ignore the severity data for this +# study for the sake of simplicitly. # -# We construct the freMTPL2 dataset by joining the freMTPL2freq table, -# containing the number of claims (``ClaimNb``) with the freMTPL2sev table -# containing the claim amount (``ClaimAmount``) for the same policy ids -# (``IDpol``). - -df = load_mtpl2(n_samples=50000) +# We also subsample the data for the sake of computational cost and running +# time. Using the full dataset would lead to similar conclusions. -# Note: filter out claims with zero amount, as the severity model -# requires strictly positive target values. -df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 +df = load_mtpl2(n_samples=300000) -# correct for unreasonable observations (that might be data error) -df["ClaimNb"] = df["ClaimNb"].clip(upper=4) +# Correct for unreasonable observations (that might be data error) df["Exposure"] = df["Exposure"].clip(upper=1) ############################################################################## @@ -133,14 +113,14 @@ def load_mtpl2(n_samples=100000): # ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, # and use ``Exposure`` as `sample_weight`. -df["Frequency"] = df.ClaimNb / df.Exposure +df["Frequency"] = df["ClaimNb"] / df["Exposure"] print( - pd.cut(df.Frequency, [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts() + pd.cut(df["Frequency"], [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts() ) print("Average Frequency = {}" - .format(np.average(df.Frequency, weights=df.Exposure))) + .format(np.average(df["Frequency"], weights=df["Exposure"]))) ############################################################################## # @@ -262,13 +242,13 @@ def score_estimator(estimator, df_test): # However because of a higher predictive power it also results in a smaller # Poisson deviance than the Poisson regression model. # -# Not that Evaluating models with a single train / test split is prone to -# random fluctuations. We can verify that we would also get equivalent -# conclusions with cross-validated performance metrics. +# Evaluating models with a single train / test split is prone to random +# fluctuations. 
If computation resources allow, it should be verified that +# cross-validated performance metrics would lead to similar conclusions. # # The qualitative difference between these models can also be visualized by # comparing the histogram of observed target values with that of predicted -# values, +# values: fig, axes = plt.subplots(1, 4, figsize=(16, 3)) @@ -293,8 +273,8 @@ def score_estimator(estimator, df_test): ############################################################################## # # The experimental data presents a long tail distribution for ``y``. In all -# models we predict the mean expected value, so we will have necessairily -# fewer extreme values. Additionally normal distribution used in ``Ridge`` and +# models we predict the mean expected value, so we will have necessarily fewer +# extreme values. Additionally normal distribution used in ``Ridge`` and # ``RandomForestRegressor`` has a constant variance, while for the Poisson # distribution used in ``PoissonRegressor``, the variance is proportional to # the mean predicted value. @@ -364,12 +344,12 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, df_test["Frequency"].values, y_pred, sample_weight=df_test["Exposure"].values, - n_bins=5) + n_bins=10) axi.plot(q, y_pred_seg, marker='o', linestyle="-", label="predictions") axi.plot(q, y_true_seg, marker='x', linestyle="--", label="observations") axi.set_xlim(0, 1.0) - axi.set_ylim(0, 0.3) + axi.set_ylim(0, 0.6) axi.set( title=model[-1].__class__.__name__, xlabel='Fraction of samples sorted by y_pred', @@ -381,16 +361,13 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, ############################################################################## # -# On the above figure, ``PoissonRegressor`` is the model which presents the -# best consistency between predicted and observed targets, both for low and -# high predicted target values. -# -# The ridge regression model tends to predict very low expected frequencies -# that do not match the data. +# The ``Ridge`` regression model can predict very low expected frequencies +# that do not match the data. It can therefore severly under-estimate the risk +# for some policyholders. # -# The random forest regression model also tends to exaggerate low predicted -# frequencies although to a lower extent than ridge. It also tends to -# exaggerate high frequencies on the other hand. +# ``PoissonRegressor`` and ``RandomForestRegressor`` show better consistency +# between predicted and observed targets, especially for low predicted target +# values. # # However, for some business applications, we are not necessarily interested # in the the ability of the model in predicting the expected frequency value @@ -399,9 +376,8 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, # problem as a ranking problem rather than a regression problem. 
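# -- Editor's aside, not part of this patch: before turning to the cumulated
# claims plot introduced below, a quick rank-based check is a sketch like the
# following, assuming the fitted ``ridge``, ``poisson`` and ``rf`` pipelines
# and ``df_test`` from above (Spearman's correlation ignores the ``Exposure``
# weights, unlike the plot that follows):
from scipy.stats import spearmanr

for model in [ridge, poisson, rf]:
    rho, _ = spearmanr(df_test["Frequency"], model.predict(df_test))
    print("%s: Spearman rho = %.3f" % (model[-1].__class__.__name__, rho))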
# # To compare the 3 models under this light on, one can plot the fraction of -# cumulated number of claims vs the fraction of cumulated of exposure for test -# samples ordered by the model predictions, from riskiest to safest according -# to each model: +# the number of claims vs the fraction of exposure for test samples ordered by +# the model predictions, from riskiest to safest according to each model: def _cumulated_claims(y_true, y_pred, exposure): @@ -433,16 +409,19 @@ def _cumulated_claims(y_true, y_pred, exposure): df_test["Frequency"].values, df_test["Frequency"].values, df_test["Exposure"].values) -ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label="Oracle") +area = auc(cum_exposure, cum_claims) +label = "Oracle (area under curve: {:.3f})".format(area) +ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label) # Random Baseline -ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline") +ax.plot([0, 1], [0, 1], linestyle="--", color="black", + label="Random baseline") ax.set( - title="Cumulated claims by model", - xlabel='Fraction of cumulated exposure (from riskiest to safest)', - ylabel='Fraction of cumulated number of claims' + title="Cumulated number of claims by model", + xlabel='Fraction of exposure (from riskiest to safest)', + ylabel='Fraction of number of claims' ) -ax.legend() +ax.legend(loc="lower right") ############################################################################## # From 88757fdb99cc516be230fe08ec1ebfb7bea0b694 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 26 Sep 2019 15:59:36 +0200 Subject: [PATCH 177/209] Remove solver parameter from {Poisson,Gamma,Tweedie}Regression --- sklearn/linear_model/_glm/glm.py | 54 +++++++++++--------------------- sklearn/neighbors/base.py | 8 ++--- 2 files changed, 22 insertions(+), 40 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index d6da8b8b80949..f7985c0f3bae3 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -80,7 +80,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Calls scipy's L-BFGS-B optimizer. max_iter : int, optional (default=100) - The maximal number of iterations for solver algorithms. + The maximal number of iterations for the solver. tol : float, optional (default=1e-4) Stopping criterion. For the lbfgs solver, @@ -113,7 +113,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Intercept (a.k.a. bias) added to linear predictor. n_iter_ : int - Actual number of iterations used in solver. + Actual number of iterations used in the solver. """ def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', @@ -423,14 +423,8 @@ class PoissonRegressor(GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - solver : {'lbfgs'}, optional (default='lbfgs') - Algorithm to use in the optimization problem: - - 'lbfgs' - Calls scipy's L-BFGS-B optimizer. - max_iter : int, optional (default=100) - The maximal number of iterations for solver algorithms. + The maximal number of iterations for the solver. tol : float, optional (default=1e-4) Stopping criterion. For the lbfgs solver, @@ -458,16 +452,16 @@ class PoissonRegressor(GeneralizedLinearRegressor): Intercept (a.k.a. bias) added to linear predictor. n_iter_ : int - Actual number of iterations used in solver. + Actual number of iterations used in the solver. 
""" def __init__(self, alpha=1.0, fit_intercept=True, - solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, + max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, - family="poisson", link='log', - solver=solver, max_iter=max_iter, tol=tol, - warm_start=warm_start, copy_X=copy_X, verbose=verbose) + family="poisson", link='log', max_iter=max_iter, + tol=tol, warm_start=warm_start, copy_X=copy_X, + verbose=verbose) @property def family(self): @@ -511,14 +505,8 @@ class GammaRegressor(GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - solver : {'lbfgs'}, optional (default='lbfgs') - Algorithm to use in the optimization problem: - - 'lbfgs' - Calls scipy's L-BFGS-B optimizer. - max_iter : int, optional (default=100) - The maximal number of iterations for solver algorithms. + The maximal number of iterations for the solver. tol : float, optional (default=1e-4) Stopping criterion. For the lbfgs solver, @@ -546,16 +534,16 @@ class GammaRegressor(GeneralizedLinearRegressor): Intercept (a.k.a. bias) added to linear predictor. n_iter_ : int - Actual number of iterations used in solver. + Actual number of iterations used in the solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, solver='lbfgs', + def __init__(self, alpha=1.0, fit_intercept=True, max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, - family="gamma", link='log', - solver=solver, max_iter=max_iter, tol=tol, - warm_start=warm_start, copy_X=copy_X, verbose=verbose) + family="gamma", link='log', max_iter=max_iter, + tol=tol, warm_start=warm_start, copy_X=copy_X, + verbose=verbose) @property def family(self): @@ -632,14 +620,8 @@ class TweedieRegressor(GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - solver : {'lbfgs'}, optional (default='lbfgs') - Algorithm to use in the optimization problem: - - 'lbfgs' - Calls scipy's L-BFGS-B optimizer. - max_iter : int, optional (default=100) - The maximal number of iterations for solver algorithms. + The maximal number of iterations for the solver. tol : float, optional (default=1e-4) Stopping criterion. For the lbfgs solver, @@ -666,15 +648,15 @@ class TweedieRegressor(GeneralizedLinearRegressor): Intercept (a.k.a. bias) added to linear predictor. n_iter_ : int - Actual number of iterations used in solver. + Actual number of iterations used in the solver. 
""" def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, - link='auto', solver='lbfgs', max_iter=100, tol=1e-4, + link='auto', max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family=TweedieDistribution(power=power), link=link, - solver=solver, max_iter=max_iter, tol=tol, + max_iter=max_iter, tol=tol, warm_start=warm_start, copy_X=copy_X, verbose=verbose) @property diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 9548a619b0b14..d178d607d3636 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -289,13 +289,13 @@ def _pairwise(self): return self.metric == 'precomputed' -def _tree_query_parallel_helper(tree, data, n_neighbors, return_distance): +def _tree_query_parallel_helper(tree, data, n_neighbors, return_distance, **kwargs): """Helper for the Parallel calls in KNeighborsMixin.kneighbors The Cython method tree.query is not directly picklable by cloudpickle under PyPy. """ - return tree.query(data, n_neighbors, return_distance) + return tree.query(data, n_neighbors, return_distance, **kwargs) class KNeighborsMixin: @@ -336,7 +336,7 @@ def _kneighbors_reduce_func(self, dist, start, result = neigh_ind return result - def kneighbors(self, X=None, n_neighbors=None, return_distance=True): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True, **kwargs): """Finds the K-neighbors of a point. Returns indices of and distances to the neighbors of each point. @@ -458,7 +458,7 @@ class from an array representing our data set and ask who's parallel_kwargs = {"prefer": "threads"} result = Parallel(n_jobs, **parallel_kwargs)( delayed_query( - self._tree, X[s], n_neighbors, return_distance) + self._tree, X[s], n_neighbors, return_distance, **kwargs) for s in gen_even_slices(X.shape[0], n_jobs) ) else: From 6d119d43577e46f1a0dde29980df20434a920739 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 26 Sep 2019 16:02:15 +0200 Subject: [PATCH 178/209] Revert some accidental changes from 88757fdb99cc516be230fe08ec1ebfb7bea0b694. --- sklearn/neighbors/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index d178d607d3636..9548a619b0b14 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -289,13 +289,13 @@ def _pairwise(self): return self.metric == 'precomputed' -def _tree_query_parallel_helper(tree, data, n_neighbors, return_distance, **kwargs): +def _tree_query_parallel_helper(tree, data, n_neighbors, return_distance): """Helper for the Parallel calls in KNeighborsMixin.kneighbors The Cython method tree.query is not directly picklable by cloudpickle under PyPy. """ - return tree.query(data, n_neighbors, return_distance, **kwargs) + return tree.query(data, n_neighbors, return_distance) class KNeighborsMixin: @@ -336,7 +336,7 @@ def _kneighbors_reduce_func(self, dist, start, result = neigh_ind return result - def kneighbors(self, X=None, n_neighbors=None, return_distance=True, **kwargs): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True): """Finds the K-neighbors of a point. Returns indices of and distances to the neighbors of each point. 
@@ -458,7 +458,7 @@ class from an array representing our data set and ask who's parallel_kwargs = {"prefer": "threads"} result = Parallel(n_jobs, **parallel_kwargs)( delayed_query( - self._tree, X[s], n_neighbors, return_distance, **kwargs) + self._tree, X[s], n_neighbors, return_distance) for s in gen_even_slices(X.shape[0], n_jobs) ) else: From b735eb786ca9ef03a9bbfc25851e62a4fbb71f3b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 26 Sep 2019 16:23:12 +0200 Subject: [PATCH 179/209] Additional comment about the use of properties with setters --- sklearn/linear_model/_glm/distribution.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index a5e42bcee5d1c..5f9e9ed06847c 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -245,6 +245,9 @@ def power(self): @power.setter def power(self, power): + # We use a property with a setter, to update lower and + # upper bound when the power parameter is updated e.g. in grid + # search. if not isinstance(power, numbers.Real): raise TypeError('power must be a real number, input was {0}' .format(power)) From 2d911143067a9ab0fb206cbbb0b13fa228955969 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 26 Sep 2019 16:43:13 +0200 Subject: [PATCH 180/209] Add additional tests for link derivatives --- sklearn/linear_model/_glm/tests/test_link.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py index 36219e09b58e3..27ec4ed19bdc2 100644 --- a/sklearn/linear_model/_glm/tests/test_link.py +++ b/sklearn/linear_model/_glm/tests/test_link.py @@ -4,6 +4,7 @@ import numpy as np from numpy.testing import assert_allclose import pytest +from scipy.optimize import check_grad from sklearn.linear_model._glm.link import ( IdentityLink, @@ -15,12 +16,12 @@ LINK_FUNCTIONS = [IdentityLink, LogLink, LogitLink] -@pytest.mark.parametrize('link', LINK_FUNCTIONS) -def test_link_properties(link): +@pytest.mark.parametrize('Link', LINK_FUNCTIONS) +def test_link_properties(Link): """Test link inverse and derivative.""" rng = np.random.RandomState(42) x = rng.rand(100) * 100 - link = link() # instantiate object + link = Link() if isinstance(link, LogitLink): # careful for large x, note expit(36) = 1 # limit max eta to 15 @@ -30,3 +31,15 @@ def test_link_properties(link): # g = link, h = link.inverse assert_allclose(link.derivative(link.inverse(x)), 1 / link.inverse_derivative(x)) + + +@pytest.mark.parametrize('Link', LINK_FUNCTIONS) +def test_link_derivative(Link): + link = Link() + x = np.random.RandomState(0).rand(1) + err = check_grad(link, link.derivative, x) / link.derivative(x) + assert abs(err) < 1e-6 + + err = (check_grad(link.inverse, link.inverse_derivative, x) + / link.derivative(x)) + assert abs(err) < 1e-6 From 89103bc417646864a1aa85c616a3148cb26ac2ed Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Sun, 29 Sep 2019 23:06:18 +0200 Subject: [PATCH 181/209] cosmits + typos --- doc/modules/linear_model.rst | 4 +-- ...plot_poisson_regression_non_normal_loss.py | 36 +++++++++---------- ...lot_tweedie_regression_insurance_claims.py | 6 ++-- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index a9b2e66599537..e53f309076b3b 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -996,8 +996,8 @@ 
of the unit variance function: weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``PoissonRegressor.fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Consider :math:`\bar{y} = \frac{\\sum_i s_i y_i}{\sum_i s_i}`, - in this case one might say that y has a 'scaled' Poisson distributions. + Considering :math:`\bar{y} = \frac{\\sum_i s_i y_i}{\sum_i s_i}`, + in this case one might say that y has a 'scaled' Poisson distribution. The same holds for other distributions. The estimator can be used as follows:: diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index da2a4bd2dccf8..d99654cf04080 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -129,8 +129,8 @@ def load_mtpl2(n_samples=None): # significantly imbalanced. # # To evaluate the pertinence of the used metrics, we will consider as a -# baseline an estimator that constantly predicts the mean frequency of the -# training sample. +# baseline a "dummy" estimator that constantly predicts the mean frequency of +# the training sample. df_train, df_test = train_test_split(df, random_state=0) @@ -143,16 +143,16 @@ def load_mtpl2(n_samples=None): def score_estimator(estimator, df_test): - """Score an estimatr on the test set""" + """Score an estimator on the test set""" y_pred = estimator.predict(df_test) - print("MSE: %.3f" % mean_squared_error( - df_test["Frequency"], y_pred, - df_test["Exposure"])) - print("MAE: %.3f" % mean_absolute_error( - df_test["Frequency"], y_pred, - df_test["Exposure"])) + print("MSE: %.3f" % + mean_squared_error(df_test["Frequency"], y_pred, + df_test["Exposure"])) + print("MAE: %.3f" % + mean_absolute_error(df_test["Frequency"], y_pred, + df_test["Exposure"])) # ignore negative predictions, as they are invalid for # the Poisson deviance @@ -160,12 +160,12 @@ def score_estimator(estimator, df_test): if (~mask).any(): warnings.warn("estimator yields negative predictions for {} samples " "out of {}. These will be ignored while computing the " - "poisson deviance".format((~mask).sum(), mask.shape[0])) + "Poisson deviance".format((~mask).sum(), mask.shape[0])) - print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test["Frequency"][mask], - y_pred[mask], - df_test["Exposure"][mask])) + print("mean Poisson deviance: %.3f" % + mean_poisson_deviance(df_test["Frequency"][mask], + y_pred[mask], + df_test["Exposure"][mask])) print("Constant mean frequency evaluation:") @@ -285,8 +285,8 @@ def score_estimator(estimator, df_test): # # To ensure that estimators yield reasonable predictions for different # policyholder types, we can bin test samples according to `y_pred` returned -# by each model. Then for each bin, compare the mean predicted `y_pred`, with -# the mean observed target: +# by each model. 
Then for each bin, we compare the mean predicted `y_pred`, +# with the mean observed target: def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, @@ -325,7 +325,7 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, for n, sl in enumerate(gen_even_slices(len(y_true), n_bins)): weights = sample_weight[idx_sort][sl] y_pred_bin[n] = np.average( - y_pred[idx_sort][sl], weights=weights + y_pred[idx_sort][sl], weights=weights ) y_true_bin[n] = np.average( y_true[idx_sort][sl], @@ -337,7 +337,7 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.5)) plt.subplots_adjust(wspace=0.3) -for axi, model in zip(ax, [ridge, poisson, rf]): +for axi, model in zip(ax, [ridge, poisson, rf]): y_pred = model.predict(df_test) q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 5a312f656d9ce..1c50541fcd85c 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -98,7 +98,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, predicted : frame a dataframe, with the same index as df, with the predicted target fill_legend : bool, default=False - wgether to show fill_between legend + whether to show fill_between legend """ # aggregate observed and predicted variables by feature level df_ = df.loc[:, [feature, weight]].copy() @@ -219,8 +219,8 @@ def score_estimator( # Score the model consisting of the product of frequency and # severity models, denormalized by the exposure values. est_freq, est_sev = estimator - y_pred = (df.Exposure.values * est_freq.predict(X) - * est_sev.predict(X)) + y_pred = (df.Exposure.values * est_freq.predict(X) * + est_sev.predict(X)) power = 1.5 else: y_pred = estimator.predict(X) From 4f28a44bff0a7d84de13f66e0d2b5264897f6c95 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 30 Sep 2019 11:20:25 +0200 Subject: [PATCH 182/209] Address some of Alex's comments - Link -> BaseLink - Removed reference to none existing notes - Use X.dtype for dtype of y - remove check_input --- sklearn/linear_model/_glm/glm.py | 51 ++++++++------------- sklearn/linear_model/_glm/link.py | 8 ++-- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- 3 files changed, 25 insertions(+), 36 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index f7985c0f3bae3..819e36e13addf 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -21,7 +21,7 @@ EDM_DISTRIBUTIONS ) from .link import ( - Link, + BaseLink, IdentityLink, LogLink, ) @@ -47,10 +47,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): ---------- alpha : float, optional (default=1) Constant that multiplies the penalty terms and thus determines the - regularization strength. - See the notes for the exact mathematical meaning of this - parameter. ``alpha = 0`` is equivalent to unpenalized GLMs. In this - case, the design matrix X must have full column rank + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix X must have full column rank (no collinearities). 
fit_intercept : boolean, optional (default=True) @@ -63,7 +61,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The distributional assumption of the GLM, i.e. which distribution from the EDM, specifies the loss function to be minimized. - link : {'auto', 'identity', 'log'} or an instance of class Link, \ + link : {'auto', 'identity', 'log'} or an instance of class BaseLink, \ optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (y_pred). Option 'auto' sets the link @@ -167,8 +165,8 @@ def fit(self, X, y, sample_weight=None): "; got (family={0})".format(self.family)) # Guarantee that self._link_instance is set to an instance of - # class Link - if isinstance(self.link, Link): + # class BaseLink + if isinstance(self.link, BaseLink): self._link_instance = self.link else: if self.link == 'auto': @@ -227,7 +225,7 @@ def fit(self, X, y, sample_weight=None): X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], dtype=[np.float64, np.float32], y_numeric=True, multi_output=False, copy=self.copy_X) - y = np.asarray(y, dtype=np.float64) + y = np.asarray(y, dtype=X.dtype) weights = _check_sample_weight(sample_weight, X) @@ -247,8 +245,7 @@ def fit(self, X, y, sample_weight=None): # deviance = sum(sample_weight * unit_deviance), # we rescale weights such that sum(weights) = 1 and this becomes # 1/2*deviance + L2 with deviance=sum(weights * unit_deviance) - weights_sum = np.sum(weights) - weights = weights / weights_sum + weights = weights / weights.sum() if self.warm_start and hasattr(self, 'coef_'): if self.fit_intercept: @@ -318,7 +315,7 @@ def _linear_predictor(self, X): """ check_is_fitted(self) X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype='numeric', ensure_2d=True, + dtype=[np.float64, np.float32], ensure_2d=True, allow_nd=False) return X @ self.coef_ + self.intercept_ @@ -413,10 +410,8 @@ class PoissonRegressor(GeneralizedLinearRegressor): ---------- alpha : float, optional (default=1) Constant that multiplies the penalty terms and thus determines the - regularization strength. - See the notes for the exact mathematical meaning of this - parameter. ``alpha = 0`` is equivalent to unpenalized GLMs. In this - case, the design matrix X must have full column rank + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix X must have full column rank (no collinearities). fit_intercept : boolean, optional (default=True) @@ -454,9 +449,8 @@ class PoissonRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in the solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, - max_iter=100, tol=1e-4, warm_start=False, - copy_X=True, check_input=True, verbose=0): + def __init__(self, alpha=1.0, fit_intercept=True, max_iter=100, tol=1e-4, + warm_start=False, copy_X=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="poisson", link='log', max_iter=max_iter, @@ -495,10 +489,8 @@ class GammaRegressor(GeneralizedLinearRegressor): ---------- alpha : float, optional (default=1) Constant that multiplies the penalty terms and thus determines the - regularization strength. - See the notes for the exact mathematical meaning of this - parameter. ``alpha = 0`` is equivalent to unpenalized GLMs. In this - case, the design matrix X must have full column rank + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. 
In this case, the design matrix X must have full column rank (no collinearities). fit_intercept : boolean, optional (default=True) @@ -536,9 +528,8 @@ class GammaRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in the solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, - max_iter=100, tol=1e-4, warm_start=False, - copy_X=True, check_input=True, verbose=0): + def __init__(self, alpha=1.0, fit_intercept=True, max_iter=100, tol=1e-4, + warm_start=False, copy_X=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="gamma", link='log', max_iter=max_iter, @@ -601,10 +592,8 @@ class TweedieRegressor(GeneralizedLinearRegressor): alpha : float, optional (default=1) Constant that multiplies the penalty terms and thus determines the - regularization strength. - See the notes for the exact mathematical meaning of this - parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this - case, the design matrix X must have full column rank + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix X must have full column rank (no collinearities). link : {'auto', 'identity', 'log'}, default='auto' @@ -652,7 +641,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): """ def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, link='auto', max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, check_input=True, verbose=0): + warm_start=False, copy_X=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family=TweedieDistribution(power=power), link=link, diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py index cfdc6f181a832..7c404798b2c18 100644 --- a/sklearn/linear_model/_glm/link.py +++ b/sklearn/linear_model/_glm/link.py @@ -11,7 +11,7 @@ from scipy.special import expit, logit -class Link(metaclass=ABCMeta): +class BaseLink(metaclass=ABCMeta): """Abstract base class for Link functions.""" @abstractmethod @@ -65,7 +65,7 @@ def inverse_derivative(self, lin_pred): pass # pragma: no cover -class IdentityLink(Link): +class IdentityLink(BaseLink): """The identity link function g(x)=x.""" def __call__(self, y_pred): @@ -81,7 +81,7 @@ def inverse_derivative(self, lin_pred): return np.ones_like(lin_pred) -class LogLink(Link): +class LogLink(BaseLink): """The log link function g(x)=log(x).""" def __call__(self, y_pred): @@ -97,7 +97,7 @@ def inverse_derivative(self, lin_pred): return np.exp(lin_pred) -class LogitLink(Link): +class LogitLink(BaseLink): """The logit link function g(x)=logit(x).""" def __call__(self, y_pred): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 898d3c4edf9c0..030cdf8a9b141 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -338,7 +338,7 @@ def test_tweedie_regression_family(regression_data): with pytest.raises(TypeError, match=msg): est.family = None - # TODO: the following should not be allowed + # XXX: following is currently allowed, but maybe it shouldn't be # est.family.power = 2 From d4dfd0b13c9bd3ce7fae0d7ed2b0ad40411f46cc Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 30 Sep 2019 12:10:01 +0200 Subject: [PATCH 183/209] Removing unnecessary comments / asarray call --- sklearn/linear_model/_glm/glm.py | 1 - sklearn/linear_model/_glm/tests/test_glm.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py 
b/sklearn/linear_model/_glm/glm.py index 819e36e13addf..65de7f9532717 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -225,7 +225,6 @@ def fit(self, X, y, sample_weight=None): X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], dtype=[np.float64, np.float32], y_numeric=True, multi_output=False, copy=self.copy_X) - y = np.asarray(y, dtype=X.dtype) weights = _check_sample_weight(sample_weight, X) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 030cdf8a9b141..ef49fabdee4b1 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -338,9 +338,6 @@ def test_tweedie_regression_family(regression_data): with pytest.raises(TypeError, match=msg): est.family = None - # XXX: following is currently allowed, but maybe it shouldn't be - # est.family.power = 2 - @pytest.mark.parametrize( 'estimator, value', From 64d6fbd0dd99d66789c3043558f80883a974e99e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 10:57:18 +0200 Subject: [PATCH 184/209] Update doc/modules/linear_model.rst Co-Authored-By: Nicolas Hug --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index e53f309076b3b..8945235279d9d 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -923,7 +923,7 @@ likelihood as \left( \log p(y|\mu,\phi) - \log p(y|y,\phi)\right). -The following table lists some specific EDM distributions—all are Tweedie +The following table lists some specific EDM distributions—all are instances of Tweedie distributions—and some of their properties. ================= =============================== ====================================== ============================================ From 82ace9f399c83d7f023f9dafcdada88540c0fd25 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 11:14:34 +0200 Subject: [PATCH 185/209] Remove unused solver parameter in tests --- sklearn/linear_model/_glm/tests/test_glm.py | 30 +++++++-------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index ef49fabdee4b1..a5df69b50c967 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -26,8 +26,6 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.model_selection import train_test_split -GLM_SOLVERS = ['lbfgs'] - @pytest.fixture(scope="module") def regression_data(): @@ -176,14 +174,13 @@ def test_glm_check_input_argument(check_input): glm.fit(X, y) -@pytest.mark.parametrize('solver', GLM_SOLVERS) -def test_glm_identity_regression(solver): +def test_glm_identity_regression(): """Test GLM regression with identity link on a simple dataset.""" coef = [1., 2.] 
X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', - fit_intercept=False, solver=solver) + fit_intercept=False) res = glm.fit(X, y) assert_allclose(res.coef_, coef, rtol=1e-6) @@ -193,15 +190,14 @@ def test_glm_identity_regression(solver): [NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)]) -@pytest.mark.parametrize('solver, tol', [('lbfgs', 1e-6)]) -def test_glm_log_regression(family, solver, tol): +def test_glm_log_regression(family): """Test GLM regression with log link on a simple dataset.""" coef = [0.2, -0.1] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) glm = GeneralizedLinearRegressor( alpha=0, family=family, link='log', fit_intercept=False, - solver=solver, tol=tol) + tol=1e-6) res = glm.fit(X, y) assert_allclose(res.coef_, coef, rtol=5e-6) @@ -239,8 +235,7 @@ def test_warm_start(fit_intercept): @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) @pytest.mark.parametrize('fit_intercept', [True, False]) -@pytest.mark.parametrize('solver', GLM_SOLVERS) -def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): +def test_normal_ridge_comparison(n_samples, n_features, fit_intercept): """Compare with Ridge regression for Normal distributions.""" alpha = 1.0 test_size = 10 @@ -264,8 +259,7 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): glm = GeneralizedLinearRegressor(alpha=1.0, family='normal', link='identity', fit_intercept=True, - solver=solver, check_input=False, - max_iter=300) + check_input=False, max_iter=300) glm.fit(X_train, y_train) assert glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, atol=5e-5) @@ -274,8 +268,7 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): assert_allclose(glm.predict(X_test), ridge.predict(X_test), rtol=5e-5) -@pytest.mark.parametrize('solver, tol', [('lbfgs', 1e-7)]) -def test_poisson_glmnet(solver, tol): +def test_poisson_glmnet(): """Compare Poisson regression with L2 regularization and LogLink to glmnet """ # library("glmnet") @@ -294,19 +287,16 @@ def test_poisson_glmnet(solver, tol): glm = GeneralizedLinearRegressor(alpha=1, fit_intercept=True, family='poisson', link='log', tol=1e-7, - solver=solver, max_iter=300, - ) + max_iter=300) glm.fit(X, y) assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5) assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5) -@pytest.mark.parametrize("solver", GLM_SOLVERS) -def test_convergence_warning(solver, regression_data): +def test_convergence_warning(regression_data): X, y = regression_data - est = GeneralizedLinearRegressor(solver=solver, - max_iter=1, tol=1e-20) + est = GeneralizedLinearRegressor(max_iter=1, tol=1e-20) with pytest.warns(ConvergenceWarning): est.fit(X, y) From 5288a0ff156c254df7e465971127685f5532fbf3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 12:13:35 +0200 Subject: [PATCH 186/209] Add test for sample_weight consistency --- sklearn/linear_model/_glm/tests/test_glm.py | 36 +++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index a5df69b50c967..2a54b759011e7 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ 
b/sklearn/linear_model/_glm/tests/test_glm.py @@ -181,8 +181,40 @@ def test_glm_identity_regression(): y = np.dot(X, coef) glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', fit_intercept=False) - res = glm.fit(X, y) - assert_allclose(res.coef_, coef, rtol=1e-6) + glm.fit(X, y) + assert_allclose(glm.coef_, coef, rtol=1e-6) + + +def test_glm_sample_weight_consistentcy(): + """Test that the impact of sample_weight is consistent""" + rng = np.random.RandomState(0) + n_samples, n_features = 10, 5 + + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples) + glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', + fit_intercept=False) + glm.fit(X, y) + coef = glm.coef_.copy() + + # sample_weight=np.ones(..) should be equivalent to sample_weight=None + sample_weight = np.ones(y.shape) + glm.fit(X, y, sample_weight=sample_weight) + assert_allclose(glm.coef_, coef, rtol=1e-6) + + # sample_weight are normalized to 1 so, scaling them has no effect + sample_weight = 2*np.ones(y.shape) + glm.fit(X, y, sample_weight=sample_weight) + assert_allclose(glm.coef_, coef, rtol=1e-6) + + # setting one element of sample_weight to 0 is equivalent to removing + # the correspoding sample + sample_weight = np.ones(y.shape) + sample_weight[-1] = 0 + glm.fit(X, y, sample_weight=sample_weight) + coef1 = glm.coef_.copy() + glm.fit(X[:-1], y[:-1]) + assert_allclose(glm.coef_, coef1, rtol=1e-6) @pytest.mark.parametrize( From 499e8d244b5f6bf3ad7856dace4338b1cf5d31e1 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 14:41:47 +0200 Subject: [PATCH 187/209] Move GLM losses under sklearn._loss.glm_distribution --- sklearn/_loss/__init__.py | 0 .../_glm/distribution.py => _loss/glm_distribution.py} | 0 sklearn/_loss/tests/__init__.py | 0 .../tests/test_glm_distribution.py} | 2 +- sklearn/linear_model/_glm/glm.py | 2 +- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- sklearn/metrics/regression.py | 2 +- 7 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 sklearn/_loss/__init__.py rename sklearn/{linear_model/_glm/distribution.py => _loss/glm_distribution.py} (100%) create mode 100644 sklearn/_loss/tests/__init__.py rename sklearn/{linear_model/_glm/tests/test_distribution.py => _loss/tests/test_glm_distribution.py} (98%) diff --git a/sklearn/_loss/__init__.py b/sklearn/_loss/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/_loss/glm_distribution.py similarity index 100% rename from sklearn/linear_model/_glm/distribution.py rename to sklearn/_loss/glm_distribution.py diff --git a/sklearn/_loss/tests/__init__.py b/sklearn/_loss/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/_loss/tests/test_glm_distribution.py similarity index 98% rename from sklearn/linear_model/_glm/tests/test_distribution.py rename to sklearn/_loss/tests/test_glm_distribution.py index 97c3a485ef4bb..cb4c5ae07e4d1 100644 --- a/sklearn/linear_model/_glm/tests/test_distribution.py +++ b/sklearn/_loss/tests/test_glm_distribution.py @@ -9,7 +9,7 @@ from scipy.optimize import check_grad import pytest -from sklearn.linear_model._glm.distribution import ( +from sklearn._loss.glm_distribution import ( TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 
65de7f9532717..360db6e4f741e 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -15,7 +15,7 @@ from ...utils import check_array, check_X_y from ...utils.optimize import _check_optimize_result from ...utils.validation import check_is_fitted, _check_sample_weight -from .distribution import ( +from ..._loss.glm_distribution import ( ExponentialDispersionModel, TweedieDistribution, EDM_DISTRIBUTIONS diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 2a54b759011e7..9e21ae7775cf4 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -17,7 +17,7 @@ IdentityLink, LogLink, ) -from sklearn.linear_model._glm.distribution import ( +from sklearn._loss.glm_distribution import ( TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index 706c484334d21..f7ef99794727b 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -25,6 +25,7 @@ import numpy as np import warnings +from .._loss.glm_distribution import TweedieDistribution from ..utils.validation import (check_array, check_consistent_length, _num_samples) from ..utils.validation import column_or_1d @@ -672,7 +673,6 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0): >>> mean_tweedie_deviance(y_true, y_pred, power=1) 1.4260... """ - from ..linear_model._glm.distribution import TweedieDistribution y_type, y_true, y_pred, _ = _check_reg_targets( y_true, y_pred, None, dtype=[np.float64, np.float32]) if y_type == 'continuous-multioutput': From f4aa839da1f0226ff8ed98adb22d6e90446d7120 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 15:21:42 +0200 Subject: [PATCH 188/209] Update sklearn/linear_model/_glm/glm.py Co-Authored-By: Nicolas Hug --- sklearn/linear_model/_glm/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 360db6e4f741e..dd6f847895434 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -571,7 +571,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): mean (:math:`\y_\textrm{pred}`): :math:`v(\y_\textrm{pred}) = \y_\textrm{pred}^{power}`. 
- For ``0 Date: Thu, 3 Oct 2019 14:55:48 +0200 Subject: [PATCH 189/209] Add missing config.add_subpackage in setup.py --- sklearn/linear_model/setup.py | 1 + sklearn/setup.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/sklearn/linear_model/setup.py b/sklearn/linear_model/setup.py index 5cf7040d4c9d4..e50a30eca73da 100644 --- a/sklearn/linear_model/setup.py +++ b/sklearn/linear_model/setup.py @@ -43,6 +43,7 @@ def configuration(parent_package='', top_path=None): # add other directories config.add_subpackage('tests') config.add_subpackage('_glm') + config.add_subpackage('_glm/tests') return config diff --git a/sklearn/setup.py b/sklearn/setup.py index 53f6d3f6eb30c..3913965a375cd 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -47,6 +47,8 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('experimental/tests') config.add_subpackage('ensemble/_hist_gradient_boosting') config.add_subpackage('ensemble/_hist_gradient_boosting/tests') + config.add_subpackage('_loss/') + config.add_subpackage('_loss/tests') # submodules which have their own setup.py config.add_subpackage('cluster') From d71fb9f8fb54608124f4947931ecf1cbaaba425f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 15:28:50 +0200 Subject: [PATCH 190/209] Address Nicolas comments in the documentation (partial) --- doc/modules/linear_model.rst | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 8945235279d9d..6667057dc5073 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -907,13 +907,13 @@ Generalized Linear Models (GLM) extend linear models in two ways combination of the input variables :math:`X` via an inverse link function :math:`h` as -.. math:: \hat{y}(w, x) = h(x^\top w) = h(w_0 + w_1 x_1 + ... + w_p x_p). +.. math:: \hat{y}(w, X) = h(x^\top w) = h(w_0 + w_1 X_1 + ... + w_p X_p). Secondly, the squared loss function is replaced by the unit deviance :math:`d` of a reproductive exponential dispersion model (EDM) [11]_. The minimization problem becomes -.. math:: \min_{w} \frac{1}{2 \sum_i s_i} \sum_i s_i \cdot d(y_i, \hat{y}(w, x_i)) + \frac{\alpha}{2} ||w||_2 +.. math:: \min_{w} \frac{1}{2 \sum_i s_i} \sum_i s_i \cdot d(y_i, \hat{y}(w, X_i)) + \frac{\alpha}{2} ||w||_2 with sample weights :math:`s`, and L2 regularization penalty :math:`\alpha`. The unit deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` @@ -939,8 +939,8 @@ Inverse Gaussian :math:`y \in (0, \infty)` :math:`\mu^3` Usage ----- -In the following use cases, a loss different from the squared loss might be -appropriate: +A GLM loss different from the classical squared loss might be appropriate in +the following cases: * If the target values :math:`y` are counts (non-negative integer valued) or frequencies (non-negative), you might use a Poisson deviance with log-link. @@ -985,13 +985,8 @@ of the unit variance function: * If you want to model a relative frequency, i.e. counts per exposure (time, volume, ...) you can do so by a Poisson distribution and passing :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values - together with :math:`s=\mathrm{exposure}` as sample weights. This is done - in both examples linked below. - * The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\frac{\phi}{s_i} v(\mu_i)`. 
- * If the target `y` is a ratio, appropriate sample weights ``s`` should be - provided. + together with :math:`s=\mathrm{exposure}` as sample weights. + As an example, consider Poisson distributed counts z (integers) and weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``PoissonRegressor.fit(X, y, sample_weight=s)``. @@ -1000,6 +995,10 @@ of the unit variance function: in this case one might say that y has a 'scaled' Poisson distribution. The same holds for other distributions. + * The fit itself does not need Y to be from an EDM, but only assumes + the first two moments to be :math:`E[Y_i]=\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\frac{\phi}{s_i} v(\mu_i)`. + The estimator can be used as follows:: >>> from sklearn.linear_model import TweedieRegressor From fa90272e4d1925b4caa967febf417c4ea9e96457 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 16:10:40 +0200 Subject: [PATCH 191/209] More cleanups in the plot_tweedie_regression_insurance_claims.py example --- ...lot_tweedie_regression_insurance_claims.py | 57 ++++++++----------- 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 1c50541fcd85c..dfd5555c83af4 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -142,7 +142,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # Note: filter out claims with zero amount, as the severity model # requires strictly positive target values. -df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 +df.loc[(df["ClaimAmount"] == 0) & (df["ClaimNb"] >= 1), "ClaimNb"] = 0 # Correct for unreasonable observations (that might be data error) # and a few exceptionally large claim amounts @@ -150,30 +150,26 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, df["Exposure"] = df["Exposure"].clip(upper=1) df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000) +log_scale_transformer = make_pipeline( + FunctionTransformer(np.log, validate=False), + StandardScaler() +) + column_trans = ColumnTransformer( [ - ("Veh_Driv_Age", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), - ( - "Veh_Brand_Gas_Region", - OneHotEncoder(), - ["VehBrand", "VehPower", "VehGas", "Region", "Area"], - ), - ("BonusMalus", "passthrough", ["BonusMalus"]), - ( - "Density_log", - make_pipeline( - FunctionTransformer(np.log, validate=False), StandardScaler() - ), - ["Density"], - ), + ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ("onehot_categorical", OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), + ("passthrough_numeric", "passthrough", ["BonusMalus"]), + ("log_scaled_numeric", log_scale_transformer, ["Density"]), ], remainder="drop", ) X = column_trans.fit_transform(df) -df["Frequency"] = df.ClaimNb / df.Exposure -df["AvgClaimAmount"] = df.ClaimAmount / np.fmax(df.ClaimNb, 1) +df["Frequency"] = df["ClaimNb"] / df["Exposure"] +df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1) print(df[df.ClaimAmount > 0].head()) @@ -268,7 +264,7 @@ def score_estimator( # the drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance # bonus/malus (``BonusMalus``). 
-fig, ax = plt.subplots(2, 2, figsize=(16, 8)) +fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(16, 8)) fig.subplots_adjust(hspace=0.3, wspace=0.2) plot_obs_pred( @@ -369,18 +365,12 @@ def score_estimator( # it is conditional on having at least one claim, and cannot be used to predict # the average claim amount per policy in general. -print( - "Mean AvgClaim Amount per policy: %.2f " - % df_train.AvgClaimAmount.mean() -) -print( - "Mean AvgClaim Amount | NbClaim > 0: %.2f" - % df_train.AvgClaimAmount[df_train.AvgClaimAmount > 0].mean() -) -print( - "Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f" - % glm_sev.predict(X_train).mean() -) +print("Mean AvgClaim Amount per policy: %.2f " + % df_train["AvgClaimAmount"].mean()) +print("Mean AvgClaim Amount | NbClaim > 0: %.2f" + % df_train["AvgClaimAmount"][df_train["AvgClaimAmount"] > 0].mean()) +print("Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f" + % glm_sev.predict(X_train).mean()) ############################################################################## @@ -388,7 +378,7 @@ def score_estimator( # We can visually compare observed and predicted values, aggregated for # the drivers age (``DrivAge``). -fig, ax = plt.subplots(1, 2, figsize=(16, 4)) +fig, ax = plt.subplots(ncols=1, nrows=2, figsize=(16, 4)) # plot DivAge plot_obs_pred( @@ -500,9 +490,9 @@ def score_estimator( res.append( { "subset": subset_label, - "observed": df.ClaimAmount.values.sum(), + "observed": df["ClaimAmount"].values.sum(), "predicted, frequency*severity model": np.sum( - df.Exposure.values*glm_freq.predict(X)*glm_sev.predict(X) + df["Exposure"].values*glm_freq.predict(X)*glm_sev.predict(X) ), "predicted, tweedie, power=%.2f" % glm_total.best_estimator_.family.power: np.sum( @@ -512,3 +502,4 @@ def score_estimator( ) print(pd.DataFrame(res).set_index("subset").T) +plt.plot() From 4d16f318ca07cf8a00200201042bba177d4d27ba Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 6 Oct 2019 16:13:21 +0200 Subject: [PATCH 192/209] Typos and text improvement in poisson example --- doc/modules/linear_model.rst | 2 +- ...plot_poisson_regression_non_normal_loss.py | 65 ++++++++++--------- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 6667057dc5073..4a5aeab305b8a 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -991,7 +991,7 @@ of the unit variance function: weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``PoissonRegressor.fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Considering :math:`\bar{y} = \frac{\\sum_i s_i y_i}{\sum_i s_i}`, + Considering :math:`\bar{y} = \frac{\sum_i s_i y_i}{\sum_i s_i}`, in this case one might say that y has a 'scaled' Poisson distribution. The same holds for other distributions. diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index d99654cf04080..3ecb02108de4f 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -6,8 +6,8 @@ This example illustrates the use of log-linear Poisson regression on the French Motor Third-Party Liability Claims dataset [1] and compares it with models learned with least squared error. 
The goal is to predict the -number of insurance claims (or frequency) following car accidents for a -policyholder given historical data over a population of policyholders. +expected number of insurance claims (or frequency) following car accidents for +a policyholder given historical data over a population of policyholders. .. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor Third-Party Liability Claims (November 8, 2018). @@ -42,8 +42,8 @@ from sklearn.metrics import mean_poisson_deviance -def load_mtpl2(n_samples=None): - """Fetcher for French Motor Third-Party Liability Claims dataset +def load_mtpl2(n_samples=100000): + """Fetch the French Motor Third-Party Liability Claims dataset. Parameters ---------- @@ -122,9 +122,13 @@ def load_mtpl2(n_samples=None): print("Average Frequency = {}" .format(np.average(df["Frequency"], weights=df["Exposure"]))) +print("Percentage of zero claims = {0:%}" + .format(df.loc[df["ClaimNb"] == 0, "Exposure"].sum() / + df["Exposure"].sum())) + ############################################################################## # -# It worth noting that 96 % of policyholders have zero claims, and if we were +# It worth noting that 92 % of policyholders have zero claims, and if we were # to convert this problem into a binary classification task, it would be # significantly imbalanced. # @@ -143,7 +147,7 @@ def load_mtpl2(n_samples=None): def score_estimator(estimator, df_test): - """Score an estimator on the test set""" + """Score an estimator on the test set.""" y_pred = estimator.predict(df_test) @@ -154,13 +158,14 @@ def score_estimator(estimator, df_test): mean_absolute_error(df_test["Frequency"], y_pred, df_test["Exposure"])) - # ignore negative predictions, as they are invalid for + # ignore non-positive predictions, as they are invalid for # the Poisson deviance mask = y_pred > 0 if (~mask).any(): - warnings.warn("estimator yields negative predictions for {} samples " - "out of {}. These will be ignored while computing the " - "Poisson deviance".format((~mask).sum(), mask.shape[0])) + warnings.warn("Estimator yields non-positive predictions for {} " + "samples out of {}. These will be ignored while " + "computing the Poisson deviance" + .format((~mask).sum(), mask.shape[0])) print("mean Poisson deviance: %.3f" % mean_poisson_deviance(df_test["Frequency"][mask], @@ -182,12 +187,12 @@ def score_estimator(estimator, df_test): ############################################################################## # -# The Poisson deviance cannot be computed on negative values predicted by the -# model. For models that do return a few negative predictions +# The Poisson deviance cannot be computed on non-positive values predicted by +# the model. For models that do return a few non-positive predictions # (e.g. :class:`linear_model.Ridge`) we ignore the corresponding samples, # meaning that the obtained Poisson deviance is approximate. An alternative -# apporach could be to use class:`compose.TransformedTargetRegressor` -# meta-estimator to map ``y_pred`` to strictly positive domain. +# approach could be to use :class:`compose.TransformedTargetRegressor` +# meta-estimator to map ``y_pred`` to a strictly positive domain. print("Ridge evaluation:") score_estimator(ridge, df_test) @@ -210,9 +215,9 @@ def score_estimator(estimator, df_test): # # Finally, we will consider a non-linear model, namely a random forest. 
Random # forests do not require the categorical data to be one-hot encoded, instead -# we encode each category label with an arbirtrary integer using +# we encode each category label with an arbitrary integer using # :class:`preprocessing.OrdinalEncoder` to make the model faster to train (the -# same information is encoded with a small number of features than with +# same information is encoded with a smaller number of features than with # one-hot encoding). rf_preprocessor = ColumnTransformer( @@ -238,12 +243,13 @@ def score_estimator(estimator, df_test): ############################################################################## # -# The random forest model also minimizes the conditional least square error. -# However because of a higher predictive power it also results in a smaller -# Poisson deviance than the Poisson regression model. +# Like the Ridge regression above, the random forest model minimizes the +# conditional squared error, too. However, because of a higher predictive +# power, it also results in a smaller Poisson deviance than the Poisson +# regression model. # # Evaluating models with a single train / test split is prone to random -# fluctuations. If computation resources allow, it should be verified that +# fluctuations. If computing resources allow, it should be verified that # cross-validated performance metrics would lead to similar conclusions. # # The qualitative difference between these models can also be visualized by @@ -274,7 +280,7 @@ def score_estimator(estimator, df_test): # # The experimental data presents a long tail distribution for ``y``. In all # models we predict the mean expected value, so we will have necessarily fewer -# extreme values. Additionally normal distribution used in ``Ridge`` and +# extreme values. Additionally, normal distribution used in ``Ridge`` and # ``RandomForestRegressor`` has a constant variance, while for the Poisson # distribution used in ``PoissonRegressor``, the variance is proportional to # the mean predicted value. @@ -291,11 +297,10 @@ def score_estimator(estimator, df_test): def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100): - """Compare predictions and observations for bins ordered by y_pred + """Compare predictions and observations for bins ordered by y_pred. We order the samples by ``y_pred`` and split it in bins. - In each bin the observed mean is compared with the predicted - mean. + In each bin the observed mean is compared with the predicted mean. Parameters ---------- @@ -306,7 +311,7 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, sample_weight : array-like of shape (n_samples,) Sample weights. n_bins: int - number of bins to use + Number of bins to use. Returns ------- @@ -370,10 +375,10 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, # values. # # However, for some business applications, we are not necessarily interested -# in the the ability of the model in predicting the expected frequency value -# but instead in predicting which policyholder groups are the riskiest and -# which are the safest. In this case the model evaluation would cast the -# problem as a ranking problem rather than a regression problem. +# in the ability of the model to predict the expected frequency value, but +# instead to predict which policyholder groups are the riskiest and which are +# the safest. In this case, the model evaluation would cast the problem as a +# ranking problem rather than a regression problem. 
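A rough way to put a single number on that ranking view, not used in the example itself, is a rank correlation between predictions and observed frequencies; it ignores the exposure weighting, so treat it only as a quick check. The names ``ridge``, ``poisson``, ``rf`` and ``df_test`` refer to the objects defined earlier in this script:

from scipy.stats import spearmanr

# Spearman correlation between predicted and observed frequencies:
# a crude, exposure-unaware proxy for ranking quality.
for label, model in [("Ridge", ridge), ("Poisson", poisson),
                     ("RandomForest", rf)]:
    rho, _ = spearmanr(model.predict(df_test), df_test["Frequency"])
    print("%s: Spearman rho = %.3f" % (label, rho))

The cumulated-claims plot described next addresses the same question while taking exposure into account.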
# # To compare the 3 models under this light on, one can plot the fraction of # the number of claims vs the fraction of exposure for test samples ordered by @@ -435,6 +440,6 @@ def _cumulated_claims(y_true, y_pred, exposure): # # This last point is expected due to the nature of the problem: the occurrence # of accidents is mostly dominated by circumstantial causes that are not -# captured in the columns of the dataset. +# captured in the columns of the dataset or that are indeed random. plt.show() From 15eb1d39c6c2bbbdceeb64996d0f297ef4f80ebf Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 6 Oct 2019 16:20:33 +0200 Subject: [PATCH 193/209] EXA sharey for histograms --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 3ecb02108de4f..769c072c624fb 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -257,7 +257,7 @@ def score_estimator(estimator, df_test): # values: -fig, axes = plt.subplots(1, 4, figsize=(16, 3)) +fig, axes = plt.subplots(1, 4, figsize=(16, 3), sharey=True) fig.subplots_adjust(bottom=0.2) n_bins = 20 df_train["Frequency"].hist(bins=np.linspace(-1, 10, n_bins), ax=axes[0]) @@ -265,6 +265,7 @@ def score_estimator(estimator, df_test): axes[0].set_title("Data") axes[0].set_yscale('log') axes[0].set_xlabel("y (observed Frequency)") +axes[0].set_ylim([1E2, 5E5]) for idx, model in enumerate([ridge, poisson, rf]): y_pred = model.predict(df_train) From 3d097c686dfb016a4561afca37f28ad0d40dc0f3 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 8 Oct 2019 15:07:56 +0200 Subject: [PATCH 194/209] Plot y_pred histograms on the test set --- ...plot_poisson_regression_non_normal_loss.py | 42 +++++++++++-------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 769c072c624fb..0e948873da570 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -256,26 +256,32 @@ def score_estimator(estimator, df_test): # comparing the histogram of observed target values with that of predicted # values: - -fig, axes = plt.subplots(1, 4, figsize=(16, 3), sharey=True) +fig, axes = plt.subplots(2, 4, figsize=(16, 6), sharey=True) fig.subplots_adjust(bottom=0.2) n_bins = 20 -df_train["Frequency"].hist(bins=np.linspace(-1, 10, n_bins), ax=axes[0]) - -axes[0].set_title("Data") -axes[0].set_yscale('log') -axes[0].set_xlabel("y (observed Frequency)") -axes[0].set_ylim([1E2, 5E5]) - -for idx, model in enumerate([ridge, poisson, rf]): - y_pred = model.predict(df_train) - - pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), ax=axes[idx+1]) - axes[idx + 1].set( - title=model[-1].__class__.__name__, - yscale='log', - xlabel="y_pred (predicted expected Frequency)" - ) +for row_idx, label, df in zip(range(2), + ["train", "test"], + [df_train, df_test]): + df["Frequency"].hist(bins=np.linspace(-1, 30, n_bins), + ax=axes[row_idx, 0]) + + axes[row_idx, 0].set_title("Data") + axes[row_idx, 0].set_yscale('log') + axes[row_idx, 0].set_xlabel("y (observed Frequency)") + axes[row_idx, 0].set_ylim([1e1, 5e5]) + axes[row_idx, 0].set_ylabel(label + " 
samples") + + for idx, model in enumerate([ridge, poisson, rf]): + y_pred = model.predict(df) + + pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), + ax=axes[row_idx, idx+1]) + axes[row_idx, idx + 1].set( + title=model[-1].__class__.__name__, + yscale='log', + xlabel="y_pred (predicted expected Frequency)" + ) +plt.tight_layout() ############################################################################## # From 31f5b3d6a7a5e01055bc8bd914b51d3d92bcafb5 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 9 Oct 2019 16:30:37 +0200 Subject: [PATCH 195/209] Compound Poisson => Compound Poisson Gamma --- ...lot_tweedie_regression_insurance_claims.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index dfd5555c83af4..3bdb7d93f0130 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -410,15 +410,15 @@ def score_estimator( # Overall, the drivers age (``DrivAge``) has a weak impact on the claim # severity, both in observed and predicted data. # -# 4. Total claim amount -- Compound Poisson distribution -# ------------------------------------------------------- +# 4. Total claim amount -- Compound Poisson Gamma distribution +# ------------------------------------------------------------ # # As mentioned in the introduction, the total claim amount can be modeled # either as the product of the frequency model by the severity model, # denormalized by exposure. In the following code sample, the -# ``score_estimator`` is extended to score such a model. The mean deviance -# is computed assuming a Tweedie distribution with ``power=1.5`` to be -# comparable with the model from the following section, +# ``score_estimator`` is extended to score such a model. The mean deviance is +# computed assuming a Tweedie distribution with ``power=1.5`` to be comparable +# with the model from the following section, scores = score_estimator( @@ -436,9 +436,9 @@ def score_estimator( ############################################################################## # # Indeed, an alternative approach for modeling the total loss is with a unique -# Compound Poisson model, also corresponding to a Tweedie model -# with a power :math:`p \in (1, 2)`. We determine the optimal hyperparameter -# ``p`` with a grid search, +# Compound Poisson Gamma model, also corresponding to a Tweedie model with a +# power :math:`p \in (1, 2)`. We determine the optimal hyperparameter ``p`` +# with a grid search: from sklearn.model_selection import GridSearchCV @@ -475,12 +475,13 @@ def score_estimator( ############################################################################## # # In this example, the mean absolute error is lower for the Compound Poisson -# model than when using separate models for frequency and severity. +# Gamma model than when using the product of the predictions of separate +# models for frequency and severity. # -# We can additionally validate these models by comparing observed and predicted -# total claim amount over the test and train subsets. We see that in our case -# the frequency-severity model underestimates the total claim amount, whereas -# the Tweedie model overestimates. +# We can additionally validate these models by comparing observed and +# predicted total claim amount over the test and train subsets. 
We see that, +# on average, the frequency-severity model underestimates the total claim +# amount, whereas the Tweedie model overestimates. res = [] for subset_label, X, df in [ From a498ff546da062fb4f6c33bef9d18e45e18249c8 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 9 Oct 2019 17:07:02 +0200 Subject: [PATCH 196/209] Compound Poisson => Compound Poisson Gamma --- sklearn/linear_model/_glm/glm.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index dd6f847895434..aae438733d424 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -28,7 +28,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - """Regression via a Generalized Linear Model (GLM) with penalties. + """Regression via a penalized Generalized Linear Model (GLM). GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at fitting and predicting the mean of the target y as y_pred=h(X*w). @@ -575,19 +575,19 @@ class TweedieRegressor(GeneralizedLinearRegressor): Special cases are: - +-------+------------------+ - | Power | Distribution | - +=======+==================+ - | 0 | Normal | - +-------+------------------+ - | 1 | Poisson | - +-------+------------------+ - | (1,2) | Compound Poisson | - +-------+------------------+ - | 2 | Gamma | - +-------+------------------+ - | 3 | Inverse Gaussian | - +-------+------------------+ + +-------+------------------------+ + | Power | Distribution | + +=======+========================+ + | 0 | Normal | + +-------+------------------------+ + | 1 | Poisson | + +-------+------------------------+ + | (1,2) | Compound Poisson Gamma | + +-------+------------------------+ + | 2 | Gamma | + +-------+------------------------+ + | 3 | Inverse Gaussian | + +-------+------------------------+ alpha : float, optional (default=1) Constant that multiplies the penalty terms and thus determines the From 3fae28a06968ab8ce0d95b5e4a53b2e7a7d66205 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 9 Oct 2019 19:23:19 +0200 Subject: [PATCH 197/209] Various improvement in Tweedie regression example --- ...lot_tweedie_regression_insurance_claims.py | 156 +++++++++++++----- 1 file changed, 118 insertions(+), 38 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 3bdb7d93f0130..4b450fe34bb1e 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -48,7 +48,7 @@ from sklearn.preprocessing import FunctionTransformer, OneHotEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer -from sklearn.metrics import mean_absolute_error, mean_squared_error +from sklearn.metrics import mean_absolute_error, mean_squared_error, auc def load_mtpl2(n_samples=100000): @@ -157,11 +157,14 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, column_trans = ColumnTransformer( [ - ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ("binned_numeric", KBinsDiscretizer(n_bins=10), + ["VehAge", "DrivAge"]), ("onehot_categorical", OneHotEncoder(), - ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), - ("passthrough_numeric", "passthrough", ["BonusMalus"]), - ("log_scaled_numeric", log_scale_transformer, ["Density"]), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), + 
("passthrough_numeric", "passthrough", + ["BonusMalus"]), + ("log_scaled_numeric", log_scale_transformer, + ["Density"]), ], remainder="drop", ) @@ -194,7 +197,8 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, def score_estimator( - estimator, X_train, X_test, df_train, df_test, target, weights + estimator, X_train, X_test, df_train, df_test, target, weights, + power=None, ): """Evaluate an estimator on train and test sets with different metrics""" res = [] @@ -217,13 +221,15 @@ def score_estimator( est_freq, est_sev = estimator y_pred = (df.Exposure.values * est_freq.predict(X) * est_sev.predict(X)) - power = 1.5 else: y_pred = estimator.predict(X) - power = getattr(getattr(estimator, "_family_instance"), - "power") + if power is None: + power = getattr(getattr(estimator, "_family_instance"), + "power") if score_label == "mean deviance": + if power is None: + continue metric = partial(mean_tweedie_deviance, power=power) if metric is None: @@ -378,7 +384,7 @@ def score_estimator( # We can visually compare observed and predicted values, aggregated for # the drivers age (``DrivAge``). -fig, ax = plt.subplots(ncols=1, nrows=2, figsize=(16, 4)) +fig, ax = plt.subplots(ncols=1, nrows=2, figsize=(16, 6)) # plot DivAge plot_obs_pred( @@ -403,7 +409,7 @@ def score_estimator( ax=ax[1], fill_legend=True ) - +plt.tight_layout() ############################################################################## # @@ -411,16 +417,16 @@ def score_estimator( # severity, both in observed and predicted data. # # 4. Total claim amount -- Compound Poisson Gamma distribution -# ------------------------------------------------------------ +# ------------------------------------------------------------ # # As mentioned in the introduction, the total claim amount can be modeled # either as the product of the frequency model by the severity model, # denormalized by exposure. In the following code sample, the # ``score_estimator`` is extended to score such a model. The mean deviance is -# computed assuming a Tweedie distribution with ``power=1.5`` to be comparable -# with the model from the following section, - +# computed assuming a Tweedie distribution with ``power=2`` to be comparable +# with the model from the following section: +eps = 1e-4 scores = score_estimator( (glm_freq, glm_sev), X_train, @@ -429,40 +435,54 @@ def score_estimator( df_test, target="ClaimAmount", weights="Exposure", + power=2-eps, ) print(scores) ############################################################################## # -# Indeed, an alternative approach for modeling the total loss is with a unique -# Compound Poisson Gamma model, also corresponding to a Tweedie model with a -# power :math:`p \in (1, 2)`. We determine the optimal hyperparameter ``p`` -# with a grid search: +# Instead of taking the product of two independently fit models for frequency +# and severity one can directly model the total loss is with a unique Compound +# Poisson Gamma generalized linear model (with a log link function). This +# model is a special case of the Tweedie model with a power parameter :math:`p +# \in (1, 2)`. +# +# We determine the optimal hyperparameter ``p`` with a grid search so as to +# minimize the deviance: from sklearn.model_selection import GridSearchCV -# exclude upper bound as power>=2 does not support y=0. 
-params = {"power": np.linspace(1 + 1e-4, 2 - 1e-4, 8)} - - -# this takes a while -glm_total = GridSearchCV( - TweedieRegressor(tol=1e-3, max_iter=500), cv=3, - param_grid=params, n_jobs=-1 +# exclude upper bound as power>=2 as p=2 would lead to an undefined unit +# deviance on data points with y=0. +params = {"power": np.linspace(1 + eps, 2 - eps, 5)} + +X_train_small, _, df_train_small, _ = train_test_split( + X_train, df_train, train_size=5000) + +# This can takes a while on the full training set, therefore we do the +# hyper-parameter search on a random subset, hoping that the best value of +# power does not depend too much on the dataset size. We use a bit +# penalization to avoid numerical issues with colinear features and speed-up +# convergence. +glm_total = TweedieRegressor(max_iter=10000, alpha=1e-2) +search = GridSearchCV( + glm_total, cv=3, + param_grid=params, n_jobs=-1, verbose=10, + refit=False, ) -glm_total.fit( - X_train, df_train["ClaimAmount"], sample_weight=df_train["Exposure"] +search.fit( + X_train_small, df_train_small["ClaimAmount"], + sample_weight=df_train_small["Exposure"] ) +print("Best hyper-parameters: %s" % search.best_params_) - -print( - "Best hyperparameters: power=%.2f\n" - % glm_total.best_estimator_.family.power -) +glm_total.set_params(**search.best_params_) +glm_total.fit(X_train, df_train["ClaimAmount"], + sample_weight=df_train["Exposure"]) scores = score_estimator( - glm_total.best_estimator_, + glm_total, X_train, X_test, df_train, @@ -496,11 +516,71 @@ def score_estimator( df["Exposure"].values*glm_freq.predict(X)*glm_sev.predict(X) ), "predicted, tweedie, power=%.2f" - % glm_total.best_estimator_.family.power: np.sum( - glm_total.best_estimator_.predict(X) - ), + % glm_total.power: np.sum(glm_total.predict(X)), } ) print(pd.DataFrame(res).set_index("subset").T) + +############################################################################## +# +# Finally, we can compare the two models using a plot of cumulated claims: for +# each model, the policyholders are ranked from riskiest to safest and the +# actual cumulated claims are plotted against the cumulated exposure. +# +# The area under the curve can be used as a model selection metric to quantify +# the ability of the model to rank policyholders. Note that this metric does +# not reflect the ability of the models to make accurate predictions in terms +# of absolute value of total claim amounts but only in terms of relative +# amounts as a ranking metric. +# +# Both models are able to rank policyholders by risky-ness significantly +# better than chance although they are also both far from perfect due to the +# natural difficulty of the prediction problem from few features. 
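A note on how this area relates to the Gini-style metrics introduced later in this series: with policyholders ordered riskiest-first, as in the ``_cumulated_claims`` helper that follows, a completely uninformative ranking traces the diagonal (area 0.5), so the area can be summarized as

.. math:: G_{\text{riskiest-first}} = 2\,\mathrm{AUC} - 1, \qquad
          G_{\text{safest-first}} = 1 - 2\,\mathrm{AUC},

where the second form is the one used by the ``lorenz_curve`` and ``gini_score`` utilities added in a later patch, which order policyholders from safest to riskiest.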
+ + +def _cumulated_claims(y_true, y_pred, exposure): + idx_sort = np.argsort(y_pred)[::-1] # from riskiest to safest + sorted_exposure = exposure[idx_sort] + sorted_frequencies = y_true[idx_sort] + cumulated_exposure = np.cumsum(sorted_exposure) + cumulated_exposure /= cumulated_exposure[-1] + cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies) + cumulated_claims /= cumulated_claims[-1] + return cumulated_exposure, cumulated_claims + + +fig, ax = plt.subplots(figsize=(8, 8)) + +y_pred_product = glm_freq.predict(X_test) * glm_sev.predict(X_test) +y_pred_total = glm_total.predict(X_test) + +for label, y_pred in [("Frequency * Severity model", y_pred_product), + ("Compound Poisson Gamma", y_pred_total)]: + cum_exposure, cum_claims = _cumulated_claims( + df_test["Frequency"].values, + y_pred, + df_test["Exposure"].values) + area = auc(cum_exposure, cum_claims) + label += " (area under curve: {:.3f})".format(area) + ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) + +# Oracle model: y_pred == y_test +cum_exposure, cum_claims = _cumulated_claims( + df_test["Frequency"].values, + df_test["Frequency"].values, + df_test["Exposure"].values) +area = auc(cum_exposure, cum_claims) +label = "Oracle (area under curve: {:.3f})".format(area) +ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label) + +# Random Baseline +ax.plot([0, 1], [0, 1], linestyle="--", color="black", + label="Random baseline") +ax.set( + title="Cumulated claim amount by model", + xlabel='Fraction of exposure (from riskiest to safest)', + ylabel='Fraction of total claim amount' +) +ax.legend(loc="lower right") plt.plot() From a47798afe6e30e75b66f274a2323838c6a1401ea Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 10 Oct 2019 11:07:09 +0200 Subject: [PATCH 198/209] Update doc/modules/linear_model.rst Co-Authored-By: Thomas J Fan --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d9e60a3517f8e..3119b9b0db94b 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -915,7 +915,7 @@ problem becomes .. math:: \min_{w} \frac{1}{2 \sum_i s_i} \sum_i s_i \cdot d(y_i, \hat{y}(w, X_i)) + \frac{\alpha}{2} ||w||_2 -with sample weights :math:`s`, and L2 regularization penalty :math:`\alpha`. +with sample weights :math:`s_i`, and L2 regularization penalty :math:`\alpha`. The unit deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` likelihood as From 83391dd56bac107b21eea4cb258f3831a56d02ff Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 10 Oct 2019 11:49:53 +0200 Subject: [PATCH 199/209] Use latest docstring conventions everywhere --- sklearn/_loss/glm_distribution.py | 36 +++++------ sklearn/linear_model/_glm/glm.py | 100 +++++++++++++++--------------- sklearn/linear_model/_glm/link.py | 8 +-- sklearn/metrics/regression.py | 6 +- 4 files changed, 74 insertions(+), 76 deletions(-) diff --git a/sklearn/_loss/glm_distribution.py b/sklearn/_loss/glm_distribution.py index 5f9e9ed06847c..4020f74427c44 100644 --- a/sklearn/_loss/glm_distribution.py +++ b/sklearn/_loss/glm_distribution.py @@ -60,7 +60,7 @@ def in_y_range(self, y): Parameters ---------- - y : array, shape (n_samples,) + y : array of shape (n_samples,) Target values. 
""" # Note that currently supported distributions have +inf upper bound @@ -92,7 +92,7 @@ def unit_variance(self, y_pred): Parameters ---------- - y_pred : array, shape (n_samples,) + y_pred : array of shape (n_samples,) Predicted mean. """ pass # pragma: no cover @@ -105,7 +105,7 @@ def unit_variance_derivative(self, y_pred): Parameters ---------- - y_pred : array, shape (n_samples,) + y_pred : array of shape (n_samples,) Target values. """ pass # pragma: no cover @@ -121,10 +121,10 @@ def unit_deviance(self, y, y_pred, check_input=False): Parameters ---------- - y : array, shape (n_samples,) + y : array of shape (n_samples,) Target values. - y_pred : array, shape (n_samples,) + y_pred : array of shape (n_samples,) Predicted mean. check_input : bool, default=False @@ -132,7 +132,7 @@ def unit_deviance(self, y, y_pred, check_input=False): they will be propagated as NaN. Returns ------- - deviance: array, shape (n_samples,) + deviance: array of shape (n_samples,) Computed deviance """ pass # pragma: no cover @@ -147,10 +147,10 @@ def unit_deviance_derivative(self, y, y_pred): Parameters ---------- - y : array, shape (n_samples,) + y : array of shape (n_samples,) Target values. - y_pred : array, shape (n_samples,) + y_pred : array of shape (n_samples,) Predicted mean. """ return -2 * (y - y_pred) / self.unit_variance(y_pred) @@ -168,13 +168,13 @@ def deviance(self, y, y_pred, weights=1): Parameters ---------- - y : array, shape (n_samples,) + y : array of shape (n_samples,) Target values. - y_pred : array, shape (n_samples,) + y_pred : array of shape (n_samples,) Predicted mean. - weights : array, shape (n_samples,) (default=1) + weights : {int, array of shape (n_samples,)}, default=1 Weights or exposure to which variance is inverse proportional. """ return np.sum(weights * self.unit_deviance(y, y_pred)) @@ -193,7 +193,7 @@ def deviance_derivative(self, y, y_pred, weights=1): y_pred : array, shape (n_samples,) Predicted mean. - weights : array, shape (n_samples,) (default=1) + weights : {int, array of shape (n_samples,)}, default=1 Weights or exposure to which variance is inverse proportional. """ return weights * self.unit_deviance_derivative(y, y_pred) @@ -231,7 +231,7 @@ class TweedieDistribution(ExponentialDispersionModel): Parameters ---------- - power : float (default=0) + power : float, default=0 The variance power of the `unit_variance` :math:`v(y_\textrm{pred}) = y_\textrm{pred}^{power}`. For ``0 0. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Sample weights. Returns @@ -743,7 +743,7 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None): y_pred : array-like of shape (n_samples,) Estimated target values. Requires y_pred > 0. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Sample weights. 
Returns From 3bfb54e9e172b6f2cae3e48a7ece4d011937e786 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 10 Oct 2019 11:53:42 +0200 Subject: [PATCH 200/209] Drop check_input parameter --- sklearn/linear_model/_glm/glm.py | 22 ++++++--------------- sklearn/linear_model/_glm/tests/test_glm.py | 12 +---------- 2 files changed, 7 insertions(+), 27 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 2012f3cbb32a9..990de8114a717 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -92,11 +92,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): copy_X : bool, default=True If ``True``, X will be copied; else, it may be overwritten. - check_input : bool, default=True - Allow to bypass several checks on input: y values in range of family, - sample_weight non-negative. - Don't use this parameter unless you know what you do. - verbose : int, default=0 For the lbfgs solver set verbose to any positive number for verbosity. @@ -115,7 +110,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, - copy_X=True, check_input=True, verbose=0): + copy_X=True, verbose=0): self.alpha = alpha self.fit_intercept = fit_intercept self.family = family @@ -125,7 +120,6 @@ def __init__(self, alpha=1.0, self.tol = tol self.warm_start = warm_start self.copy_X = copy_X - self.check_input = check_input self.verbose = verbose def fit(self, X, y, sample_weight=None): @@ -213,9 +207,6 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) - if not isinstance(self.check_input, bool): - raise ValueError("The argument check_input must be bool; got " - "(check_input={0})".format(self.check_input)) family = self._family_instance link = self._link_instance @@ -228,12 +219,11 @@ def fit(self, X, y, sample_weight=None): _, n_features = X.shape - if self.check_input: - if not np.all(family.in_y_range(y)): - raise ValueError("Some value(s) of y are out of the valid " - "range for family {0}" - .format(family.__class__.__name__)) - # TODO: if alpha=0 check that X is not rank deficient + if not np.all(family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}" + .format(family.__class__.__name__)) + # TODO: if alpha=0 check that X is not rank deficient # rescaling of sample_weight # diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 9e21ae7775cf4..c0ff6508db9c9 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -164,16 +164,6 @@ def test_glm_copy_X_argument(copy_X): glm.fit(X, y) -@pytest.mark.parametrize('check_input', ['not bool', 1, 0, [True]]) -def test_glm_check_input_argument(check_input): - """Test GLM for invalid check_input argument.""" - y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(check_input=check_input) - with pytest.raises(ValueError, match="check_input must be bool"): - glm.fit(X, y) - - def test_glm_identity_regression(): """Test GLM regression with identity link on a simple dataset.""" coef = [1., 2.] 
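To illustrate the target validation retained in ``fit`` above (the ``in_y_range`` check that now runs unconditionally), here is a small sketch of what a user sees when the target falls outside the family's support; the data is made up and the quoted message is abbreviated:

import numpy as np
from sklearn.linear_model import PoissonRegressor

X = np.array([[1.0], [2.0], [3.0]])
y = np.array([0.0, 1.0, -1.0])   # a negative count is outside the Poisson range

try:
    PoissonRegressor().fit(X, y)
except ValueError as exc:
    # e.g. "Some value(s) of y are out of the valid range for family
    # PoissonDistribution"
    print(exc)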
@@ -291,7 +281,7 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept): glm = GeneralizedLinearRegressor(alpha=1.0, family='normal', link='identity', fit_intercept=True, - check_input=False, max_iter=300) + max_iter=300) glm.fit(X_train, y_train) assert glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, atol=5e-5) From d325fe23348f8dabfcc55dd4fbd8fa82fd60ff8d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 10 Oct 2019 11:57:10 +0200 Subject: [PATCH 201/209] Use keyword only arguments SLEP009 --- sklearn/linear_model/_glm/glm.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 990de8114a717..8ef912f9596b6 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -107,7 +107,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): n_iter_ : int Actual number of iterations used in the solver. """ - def __init__(self, alpha=1.0, + def __init__(self, *, alpha=1.0, fit_intercept=True, family='normal', link='auto', solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, copy_X=True, verbose=0): @@ -435,8 +435,8 @@ class PoissonRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in the solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, verbose=0): + def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100, + tol=1e-4, warm_start=False, copy_X=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="poisson", link='log', max_iter=max_iter, @@ -514,8 +514,8 @@ class GammaRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in the solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, verbose=0): + def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100, + tol=1e-4, warm_start=False, copy_X=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="gamma", link='log', max_iter=max_iter, @@ -626,7 +626,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in the solver. 
""" - def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, + def __init__(self, *, power=0.0, alpha=1.0, fit_intercept=True, link='auto', max_iter=100, tol=1e-4, warm_start=False, copy_X=True, verbose=0): From 661cf56e04bc23e53a7173bc43d998187892d9ec Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 10 Oct 2019 12:04:37 +0200 Subject: [PATCH 202/209] Move _y_pred_deviance_derivative from losses as a private function --- sklearn/_loss/glm_distribution.py | 19 ------------------- sklearn/linear_model/_glm/glm.py | 25 +++++++++++++++++++++++-- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/sklearn/_loss/glm_distribution.py b/sklearn/_loss/glm_distribution.py index 4020f74427c44..55365d382c03b 100644 --- a/sklearn/_loss/glm_distribution.py +++ b/sklearn/_loss/glm_distribution.py @@ -13,14 +13,6 @@ from scipy.special import xlogy -def _safe_lin_pred(X, coef): - """Compute the linear predictor taking care if intercept is present.""" - if coef.size == X.shape[1] + 1: - return X @ coef[1:] + coef[0] - else: - return X @ coef - - DistributionBoundary = namedtuple("DistributionBoundary", ("value", "inclusive")) @@ -198,17 +190,6 @@ def deviance_derivative(self, y, y_pred, weights=1): """ return weights * self.unit_deviance_derivative(y, y_pred) - def _y_pred_deviance_derivative(self, coef, X, y, weights, link): - """Compute y_pred and the derivative of the deviance w.r.t coef.""" - lin_pred = _safe_lin_pred(X, coef) - y_pred = link.inverse(lin_pred) - d1 = link.inverse_derivative(lin_pred) - temp = d1 * self.deviance_derivative(y, y_pred, weights) - if coef.size == X.shape[1] + 1: - devp = np.concatenate(([temp.sum()], temp @ X)) - else: - devp = temp @ X # same as X.T @ temp - return y_pred, devp class TweedieDistribution(ExponentialDispersionModel): diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 8ef912f9596b6..b29dcd89a35a6 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -27,6 +27,27 @@ ) +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +def _y_pred_deviance_derivative(coef, X, y, weights, family, link): + """Compute y_pred and the derivative of the deviance w.r.t coef.""" + lin_pred = _safe_lin_pred(X, coef) + y_pred = link.inverse(lin_pred) + d1 = link.inverse_derivative(lin_pred) + temp = d1 * family.deviance_derivative(y, y_pred, weights) + if coef.size == X.shape[1] + 1: + devp = np.concatenate(([temp.sum()], temp @ X)) + else: + devp = temp @ X # same as X.T @ temp + return y_pred, devp + + class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a penalized Generalized Linear Model (GLM). 
@@ -251,8 +272,8 @@ def fit(self, X, y, sample_weight=None): if solver == 'lbfgs': def func(coef, X, y, weights, alpha, family, link): - y_pred, devp = family._y_pred_deviance_derivative( - coef, X, y, weights, link + y_pred, devp = _y_pred_deviance_derivative( + coef, X, y, weights, family, link ) dev = family.deviance(y, y_pred, weights) intercept = (coef.size == X.shape[1] + 1) From 560c180fee5c1217353af106beeca04a0b9bd732 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 10 Oct 2019 15:28:21 +0200 Subject: [PATCH 203/209] Fix cumulated claim amount curve in Tweedie regression example --- ...lot_tweedie_regression_insurance_claims.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 4b450fe34bb1e..7e4a8599dec0e 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -539,15 +539,15 @@ def score_estimator( # natural difficulty of the prediction problem from few features. -def _cumulated_claims(y_true, y_pred, exposure): - idx_sort = np.argsort(y_pred)[::-1] # from riskiest to safest - sorted_exposure = exposure[idx_sort] - sorted_frequencies = y_true[idx_sort] - cumulated_exposure = np.cumsum(sorted_exposure) +def _cumulated_claim_amount(y_true, y_pred, exposure): + ranking = np.argsort(y_pred)[::-1] # from riskiest to safest + ranked_exposure = exposure[ranking] + ranked_claim_amount = y_true[ranking] + cumulated_exposure = np.cumsum(ranked_exposure) cumulated_exposure /= cumulated_exposure[-1] - cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies) - cumulated_claims /= cumulated_claims[-1] - return cumulated_exposure, cumulated_claims + cumulated_claim_amount = np.cumsum(ranked_claim_amount) + cumulated_claim_amount /= cumulated_claim_amount[-1] + return cumulated_exposure, cumulated_claim_amount fig, ax = plt.subplots(figsize=(8, 8)) @@ -557,8 +557,8 @@ def _cumulated_claims(y_true, y_pred, exposure): for label, y_pred in [("Frequency * Severity model", y_pred_product), ("Compound Poisson Gamma", y_pred_total)]: - cum_exposure, cum_claims = _cumulated_claims( - df_test["Frequency"].values, + cum_exposure, cum_claims = _cumulated_claim_amount( + df_test["ClaimAmount"].values, y_pred, df_test["Exposure"].values) area = auc(cum_exposure, cum_claims) @@ -566,9 +566,9 @@ def _cumulated_claims(y_true, y_pred, exposure): ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) # Oracle model: y_pred == y_test -cum_exposure, cum_claims = _cumulated_claims( - df_test["Frequency"].values, - df_test["Frequency"].values, +cum_exposure, cum_claims = _cumulated_claim_amount( + df_test["ClaimAmount"].values, + df_test["ClaimAmount"].values, df_test["Exposure"].values) area = auc(cum_exposure, cum_claims) label = "Oracle (area under curve: {:.3f})".format(area) From 0ea2dce29a52829fb2c0ace0c992f68e1f052f0a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 10 Oct 2019 16:15:25 +0200 Subject: [PATCH 204/209] PEP8 --- sklearn/_loss/glm_distribution.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/_loss/glm_distribution.py b/sklearn/_loss/glm_distribution.py index 55365d382c03b..dbfac6af673ae 100644 --- a/sklearn/_loss/glm_distribution.py +++ b/sklearn/_loss/glm_distribution.py @@ -191,7 +191,6 @@ def deviance_derivative(self, y, y_pred, weights=1): return weights * 
self.unit_deviance_derivative(y, y_pred) - class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. From a608c703b5960994c569e0191775c3dfa0c805e0 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 10 Oct 2019 19:07:31 +0200 Subject: [PATCH 205/209] WIP implementation of Gini coeff and Lorenz curve --- sklearn/metrics/__init__.py | 4 +++ sklearn/metrics/ranking.py | 53 +++++++++++++++++++++++++++++++++++++ sklearn/metrics/scorer.py | 7 +++-- 3 files changed, 62 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index b0846f2ff6828..9f284e9df54fb 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -14,6 +14,8 @@ from .ranking import precision_recall_curve from .ranking import roc_auc_score from .ranking import roc_curve +from .ranking import gini_score +from .ranking import lorenz_curve from .classification import accuracy_score from .classification import balanced_accuracy_score @@ -106,6 +108,7 @@ 'fbeta_score', 'fowlkes_mallows_score', 'get_scorer', + 'gini_score', 'hamming_loss', 'hinge_loss', 'homogeneity_completeness_v_measure', @@ -114,6 +117,7 @@ 'jaccard_similarity_score', 'label_ranking_average_precision_score', 'label_ranking_loss', + 'lorenz_curve', 'log_loss', 'make_scorer', 'nan_euclidean_distances', diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index d1a14910897f1..01f7ee510d845 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -1390,3 +1390,56 @@ def ndcg_score(y_true, y_score, k=None, sample_weight=None, ignore_ties=False): _check_dcg_target_type(y_true) gain = _ndcg_sample_scores(y_true, y_score, k=k, ignore_ties=ignore_ties) return np.average(gain, weights=sample_weight) + + +def lorenz_curve(y_true, y_pred, sample_weight=None, + ascending_predictions=True, + normalize=True, + return_gini=False): + y_true = check_array(y_true, ensure_2d=False) + y_pred = check_array(y_pred, ensure_2d=False) + check_consistent_length(y_true, y_pred) + y_true_min = y_true.min() + if y_true_min < 0: + raise ValueError("lorenz_curve is only defined for regression problems" + " with non-negative target values. Observed minimum" + " target value is %f" % y_true_min) + if sample_weight is None: + sample_weight = np.ones(len(y_true), dtype=np.float64) + else: + sample_weight = check_array(sample_weight, ensure_2d=False) + check_consistent_length(y_true, sample_weight) + + # Rank the ranking base on y_pred + ranking = np.argsort(y_pred) + if not ascending_predictions: + ranking = ranking[::-1] + + ranked_sample_weight = sample_weight[ranking] + ranked_target = y_true[ranking] + + # Accumulate the sample weights and target values + cumulated_samples = np.cumsum(ranked_sample_weight) + cumulated_target = np.cumsum(ranked_target) + + # Normalize to report fractions instead of absolute values. 
+ # Normalization is necessary to compute the Gini index from + # the area under the Lorenz curve + if normalize: + cumulated_samples /= cumulated_samples[-1] + cumulated_target /= cumulated_target[-1] + + if return_gini: + if not normalize or not ascending_predictions: + raise ValueError("Gini coefficient requires normalize=True" + " and ascending_predictions=True") + gini = 1 - 2 * auc(cumulated_samples, cumulated_target) + return cumulated_samples, cumulated_target, gini + return cumulated_samples, cumulated_target + + +def gini_score(y_true, y_pred, sample_weight=None): + cumulated_weights, cumulated_values = lorenz_curve( + y_true, y_pred, sample_weight=sample_weight, + ascending_predictions=True, normalize=True) + return 1 - 2 * auc(cumulated_weights, cumulated_values) diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 25b826ff91f75..06942f71333d6 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -31,7 +31,7 @@ f1_score, roc_auc_score, average_precision_score, precision_score, recall_score, log_loss, balanced_accuracy_score, explained_variance_score, - brier_score_loss, jaccard_score) + brier_score_loss, jaccard_score, gini_score) from .cluster import adjusted_rand_score from .cluster import homogeneity_score @@ -634,6 +634,8 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, mean_gamma_deviance, greater_is_better=False ) +gini_scorer = make_scorer(gini_score) + # Standard Classification Scores accuracy_scorer = make_scorer(accuracy_score) balanced_accuracy_scorer = make_scorer(balanced_accuracy_score) @@ -707,7 +709,8 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, mutual_info_score=mutual_info_scorer, adjusted_mutual_info_score=adjusted_mutual_info_scorer, normalized_mutual_info_score=normalized_mutual_info_scorer, - fowlkes_mallows_score=fowlkes_mallows_scorer) + fowlkes_mallows_score=fowlkes_mallows_scorer, + gini_score=gini_scorer) for name, metric in [('precision', precision_score), From 853f8b785bf24eff8e39b653eee49a73042a8e86 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 10 Oct 2019 19:08:01 +0200 Subject: [PATCH 206/209] Use Lorenz curve in Tweedie example --- ...lot_tweedie_regression_insurance_claims.py | 79 +++++++++---------- 1 file changed, 37 insertions(+), 42 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 7e4a8599dec0e..fb44484c2d0bf 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -48,15 +48,16 @@ from sklearn.preprocessing import FunctionTransformer, OneHotEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer -from sklearn.metrics import mean_absolute_error, mean_squared_error, auc +from sklearn.metrics import mean_absolute_error, mean_squared_error +from sklearn.metrics import lorenz_curve -def load_mtpl2(n_samples=100000): +def load_mtpl2(n_samples=None): """Fetch the French Motor Third-Party Liability Claims dataset. Parameters ---------- - n_samples: int, default=100000 + n_samples: int, default=None number of samples to select (for faster run time). Full dataset has 678013 samples. """ @@ -138,7 +139,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # containing the claim amount (``ClaimAmount``) for the same policy ids # (``IDpol``). 
-df = load_mtpl2(n_samples=60000) +df = load_mtpl2() # Note: filter out claims with zero amount, as the severity model # requires strictly positive target values. @@ -188,7 +189,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, # and use ``Exposure`` as `sample_weight`. -df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) +df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=40) # Some of the features are colinear, we use a weak penalization to avoid # numerical issues. @@ -449,7 +450,7 @@ def score_estimator( # \in (1, 2)`. # # We determine the optimal hyperparameter ``p`` with a grid search so as to -# minimize the deviance: +# maximize the Gini coefficient (a risk ranking metric): from sklearn.model_selection import GridSearchCV @@ -458,7 +459,7 @@ def score_estimator( params = {"power": np.linspace(1 + eps, 2 - eps, 5)} X_train_small, _, df_train_small, _ = train_test_split( - X_train, df_train, train_size=5000) + X_train, df_train, train_size=5000, random_state=0) # This can takes a while on the full training set, therefore we do the # hyper-parameter search on a random subset, hoping that the best value of @@ -467,15 +468,17 @@ def score_estimator( # convergence. glm_total = TweedieRegressor(max_iter=10000, alpha=1e-2) search = GridSearchCV( - glm_total, cv=3, - param_grid=params, n_jobs=-1, verbose=10, - refit=False, + glm_total, param_grid=params, cv=3, scoring="gini_score", + n_jobs=-1, verbose=1, refit=False ) search.fit( X_train_small, df_train_small["ClaimAmount"], sample_weight=df_train_small["Exposure"] ) print("Best hyper-parameters: %s" % search.best_params_) +cv_results = pd.DataFrame(search.cv_results_).sort_values( + "mean_test_score", ascending=False) +print(cv_results[["param_power", "mean_test_score", "std_test_score"]]) glm_total.set_params(**search.best_params_) glm_total.fit(X_train, df_train["ClaimAmount"], @@ -524,32 +527,26 @@ def score_estimator( ############################################################################## # -# Finally, we can compare the two models using a plot of cumulated claims: for -# each model, the policyholders are ranked from riskiest to safest and the -# actual cumulated claims are plotted against the cumulated exposure. +# Finally, we can compare the two models using a plot of Lorenz curve of +# cumulated claims: for each model, the policyholders are ranked from safest +# to riskiest and the actual cumulated claims are plotted against the +# cumulated exposure. # -# The area under the curve can be used as a model selection metric to quantify -# the ability of the model to rank policyholders. Note that this metric does -# not reflect the ability of the models to make accurate predictions in terms -# of absolute value of total claim amounts but only in terms of relative -# amounts as a ranking metric. +# The Gini coefficient can be computed from the areas under curve to compare +# the model to the random baseline. This coefficient can be used as a model +# selection metric to quantify the ability of the model to rank policyholders. +# A Gini coefficient close to 0 means random ranking, while larger Gini +# coefficient of 1 mean more discriminative rankings. +# +# Note that this metric does not reflect the ability of the models to make +# accurate predictions in terms of absolute value of total claim amounts but +# only in terms of relative amounts as a ranking metric. 
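For readers who want the ranking summary as a number without the plot, here is a numpy-only sketch of the quantity that the new ``lorenz_curve`` and ``gini_score`` helpers compute, with ``sklearn.metrics.auc`` replaced by the trapezoidal rule; it is illustrative, not the actual implementation:

import numpy as np

def gini_coefficient(y_true, y_pred, exposure):
    # Order policies from safest to riskiest according to the model,
    # then accumulate exposure and observed claim amounts.
    order = np.argsort(y_pred)
    cum_exposure = np.cumsum(exposure[order])
    cum_claims = np.cumsum(y_true[order])
    cum_exposure = cum_exposure / cum_exposure[-1]
    cum_claims = cum_claims / cum_claims[-1]
    # Area under the Lorenz curve; 0 means a random ranking,
    # larger values mean a more discriminative ranking.
    return 1 - 2 * np.trapz(cum_claims, cum_exposure)

# e.g. gini_coefficient(df_test["ClaimAmount"].values, y_pred_total,
#                       df_test["Exposure"].values)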
# # Both models are able to rank policyholders by risky-ness significantly # better than chance although they are also both far from perfect due to the # natural difficulty of the prediction problem from few features. -def _cumulated_claim_amount(y_true, y_pred, exposure): - ranking = np.argsort(y_pred)[::-1] # from riskiest to safest - ranked_exposure = exposure[ranking] - ranked_claim_amount = y_true[ranking] - cumulated_exposure = np.cumsum(ranked_exposure) - cumulated_exposure /= cumulated_exposure[-1] - cumulated_claim_amount = np.cumsum(ranked_claim_amount) - cumulated_claim_amount /= cumulated_claim_amount[-1] - return cumulated_exposure, cumulated_claim_amount - - fig, ax = plt.subplots(figsize=(8, 8)) y_pred_product = glm_freq.predict(X_test) * glm_sev.predict(X_test) @@ -557,21 +554,19 @@ def _cumulated_claim_amount(y_true, y_pred, exposure): for label, y_pred in [("Frequency * Severity model", y_pred_product), ("Compound Poisson Gamma", y_pred_total)]: - cum_exposure, cum_claims = _cumulated_claim_amount( - df_test["ClaimAmount"].values, - y_pred, - df_test["Exposure"].values) - area = auc(cum_exposure, cum_claims) - label += " (area under curve: {:.3f})".format(area) + cum_exposure, cum_claims, gini = lorenz_curve( + df_test["ClaimAmount"], y_pred, + sample_weight=df_test["Exposure"], + return_gini=True) + label += " (Gini coefficient: {:.3f})".format(gini) ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) # Oracle model: y_pred == y_test -cum_exposure, cum_claims = _cumulated_claim_amount( - df_test["ClaimAmount"].values, - df_test["ClaimAmount"].values, - df_test["Exposure"].values) -area = auc(cum_exposure, cum_claims) -label = "Oracle (area under curve: {:.3f})".format(area) +cum_exposure, cum_claims, gini = lorenz_curve( + df_test["ClaimAmount"], df_test["ClaimAmount"], + sample_weight=df_test["Exposure"], + return_gini=True) +label = "Oracle (Gini coefficient: {:.3f})".format(gini) ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label) # Random Baseline @@ -582,5 +577,5 @@ def _cumulated_claim_amount(y_true, y_pred, exposure): xlabel='Fraction of exposure (from riskiest to safest)', ylabel='Fraction of total claim amount' ) -ax.legend(loc="lower right") +ax.legend(loc="upper left") plt.plot() From b3b55e8cf36124c15f264b60a97c940db6e138a7 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 10 Oct 2019 19:15:31 +0200 Subject: [PATCH 207/209] PEP8 --- sklearn/metrics/ranking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 01f7ee510d845..11879dfd13033 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -993,7 +993,7 @@ def label_ranking_loss(y_true, y_score, sample_weight=None): unique_inverse[y_true.indices[start:stop]], minlength=len(unique_scores)) all_at_reversed_rank = np.bincount(unique_inverse, - minlength=len(unique_scores)) + minlength=len(unique_scores)) false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank # if the scores are ordered, it's possible to count the number of From 640f0179a27af62c4510287fc80a7a63e2db7dc9 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 11 Oct 2019 09:07:52 +0200 Subject: [PATCH 208/209] Make sure labels/weights are floats before normalizing --- sklearn/metrics/ranking.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 11879dfd13033..2fb8e5c429df5 100644 --- 
a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -1396,8 +1396,10 @@ def lorenz_curve(y_true, y_pred, sample_weight=None, ascending_predictions=True, normalize=True, return_gini=False): - y_true = check_array(y_true, ensure_2d=False) - y_pred = check_array(y_pred, ensure_2d=False) + y_true = check_array(y_true, ensure_2d=False, + dtype=[np.float64, np.float32]) + y_pred = check_array(y_pred, ensure_2d=False, + dtype=[np.float64, np.float32]) check_consistent_length(y_true, y_pred) y_true_min = y_true.min() if y_true_min < 0: @@ -1407,7 +1409,8 @@ def lorenz_curve(y_true, y_pred, sample_weight=None, if sample_weight is None: sample_weight = np.ones(len(y_true), dtype=np.float64) else: - sample_weight = check_array(sample_weight, ensure_2d=False) + sample_weight = check_array(sample_weight, ensure_2d=False, + dtype=[np.float64, np.float32]) check_consistent_length(y_true, sample_weight) # Rank the ranking base on y_pred From 6dd197ab0774b66921d0e93112664d4755478303 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 11 Oct 2019 09:08:20 +0200 Subject: [PATCH 209/209] Update scorer test framework --- sklearn/metrics/tests/test_score_objects.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index cfabed6d2c4ac..8aaa3e0658fdf 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -47,7 +47,8 @@ 'mean_absolute_error', 'mean_squared_error', 'median_absolute_error', 'max_error', 'neg_mean_poisson_deviance', - 'neg_mean_gamma_deviance'] + 'neg_mean_gamma_deviance', + 'gini_score'] CLF_SCORERS = ['accuracy', 'balanced_accuracy', 'f1', 'f1_weighted', 'f1_macro', 'f1_micro', @@ -73,7 +74,8 @@ 'jaccard_samples'] REQUIRE_POSITIVE_Y_SCORERS = ['neg_mean_poisson_deviance', - 'neg_mean_gamma_deviance'] + 'neg_mean_gamma_deviance', + 'gini_score'] def _require_positive_y(y):
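Since the last patch above registers ``gini_score`` in the ``SCORERS`` dictionary and in the scorer test lists, the work-in-progress metric is also reachable through the usual scoring-string mechanism. A brief usage sketch on synthetic data, valid only against the code in this series (the scorer is explicitly marked WIP and its API may still change):

import numpy as np
from sklearn.linear_model import PoissonRegressor
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 3))
y = rng.poisson(lam=np.exp(X @ np.array([0.2, -0.1, 0.3])))

# "gini_score" resolves to the gini_scorer registered in sklearn.metrics.scorer
scores = cross_val_score(PoissonRegressor(), X, y, cv=3, scoring="gini_score")
print(scores.mean())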