Skip to content

FIX avoid parameter validation in init in TweedieRegressor #22124

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats_new/v1.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,10 @@ Changelog
for the highs based solvers.
:pr:`21086` by :user:`Venkatachalam Natchiappan <venkyyuvy>`.

- |Fix| The input paramter `family` of :class:`linear_model.TweedieRegressor` is not
validated in `__init__` anymore. The input parameter `family` is now a string.
:pr:`22124` by :user:`Maren Westermann <marenwestermann>`.

:mod:`sklearn.metrics`
......................

Expand Down
91 changes: 49 additions & 42 deletions sklearn/linear_model/_glm/glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,28 +168,18 @@ def fit(self, X, y, sample_weight=None):
self : object
Fitted model.
"""
if isinstance(self.family, ExponentialDispersionModel):
self._family_instance = self.family
elif self.family in EDM_DISTRIBUTIONS:
self._family_instance = EDM_DISTRIBUTIONS[self.family]()
else:
raise ValueError(
"The family must be an instance of class"
" ExponentialDispersionModel or an element of"
" ['normal', 'poisson', 'gamma', 'inverse-gaussian']"
"; got (family={0})".format(self.family)
)

family_instance = self._get_family_instance()
# Guarantee that self._link_instance is set to an instance of
# class BaseLink
if isinstance(self.link, BaseLink):
self._link_instance = self.link
else:
if self.link == "auto":
if isinstance(self._family_instance, TweedieDistribution):
if self._family_instance.power <= 0:
if isinstance(family_instance, TweedieDistribution):
if family_instance.power <= 0:
self._link_instance = IdentityLink()
if self._family_instance.power >= 1:
if family_instance.power >= 1:
self._link_instance = LogLink()
else:
raise ValueError(
Expand Down Expand Up @@ -243,7 +233,6 @@ def fit(self, X, y, sample_weight=None):
"The argument warm_start must be bool; got {0}".format(self.warm_start)
)

family = self._family_instance
link = self._link_instance

X, y = self._validate_data(
Expand All @@ -259,10 +248,10 @@ def fit(self, X, y, sample_weight=None):

_, n_features = X.shape

if not np.all(family.in_y_range(y)):
if not np.all(family_instance.in_y_range(y)):
raise ValueError(
"Some value(s) of y are out of the valid range for family {0}".format(
family.__class__.__name__
family_instance.__class__.__name__
)
)
# TODO: if alpha=0 check that X is not rank deficient
Expand Down Expand Up @@ -305,7 +294,7 @@ def func(coef, X, y, weights, alpha, family, link):
objp[offset:] += coef_scaled
return obj, objp

args = (X, y, weights, self.alpha, family, link)
args = (X, y, weights, self.alpha, family_instance, link)

opt_res = scipy.optimize.minimize(
func,
Expand Down Expand Up @@ -411,22 +400,37 @@ def score(self, X, y, sample_weight=None):
# input validation and so on)
weights = _check_sample_weight(sample_weight, X)
y_pred = self.predict(X)
dev = self._family_instance.deviance(y, y_pred, weights=weights)
family_instance = self._get_family_instance()
dev = family_instance.deviance(y, y_pred, weights=weights)
y_mean = np.average(y, weights=weights)
dev_null = self._family_instance.deviance(y, y_mean, weights=weights)
dev_null = family_instance.deviance(y, y_mean, weights=weights)
return 1 - dev / dev_null

def _more_tags(self):
# create the _family_instance if fit wasn't called yet.
if hasattr(self, "_family_instance"):
_family_instance = self._family_instance
elif isinstance(self.family, ExponentialDispersionModel):
_family_instance = self.family
def _get_family_instance(self):
"""Return the distribution of a family.

This method returns an instance of ``ExponentialDispersionModel`` based the
value stored in the ``family`` attribute.

Returns
-------
family_instance : ExponentialDispersionModel
Returns an instance of ExponentialDispersionModel.
"""
if isinstance(self.family, ExponentialDispersionModel):
return self.family
elif self.family in EDM_DISTRIBUTIONS:
_family_instance = EDM_DISTRIBUTIONS[self.family]()
return EDM_DISTRIBUTIONS[self.family]()
else:
raise ValueError
return {"requires_positive_y": not _family_instance.in_y_range(-1.0)}
raise ValueError(
"The family must be an instance of class"
" ExponentialDispersionModel or an element of"
" ['normal', 'poisson', 'gamma', 'inverse-gaussian']"
"; got (family={0})".format(self.family)
)

def _more_tags(self):
return {"requires_positive_y": not self._get_family_instance().in_y_range(-1.0)}


class PoissonRegressor(GeneralizedLinearRegressor):
Expand Down Expand Up @@ -787,11 +791,12 @@ def __init__(
warm_start=False,
verbose=0,
):
self.power = power

super().__init__(
alpha=alpha,
fit_intercept=fit_intercept,
family=TweedieDistribution(power=power),
family="tweedie",
link=link,
max_iter=max_iter,
tol=tol,
Expand All @@ -802,18 +807,20 @@ def __init__(
@property
def family(self):
"""Return the family of the regressor."""
# We use a property with a setter to make sure that the family is
# always a Tweedie distribution, and that self.power and
# self.family.power are identical by construction.
dist = TweedieDistribution(power=self.power)
# TODO: make the returned object immutable
return dist
# Make this attribute read-only to avoid mis-uses e.g. in GridSearch.
return "tweedie"

@family.setter
def family(self, value):
if isinstance(value, TweedieDistribution):
self.power = value.power
else:
raise TypeError(
"TweedieRegressor.family must be of type TweedieDistribution!"
)
if value != "tweedie":
raise ValueError("TweedieRegressor.family must be 'tweedie'!")

def _get_family_instance(self):
"""Return an instance of ``TweedieDistribution``.

Returns
-------
family_instance : TweedieDistribution
Returns an instance of TweedieDistribution.
"""
return TweedieDistribution(power=self.power)
Copy link
Member

@ogrisel ogrisel Jan 5, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I would be in favor of removing the TweedieRegressor.family property getter and setter and instead validate the consistency of self.family in _get_family_instance which should be only called at fit time instead of __init__ or set_params time:

Suggested change
return TweedieDistribution(power=self.power)
if isinstance(self.family, TweedieDistribution):
# ignore self.power if the user passed a distribution instance directly
# for backward compat
return self.family
elif self.family == "tweedie":
return TweedieDistribution(power=self.power)
else:
raise ValueError(
"TweedieRegressor.family must always be 'tweedie', "
f"got: {self.family!r}"
)

disclaimer: untested, there might be typos.

19 changes: 9 additions & 10 deletions sklearn/linear_model/_glm/tests/test_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def test_glm_family_argument(name, instance):
y = np.array([0.1, 0.5]) # in range of all distributions
X = np.array([[1], [2]])
glm = GeneralizedLinearRegressor(family=name, alpha=0).fit(X, y)
assert isinstance(glm._family_instance, instance.__class__)
assert isinstance(glm._get_family_instance(), instance.__class__)

glm = GeneralizedLinearRegressor(family="not a family")
with pytest.raises(ValueError, match="family must be"):
Expand Down Expand Up @@ -429,23 +429,22 @@ def test_gamma_regression_family(regression_data):


def test_tweedie_regression_family(regression_data):
# Make sure the family attribute is always a TweedieDistribution and that
# Make sure the family is a TweedieDistribution and that
# the power attribute is properly updated
power = 2.0
est = TweedieRegressor(power=power)
assert isinstance(est.family, TweedieDistribution)
assert est.family.power == power
assert isinstance(est._get_family_instance(), TweedieDistribution)
assert est._get_family_instance().power == power
assert est.power == power

new_power = 0
new_family = TweedieDistribution(power=new_power)
est.family = new_family
assert isinstance(est.family, TweedieDistribution)
assert est.family.power == new_power
est.power = new_power
assert isinstance(est._get_family_instance(), TweedieDistribution)
assert est._get_family_instance().power == new_power
assert est.power == new_power

msg = "TweedieRegressor.family must be of type TweedieDistribution!"
with pytest.raises(TypeError, match=msg):
msg = "TweedieRegressor.family must be 'tweedie'!"
with pytest.raises(ValueError, match=msg):
est.family = None


Expand Down
1 change: 0 additions & 1 deletion sklearn/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,6 @@ def test_transformers_get_feature_names_out(transformer):
"FeatureUnion",
"SGDOneClassSVM",
"TheilSenRegressor",
"TweedieRegressor",
]
VALIDATE_ESTIMATOR_INIT = set(VALIDATE_ESTIMATOR_INIT)

Expand Down