API fix params validation in SGD inherited models #20683

Merged · 16 commits · Aug 16, 2021
12 changes: 12 additions & 0 deletions doc/whats_new/v1.0.rst
@@ -527,6 +527,18 @@ Changelog
coordinate descent solver. Otherwise, an error will be raised.
:pr:`19391` by :user:`Shao Yang Hong <hongshaoyang>`.

- |API| Keyword validation has moved from `__init__` and `set_params` to `fit`
for the following estimators conforming to scikit-learn's conventions:
:class:`linear_model.SGDClassifier`,
:class:`linear_model.SparseSGDClassifier`,
:class:`linear_model.SGDRegressor`,
:class:`linear_model.SparseSGDRegressor`,
:class:`linear_model.SGDOneClassSVM`,
:class:`linear_model.SparseSGDOneClassSVM`,
:class:`linear_model.PassiveAggressiveClassifier`,
:class:`linear_model.PassiveAggressiveRegressor`.
:pr:`20683` by `Guillaume Lemaitre`_.

:mod:`sklearn.manifold`
.......................

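As a side note on the changelog entry above, here is a minimal sketch of the new behavior (the toy `X`/`y` are illustrative assumptions, not from the PR): an invalid hyperparameter no longer raises in `__init__` or `set_params`; the `ValueError` surfaces when `fit` is called.

import numpy as np
from sklearn.linear_model import SGDClassifier

X = np.array([[0.0, 0.0], [1.0, 1.0]])
y = np.array([0, 1])

clf = SGDClassifier()
clf.set_params(alpha=-0.1)  # no longer raises here
try:
    clf.fit(X, y)           # validation now happens here
except ValueError as exc:
    print(exc)              # "alpha must be >= 0"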
20 changes: 0 additions & 20 deletions sklearn/linear_model/_stochastic_gradient.py
@@ -121,26 +121,6 @@ def __init__(
self.average = average
self.max_iter = max_iter
self.tol = tol
# current tests expect init to do parameter validation
# but we are not allowed to set attributes
self._validate_params()

def set_params(self, **kwargs):
"""Set and validate the parameters of estimator.

Parameters
----------
**kwargs : dict
Estimator parameters.

Returns
-------
self : object
Estimator instance.
"""
super().set_params(**kwargs)
self._validate_params()
return self

@abstractmethod
def fit(self, X, y):
48 changes: 41 additions & 7 deletions sklearn/linear_model/tests/test_passive_aggressive.py
@@ -3,6 +3,7 @@

import pytest

from sklearn.base import is_classifier
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_almost_equal
@@ -136,11 +137,13 @@ def test_classifier_correctness(loss):
assert_array_almost_equal(clf1.w, clf2.coef_.ravel(), decimal=2)


def test_classifier_undefined_methods():
@pytest.mark.parametrize(
"response_method", ["predict_proba", "predict_log_proba", "transform"]
)
def test_classifier_undefined_methods(response_method):
clf = PassiveAggressiveClassifier(max_iter=100)
for meth in ("predict_proba", "predict_log_proba", "transform"):
with pytest.raises(AttributeError):
getattr(clf, meth)
with pytest.raises(AttributeError):
getattr(clf, response_method)


def test_class_weights():
@@ -279,6 +282,37 @@ def test_regressor_correctness(loss):

def test_regressor_undefined_methods():
reg = PassiveAggressiveRegressor(max_iter=100)
for meth in ("transform",):
with pytest.raises(AttributeError):
getattr(reg, meth)
with pytest.raises(AttributeError):
reg.transform(X)


@pytest.mark.parametrize(
"klass", [PassiveAggressiveClassifier, PassiveAggressiveRegressor]
)
@pytest.mark.parametrize("fit_method", ["fit", "partial_fit"])
@pytest.mark.parametrize(
"params, err_msg",
[
({"loss": "foobar"}, "The loss foobar is not supported"),
({"max_iter": -1}, "max_iter must be > zero"),
({"shuffle": "false"}, "shuffle must be either True or False"),
({"early_stopping": "false"}, "early_stopping must be either True or False"),
(
{"validation_fraction": -0.1},
r"validation_fraction must be in range \(0, 1\)",
),
({"n_iter_no_change": 0}, "n_iter_no_change must be >= 1"),
],
)
def test_passive_aggressive_estimator_params_validation(
klass, fit_method, params, err_msg
):
"""Validate parameters in the different PassiveAggressive estimators."""
sgd_estimator = klass(**params)

with pytest.raises(ValueError, match=err_msg):
if is_classifier(sgd_estimator) and fit_method == "partial_fit":
fit_params = {"classes": np.unique(y)}
else:
fit_params = {}
getattr(sgd_estimator, fit_method)(X, y, **fit_params)
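A note on the `is_classifier` branch in the test above: the first `partial_fit` call on a classifier must be given the full label set via `classes`, since later mini-batches may not contain every class. A minimal sketch (the toy data is an assumption, not from the PR):

import numpy as np
from sklearn.linear_model import PassiveAggressiveClassifier

X = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
y = np.array([0, 1, 1])

clf = PassiveAggressiveClassifier(max_iter=100)
clf.partial_fit(X, y, classes=np.unique(y))  # `classes` is required on the first call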
225 changes: 84 additions & 141 deletions sklearn/linear_model/tests/test_sgd.py
@@ -1,11 +1,11 @@
import pickle
import pytest

import joblib
import pytest
import numpy as np
from numpy.testing import assert_allclose
import scipy.sparse as sp
import joblib

from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_almost_equal
@@ -216,30 +216,55 @@ def asgd(klass, X, y, eta, alpha, weight_init=None, intercept_init=0.0):


@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]
)
def test_sgd_bad_alpha(klass):
# Check whether expected ValueError on bad alpha
with pytest.raises(ValueError):
klass(alpha=-0.1)


@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]
"klass",
[
SGDClassifier,
SparseSGDClassifier,
SGDRegressor,
SparseSGDRegressor,
SGDOneClassSVM,
SparseSGDOneClassSVM,
],
)
def test_sgd_bad_penalty(klass):
# Check whether expected ValueError on bad penalty
with pytest.raises(ValueError):
klass(penalty="foobar", l1_ratio=0.85)


@pytest.mark.parametrize("fit_method", ["fit", "partial_fit"])
@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]
"params, err_msg",
[
({"alpha": -0.1}, "alpha must be >= 0"),
({"penalty": "foobar", "l1_ratio": 0.85}, "Penalty foobar is not supported"),
({"loss": "foobar"}, "The loss foobar is not supported"),
({"l1_ratio": 1.1}, r"l1_ratio must be in \[0, 1\]"),
({"learning_rate": "<unknown>"}, "learning rate <unknown> is not supported"),
({"nu": -0.5}, r"nu must be in \(0, 1]"),
({"nu": 2}, r"nu must be in \(0, 1]"),
({"alpha": 0, "learning_rate": "optimal"}, "alpha must be > 0"),
({"eta0": 0, "learning_rate": "constant"}, "eta0 must be > 0"),
({"max_iter": -1}, "max_iter must be > zero"),
({"shuffle": "false"}, "shuffle must be either True or False"),
({"early_stopping": "false"}, "early_stopping must be either True or False"),
(
{"validation_fraction": -0.1},
r"validation_fraction must be in range \(0, 1\)",
),
({"n_iter_no_change": 0}, "n_iter_no_change must be >= 1"),
],
)
def test_sgd_bad_loss(klass):
# Check whether expected ValueError on bad loss
with pytest.raises(ValueError):
klass(loss="foobar")
def test_sgd_estimator_params_validation(klass, fit_method, params, err_msg):
"""Validate parameters in the different SGD estimators."""
try:
sgd_estimator = klass(**params)
except TypeError as err:
if "__init__() got an unexpected keyword argument" in str(err):
# skip test if the parameter is not supported by the estimator
return
raise err

with pytest.raises(ValueError, match=err_msg):
if is_classifier(sgd_estimator) and fit_method == "partial_fit":
fit_params = {"classes": np.unique(Y)}
else:
fit_params = {}
getattr(sgd_estimator, fit_method)(X, Y, **fit_params)


def _test_warm_start(klass, X, Y, lr):
@@ -408,16 +433,6 @@ def test_late_onset_averaging_reached(klass):
assert_almost_equal(clf1.intercept_, average_intercept, decimal=16)


@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]
)
def test_sgd_bad_alpha_for_optimal_learning_rate(klass):
# Check whether expected ValueError on bad alpha, i.e. 0
# since alpha is used to compute the optimal learning rate
with pytest.raises(ValueError):
klass(alpha=0, learning_rate="optimal")


@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]
)
@@ -540,115 +555,56 @@ def test_sgd_clf(klass):
assert_array_equal(clf.predict(T), true_result)


@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_sgd_bad_l1_ratio(klass):
# Check whether expected ValueError on bad l1_ratio
with pytest.raises(ValueError):
klass(l1_ratio=1.1)


@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM]
)
def test_sgd_bad_learning_rate_schedule(klass):
# Check whether expected ValueError on bad learning_rate
with pytest.raises(ValueError):
klass(learning_rate="<unknown>")


@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM]
)
def test_sgd_bad_eta0(klass):
# Check whether expected ValueError on bad eta0
with pytest.raises(ValueError):
klass(eta0=0, learning_rate="constant")


@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM]
)
def test_sgd_max_iter_param(klass):
# Test parameter validity check
with pytest.raises(ValueError):
klass(max_iter=-10000)


@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM]
)
def test_sgd_shuffle_param(klass):
# Test parameter validity check
with pytest.raises(ValueError):
klass(shuffle="false")


@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_sgd_early_stopping_param(klass):
# Test parameter validity check
with pytest.raises(ValueError):
klass(early_stopping="false")


@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_sgd_validation_fraction(klass):
# Test parameter validity check
with pytest.raises(ValueError):
klass(validation_fraction=-0.1)


@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_sgd_n_iter_no_change(klass):
# Test parameter validity check
with pytest.raises(ValueError):
klass(n_iter_no_change=0)


@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM]
)
def test_argument_coef(klass):
# Checks coef_init not allowed as model argument (only fit)
# Provided coef_ does not match dataset
with pytest.raises(TypeError):
klass(coef_init=np.zeros((3,)))


@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM]
)
def test_provide_coef(klass):
# Checks coef_init shape for the warm starts
# Provided coef_ does not match dataset.
with pytest.raises(ValueError):
"""Check that the shape of `coef_init` is validated."""
with pytest.raises(ValueError, match="Provided coef_init does not match dataset"):
klass().fit(X, Y, coef_init=np.zeros((3,)))


@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM]
"klass, fit_params",
[
(SGDClassifier, {"intercept_init": np.zeros((3,))}),
(SparseSGDClassifier, {"intercept_init": np.zeros((3,))}),
(SGDOneClassSVM, {"offset_init": np.zeros((3,))}),
(SparseSGDOneClassSVM, {"offset_init": np.zeros((3,))}),
],
)
def test_set_intercept(klass):
# Checks intercept_ shape for the warm starts
# Provided intercept_ does not match dataset.
if klass in [SGDClassifier, SparseSGDClassifier]:
with pytest.raises(ValueError):
klass().fit(X, Y, intercept_init=np.zeros((3,)))
elif klass in [SGDOneClassSVM, SparseSGDOneClassSVM]:
with pytest.raises(ValueError):
klass().fit(X, Y, offset_init=np.zeros((3,)))
def test_set_intercept_offset(klass, fit_params):
"""Check that `intercept_init` or `offset_init` is validated."""
sgd_estimator = klass()
with pytest.raises(ValueError, match="does not match dataset"):
sgd_estimator.fit(X, Y, **fit_params)


@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]
)
def test_sgd_early_stopping_with_partial_fit(klass):
# Test parameter validity check
with pytest.raises(ValueError):
"""Check that we raise an error for `early_stopping` used with
`partial_fit`.
"""
err_msg = "early_stopping should be False with partial_fit"
with pytest.raises(ValueError, match=err_msg):
klass(early_stopping=True).partial_fit(X, Y)


@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_set_intercept_binary(klass):
# Checks intercept_ shape for the warm starts in binary case
klass().fit(X5, Y5, intercept_init=0)
@pytest.mark.parametrize(
"klass, fit_params",
[
(SGDClassifier, {"intercept_init": 0}),
(SparseSGDClassifier, {"intercept_init": 0}),
(SGDOneClassSVM, {"offset_init": 0}),
(SparseSGDOneClassSVM, {"offset_init": 0}),
],
)
def test_set_intercept_offset_binary(klass, fit_params):
"""Check that we can pass a scaler with binary classification to
`intercept_init` or `offset_init`."""
klass().fit(X5, Y5, **fit_params)


@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
@@ -1537,19 +1493,6 @@ def asgd_oneclass(klass, X, eta, nu, coef_init=None, offset_init=0.0):
return average_coef, 1 - average_intercept


@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM])
@pytest.mark.parametrize("nu", [-0.5, 2])
def test_bad_nu_values(klass, nu):
msg = r"nu must be in \(0, 1]"
with pytest.raises(ValueError, match=msg):
klass(nu=nu)

clf = klass(nu=0.05)
clf2 = clone(clf)
with pytest.raises(ValueError, match=msg):
clf2.set_params(nu=nu)


@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM])
def _test_warm_start_oneclass(klass, X, lr):
# Test that explicit warm restart...