From d615be0e45b062ab90a1867bac9cc2551f417ebb Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Aug 2021 12:22:54 +0200 Subject: [PATCH 01/14] API fix params validation in SGD models --- sklearn/linear_model/_stochastic_gradient.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 1c718df7765cf..7981ee3f43f81 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -120,9 +120,6 @@ def __init__( self.average = average self.max_iter = max_iter self.tol = tol - # current tests expect init to do parameter validation - # but we are not allowed to set attributes - self._validate_params() def set_params(self, **kwargs): """Set and validate the parameters of estimator. @@ -138,7 +135,6 @@ def set_params(self, **kwargs): Estimator instance. """ super().set_params(**kwargs) - self._validate_params() return self @abstractmethod From 9f69304358fa9c6e3d30060b7363543048272442 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Aug 2021 14:10:37 +0200 Subject: [PATCH 02/14] TST refactor validation params SGD tests --- sklearn/linear_model/tests/test_sgd.py | 38 ++++++++++++-------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 04abdcd9d6f0e..133057910574d 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -216,30 +216,26 @@ def asgd(klass, X, y, eta, alpha, weight_init=None, intercept_init=0.0): @pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] + "SGDEstimator", + [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_sgd_bad_alpha(klass): - # Check whether expected ValueError on bad alpha - with pytest.raises(ValueError): - klass(alpha=-0.1) - - +@pytest.mark.parametrize("fit_method", ["fit", "partial_fit"]) @pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] -) -def test_sgd_bad_penalty(klass): - # Check whether expected ValueError on bad penalty - with pytest.raises(ValueError): - klass(penalty="foobar", l1_ratio=0.85) - - -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] + "params, err_msg", + [ + ({"alpha": -0.1}, "alpha must be >= 0"), + ({"penalty": "foobar", "l1_ratio": 0.85}, "Penalty foobar is not supported"), + ({"loss": "foobar"}, "The loss foobar is not supported"), + ], ) -def test_sgd_bad_loss(klass): - # Check whether expected ValueError on bad loss - with pytest.raises(ValueError): - klass(loss="foobar") +def test_sgd_estimator_params_validation(SGDEstimator, fit_method, params, err_msg): + sgd_estimator = SGDEstimator(**params) + with pytest.raises(ValueError, match=err_msg): + if is_classifier(sgd_estimator) and fit_method == "partial_fit": + fit_params = {"classes": np.unique(Y)} + else: + fit_params = {} + getattr(sgd_estimator, fit_method)(X, Y, **fit_params) def _test_warm_start(klass, X, Y, lr): From 794eb89b7d0c87a557a0882b1fc693a48b767a96 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Aug 2021 15:15:24 +0200 Subject: [PATCH 03/14] TST refactor SGD test --- sklearn/linear_model/tests/test_sgd.py | 715 ++++++++++++------------- 1 file changed, 351 insertions(+), 364 deletions(-) diff --git a/sklearn/linear_model/tests/test_sgd.py 
b/sklearn/linear_model/tests/test_sgd.py index 133057910574d..4347be499a60f 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -1,11 +1,11 @@ import pickle -import pytest +import joblib +import pytest import numpy as np -from numpy.testing import assert_allclose import scipy.sparse as sp -import joblib +from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal @@ -181,7 +181,7 @@ def SparseSGDOneClassSVM(**kwargs): # a simple implementation of ASGD to use for testing # uses squared loss to find the gradient -def asgd(klass, X, y, eta, alpha, weight_init=None, intercept_init=0.0): +def asgd(SGDEstimator, X, y, eta, alpha, weight_init=None, intercept_init=0.0): if weight_init is None: weights = np.zeros(X.shape[1]) else: @@ -193,7 +193,7 @@ def asgd(klass, X, y, eta, alpha, weight_init=None, intercept_init=0.0): decay = 1.0 # sparse data has a fixed decay of .01 - if klass in (SparseSGDClassifier, SparseSGDRegressor): + if SGDEstimator in (SparseSGDClassifier, SparseSGDRegressor): decay = 0.01 for i, entry in enumerate(X): @@ -217,7 +217,14 @@ def asgd(klass, X, y, eta, alpha, weight_init=None, intercept_init=0.0): @pytest.mark.parametrize( "SGDEstimator", - [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], + [ + SGDClassifier, + SparseSGDClassifier, + SGDRegressor, + SparseSGDRegressor, + SGDOneClassSVM, + SparseSGDOneClassSVM, + ], ) @pytest.mark.parametrize("fit_method", ["fit", "partial_fit"]) @pytest.mark.parametrize( @@ -226,10 +233,32 @@ def asgd(klass, X, y, eta, alpha, weight_init=None, intercept_init=0.0): ({"alpha": -0.1}, "alpha must be >= 0"), ({"penalty": "foobar", "l1_ratio": 0.85}, "Penalty foobar is not supported"), ({"loss": "foobar"}, "The loss foobar is not supported"), + ({"l1_ratio": 1.1}, r"l1_ratio must be in \[0, 1\]"), + ({"learning_rate": ""}, "learning rate is not supported"), + ({"nu": -0.5}, r"nu must be in \(0, 1]"), + ({"nu": 2}, r"nu must be in \(0, 1]"), + ({"alpha": 0, "learning_rate": "optimal"}, "alpha must be > 0"), + ({"eta0": 0, "learning_rate": "constant"}, "eta0 must be > 0"), + ({"max_iter": -1}, "max_iter must be > zero"), + ({"shuffle": "false"}, "shuffle must be either True or False"), + ({"early_stopping": "false"}, "early_stopping must be either True or False"), + ( + {"validation_fraction": -0.1}, + r"validation_fraction must be in range \(0, 1\)", + ), + ({"n_iter_no_change": 0}, "n_iter_no_change must be >= 1"), ], ) def test_sgd_estimator_params_validation(SGDEstimator, fit_method, params, err_msg): - sgd_estimator = SGDEstimator(**params) + """Validate parameters in the different SGD estimators.""" + try: + sgd_estimator = SGDEstimator(**params) + except TypeError as err: + if "__init__() got an unexpected keyword argument" in str(err): + # skip test if the parameter is not supported by the estimator + return + raise err + with pytest.raises(ValueError, match=err_msg): if is_classifier(sgd_estimator) and fit_method == "partial_fit": fit_params = {"classes": np.unique(Y)} @@ -238,16 +267,16 @@ def test_sgd_estimator_params_validation(SGDEstimator, fit_method, params, err_m getattr(sgd_estimator, fit_method)(X, Y, **fit_params) -def _test_warm_start(klass, X, Y, lr): +def _test_warm_start(SGDEstimator, X, Y, lr): # Test that explicit warm restart... 
- clf = klass(alpha=0.01, eta0=0.01, shuffle=False, learning_rate=lr) + clf = SGDEstimator(alpha=0.01, eta0=0.01, shuffle=False, learning_rate=lr) clf.fit(X, Y) - clf2 = klass(alpha=0.001, eta0=0.01, shuffle=False, learning_rate=lr) + clf2 = SGDEstimator(alpha=0.001, eta0=0.01, shuffle=False, learning_rate=lr) clf2.fit(X, Y, coef_init=clf.coef_.copy(), intercept_init=clf.intercept_.copy()) # ... and implicit warm restart are equivalent. - clf3 = klass( + clf3 = SGDEstimator( alpha=0.01, eta0=0.01, shuffle=False, warm_start=True, learning_rate=lr ) clf3.fit(X, Y) @@ -263,19 +292,21 @@ def _test_warm_start(klass, X, Y, lr): @pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] + "SGDEstimator", + [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) @pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) -def test_warm_start(klass, lr): - _test_warm_start(klass, X, Y, lr) +def test_warm_start(SGDEstimator, lr): + _test_warm_start(SGDEstimator, X, Y, lr) @pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] + "SGDEstimator", + [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_input_format(klass): +def test_input_format(SGDEstimator): # Input format tests. - clf = klass(alpha=0.01, shuffle=False) + clf = SGDEstimator(alpha=0.01, shuffle=False) clf.fit(X, Y) Y_ = np.array(Y)[:, np.newaxis] @@ -285,23 +316,24 @@ def test_input_format(klass): @pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] + "SGDEstimator", + [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_clone(klass): +def test_clone(SGDEstimator): # Test whether clone works ok. 
- clf = klass(alpha=0.01, penalty="l1") + clf = SGDEstimator(alpha=0.01, penalty="l1") clf = clone(clf) clf.set_params(penalty="l2") clf.fit(X, Y) - clf2 = klass(alpha=0.01, penalty="l2") + clf2 = SGDEstimator(alpha=0.01, penalty="l2") clf2.fit(X, Y) assert_array_equal(clf.coef_, clf2.coef_) @pytest.mark.parametrize( - "klass", + "SGDEstimator", [ SGDClassifier, SparseSGDClassifier, @@ -311,8 +343,8 @@ def test_clone(klass): SparseSGDOneClassSVM, ], ) -def test_plain_has_no_average_attr(klass): - clf = klass(average=True, eta0=0.01) +def test_plain_has_no_average_attr(SGDEstimator): + clf = SGDEstimator(average=True, eta0=0.01) clf.fit(X, Y) assert hasattr(clf, "_average_coef") @@ -320,7 +352,7 @@ def test_plain_has_no_average_attr(klass): assert hasattr(clf, "_standard_intercept") assert hasattr(clf, "_standard_coef") - clf = klass() + clf = SGDEstimator() clf.fit(X, Y) assert not hasattr(clf, "_average_coef") @@ -330,7 +362,7 @@ def test_plain_has_no_average_attr(klass): @pytest.mark.parametrize( - "klass", + "SGDEstimator", [ SGDClassifier, SparseSGDClassifier, @@ -340,9 +372,9 @@ def test_plain_has_no_average_attr(klass): SparseSGDOneClassSVM, ], ) -def test_late_onset_averaging_not_reached(klass): - clf1 = klass(average=600) - clf2 = klass() +def test_late_onset_averaging_not_reached(SGDEstimator): + clf1 = SGDEstimator(average=600) + clf2 = SGDEstimator() for _ in range(100): if is_classifier(clf1): clf1.partial_fit(X, Y, classes=np.unique(Y)) @@ -352,23 +384,29 @@ def test_late_onset_averaging_not_reached(klass): clf2.partial_fit(X, Y) assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=16) - if klass in [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]: + if SGDEstimator in [ + SGDClassifier, + SparseSGDClassifier, + SGDRegressor, + SparseSGDRegressor, + ]: assert_almost_equal(clf1.intercept_, clf2.intercept_, decimal=16) - elif klass in [SGDOneClassSVM, SparseSGDOneClassSVM]: + elif SGDEstimator in [SGDOneClassSVM, SparseSGDOneClassSVM]: assert_allclose(clf1.offset_, clf2.offset_) @pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] + "SGDEstimator", + [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_late_onset_averaging_reached(klass): +def test_late_onset_averaging_reached(SGDEstimator): eta0 = 0.001 alpha = 0.0001 Y_encode = np.array(Y) Y_encode[Y_encode == 1] = -1.0 Y_encode[Y_encode == 2] = 1.0 - clf1 = klass( + clf1 = SGDEstimator( average=7, learning_rate="constant", loss="squared_error", @@ -377,7 +415,7 @@ def test_late_onset_averaging_reached(klass): max_iter=2, shuffle=False, ) - clf2 = klass( + clf2 = SGDEstimator( average=0, learning_rate="constant", loss="squared_error", @@ -391,7 +429,7 @@ def test_late_onset_averaging_reached(klass): clf2.fit(X, Y_encode) average_weights, average_intercept = asgd( - klass, + SGDEstimator, X, Y_encode, eta0, @@ -405,50 +443,43 @@ def test_late_onset_averaging_reached(klass): @pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] -) -def test_sgd_bad_alpha_for_optimal_learning_rate(klass): - # Check whether expected ValueError on bad alpha, i.e. 
0 - # since alpha is used to compute the optimal learning rate - with pytest.raises(ValueError): - klass(alpha=0, learning_rate="optimal") - - -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] + "SGDEstimator", + [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_early_stopping(klass): +def test_early_stopping(SGDEstimator): X = iris.data[iris.target > 0] Y = iris.target[iris.target > 0] for early_stopping in [True, False]: max_iter = 1000 - clf = klass(early_stopping=early_stopping, tol=1e-3, max_iter=max_iter).fit( - X, Y - ) + clf = SGDEstimator( + early_stopping=early_stopping, tol=1e-3, max_iter=max_iter + ).fit(X, Y) assert clf.n_iter_ < max_iter @pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] + "SGDEstimator", + [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_adaptive_longer_than_constant(klass): - clf1 = klass(learning_rate="adaptive", eta0=0.01, tol=1e-3, max_iter=100) +def test_adaptive_longer_than_constant(SGDEstimator): + clf1 = SGDEstimator(learning_rate="adaptive", eta0=0.01, tol=1e-3, max_iter=100) clf1.fit(iris.data, iris.target) - clf2 = klass(learning_rate="constant", eta0=0.01, tol=1e-3, max_iter=100) + clf2 = SGDEstimator(learning_rate="constant", eta0=0.01, tol=1e-3, max_iter=100) clf2.fit(iris.data, iris.target) assert clf1.n_iter_ > clf2.n_iter_ @pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] + "SGDEstimator", + [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_validation_set_not_used_for_training(klass): +def test_validation_set_not_used_for_training(SGDEstimator): X, Y = iris.data, iris.target validation_fraction = 0.4 seed = 42 shuffle = False max_iter = 10 - clf1 = klass( + clf1 = SGDEstimator( early_stopping=True, random_state=np.random.RandomState(seed), validation_fraction=validation_fraction, @@ -461,7 +492,7 @@ def test_validation_set_not_used_for_training(klass): clf1.fit(X, Y) assert clf1.n_iter_ == max_iter - clf2 = klass( + clf2 = SGDEstimator( early_stopping=False, random_state=np.random.RandomState(seed), learning_rate="constant", @@ -484,14 +515,15 @@ def test_validation_set_not_used_for_training(klass): @pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] + "SGDEstimator", + [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_n_iter_no_change(klass): +def test_n_iter_no_change(SGDEstimator): X, Y = iris.data, iris.target # test that n_iter_ increases monotonically with n_iter_no_change for early_stopping in [True, False]: n_iter_list = [ - klass( + SGDEstimator( early_stopping=early_stopping, n_iter_no_change=n_iter_no_change, tol=1e-4, @@ -505,11 +537,12 @@ def test_n_iter_no_change(klass): @pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] + "SGDEstimator", + [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_not_enough_sample_for_early_stopping(klass): +def test_not_enough_sample_for_early_stopping(SGDEstimator): # test an error is raised if the training or validation set is empty - clf = klass(early_stopping=True, validation_fraction=0.99) + clf = SGDEstimator(early_stopping=True, validation_fraction=0.99) with pytest.raises(ValueError): clf.fit(X3, Y3) @@ -518,12 +551,12 @@ def 
test_not_enough_sample_for_early_stopping(klass): # Classification Test Case -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_clf(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_clf(SGDEstimator): # Check that SGD gives any results :-) for loss in ("hinge", "squared_hinge", "log", "modified_huber"): - clf = klass( + clf = SGDEstimator( penalty="l2", alpha=0.01, fit_intercept=True, @@ -536,119 +569,62 @@ def test_sgd_clf(klass): assert_array_equal(clf.predict(T), true_result) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_bad_l1_ratio(klass): - # Check whether expected ValueError on bad l1_ratio - with pytest.raises(ValueError): - klass(l1_ratio=1.1) - - -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM] -) -def test_sgd_bad_learning_rate_schedule(klass): - # Check whether expected ValueError on bad learning_rate - with pytest.raises(ValueError): - klass(learning_rate="") - - -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM] -) -def test_sgd_bad_eta0(klass): - # Check whether expected ValueError on bad eta0 - with pytest.raises(ValueError): - klass(eta0=0, learning_rate="constant") - - @pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM] + "SGDEstimator", + [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM], ) -def test_sgd_max_iter_param(klass): - # Test parameter validity check - with pytest.raises(ValueError): - klass(max_iter=-10000) +def test_provide_coef(SGDEstimator): + """Check that the shape of `coef_init` is validated.""" + with pytest.raises(ValueError, match="Provided coef_init does not match dataset"): + SGDEstimator().fit(X, Y, coef_init=np.zeros((3,))) @pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM] -) -def test_sgd_shuffle_param(klass): - # Test parameter validity check - with pytest.raises(ValueError): - klass(shuffle="false") - - -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_early_stopping_param(klass): - # Test parameter validity check - with pytest.raises(ValueError): - klass(early_stopping="false") - - -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_validation_fraction(klass): - # Test parameter validity check - with pytest.raises(ValueError): - klass(validation_fraction=-0.1) - - -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_n_iter_no_change(klass): - # Test parameter validity check - with pytest.raises(ValueError): - klass(n_iter_no_change=0) - - -@pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM] + "SGDEstimator, fit_params", + [ + (SGDClassifier, {"intercept_init": np.zeros((3,))}), + (SparseSGDClassifier, {"intercept_init": np.zeros((3,))}), + (SGDOneClassSVM, {"offset_init": np.zeros((3,))}), + (SparseSGDOneClassSVM, {"offset_init": np.zeros((3,))}), + ], ) -def test_argument_coef(klass): - # Checks coef_init not allowed as model argument (only fit) - # Provided coef_ does not match dataset - with pytest.raises(TypeError): - klass(coef_init=np.zeros((3,))) +def test_set_intercept_offset(SGDEstimator, fit_params): + """Check that `intercept_init` or `offset_init` is validated.""" + 
sgd_estimator = SGDEstimator() + with pytest.raises(ValueError, match="does not match dataset"): + sgd_estimator.fit(X, Y, **fit_params) @pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM] + "SGDEstimator", + [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_provide_coef(klass): - # Checks coef_init shape for the warm starts - # Provided coef_ does not match dataset. - with pytest.raises(ValueError): - klass().fit(X, Y, coef_init=np.zeros((3,))) +def test_sgd_early_stopping_with_partial_fit(SGDEstimator): + """Check that we raise an error for `early_stopping` used with + `partial_fit`. + """ + err_msg = "early_stopping should be False with partial_fit" + with pytest.raises(ValueError, match=err_msg): + SGDEstimator(early_stopping=True).partial_fit(X, Y) @pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM] + "SGDEstimator, fit_params", + [ + (SGDClassifier, {"intercept_init": 0}), + (SparseSGDClassifier, {"intercept_init": 0}), + (SGDOneClassSVM, {"offset_init": 0}), + (SparseSGDOneClassSVM, {"offset_init": 0}), + ], ) -def test_set_intercept(klass): - # Checks intercept_ shape for the warm starts - # Provided intercept_ does not match dataset. - if klass in [SGDClassifier, SparseSGDClassifier]: - with pytest.raises(ValueError): - klass().fit(X, Y, intercept_init=np.zeros((3,))) - elif klass in [SGDOneClassSVM, SparseSGDOneClassSVM]: - with pytest.raises(ValueError): - klass().fit(X, Y, offset_init=np.zeros((3,))) - - -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_early_stopping_with_partial_fit(klass): - # Test parameter validity check - with pytest.raises(ValueError): - klass(early_stopping=True).partial_fit(X, Y) - - -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_set_intercept_binary(klass): - # Checks intercept_ shape for the warm starts in binary case - klass().fit(X5, Y5, intercept_init=0) +def test_set_intercept_offset_binary(SGDEstimator, fit_params): + """Check that we can pass a scaler with binary classification to + `intercept_init` or `offset_init`.""" + SGDEstimator().fit(X5, Y5, **fit_params) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_average_binary_computed_correctly(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_average_binary_computed_correctly(SGDEstimator): # Checks the SGDClassifier correctly computes the average weights eta = 0.1 alpha = 2.0 @@ -658,7 +634,7 @@ def test_average_binary_computed_correctly(klass): X = rng.normal(size=(n_samples, n_features)) w = rng.normal(size=n_features) - clf = klass( + clf = SGDEstimator( loss="squared_error", learning_rate="constant", eta0=eta, @@ -675,32 +651,32 @@ def test_average_binary_computed_correctly(klass): clf.fit(X, y) - average_weights, average_intercept = asgd(klass, X, y, eta, alpha) + average_weights, average_intercept = asgd(SGDEstimator, X, y, eta, alpha) average_weights = average_weights.reshape(1, -1) assert_array_almost_equal(clf.coef_, average_weights, decimal=14) assert_almost_equal(clf.intercept_, average_intercept, decimal=14) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_set_intercept_to_intercept(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_set_intercept_to_intercept(SGDEstimator): # Checks intercept_ shape 
consistency for the warm starts # Inconsistent intercept_ shape. - clf = klass().fit(X5, Y5) - klass().fit(X5, Y5, intercept_init=clf.intercept_) - clf = klass().fit(X, Y) - klass().fit(X, Y, intercept_init=clf.intercept_) + clf = SGDEstimator().fit(X5, Y5) + SGDEstimator().fit(X5, Y5, intercept_init=clf.intercept_) + clf = SGDEstimator().fit(X, Y) + SGDEstimator().fit(X, Y, intercept_init=clf.intercept_) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_at_least_two_labels(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_at_least_two_labels(SGDEstimator): # Target must have at least two labels - clf = klass(alpha=0.01, max_iter=20) + clf = SGDEstimator(alpha=0.01, max_iter=20) with pytest.raises(ValueError): clf.fit(X2, np.ones(9)) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_partial_fit_weight_class_balanced(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_weight_class_balanced(SGDEstimator): # partial_fit with class_weight='balanced' not supported""" regex = ( r"class_weight 'balanced' is not supported for " @@ -713,13 +689,13 @@ def test_partial_fit_weight_class_balanced(klass): r"parameter\." ) with pytest.raises(ValueError, match=regex): - klass(class_weight="balanced").partial_fit(X, Y, classes=np.unique(Y)) + SGDEstimator(class_weight="balanced").partial_fit(X, Y, classes=np.unique(Y)) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_multiclass(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass(SGDEstimator): # Multi-class test case - clf = klass(alpha=0.01, max_iter=20).fit(X2, Y2) + clf = SGDEstimator(alpha=0.01, max_iter=20).fit(X2, Y2) assert clf.coef_.shape == (3, 2) assert clf.intercept_.shape == (3,) assert clf.decision_function([[0, 0]]).shape == (1, 3) @@ -727,12 +703,12 @@ def test_sgd_multiclass(klass): assert_array_equal(pred, true_result2) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_multiclass_average(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass_average(SGDEstimator): eta = 0.001 alpha = 0.01 # Multi-class average test case - clf = klass( + clf = SGDEstimator( loss="squared_error", learning_rate="constant", eta0=eta, @@ -750,15 +726,15 @@ def test_sgd_multiclass_average(klass): for i, cl in enumerate(classes): y_i = np.ones(np_Y2.shape[0]) y_i[np_Y2 != cl] = -1 - average_coef, average_intercept = asgd(klass, X2, y_i, eta, alpha) + average_coef, average_intercept = asgd(SGDEstimator, X2, y_i, eta, alpha) assert_array_almost_equal(average_coef, clf.coef_[i], decimal=16) assert_almost_equal(average_intercept, clf.intercept_[i], decimal=16) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_multiclass_with_init_coef(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass_with_init_coef(SGDEstimator): # Multi-class test case - clf = klass(alpha=0.01, max_iter=20) + clf = SGDEstimator(alpha=0.01, max_iter=20) clf.fit(X2, Y2, coef_init=np.zeros((3, 2)), intercept_init=np.zeros(3)) assert clf.coef_.shape == (3, 2) assert clf.intercept_.shape, (3,) @@ -766,10 +742,10 @@ def test_sgd_multiclass_with_init_coef(klass): assert_array_equal(pred, true_result2) -@pytest.mark.parametrize("klass", 
[SGDClassifier, SparseSGDClassifier]) -def test_sgd_multiclass_njobs(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass_njobs(SGDEstimator): # Multi-class test case with multi-core support - clf = klass(alpha=0.01, max_iter=20, n_jobs=2).fit(X2, Y2) + clf = SGDEstimator(alpha=0.01, max_iter=20, n_jobs=2).fit(X2, Y2) assert clf.coef_.shape == (3, 2) assert clf.intercept_.shape == (3,) assert clf.decision_function([[0, 0]]).shape == (1, 3) @@ -777,31 +753,31 @@ def test_sgd_multiclass_njobs(klass): assert_array_equal(pred, true_result2) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_set_coef_multiclass(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_set_coef_multiclass(SGDEstimator): # Checks coef_init and intercept_init shape for multi-class # problems # Provided coef_ does not match dataset - clf = klass() + clf = SGDEstimator() with pytest.raises(ValueError): clf.fit(X2, Y2, coef_init=np.zeros((2, 2))) # Provided coef_ does match dataset - clf = klass().fit(X2, Y2, coef_init=np.zeros((3, 2))) + clf = SGDEstimator().fit(X2, Y2, coef_init=np.zeros((3, 2))) # Provided intercept_ does not match dataset - clf = klass() + clf = SGDEstimator() with pytest.raises(ValueError): clf.fit(X2, Y2, intercept_init=np.zeros((1,))) # Provided intercept_ does match dataset. - clf = klass().fit(X2, Y2, intercept_init=np.zeros((3,))) + clf = SGDEstimator().fit(X2, Y2, intercept_init=np.zeros((3,))) # TODO: Remove filterwarnings in v1.2. @pytest.mark.filterwarnings("ignore:.*squared_loss.*:FutureWarning") -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_predict_proba_method_access(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_predict_proba_method_access(SGDEstimator): # Checks that SGDClassifier predict_proba and predict_log_proba methods # can either be accessed or raise an appropriate error message # otherwise. See @@ -824,8 +800,8 @@ def test_sgd_predict_proba_method_access(klass): clf.predict_log_proba -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_proba(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_proba(SGDEstimator): # Check SGD.predict_proba # Hinge loss does not allow for conditional prob estimate. @@ -838,7 +814,7 @@ def test_sgd_proba(klass): # log and modified_huber losses can output probability estimates # binary case for loss in ["log", "modified_huber"]: - clf = klass(loss=loss, alpha=0.01, max_iter=10) + clf = SGDEstimator(loss=loss, alpha=0.01, max_iter=10) clf.fit(X, Y) p = clf.predict_proba([[3, 2]]) assert p[0, 1] > 0.5 @@ -851,7 +827,7 @@ def test_sgd_proba(klass): assert p[0, 1] < p[0, 0] # log loss multiclass probability estimates - clf = klass(loss="log", alpha=0.01, max_iter=10).fit(X2, Y2) + clf = SGDEstimator(loss="log", alpha=0.01, max_iter=10).fit(X2, Y2) d = clf.decision_function([[0.1, -0.1], [0.3, 0.2]]) p = clf.predict_proba([[0.1, -0.1], [0.3, 0.2]]) @@ -874,11 +850,11 @@ def test_sgd_proba(klass): # Modified Huber multiclass probability estimates; requires a separate # test because the hard zero/one probabilities may destroy the # ordering present in decision_function output. 
- clf = klass(loss="modified_huber", alpha=0.01, max_iter=10) + clf = SGDEstimator(loss="modified_huber", alpha=0.01, max_iter=10) clf.fit(X2, Y2) d = clf.decision_function([[3, 2]]) p = clf.predict_proba([[3, 2]]) - if klass != SparseSGDClassifier: + if SGDEstimator != SparseSGDClassifier: assert np.argmax(d, axis=1) == np.argmax(p, axis=1) else: # XXX the sparse test gets a different X2 (?) assert np.argmin(d, axis=1) == np.argmin(p, axis=1) @@ -893,8 +869,8 @@ def test_sgd_proba(klass): assert_array_almost_equal(p[0], [1 / 3.0] * 3) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_l1(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_l1(SGDEstimator): # Test L1 regularization n = len(X4) rng = np.random.RandomState(13) @@ -904,7 +880,7 @@ def test_sgd_l1(klass): X = X4[idx, :] Y = Y4[idx] - clf = klass( + clf = SGDEstimator( penalty="l1", alpha=0.2, fit_intercept=False, @@ -930,18 +906,20 @@ def test_sgd_l1(klass): assert_array_equal(pred, Y) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_class_weights(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_class_weights(SGDEstimator): # Test class weights. X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) y = [1, 1, 1, -1, -1] - clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, class_weight=None) + clf = SGDEstimator(alpha=0.1, max_iter=1000, fit_intercept=False, class_weight=None) clf.fit(X, y) assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) # we give a small weights to class 1 - clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, class_weight={1: 0.001}) + clf = SGDEstimator( + alpha=0.1, max_iter=1000, fit_intercept=False, class_weight={1: 0.001} + ) clf.fit(X, y) # now the hyperplane should rotate clock-wise and @@ -949,41 +927,41 @@ def test_class_weights(klass): assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_equal_class_weight(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_equal_class_weight(SGDEstimator): # Test if equal class weights approx. equals no class weights. X = [[1, 0], [1, 0], [0, 1], [0, 1]] y = [0, 0, 1, 1] - clf = klass(alpha=0.1, max_iter=1000, class_weight=None) + clf = SGDEstimator(alpha=0.1, max_iter=1000, class_weight=None) clf.fit(X, y) X = [[1, 0], [0, 1]] y = [0, 1] - clf_weighted = klass(alpha=0.1, max_iter=1000, class_weight={0: 0.5, 1: 0.5}) + clf_weighted = SGDEstimator(alpha=0.1, max_iter=1000, class_weight={0: 0.5, 1: 0.5}) clf_weighted.fit(X, y) # should be similar up to some epsilon due to learning rate schedule assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_wrong_class_weight_label(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_wrong_class_weight_label(SGDEstimator): # ValueError due to not existing class label. 
- clf = klass(alpha=0.1, max_iter=1000, class_weight={0: 0.5}) + clf = SGDEstimator(alpha=0.1, max_iter=1000, class_weight={0: 0.5}) with pytest.raises(ValueError): clf.fit(X, Y) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_wrong_class_weight_format(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_wrong_class_weight_format(SGDEstimator): # ValueError due to wrong class_weight argument type. - clf = klass(alpha=0.1, max_iter=1000, class_weight=[0.5]) + clf = SGDEstimator(alpha=0.1, max_iter=1000, class_weight=[0.5]) with pytest.raises(ValueError): clf.fit(X, Y) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_weights_multiplied(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_weights_multiplied(SGDEstimator): # Tests that class_weight and sample_weight are multiplicative class_weights = {1: 0.6, 2: 0.3} rng = np.random.RandomState(0) @@ -992,8 +970,8 @@ def test_weights_multiplied(klass): multiplied_together[Y4 == 1] *= class_weights[1] multiplied_together[Y4 == 2] *= class_weights[2] - clf1 = klass(alpha=0.1, max_iter=20, class_weight=class_weights) - clf2 = klass(alpha=0.1, max_iter=20) + clf1 = SGDEstimator(alpha=0.1, max_iter=20, class_weight=class_weights) + clf2 = SGDEstimator(alpha=0.1, max_iter=20) clf1.fit(X4, Y4, sample_weight=sample_weights) clf2.fit(X4, Y4, sample_weight=multiplied_together) @@ -1001,8 +979,8 @@ def test_weights_multiplied(klass): assert_almost_equal(clf1.coef_, clf2.coef_) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_balanced_weight(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_balanced_weight(SGDEstimator): # Test class weights for imbalanced data""" # compute reference metrics on iris dataset that is quite balanced by # default @@ -1013,12 +991,14 @@ def test_balanced_weight(klass): rng.shuffle(idx) X = X[idx] y = y[idx] - clf = klass(alpha=0.0001, max_iter=1000, class_weight=None, shuffle=False).fit(X, y) + clf = SGDEstimator( + alpha=0.0001, max_iter=1000, class_weight=None, shuffle=False + ).fit(X, y) f1 = metrics.f1_score(y, clf.predict(X), average="weighted") assert_almost_equal(f1, 0.96, decimal=1) # make the same prediction using balanced class_weight - clf_balanced = klass( + clf_balanced = SGDEstimator( alpha=0.0001, max_iter=1000, class_weight="balanced", shuffle=False ).fit(X, y) f1 = metrics.f1_score(y, clf_balanced.predict(X), average="weighted") @@ -1036,25 +1016,25 @@ def test_balanced_weight(klass): y_imbalanced = np.concatenate([y] + [y_0] * 10) # fit a model on the imbalanced data without class weight info - clf = klass(max_iter=1000, class_weight=None, shuffle=False) + clf = SGDEstimator(max_iter=1000, class_weight=None, shuffle=False) clf.fit(X_imbalanced, y_imbalanced) y_pred = clf.predict(X) assert metrics.f1_score(y, y_pred, average="weighted") < 0.96 # fit a model with balanced class_weight enabled - clf = klass(max_iter=1000, class_weight="balanced", shuffle=False) + clf = SGDEstimator(max_iter=1000, class_weight="balanced", shuffle=False) clf.fit(X_imbalanced, y_imbalanced) y_pred = clf.predict(X) assert metrics.f1_score(y, y_pred, average="weighted") > 0.96 -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_sample_weights(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def 
test_sample_weights(SGDEstimator): # Test weights on individual samples X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) y = [1, 1, 1, -1, -1] - clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False) + clf = SGDEstimator(alpha=0.1, max_iter=1000, fit_intercept=False) clf.fit(X, y) assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) @@ -1067,31 +1047,32 @@ def test_sample_weights(klass): @pytest.mark.parametrize( - "klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM] + "SGDEstimator", + [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM], ) -def test_wrong_sample_weights(klass): +def test_wrong_sample_weights(SGDEstimator): # Test if ValueError is raised if sample_weight has wrong shape - if klass in [SGDClassifier, SparseSGDClassifier]: - clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False) - elif klass in [SGDOneClassSVM, SparseSGDOneClassSVM]: - clf = klass(nu=0.1, max_iter=1000, fit_intercept=False) + if SGDEstimator in [SGDClassifier, SparseSGDClassifier]: + clf = SGDEstimator(alpha=0.1, max_iter=1000, fit_intercept=False) + elif SGDEstimator in [SGDOneClassSVM, SparseSGDOneClassSVM]: + clf = SGDEstimator(nu=0.1, max_iter=1000, fit_intercept=False) # provided sample_weight too long with pytest.raises(ValueError): clf.fit(X, Y, sample_weight=np.arange(7)) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_partial_fit_exception(klass): - clf = klass(alpha=0.01) +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_exception(SGDEstimator): + clf = SGDEstimator(alpha=0.01) # classes was not specified with pytest.raises(ValueError): clf.partial_fit(X3, Y3) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_partial_fit_binary(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_binary(SGDEstimator): third = X.shape[0] // 3 - clf = klass(alpha=0.01) + clf = SGDEstimator(alpha=0.01) classes = np.unique(Y) clf.partial_fit(X[:third], Y[:third], classes=classes) @@ -1109,10 +1090,10 @@ def test_partial_fit_binary(klass): assert_array_equal(y_pred, true_result) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_partial_fit_multiclass(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_multiclass(SGDEstimator): third = X2.shape[0] // 3 - clf = klass(alpha=0.01) + clf = SGDEstimator(alpha=0.01) classes = np.unique(Y2) clf.partial_fit(X2[:third], Y2[:third], classes=classes) @@ -1127,10 +1108,10 @@ def test_partial_fit_multiclass(klass): assert id1, id2 -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_partial_fit_multiclass_average(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_multiclass_average(SGDEstimator): third = X2.shape[0] // 3 - clf = klass(alpha=0.01, average=X2.shape[0]) + clf = SGDEstimator(alpha=0.01, average=X2.shape[0]) classes = np.unique(Y2) clf.partial_fit(X2[:third], Y2[:third], classes=classes) @@ -1142,27 +1123,29 @@ def test_partial_fit_multiclass_average(klass): assert clf.intercept_.shape == (3,) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_fit_then_partial_fit(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def 
test_fit_then_partial_fit(SGDEstimator): # Partial_fit should work after initial fit in the multiclass case. # Non-regression test for #2496; fit would previously produce a # Fortran-ordered coef_ that subsequent partial_fit couldn't handle. - clf = klass() + clf = SGDEstimator() clf.fit(X2, Y2) clf.partial_fit(X2, Y2) # no exception here -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) @pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) -def test_partial_fit_equal_fit_classif(klass, lr): +def test_partial_fit_equal_fit_classif(SGDEstimator, lr): for X_, Y_, T_ in ((X, Y, T), (X2, Y2, T2)): - clf = klass(alpha=0.01, eta0=0.01, max_iter=2, learning_rate=lr, shuffle=False) + clf = SGDEstimator( + alpha=0.01, eta0=0.01, max_iter=2, learning_rate=lr, shuffle=False + ) clf.fit(X_, Y_) y_pred = clf.decision_function(T_) t = clf.t_ classes = np.unique(Y_) - clf = klass(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False) + clf = SGDEstimator(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False) for i in range(2): clf.partial_fit(X_, Y_, classes=classes) y_pred2 = clf.decision_function(T_) @@ -1171,10 +1154,10 @@ def test_partial_fit_equal_fit_classif(klass, lr): assert_array_almost_equal(y_pred, y_pred2, decimal=2) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_regression_losses(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_regression_losses(SGDEstimator): random_state = np.random.RandomState(1) - clf = klass( + clf = SGDEstimator( alpha=0.01, learning_rate="constant", eta0=0.1, @@ -1184,7 +1167,7 @@ def test_regression_losses(klass): clf.fit(X, Y) assert 1.0 == np.mean(clf.predict(X) == Y) - clf = klass( + clf = SGDEstimator( alpha=0.01, learning_rate="constant", eta0=0.1, @@ -1194,11 +1177,11 @@ def test_regression_losses(klass): clf.fit(X, Y) assert 1.0 == np.mean(clf.predict(X) == Y) - clf = klass(alpha=0.01, loss="huber", random_state=random_state) + clf = SGDEstimator(alpha=0.01, loss="huber", random_state=random_state) clf.fit(X, Y) assert 1.0 == np.mean(clf.predict(X) == Y) - clf = klass( + clf = SGDEstimator( alpha=0.01, learning_rate="constant", eta0=0.01, @@ -1209,15 +1192,15 @@ def test_regression_losses(klass): assert 1.0 == np.mean(clf.predict(X) == Y) -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_warm_start_multiclass(klass): - _test_warm_start(klass, X2, Y2, "optimal") +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_warm_start_multiclass(SGDEstimator): + _test_warm_start(SGDEstimator, X2, Y2, "optimal") -@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) -def test_multiple_fit(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +def test_multiple_fit(SGDEstimator): # Test multiple calls of fit w/ different shaped inputs. - clf = klass(alpha=0.01, shuffle=False) + clf = SGDEstimator(alpha=0.01, shuffle=False) clf.fit(X, Y) assert hasattr(clf, "coef_") @@ -1230,16 +1213,16 @@ def test_multiple_fit(klass): # Regression Test Case -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) -def test_sgd_reg(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_reg(SGDEstimator): # Check that SGD gives any results. 
- clf = klass(alpha=0.1, max_iter=2, fit_intercept=False) + clf = SGDEstimator(alpha=0.1, max_iter=2, fit_intercept=False) clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2]) assert clf.coef_[0] == clf.coef_[1] -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) -def test_sgd_averaged_computed_correctly(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_averaged_computed_correctly(SGDEstimator): # Tests the average regressor matches the naive implementation eta = 0.001 @@ -1253,7 +1236,7 @@ def test_sgd_averaged_computed_correctly(klass): # simple linear function without noise y = np.dot(X, w) - clf = klass( + clf = SGDEstimator( loss="squared_error", learning_rate="constant", eta0=eta, @@ -1265,14 +1248,14 @@ def test_sgd_averaged_computed_correctly(klass): ) clf.fit(X, y) - average_weights, average_intercept = asgd(klass, X, y, eta, alpha) + average_weights, average_intercept = asgd(SGDEstimator, X, y, eta, alpha) assert_array_almost_equal(clf.coef_, average_weights, decimal=16) assert_almost_equal(clf.intercept_, average_intercept, decimal=16) -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) -def test_sgd_averaged_partial_fit(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_averaged_partial_fit(SGDEstimator): # Tests whether the partial fit yields the same average as the fit eta = 0.001 alpha = 0.01 @@ -1285,7 +1268,7 @@ def test_sgd_averaged_partial_fit(klass): # simple linear function without noise y = np.dot(X, w) - clf = klass( + clf = SGDEstimator( loss="squared_error", learning_rate="constant", eta0=eta, @@ -1298,19 +1281,19 @@ def test_sgd_averaged_partial_fit(klass): clf.partial_fit(X[: int(n_samples / 2)][:], y[: int(n_samples / 2)]) clf.partial_fit(X[int(n_samples / 2) :][:], y[int(n_samples / 2) :]) - average_weights, average_intercept = asgd(klass, X, y, eta, alpha) + average_weights, average_intercept = asgd(SGDEstimator, X, y, eta, alpha) assert_array_almost_equal(clf.coef_, average_weights, decimal=16) assert_almost_equal(clf.intercept_[0], average_intercept, decimal=16) -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) -def test_average_sparse(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) +def test_average_sparse(SGDEstimator): # Checks the average weights on data with 0s eta = 0.001 alpha = 0.01 - clf = klass( + clf = SGDEstimator( loss="squared_error", learning_rate="constant", eta0=eta, @@ -1325,14 +1308,14 @@ def test_average_sparse(klass): clf.partial_fit(X3[: int(n_samples / 2)][:], Y3[: int(n_samples / 2)]) clf.partial_fit(X3[int(n_samples / 2) :][:], Y3[int(n_samples / 2) :]) - average_weights, average_intercept = asgd(klass, X3, Y3, eta, alpha) + average_weights, average_intercept = asgd(SGDEstimator, X3, Y3, eta, alpha) assert_array_almost_equal(clf.coef_, average_weights, decimal=16) assert_almost_equal(clf.intercept_, average_intercept, decimal=16) -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) -def test_sgd_least_squares_fit(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_least_squares_fit(SGDEstimator): xmin, xmax = -5, 5 n_samples = 100 rng = np.random.RandomState(0) @@ -1341,7 +1324,9 @@ def test_sgd_least_squares_fit(klass): # simple linear function without noise y = 0.5 * X.ravel() - clf = klass(loss="squared_error", alpha=0.1, max_iter=20, fit_intercept=False) + clf = SGDEstimator( + 
loss="squared_error", alpha=0.1, max_iter=20, fit_intercept=False + ) clf.fit(X, y) score = clf.score(X, y) assert score > 0.99 @@ -1349,14 +1334,16 @@ def test_sgd_least_squares_fit(klass): # simple linear function with noise y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() - clf = klass(loss="squared_error", alpha=0.1, max_iter=20, fit_intercept=False) + clf = SGDEstimator( + loss="squared_error", alpha=0.1, max_iter=20, fit_intercept=False + ) clf.fit(X, y) score = clf.score(X, y) assert score > 0.5 -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) -def test_sgd_epsilon_insensitive(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_epsilon_insensitive(SGDEstimator): xmin, xmax = -5, 5 n_samples = 100 rng = np.random.RandomState(0) @@ -1365,7 +1352,7 @@ def test_sgd_epsilon_insensitive(klass): # simple linear function without noise y = 0.5 * X.ravel() - clf = klass( + clf = SGDEstimator( loss="epsilon_insensitive", epsilon=0.01, alpha=0.1, @@ -1379,7 +1366,7 @@ def test_sgd_epsilon_insensitive(klass): # simple linear function with noise y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() - clf = klass( + clf = SGDEstimator( loss="epsilon_insensitive", epsilon=0.01, alpha=0.1, @@ -1391,8 +1378,8 @@ def test_sgd_epsilon_insensitive(klass): assert score > 0.5 -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) -def test_sgd_huber_fit(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_huber_fit(SGDEstimator): xmin, xmax = -5, 5 n_samples = 100 rng = np.random.RandomState(0) @@ -1401,7 +1388,9 @@ def test_sgd_huber_fit(klass): # simple linear function without noise y = 0.5 * X.ravel() - clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False) + clf = SGDEstimator( + loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False + ) clf.fit(X, y) score = clf.score(X, y) assert score > 0.99 @@ -1409,14 +1398,16 @@ def test_sgd_huber_fit(klass): # simple linear function with noise y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() - clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False) + clf = SGDEstimator( + loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False + ) clf.fit(X, y) score = clf.score(X, y) assert score > 0.5 -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) -def test_elasticnet_convergence(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) +def test_elasticnet_convergence(SGDEstimator): # Check that the SGD output is consistent with coordinate descent n_samples, n_features = 1000, 5 @@ -1434,7 +1425,7 @@ def test_elasticnet_convergence(klass): alpha=alpha, l1_ratio=l1_ratio, fit_intercept=False ) cd.fit(X, y) - sgd = klass( + sgd = SGDEstimator( penalty="elasticnet", max_iter=50, alpha=alpha, @@ -1450,10 +1441,10 @@ def test_elasticnet_convergence(klass): @ignore_warnings -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) -def test_partial_fit(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) +def test_partial_fit(SGDEstimator): third = X.shape[0] // 3 - clf = klass(alpha=0.01) + clf = SGDEstimator(alpha=0.01) clf.partial_fit(X[:third], Y[:third]) assert clf.coef_.shape == (X.shape[1],) @@ -1467,15 +1458,17 @@ def test_partial_fit(klass): assert id1, id2 -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) 
+@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) @pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) -def test_partial_fit_equal_fit(klass, lr): - clf = klass(alpha=0.01, max_iter=2, eta0=0.01, learning_rate=lr, shuffle=False) +def test_partial_fit_equal_fit(SGDEstimator, lr): + clf = SGDEstimator( + alpha=0.01, max_iter=2, eta0=0.01, learning_rate=lr, shuffle=False + ) clf.fit(X, Y) y_pred = clf.predict(T) t = clf.t_ - clf = klass(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False) + clf = SGDEstimator(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False) for i in range(2): clf.partial_fit(X, Y) y_pred2 = clf.predict(T) @@ -1484,9 +1477,9 @@ def test_partial_fit_equal_fit(klass, lr): assert_array_almost_equal(y_pred, y_pred2, decimal=2) -@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) -def test_loss_function_epsilon(klass): - clf = klass(epsilon=0.9) +@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) +def test_loss_function_epsilon(SGDEstimator): + clf = SGDEstimator(epsilon=0.9) clf.set_params(epsilon=0.1) assert clf.loss_functions["huber"][1] == 0.1 @@ -1495,7 +1488,7 @@ def test_loss_function_epsilon(klass): # SGD One Class SVM Test Case # a simple implementation of ASGD to use for testing SGDOneClassSVM -def asgd_oneclass(klass, X, eta, nu, coef_init=None, offset_init=0.0): +def asgd_oneclass(SGDEstimator, X, eta, nu, coef_init=None, offset_init=0.0): if coef_init is None: coef = np.zeros(X.shape[1]) else: @@ -1508,7 +1501,7 @@ def asgd_oneclass(klass, X, eta, nu, coef_init=None, offset_init=0.0): decay = 1.0 # sparse data has a fixed decay of .01 - if klass == SparseSGDOneClassSVM: + if SGDEstimator == SparseSGDOneClassSVM: decay = 0.01 for i, entry in enumerate(X): @@ -1533,30 +1526,19 @@ def asgd_oneclass(klass, X, eta, nu, coef_init=None, offset_init=0.0): return average_coef, 1 - average_intercept -@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) -@pytest.mark.parametrize("nu", [-0.5, 2]) -def test_bad_nu_values(klass, nu): - msg = r"nu must be in \(0, 1]" - with pytest.raises(ValueError, match=msg): - klass(nu=nu) - - clf = klass(nu=0.05) - clf2 = clone(clf) - with pytest.raises(ValueError, match=msg): - clf2.set_params(nu=nu) - - -@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) -def _test_warm_start_oneclass(klass, X, lr): +@pytest.mark.parametrize("SGDEstimator", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def _test_warm_start_oneclass(SGDEstimator, X, lr): # Test that explicit warm restart... - clf = klass(nu=0.5, eta0=0.01, shuffle=False, learning_rate=lr) + clf = SGDEstimator(nu=0.5, eta0=0.01, shuffle=False, learning_rate=lr) clf.fit(X) - clf2 = klass(nu=0.1, eta0=0.01, shuffle=False, learning_rate=lr) + clf2 = SGDEstimator(nu=0.1, eta0=0.01, shuffle=False, learning_rate=lr) clf2.fit(X, coef_init=clf.coef_.copy(), offset_init=clf.offset_.copy()) # ... and implicit warm restart are equivalent. 
- clf3 = klass(nu=0.5, eta0=0.01, shuffle=False, warm_start=True, learning_rate=lr) + clf3 = SGDEstimator( + nu=0.5, eta0=0.01, shuffle=False, warm_start=True, learning_rate=lr + ) clf3.fit(X) assert clf3.t_ == clf.t_ @@ -1569,30 +1551,30 @@ def _test_warm_start_oneclass(klass, X, lr): assert_allclose(clf3.coef_, clf2.coef_) -@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +@pytest.mark.parametrize("SGDEstimator", [SGDOneClassSVM, SparseSGDOneClassSVM]) @pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) -def test_warm_start_oneclass(klass, lr): - _test_warm_start_oneclass(klass, X, lr) +def test_warm_start_oneclass(SGDEstimator, lr): + _test_warm_start_oneclass(SGDEstimator, X, lr) -@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) -def test_clone_oneclass(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_clone_oneclass(SGDEstimator): # Test whether clone works ok. - clf = klass(nu=0.5) + clf = SGDEstimator(nu=0.5) clf = clone(clf) clf.set_params(nu=0.1) clf.fit(X) - clf2 = klass(nu=0.1) + clf2 = SGDEstimator(nu=0.1) clf2.fit(X) assert_array_equal(clf.coef_, clf2.coef_) -@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) -def test_partial_fit_oneclass(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_partial_fit_oneclass(SGDEstimator): third = X.shape[0] // 3 - clf = klass(nu=0.1) + clf = SGDEstimator(nu=0.1) clf.partial_fit(X[:third]) assert clf.coef_.shape == (X.shape[1],) @@ -1609,17 +1591,17 @@ def test_partial_fit_oneclass(klass): clf.partial_fit(X[:, 1]) -@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +@pytest.mark.parametrize("SGDEstimator", [SGDOneClassSVM, SparseSGDOneClassSVM]) @pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) -def test_partial_fit_equal_fit_oneclass(klass, lr): - clf = klass(nu=0.05, max_iter=2, eta0=0.01, learning_rate=lr, shuffle=False) +def test_partial_fit_equal_fit_oneclass(SGDEstimator, lr): + clf = SGDEstimator(nu=0.05, max_iter=2, eta0=0.01, learning_rate=lr, shuffle=False) clf.fit(X) y_scores = clf.decision_function(T) t = clf.t_ coef = clf.coef_ offset = clf.offset_ - clf = klass(nu=0.05, eta0=0.01, max_iter=1, learning_rate=lr, shuffle=False) + clf = SGDEstimator(nu=0.05, eta0=0.01, max_iter=1, learning_rate=lr, shuffle=False) for _ in range(2): clf.partial_fit(X) y_scores2 = clf.decision_function(T) @@ -1630,18 +1612,18 @@ def test_partial_fit_equal_fit_oneclass(klass, lr): assert_allclose(clf.offset_, offset) -@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) -def test_late_onset_averaging_reached_oneclass(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_late_onset_averaging_reached_oneclass(SGDEstimator): # Test average eta0 = 0.001 nu = 0.05 # 2 passes over the training set but average only at second pass - clf1 = klass( + clf1 = SGDEstimator( average=7, learning_rate="constant", eta0=eta0, nu=nu, max_iter=2, shuffle=False ) # 1 pass over the training set with no averaging - clf2 = klass( + clf2 = SGDEstimator( average=0, learning_rate="constant", eta0=eta0, nu=nu, max_iter=1, shuffle=False ) @@ -1651,15 +1633,20 @@ def test_late_onset_averaging_reached_oneclass(klass): # Start from clf2 solution, compute averaging using asgd function and # compare with clf1 solution average_coef, average_offset = asgd_oneclass( - 
klass, X, eta0, nu, coef_init=clf2.coef_.ravel(), offset_init=clf2.offset_ + SGDEstimator, + X, + eta0, + nu, + coef_init=clf2.coef_.ravel(), + offset_init=clf2.offset_, ) assert_allclose(clf1.coef_.ravel(), average_coef.ravel()) assert_allclose(clf1.offset_, average_offset) -@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) -def test_sgd_averaged_computed_correctly_oneclass(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_sgd_averaged_computed_correctly_oneclass(SGDEstimator): # Tests the average SGD One-Class SVM matches the naive implementation eta = 0.001 nu = 0.05 @@ -1668,7 +1655,7 @@ def test_sgd_averaged_computed_correctly_oneclass(klass): rng = np.random.RandomState(0) X = rng.normal(size=(n_samples, n_features)) - clf = klass( + clf = SGDEstimator( learning_rate="constant", eta0=eta, nu=nu, @@ -1679,14 +1666,14 @@ def test_sgd_averaged_computed_correctly_oneclass(klass): ) clf.fit(X) - average_coef, average_offset = asgd_oneclass(klass, X, eta, nu) + average_coef, average_offset = asgd_oneclass(SGDEstimator, X, eta, nu) assert_allclose(clf.coef_, average_coef) assert_allclose(clf.offset_, average_offset) -@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) -def test_sgd_averaged_partial_fit_oneclass(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_sgd_averaged_partial_fit_oneclass(SGDEstimator): # Tests whether the partial fit yields the same average as the fit eta = 0.001 nu = 0.05 @@ -1695,7 +1682,7 @@ def test_sgd_averaged_partial_fit_oneclass(klass): rng = np.random.RandomState(0) X = rng.normal(size=(n_samples, n_features)) - clf = klass( + clf = SGDEstimator( learning_rate="constant", eta0=eta, nu=nu, @@ -1707,18 +1694,18 @@ def test_sgd_averaged_partial_fit_oneclass(klass): clf.partial_fit(X[: int(n_samples / 2)][:]) clf.partial_fit(X[int(n_samples / 2) :][:]) - average_coef, average_offset = asgd_oneclass(klass, X, eta, nu) + average_coef, average_offset = asgd_oneclass(SGDEstimator, X, eta, nu) assert_allclose(clf.coef_, average_coef) assert_allclose(clf.offset_, average_offset) -@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) -def test_average_sparse_oneclass(klass): +@pytest.mark.parametrize("SGDEstimator", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_average_sparse_oneclass(SGDEstimator): # Checks the average coef on data with 0s eta = 0.001 nu = 0.01 - clf = klass( + clf = SGDEstimator( learning_rate="constant", eta0=eta, nu=nu, @@ -1732,7 +1719,7 @@ def test_average_sparse_oneclass(klass): clf.partial_fit(X3[: int(n_samples / 2)]) clf.partial_fit(X3[int(n_samples / 2) :]) - average_coef, average_offset = asgd_oneclass(klass, X3, eta, nu) + average_coef, average_offset = asgd_oneclass(SGDEstimator, X3, eta, nu) assert_allclose(clf.coef_, average_coef) assert_allclose(clf.offset_, average_offset) From fd7664b919e2426c5e75c85c21e5c2580ff77cca Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Aug 2021 15:30:35 +0200 Subject: [PATCH 04/14] TST add passive aggressive params validation tests --- doc/whats_new/v1.0.rst | 12 +++++ .../tests/test_passive_aggressive.py | 49 ++++++++++++++++--- 2 files changed, 54 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index db73dbbbc3844..67317c506805c 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -501,6 +501,18 @@ Changelog coordinate descent solver. 
Otherwise, an error will be raised. :pr:`19391` by :user:`Shao Yang Hong `. +- |API| Force the validation to be made at `fit` for the following estimators + to follow our scikit-learn convention: + :class:`linear_model.SGDClassifier`, + :class:`linear_model.SparseSGDClassifier`, + :class:`linear_model.SGDRegressor`, + :class:`linear_model.SparseSGDRegressor`, + :class:`linear_model.SGDOneClassSVM`, + :class:`linear_model.SparseSGDOneClassSVM`, + :class:`linear_model.PassiveAggressiveClassifier`, + :class:`linear_model.PassiveAggressiveRegressor`. + :pr:`20683` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.manifold` ....................... diff --git a/sklearn/linear_model/tests/test_passive_aggressive.py b/sklearn/linear_model/tests/test_passive_aggressive.py index a287d61406cdd..6eb2297990e36 100644 --- a/sklearn/linear_model/tests/test_passive_aggressive.py +++ b/sklearn/linear_model/tests/test_passive_aggressive.py @@ -3,6 +3,7 @@ import pytest +from sklearn.base import is_classifier from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_almost_equal @@ -136,11 +137,13 @@ def test_classifier_correctness(loss): assert_array_almost_equal(clf1.w, clf2.coef_.ravel(), decimal=2) -def test_classifier_undefined_methods(): +@pytest.mark.parametrize( + "response_method", ["predict_proba", "predict_log_proba", "transform"] +) +def test_classifier_undefined_methods(response_method): clf = PassiveAggressiveClassifier(max_iter=100) - for meth in ("predict_proba", "predict_log_proba", "transform"): - with pytest.raises(AttributeError): - getattr(clf, meth) + with pytest.raises(AttributeError): + getattr(clf, response_method) def test_class_weights(): @@ -279,6 +282,38 @@ def test_regressor_correctness(loss): def test_regressor_undefined_methods(): reg = PassiveAggressiveRegressor(max_iter=100) - for meth in ("transform",): - with pytest.raises(AttributeError): - getattr(reg, meth) + with pytest.raises(AttributeError): + reg.transform(X) + + +@pytest.mark.parametrize( + "PassiveAggressiveEstimator", + [PassiveAggressiveClassifier, PassiveAggressiveRegressor], +) +@pytest.mark.parametrize("fit_method", ["fit", "partial_fit"]) +@pytest.mark.parametrize( + "params, err_msg", + [ + ({"loss": "foobar"}, "The loss foobar is not supported"), + ({"max_iter": -1}, "max_iter must be > zero"), + ({"shuffle": "false"}, "shuffle must be either True or False"), + ({"early_stopping": "false"}, "early_stopping must be either True or False"), + ( + {"validation_fraction": -0.1}, + r"validation_fraction must be in range \(0, 1\)", + ), + ({"n_iter_no_change": 0}, "n_iter_no_change must be >= 1"), + ], +) +def test_passive_aggressive_estimator_params_validation( + PassiveAggressiveEstimator, fit_method, params, err_msg +): + """Validate parameters in the different PassiveAggressive estimators.""" + sgd_estimator = PassiveAggressiveEstimator(**params) + + with pytest.raises(ValueError, match=err_msg): + if is_classifier(sgd_estimator) and fit_method == "partial_fit": + fit_params = {"classes": np.unique(y)} + else: + fit_params = {} + getattr(sgd_estimator, fit_method)(X, y, **fit_params) From c34cf447dd5af64c9f91c7e99d3263e8c3b48ddc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Aug 2021 18:53:01 +0200 Subject: [PATCH 05/14] remove set_params to use inheritance directly --- sklearn/linear_model/_stochastic_gradient.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git 
a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 7981ee3f43f81..2310a756c8b08 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -121,22 +121,6 @@ def __init__( self.max_iter = max_iter self.tol = tol - def set_params(self, **kwargs): - """Set and validate the parameters of estimator. - - Parameters - ---------- - **kwargs : dict - Estimator parameters. - - Returns - ------- - self : object - Estimator instance. - """ - super().set_params(**kwargs) - return self - @abstractmethod def fit(self, X, y): """Fit model.""" From 6a30ab753ae2577debdfbbe717dd7433deefe827 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Aug 2021 18:53:48 +0200 Subject: [PATCH 06/14] iter --- doc/whats_new/v1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 67317c506805c..bf8710c46594f 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -511,7 +511,7 @@ Changelog :class:`linear_model.SparseSGDOneClassSVM`, :class:`linear_model.PassiveAggressiveClassifier`, :class:`linear_model.PassiveAggressiveRegressor`. - :pr:`20683` by :user:`Guillaume Lemaitre `. + :pr:`20683` by `Guillaume Lemaitre`_. :mod:`sklearn.manifold` ....................... From 11eab65310338b966e34e3fc8aca8271290169aa Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Aug 2021 11:20:54 +0200 Subject: [PATCH 07/14] iter --- sklearn/linear_model/tests/test_sgd.py | 530 ++++++++++++------------- 1 file changed, 256 insertions(+), 274 deletions(-) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 4347be499a60f..5ec7084ffafb4 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -181,7 +181,7 @@ def SparseSGDOneClassSVM(**kwargs): # a simple implementation of ASGD to use for testing # uses squared loss to find the gradient -def asgd(SGDEstimator, X, y, eta, alpha, weight_init=None, intercept_init=0.0): +def asgd(klass, X, y, eta, alpha, weight_init=None, intercept_init=0.0): if weight_init is None: weights = np.zeros(X.shape[1]) else: @@ -193,7 +193,7 @@ def asgd(SGDEstimator, X, y, eta, alpha, weight_init=None, intercept_init=0.0): decay = 1.0 # sparse data has a fixed decay of .01 - if SGDEstimator in (SparseSGDClassifier, SparseSGDRegressor): + if klass in (SparseSGDClassifier, SparseSGDRegressor): decay = 0.01 for i, entry in enumerate(X): @@ -216,7 +216,7 @@ def asgd(SGDEstimator, X, y, eta, alpha, weight_init=None, intercept_init=0.0): @pytest.mark.parametrize( - "SGDEstimator", + "klass", [ SGDClassifier, SparseSGDClassifier, @@ -249,10 +249,10 @@ def asgd(SGDEstimator, X, y, eta, alpha, weight_init=None, intercept_init=0.0): ({"n_iter_no_change": 0}, "n_iter_no_change must be >= 1"), ], ) -def test_sgd_estimator_params_validation(SGDEstimator, fit_method, params, err_msg): +def test_sgd_estimator_params_validation(klass, fit_method, params, err_msg): """Validate parameters in the different SGD estimators.""" try: - sgd_estimator = SGDEstimator(**params) + sgd_estimator = klass(**params) except TypeError as err: if "__init__() got an unexpected keyword argument" in str(err): # skip test if the parameter is not supported by the estimator @@ -267,16 +267,16 @@ def test_sgd_estimator_params_validation(SGDEstimator, fit_method, params, err_m getattr(sgd_estimator, fit_method)(X, Y, **fit_params) -def _test_warm_start(SGDEstimator, X, 
Y, lr): +def _test_warm_start(klass, X, Y, lr): # Test that explicit warm restart... - clf = SGDEstimator(alpha=0.01, eta0=0.01, shuffle=False, learning_rate=lr) + clf = klass(alpha=0.01, eta0=0.01, shuffle=False, learning_rate=lr) clf.fit(X, Y) - clf2 = SGDEstimator(alpha=0.001, eta0=0.01, shuffle=False, learning_rate=lr) + clf2 = klass(alpha=0.001, eta0=0.01, shuffle=False, learning_rate=lr) clf2.fit(X, Y, coef_init=clf.coef_.copy(), intercept_init=clf.intercept_.copy()) # ... and implicit warm restart are equivalent. - clf3 = SGDEstimator( + clf3 = klass( alpha=0.01, eta0=0.01, shuffle=False, warm_start=True, learning_rate=lr ) clf3.fit(X, Y) @@ -292,21 +292,21 @@ def _test_warm_start(SGDEstimator, X, Y, lr): @pytest.mark.parametrize( - "SGDEstimator", + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) @pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) -def test_warm_start(SGDEstimator, lr): - _test_warm_start(SGDEstimator, X, Y, lr) +def test_warm_start(klass, lr): + _test_warm_start(klass, X, Y, lr) @pytest.mark.parametrize( - "SGDEstimator", + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_input_format(SGDEstimator): +def test_input_format(klass): # Input format tests. - clf = SGDEstimator(alpha=0.01, shuffle=False) + clf = klass(alpha=0.01, shuffle=False) clf.fit(X, Y) Y_ = np.array(Y)[:, np.newaxis] @@ -316,24 +316,24 @@ def test_input_format(SGDEstimator): @pytest.mark.parametrize( - "SGDEstimator", + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_clone(SGDEstimator): +def test_clone(klass): # Test whether clone works ok. - clf = SGDEstimator(alpha=0.01, penalty="l1") + clf = klass(alpha=0.01, penalty="l1") clf = clone(clf) clf.set_params(penalty="l2") clf.fit(X, Y) - clf2 = SGDEstimator(alpha=0.01, penalty="l2") + clf2 = klass(alpha=0.01, penalty="l2") clf2.fit(X, Y) assert_array_equal(clf.coef_, clf2.coef_) @pytest.mark.parametrize( - "SGDEstimator", + "klass", [ SGDClassifier, SparseSGDClassifier, @@ -343,8 +343,8 @@ def test_clone(SGDEstimator): SparseSGDOneClassSVM, ], ) -def test_plain_has_no_average_attr(SGDEstimator): - clf = SGDEstimator(average=True, eta0=0.01) +def test_plain_has_no_average_attr(klass): + clf = klass(average=True, eta0=0.01) clf.fit(X, Y) assert hasattr(clf, "_average_coef") @@ -352,7 +352,7 @@ def test_plain_has_no_average_attr(SGDEstimator): assert hasattr(clf, "_standard_intercept") assert hasattr(clf, "_standard_coef") - clf = SGDEstimator() + clf = klass() clf.fit(X, Y) assert not hasattr(clf, "_average_coef") @@ -362,7 +362,7 @@ def test_plain_has_no_average_attr(SGDEstimator): @pytest.mark.parametrize( - "SGDEstimator", + "klass", [ SGDClassifier, SparseSGDClassifier, @@ -372,9 +372,9 @@ def test_plain_has_no_average_attr(SGDEstimator): SparseSGDOneClassSVM, ], ) -def test_late_onset_averaging_not_reached(SGDEstimator): - clf1 = SGDEstimator(average=600) - clf2 = SGDEstimator() +def test_late_onset_averaging_not_reached(klass): + clf1 = klass(average=600) + clf2 = klass() for _ in range(100): if is_classifier(clf1): clf1.partial_fit(X, Y, classes=np.unique(Y)) @@ -384,29 +384,29 @@ def test_late_onset_averaging_not_reached(SGDEstimator): clf2.partial_fit(X, Y) assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=16) - if SGDEstimator in [ + if klass in [ SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor, ]: assert_almost_equal(clf1.intercept_, clf2.intercept_, 
decimal=16) - elif SGDEstimator in [SGDOneClassSVM, SparseSGDOneClassSVM]: + elif klass in [SGDOneClassSVM, SparseSGDOneClassSVM]: assert_allclose(clf1.offset_, clf2.offset_) @pytest.mark.parametrize( - "SGDEstimator", + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_late_onset_averaging_reached(SGDEstimator): +def test_late_onset_averaging_reached(klass): eta0 = 0.001 alpha = 0.0001 Y_encode = np.array(Y) Y_encode[Y_encode == 1] = -1.0 Y_encode[Y_encode == 2] = 1.0 - clf1 = SGDEstimator( + clf1 = klass( average=7, learning_rate="constant", loss="squared_error", @@ -415,7 +415,7 @@ def test_late_onset_averaging_reached(SGDEstimator): max_iter=2, shuffle=False, ) - clf2 = SGDEstimator( + clf2 = klass( average=0, learning_rate="constant", loss="squared_error", @@ -429,7 +429,7 @@ def test_late_onset_averaging_reached(SGDEstimator): clf2.fit(X, Y_encode) average_weights, average_intercept = asgd( - SGDEstimator, + klass, X, Y_encode, eta0, @@ -443,43 +443,43 @@ def test_late_onset_averaging_reached(SGDEstimator): @pytest.mark.parametrize( - "SGDEstimator", + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_early_stopping(SGDEstimator): +def test_early_stopping(klass): X = iris.data[iris.target > 0] Y = iris.target[iris.target > 0] for early_stopping in [True, False]: max_iter = 1000 - clf = SGDEstimator( - early_stopping=early_stopping, tol=1e-3, max_iter=max_iter - ).fit(X, Y) + clf = klass(early_stopping=early_stopping, tol=1e-3, max_iter=max_iter).fit( + X, Y + ) assert clf.n_iter_ < max_iter @pytest.mark.parametrize( - "SGDEstimator", + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_adaptive_longer_than_constant(SGDEstimator): - clf1 = SGDEstimator(learning_rate="adaptive", eta0=0.01, tol=1e-3, max_iter=100) +def test_adaptive_longer_than_constant(klass): + clf1 = klass(learning_rate="adaptive", eta0=0.01, tol=1e-3, max_iter=100) clf1.fit(iris.data, iris.target) - clf2 = SGDEstimator(learning_rate="constant", eta0=0.01, tol=1e-3, max_iter=100) + clf2 = klass(learning_rate="constant", eta0=0.01, tol=1e-3, max_iter=100) clf2.fit(iris.data, iris.target) assert clf1.n_iter_ > clf2.n_iter_ @pytest.mark.parametrize( - "SGDEstimator", + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_validation_set_not_used_for_training(SGDEstimator): +def test_validation_set_not_used_for_training(klass): X, Y = iris.data, iris.target validation_fraction = 0.4 seed = 42 shuffle = False max_iter = 10 - clf1 = SGDEstimator( + clf1 = klass( early_stopping=True, random_state=np.random.RandomState(seed), validation_fraction=validation_fraction, @@ -492,7 +492,7 @@ def test_validation_set_not_used_for_training(SGDEstimator): clf1.fit(X, Y) assert clf1.n_iter_ == max_iter - clf2 = SGDEstimator( + clf2 = klass( early_stopping=False, random_state=np.random.RandomState(seed), learning_rate="constant", @@ -515,15 +515,15 @@ def test_validation_set_not_used_for_training(SGDEstimator): @pytest.mark.parametrize( - "SGDEstimator", + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_n_iter_no_change(SGDEstimator): +def test_n_iter_no_change(klass): X, Y = iris.data, iris.target # test that n_iter_ increases monotonically with n_iter_no_change for early_stopping in [True, False]: n_iter_list = [ - SGDEstimator( + klass( early_stopping=early_stopping, n_iter_no_change=n_iter_no_change, tol=1e-4, @@ -537,12 
+537,12 @@ def test_n_iter_no_change(SGDEstimator): @pytest.mark.parametrize( - "SGDEstimator", + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_not_enough_sample_for_early_stopping(SGDEstimator): +def test_not_enough_sample_for_early_stopping(klass): # test an error is raised if the training or validation set is empty - clf = SGDEstimator(early_stopping=True, validation_fraction=0.99) + clf = klass(early_stopping=True, validation_fraction=0.99) with pytest.raises(ValueError): clf.fit(X3, Y3) @@ -551,12 +551,12 @@ def test_not_enough_sample_for_early_stopping(SGDEstimator): # Classification Test Case -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_clf(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_clf(klass): # Check that SGD gives any results :-) for loss in ("hinge", "squared_hinge", "log", "modified_huber"): - clf = SGDEstimator( + clf = klass( penalty="l2", alpha=0.01, fit_intercept=True, @@ -570,17 +570,17 @@ def test_sgd_clf(SGDEstimator): @pytest.mark.parametrize( - "SGDEstimator", + "klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM], ) -def test_provide_coef(SGDEstimator): +def test_provide_coef(klass): """Check that the shape of `coef_init` is validated.""" with pytest.raises(ValueError, match="Provided coef_init does not match dataset"): - SGDEstimator().fit(X, Y, coef_init=np.zeros((3,))) + klass().fit(X, Y, coef_init=np.zeros((3,))) @pytest.mark.parametrize( - "SGDEstimator, fit_params", + "klass, fit_params", [ (SGDClassifier, {"intercept_init": np.zeros((3,))}), (SparseSGDClassifier, {"intercept_init": np.zeros((3,))}), @@ -588,28 +588,28 @@ def test_provide_coef(SGDEstimator): (SparseSGDOneClassSVM, {"offset_init": np.zeros((3,))}), ], ) -def test_set_intercept_offset(SGDEstimator, fit_params): +def test_set_intercept_offset(klass, fit_params): """Check that `intercept_init` or `offset_init` is validated.""" - sgd_estimator = SGDEstimator() + sgd_estimator = klass() with pytest.raises(ValueError, match="does not match dataset"): sgd_estimator.fit(X, Y, **fit_params) @pytest.mark.parametrize( - "SGDEstimator", + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], ) -def test_sgd_early_stopping_with_partial_fit(SGDEstimator): +def test_sgd_early_stopping_with_partial_fit(klass): """Check that we raise an error for `early_stopping` used with `partial_fit`. 
""" err_msg = "early_stopping should be False with partial_fit" with pytest.raises(ValueError, match=err_msg): - SGDEstimator(early_stopping=True).partial_fit(X, Y) + klass(early_stopping=True).partial_fit(X, Y) @pytest.mark.parametrize( - "SGDEstimator, fit_params", + "klass, fit_params", [ (SGDClassifier, {"intercept_init": 0}), (SparseSGDClassifier, {"intercept_init": 0}), @@ -617,14 +617,14 @@ def test_sgd_early_stopping_with_partial_fit(SGDEstimator): (SparseSGDOneClassSVM, {"offset_init": 0}), ], ) -def test_set_intercept_offset_binary(SGDEstimator, fit_params): +def test_set_intercept_offset_binary(klass, fit_params): """Check that we can pass a scaler with binary classification to `intercept_init` or `offset_init`.""" - SGDEstimator().fit(X5, Y5, **fit_params) + klass().fit(X5, Y5, **fit_params) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_average_binary_computed_correctly(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_average_binary_computed_correctly(klass): # Checks the SGDClassifier correctly computes the average weights eta = 0.1 alpha = 2.0 @@ -634,7 +634,7 @@ def test_average_binary_computed_correctly(SGDEstimator): X = rng.normal(size=(n_samples, n_features)) w = rng.normal(size=n_features) - clf = SGDEstimator( + clf = klass( loss="squared_error", learning_rate="constant", eta0=eta, @@ -651,32 +651,32 @@ def test_average_binary_computed_correctly(SGDEstimator): clf.fit(X, y) - average_weights, average_intercept = asgd(SGDEstimator, X, y, eta, alpha) + average_weights, average_intercept = asgd(klass, X, y, eta, alpha) average_weights = average_weights.reshape(1, -1) assert_array_almost_equal(clf.coef_, average_weights, decimal=14) assert_almost_equal(clf.intercept_, average_intercept, decimal=14) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_set_intercept_to_intercept(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_set_intercept_to_intercept(klass): # Checks intercept_ shape consistency for the warm starts # Inconsistent intercept_ shape. - clf = SGDEstimator().fit(X5, Y5) - SGDEstimator().fit(X5, Y5, intercept_init=clf.intercept_) - clf = SGDEstimator().fit(X, Y) - SGDEstimator().fit(X, Y, intercept_init=clf.intercept_) + clf = klass().fit(X5, Y5) + klass().fit(X5, Y5, intercept_init=clf.intercept_) + clf = klass().fit(X, Y) + klass().fit(X, Y, intercept_init=clf.intercept_) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_at_least_two_labels(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_at_least_two_labels(klass): # Target must have at least two labels - clf = SGDEstimator(alpha=0.01, max_iter=20) + clf = klass(alpha=0.01, max_iter=20) with pytest.raises(ValueError): clf.fit(X2, np.ones(9)) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_partial_fit_weight_class_balanced(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_weight_class_balanced(klass): # partial_fit with class_weight='balanced' not supported""" regex = ( r"class_weight 'balanced' is not supported for " @@ -689,13 +689,13 @@ def test_partial_fit_weight_class_balanced(SGDEstimator): r"parameter\." 
) with pytest.raises(ValueError, match=regex): - SGDEstimator(class_weight="balanced").partial_fit(X, Y, classes=np.unique(Y)) + klass(class_weight="balanced").partial_fit(X, Y, classes=np.unique(Y)) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_multiclass(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass(klass): # Multi-class test case - clf = SGDEstimator(alpha=0.01, max_iter=20).fit(X2, Y2) + clf = klass(alpha=0.01, max_iter=20).fit(X2, Y2) assert clf.coef_.shape == (3, 2) assert clf.intercept_.shape == (3,) assert clf.decision_function([[0, 0]]).shape == (1, 3) @@ -703,12 +703,12 @@ def test_sgd_multiclass(SGDEstimator): assert_array_equal(pred, true_result2) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_multiclass_average(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass_average(klass): eta = 0.001 alpha = 0.01 # Multi-class average test case - clf = SGDEstimator( + clf = klass( loss="squared_error", learning_rate="constant", eta0=eta, @@ -726,15 +726,15 @@ def test_sgd_multiclass_average(SGDEstimator): for i, cl in enumerate(classes): y_i = np.ones(np_Y2.shape[0]) y_i[np_Y2 != cl] = -1 - average_coef, average_intercept = asgd(SGDEstimator, X2, y_i, eta, alpha) + average_coef, average_intercept = asgd(klass, X2, y_i, eta, alpha) assert_array_almost_equal(average_coef, clf.coef_[i], decimal=16) assert_almost_equal(average_intercept, clf.intercept_[i], decimal=16) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_multiclass_with_init_coef(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass_with_init_coef(klass): # Multi-class test case - clf = SGDEstimator(alpha=0.01, max_iter=20) + clf = klass(alpha=0.01, max_iter=20) clf.fit(X2, Y2, coef_init=np.zeros((3, 2)), intercept_init=np.zeros(3)) assert clf.coef_.shape == (3, 2) assert clf.intercept_.shape, (3,) @@ -742,10 +742,10 @@ def test_sgd_multiclass_with_init_coef(SGDEstimator): assert_array_equal(pred, true_result2) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_multiclass_njobs(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass_njobs(klass): # Multi-class test case with multi-core support - clf = SGDEstimator(alpha=0.01, max_iter=20, n_jobs=2).fit(X2, Y2) + clf = klass(alpha=0.01, max_iter=20, n_jobs=2).fit(X2, Y2) assert clf.coef_.shape == (3, 2) assert clf.intercept_.shape == (3,) assert clf.decision_function([[0, 0]]).shape == (1, 3) @@ -753,31 +753,31 @@ def test_sgd_multiclass_njobs(SGDEstimator): assert_array_equal(pred, true_result2) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_set_coef_multiclass(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_set_coef_multiclass(klass): # Checks coef_init and intercept_init shape for multi-class # problems # Provided coef_ does not match dataset - clf = SGDEstimator() + clf = klass() with pytest.raises(ValueError): clf.fit(X2, Y2, coef_init=np.zeros((2, 2))) # Provided coef_ does match dataset - clf = SGDEstimator().fit(X2, Y2, coef_init=np.zeros((3, 2))) + clf = klass().fit(X2, Y2, coef_init=np.zeros((3, 2))) # Provided intercept_ does not match dataset - clf = 
SGDEstimator() + clf = klass() with pytest.raises(ValueError): clf.fit(X2, Y2, intercept_init=np.zeros((1,))) # Provided intercept_ does match dataset. - clf = SGDEstimator().fit(X2, Y2, intercept_init=np.zeros((3,))) + clf = klass().fit(X2, Y2, intercept_init=np.zeros((3,))) # TODO: Remove filterwarnings in v1.2. @pytest.mark.filterwarnings("ignore:.*squared_loss.*:FutureWarning") -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_predict_proba_method_access(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_predict_proba_method_access(klass): # Checks that SGDClassifier predict_proba and predict_log_proba methods # can either be accessed or raise an appropriate error message # otherwise. See @@ -800,8 +800,8 @@ def test_sgd_predict_proba_method_access(SGDEstimator): clf.predict_log_proba -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_proba(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_proba(klass): # Check SGD.predict_proba # Hinge loss does not allow for conditional prob estimate. @@ -814,7 +814,7 @@ def test_sgd_proba(SGDEstimator): # log and modified_huber losses can output probability estimates # binary case for loss in ["log", "modified_huber"]: - clf = SGDEstimator(loss=loss, alpha=0.01, max_iter=10) + clf = klass(loss=loss, alpha=0.01, max_iter=10) clf.fit(X, Y) p = clf.predict_proba([[3, 2]]) assert p[0, 1] > 0.5 @@ -827,7 +827,7 @@ def test_sgd_proba(SGDEstimator): assert p[0, 1] < p[0, 0] # log loss multiclass probability estimates - clf = SGDEstimator(loss="log", alpha=0.01, max_iter=10).fit(X2, Y2) + clf = klass(loss="log", alpha=0.01, max_iter=10).fit(X2, Y2) d = clf.decision_function([[0.1, -0.1], [0.3, 0.2]]) p = clf.predict_proba([[0.1, -0.1], [0.3, 0.2]]) @@ -850,11 +850,11 @@ def test_sgd_proba(SGDEstimator): # Modified Huber multiclass probability estimates; requires a separate # test because the hard zero/one probabilities may destroy the # ordering present in decision_function output. - clf = SGDEstimator(loss="modified_huber", alpha=0.01, max_iter=10) + clf = klass(loss="modified_huber", alpha=0.01, max_iter=10) clf.fit(X2, Y2) d = clf.decision_function([[3, 2]]) p = clf.predict_proba([[3, 2]]) - if SGDEstimator != SparseSGDClassifier: + if klass != SparseSGDClassifier: assert np.argmax(d, axis=1) == np.argmax(p, axis=1) else: # XXX the sparse test gets a different X2 (?) assert np.argmin(d, axis=1) == np.argmin(p, axis=1) @@ -869,8 +869,8 @@ def test_sgd_proba(SGDEstimator): assert_array_almost_equal(p[0], [1 / 3.0] * 3) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_sgd_l1(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_l1(klass): # Test L1 regularization n = len(X4) rng = np.random.RandomState(13) @@ -880,7 +880,7 @@ def test_sgd_l1(SGDEstimator): X = X4[idx, :] Y = Y4[idx] - clf = SGDEstimator( + clf = klass( penalty="l1", alpha=0.2, fit_intercept=False, @@ -906,20 +906,18 @@ def test_sgd_l1(SGDEstimator): assert_array_equal(pred, Y) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_class_weights(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_class_weights(klass): # Test class weights. 
X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) y = [1, 1, 1, -1, -1] - clf = SGDEstimator(alpha=0.1, max_iter=1000, fit_intercept=False, class_weight=None) + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, class_weight=None) clf.fit(X, y) assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) # we give a small weights to class 1 - clf = SGDEstimator( - alpha=0.1, max_iter=1000, fit_intercept=False, class_weight={1: 0.001} - ) + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, class_weight={1: 0.001}) clf.fit(X, y) # now the hyperplane should rotate clock-wise and @@ -927,41 +925,41 @@ def test_class_weights(SGDEstimator): assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_equal_class_weight(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_equal_class_weight(klass): # Test if equal class weights approx. equals no class weights. X = [[1, 0], [1, 0], [0, 1], [0, 1]] y = [0, 0, 1, 1] - clf = SGDEstimator(alpha=0.1, max_iter=1000, class_weight=None) + clf = klass(alpha=0.1, max_iter=1000, class_weight=None) clf.fit(X, y) X = [[1, 0], [0, 1]] y = [0, 1] - clf_weighted = SGDEstimator(alpha=0.1, max_iter=1000, class_weight={0: 0.5, 1: 0.5}) + clf_weighted = klass(alpha=0.1, max_iter=1000, class_weight={0: 0.5, 1: 0.5}) clf_weighted.fit(X, y) # should be similar up to some epsilon due to learning rate schedule assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_wrong_class_weight_label(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_wrong_class_weight_label(klass): # ValueError due to not existing class label. - clf = SGDEstimator(alpha=0.1, max_iter=1000, class_weight={0: 0.5}) + clf = klass(alpha=0.1, max_iter=1000, class_weight={0: 0.5}) with pytest.raises(ValueError): clf.fit(X, Y) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_wrong_class_weight_format(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_wrong_class_weight_format(klass): # ValueError due to wrong class_weight argument type. 
- clf = SGDEstimator(alpha=0.1, max_iter=1000, class_weight=[0.5]) + clf = klass(alpha=0.1, max_iter=1000, class_weight=[0.5]) with pytest.raises(ValueError): clf.fit(X, Y) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_weights_multiplied(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_weights_multiplied(klass): # Tests that class_weight and sample_weight are multiplicative class_weights = {1: 0.6, 2: 0.3} rng = np.random.RandomState(0) @@ -970,8 +968,8 @@ def test_weights_multiplied(SGDEstimator): multiplied_together[Y4 == 1] *= class_weights[1] multiplied_together[Y4 == 2] *= class_weights[2] - clf1 = SGDEstimator(alpha=0.1, max_iter=20, class_weight=class_weights) - clf2 = SGDEstimator(alpha=0.1, max_iter=20) + clf1 = klass(alpha=0.1, max_iter=20, class_weight=class_weights) + clf2 = klass(alpha=0.1, max_iter=20) clf1.fit(X4, Y4, sample_weight=sample_weights) clf2.fit(X4, Y4, sample_weight=multiplied_together) @@ -979,8 +977,8 @@ def test_weights_multiplied(SGDEstimator): assert_almost_equal(clf1.coef_, clf2.coef_) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_balanced_weight(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_balanced_weight(klass): # Test class weights for imbalanced data""" # compute reference metrics on iris dataset that is quite balanced by # default @@ -991,14 +989,12 @@ def test_balanced_weight(SGDEstimator): rng.shuffle(idx) X = X[idx] y = y[idx] - clf = SGDEstimator( - alpha=0.0001, max_iter=1000, class_weight=None, shuffle=False - ).fit(X, y) + clf = klass(alpha=0.0001, max_iter=1000, class_weight=None, shuffle=False).fit(X, y) f1 = metrics.f1_score(y, clf.predict(X), average="weighted") assert_almost_equal(f1, 0.96, decimal=1) # make the same prediction using balanced class_weight - clf_balanced = SGDEstimator( + clf_balanced = klass( alpha=0.0001, max_iter=1000, class_weight="balanced", shuffle=False ).fit(X, y) f1 = metrics.f1_score(y, clf_balanced.predict(X), average="weighted") @@ -1016,25 +1012,25 @@ def test_balanced_weight(SGDEstimator): y_imbalanced = np.concatenate([y] + [y_0] * 10) # fit a model on the imbalanced data without class weight info - clf = SGDEstimator(max_iter=1000, class_weight=None, shuffle=False) + clf = klass(max_iter=1000, class_weight=None, shuffle=False) clf.fit(X_imbalanced, y_imbalanced) y_pred = clf.predict(X) assert metrics.f1_score(y, y_pred, average="weighted") < 0.96 # fit a model with balanced class_weight enabled - clf = SGDEstimator(max_iter=1000, class_weight="balanced", shuffle=False) + clf = klass(max_iter=1000, class_weight="balanced", shuffle=False) clf.fit(X_imbalanced, y_imbalanced) y_pred = clf.predict(X) assert metrics.f1_score(y, y_pred, average="weighted") > 0.96 -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_sample_weights(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sample_weights(klass): # Test weights on individual samples X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) y = [1, 1, 1, -1, -1] - clf = SGDEstimator(alpha=0.1, max_iter=1000, fit_intercept=False) + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False) clf.fit(X, y) assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) @@ -1047,32 +1043,32 @@ def test_sample_weights(SGDEstimator): @pytest.mark.parametrize( - "SGDEstimator", + 
"klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM], ) -def test_wrong_sample_weights(SGDEstimator): +def test_wrong_sample_weights(klass): # Test if ValueError is raised if sample_weight has wrong shape - if SGDEstimator in [SGDClassifier, SparseSGDClassifier]: - clf = SGDEstimator(alpha=0.1, max_iter=1000, fit_intercept=False) - elif SGDEstimator in [SGDOneClassSVM, SparseSGDOneClassSVM]: - clf = SGDEstimator(nu=0.1, max_iter=1000, fit_intercept=False) + if klass in [SGDClassifier, SparseSGDClassifier]: + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False) + elif klass in [SGDOneClassSVM, SparseSGDOneClassSVM]: + clf = klass(nu=0.1, max_iter=1000, fit_intercept=False) # provided sample_weight too long with pytest.raises(ValueError): clf.fit(X, Y, sample_weight=np.arange(7)) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_partial_fit_exception(SGDEstimator): - clf = SGDEstimator(alpha=0.01) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_exception(klass): + clf = klass(alpha=0.01) # classes was not specified with pytest.raises(ValueError): clf.partial_fit(X3, Y3) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_partial_fit_binary(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_binary(klass): third = X.shape[0] // 3 - clf = SGDEstimator(alpha=0.01) + clf = klass(alpha=0.01) classes = np.unique(Y) clf.partial_fit(X[:third], Y[:third], classes=classes) @@ -1090,10 +1086,10 @@ def test_partial_fit_binary(SGDEstimator): assert_array_equal(y_pred, true_result) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_partial_fit_multiclass(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_multiclass(klass): third = X2.shape[0] // 3 - clf = SGDEstimator(alpha=0.01) + clf = klass(alpha=0.01) classes = np.unique(Y2) clf.partial_fit(X2[:third], Y2[:third], classes=classes) @@ -1108,10 +1104,10 @@ def test_partial_fit_multiclass(SGDEstimator): assert id1, id2 -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_partial_fit_multiclass_average(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_multiclass_average(klass): third = X2.shape[0] // 3 - clf = SGDEstimator(alpha=0.01, average=X2.shape[0]) + clf = klass(alpha=0.01, average=X2.shape[0]) classes = np.unique(Y2) clf.partial_fit(X2[:third], Y2[:third], classes=classes) @@ -1123,29 +1119,27 @@ def test_partial_fit_multiclass_average(SGDEstimator): assert clf.intercept_.shape == (3,) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_fit_then_partial_fit(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_fit_then_partial_fit(klass): # Partial_fit should work after initial fit in the multiclass case. # Non-regression test for #2496; fit would previously produce a # Fortran-ordered coef_ that subsequent partial_fit couldn't handle. 
- clf = SGDEstimator() + clf = klass() clf.fit(X2, Y2) clf.partial_fit(X2, Y2) # no exception here -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) @pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) -def test_partial_fit_equal_fit_classif(SGDEstimator, lr): +def test_partial_fit_equal_fit_classif(klass, lr): for X_, Y_, T_ in ((X, Y, T), (X2, Y2, T2)): - clf = SGDEstimator( - alpha=0.01, eta0=0.01, max_iter=2, learning_rate=lr, shuffle=False - ) + clf = klass(alpha=0.01, eta0=0.01, max_iter=2, learning_rate=lr, shuffle=False) clf.fit(X_, Y_) y_pred = clf.decision_function(T_) t = clf.t_ classes = np.unique(Y_) - clf = SGDEstimator(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False) + clf = klass(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False) for i in range(2): clf.partial_fit(X_, Y_, classes=classes) y_pred2 = clf.decision_function(T_) @@ -1154,10 +1148,10 @@ def test_partial_fit_equal_fit_classif(SGDEstimator, lr): assert_array_almost_equal(y_pred, y_pred2, decimal=2) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_regression_losses(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_regression_losses(klass): random_state = np.random.RandomState(1) - clf = SGDEstimator( + clf = klass( alpha=0.01, learning_rate="constant", eta0=0.1, @@ -1167,7 +1161,7 @@ def test_regression_losses(SGDEstimator): clf.fit(X, Y) assert 1.0 == np.mean(clf.predict(X) == Y) - clf = SGDEstimator( + clf = klass( alpha=0.01, learning_rate="constant", eta0=0.1, @@ -1177,11 +1171,11 @@ def test_regression_losses(SGDEstimator): clf.fit(X, Y) assert 1.0 == np.mean(clf.predict(X) == Y) - clf = SGDEstimator(alpha=0.01, loss="huber", random_state=random_state) + clf = klass(alpha=0.01, loss="huber", random_state=random_state) clf.fit(X, Y) assert 1.0 == np.mean(clf.predict(X) == Y) - clf = SGDEstimator( + clf = klass( alpha=0.01, learning_rate="constant", eta0=0.01, @@ -1192,15 +1186,15 @@ def test_regression_losses(SGDEstimator): assert 1.0 == np.mean(clf.predict(X) == Y) -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_warm_start_multiclass(SGDEstimator): - _test_warm_start(SGDEstimator, X2, Y2, "optimal") +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_warm_start_multiclass(klass): + _test_warm_start(klass, X2, Y2, "optimal") -@pytest.mark.parametrize("SGDEstimator", [SGDClassifier, SparseSGDClassifier]) -def test_multiple_fit(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_multiple_fit(klass): # Test multiple calls of fit w/ different shaped inputs. - clf = SGDEstimator(alpha=0.01, shuffle=False) + clf = klass(alpha=0.01, shuffle=False) clf.fit(X, Y) assert hasattr(clf, "coef_") @@ -1213,16 +1207,16 @@ def test_multiple_fit(SGDEstimator): # Regression Test Case -@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) -def test_sgd_reg(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_reg(klass): # Check that SGD gives any results. 
- clf = SGDEstimator(alpha=0.1, max_iter=2, fit_intercept=False) + clf = klass(alpha=0.1, max_iter=2, fit_intercept=False) clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2]) assert clf.coef_[0] == clf.coef_[1] -@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) -def test_sgd_averaged_computed_correctly(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_averaged_computed_correctly(klass): # Tests the average regressor matches the naive implementation eta = 0.001 @@ -1236,7 +1230,7 @@ def test_sgd_averaged_computed_correctly(SGDEstimator): # simple linear function without noise y = np.dot(X, w) - clf = SGDEstimator( + clf = klass( loss="squared_error", learning_rate="constant", eta0=eta, @@ -1248,14 +1242,14 @@ def test_sgd_averaged_computed_correctly(SGDEstimator): ) clf.fit(X, y) - average_weights, average_intercept = asgd(SGDEstimator, X, y, eta, alpha) + average_weights, average_intercept = asgd(klass, X, y, eta, alpha) assert_array_almost_equal(clf.coef_, average_weights, decimal=16) assert_almost_equal(clf.intercept_, average_intercept, decimal=16) -@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) -def test_sgd_averaged_partial_fit(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_averaged_partial_fit(klass): # Tests whether the partial fit yields the same average as the fit eta = 0.001 alpha = 0.01 @@ -1268,7 +1262,7 @@ def test_sgd_averaged_partial_fit(SGDEstimator): # simple linear function without noise y = np.dot(X, w) - clf = SGDEstimator( + clf = klass( loss="squared_error", learning_rate="constant", eta0=eta, @@ -1281,19 +1275,19 @@ def test_sgd_averaged_partial_fit(SGDEstimator): clf.partial_fit(X[: int(n_samples / 2)][:], y[: int(n_samples / 2)]) clf.partial_fit(X[int(n_samples / 2) :][:], y[int(n_samples / 2) :]) - average_weights, average_intercept = asgd(SGDEstimator, X, y, eta, alpha) + average_weights, average_intercept = asgd(klass, X, y, eta, alpha) assert_array_almost_equal(clf.coef_, average_weights, decimal=16) assert_almost_equal(clf.intercept_[0], average_intercept, decimal=16) -@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) -def test_average_sparse(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_average_sparse(klass): # Checks the average weights on data with 0s eta = 0.001 alpha = 0.01 - clf = SGDEstimator( + clf = klass( loss="squared_error", learning_rate="constant", eta0=eta, @@ -1308,14 +1302,14 @@ def test_average_sparse(SGDEstimator): clf.partial_fit(X3[: int(n_samples / 2)][:], Y3[: int(n_samples / 2)]) clf.partial_fit(X3[int(n_samples / 2) :][:], Y3[int(n_samples / 2) :]) - average_weights, average_intercept = asgd(SGDEstimator, X3, Y3, eta, alpha) + average_weights, average_intercept = asgd(klass, X3, Y3, eta, alpha) assert_array_almost_equal(clf.coef_, average_weights, decimal=16) assert_almost_equal(clf.intercept_, average_intercept, decimal=16) -@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) -def test_sgd_least_squares_fit(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_least_squares_fit(klass): xmin, xmax = -5, 5 n_samples = 100 rng = np.random.RandomState(0) @@ -1324,9 +1318,7 @@ def test_sgd_least_squares_fit(SGDEstimator): # simple linear function without noise y = 0.5 * X.ravel() - clf = SGDEstimator( - loss="squared_error", alpha=0.1, 
max_iter=20, fit_intercept=False - ) + clf = klass(loss="squared_error", alpha=0.1, max_iter=20, fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert score > 0.99 @@ -1334,16 +1326,14 @@ def test_sgd_least_squares_fit(SGDEstimator): # simple linear function with noise y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() - clf = SGDEstimator( - loss="squared_error", alpha=0.1, max_iter=20, fit_intercept=False - ) + clf = klass(loss="squared_error", alpha=0.1, max_iter=20, fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert score > 0.5 -@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) -def test_sgd_epsilon_insensitive(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_epsilon_insensitive(klass): xmin, xmax = -5, 5 n_samples = 100 rng = np.random.RandomState(0) @@ -1352,7 +1342,7 @@ def test_sgd_epsilon_insensitive(SGDEstimator): # simple linear function without noise y = 0.5 * X.ravel() - clf = SGDEstimator( + clf = klass( loss="epsilon_insensitive", epsilon=0.01, alpha=0.1, @@ -1366,7 +1356,7 @@ def test_sgd_epsilon_insensitive(SGDEstimator): # simple linear function with noise y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() - clf = SGDEstimator( + clf = klass( loss="epsilon_insensitive", epsilon=0.01, alpha=0.1, @@ -1378,8 +1368,8 @@ def test_sgd_epsilon_insensitive(SGDEstimator): assert score > 0.5 -@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) -def test_sgd_huber_fit(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_huber_fit(klass): xmin, xmax = -5, 5 n_samples = 100 rng = np.random.RandomState(0) @@ -1388,9 +1378,7 @@ def test_sgd_huber_fit(SGDEstimator): # simple linear function without noise y = 0.5 * X.ravel() - clf = SGDEstimator( - loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False - ) + clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert score > 0.99 @@ -1398,16 +1386,14 @@ def test_sgd_huber_fit(SGDEstimator): # simple linear function with noise y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() - clf = SGDEstimator( - loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False - ) + clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert score > 0.5 -@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) -def test_elasticnet_convergence(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_elasticnet_convergence(klass): # Check that the SGD output is consistent with coordinate descent n_samples, n_features = 1000, 5 @@ -1425,7 +1411,7 @@ def test_elasticnet_convergence(SGDEstimator): alpha=alpha, l1_ratio=l1_ratio, fit_intercept=False ) cd.fit(X, y) - sgd = SGDEstimator( + sgd = klass( penalty="elasticnet", max_iter=50, alpha=alpha, @@ -1441,10 +1427,10 @@ def test_elasticnet_convergence(SGDEstimator): @ignore_warnings -@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) -def test_partial_fit(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_partial_fit(klass): third = X.shape[0] // 3 - clf = SGDEstimator(alpha=0.01) + clf = klass(alpha=0.01) clf.partial_fit(X[:third], Y[:third]) assert clf.coef_.shape == (X.shape[1],) @@ -1458,17 +1444,15 @@ def 
test_partial_fit(SGDEstimator): assert id1, id2 -@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) @pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) -def test_partial_fit_equal_fit(SGDEstimator, lr): - clf = SGDEstimator( - alpha=0.01, max_iter=2, eta0=0.01, learning_rate=lr, shuffle=False - ) +def test_partial_fit_equal_fit(klass, lr): + clf = klass(alpha=0.01, max_iter=2, eta0=0.01, learning_rate=lr, shuffle=False) clf.fit(X, Y) y_pred = clf.predict(T) t = clf.t_ - clf = SGDEstimator(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False) + clf = klass(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False) for i in range(2): clf.partial_fit(X, Y) y_pred2 = clf.predict(T) @@ -1477,9 +1461,9 @@ def test_partial_fit_equal_fit(SGDEstimator, lr): assert_array_almost_equal(y_pred, y_pred2, decimal=2) -@pytest.mark.parametrize("SGDEstimator", [SGDRegressor, SparseSGDRegressor]) -def test_loss_function_epsilon(SGDEstimator): - clf = SGDEstimator(epsilon=0.9) +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_loss_function_epsilon(klass): + clf = klass(epsilon=0.9) clf.set_params(epsilon=0.1) assert clf.loss_functions["huber"][1] == 0.1 @@ -1488,7 +1472,7 @@ def test_loss_function_epsilon(SGDEstimator): # SGD One Class SVM Test Case # a simple implementation of ASGD to use for testing SGDOneClassSVM -def asgd_oneclass(SGDEstimator, X, eta, nu, coef_init=None, offset_init=0.0): +def asgd_oneclass(klass, X, eta, nu, coef_init=None, offset_init=0.0): if coef_init is None: coef = np.zeros(X.shape[1]) else: @@ -1501,7 +1485,7 @@ def asgd_oneclass(SGDEstimator, X, eta, nu, coef_init=None, offset_init=0.0): decay = 1.0 # sparse data has a fixed decay of .01 - if SGDEstimator == SparseSGDOneClassSVM: + if klass == SparseSGDOneClassSVM: decay = 0.01 for i, entry in enumerate(X): @@ -1526,19 +1510,17 @@ def asgd_oneclass(SGDEstimator, X, eta, nu, coef_init=None, offset_init=0.0): return average_coef, 1 - average_intercept -@pytest.mark.parametrize("SGDEstimator", [SGDOneClassSVM, SparseSGDOneClassSVM]) -def _test_warm_start_oneclass(SGDEstimator, X, lr): +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def _test_warm_start_oneclass(klass, X, lr): # Test that explicit warm restart... - clf = SGDEstimator(nu=0.5, eta0=0.01, shuffle=False, learning_rate=lr) + clf = klass(nu=0.5, eta0=0.01, shuffle=False, learning_rate=lr) clf.fit(X) - clf2 = SGDEstimator(nu=0.1, eta0=0.01, shuffle=False, learning_rate=lr) + clf2 = klass(nu=0.1, eta0=0.01, shuffle=False, learning_rate=lr) clf2.fit(X, coef_init=clf.coef_.copy(), offset_init=clf.offset_.copy()) # ... and implicit warm restart are equivalent. 
- clf3 = SGDEstimator( - nu=0.5, eta0=0.01, shuffle=False, warm_start=True, learning_rate=lr - ) + clf3 = klass(nu=0.5, eta0=0.01, shuffle=False, warm_start=True, learning_rate=lr) clf3.fit(X) assert clf3.t_ == clf.t_ @@ -1551,30 +1533,30 @@ def _test_warm_start_oneclass(SGDEstimator, X, lr): assert_allclose(clf3.coef_, clf2.coef_) -@pytest.mark.parametrize("SGDEstimator", [SGDOneClassSVM, SparseSGDOneClassSVM]) +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) @pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) -def test_warm_start_oneclass(SGDEstimator, lr): - _test_warm_start_oneclass(SGDEstimator, X, lr) +def test_warm_start_oneclass(klass, lr): + _test_warm_start_oneclass(klass, X, lr) -@pytest.mark.parametrize("SGDEstimator", [SGDOneClassSVM, SparseSGDOneClassSVM]) -def test_clone_oneclass(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_clone_oneclass(klass): # Test whether clone works ok. - clf = SGDEstimator(nu=0.5) + clf = klass(nu=0.5) clf = clone(clf) clf.set_params(nu=0.1) clf.fit(X) - clf2 = SGDEstimator(nu=0.1) + clf2 = klass(nu=0.1) clf2.fit(X) assert_array_equal(clf.coef_, clf2.coef_) -@pytest.mark.parametrize("SGDEstimator", [SGDOneClassSVM, SparseSGDOneClassSVM]) -def test_partial_fit_oneclass(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_partial_fit_oneclass(klass): third = X.shape[0] // 3 - clf = SGDEstimator(nu=0.1) + clf = klass(nu=0.1) clf.partial_fit(X[:third]) assert clf.coef_.shape == (X.shape[1],) @@ -1591,17 +1573,17 @@ def test_partial_fit_oneclass(SGDEstimator): clf.partial_fit(X[:, 1]) -@pytest.mark.parametrize("SGDEstimator", [SGDOneClassSVM, SparseSGDOneClassSVM]) +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) @pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) -def test_partial_fit_equal_fit_oneclass(SGDEstimator, lr): - clf = SGDEstimator(nu=0.05, max_iter=2, eta0=0.01, learning_rate=lr, shuffle=False) +def test_partial_fit_equal_fit_oneclass(klass, lr): + clf = klass(nu=0.05, max_iter=2, eta0=0.01, learning_rate=lr, shuffle=False) clf.fit(X) y_scores = clf.decision_function(T) t = clf.t_ coef = clf.coef_ offset = clf.offset_ - clf = SGDEstimator(nu=0.05, eta0=0.01, max_iter=1, learning_rate=lr, shuffle=False) + clf = klass(nu=0.05, eta0=0.01, max_iter=1, learning_rate=lr, shuffle=False) for _ in range(2): clf.partial_fit(X) y_scores2 = clf.decision_function(T) @@ -1612,18 +1594,18 @@ def test_partial_fit_equal_fit_oneclass(SGDEstimator, lr): assert_allclose(clf.offset_, offset) -@pytest.mark.parametrize("SGDEstimator", [SGDOneClassSVM, SparseSGDOneClassSVM]) -def test_late_onset_averaging_reached_oneclass(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_late_onset_averaging_reached_oneclass(klass): # Test average eta0 = 0.001 nu = 0.05 # 2 passes over the training set but average only at second pass - clf1 = SGDEstimator( + clf1 = klass( average=7, learning_rate="constant", eta0=eta0, nu=nu, max_iter=2, shuffle=False ) # 1 pass over the training set with no averaging - clf2 = SGDEstimator( + clf2 = klass( average=0, learning_rate="constant", eta0=eta0, nu=nu, max_iter=1, shuffle=False ) @@ -1633,7 +1615,7 @@ def test_late_onset_averaging_reached_oneclass(SGDEstimator): # Start from clf2 solution, compute averaging using asgd function and # compare with clf1 solution average_coef, 
average_offset = asgd_oneclass( - SGDEstimator, + klass, X, eta0, nu, @@ -1645,8 +1627,8 @@ def test_late_onset_averaging_reached_oneclass(SGDEstimator): assert_allclose(clf1.offset_, average_offset) -@pytest.mark.parametrize("SGDEstimator", [SGDOneClassSVM, SparseSGDOneClassSVM]) -def test_sgd_averaged_computed_correctly_oneclass(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_sgd_averaged_computed_correctly_oneclass(klass): # Tests the average SGD One-Class SVM matches the naive implementation eta = 0.001 nu = 0.05 @@ -1655,7 +1637,7 @@ def test_sgd_averaged_computed_correctly_oneclass(SGDEstimator): rng = np.random.RandomState(0) X = rng.normal(size=(n_samples, n_features)) - clf = SGDEstimator( + clf = klass( learning_rate="constant", eta0=eta, nu=nu, @@ -1666,14 +1648,14 @@ def test_sgd_averaged_computed_correctly_oneclass(SGDEstimator): ) clf.fit(X) - average_coef, average_offset = asgd_oneclass(SGDEstimator, X, eta, nu) + average_coef, average_offset = asgd_oneclass(klass, X, eta, nu) assert_allclose(clf.coef_, average_coef) assert_allclose(clf.offset_, average_offset) -@pytest.mark.parametrize("SGDEstimator", [SGDOneClassSVM, SparseSGDOneClassSVM]) -def test_sgd_averaged_partial_fit_oneclass(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_sgd_averaged_partial_fit_oneclass(klass): # Tests whether the partial fit yields the same average as the fit eta = 0.001 nu = 0.05 @@ -1682,7 +1664,7 @@ def test_sgd_averaged_partial_fit_oneclass(SGDEstimator): rng = np.random.RandomState(0) X = rng.normal(size=(n_samples, n_features)) - clf = SGDEstimator( + clf = klass( learning_rate="constant", eta0=eta, nu=nu, @@ -1694,18 +1676,18 @@ def test_sgd_averaged_partial_fit_oneclass(SGDEstimator): clf.partial_fit(X[: int(n_samples / 2)][:]) clf.partial_fit(X[int(n_samples / 2) :][:]) - average_coef, average_offset = asgd_oneclass(SGDEstimator, X, eta, nu) + average_coef, average_offset = asgd_oneclass(klass, X, eta, nu) assert_allclose(clf.coef_, average_coef) assert_allclose(clf.offset_, average_offset) -@pytest.mark.parametrize("SGDEstimator", [SGDOneClassSVM, SparseSGDOneClassSVM]) -def test_average_sparse_oneclass(SGDEstimator): +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_average_sparse_oneclass(klass): # Checks the average coef on data with 0s eta = 0.001 nu = 0.01 - clf = SGDEstimator( + clf = klass( learning_rate="constant", eta0=eta, nu=nu, @@ -1719,7 +1701,7 @@ def test_average_sparse_oneclass(SGDEstimator): clf.partial_fit(X3[: int(n_samples / 2)]) clf.partial_fit(X3[int(n_samples / 2) :]) - average_coef, average_offset = asgd_oneclass(SGDEstimator, X3, eta, nu) + average_coef, average_offset = asgd_oneclass(klass, X3, eta, nu) assert_allclose(clf.coef_, average_coef) assert_allclose(clf.offset_, average_offset) From 3ac6eb4146301ac3ca8231f182e2672c9fe1f3c9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Aug 2021 11:23:35 +0200 Subject: [PATCH 08/14] iter --- sklearn/linear_model/tests/test_sgd.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 5ec7084ffafb4..7495a823bcd2b 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -292,8 +292,7 @@ def _test_warm_start(klass, X, Y, lr): @pytest.mark.parametrize( - "klass", - [SGDClassifier, SparseSGDClassifier, 
SGDRegressor, SparseSGDRegressor], + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] ) @pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) def test_warm_start(klass, lr): @@ -301,8 +300,7 @@ def test_warm_start(klass, lr): @pytest.mark.parametrize( - "klass", - [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] ) def test_input_format(klass): # Input format tests. @@ -316,8 +314,7 @@ def test_input_format(klass): @pytest.mark.parametrize( - "klass", - [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] ) def test_clone(klass): # Test whether clone works ok. @@ -396,8 +393,7 @@ def test_late_onset_averaging_not_reached(klass): @pytest.mark.parametrize( - "klass", - [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] ) def test_late_onset_averaging_reached(klass): eta0 = 0.001 @@ -470,8 +466,7 @@ def test_adaptive_longer_than_constant(klass): @pytest.mark.parametrize( - "klass", - [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] ) def test_validation_set_not_used_for_training(klass): X, Y = iris.data, iris.target @@ -537,8 +532,7 @@ def test_n_iter_no_change(klass): @pytest.mark.parametrize( - "klass", - [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] ) def test_not_enough_sample_for_early_stopping(klass): # test an error is raised if the training or validation set is empty From 5e994aa12cbc3239dcb77dc2040637292fae58c3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Aug 2021 11:24:55 +0200 Subject: [PATCH 09/14] iter --- sklearn/linear_model/tests/test_passive_aggressive.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/tests/test_passive_aggressive.py b/sklearn/linear_model/tests/test_passive_aggressive.py index 6eb2297990e36..3ff92bd69a43b 100644 --- a/sklearn/linear_model/tests/test_passive_aggressive.py +++ b/sklearn/linear_model/tests/test_passive_aggressive.py @@ -287,8 +287,7 @@ def test_regressor_undefined_methods(): @pytest.mark.parametrize( - "PassiveAggressiveEstimator", - [PassiveAggressiveClassifier, PassiveAggressiveRegressor], + "klass", [PassiveAggressiveClassifier, PassiveAggressiveRegressor] ) @pytest.mark.parametrize("fit_method", ["fit", "partial_fit"]) @pytest.mark.parametrize( @@ -306,10 +305,10 @@ def test_regressor_undefined_methods(): ], ) def test_passive_aggressive_estimator_params_validation( - PassiveAggressiveEstimator, fit_method, params, err_msg + klass, fit_method, params, err_msg ): """Validate parameters in the different PassiveAggressive estimators.""" - sgd_estimator = PassiveAggressiveEstimator(**params) + sgd_estimator = klass(**params) with pytest.raises(ValueError, match=err_msg): if is_classifier(sgd_estimator) and fit_method == "partial_fit": From 5214acd9d71dd1657dfa984746f82a30874575c0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Aug 2021 11:27:10 +0200 Subject: [PATCH 10/14] iter --- sklearn/linear_model/tests/test_sgd.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff 
--git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 7495a823bcd2b..c26f20764c641 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -381,12 +381,7 @@ def test_late_onset_averaging_not_reached(klass): clf2.partial_fit(X, Y) assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=16) - if klass in [ - SGDClassifier, - SparseSGDClassifier, - SGDRegressor, - SparseSGDRegressor, - ]: + if klass in [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]: assert_almost_equal(clf1.intercept_, clf2.intercept_, decimal=16) elif klass in [SGDOneClassSVM, SparseSGDOneClassSVM]: assert_allclose(clf1.offset_, clf2.offset_) @@ -439,8 +434,7 @@ def test_late_onset_averaging_reached(klass): @pytest.mark.parametrize( - "klass", - [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] ) def test_early_stopping(klass): X = iris.data[iris.target > 0] @@ -454,8 +448,7 @@ def test_early_stopping(klass): @pytest.mark.parametrize( - "klass", - [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] ) def test_adaptive_longer_than_constant(klass): clf1 = klass(learning_rate="adaptive", eta0=0.01, tol=1e-3, max_iter=100) @@ -590,8 +583,7 @@ def test_set_intercept_offset(klass, fit_params): @pytest.mark.parametrize( - "klass", - [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] ) def test_sgd_early_stopping_with_partial_fit(klass): """Check that we raise an error for `early_stopping` used with @@ -1037,8 +1029,7 @@ def test_sample_weights(klass): @pytest.mark.parametrize( - "klass", - [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM], + "klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM] ) def test_wrong_sample_weights(klass): # Test if ValueError is raised if sample_weight has wrong shape From 081dc420a1e010b1782ae9d4ef52297bbc9e9911 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Aug 2021 11:30:05 +0200 Subject: [PATCH 11/14] iter --- sklearn/linear_model/tests/test_sgd.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index c26f20764c641..f6848eba8066e 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -503,8 +503,7 @@ def test_validation_set_not_used_for_training(klass): @pytest.mark.parametrize( - "klass", - [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor], + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] ) def test_n_iter_no_change(klass): X, Y = iris.data, iris.target @@ -557,8 +556,7 @@ def test_sgd_clf(klass): @pytest.mark.parametrize( - "klass", - [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM], + "klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM] ) def test_provide_coef(klass): """Check that the shape of `coef_init` is validated.""" From ffe4507ceb82276a3f0a1e3d35085240138d7ef5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Aug 2021 11:34:03 +0200 Subject: [PATCH 12/14] iter --- sklearn/linear_model/tests/test_sgd.py | 7 +------ 1 file changed, 1 
insertion(+), 6 deletions(-) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index f6848eba8066e..7171c860254ff 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -1598,12 +1598,7 @@ def test_late_onset_averaging_reached_oneclass(klass): # Start from clf2 solution, compute averaging using asgd function and # compare with clf1 solution average_coef, average_offset = asgd_oneclass( - klass, - X, - eta0, - nu, - coef_init=clf2.coef_.ravel(), - offset_init=clf2.offset_, + klass, X, eta0, nu, coef_init=clf2.coef_.ravel(), offset_init=clf2.offset_ ) assert_allclose(clf1.coef_.ravel(), average_coef.ravel()) From 7861d2d750ac3fd907099986cfdc7a060dc09516 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 16 Aug 2021 11:28:57 +0200 Subject: [PATCH 13/14] Update doc/whats_new/v1.0.rst Co-authored-by: Thomas J. Fan --- doc/whats_new/v1.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 7c26c638aaaef..91aacf1ada546 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -501,8 +501,8 @@ Changelog coordinate descent solver. Otherwise, an error will be raised. :pr:`19391` by :user:`Shao Yang Hong `. -- |API| Force the validation to be made at `fit` for the following estimators - to follow our scikit-learn convention: +- |API| Keyword validation has moved from `__init__` and `set_params` to `fit` + for the following estimators conforming to scikit-learn's conventions: :class:`linear_model.SGDClassifier`, :class:`linear_model.SparseSGDClassifier`, :class:`linear_model.SGDRegressor`, From 966f18d5661dfc3461bc5850bb584c3475215d30 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 16 Aug 2021 14:17:23 +0200 Subject: [PATCH 14/14] iter --- doc/whats_new/v1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index a686a9d2ced2b..3beb29f1f3eda 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -528,7 +528,7 @@ Changelog :pr:`19391` by :user:`Shao Yang Hong `. - |API| Keyword validation has moved from `__init__` and `set_params` to `fit` - for the following estimators conforming to scikit-learn's conventions: + for the following estimators conforming to scikit-learn's conventions: :class:`linear_model.SGDClassifier`, :class:`linear_model.SparseSGDClassifier`, :class:`linear_model.SGDRegressor`,
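
For illustration only (not part of the patch series), a minimal sketch of the behaviour the changelog entry above describes, assuming a scikit-learn build with these patches applied; the toy `X`/`y` arrays are made up for the example:

    import numpy as np
    from sklearn.linear_model import SGDClassifier

    X = np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 2.0], [3.0, 3.0]])
    y = np.array([0, 0, 1, 1])

    clf = SGDClassifier(alpha=-0.1)   # construction no longer raises
    clf.set_params(alpha=-0.1)        # neither does set_params
    try:
        clf.fit(X, y)                 # the check now runs inside fit
    except ValueError as exc:
        print(exc)                    # "alpha must be >= 0"

The same deferred validation applies to `partial_fit`, which is why the refactored tests parametrize over both fit methods.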