Skip to content

FIX make creation of dataset deterministic in SGD #19716

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
39f071c
check_random_state moved up in order to use it in make_dataset and un…
PierreAttard Mar 18, 2021
4053a37
address ogrisel comments
glemaitre Apr 21, 2021
c012680
Merge remote-tracking branch 'origin/main' into pr/PierreAttard/19716
glemaitre Apr 21, 2021
2598055
DOC add whats new
glemaitre Apr 21, 2021
edc334b
Merge remote-tracking branch 'origin/main' into pr/PierreAttard/19716
glemaitre Jul 27, 2021
b5240f8
TST make data generation deterministic
glemaitre Jul 27, 2021
ff23028
docstring
glemaitre Jul 27, 2021
91a4838
Merge branch 'scikit-learn:main' into random-state-not-used-in-make_d…
PierreAttard Jul 29, 2021
f00dfa0
Merge branch 'main' into random-state-not-used-in-make_dataset
Aug 23, 2021
380da62
revert merge with main
Aug 24, 2021
60b2462
merge changelog file
Aug 24, 2021
f12c969
revert the revert merge
Aug 24, 2021
3b9eb85
merge changelog file
Aug 24, 2021
fb9fa2e
merge changelog file V2 remove PR 19716
Aug 24, 2021
5c7cc32
merge changelog file V2 add PR 19716
Aug 24, 2021
5761e06
Merge branch 'main' into random-state-not-used-in-make_dataset
PierreAttard Aug 24, 2021
0efc4b1
remove ".git-blame-ignore-revs" file in order to match with sdist
Aug 26, 2021
83c98d2
Merge branch 'main' into random-state-not-used-in-make_dataset
Sep 1, 2021
1057a37
Add digit in doc for class PassiveAggressiveRegressor.
Sep 2, 2021
9cc24b1
add '.git-blame-ignore-revs' file
Sep 3, 2021
ee81723
merge changelog file V2 add PR 19716
Sep 8, 2021
5afc2ae
Merge branch 'main' into random-state-not-used-in-make_dataset
PierreAttard Sep 8, 2021
eef0170
Merge branch 'master' into pr/PierreAttard/19716
jeremiedbb Mar 23, 2022
9a99e09
merge clean-up
jeremiedbb Mar 23, 2022
b7f8a3c
more cln
jeremiedbb Mar 23, 2022
4ba1966
better document how random_state is used in dataset
jeremiedbb Mar 23, 2022
83c299f
no model is changed
jeremiedbb Mar 23, 2022
62f8d45
TST add non regression test about random seed control for SGD models
ogrisel Mar 31, 2022
ba51907
Update sklearn/linear_model/tests/test_sgd.py
jeremiedbb Mar 31, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion sklearn/linear_model/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,8 @@ def make_dataset(X, y, sample_weight, random_state=None):
The weight of each sample

random_state : int, RandomState instance or None (default)
Determines random number generation for dataset shuffling and noise.
Determines random number generation for dataset random sampling. It is not
used for dataset shuffling.
Pass an int for reproducible output across multiple function calls.
See :term:`Glossary <random_state>`.

Expand Down
6 changes: 4 additions & 2 deletions sklearn/linear_model/_stochastic_gradient.py
Original file line number Diff line number Diff line change
Expand Up @@ -1592,8 +1592,6 @@ def predict(self, X):
def _fit_regressor(
self, X, y, alpha, C, loss, learning_rate, sample_weight, max_iter
):
dataset, intercept_decay = make_dataset(X, y, sample_weight)

loss_function = self._get_loss_function(loss)
penalty_type = self._get_penalty_type(self.penalty)
learning_rate_type = self._get_learning_rate_type(learning_rate)
Expand All @@ -1611,6 +1609,10 @@ def _fit_regressor(
# Windows
seed = random_state.randint(0, np.iinfo(np.int32).max)

dataset, intercept_decay = make_dataset(
X, y, sample_weight, random_state=random_state
)

tol = self.tol if self.tol is not None else -np.inf

if self.average:
Expand Down
38 changes: 38 additions & 0 deletions sklearn/linear_model/tests/test_sgd.py
Original file line number Diff line number Diff line change
Expand Up @@ -2140,3 +2140,41 @@ def test_loss_squared_loss_deprecated(Estimator):
assert_allclose(est1.predict_proba(X), est2.predict_proba(X))
else:
assert_allclose(est1.predict(X), est2.predict(X))


@pytest.mark.parametrize(
    "Estimator", [linear_model.SGDClassifier, linear_model.SGDRegressor]
)
def test_sgd_random_state(Estimator, global_random_seed):
    """Check that one-epoch SGD fits are reproducible for a fixed seed.

    Two fits with the same seed on the same data must yield identical
    coefficients, while a different seed must shuffle the dataset
    differently and therefore land on visibly different coefficients
    after a single epoch.
    """
    make_data = (
        datasets.make_regression
        if Estimator == linear_model.SGDRegressor
        else datasets.make_classification
    )
    X, y = make_data(random_state=global_random_seed)

    def fit_one_epoch(seed):
        # max_iter=1 deliberately prevents convergence, hence the
        # expected ConvergenceWarning.
        model = Estimator(random_state=seed, max_iter=1)
        with pytest.warns(ConvergenceWarning):
            model.fit(X, y)
        assert model.n_iter_ == 1
        return model.coef_

    # Same hyper-parameters, same training set, same seed: the two fits
    # must be deterministic and agree exactly.
    coef_run_one = fit_one_epoch(global_random_seed)
    coef_run_two = fit_one_epoch(global_random_seed)
    assert_allclose(coef_run_one, coef_run_two)

    # A different seed changes the random shuffling of the dataset, so a
    # single epoch ends at a clearly different point in parameter space.
    coef_other_seed = fit_one_epoch(global_random_seed + 1)
    assert np.abs(coef_run_one - coef_other_seed).max() > 1.0
4 changes: 2 additions & 2 deletions sklearn/utils/_seq_dataset.pyx.tp
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ cdef class SequentialDataset{{name_suffix}}:
Number of samples in the dataset.

seed : np.uint32_t
Seed used for random sampling.

Seed used for random sampling. This attribute is modified at each call to the
`random` method.
"""

cdef void next(self, {{c_type}} **x_data_ptr, int **x_ind_ptr,
Expand Down