|
1 | 1 | import numpy as np
|
2 | 2 | from scipy import sparse as sp
|
| 3 | +from scipy import stats |
3 | 4 |
|
4 | 5 | import pytest
|
5 | 6 |
|
6 | 7 | from sklearn.svm._bounds import l1_min_c
|
7 | 8 | from sklearn.svm import LinearSVC
|
8 | 9 | from sklearn.linear_model import LogisticRegression
|
| 10 | +from sklearn.svm._newrand import set_seed_wrap, bounded_rand_int_wrap |
9 | 11 |
|
10 | 12 | from sklearn.utils._testing import assert_raise_message
|
11 | 13 |
|
@@ -74,3 +76,71 @@ def test_ill_posed_min_c():
|
def test_unsupported_loss():
    """Check that `l1_min_c` raises ValueError when called with loss='l1'."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        l1_min_c(dense_X, Y1, loss='l1')
|
| 79 | + |
| 80 | + |
| 81 | +_MAX_UNSIGNED_INT = 4294967295 |
| 82 | + |
| 83 | + |
@pytest.mark.parametrize('seed, val',
                         [(None, 81),
                          (0, 54),
                          (_MAX_UNSIGNED_INT, 9)])
def test_newrand_set_seed(seed, val):
    """Check the value drawn by `bounded_rand_int_wrap(100)` for each seed.

    Parameters
    ----------
    seed : int or None
        Value handed to `set_seed_wrap` before drawing. With ``None`` the
        seeding call is skipped altogether.
    val : int
        Value the single draw is expected to equal.
    """
    must_reseed = seed is not None
    if must_reseed:
        set_seed_wrap(seed)

    x = bounded_rand_int_wrap(100)
    assert x == val, f'Expected {val} but got {x} instead'
| 94 | + |
| 95 | + |
@pytest.mark.parametrize('seed',
                         [-1, _MAX_UNSIGNED_INT + 1])
def test_newrand_set_seed_overflow(seed):
    """Check that `set_seed_wrap` raises OverflowError for out-of-range seeds.

    The two seeds tried sit just outside the unsigned 32-bit range: one
    below zero and one above `_MAX_UNSIGNED_INT`.
    """
    expect_overflow = pytest.raises(OverflowError)
    with expect_overflow:
        set_seed_wrap(seed)
| 102 | + |
| 103 | + |
@pytest.mark.parametrize('range_, n_pts',
                         [(_MAX_UNSIGNED_INT, 10000), (100, 25)])
def test_newrand_bounded_rand_int(range_, n_pts):
    """Check that values from `bounded_rand_int_wrap` look uniform.

    Parameters
    ----------
    range_ : int
        Argument passed to `bounded_rand_int_wrap`; also used as the scale
        of the reference uniform distribution.
    n_pts : int
        Number of values drawn for each Kolmogorov-Smirnov test.
    """
    n_rounds = 100
    reference_cdf = stats.uniform(loc=0, scale=range_).cdf

    def _ks_pvalue_of_one_batch():
        # Draw one batch and test it against the reference uniform CDF.
        batch = [bounded_rand_int_wrap(range_) for _ in range(n_pts)]
        return stats.kstest(batch, reference_cdf).pvalue

    # Repeat the sampling many times so that a single unlucky batch has a
    # negligible effect on the outcome.
    batch_pvals = [_ks_pvalue_of_one_batch() for _ in range(n_rounds)]

    # Null hypothesis: the samples come from a uniform distribution.
    # Under that hypothesis the p-values are themselves uniformly
    # distributed rather than concentrated on low values, which gives two
    # things to check.

    # (1) The p-values are uniform on [0, 1].
    unit_uniform_cdf = stats.uniform(loc=0, scale=1).cdf
    res_pvals = stats.kstest(batch_pvals, unit_uniform_cdf)
    assert res_pvals.pvalue > 0.05, (
        "Null hypothesis rejected: generated random numbers are not uniform."
        " Details: the (meta) p-value of the test of uniform distribution"
        f" of p-values is {res_pvals.pvalue} which is not > 0.05")

    # (2) Safety belt: 90% of the p-values are above 0.05. A 10th
    # percentile at or below 0.05 counts as rejecting the null hypothesis.
    min_10pct_pval = np.percentile(batch_pvals, q=10)
    assert min_10pct_pval > 0.05, (
        "Null hypothesis rejected: generated random numbers are not uniform. "
        f"Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05."
    )
| 139 | + |
| 140 | + |
@pytest.mark.parametrize('range_',
                         [-1, _MAX_UNSIGNED_INT + 1])
def test_newrand_bounded_rand_int_limits(range_):
    """Check that `bounded_rand_int_wrap` raises OverflowError out of range.

    The two arguments tried sit just outside the unsigned 32-bit range: one
    below zero and one above `_MAX_UNSIGNED_INT`.
    """
    expect_overflow = pytest.raises(OverflowError)
    with expect_overflow:
        bounded_rand_int_wrap(range_)
0 commit comments