Skip to content

Commit 430c208

Browse files
Justin-Hubersmariejnothman
authored
TST Python wrapper and tests for sklearn/svm/newrand/ (#17157)
* Implemented issue #16862 * Implemented issue #16862 * Ready for pull request * Cleaned up redundancies * Removed test_set_seed_wrap for now * Update sklearn/svm/tests/test_bounds.py Co-authored-by: smarie <sylvain.marie@schneider-electric.com> * Review feedback changes * Style fixes * np.quantile -> np.percentile * Update sklearn/svm/src/newrand/newrand.h Merged suggestion Co-authored-by: smarie <sylvain.marie@schneider-electric.com> * Update sklearn/svm/tests/test_bounds.py Merge suggestions Co-authored-by: smarie <sylvain.marie@schneider-electric.com> * linting fix * linting fix * linting fix * Added deterministic test for mt19937 in svm/newrand.h * Linting fix * Review changes * Update sklearn/svm/tests/test_bounds.py Co-authored-by: smarie <sylvain.marie@schneider-electric.com> * Update sklearn/svm/tests/test_bounds.py Co-authored-by: Joel Nothman <joel.nothman@gmail.com> * Update sklearn/svm/tests/test_bounds.py Co-authored-by: Joel Nothman <joel.nothman@gmail.com> * Update test_bounds.py * Linting fixes Co-authored-by: smarie <sylvain.marie@schneider-electric.com> Co-authored-by: Joel Nothman <joel.nothman@gmail.com>
1 parent 7eff8ae commit 430c208

File tree

4 files changed

+97
-13
lines changed

4 files changed

+97
-13
lines changed

sklearn/svm/_newrand.pyx

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
"""
2+
Wrapper for newrand.h
3+
4+
"""
5+
6+
cdef extern from "newrand.h":
7+
void set_seed(unsigned int)
8+
unsigned int bounded_rand_int(unsigned int)
9+
10+
def set_seed_wrap(unsigned int custom_seed):
11+
set_seed(custom_seed)
12+
13+
def bounded_rand_int_wrap(unsigned int range_):
14+
return bounded_rand_int(range_)

sklearn/svm/setup.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,15 @@ def configuration(parent_package='', top_path=None):
1010

1111
config.add_subpackage('tests')
1212

13+
# newrand wrappers
14+
config.add_extension('_newrand',
15+
sources=['_newrand.pyx'],
16+
include_dirs=[numpy.get_include(),
17+
join('src', 'newrand')],
18+
depends=[join('src', 'newrand', 'newrand.h')],
19+
language='c++',
20+
)
21+
1322
# Section LibSVM
1423

1524
# we compile both libsvm and libsvm_sparse

sklearn/svm/src/newrand/newrand.h

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,43 +5,34 @@
55
libsvm and liblinear.
66
Sylvain Marie, Schneider Electric
77
See <https://github.com/scikit-learn/scikit-learn/pull/13511#issuecomment-481729756>
8-
98
*/
109
#ifndef _NEWRAND_H
1110
#define _NEWRAND_H
1211

1312
#ifdef __cplusplus
13+
#include <random> // needed for cython to generate a .cpp file from newrand.h
1414
extern "C" {
1515
#endif
1616

1717
// Scikit-Learn-specific random number generator replacing `rand()` originally
1818
// used in LibSVM / LibLinear, to ensure the same behaviour on windows-linux,
1919
// with increased speed
2020
// - (1) Init a `mt_rand` object
21-
#if INT_MAX == 0x7FFFFFFF
2221
std::mt19937 mt_rand(std::mt19937::default_seed);
23-
#elif INT_MAX == 0x7FFFFFFFFFFFFFFF
24-
std::mt19937_64 mt_rand(std::mt19937::default_seed);
25-
#else
26-
info("Random number generator is not fixed for this system. Please report issue. INT_MAX=%d\n", INT_MAX);
27-
exit(1);
28-
#endif
2922

3023
// - (2) public `set_seed()` function that should be used instead of `srand()` to set a new seed.
3124
void set_seed(unsigned custom_seed) {
3225
mt_rand.seed(custom_seed);
3326
}
3427

3528
// - (3) New internal `bounded_rand_int` function, used instead of rand() everywhere.
36-
inline int bounded_rand_int(int orig_range) {
37-
// "LibSVM / LibLinear Original way" - make a 31bit or 63bit positive
29+
inline uint32_t bounded_rand_int(uint32_t range) {
30+
// "LibSVM / LibLinear Original way" - make a 31bit positive
3831
// random number and use modulo to make it fit in the range
39-
// return abs( (int)mt_rand()) % orig_range;
32+
// return abs( (int)mt_rand()) % range;
4033

4134
// "Better way": tweaked Lemire post-processor
4235
// from http://www.pcg-random.org/posts/bounded-rands.html
43-
// TODO how could we make this casting safer, raising an error if lost information?
44-
uint32_t range = uint32_t(orig_range);
4536
uint32_t x = mt_rand();
4637
uint64_t m = uint64_t(x) * uint64_t(range);
4738
uint32_t l = uint32_t(m);

sklearn/svm/tests/test_bounds.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import numpy as np
22
from scipy import sparse as sp
3+
from scipy import stats
34

45
import pytest
56

67
from sklearn.svm._bounds import l1_min_c
78
from sklearn.svm import LinearSVC
89
from sklearn.linear_model import LogisticRegression
10+
from sklearn.svm._newrand import set_seed_wrap, bounded_rand_int_wrap
911

1012
from sklearn.utils._testing import assert_raise_message
1113

@@ -74,3 +76,71 @@ def test_ill_posed_min_c():
7476
def test_unsupported_loss():
7577
with pytest.raises(ValueError):
7678
l1_min_c(dense_X, Y1, loss='l1')
79+
80+
81+
_MAX_UNSIGNED_INT = 4294967295
82+
83+
84+
@pytest.mark.parametrize('seed, val',
85+
[(None, 81),
86+
(0, 54),
87+
(_MAX_UNSIGNED_INT, 9)])
88+
def test_newrand_set_seed(seed, val):
89+
"""Test that `set_seed` produces deterministic results"""
90+
if seed is not None:
91+
set_seed_wrap(seed)
92+
x = bounded_rand_int_wrap(100)
93+
assert x == val, f'Expected {val} but got {x} instead'
94+
95+
96+
@pytest.mark.parametrize('seed',
97+
[-1, _MAX_UNSIGNED_INT + 1])
98+
def test_newrand_set_seed_overflow(seed):
99+
"""Test that `set_seed_wrap` is defined for unsigned 32bits ints"""
100+
with pytest.raises(OverflowError):
101+
set_seed_wrap(seed)
102+
103+
104+
@pytest.mark.parametrize('range_, n_pts',
105+
[(_MAX_UNSIGNED_INT, 10000), (100, 25)])
106+
def test_newrand_bounded_rand_int(range_, n_pts):
107+
"""Test that `bounded_rand_int` follows a uniform distribution"""
108+
n_iter = 100
109+
ks_pvals = []
110+
uniform_dist = stats.uniform(loc=0, scale=range_)
111+
# perform multiple samplings to make chance of outlier sampling negligible
112+
for _ in range(n_iter):
113+
# Deterministic random sampling
114+
sample = [bounded_rand_int_wrap(range_) for _ in range(n_pts)]
115+
res = stats.kstest(sample, uniform_dist.cdf)
116+
ks_pvals.append(res.pvalue)
117+
# Null hypothesis = samples come from an uniform distribution.
118+
# Under the null hypothesis, p-values should be uniformly distributed
119+
# and not concentrated on low values
120+
# (this may seem counter-intuitive but is backed by multiple refs)
121+
# So we can do two checks:
122+
123+
# (1) check uniformity of p-values
124+
uniform_p_vals_dist = stats.uniform(loc=0, scale=1)
125+
res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf)
126+
assert res_pvals.pvalue > 0.05, (
127+
"Null hypothesis rejected: generated random numbers are not uniform."
128+
" Details: the (meta) p-value of the test of uniform distribution"
129+
f" of p-values is {res_pvals.pvalue} which is not > 0.05")
130+
131+
# (2) (safety belt) check that 90% of p-values are above 0.05
132+
min_10pct_pval = np.percentile(ks_pvals, q=10)
133+
# lower 10th quantile pvalue <= 0.05 means that the test rejects the
134+
# null hypothesis that the sample came from the uniform distribution
135+
assert min_10pct_pval > 0.05, (
136+
"Null hypothesis rejected: generated random numbers are not uniform. "
137+
f"Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05."
138+
)
139+
140+
141+
@pytest.mark.parametrize('range_',
142+
[-1, _MAX_UNSIGNED_INT + 1])
143+
def test_newrand_bounded_rand_int_limits(range_):
144+
"""Test that `bounded_rand_int_wrap` is defined for unsigned 32bits ints"""
145+
with pytest.raises(OverflowError):
146+
bounded_rand_int_wrap(range_)

0 commit comments

Comments
 (0)