diff --git a/sklearn/svm/_newrand.pyx b/sklearn/svm/_newrand.pyx new file mode 100644 index 0000000000000..0f82cbcbfdd6f --- /dev/null +++ b/sklearn/svm/_newrand.pyx @@ -0,0 +1,14 @@ +""" +Wrapper for newrand.h + +""" + +cdef extern from "newrand.h": + void set_seed(unsigned int) + unsigned int bounded_rand_int(unsigned int) + +def set_seed_wrap(unsigned int custom_seed): + set_seed(custom_seed) + +def bounded_rand_int_wrap(unsigned int range_): + return bounded_rand_int(range_) diff --git a/sklearn/svm/setup.py b/sklearn/svm/setup.py index 989e6c7d6a316..395e398437558 100644 --- a/sklearn/svm/setup.py +++ b/sklearn/svm/setup.py @@ -10,6 +10,15 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('tests') + # newrand wrappers + config.add_extension('_newrand', + sources=['_newrand.pyx'], + include_dirs=[numpy.get_include(), + join('src', 'newrand')], + depends=[join('src', 'newrand', 'newrand.h')], + language='c++', + ) + # Section LibSVM # we compile both libsvm and libsvm_sparse diff --git a/sklearn/svm/src/newrand/newrand.h b/sklearn/svm/src/newrand/newrand.h index 7cd7b4c9fbf2b..e01bea99ec17e 100644 --- a/sklearn/svm/src/newrand/newrand.h +++ b/sklearn/svm/src/newrand/newrand.h @@ -5,12 +5,12 @@ libsvm and liblinear. Sylvain Marie, Schneider Electric See - */ #ifndef _NEWRAND_H #define _NEWRAND_H #ifdef __cplusplus +#include // needed for cython to generate a .cpp file from newrand.h extern "C" { #endif @@ -18,14 +18,7 @@ extern "C" { // used in LibSVM / LibLinear, to ensure the same behaviour on windows-linux, // with increased speed // - (1) Init a `mt_rand` object -#if INT_MAX == 0x7FFFFFFF std::mt19937 mt_rand(std::mt19937::default_seed); -#elif INT_MAX == 0x7FFFFFFFFFFFFFFF -std::mt19937_64 mt_rand(std::mt19937::default_seed); -#else -info("Random number generator is not fixed for this system. Please report issue. INT_MAX=%d\n", INT_MAX); -exit(1); -#endif // - (2) public `set_seed()` function that should be used instead of `srand()` to set a new seed. void set_seed(unsigned custom_seed) { @@ -33,15 +26,13 @@ void set_seed(unsigned custom_seed) { } // - (3) New internal `bounded_rand_int` function, used instead of rand() everywhere. -inline int bounded_rand_int(int orig_range) { - // "LibSVM / LibLinear Original way" - make a 31bit or 63bit positive +inline uint32_t bounded_rand_int(uint32_t range) { + // "LibSVM / LibLinear Original way" - make a 31bit positive // random number and use modulo to make it fit in the range - // return abs( (int)mt_rand()) % orig_range; + // return abs( (int)mt_rand()) % range; // "Better way": tweaked Lemire post-processor // from http://www.pcg-random.org/posts/bounded-rands.html - // TODO how could we make this casting safer, raising an error if lost information? - uint32_t range = uint32_t(orig_range); uint32_t x = mt_rand(); uint64_t m = uint64_t(x) * uint64_t(range); uint32_t l = uint32_t(m); diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 8454ebf64de1a..423d5ed7a7fba 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -1,11 +1,13 @@ import numpy as np from scipy import sparse as sp +from scipy import stats import pytest from sklearn.svm._bounds import l1_min_c from sklearn.svm import LinearSVC from sklearn.linear_model import LogisticRegression +from sklearn.svm._newrand import set_seed_wrap, bounded_rand_int_wrap from sklearn.utils._testing import assert_raise_message @@ -74,3 +76,71 @@ def test_ill_posed_min_c(): def test_unsupported_loss(): with pytest.raises(ValueError): l1_min_c(dense_X, Y1, loss='l1') + + +_MAX_UNSIGNED_INT = 4294967295 + + +@pytest.mark.parametrize('seed, val', + [(None, 81), + (0, 54), + (_MAX_UNSIGNED_INT, 9)]) +def test_newrand_set_seed(seed, val): + """Test that `set_seed` produces deterministic results""" + if seed is not None: + set_seed_wrap(seed) + x = bounded_rand_int_wrap(100) + assert x == val, f'Expected {val} but got {x} instead' + + +@pytest.mark.parametrize('seed', + [-1, _MAX_UNSIGNED_INT + 1]) +def test_newrand_set_seed_overflow(seed): + """Test that `set_seed_wrap` is defined for unsigned 32bits ints""" + with pytest.raises(OverflowError): + set_seed_wrap(seed) + + +@pytest.mark.parametrize('range_, n_pts', + [(_MAX_UNSIGNED_INT, 10000), (100, 25)]) +def test_newrand_bounded_rand_int(range_, n_pts): + """Test that `bounded_rand_int` follows a uniform distribution""" + n_iter = 100 + ks_pvals = [] + uniform_dist = stats.uniform(loc=0, scale=range_) + # perform multiple samplings to make chance of outlier sampling negligible + for _ in range(n_iter): + # Deterministic random sampling + sample = [bounded_rand_int_wrap(range_) for _ in range(n_pts)] + res = stats.kstest(sample, uniform_dist.cdf) + ks_pvals.append(res.pvalue) + # Null hypothesis = samples come from an uniform distribution. + # Under the null hypothesis, p-values should be uniformly distributed + # and not concentrated on low values + # (this may seem counter-intuitive but is backed by multiple refs) + # So we can do two checks: + + # (1) check uniformity of p-values + uniform_p_vals_dist = stats.uniform(loc=0, scale=1) + res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf) + assert res_pvals.pvalue > 0.05, ( + "Null hypothesis rejected: generated random numbers are not uniform." + " Details: the (meta) p-value of the test of uniform distribution" + f" of p-values is {res_pvals.pvalue} which is not > 0.05") + + # (2) (safety belt) check that 90% of p-values are above 0.05 + min_10pct_pval = np.percentile(ks_pvals, q=10) + # lower 10th quantile pvalue <= 0.05 means that the test rejects the + # null hypothesis that the sample came from the uniform distribution + assert min_10pct_pval > 0.05, ( + "Null hypothesis rejected: generated random numbers are not uniform. " + f"Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05." + ) + + +@pytest.mark.parametrize('range_', + [-1, _MAX_UNSIGNED_INT + 1]) +def test_newrand_bounded_rand_int_limits(range_): + """Test that `bounded_rand_int_wrap` is defined for unsigned 32bits ints""" + with pytest.raises(OverflowError): + bounded_rand_int_wrap(range_)