From 9069c3f15f277b927de7e2df797c414a8d95cacf Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Thu, 7 May 2020 12:19:45 -0500 Subject: [PATCH 01/22] Implemented issue #16862 --- sklearn/svm/_newrand.pyx | 14 +++++++++++++ sklearn/svm/setup.py | 11 ++++++++++ sklearn/svm/src/newrand/newrand.h | 2 +- sklearn/svm/tests/test_bounds.py | 35 +++++++++++++++++++++++++++++++ 4 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 sklearn/svm/_newrand.pyx diff --git a/sklearn/svm/_newrand.pyx b/sklearn/svm/_newrand.pyx new file mode 100644 index 0000000000000..627106be33022 --- /dev/null +++ b/sklearn/svm/_newrand.pyx @@ -0,0 +1,14 @@ +""" +Wrapper for newrand.h + +""" + +cdef extern from "newrand-cache.h": + void set_seed(int) + int bounded_rand_int(int) + +def set_seed_wrap(unsigned custom_seed): + set_seed(custom_seed) + +def bounded_rand_int_wrap(int orig_range): + return bounded_rand_int(orig_range) diff --git a/sklearn/svm/setup.py b/sklearn/svm/setup.py index 989e6c7d6a316..52f10ddcae1a8 100644 --- a/sklearn/svm/setup.py +++ b/sklearn/svm/setup.py @@ -41,6 +41,17 @@ def configuration(parent_package='', top_path=None): depends=libsvm_depends, ) + config.add_extension('_newrand', + sources=['_newrand.pyx'], + include_dirs=[numpy.get_include(), + join('src', 'libsvm'), + join('src', 'newrand')], + libraries=['libsvm-skl'], + depends=[join('src', 'newrand', 'newrand.h'), + join('src', 'newrand', 'newrand-cache.cpp'), + join('src', 'newrand', 'newrand-cache.h')], + ) + # liblinear module libraries = [] if os.name == 'posix': diff --git a/sklearn/svm/src/newrand/newrand.h b/sklearn/svm/src/newrand/newrand.h index 7cd7b4c9fbf2b..6968c13eb6a38 100644 --- a/sklearn/svm/src/newrand/newrand.h +++ b/sklearn/svm/src/newrand/newrand.h @@ -33,7 +33,7 @@ void set_seed(unsigned custom_seed) { } // - (3) New internal `bounded_rand_int` function, used instead of rand() everywhere. -inline int bounded_rand_int(int orig_range) { +int bounded_rand_int(int orig_range) { // "LibSVM / LibLinear Original way" - make a 31bit or 63bit positive // random number and use modulo to make it fit in the range // return abs( (int)mt_rand()) % orig_range; diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 8454ebf64de1a..dc37106e43eb4 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -1,10 +1,12 @@ import numpy as np from scipy import sparse as sp +from scipy import stats import pytest from sklearn.svm._bounds import l1_min_c from sklearn.svm import LinearSVC +from sklearn.svm._newrand import set_seed_wrap, bounded_rand_int_wrap from sklearn.linear_model import LogisticRegression from sklearn.utils._testing import assert_raise_message @@ -74,3 +76,36 @@ def test_ill_posed_min_c(): def test_unsupported_loss(): with pytest.raises(ValueError): l1_min_c(dense_X, Y1, loss='l1') + + +def test_set_seed(): + def _test(seed, val): + # TODO currently causes 99% coverage in test_bounds.py + if seed is not None: + set_seed_wrap(seed) + x = bounded_rand_int_wrap(100) + assert(x == val), 'Expected {} but got {} instead'.format(val, x) + + # _test(None, 81) + # TODO should be default seed for std::mt19937, + # but different results on jupyter lab + _test(5489, 81) # default seed for std::mt19937 + _test(0, 54) + _test(4294967295, 9) # max unsigned int size + + +def test_bounded_rand_int(): + orig_range = 2147483647 # max int as input to bounded_rand_int + n_pts = 1000 + n_iter = 100 + ks_pvals = [] + uniform_dist = stats.uniform(loc=0, scale=orig_range) + for _ in range(n_iter): + # Deterministic random sampling + sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)] + res = stats.kstest(sample, uniform_dist.cdf) + ks_pvals.append(res.pvalue) + avg_pval = np.mean(ks_pvals) + assert(avg_pval > 0.05),\ + 'p-value of {} not > 0.05, '\ + 'therefore distribution isn\'t uniform'.format(avg_pval) From 7a92336480a48e566e420d7c44c2269a6c47c0e1 Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Thu, 7 May 2020 12:22:19 -0500 Subject: [PATCH 02/22] Implemented issue #16862 --- sklearn/svm/tests/test_bounds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index dc37106e43eb4..6e93c0c39d2af 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -86,9 +86,9 @@ def _test(seed, val): x = bounded_rand_int_wrap(100) assert(x == val), 'Expected {} but got {} instead'.format(val, x) - # _test(None, 81) # TODO should be default seed for std::mt19937, # but different results on jupyter lab + # _test(None, 81) _test(5489, 81) # default seed for std::mt19937 _test(0, 54) _test(4294967295, 9) # max unsigned int size From 901a1b9ce1b7434391a771e3abaeb724eb58033e Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Sat, 9 May 2020 18:38:04 -0500 Subject: [PATCH 03/22] Ready for pull request --- sklearn/svm/_newrand.pyx | 5 +++-- sklearn/svm/setup.py | 5 ++--- sklearn/svm/src/newrand/newrand.h | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/svm/_newrand.pyx b/sklearn/svm/_newrand.pyx index 627106be33022..afbcbb1411db7 100644 --- a/sklearn/svm/_newrand.pyx +++ b/sklearn/svm/_newrand.pyx @@ -1,10 +1,11 @@ +# language: c++ """ Wrapper for newrand.h """ -cdef extern from "newrand-cache.h": - void set_seed(int) +cdef extern from "newrand.h": + void set_seed(unsigned) int bounded_rand_int(int) def set_seed_wrap(unsigned custom_seed): diff --git a/sklearn/svm/setup.py b/sklearn/svm/setup.py index 52f10ddcae1a8..92570cb9bf772 100644 --- a/sklearn/svm/setup.py +++ b/sklearn/svm/setup.py @@ -47,9 +47,8 @@ def configuration(parent_package='', top_path=None): join('src', 'libsvm'), join('src', 'newrand')], libraries=['libsvm-skl'], - depends=[join('src', 'newrand', 'newrand.h'), - join('src', 'newrand', 'newrand-cache.cpp'), - join('src', 'newrand', 'newrand-cache.h')], + depends=[join('src', 'newrand', 'newrand.h')], + language='c++', ) # liblinear module diff --git a/sklearn/svm/src/newrand/newrand.h b/sklearn/svm/src/newrand/newrand.h index 6968c13eb6a38..255f5c0310a7e 100644 --- a/sklearn/svm/src/newrand/newrand.h +++ b/sklearn/svm/src/newrand/newrand.h @@ -5,12 +5,12 @@ libsvm and liblinear. Sylvain Marie, Schneider Electric See - */ #ifndef _NEWRAND_H #define _NEWRAND_H #ifdef __cplusplus +#include extern "C" { #endif @@ -33,7 +33,7 @@ void set_seed(unsigned custom_seed) { } // - (3) New internal `bounded_rand_int` function, used instead of rand() everywhere. -int bounded_rand_int(int orig_range) { +inline int bounded_rand_int(int orig_range) { // "LibSVM / LibLinear Original way" - make a 31bit or 63bit positive // random number and use modulo to make it fit in the range // return abs( (int)mt_rand()) % orig_range; @@ -65,4 +65,4 @@ int bounded_rand_int(int orig_range) { } #endif -#endif /* _NEWRAND_H */ +#endif /* _NEWRAND_H */ \ No newline at end of file From 9daf268d6e0d1a6ddf0946864c32353c09c9d856 Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Sat, 9 May 2020 18:44:53 -0500 Subject: [PATCH 04/22] Cleaned up redundancies --- sklearn/svm/_newrand.pyx | 1 - sklearn/svm/setup.py | 19 +++++++++---------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/sklearn/svm/_newrand.pyx b/sklearn/svm/_newrand.pyx index afbcbb1411db7..f392f44eb7b1b 100644 --- a/sklearn/svm/_newrand.pyx +++ b/sklearn/svm/_newrand.pyx @@ -1,4 +1,3 @@ -# language: c++ """ Wrapper for newrand.h diff --git a/sklearn/svm/setup.py b/sklearn/svm/setup.py index 92570cb9bf772..395e398437558 100644 --- a/sklearn/svm/setup.py +++ b/sklearn/svm/setup.py @@ -10,6 +10,15 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('tests') + # newrand wrappers + config.add_extension('_newrand', + sources=['_newrand.pyx'], + include_dirs=[numpy.get_include(), + join('src', 'newrand')], + depends=[join('src', 'newrand', 'newrand.h')], + language='c++', + ) + # Section LibSVM # we compile both libsvm and libsvm_sparse @@ -41,16 +50,6 @@ def configuration(parent_package='', top_path=None): depends=libsvm_depends, ) - config.add_extension('_newrand', - sources=['_newrand.pyx'], - include_dirs=[numpy.get_include(), - join('src', 'libsvm'), - join('src', 'newrand')], - libraries=['libsvm-skl'], - depends=[join('src', 'newrand', 'newrand.h')], - language='c++', - ) - # liblinear module libraries = [] if os.name == 'posix': From 4734ad1c2748c82bc451477e177d0531b2bb81d2 Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Mon, 11 May 2020 14:02:13 -0500 Subject: [PATCH 05/22] Removed test_set_seed_wrap for now --- sklearn/svm/tests/test_bounds.py | 60 +++++++++++++++++--------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 6e93c0c39d2af..88f2ab1888685 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -6,8 +6,8 @@ from sklearn.svm._bounds import l1_min_c from sklearn.svm import LinearSVC -from sklearn.svm._newrand import set_seed_wrap, bounded_rand_int_wrap from sklearn.linear_model import LogisticRegression +from sklearn.svm._newrand import bounded_rand_int_wrap from sklearn.utils._testing import assert_raise_message @@ -78,34 +78,38 @@ def test_unsupported_loss(): l1_min_c(dense_X, Y1, loss='l1') -def test_set_seed(): - def _test(seed, val): - # TODO currently causes 99% coverage in test_bounds.py - if seed is not None: - set_seed_wrap(seed) - x = bounded_rand_int_wrap(100) - assert(x == val), 'Expected {} but got {} instead'.format(val, x) +# TODO test deterministic results on differing systems +# def test_set_seed(): +# def _test(seed, val): +# # TODO currently causes 99% coverage in test_bounds.py +# if seed is not None: +# set_seed_wrap(seed) +# x = bounded_rand_int_wrap(100) +# assert(x == val), 'Expected {} but got {} instead'.format(val, x) - # TODO should be default seed for std::mt19937, - # but different results on jupyter lab - # _test(None, 81) - _test(5489, 81) # default seed for std::mt19937 - _test(0, 54) - _test(4294967295, 9) # max unsigned int size +# # TODO should be default seed for std::mt19937, +# # but different results on jupyter lab +# # _test(None, 81) +# _test(5489, 81) # default seed for std::mt19937 +# _test(0, 54) +# _test(4294967295, 9) # max unsigned int size def test_bounded_rand_int(): - orig_range = 2147483647 # max int as input to bounded_rand_int - n_pts = 1000 - n_iter = 100 - ks_pvals = [] - uniform_dist = stats.uniform(loc=0, scale=orig_range) - for _ in range(n_iter): - # Deterministic random sampling - sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)] - res = stats.kstest(sample, uniform_dist.cdf) - ks_pvals.append(res.pvalue) - avg_pval = np.mean(ks_pvals) - assert(avg_pval > 0.05),\ - 'p-value of {} not > 0.05, '\ - 'therefore distribution isn\'t uniform'.format(avg_pval) + def _test(orig_range): + n_pts = 1000 + n_iter = 100 + ks_pvals = [] + uniform_dist = stats.uniform(loc=0, scale=orig_range) + # avg over multiple tests to make chance of outlier result negligible + for _ in range(n_iter): + # Deterministic random sampling + sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)] + res = stats.kstest(sample, uniform_dist.cdf) + ks_pvals.append(res.pvalue) + avg_pval = np.mean(ks_pvals) + assert(avg_pval > 0.05),\ + 'p-value of {} not > 0.05, '\ + 'therefore distribution isn\'t uniform'.format(avg_pval) + _test(2147483647) # max int + _test(100) # arbitrary less than max max int From edfaf72d236811a1bb473e0796238683187cad15 Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Thu, 14 May 2020 12:56:17 -0500 Subject: [PATCH 06/22] Update sklearn/svm/tests/test_bounds.py Co-authored-by: smarie --- sklearn/svm/tests/test_bounds.py | 37 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 88f2ab1888685..b3a360af1880f 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -95,21 +95,22 @@ def test_unsupported_loss(): # _test(4294967295, 9) # max unsigned int size -def test_bounded_rand_int(): - def _test(orig_range): - n_pts = 1000 - n_iter = 100 - ks_pvals = [] - uniform_dist = stats.uniform(loc=0, scale=orig_range) - # avg over multiple tests to make chance of outlier result negligible - for _ in range(n_iter): - # Deterministic random sampling - sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)] - res = stats.kstest(sample, uniform_dist.cdf) - ks_pvals.append(res.pvalue) - avg_pval = np.mean(ks_pvals) - assert(avg_pval > 0.05),\ - 'p-value of {} not > 0.05, '\ - 'therefore distribution isn\'t uniform'.format(avg_pval) - _test(2147483647) # max int - _test(100) # arbitrary less than max max int +_MAX_INT = 2147483647 + + +@pytest.mark.parametrize("orig_range", [_MAX_INT, 100]) +def test_bounded_rand_int(orig_range): + n_pts = 1000 + n_iter = 100 + ks_pvals = [] + uniform_dist = stats.uniform(loc=0, scale=orig_range) + # avg over multiple tests to make chance of outlier result negligible + for _ in range(n_iter): + # Deterministic random sampling + sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)] + res = stats.kstest(sample, uniform_dist.cdf) + ks_pvals.append(res.pvalue) + avg_pval = np.mean(ks_pvals) + assert(avg_pval > 0.05),\ + 'p-value of {} not > 0.05, '\ + 'therefore distribution isn\'t uniform'.format(avg_pval) From d8f720dcf94ade6487a0421516b8b6ea9b5d14fc Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Fri, 15 May 2020 17:07:48 -0500 Subject: [PATCH 07/22] Review feedback changes --- sklearn/svm/tests/test_bounds.py | 38 +++++++++++++++++--------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 88f2ab1888685..3c9b737872cca 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -95,21 +95,23 @@ def test_unsupported_loss(): # _test(4294967295, 9) # max unsigned int size -def test_bounded_rand_int(): - def _test(orig_range): - n_pts = 1000 - n_iter = 100 - ks_pvals = [] - uniform_dist = stats.uniform(loc=0, scale=orig_range) - # avg over multiple tests to make chance of outlier result negligible - for _ in range(n_iter): - # Deterministic random sampling - sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)] - res = stats.kstest(sample, uniform_dist.cdf) - ks_pvals.append(res.pvalue) - avg_pval = np.mean(ks_pvals) - assert(avg_pval > 0.05),\ - 'p-value of {} not > 0.05, '\ - 'therefore distribution isn\'t uniform'.format(avg_pval) - _test(2147483647) # max int - _test(100) # arbitrary less than max max int +_MAX_INT = 2147483647 +@pytest.mark.parametrize('orig_range, n_pts', + [(_MAX_INT, 10000), (100, 10)]) +def test_bounded_rand_int(orig_range, n_pts): + n_iter = 100 + ks_pvals = [] + uniform_dist = stats.uniform(loc=0, scale=orig_range) + # perform multiple samplings to make chance of outlier sampling negligible + for _ in range(n_iter): + # Deterministic random sampling + sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)] + res = stats.kstest(sample, uniform_dist.cdf) + ks_pvals.append(res.pvalue) + min_10pct_pval = np.quantile(ks_pvals, q=0.1) + # lower 10th quantile pvalue <= 0.05 means that the test rejects the + # null hypothesis that the sample came from the uniform distribution + assert(min_10pct_pval > 0.05),\ + 'lower 10th quantile p-value of {} not > 0.05, '\ + 'therefore distribution isn\'t sufficiently uniform'\ + .format(min_10pct_pval) From b95d87f8f8653c0f94f08055ab5f64fd1e2ca4e7 Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Fri, 15 May 2020 22:39:37 -0500 Subject: [PATCH 08/22] Style fixes --- sklearn/svm/tests/test_bounds.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 3c9b737872cca..1456fd02f6bfb 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -96,6 +96,8 @@ def test_unsupported_loss(): _MAX_INT = 2147483647 + + @pytest.mark.parametrize('orig_range, n_pts', [(_MAX_INT, 10000), (100, 10)]) def test_bounded_rand_int(orig_range, n_pts): From e0e8e6d19a9a517900cf997bf4cfaab7eb88e40e Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Sun, 17 May 2020 16:06:39 -0500 Subject: [PATCH 09/22] np.quantile -> np.percentile --- sklearn/svm/tests/test_bounds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 1456fd02f6bfb..9b4fc900341c6 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -110,7 +110,7 @@ def test_bounded_rand_int(orig_range, n_pts): sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)] res = stats.kstest(sample, uniform_dist.cdf) ks_pvals.append(res.pvalue) - min_10pct_pval = np.quantile(ks_pvals, q=0.1) + min_10pct_pval = np.percentile(ks_pvals, q=10) # lower 10th quantile pvalue <= 0.05 means that the test rejects the # null hypothesis that the sample came from the uniform distribution assert(min_10pct_pval > 0.05),\ From 83fb69385cb58e1904c1f56b1013c19867311377 Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Mon, 22 Jun 2020 17:23:39 -0500 Subject: [PATCH 10/22] Update sklearn/svm/src/newrand/newrand.h Merged suggestion Co-authored-by: smarie --- sklearn/svm/src/newrand/newrand.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/svm/src/newrand/newrand.h b/sklearn/svm/src/newrand/newrand.h index 255f5c0310a7e..0da8f07e2274a 100644 --- a/sklearn/svm/src/newrand/newrand.h +++ b/sklearn/svm/src/newrand/newrand.h @@ -10,7 +10,7 @@ #define _NEWRAND_H #ifdef __cplusplus -#include +#include // needed for cython to generate a .cpp file from newrand.h extern "C" { #endif @@ -65,4 +65,4 @@ inline int bounded_rand_int(int orig_range) { } #endif -#endif /* _NEWRAND_H */ \ No newline at end of file +#endif /* _NEWRAND_H */ From b4c71f48f9efac45e6a2d198cb684ca582be5a3d Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Mon, 22 Jun 2020 17:25:54 -0500 Subject: [PATCH 11/22] Update sklearn/svm/tests/test_bounds.py Merge suggestions Co-authored-by: smarie --- sklearn/svm/tests/test_bounds.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 9b4fc900341c6..1ca64b482368a 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -110,10 +110,24 @@ def test_bounded_rand_int(orig_range, n_pts): sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)] res = stats.kstest(sample, uniform_dist.cdf) ks_pvals.append(res.pvalue) + # Null hypothesis = samples come from an uniform distribution. + # Under the null hypothesis, p-values should be uniformly distributed and not concentrated + # on low values (this may seem counter-intuitive but is backed by multiple refs) + # So we can do two checks: + + # (1) check uniformity of p-values + uniform_p_vals_dist = stats.uniform(loc=0, scale=1) + res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf) + assert res_pvals.pvalue > 0.05,\ + "Null hypothesis rejected: generated random numbers are not uniform."\ + " Details: the (meta) p-value of the test of uniform distribution of p-values"\ + " is {} which is not > 0.05".format(res_pvals.pvalue) + + # (2) (safety belt) check that most p-values are above 0.05 min_10pct_pval = np.percentile(ks_pvals, q=10) # lower 10th quantile pvalue <= 0.05 means that the test rejects the # null hypothesis that the sample came from the uniform distribution assert(min_10pct_pval > 0.05),\ - 'lower 10th quantile p-value of {} not > 0.05, '\ - 'therefore distribution isn\'t sufficiently uniform'\ + "Null hypothesis rejected: generated random numbers are not uniform." \ + " Details: lower 10th quantile p-value of {} not > 0.05."\ .format(min_10pct_pval) From 90a85f35fd12458fba1b8ba1bd6d3b71b211aefc Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Mon, 22 Jun 2020 19:52:53 -0500 Subject: [PATCH 12/22] linting fix --- sklearn/svm/tests/test_bounds.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 1ca64b482368a..3ea7c4d34c426 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -113,8 +113,8 @@ def test_bounded_rand_int(orig_range, n_pts): # Null hypothesis = samples come from an uniform distribution. # Under the null hypothesis, p-values should be uniformly distributed and not concentrated # on low values (this may seem counter-intuitive but is backed by multiple refs) - # So we can do two checks: - + # So we can do two checks: + # (1) check uniformity of p-values uniform_p_vals_dist = stats.uniform(loc=0, scale=1) res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf) @@ -122,7 +122,7 @@ def test_bounded_rand_int(orig_range, n_pts): "Null hypothesis rejected: generated random numbers are not uniform."\ " Details: the (meta) p-value of the test of uniform distribution of p-values"\ " is {} which is not > 0.05".format(res_pvals.pvalue) - + # (2) (safety belt) check that most p-values are above 0.05 min_10pct_pval = np.percentile(ks_pvals, q=10) # lower 10th quantile pvalue <= 0.05 means that the test rejects the From 9d1bc44f91f1af060fc81f45bde75e6e25a1bfd5 Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Mon, 22 Jun 2020 19:56:38 -0500 Subject: [PATCH 13/22] linting fix --- sklearn/svm/tests/test_bounds.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 3ea7c4d34c426..0c666f0703db7 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -111,11 +111,13 @@ def test_bounded_rand_int(orig_range, n_pts): res = stats.kstest(sample, uniform_dist.cdf) ks_pvals.append(res.pvalue) # Null hypothesis = samples come from an uniform distribution. - # Under the null hypothesis, p-values should be uniformly distributed and not concentrated - # on low values (this may seem counter-intuitive but is backed by multiple refs) + # Under the null hypothesis, + # p-values should be uniformly distributed and not concentrated + # on low values + # (this may seem counter-intuitive but is backed by multiple refs) # So we can do two checks: - # (1) check uniformity of p-values + # (1) check uniformity of p-values uniform_p_vals_dist = stats.uniform(loc=0, scale=1) res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf) assert res_pvals.pvalue > 0.05,\ From 5215fee8ef1a8a903af84a32004b74bf42905db6 Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Mon, 22 Jun 2020 19:59:04 -0500 Subject: [PATCH 14/22] linting fix --- sklearn/svm/tests/test_bounds.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 0c666f0703db7..db6fd3c11759e 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -122,8 +122,8 @@ def test_bounded_rand_int(orig_range, n_pts): res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf) assert res_pvals.pvalue > 0.05,\ "Null hypothesis rejected: generated random numbers are not uniform."\ - " Details: the (meta) p-value of the test of uniform distribution of p-values"\ - " is {} which is not > 0.05".format(res_pvals.pvalue) + " Details: the (meta) p-value of the test of uniform distribution"\ + " of p-values is {} which is not > 0.05".format(res_pvals.pvalue) # (2) (safety belt) check that most p-values are above 0.05 min_10pct_pval = np.percentile(ks_pvals, q=10) From f266f1512a274a2fbb4b56bae7980c63af9e3036 Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Tue, 23 Jun 2020 14:17:31 -0500 Subject: [PATCH 15/22] Added deterministic test for mt19937 in svm/newrand.h --- sklearn/svm/tests/test_bounds.py | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index db6fd3c11759e..5b7cf0ccdb2da 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -78,24 +78,19 @@ def test_unsupported_loss(): l1_min_c(dense_X, Y1, loss='l1') -# TODO test deterministic results on differing systems -# def test_set_seed(): -# def _test(seed, val): -# # TODO currently causes 99% coverage in test_bounds.py -# if seed is not None: -# set_seed_wrap(seed) -# x = bounded_rand_int_wrap(100) -# assert(x == val), 'Expected {} but got {} instead'.format(val, x) - -# # TODO should be default seed for std::mt19937, -# # but different results on jupyter lab -# # _test(None, 81) -# _test(5489, 81) # default seed for std::mt19937 -# _test(0, 54) -# _test(4294967295, 9) # max unsigned int size +_MAX_UNSIGNED_INT = 4294967295 +_MAX_INT = 2147483647 -_MAX_INT = 2147483647 +@pytest.mark.parametrize('seed, val', + [(None, 81), + (0, 54), + (_MAX_UNSIGNED_INT, 9)]) +def test_set_seed(seed, val32, val64): + if seed is not None: + set_seed_wrap(seed) + x = bounded_rand_int_wrap(100) + assert(x == val), 'Expected {} but got {} instead'.format(val, x) @pytest.mark.parametrize('orig_range, n_pts', From b37f3b415f1c59d783219d486c5f065d015bdd31 Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Tue, 23 Jun 2020 14:19:58 -0500 Subject: [PATCH 16/22] Linting fix --- sklearn/svm/tests/test_bounds.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 5b7cf0ccdb2da..48bee28a388d2 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -7,7 +7,7 @@ from sklearn.svm._bounds import l1_min_c from sklearn.svm import LinearSVC from sklearn.linear_model import LogisticRegression -from sklearn.svm._newrand import bounded_rand_int_wrap +from sklearn.svm._newrand import set_seed_wrap, bounded_rand_int_wrap from sklearn.utils._testing import assert_raise_message @@ -86,7 +86,7 @@ def test_unsupported_loss(): [(None, 81), (0, 54), (_MAX_UNSIGNED_INT, 9)]) -def test_set_seed(seed, val32, val64): +def test_set_seed(seed, val): if seed is not None: set_seed_wrap(seed) x = bounded_rand_int_wrap(100) From 1d68995fa5db1a023792621a96809af4f463d70f Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Wed, 24 Jun 2020 19:24:10 -0500 Subject: [PATCH 17/22] Review changes --- sklearn/svm/_newrand.pyx | 10 ++++---- sklearn/svm/src/newrand/newrand.h | 15 +++--------- sklearn/svm/tests/test_bounds.py | 38 ++++++++++++++++++++++--------- 3 files changed, 35 insertions(+), 28 deletions(-) diff --git a/sklearn/svm/_newrand.pyx b/sklearn/svm/_newrand.pyx index f392f44eb7b1b..0f82cbcbfdd6f 100644 --- a/sklearn/svm/_newrand.pyx +++ b/sklearn/svm/_newrand.pyx @@ -4,11 +4,11 @@ Wrapper for newrand.h """ cdef extern from "newrand.h": - void set_seed(unsigned) - int bounded_rand_int(int) + void set_seed(unsigned int) + unsigned int bounded_rand_int(unsigned int) -def set_seed_wrap(unsigned custom_seed): +def set_seed_wrap(unsigned int custom_seed): set_seed(custom_seed) -def bounded_rand_int_wrap(int orig_range): - return bounded_rand_int(orig_range) +def bounded_rand_int_wrap(unsigned int range_): + return bounded_rand_int(range_) diff --git a/sklearn/svm/src/newrand/newrand.h b/sklearn/svm/src/newrand/newrand.h index 0da8f07e2274a..e01bea99ec17e 100644 --- a/sklearn/svm/src/newrand/newrand.h +++ b/sklearn/svm/src/newrand/newrand.h @@ -18,14 +18,7 @@ extern "C" { // used in LibSVM / LibLinear, to ensure the same behaviour on windows-linux, // with increased speed // - (1) Init a `mt_rand` object -#if INT_MAX == 0x7FFFFFFF std::mt19937 mt_rand(std::mt19937::default_seed); -#elif INT_MAX == 0x7FFFFFFFFFFFFFFF -std::mt19937_64 mt_rand(std::mt19937::default_seed); -#else -info("Random number generator is not fixed for this system. Please report issue. INT_MAX=%d\n", INT_MAX); -exit(1); -#endif // - (2) public `set_seed()` function that should be used instead of `srand()` to set a new seed. void set_seed(unsigned custom_seed) { @@ -33,15 +26,13 @@ void set_seed(unsigned custom_seed) { } // - (3) New internal `bounded_rand_int` function, used instead of rand() everywhere. -inline int bounded_rand_int(int orig_range) { - // "LibSVM / LibLinear Original way" - make a 31bit or 63bit positive +inline uint32_t bounded_rand_int(uint32_t range) { + // "LibSVM / LibLinear Original way" - make a 31bit positive // random number and use modulo to make it fit in the range - // return abs( (int)mt_rand()) % orig_range; + // return abs( (int)mt_rand()) % range; // "Better way": tweaked Lemire post-processor // from http://www.pcg-random.org/posts/bounded-rands.html - // TODO how could we make this casting safer, raising an error if lost information? - uint32_t range = uint32_t(orig_range); uint32_t x = mt_rand(); uint64_t m = uint64_t(x) * uint64_t(range); uint32_t l = uint32_t(m); diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 48bee28a388d2..200a24a70da81 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -79,37 +79,45 @@ def test_unsupported_loss(): _MAX_UNSIGNED_INT = 4294967295 -_MAX_INT = 2147483647 @pytest.mark.parametrize('seed, val', [(None, 81), (0, 54), (_MAX_UNSIGNED_INT, 9)]) -def test_set_seed(seed, val): +def test_newrand_set_seed(seed, val): + """Test that `set_seed` produces deterministic results""" if seed is not None: set_seed_wrap(seed) x = bounded_rand_int_wrap(100) assert(x == val), 'Expected {} but got {} instead'.format(val, x) -@pytest.mark.parametrize('orig_range, n_pts', - [(_MAX_INT, 10000), (100, 10)]) -def test_bounded_rand_int(orig_range, n_pts): +@pytest.mark.parametrize('seed', + [-1, _MAX_UNSIGNED_INT + 1]) +def test_newrand_set_seed_overflow(seed): + """Test that `set_seed_wrap` is defined for unsigned 32bits ints""" + with pytest.raises(OverflowError): + set_seed_wrap(seed) + + +@pytest.mark.parametrize('range_, n_pts', + [(_MAX_UNSIGNED_INT, 10000), (100, 10)]) +def test_newrand_bounded_rand_int(range_, n_pts): + """Test that `bounded_rand_int` follows a uniform distribution""" n_iter = 100 ks_pvals = [] - uniform_dist = stats.uniform(loc=0, scale=orig_range) + uniform_dist = stats.uniform(loc=0, scale=range_) # perform multiple samplings to make chance of outlier sampling negligible for _ in range(n_iter): # Deterministic random sampling - sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)] + sample = [bounded_rand_int_wrap(range_) for _ in range(n_pts)] res = stats.kstest(sample, uniform_dist.cdf) ks_pvals.append(res.pvalue) # Null hypothesis = samples come from an uniform distribution. - # Under the null hypothesis, - # p-values should be uniformly distributed and not concentrated - # on low values - # (this may seem counter-intuitive but is backed by multiple refs) + # Under the null hypothesis, p-values should be uniformly distributed + # and not concentrated on low values + # (this may seem counter-intuitive but is backed by multiple refs) # So we can do two checks: # (1) check uniformity of p-values @@ -128,3 +136,11 @@ def test_bounded_rand_int(orig_range, n_pts): "Null hypothesis rejected: generated random numbers are not uniform." \ " Details: lower 10th quantile p-value of {} not > 0.05."\ .format(min_10pct_pval) + + +@pytest.mark.parametrize('range_', + [-1, _MAX_UNSIGNED_INT + 1]) +def test_newrand_bounded_rand_int_limits(range_): + """Test that `bounded_rand_int_wrap` is defined for unsigned 32bits ints""" + with pytest.raises(OverflowError): + bounded_rand_int_wrap(range_) From 586aef600abb5c7d36560a5b398b44d21560d62e Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Thu, 25 Jun 2020 11:08:14 -0500 Subject: [PATCH 18/22] Update sklearn/svm/tests/test_bounds.py Co-authored-by: smarie --- sklearn/svm/tests/test_bounds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 200a24a70da81..550217a49271e 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -102,7 +102,7 @@ def test_newrand_set_seed_overflow(seed): @pytest.mark.parametrize('range_, n_pts', - [(_MAX_UNSIGNED_INT, 10000), (100, 10)]) + [(_MAX_UNSIGNED_INT, 10000), (100, 25)]) def test_newrand_bounded_rand_int(range_, n_pts): """Test that `bounded_rand_int` follows a uniform distribution""" n_iter = 100 From b3395ba0ccddfb5af0e230daf6c4911188321995 Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Fri, 26 Jun 2020 12:44:19 -0500 Subject: [PATCH 19/22] Update sklearn/svm/tests/test_bounds.py Co-authored-by: Joel Nothman --- sklearn/svm/tests/test_bounds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 550217a49271e..b613135ba55da 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -90,7 +90,7 @@ def test_newrand_set_seed(seed, val): if seed is not None: set_seed_wrap(seed) x = bounded_rand_int_wrap(100) - assert(x == val), 'Expected {} but got {} instead'.format(val, x) + assert x == val, f'Expected {val} but got {x} instead' @pytest.mark.parametrize('seed', From 8abd97cf1f732a22ff42ef181fa74a08f2616679 Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Fri, 26 Jun 2020 12:45:12 -0500 Subject: [PATCH 20/22] Update sklearn/svm/tests/test_bounds.py Co-authored-by: Joel Nothman --- sklearn/svm/tests/test_bounds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index b613135ba55da..b7075c89544f2 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -128,7 +128,7 @@ def test_newrand_bounded_rand_int(range_, n_pts): " Details: the (meta) p-value of the test of uniform distribution"\ " of p-values is {} which is not > 0.05".format(res_pvals.pvalue) - # (2) (safety belt) check that most p-values are above 0.05 + # (2) (safety belt) check that 90% of p-values are above 0.05 min_10pct_pval = np.percentile(ks_pvals, q=10) # lower 10th quantile pvalue <= 0.05 means that the test rejects the # null hypothesis that the sample came from the uniform distribution From 8d88b427be981e198e2e9c407acdf03da6fb5aa4 Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Fri, 26 Jun 2020 12:51:22 -0500 Subject: [PATCH 21/22] Update test_bounds.py --- sklearn/svm/tests/test_bounds.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index b7075c89544f2..977bbbfcc6932 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -123,19 +123,18 @@ def test_newrand_bounded_rand_int(range_, n_pts): # (1) check uniformity of p-values uniform_p_vals_dist = stats.uniform(loc=0, scale=1) res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf) - assert res_pvals.pvalue > 0.05,\ - "Null hypothesis rejected: generated random numbers are not uniform."\ - " Details: the (meta) p-value of the test of uniform distribution"\ - " of p-values is {} which is not > 0.05".format(res_pvals.pvalue) + assert res_pvals.pvalue > 0.05, ( + "Null hypothesis rejected: generated random numbers are not uniform." + " Details: the (meta) p-value of the test of uniform distribution" + f" of p-values is {res_pvals.pvalue} which is not > 0.05") # (2) (safety belt) check that 90% of p-values are above 0.05 min_10pct_pval = np.percentile(ks_pvals, q=10) # lower 10th quantile pvalue <= 0.05 means that the test rejects the # null hypothesis that the sample came from the uniform distribution - assert(min_10pct_pval > 0.05),\ - "Null hypothesis rejected: generated random numbers are not uniform." \ - " Details: lower 10th quantile p-value of {} not > 0.05."\ - .format(min_10pct_pval) + assert min_10pct_pval > 0.05, ( + "Null hypothesis rejected: generated random numbers are not uniform." + f" Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05.") @pytest.mark.parametrize('range_', From f62ed4a553f015739749f4127995a9b73c4c5f8e Mon Sep 17 00:00:00 2001 From: Justin Huber Date: Fri, 26 Jun 2020 13:48:41 -0500 Subject: [PATCH 22/22] Linting fixes --- sklearn/svm/tests/test_bounds.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 977bbbfcc6932..423d5ed7a7fba 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -133,8 +133,9 @@ def test_newrand_bounded_rand_int(range_, n_pts): # lower 10th quantile pvalue <= 0.05 means that the test rejects the # null hypothesis that the sample came from the uniform distribution assert min_10pct_pval > 0.05, ( - "Null hypothesis rejected: generated random numbers are not uniform." - f" Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05.") + "Null hypothesis rejected: generated random numbers are not uniform. " + f"Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05." + ) @pytest.mark.parametrize('range_',