TST Python wrapper and tests for sklearn/svm/newrand/ (#17157)

Justin-Huber · smarie · jnothman · web-flow · commit 430c2080e81d · 2020-06-28T18:22:04.000+02:00
* Implemented issue #16862 * Implemented issue #16862 * Ready for pull request * Cleaned up redundancies * Removed test_set_seed_wrap for now * Update sklearn/svm/tests/test_bounds.py Co-authored-by: smarie <sylvain.marie@schneider-electric.com> * Review feedback changes * Style fixes * np.quantile -> np.percentile * Update sklearn/svm/src/newrand/newrand.h Merged suggestion Co-authored-by: smarie <sylvain.marie@schneider-electric.com> * Update sklearn/svm/tests/test_bounds.py Merge suggestions Co-authored-by: smarie <sylvain.marie@schneider-electric.com> * linting fix * linting fix * linting fix * Added deterministic test for mt19937 in svm/newrand.h * Linting fix * Review changes * Update sklearn/svm/tests/test_bounds.py Co-authored-by: smarie <sylvain.marie@schneider-electric.com> * Update sklearn/svm/tests/test_bounds.py Co-authored-by: Joel Nothman <joel.nothman@gmail.com> * Update sklearn/svm/tests/test_bounds.py Co-authored-by: Joel Nothman <joel.nothman@gmail.com> * Update test_bounds.py * Linting fixes Co-authored-by: smarie <sylvain.marie@schneider-electric.com> Co-authored-by: Joel Nothman <joel.nothman@gmail.com>
diff --git a/sklearn/svm/_newrand.pyx b/sklearn/svm/_newrand.pyx
@@ -0,0 +1,14 @@
+"""
+Wrapper for newrand.h
+
+"""
+
+cdef extern from "newrand.h":
+	void set_seed(unsigned int)
+	unsigned int bounded_rand_int(unsigned int)
+
+def set_seed_wrap(unsigned int custom_seed):
+	set_seed(custom_seed)
+
+def bounded_rand_int_wrap(unsigned int range_):
+	return bounded_rand_int(range_)
diff --git a/sklearn/svm/setup.py b/sklearn/svm/setup.py
@@ -10,6 +10,15 @@ def configuration(parent_package='', top_path=None):
 
     config.add_subpackage('tests')
 
+    # newrand wrappers
+    config.add_extension('_newrand',
+                         sources=['_newrand.pyx'],
+                         include_dirs=[numpy.get_include(),
+                                       join('src', 'newrand')],
+                         depends=[join('src', 'newrand', 'newrand.h')],
+                         language='c++',
+                         )
+
     # Section LibSVM
 
     # we compile both libsvm and libsvm_sparse
diff --git a/sklearn/svm/src/newrand/newrand.h b/sklearn/svm/src/newrand/newrand.h
@@ -5,43 +5,34 @@
      libsvm and liblinear.
      Sylvain Marie, Schneider Electric
      See <https://github.com/scikit-learn/scikit-learn/pull/13511#issuecomment-481729756>
-
  */
 #ifndef _NEWRAND_H
 #define _NEWRAND_H
 
 #ifdef __cplusplus
+#include <random>  // needed for cython to generate a .cpp file from newrand.h
 extern "C" {
 #endif
 
 // Scikit-Learn-specific random number generator replacing `rand()` originally
 // used in LibSVM / LibLinear, to ensure the same behaviour on windows-linux,
 // with increased speed
 // - (1) Init a `mt_rand` object
-#if INT_MAX == 0x7FFFFFFF
 std::mt19937 mt_rand(std::mt19937::default_seed);
-#elif INT_MAX == 0x7FFFFFFFFFFFFFFF
-std::mt19937_64 mt_rand(std::mt19937::default_seed);
-#else
-info("Random number generator is not fixed for this system. Please report issue. INT_MAX=%d\n", INT_MAX);
-exit(1);
-#endif
 
 // - (2) public `set_seed()` function that should be used instead of `srand()` to set a new seed.
 void set_seed(unsigned custom_seed) {
     mt_rand.seed(custom_seed);
 }
 
 // - (3) New internal `bounded_rand_int` function, used instead of rand() everywhere.
-inline int bounded_rand_int(int orig_range) {
-    // "LibSVM / LibLinear Original way" - make a 31bit or 63bit positive
+inline uint32_t bounded_rand_int(uint32_t range) {
+    // "LibSVM / LibLinear Original way" - make a 31bit positive
     // random number and use modulo to make it fit in the range
-    // return abs( (int)mt_rand()) % orig_range;
+    // return abs( (int)mt_rand()) % range;
 
     // "Better way": tweaked Lemire post-processor
     // from http://www.pcg-random.org/posts/bounded-rands.html
-    // TODO how could we make this casting safer, raising an error if lost information?
-    uint32_t range = uint32_t(orig_range);
     uint32_t x = mt_rand();
     uint64_t m = uint64_t(x) * uint64_t(range);
     uint32_t l = uint32_t(m);
diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
@@ -1,11 +1,13 @@
 import numpy as np
 from scipy import sparse as sp
+from scipy import stats
 
 import pytest
 
 from sklearn.svm._bounds import l1_min_c
 from sklearn.svm import LinearSVC
 from sklearn.linear_model import LogisticRegression
+from sklearn.svm._newrand import set_seed_wrap, bounded_rand_int_wrap
 
 from sklearn.utils._testing import assert_raise_message
 
@@ -74,3 +76,71 @@ def test_ill_posed_min_c():
 def test_unsupported_loss():
     with pytest.raises(ValueError):
         l1_min_c(dense_X, Y1, loss='l1')
+
+
+_MAX_UNSIGNED_INT = 4294967295
+
+
+@pytest.mark.parametrize('seed, val',
+                         [(None, 81),
+                          (0, 54),
+                          (_MAX_UNSIGNED_INT, 9)])
+def test_newrand_set_seed(seed, val):
+    """Test that `set_seed` produces deterministic results"""
+    if seed is not None:
+        set_seed_wrap(seed)
+    x = bounded_rand_int_wrap(100)
+    assert x == val, f'Expected {val} but got {x} instead'
+
+
+@pytest.mark.parametrize('seed',
+                         [-1, _MAX_UNSIGNED_INT + 1])
+def test_newrand_set_seed_overflow(seed):
+    """Test that `set_seed_wrap` is defined for unsigned 32bits ints"""
+    with pytest.raises(OverflowError):
+        set_seed_wrap(seed)
+
+
+@pytest.mark.parametrize('range_, n_pts',
+                         [(_MAX_UNSIGNED_INT, 10000), (100, 25)])
+def test_newrand_bounded_rand_int(range_, n_pts):
+    """Test that `bounded_rand_int` follows a uniform distribution"""
+    n_iter = 100
+    ks_pvals = []
+    uniform_dist = stats.uniform(loc=0, scale=range_)
+    # perform multiple samplings to make chance of outlier sampling negligible
+    for _ in range(n_iter):
+        # Deterministic random sampling
+        sample = [bounded_rand_int_wrap(range_) for _ in range(n_pts)]
+        res = stats.kstest(sample, uniform_dist.cdf)
+        ks_pvals.append(res.pvalue)
+    # Null hypothesis = samples come from an uniform distribution.
+    # Under the null hypothesis, p-values should be uniformly distributed
+    # and not concentrated on low values
+    # (this may seem counter-intuitive but is backed by multiple refs)
+    # So we can do two checks:
+
+    # (1) check uniformity of p-values
+    uniform_p_vals_dist = stats.uniform(loc=0, scale=1)
+    res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf)
+    assert res_pvals.pvalue > 0.05, (
+        "Null hypothesis rejected: generated random numbers are not uniform."
+        " Details: the (meta) p-value of the test of uniform distribution"
+        f" of p-values is {res_pvals.pvalue} which is not > 0.05")
+
+    # (2) (safety belt) check that 90% of p-values are above 0.05
+    min_10pct_pval = np.percentile(ks_pvals, q=10)
+    # lower 10th quantile pvalue <= 0.05 means that the test rejects the
+    # null hypothesis that the sample came from the uniform distribution
+    assert min_10pct_pval > 0.05, (
+        "Null hypothesis rejected: generated random numbers are not uniform. "
+        f"Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05."
+        )
+
+
+@pytest.mark.parametrize('range_',
+                         [-1, _MAX_UNSIGNED_INT + 1])
+def test_newrand_bounded_rand_int_limits(range_):
+    """Test that `bounded_rand_int_wrap` is defined for unsigned 32bits ints"""
+    with pytest.raises(OverflowError):
+        bounded_rand_int_wrap(range_)