From 9069c3f15f277b927de7e2df797c414a8d95cacf Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Thu, 7 May 2020 12:19:45 -0500
Subject: [PATCH 01/22] Implemented issue #16862

---
 sklearn/svm/_newrand.pyx          | 14 +++++++++++++
 sklearn/svm/setup.py              | 11 ++++++++++
 sklearn/svm/src/newrand/newrand.h |  2 +-
 sklearn/svm/tests/test_bounds.py  | 35 +++++++++++++++++++++++++++++++
 4 files changed, 61 insertions(+), 1 deletion(-)
 create mode 100644 sklearn/svm/_newrand.pyx

diff --git a/sklearn/svm/_newrand.pyx b/sklearn/svm/_newrand.pyx
new file mode 100644
index 0000000000000..627106be33022
--- /dev/null
+++ b/sklearn/svm/_newrand.pyx
@@ -0,0 +1,14 @@
+"""
+Wrapper for newrand.h
+
+"""
+
+cdef extern from "newrand-cache.h":
+	void set_seed(int)
+	int bounded_rand_int(int)
+
+def set_seed_wrap(unsigned custom_seed):
+	set_seed(custom_seed)
+
+def bounded_rand_int_wrap(int orig_range):
+	return bounded_rand_int(orig_range)
diff --git a/sklearn/svm/setup.py b/sklearn/svm/setup.py
index 989e6c7d6a316..52f10ddcae1a8 100644
--- a/sklearn/svm/setup.py
+++ b/sklearn/svm/setup.py
@@ -41,6 +41,17 @@ def configuration(parent_package='', top_path=None):
                          depends=libsvm_depends,
                          )
 
+    config.add_extension('_newrand',
+                         sources=['_newrand.pyx'],
+                         include_dirs=[numpy.get_include(),
+                                       join('src', 'libsvm'),
+                                       join('src', 'newrand')],
+                         libraries=['libsvm-skl'],
+                         depends=[join('src', 'newrand', 'newrand.h'),
+                                  join('src', 'newrand', 'newrand-cache.cpp'),
+                                  join('src', 'newrand', 'newrand-cache.h')],
+                         )
+
     # liblinear module
     libraries = []
     if os.name == 'posix':
diff --git a/sklearn/svm/src/newrand/newrand.h b/sklearn/svm/src/newrand/newrand.h
index 7cd7b4c9fbf2b..6968c13eb6a38 100644
--- a/sklearn/svm/src/newrand/newrand.h
+++ b/sklearn/svm/src/newrand/newrand.h
@@ -33,7 +33,7 @@ void set_seed(unsigned custom_seed) {
 }
 
 // - (3) New internal `bounded_rand_int` function, used instead of rand() everywhere.
-inline int bounded_rand_int(int orig_range) {
+int bounded_rand_int(int orig_range) {
     // "LibSVM / LibLinear Original way" - make a 31bit or 63bit positive
     // random number and use modulo to make it fit in the range
     // return abs( (int)mt_rand()) % orig_range;
diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index 8454ebf64de1a..dc37106e43eb4 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -1,10 +1,12 @@
 import numpy as np
 from scipy import sparse as sp
+from scipy import stats
 
 import pytest
 
 from sklearn.svm._bounds import l1_min_c
 from sklearn.svm import LinearSVC
+from sklearn.svm._newrand import set_seed_wrap, bounded_rand_int_wrap
 from sklearn.linear_model import LogisticRegression
 
 from sklearn.utils._testing import assert_raise_message
@@ -74,3 +76,36 @@ def test_ill_posed_min_c():
 def test_unsupported_loss():
     with pytest.raises(ValueError):
         l1_min_c(dense_X, Y1, loss='l1')
+
+
+def test_set_seed():
+    def _test(seed, val):
+        # TODO currently causes 99% coverage in test_bounds.py
+        if seed is not None:
+            set_seed_wrap(seed)
+        x = bounded_rand_int_wrap(100)
+        assert(x == val), 'Expected {} but got {} instead'.format(val, x)
+
+    # _test(None, 81)
+    # TODO should be default seed for std::mt19937,
+    # but different results on jupyter lab
+    _test(5489, 81)  # default seed for std::mt19937
+    _test(0, 54)
+    _test(4294967295, 9)  # max unsigned int size
+
+
+def test_bounded_rand_int():
+    orig_range = 2147483647  # max int as input to bounded_rand_int
+    n_pts = 1000
+    n_iter = 100
+    ks_pvals = []
+    uniform_dist = stats.uniform(loc=0, scale=orig_range)
+    for _ in range(n_iter):
+        # Deterministic random sampling
+        sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)]
+        res = stats.kstest(sample, uniform_dist.cdf)
+        ks_pvals.append(res.pvalue)
+    avg_pval = np.mean(ks_pvals)
+    assert(avg_pval > 0.05),\
+        'p-value of {} not > 0.05, '\
+        'therefore distribution isn\'t uniform'.format(avg_pval)

From 7a92336480a48e566e420d7c44c2269a6c47c0e1 Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Thu, 7 May 2020 12:22:19 -0500
Subject: [PATCH 02/22] Implemented issue #16862

---
 sklearn/svm/tests/test_bounds.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index dc37106e43eb4..6e93c0c39d2af 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -86,9 +86,9 @@ def _test(seed, val):
         x = bounded_rand_int_wrap(100)
         assert(x == val), 'Expected {} but got {} instead'.format(val, x)
 
-    # _test(None, 81)
     # TODO should be default seed for std::mt19937,
     # but different results on jupyter lab
+    # _test(None, 81)
     _test(5489, 81)  # default seed for std::mt19937
     _test(0, 54)
     _test(4294967295, 9)  # max unsigned int size

From 901a1b9ce1b7434391a771e3abaeb724eb58033e Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Sat, 9 May 2020 18:38:04 -0500
Subject: [PATCH 03/22] Ready for pull request

---
 sklearn/svm/_newrand.pyx          | 5 +++--
 sklearn/svm/setup.py              | 5 ++---
 sklearn/svm/src/newrand/newrand.h | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/sklearn/svm/_newrand.pyx b/sklearn/svm/_newrand.pyx
index 627106be33022..afbcbb1411db7 100644
--- a/sklearn/svm/_newrand.pyx
+++ b/sklearn/svm/_newrand.pyx
@@ -1,10 +1,11 @@
+# language: c++
 """
 Wrapper for newrand.h
 
 """
 
-cdef extern from "newrand-cache.h":
-	void set_seed(int)
+cdef extern from "newrand.h":
+	void set_seed(unsigned)
 	int bounded_rand_int(int)
 
 def set_seed_wrap(unsigned custom_seed):
diff --git a/sklearn/svm/setup.py b/sklearn/svm/setup.py
index 52f10ddcae1a8..92570cb9bf772 100644
--- a/sklearn/svm/setup.py
+++ b/sklearn/svm/setup.py
@@ -47,9 +47,8 @@ def configuration(parent_package='', top_path=None):
                                        join('src', 'libsvm'),
                                        join('src', 'newrand')],
                          libraries=['libsvm-skl'],
-                         depends=[join('src', 'newrand', 'newrand.h'),
-                                  join('src', 'newrand', 'newrand-cache.cpp'),
-                                  join('src', 'newrand', 'newrand-cache.h')],
+                         depends=[join('src', 'newrand', 'newrand.h')],
+                         language='c++',
                          )
 
     # liblinear module
diff --git a/sklearn/svm/src/newrand/newrand.h b/sklearn/svm/src/newrand/newrand.h
index 6968c13eb6a38..255f5c0310a7e 100644
--- a/sklearn/svm/src/newrand/newrand.h
+++ b/sklearn/svm/src/newrand/newrand.h
@@ -5,12 +5,12 @@
      libsvm and liblinear.
      Sylvain Marie, Schneider Electric
      See <https://github.com/scikit-learn/scikit-learn/pull/13511#issuecomment-481729756>
-
  */
 #ifndef _NEWRAND_H
 #define _NEWRAND_H
 
 #ifdef __cplusplus
+#include <random>
 extern "C" {
 #endif
 
@@ -33,7 +33,7 @@ void set_seed(unsigned custom_seed) {
 }
 
 // - (3) New internal `bounded_rand_int` function, used instead of rand() everywhere.
-int bounded_rand_int(int orig_range) {
+inline int bounded_rand_int(int orig_range) {
     // "LibSVM / LibLinear Original way" - make a 31bit or 63bit positive
     // random number and use modulo to make it fit in the range
     // return abs( (int)mt_rand()) % orig_range;
@@ -65,4 +65,4 @@ int bounded_rand_int(int orig_range) {
 }
 #endif
 
-#endif /* _NEWRAND_H */
+#endif /* _NEWRAND_H */
\ No newline at end of file

From 9daf268d6e0d1a6ddf0946864c32353c09c9d856 Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Sat, 9 May 2020 18:44:53 -0500
Subject: [PATCH 04/22] Cleaned up redundancies

---
 sklearn/svm/_newrand.pyx |  1 -
 sklearn/svm/setup.py     | 19 +++++++++----------
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/sklearn/svm/_newrand.pyx b/sklearn/svm/_newrand.pyx
index afbcbb1411db7..f392f44eb7b1b 100644
--- a/sklearn/svm/_newrand.pyx
+++ b/sklearn/svm/_newrand.pyx
@@ -1,4 +1,3 @@
-# language: c++
 """
 Wrapper for newrand.h
 
diff --git a/sklearn/svm/setup.py b/sklearn/svm/setup.py
index 92570cb9bf772..395e398437558 100644
--- a/sklearn/svm/setup.py
+++ b/sklearn/svm/setup.py
@@ -10,6 +10,15 @@ def configuration(parent_package='', top_path=None):
 
     config.add_subpackage('tests')
 
+    # newrand wrappers
+    config.add_extension('_newrand',
+                         sources=['_newrand.pyx'],
+                         include_dirs=[numpy.get_include(),
+                                       join('src', 'newrand')],
+                         depends=[join('src', 'newrand', 'newrand.h')],
+                         language='c++',
+                         )
+
     # Section LibSVM
 
     # we compile both libsvm and libsvm_sparse
@@ -41,16 +50,6 @@ def configuration(parent_package='', top_path=None):
                          depends=libsvm_depends,
                          )
 
-    config.add_extension('_newrand',
-                         sources=['_newrand.pyx'],
-                         include_dirs=[numpy.get_include(),
-                                       join('src', 'libsvm'),
-                                       join('src', 'newrand')],
-                         libraries=['libsvm-skl'],
-                         depends=[join('src', 'newrand', 'newrand.h')],
-                         language='c++',
-                         )
-
     # liblinear module
     libraries = []
     if os.name == 'posix':

From 4734ad1c2748c82bc451477e177d0531b2bb81d2 Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Mon, 11 May 2020 14:02:13 -0500
Subject: [PATCH 05/22] Removed test_set_seed_wrap for now

---
 sklearn/svm/tests/test_bounds.py | 60 +++++++++++++++++---------------
 1 file changed, 32 insertions(+), 28 deletions(-)

diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index 6e93c0c39d2af..88f2ab1888685 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -6,8 +6,8 @@
 
 from sklearn.svm._bounds import l1_min_c
 from sklearn.svm import LinearSVC
-from sklearn.svm._newrand import set_seed_wrap, bounded_rand_int_wrap
 from sklearn.linear_model import LogisticRegression
+from sklearn.svm._newrand import bounded_rand_int_wrap
 
 from sklearn.utils._testing import assert_raise_message
 
@@ -78,34 +78,38 @@ def test_unsupported_loss():
         l1_min_c(dense_X, Y1, loss='l1')
 
 
-def test_set_seed():
-    def _test(seed, val):
-        # TODO currently causes 99% coverage in test_bounds.py
-        if seed is not None:
-            set_seed_wrap(seed)
-        x = bounded_rand_int_wrap(100)
-        assert(x == val), 'Expected {} but got {} instead'.format(val, x)
+# TODO test deterministic results on differing systems
+# def test_set_seed():
+#     def _test(seed, val):
+#         # TODO currently causes 99% coverage in test_bounds.py
+#         if seed is not None:
+#             set_seed_wrap(seed)
+#         x = bounded_rand_int_wrap(100)
+#         assert(x == val), 'Expected {} but got {} instead'.format(val, x)
 
-    # TODO should be default seed for std::mt19937,
-    # but different results on jupyter lab
-    # _test(None, 81)
-    _test(5489, 81)  # default seed for std::mt19937
-    _test(0, 54)
-    _test(4294967295, 9)  # max unsigned int size
+#     # TODO should be default seed for std::mt19937,
+#     # but different results on jupyter lab
+#     # _test(None, 81)
+#     _test(5489, 81)  # default seed for std::mt19937
+#     _test(0, 54)
+#     _test(4294967295, 9)  # max unsigned int size
 
 
 def test_bounded_rand_int():
-    orig_range = 2147483647  # max int as input to bounded_rand_int
-    n_pts = 1000
-    n_iter = 100
-    ks_pvals = []
-    uniform_dist = stats.uniform(loc=0, scale=orig_range)
-    for _ in range(n_iter):
-        # Deterministic random sampling
-        sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)]
-        res = stats.kstest(sample, uniform_dist.cdf)
-        ks_pvals.append(res.pvalue)
-    avg_pval = np.mean(ks_pvals)
-    assert(avg_pval > 0.05),\
-        'p-value of {} not > 0.05, '\
-        'therefore distribution isn\'t uniform'.format(avg_pval)
+    def _test(orig_range):
+        n_pts = 1000
+        n_iter = 100
+        ks_pvals = []
+        uniform_dist = stats.uniform(loc=0, scale=orig_range)
+        # avg over multiple tests to make chance of outlier result negligible
+        for _ in range(n_iter):
+            # Deterministic random sampling
+            sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)]
+            res = stats.kstest(sample, uniform_dist.cdf)
+            ks_pvals.append(res.pvalue)
+        avg_pval = np.mean(ks_pvals)
+        assert(avg_pval > 0.05),\
+            'p-value of {} not > 0.05, '\
+            'therefore distribution isn\'t uniform'.format(avg_pval)
+    _test(2147483647)  # max int
+    _test(100)  # arbitrary less than max max int

From edfaf72d236811a1bb473e0796238683187cad15 Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Thu, 14 May 2020 12:56:17 -0500
Subject: [PATCH 06/22] Update sklearn/svm/tests/test_bounds.py

Co-authored-by: smarie <sylvain.marie@schneider-electric.com>
---
 sklearn/svm/tests/test_bounds.py | 37 ++++++++++++++++----------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index 88f2ab1888685..b3a360af1880f 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -95,21 +95,22 @@ def test_unsupported_loss():
 #     _test(4294967295, 9)  # max unsigned int size
 
 
-def test_bounded_rand_int():
-    def _test(orig_range):
-        n_pts = 1000
-        n_iter = 100
-        ks_pvals = []
-        uniform_dist = stats.uniform(loc=0, scale=orig_range)
-        # avg over multiple tests to make chance of outlier result negligible
-        for _ in range(n_iter):
-            # Deterministic random sampling
-            sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)]
-            res = stats.kstest(sample, uniform_dist.cdf)
-            ks_pvals.append(res.pvalue)
-        avg_pval = np.mean(ks_pvals)
-        assert(avg_pval > 0.05),\
-            'p-value of {} not > 0.05, '\
-            'therefore distribution isn\'t uniform'.format(avg_pval)
-    _test(2147483647)  # max int
-    _test(100)  # arbitrary less than max max int
+_MAX_INT = 2147483647
+
+
+@pytest.mark.parametrize("orig_range", [_MAX_INT, 100])
+def test_bounded_rand_int(orig_range):
+    n_pts = 1000
+    n_iter = 100
+    ks_pvals = []
+    uniform_dist = stats.uniform(loc=0, scale=orig_range)
+    # avg over multiple tests to make chance of outlier result negligible
+    for _ in range(n_iter):
+        # Deterministic random sampling
+        sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)]
+        res = stats.kstest(sample, uniform_dist.cdf)
+        ks_pvals.append(res.pvalue)
+    avg_pval = np.mean(ks_pvals)
+    assert(avg_pval > 0.05),\
+        'p-value of {} not > 0.05, '\
+        'therefore distribution isn\'t uniform'.format(avg_pval)

From d8f720dcf94ade6487a0421516b8b6ea9b5d14fc Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Fri, 15 May 2020 17:07:48 -0500
Subject: [PATCH 07/22] Review feedback changes

---
 sklearn/svm/tests/test_bounds.py | 38 +++++++++++++++++---------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index 88f2ab1888685..3c9b737872cca 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -95,21 +95,23 @@ def test_unsupported_loss():
 #     _test(4294967295, 9)  # max unsigned int size
 
 
-def test_bounded_rand_int():
-    def _test(orig_range):
-        n_pts = 1000
-        n_iter = 100
-        ks_pvals = []
-        uniform_dist = stats.uniform(loc=0, scale=orig_range)
-        # avg over multiple tests to make chance of outlier result negligible
-        for _ in range(n_iter):
-            # Deterministic random sampling
-            sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)]
-            res = stats.kstest(sample, uniform_dist.cdf)
-            ks_pvals.append(res.pvalue)
-        avg_pval = np.mean(ks_pvals)
-        assert(avg_pval > 0.05),\
-            'p-value of {} not > 0.05, '\
-            'therefore distribution isn\'t uniform'.format(avg_pval)
-    _test(2147483647)  # max int
-    _test(100)  # arbitrary less than max max int
+_MAX_INT = 2147483647
+@pytest.mark.parametrize('orig_range, n_pts',
+                         [(_MAX_INT, 10000), (100, 10)])
+def test_bounded_rand_int(orig_range, n_pts):
+    n_iter = 100
+    ks_pvals = []
+    uniform_dist = stats.uniform(loc=0, scale=orig_range)
+    # perform multiple samplings to make chance of outlier sampling negligible
+    for _ in range(n_iter):
+        # Deterministic random sampling
+        sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)]
+        res = stats.kstest(sample, uniform_dist.cdf)
+        ks_pvals.append(res.pvalue)
+    min_10pct_pval = np.quantile(ks_pvals, q=0.1)
+    # lower 10th quantile pvalue <= 0.05 means that the test rejects the
+    # null hypothesis that the sample came from the uniform distribution
+    assert(min_10pct_pval > 0.05),\
+        'lower 10th quantile p-value of {} not > 0.05, '\
+        'therefore distribution isn\'t sufficiently uniform'\
+        .format(min_10pct_pval)

From b95d87f8f8653c0f94f08055ab5f64fd1e2ca4e7 Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Fri, 15 May 2020 22:39:37 -0500
Subject: [PATCH 08/22] Style fixes

---
 sklearn/svm/tests/test_bounds.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index 3c9b737872cca..1456fd02f6bfb 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -96,6 +96,8 @@ def test_unsupported_loss():
 
 
 _MAX_INT = 2147483647
+
+
 @pytest.mark.parametrize('orig_range, n_pts',
                          [(_MAX_INT, 10000), (100, 10)])
 def test_bounded_rand_int(orig_range, n_pts):

From e0e8e6d19a9a517900cf997bf4cfaab7eb88e40e Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Sun, 17 May 2020 16:06:39 -0500
Subject: [PATCH 09/22] np.quantile -> np.percentile

---
 sklearn/svm/tests/test_bounds.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index 1456fd02f6bfb..9b4fc900341c6 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -110,7 +110,7 @@ def test_bounded_rand_int(orig_range, n_pts):
         sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)]
         res = stats.kstest(sample, uniform_dist.cdf)
         ks_pvals.append(res.pvalue)
-    min_10pct_pval = np.quantile(ks_pvals, q=0.1)
+    min_10pct_pval = np.percentile(ks_pvals, q=10)
     # lower 10th quantile pvalue <= 0.05 means that the test rejects the
     # null hypothesis that the sample came from the uniform distribution
     assert(min_10pct_pval > 0.05),\

From 83fb69385cb58e1904c1f56b1013c19867311377 Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Mon, 22 Jun 2020 17:23:39 -0500
Subject: [PATCH 10/22] Update sklearn/svm/src/newrand/newrand.h

Merged suggestion

Co-authored-by: smarie <sylvain.marie@schneider-electric.com>
---
 sklearn/svm/src/newrand/newrand.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/svm/src/newrand/newrand.h b/sklearn/svm/src/newrand/newrand.h
index 255f5c0310a7e..0da8f07e2274a 100644
--- a/sklearn/svm/src/newrand/newrand.h
+++ b/sklearn/svm/src/newrand/newrand.h
@@ -10,7 +10,7 @@
 #define _NEWRAND_H
 
 #ifdef __cplusplus
-#include <random>
+#include <random>  // needed for cython to generate a .cpp file from newrand.h
 extern "C" {
 #endif
 
@@ -65,4 +65,4 @@ inline int bounded_rand_int(int orig_range) {
 }
 #endif
 
-#endif /* _NEWRAND_H */
\ No newline at end of file
+#endif /* _NEWRAND_H */

From b4c71f48f9efac45e6a2d198cb684ca582be5a3d Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Mon, 22 Jun 2020 17:25:54 -0500
Subject: [PATCH 11/22] Update sklearn/svm/tests/test_bounds.py

Merge suggestions

Co-authored-by: smarie <sylvain.marie@schneider-electric.com>
---
 sklearn/svm/tests/test_bounds.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index 9b4fc900341c6..1ca64b482368a 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -110,10 +110,24 @@ def test_bounded_rand_int(orig_range, n_pts):
         sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)]
         res = stats.kstest(sample, uniform_dist.cdf)
         ks_pvals.append(res.pvalue)
+    # Null hypothesis = samples come from an uniform distribution.
+    # Under the null hypothesis, p-values should be uniformly distributed and not concentrated 
+    # on low values (this may seem counter-intuitive but is backed by multiple refs)
+    # So we can do two checks: 
+    
+    # (1) check uniformity of p-values 
+    uniform_p_vals_dist = stats.uniform(loc=0, scale=1)
+    res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf)
+    assert res_pvals.pvalue > 0.05,\
+        "Null hypothesis rejected: generated random numbers are not uniform."\
+        " Details: the (meta) p-value of the test of uniform distribution of p-values"\
+        " is {} which is not > 0.05".format(res_pvals.pvalue)
+    
+    # (2) (safety belt) check that most p-values are above 0.05
     min_10pct_pval = np.percentile(ks_pvals, q=10)
     # lower 10th quantile pvalue <= 0.05 means that the test rejects the
     # null hypothesis that the sample came from the uniform distribution
     assert(min_10pct_pval > 0.05),\
-        'lower 10th quantile p-value of {} not > 0.05, '\
-        'therefore distribution isn\'t sufficiently uniform'\
+        "Null hypothesis rejected: generated random numbers are not uniform." \
+        " Details: lower 10th quantile p-value of {} not > 0.05."\
         .format(min_10pct_pval)

From 90a85f35fd12458fba1b8ba1bd6d3b71b211aefc Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Mon, 22 Jun 2020 19:52:53 -0500
Subject: [PATCH 12/22] linting fix

---
 sklearn/svm/tests/test_bounds.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index 1ca64b482368a..3ea7c4d34c426 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -113,8 +113,8 @@ def test_bounded_rand_int(orig_range, n_pts):
     # Null hypothesis = samples come from an uniform distribution.
     # Under the null hypothesis, p-values should be uniformly distributed and not concentrated 
     # on low values (this may seem counter-intuitive but is backed by multiple refs)
-    # So we can do two checks: 
-    
+    # So we can do two checks:
+
     # (1) check uniformity of p-values 
     uniform_p_vals_dist = stats.uniform(loc=0, scale=1)
     res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf)
@@ -122,7 +122,7 @@ def test_bounded_rand_int(orig_range, n_pts):
         "Null hypothesis rejected: generated random numbers are not uniform."\
         " Details: the (meta) p-value of the test of uniform distribution of p-values"\
         " is {} which is not > 0.05".format(res_pvals.pvalue)
-    
+
     # (2) (safety belt) check that most p-values are above 0.05
     min_10pct_pval = np.percentile(ks_pvals, q=10)
     # lower 10th quantile pvalue <= 0.05 means that the test rejects the

From 9d1bc44f91f1af060fc81f45bde75e6e25a1bfd5 Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Mon, 22 Jun 2020 19:56:38 -0500
Subject: [PATCH 13/22] linting fix

---
 sklearn/svm/tests/test_bounds.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index 3ea7c4d34c426..0c666f0703db7 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -111,11 +111,13 @@ def test_bounded_rand_int(orig_range, n_pts):
         res = stats.kstest(sample, uniform_dist.cdf)
         ks_pvals.append(res.pvalue)
     # Null hypothesis = samples come from an uniform distribution.
-    # Under the null hypothesis, p-values should be uniformly distributed and not concentrated 
-    # on low values (this may seem counter-intuitive but is backed by multiple refs)
+    # Under the null hypothesis,
+    #   p-values should be uniformly distributed and not concentrated
+    # on low values
+    #   (this may seem counter-intuitive but is backed by multiple refs)
     # So we can do two checks:
 
-    # (1) check uniformity of p-values 
+    # (1) check uniformity of p-values
     uniform_p_vals_dist = stats.uniform(loc=0, scale=1)
     res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf)
     assert res_pvals.pvalue > 0.05,\

From 5215fee8ef1a8a903af84a32004b74bf42905db6 Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Mon, 22 Jun 2020 19:59:04 -0500
Subject: [PATCH 14/22] linting fix

---
 sklearn/svm/tests/test_bounds.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index 0c666f0703db7..db6fd3c11759e 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -122,8 +122,8 @@ def test_bounded_rand_int(orig_range, n_pts):
     res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf)
     assert res_pvals.pvalue > 0.05,\
         "Null hypothesis rejected: generated random numbers are not uniform."\
-        " Details: the (meta) p-value of the test of uniform distribution of p-values"\
-        " is {} which is not > 0.05".format(res_pvals.pvalue)
+        " Details: the (meta) p-value of the test of uniform distribution"\
+        " of p-values is {} which is not > 0.05".format(res_pvals.pvalue)
 
     # (2) (safety belt) check that most p-values are above 0.05
     min_10pct_pval = np.percentile(ks_pvals, q=10)

From f266f1512a274a2fbb4b56bae7980c63af9e3036 Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Tue, 23 Jun 2020 14:17:31 -0500
Subject: [PATCH 15/22] Added deterministic test for mt19937 in svm/newrand.h

---
 sklearn/svm/tests/test_bounds.py | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index db6fd3c11759e..5b7cf0ccdb2da 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -78,24 +78,19 @@ def test_unsupported_loss():
         l1_min_c(dense_X, Y1, loss='l1')
 
 
-# TODO test deterministic results on differing systems
-# def test_set_seed():
-#     def _test(seed, val):
-#         # TODO currently causes 99% coverage in test_bounds.py
-#         if seed is not None:
-#             set_seed_wrap(seed)
-#         x = bounded_rand_int_wrap(100)
-#         assert(x == val), 'Expected {} but got {} instead'.format(val, x)
-
-#     # TODO should be default seed for std::mt19937,
-#     # but different results on jupyter lab
-#     # _test(None, 81)
-#     _test(5489, 81)  # default seed for std::mt19937
-#     _test(0, 54)
-#     _test(4294967295, 9)  # max unsigned int size
+_MAX_UNSIGNED_INT = 4294967295
+_MAX_INT = 2147483647
 
 
-_MAX_INT = 2147483647
+@pytest.mark.parametrize('seed, val',
+                         [(None, 81),
+                          (0, 54),
+                          (_MAX_UNSIGNED_INT, 9)])
+def test_set_seed(seed, val32, val64):
+    if seed is not None:
+        set_seed_wrap(seed)
+    x = bounded_rand_int_wrap(100)
+    assert(x == val), 'Expected {} but got {} instead'.format(val, x)
 
 
 @pytest.mark.parametrize('orig_range, n_pts',

From b37f3b415f1c59d783219d486c5f065d015bdd31 Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Tue, 23 Jun 2020 14:19:58 -0500
Subject: [PATCH 16/22] Linting fix

---
 sklearn/svm/tests/test_bounds.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index 5b7cf0ccdb2da..48bee28a388d2 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -7,7 +7,7 @@
 from sklearn.svm._bounds import l1_min_c
 from sklearn.svm import LinearSVC
 from sklearn.linear_model import LogisticRegression
-from sklearn.svm._newrand import bounded_rand_int_wrap
+from sklearn.svm._newrand import set_seed_wrap, bounded_rand_int_wrap
 
 from sklearn.utils._testing import assert_raise_message
 
@@ -86,7 +86,7 @@ def test_unsupported_loss():
                          [(None, 81),
                           (0, 54),
                           (_MAX_UNSIGNED_INT, 9)])
-def test_set_seed(seed, val32, val64):
+def test_set_seed(seed, val):
     if seed is not None:
         set_seed_wrap(seed)
     x = bounded_rand_int_wrap(100)

From 1d68995fa5db1a023792621a96809af4f463d70f Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Wed, 24 Jun 2020 19:24:10 -0500
Subject: [PATCH 17/22] Review changes

---
 sklearn/svm/_newrand.pyx          | 10 ++++----
 sklearn/svm/src/newrand/newrand.h | 15 +++---------
 sklearn/svm/tests/test_bounds.py  | 38 ++++++++++++++++++++++---------
 3 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/sklearn/svm/_newrand.pyx b/sklearn/svm/_newrand.pyx
index f392f44eb7b1b..0f82cbcbfdd6f 100644
--- a/sklearn/svm/_newrand.pyx
+++ b/sklearn/svm/_newrand.pyx
@@ -4,11 +4,11 @@ Wrapper for newrand.h
 """
 
 cdef extern from "newrand.h":
-	void set_seed(unsigned)
-	int bounded_rand_int(int)
+	void set_seed(unsigned int)
+	unsigned int bounded_rand_int(unsigned int)
 
-def set_seed_wrap(unsigned custom_seed):
+def set_seed_wrap(unsigned int custom_seed):
 	set_seed(custom_seed)
 
-def bounded_rand_int_wrap(int orig_range):
-	return bounded_rand_int(orig_range)
+def bounded_rand_int_wrap(unsigned int range_):
+	return bounded_rand_int(range_)
diff --git a/sklearn/svm/src/newrand/newrand.h b/sklearn/svm/src/newrand/newrand.h
index 0da8f07e2274a..e01bea99ec17e 100644
--- a/sklearn/svm/src/newrand/newrand.h
+++ b/sklearn/svm/src/newrand/newrand.h
@@ -18,14 +18,7 @@ extern "C" {
 // used in LibSVM / LibLinear, to ensure the same behaviour on windows-linux,
 // with increased speed
 // - (1) Init a `mt_rand` object
-#if INT_MAX == 0x7FFFFFFF
 std::mt19937 mt_rand(std::mt19937::default_seed);
-#elif INT_MAX == 0x7FFFFFFFFFFFFFFF
-std::mt19937_64 mt_rand(std::mt19937::default_seed);
-#else
-info("Random number generator is not fixed for this system. Please report issue. INT_MAX=%d\n", INT_MAX);
-exit(1);
-#endif
 
 // - (2) public `set_seed()` function that should be used instead of `srand()` to set a new seed.
 void set_seed(unsigned custom_seed) {
@@ -33,15 +26,13 @@ void set_seed(unsigned custom_seed) {
 }
 
 // - (3) New internal `bounded_rand_int` function, used instead of rand() everywhere.
-inline int bounded_rand_int(int orig_range) {
-    // "LibSVM / LibLinear Original way" - make a 31bit or 63bit positive
+inline uint32_t bounded_rand_int(uint32_t range) {
+    // "LibSVM / LibLinear Original way" - make a 31bit positive
     // random number and use modulo to make it fit in the range
-    // return abs( (int)mt_rand()) % orig_range;
+    // return abs( (int)mt_rand()) % range;
 
     // "Better way": tweaked Lemire post-processor
     // from http://www.pcg-random.org/posts/bounded-rands.html
-    // TODO how could we make this casting safer, raising an error if lost information?
-    uint32_t range = uint32_t(orig_range);
     uint32_t x = mt_rand();
     uint64_t m = uint64_t(x) * uint64_t(range);
     uint32_t l = uint32_t(m);
diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index 48bee28a388d2..200a24a70da81 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -79,37 +79,45 @@ def test_unsupported_loss():
 
 
 _MAX_UNSIGNED_INT = 4294967295
-_MAX_INT = 2147483647
 
 
 @pytest.mark.parametrize('seed, val',
                          [(None, 81),
                           (0, 54),
                           (_MAX_UNSIGNED_INT, 9)])
-def test_set_seed(seed, val):
+def test_newrand_set_seed(seed, val):
+    """Test that `set_seed` produces deterministic results"""
     if seed is not None:
         set_seed_wrap(seed)
     x = bounded_rand_int_wrap(100)
     assert(x == val), 'Expected {} but got {} instead'.format(val, x)
 
 
-@pytest.mark.parametrize('orig_range, n_pts',
-                         [(_MAX_INT, 10000), (100, 10)])
-def test_bounded_rand_int(orig_range, n_pts):
+@pytest.mark.parametrize('seed',
+                         [-1, _MAX_UNSIGNED_INT + 1])
+def test_newrand_set_seed_overflow(seed):
+    """Test that `set_seed_wrap` is defined for unsigned 32bits ints"""
+    with pytest.raises(OverflowError):
+        set_seed_wrap(seed)
+
+
+@pytest.mark.parametrize('range_, n_pts',
+                         [(_MAX_UNSIGNED_INT, 10000), (100, 10)])
+def test_newrand_bounded_rand_int(range_, n_pts):
+    """Test that `bounded_rand_int` follows a uniform distribution"""
     n_iter = 100
     ks_pvals = []
-    uniform_dist = stats.uniform(loc=0, scale=orig_range)
+    uniform_dist = stats.uniform(loc=0, scale=range_)
     # perform multiple samplings to make chance of outlier sampling negligible
     for _ in range(n_iter):
         # Deterministic random sampling
-        sample = [bounded_rand_int_wrap(orig_range) for _ in range(n_pts)]
+        sample = [bounded_rand_int_wrap(range_) for _ in range(n_pts)]
         res = stats.kstest(sample, uniform_dist.cdf)
         ks_pvals.append(res.pvalue)
     # Null hypothesis = samples come from an uniform distribution.
-    # Under the null hypothesis,
-    #   p-values should be uniformly distributed and not concentrated
-    # on low values
-    #   (this may seem counter-intuitive but is backed by multiple refs)
+    # Under the null hypothesis, p-values should be uniformly distributed
+    # and not concentrated on low values
+    # (this may seem counter-intuitive but is backed by multiple refs)
     # So we can do two checks:
 
     # (1) check uniformity of p-values
@@ -128,3 +136,11 @@ def test_bounded_rand_int(orig_range, n_pts):
         "Null hypothesis rejected: generated random numbers are not uniform." \
         " Details: lower 10th quantile p-value of {} not > 0.05."\
         .format(min_10pct_pval)
+
+
+@pytest.mark.parametrize('range_',
+                         [-1, _MAX_UNSIGNED_INT + 1])
+def test_newrand_bounded_rand_int_limits(range_):
+    """Test that `bounded_rand_int_wrap` is defined for unsigned 32bits ints"""
+    with pytest.raises(OverflowError):
+        bounded_rand_int_wrap(range_)

From 586aef600abb5c7d36560a5b398b44d21560d62e Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Thu, 25 Jun 2020 11:08:14 -0500
Subject: [PATCH 18/22] Update sklearn/svm/tests/test_bounds.py

Co-authored-by: smarie <sylvain.marie@schneider-electric.com>
---
 sklearn/svm/tests/test_bounds.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index 200a24a70da81..550217a49271e 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -102,7 +102,7 @@ def test_newrand_set_seed_overflow(seed):
 
 
 @pytest.mark.parametrize('range_, n_pts',
-                         [(_MAX_UNSIGNED_INT, 10000), (100, 10)])
+                         [(_MAX_UNSIGNED_INT, 10000), (100, 25)])
 def test_newrand_bounded_rand_int(range_, n_pts):
     """Test that `bounded_rand_int` follows a uniform distribution"""
     n_iter = 100

From b3395ba0ccddfb5af0e230daf6c4911188321995 Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Fri, 26 Jun 2020 12:44:19 -0500
Subject: [PATCH 19/22] Update sklearn/svm/tests/test_bounds.py

Co-authored-by: Joel Nothman <joel.nothman@gmail.com>
---
 sklearn/svm/tests/test_bounds.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index 550217a49271e..b613135ba55da 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -90,7 +90,7 @@ def test_newrand_set_seed(seed, val):
     if seed is not None:
         set_seed_wrap(seed)
     x = bounded_rand_int_wrap(100)
-    assert(x == val), 'Expected {} but got {} instead'.format(val, x)
+    assert x == val, f'Expected {val} but got {x} instead'
 
 
 @pytest.mark.parametrize('seed',

From 8abd97cf1f732a22ff42ef181fa74a08f2616679 Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Fri, 26 Jun 2020 12:45:12 -0500
Subject: [PATCH 20/22] Update sklearn/svm/tests/test_bounds.py

Co-authored-by: Joel Nothman <joel.nothman@gmail.com>
---
 sklearn/svm/tests/test_bounds.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index b613135ba55da..b7075c89544f2 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -128,7 +128,7 @@ def test_newrand_bounded_rand_int(range_, n_pts):
         " Details: the (meta) p-value of the test of uniform distribution"\
         " of p-values is {} which is not > 0.05".format(res_pvals.pvalue)
 
-    # (2) (safety belt) check that most p-values are above 0.05
+    # (2) (safety belt) check that 90% of p-values are above 0.05
     min_10pct_pval = np.percentile(ks_pvals, q=10)
     # lower 10th quantile pvalue <= 0.05 means that the test rejects the
     # null hypothesis that the sample came from the uniform distribution

From 8d88b427be981e198e2e9c407acdf03da6fb5aa4 Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Fri, 26 Jun 2020 12:51:22 -0500
Subject: [PATCH 21/22] Update test_bounds.py

---
 sklearn/svm/tests/test_bounds.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index b7075c89544f2..977bbbfcc6932 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -123,19 +123,18 @@ def test_newrand_bounded_rand_int(range_, n_pts):
     # (1) check uniformity of p-values
     uniform_p_vals_dist = stats.uniform(loc=0, scale=1)
     res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf)
-    assert res_pvals.pvalue > 0.05,\
-        "Null hypothesis rejected: generated random numbers are not uniform."\
-        " Details: the (meta) p-value of the test of uniform distribution"\
-        " of p-values is {} which is not > 0.05".format(res_pvals.pvalue)
+    assert res_pvals.pvalue > 0.05, (
+        "Null hypothesis rejected: generated random numbers are not uniform."
+        " Details: the (meta) p-value of the test of uniform distribution"
+        f" of p-values is {res_pvals.pvalue} which is not > 0.05")
 
     # (2) (safety belt) check that 90% of p-values are above 0.05
     min_10pct_pval = np.percentile(ks_pvals, q=10)
     # lower 10th quantile pvalue <= 0.05 means that the test rejects the
     # null hypothesis that the sample came from the uniform distribution
-    assert(min_10pct_pval > 0.05),\
-        "Null hypothesis rejected: generated random numbers are not uniform." \
-        " Details: lower 10th quantile p-value of {} not > 0.05."\
-        .format(min_10pct_pval)
+    assert min_10pct_pval > 0.05, (
+        "Null hypothesis rejected: generated random numbers are not uniform."
+        f" Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05.")
 
 
 @pytest.mark.parametrize('range_',

From f62ed4a553f015739749f4127995a9b73c4c5f8e Mon Sep 17 00:00:00 2001
From: Justin Huber <justhube@umich.edu>
Date: Fri, 26 Jun 2020 13:48:41 -0500
Subject: [PATCH 22/22] Linting fixes

---
 sklearn/svm/tests/test_bounds.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py
index 977bbbfcc6932..423d5ed7a7fba 100644
--- a/sklearn/svm/tests/test_bounds.py
+++ b/sklearn/svm/tests/test_bounds.py
@@ -133,8 +133,9 @@ def test_newrand_bounded_rand_int(range_, n_pts):
     # lower 10th quantile pvalue <= 0.05 means that the test rejects the
     # null hypothesis that the sample came from the uniform distribution
     assert min_10pct_pval > 0.05, (
-        "Null hypothesis rejected: generated random numbers are not uniform."
-        f" Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05.")
+        "Null hypothesis rejected: generated random numbers are not uniform. "
+        f"Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05."
+        )
 
 
 @pytest.mark.parametrize('range_',