Skip to content

Python wrapper and tests for sklearn/svm/newrand/ #17157

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
Jun 28, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions sklearn/svm/_newrand.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""
Wrapper for newrand.h

"""

cdef extern from "newrand.h":
void set_seed(unsigned int)
unsigned int bounded_rand_int(unsigned int)

def set_seed_wrap(unsigned int custom_seed):
set_seed(custom_seed)

def bounded_rand_int_wrap(unsigned int range_):
return bounded_rand_int(range_)
9 changes: 9 additions & 0 deletions sklearn/svm/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,15 @@ def configuration(parent_package='', top_path=None):

config.add_subpackage('tests')

# newrand wrappers
config.add_extension('_newrand',
sources=['_newrand.pyx'],
include_dirs=[numpy.get_include(),
join('src', 'newrand')],
depends=[join('src', 'newrand', 'newrand.h')],
language='c++',
)

# Section LibSVM

# we compile both libsvm and libsvm_sparse
Expand Down
17 changes: 4 additions & 13 deletions sklearn/svm/src/newrand/newrand.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,43 +5,34 @@
libsvm and liblinear.
Sylvain Marie, Schneider Electric
See <https://github.com/scikit-learn/scikit-learn/pull/13511#issuecomment-481729756>

*/
#ifndef _NEWRAND_H
#define _NEWRAND_H

#ifdef __cplusplus
#include <random> // needed for cython to generate a .cpp file from newrand.h
extern "C" {
#endif

// Scikit-Learn-specific random number generator replacing `rand()` originally
// used in LibSVM / LibLinear, to ensure the same behaviour on windows-linux,
// with increased speed
// - (1) Init a `mt_rand` object
#if INT_MAX == 0x7FFFFFFF
std::mt19937 mt_rand(std::mt19937::default_seed);
#elif INT_MAX == 0x7FFFFFFFFFFFFFFF
std::mt19937_64 mt_rand(std::mt19937::default_seed);
#else
info("Random number generator is not fixed for this system. Please report issue. INT_MAX=%d\n", INT_MAX);
exit(1);
#endif

// - (2) public `set_seed()` function that should be used instead of `srand()` to set a new seed.
void set_seed(unsigned custom_seed) {
mt_rand.seed(custom_seed);
}

// - (3) New internal `bounded_rand_int` function, used instead of rand() everywhere.
inline int bounded_rand_int(int orig_range) {
// "LibSVM / LibLinear Original way" - make a 31bit or 63bit positive
inline uint32_t bounded_rand_int(uint32_t range) {
// "LibSVM / LibLinear Original way" - make a 31bit positive
// random number and use modulo to make it fit in the range
// return abs( (int)mt_rand()) % orig_range;
// return abs( (int)mt_rand()) % range;

// "Better way": tweaked Lemire post-processor
// from http://www.pcg-random.org/posts/bounded-rands.html
// TODO how could we make this casting safer, raising an error if lost information?
uint32_t range = uint32_t(orig_range);
uint32_t x = mt_rand();
Comment on lines 35 to 36
Copy link
Contributor

@smarie smarie Jun 25, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't you think that it would be safer to keep this ? Indeed bounded_rand_int is marked as inline so I guess (but I'm no C++ expert) that the input can actually be anything, there is absolutely no protection. At least having this cast would ensure that when this code is badly used directly from C++ there are no side effects (from python that's already safe thanks to the Cython wrapper). What do you think ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that the input to bounded_rand_int is uint32_t, I think this extra step is redundant (casting uint32_t to uint32_t)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok !

uint64_t m = uint64_t(x) * uint64_t(range);
uint32_t l = uint32_t(m);
Expand Down
70 changes: 70 additions & 0 deletions sklearn/svm/tests/test_bounds.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import numpy as np
from scipy import sparse as sp
from scipy import stats

import pytest

from sklearn.svm._bounds import l1_min_c
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.svm._newrand import set_seed_wrap, bounded_rand_int_wrap

from sklearn.utils._testing import assert_raise_message

Expand Down Expand Up @@ -74,3 +76,71 @@ def test_ill_posed_min_c():
def test_unsupported_loss():
with pytest.raises(ValueError):
l1_min_c(dense_X, Y1, loss='l1')


_MAX_UNSIGNED_INT = 4294967295


@pytest.mark.parametrize('seed, val',
[(None, 81),
(0, 54),
(_MAX_UNSIGNED_INT, 9)])
def test_newrand_set_seed(seed, val):
"""Test that `set_seed` produces deterministic results"""
if seed is not None:
set_seed_wrap(seed)
x = bounded_rand_int_wrap(100)
assert x == val, f'Expected {val} but got {x} instead'


@pytest.mark.parametrize('seed',
[-1, _MAX_UNSIGNED_INT + 1])
def test_newrand_set_seed_overflow(seed):
"""Test that `set_seed_wrap` is defined for unsigned 32bits ints"""
with pytest.raises(OverflowError):
set_seed_wrap(seed)


@pytest.mark.parametrize('range_, n_pts',
[(_MAX_UNSIGNED_INT, 10000), (100, 25)])
def test_newrand_bounded_rand_int(range_, n_pts):
"""Test that `bounded_rand_int` follows a uniform distribution"""
n_iter = 100
ks_pvals = []
uniform_dist = stats.uniform(loc=0, scale=range_)
# perform multiple samplings to make chance of outlier sampling negligible
for _ in range(n_iter):
# Deterministic random sampling
sample = [bounded_rand_int_wrap(range_) for _ in range(n_pts)]
res = stats.kstest(sample, uniform_dist.cdf)
ks_pvals.append(res.pvalue)
# Null hypothesis = samples come from an uniform distribution.
# Under the null hypothesis, p-values should be uniformly distributed
# and not concentrated on low values
# (this may seem counter-intuitive but is backed by multiple refs)
# So we can do two checks:

# (1) check uniformity of p-values
uniform_p_vals_dist = stats.uniform(loc=0, scale=1)
res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf)
assert res_pvals.pvalue > 0.05, (
"Null hypothesis rejected: generated random numbers are not uniform."
" Details: the (meta) p-value of the test of uniform distribution"
f" of p-values is {res_pvals.pvalue} which is not > 0.05")

# (2) (safety belt) check that 90% of p-values are above 0.05
min_10pct_pval = np.percentile(ks_pvals, q=10)
# lower 10th quantile pvalue <= 0.05 means that the test rejects the
# null hypothesis that the sample came from the uniform distribution
assert min_10pct_pval > 0.05, (
"Null hypothesis rejected: generated random numbers are not uniform. "
f"Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05."
)


@pytest.mark.parametrize('range_',
[-1, _MAX_UNSIGNED_INT + 1])
def test_newrand_bounded_rand_int_limits(range_):
"""Test that `bounded_rand_int_wrap` is defined for unsigned 32bits ints"""
with pytest.raises(OverflowError):
bounded_rand_int_wrap(range_)