From 2a7a5612cc8d1330396840c3ae2ec395eadae665 Mon Sep 17 00:00:00 2001 From: Warren Weckesser Date: Sat, 17 Sep 2016 10:13:22 -0400 Subject: [PATCH] ENH: random: Add multivariate_hypergeometric function. --- doc/source/reference/routines.random.rst | 1 + numpy/random/mtrand/logfactorial.c | 154 ++++++++++ numpy/random/mtrand/logfactorial.h | 7 + numpy/random/mtrand/mtrand.pyx | 220 +++++++++++++- numpy/random/mtrand/mvhg_count.c | 121 ++++++++ numpy/random/mtrand/mvhg_count.h | 13 + numpy/random/mtrand/mvhg_marginals.c | 364 +++++++++++++++++++++++ numpy/random/mtrand/mvhg_marginals.h | 13 + numpy/random/mtrand/util.c | 25 ++ numpy/random/mtrand/util.h | 6 + numpy/random/setup.py | 9 +- numpy/random/tests/test_random.py | 131 +++++++- 12 files changed, 1059 insertions(+), 5 deletions(-) create mode 100644 numpy/random/mtrand/logfactorial.c create mode 100644 numpy/random/mtrand/logfactorial.h create mode 100644 numpy/random/mtrand/mvhg_count.c create mode 100644 numpy/random/mtrand/mvhg_count.h create mode 100644 numpy/random/mtrand/mvhg_marginals.c create mode 100644 numpy/random/mtrand/mvhg_marginals.h create mode 100644 numpy/random/mtrand/util.c create mode 100644 numpy/random/mtrand/util.h diff --git a/doc/source/reference/routines.random.rst b/doc/source/reference/routines.random.rst index c8b097d7d14c..dfe9ce7b23f2 100644 --- a/doc/source/reference/routines.random.rst +++ b/doc/source/reference/routines.random.rst @@ -50,6 +50,7 @@ Distributions logseries multinomial multivariate_normal + multivariate_hypergeometric negative_binomial noncentral_chisquare noncentral_f diff --git a/numpy/random/mtrand/logfactorial.c b/numpy/random/mtrand/logfactorial.c new file mode 100644 index 000000000000..9217e199ddeb --- /dev/null +++ b/numpy/random/mtrand/logfactorial.c @@ -0,0 +1,154 @@ + +#include + +/* + * logfact[k] holds log(k!) for k = 0, 1, 2, ..., 125. + */ + +static const double logfact[] = { + 0, + 0, + 0.69314718055994529, + 1.791759469228055, + 3.1780538303479458, + 4.7874917427820458, + 6.5792512120101012, + 8.5251613610654147, + 10.604602902745251, + 12.801827480081469, + 15.104412573075516, + 17.502307845873887, + 19.987214495661885, + 22.552163853123425, + 25.19122118273868, + 27.89927138384089, + 30.671860106080672, + 33.505073450136891, + 36.395445208033053, + 39.339884187199495, + 42.335616460753485, + 45.380138898476908, + 48.471181351835227, + 51.606675567764377, + 54.784729398112319, + 58.003605222980518, + 61.261701761002001, + 64.557538627006338, + 67.88974313718154, + 71.257038967168015, + 74.658236348830158, + 78.092223553315307, + 81.557959456115043, + 85.054467017581516, + 88.580827542197682, + 92.136175603687093, + 95.719694542143202, + 99.330612454787428, + 102.96819861451381, + 106.63176026064346, + 110.32063971475739, + 114.03421178146171, + 117.77188139974507, + 121.53308151543864, + 125.3172711493569, + 129.12393363912722, + 132.95257503561632, + 136.80272263732635, + 140.67392364823425, + 144.5657439463449, + 148.47776695177302, + 152.40959258449735, + 156.3608363030788, + 160.3311282166309, + 164.32011226319517, + 168.32744544842765, + 172.35279713916279, + 176.39584840699735, + 180.45629141754378, + 184.53382886144948, + 188.6281734236716, + 192.7390472878449, + 196.86618167289001, + 201.00931639928152, + 205.1681994826412, + 209.34258675253685, + 213.53224149456327, + 217.73693411395422, + 221.95644181913033, + 226.1905483237276, + 230.43904356577696, + 234.70172344281826, + 238.97838956183432, + 243.26884900298271, + 247.57291409618688, + 251.89040220972319, + 256.22113555000954, + 260.56494097186322, + 264.92164979855278, + 269.29109765101981, + 273.67312428569369, + 278.06757344036612, + 282.4742926876304, + 286.89313329542699, + 291.32395009427029, + 295.76660135076065, + 300.22094864701415, + 304.68685676566872, + 309.1641935801469, + 313.65282994987905, + 318.1526396202093, + 322.66349912672615, + 327.1852877037752, + 331.71788719692847, + 336.26118197919845, + 340.81505887079902, + 345.37940706226686, + 349.95411804077025, + 354.53908551944079, + 359.1342053695754, + 363.73937555556347, + 368.35449607240474, + 372.97946888568902, + 377.61419787391867, + 382.25858877306001, + 386.91254912321756, + 391.57598821732961, + 396.24881705179155, + 400.93094827891576, + 405.6222961611449, + 410.32277652693733, + 415.03230672824964, + 419.75080559954472, + 424.47819341825709, + 429.21439186665157, + 433.95932399501481, + 438.71291418612117, + 443.47508812091894, + 448.24577274538461, + 453.02489623849613, + 457.81238798127816, + 462.60817852687489, + 467.4121995716082, + 472.22438392698058, + 477.04466549258564, + 481.87297922988796 +}; + +/* + * Compute log(k!) + */ + +double logfactorial(unsigned long k) +{ + const double halfln2pi = 0.9189385332046728; + + if (k < sizeof(logfact)/sizeof(logfact[0])) { + /* Use the lookup table. */ + return logfact[k]; + } + + /* + * Use the Stirling series, truncated at the 1/k**3 term. + */ + return (k + 0.5)*log(k) - k + halfln2pi + (1.0/k)*(1/12.0 - 1/(360.0*k*k)); +} diff --git a/numpy/random/mtrand/logfactorial.h b/numpy/random/mtrand/logfactorial.h new file mode 100644 index 000000000000..25bd8a734b8e --- /dev/null +++ b/numpy/random/mtrand/logfactorial.h @@ -0,0 +1,7 @@ + +#ifndef LOGFACTORIAL_H +#define LOGFACTORIAL_H + +double logfactorial(unsigned long k); + +#endif diff --git a/numpy/random/mtrand/mtrand.pyx b/numpy/random/mtrand/mtrand.pyx index 21bc73e544be..077eef642064 100644 --- a/numpy/random/mtrand/mtrand.pyx +++ b/numpy/random/mtrand/mtrand.pyx @@ -37,6 +37,7 @@ cdef extern from "math.h": cdef extern from "numpy/npy_math.h": int npy_isfinite(double x) + long NPY_MAX_LONG cdef extern from "mtrand_py_helper.h": object empty_py_bytes(npy_intp length, void **bytes) @@ -84,7 +85,7 @@ cdef extern from "randomkit.h": cdef extern from "distributions.h": - # do not need the GIL, but they do need a lock on the state !! */ + # do not need the GIL, but they do need a lock on the state !! double rk_normal(rk_state *state, double loc, double scale) nogil double rk_standard_exponential(rk_state *state) nogil @@ -123,6 +124,32 @@ cdef extern from "distributions.h": long rk_hypergeometric(rk_state *state, long good, long bad, long sample) nogil long rk_logseries(rk_state *state, double p) nogil + +cdef extern from "util.h": + + long safe_sum_nonneg_long(npy_intp num_colors, long *colors) nogil + + +cdef extern from "mvhg_count.h": + + # This functions need a lock on the state: + int mvhg_count(rk_state *state, + long total, + npy_intp num_colors, long *colors, + long nsample, + npy_intp num_sample, long *sample) nogil + + +cdef extern from "mvhg_marginals.h": + + # This functions need a lock on the state: + int mvhg_marginals(rk_state *state, + long total, + npy_intp num_colors, long *colors, + long nsample, + npy_intp num_sample, long *sample) nogil + + ctypedef double (* rk_cont0)(rk_state *state) nogil ctypedef double (* rk_cont1)(rk_state *state, double a) nogil ctypedef double (* rk_cont2)(rk_state *state, double a, double b) nogil @@ -4193,6 +4220,8 @@ cdef class RandomState: -------- scipy.stats.hypergeom : probability density function, distribution or cumulative density function, etc. + multivariate_hypergeometric : Draw samples from the multivariate + hypergeometric distribution Notes ----- @@ -4651,6 +4680,194 @@ cdef class RandomState: return multin + def multivariate_hypergeometric(self, object colors, npy_intp nsample, + size=None, method='marginals'): + """ + multivariate_hypergeometric(colors, nsample, size=None, + method='marginals') + + Draw samples from a multivariate hypergeometric distribution. + + The multivariate hypergeometric distribution is a generalization + of the hypergeometric distribution. + + Choose ``nsample`` items at random without replacement from a + collection with ``k`` distinct types. ``k`` is the length of + ``colors``, and the values in ``colors`` are the number of + occurrences of that type in the collection. + + The name ``colors`` comes from a common description of the + distribution: it is the probability distribution of the number of + marbles of each color selected without replacement from an urn + containing marbles of different colors; ``colors[i]`` is the number + of marbles in the urn with color ``i``. + + Parameters + ---------- + colors : sequence of integers + The number of each type of item in the collection from which + a sample is drawn. The values in ``colors`` must be nonnegative. + To avoid loss of precision in the algorithm, ``sum(colors)`` + must be less than 1000000000 when `method` is "marginals". + nsample : int + The number of items selected. ``nsample`` must not be greater + than ``sum(colors)``. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, + then ``m * n * k`` samples are drawn, and the return value has + shape ``(m, n, k, len(colors))``. If `size` is an integer, + the output has shape ``(size, len(colors))``. Default is None, + in which case a single sample is returned as an array with shape + ``(len(colors),)``. + method : string, optional + Specify the algorithm that is used to generate the sample. + Must be 'count' or 'marginals' (the default). See the Notes + for a description of the methods. + + Returns + ------- + sample : ndarray + Sample drawn from the multivariate hypergeometric distribution. + + See Also + -------- + hypergeometric : Draw samples from the (univariate) hypergeometric + distribution. + + Notes + ----- + The two methods do not return the same sequence of samples. + + The "count" algorithm is roughly equivalent to the following numpy + code:: + + choices = np.repeat(np.arange(len(colors)), colors) + selection = np.random.choice(choices, nsample, replace=False) + sample = np.bincount(selection, minlength=len(colors)) + + The "count" algorithm uses a temporary array of integers with length + ``sum(colors)``. + + The "marginals" algorithm generates a single sample by using repeated + calls to the univariate hypergeometric sampler. It is roughly + equivalent to:: + + sample = np.zeros(len(colors), dtype=np.int) + # `remaining` is the cumulative sum of `colors` from the last + # element to the first; e.g. if `colors` is [3, 1, 5], then + # `remaining` is [9, 6, 5]. + remaining = np.cumsum(colors[::-1])[::-1] + for i in range(len(colors)-1): + if nsample < 1: + break + sample[i] = np.random.hypergeometric(colors[i], remaining[i+1], + nsample) + nsample -= sample[i] + sample[-1] = nsample + + The default method is "marginals". For some cases (e.g. when + `colors` contains relatively small integers), the "count" method + can be significantly faster than the "marginals" method. If + performance of the algorithm is important, test the two methods + with typical inputs to decide which works best. + + .. versionadded:: 1.16.0 + + Examples + -------- + >>> colors = [16, 8, 4] + >>> np.random.seed(123) + >>> np.random.multivariate_hypergeometric(colors, 6) + array([4, 1, 1]) + >>> np.random.multivariate_hypergeometric(colors, 6, size=3) + array([[3, 1, 2], + [3, 2, 1], + [2, 3, 1]]) + >>> np.random.multivariate_hypergeometric(colors, 6, size=(2, 2)) + array([[[4, 1, 1], + [4, 2, 0]], + [[3, 1, 2], + [4, 2, 0]]]) + + """ + cdef npy_intp num_colors, sample_size + cdef long total + cdef long *colors_ptr + cdef int result + + if method not in ['count', 'marginals']: + raise ValueError('method must be "count" or "marginals".') + + if nsample < 0: + raise ValueError("nsample must be nonnegative.") + + # Validation of colors, a 1-d sequence of nonnegative long integers. + invalid_colors = False + try: + colors = np.asarray(colors) + if colors.ndim != 1: + invalid_colors = True + elif colors.size > 0 and not np.issubdtype(colors.dtype, + np.integer): + invalid_colors = True + elif np.any((colors < 0) | (colors > NPY_MAX_LONG)): + invalid_colors = True + except ValueError: + invalid_colors = True + if invalid_colors: + raise ValueError('colors must be a one-dimensional sequence ' + 'of nonnegative integers less than %d.' % + NPY_MAX_LONG) + + colors = np.ascontiguousarray(colors, dtype=np.int_) + num_colors = colors.size + + colors_ptr = PyArray_DATA(colors) + + total = safe_sum_nonneg_long(num_colors, colors_ptr) + if total == -1: + raise ValueError("sum(colors) must not exceed the maximum value " + "of a long integer (%d)" % NPY_MAX_LONG) + + if method == 'marginals' and total >= 1000000000: + raise ValueError('When method is "marginals", sum(colors) must ' + 'be less than 1000000000.') + + # The C code that implements the 'count' method will malloc an + # array of size total*sizeof(npy_intp). Here we ensure that that + # product does not overflow. + max_long_index = NPY_MAX_LONG / sizeof(npy_intp) + if method == 'count' and total > max_long_index: + raise ValueError("When method is 'count', sum(colors) must not " + "exceed %d" % max_long_index) + if nsample > total: + raise ValueError("nsample > sum(colors)") + + shape = _shape_from_size(size, num_colors) + if num_colors == 0: + return np.empty(shape, dtype=np.int_) + + sample = np.zeros(shape, dtype=np.int_) + sample_ptr = PyArray_DATA(sample) + sample_size = sample.size + + if method == 'count': + with self.lock, nogil: + result = mvhg_count(self.internal_state, total, + num_colors, colors_ptr, nsample, + sample_size, sample_ptr) + if result == -1: + raise MemoryError("Insufficent memory for multivariate_" + "hypergeometric with method='count' and " + "sum(colors)=%d" % total) + else: + with self.lock, nogil: + result = mvhg_marginals(self.internal_state, total, + num_colors, colors_ptr, nsample, + sample_size, sample_ptr) + return sample + + def dirichlet(self, object alpha, size=None): """ dirichlet(alpha, size=None) @@ -4965,6 +5182,7 @@ logseries = _rand.logseries multivariate_normal = _rand.multivariate_normal multinomial = _rand.multinomial +multivariate_hypergeometric = _rand.multivariate_hypergeometric dirichlet = _rand.dirichlet shuffle = _rand.shuffle diff --git a/numpy/random/mtrand/mvhg_count.c b/numpy/random/mtrand/mvhg_count.c new file mode 100644 index 000000000000..08e6b8d228e0 --- /dev/null +++ b/numpy/random/mtrand/mvhg_count.c @@ -0,0 +1,121 @@ +#include "randomkit.h" +#include + + +/* + * mvhg_count + * + * Draw variates from the multivariate hypergeometric distribution-- + * the "count" algorithm. + * + * Parameters + * ---------- + * rk_state *state + * Pointer to a randomkit `rk_state` instance. + * long total + * The sum of the values in the array `colors`. (This is redundant + * information, but we know the caller has already computed it, so + * we might as well use it.) + * size_t num_colors + * The length of the `colors` array. + * long *colors + * The array of colors (i.e. the number of each type in the collection + * from which the random variate is drawn). + * long nsample + * The number of objects drawn without replacement for each variate. + * `nsample` must not exceed sum(colors). This condition is not checked; + * it is assumed that the caller has already validated the value. + * size_t num_sample + * The number of random variates to generate. (This is not the size of + * the array `sample`, because each variate requires `num_colors` + * values.) + * long *sample + * The array that will hold the result. The length of this array must + * be the product of `num_colors` and `num_sample`. + * The array is not initialized; it is expected that array has been + * initialized with zeros when the function is called. + * + * Notes + * ----- + * The "count" algorithm for drawing one variate is roughly equivalent to the + * following numpy code: + * + * choices = np.repeat(np.arange(len(colors)), colors) + * selection = np.random.choice(choices, nsample, replace=False) + * sample = np.bincount(selection, minlength=len(colors)) + * + * This function uses a temporary array with length sum(colors). + * + * On input, it is assumed that: + * * num_colors > 0 + * * colors[i] >= 0 + * * nsample <= sum(colors) + * + */ + +int mvhg_count(rk_state *state, + long total, + npy_intp num_colors, long *colors, + long nsample, + npy_intp num_sample, long *sample) +{ + npy_intp i, j, k; + npy_intp *choices; + int more_than_half; + + choices = malloc(total * (sizeof *choices)); + if (choices == NULL) { + return -1; + } + + /* + * If colors contains, for example, [3 2 5], then choices + * will contain [0 0 0 1 1 2 2 2 2 2]. + */ + k = 0; + for (i = 0; i < num_colors; ++i) { + for (j = 0; j < colors[i]; ++j) { + choices[k] = i; + ++k; + } + } + + more_than_half = nsample > (total / 2); + if (more_than_half) { + nsample = total - nsample; + } + + for (i = 0; i < num_sample; i += num_colors) { + /* + * Fisher-Yates shuffle, but only loop through the first + * `nsample` entries of `choices`. After the loop, + * choices[:nsample] contains a random sample from the + * the full array. + */ + for (j = 0; j < nsample; ++j) { + npy_intp tmp; + k = j + rk_interval(total - j - 1, state); + tmp = choices[k]; + choices[k] = choices[j]; + choices[j] = tmp; + } + /* + * Count the number of occurrences of each value in choices[:nsample]. + * The result, stored in sample[i:i+num_colors], is the sample from + * the multivariate hypergeometric distribution. + */ + for (j = 0; j < nsample; ++j) { + sample[i + choices[j]] += 1; + } + + if (more_than_half) { + for (j = 0; j < num_colors; ++j) { + sample[i + j] = colors[j] - sample[i + j]; + } + } + } + + free(choices); + + return 0; +} diff --git a/numpy/random/mtrand/mvhg_count.h b/numpy/random/mtrand/mvhg_count.h new file mode 100644 index 000000000000..f9ef5e2dc001 --- /dev/null +++ b/numpy/random/mtrand/mvhg_count.h @@ -0,0 +1,13 @@ + +#ifndef MVHG_COUNT_H +#define MVHG_COUNT_H + +#include "randomkit.h" + +int mvhg_count(rk_state *state, + long total, + npy_intp num_colors, long *colors, + long nsample, + npy_intp num_sample, long *sample); + +#endif diff --git a/numpy/random/mtrand/mvhg_marginals.c b/numpy/random/mtrand/mvhg_marginals.c new file mode 100644 index 000000000000..950160506b34 --- /dev/null +++ b/numpy/random/mtrand/mvhg_marginals.c @@ -0,0 +1,364 @@ + +#include "randomkit.h" +#include "logfactorial.h" +#include + +#ifndef min +#define min(x,y) ((xy)?x:y) +#endif + + +/* + * This is an alternative to rk_hypergeometric_hyp(), implemented + * without using floating point. + * + * It is assumed that when this function is called: + * * good, bad and sample are nonnegative; + * * the sum good+bad will not result in overflow; + * * sample <= good+bad. + */ + +static long hypergeometric_sample(rk_state *state, + long good, long bad, long sample) +{ + long remaining_total, remaining_good, result, computed_sample; + long total = good + bad; + + if (sample > total/2) { + computed_sample = total - sample; + } + else { + computed_sample = sample; + } + + remaining_total = total; + remaining_good = good; + + while ((computed_sample > 0) && (remaining_good > 0) && + (remaining_total > remaining_good)) { + /* + * rk_interval(max, state) returns an integer in [0, max] *inclusive*, + * so we decrement remaining_total before passing it to rk_interval(). + */ + --remaining_total; + if ((long) rk_interval(remaining_total, state) < remaining_good) { + /* Selected a "good" one, so decrement remaining_good. */ + --remaining_good; + } + --computed_sample; + } + + if (remaining_total == remaining_good) { + /* Only "good" choices are left. */ + remaining_good -= computed_sample; + } + + if (sample > total/2) { + result = remaining_good; + } + else { + result = good - remaining_good; + } + + return result; +} + + +/* D1 = 2*sqrt(2/e) */ +/* D2 = 3 - 2*sqrt(3/e) */ +#define D1 1.7155277699214135 +#define D2 0.8989161620588988 + +/* + * Generate variates from the hypergeometric distribution + * using the ratio-of-uniforms method. + * + * This is an alternative to rk_hypergeometric_hrua. + * It fixes a mistake in that code, and uses logfactorial() + * instead of loggam(). + * + * Variables have been renamed to match the original source. + * In the code, the variable names a, b, c, g, h, m, p, q, K, T, + * U and X match the names used in "Algorithm HRUA" beginning on + * page 82 of Stadlober's 1989 thesis. + * + * It is assumed that when this function is called: + * * good, bad and sample are nonnegative; + * * the sum good+bad will not result in overflow; + * * sample <= good+bad. + * + * References: + * - Ernst Stadlober's thesis "Sampling from Poisson, Binomial and + * Hypergeometric Distributions: Ratio of Uniforms as a Simple and + * Fast Alternative" (1989) + * - Ernst Stadlober, "The ratio of uniforms approach for generating + * discrete random variates", Journal of Computational and Applied + * Mathematics, 31, pp. 181-189 (1990). + */ + +static long hypergeometric_hrua(rk_state *state, + long good, long bad, long sample) +{ + long mingoodbad, maxgoodbad, popsize; + long computed_sample; + double p, q; + double mu, var; + double a, c, b, h, g; + long m, K; + + popsize = good + bad; + computed_sample = min(sample, popsize - sample); + mingoodbad = min(good, bad); + maxgoodbad = max(good, bad); + + /* + * Variables that do not match Stadlober (1989) + * Here Stadlober + * ---------------- --------- + * mingoodbad M + * popsize N + * computed_sample n + */ + + p = ((double) mingoodbad) / popsize; + q = ((double) maxgoodbad) / popsize; + + /* mu is the mean of the distribution. */ + mu = computed_sample * p; + + a = mu + 0.5; + + /* var is the variance of the distribution. */ + var = ((double)(popsize - computed_sample) * + computed_sample * p * q / (popsize - 1)); + + c = sqrt(var + 0.5); + + /* + * h is 2*s_hat (See Stadlober's theses (1989), Eq. (5.17); or + * Stadlober (1990), Eq. 8). s_hat is the scale of the "table mountain" + * function that dominates the scaled hypergeometric PMF ("scaled" means + * normalized to have a maximum value of 1). + */ + h = D1*c + D2; + + m = (long)floor((double)(computed_sample + 1) * (mingoodbad + 1) / + (popsize + 2)); + + g = (logfactorial(m) + + logfactorial(mingoodbad - m) + + logfactorial(computed_sample - m) + + logfactorial(maxgoodbad - computed_sample + m)); + + /* + * b is the upper bound for random samples: + * ... min(computed_sample, mingoodbad) + 1 is the length of the support. + * ... floor(a + 16*c) is 16 standard deviations beyond the mean. + * + * The idea behind the second upper bound is that values that far out in + * the tail have negligible probabilities. + * + * There is a comment in a previous version of this algorithm that says + * "16 for 16-decimal-digit precision in D1 and D2", + * but there is no documented justification for this value. A lower value + * might work just as well, but I've kept the value 16 here. + */ + b = min(min(computed_sample, mingoodbad) + 1.0, floor(a + 16*c)); + + while (1) { + double U, V, X, T; + double gp; + U = rk_double(state); + V = rk_double(state); /* "U star" in Stadlober (1989) */ + X = a + h*(V - 0.5) / U; + + /* fast rejection: */ + if ((X < 0.0) || (X >= b)) { + continue; + } + + K = (long)floor(X); + + gp = (logfactorial(K) + + logfactorial(mingoodbad - K) + + logfactorial(computed_sample - K) + + logfactorial(maxgoodbad - computed_sample + K)); + + T = g - gp; + + /* fast acceptance: */ + if ((U*(4.0 - U) - 3.0) <= T) { + break; + } + + /* fast rejection: */ + if (U*(U - T) >= 1) { + continue; + } + + if (2.0*log(U) <= T) { + /* acceptance */ + break; + } + } + + if (good > bad) { + K = computed_sample - K; + } + + if (computed_sample < sample) { + K = good - K; + } + + return K; +} + + +/* + * Draw a sample from the hypergeometric distribution. + * + * It is assumed that when this function is called: + * * good, bad and sample are nonnegative; + * * the sum good+bad will not result in overflow; + * * sample <= good+bad. + */ + +static long hypergeometric(rk_state *state, + long good, long bad, long sample) +{ + long r; + + if ((sample >= 10) && (sample <= good + bad - 10)) { + /* This will use the ratio-of-uniforms method. */ + r = hypergeometric_hrua(state, good, bad, sample); + } + else { + /* The simpler implementation is faster for small samples. */ + r = hypergeometric_sample(state, good, bad, sample); + } + return r; +} + + +/* + * mvhg_marginals + * + * Draw samples from the multivariate hypergeometric distribution-- + * the "marginals" algorithm. + * + * This version generates the sample by iteratively calling + * hypergeometric(). + * + * Parameters + * ---------- + * rk_state *state + * Pointer to a randomkit `rk_state` instance. + * long total + * The sum of the values in the array `colors`. (This is redundant + * information, but we know the caller has already computed it, so + * we might as well use it.) + * size_t num_colors + * The length of the `colors` array. + * long *colors + * The array of colors (i.e. the number of each type in the collection + * from which the random variate is drawn). + * long nsample + * The number of objects drawn without replacement for each variate. + * `nsample` must not exceed sum(colors). This condition is not checked; + * it is assumed that the caller has already validated the value. + * size_t num_sample + * The number of random variates to generate. (This is not the size of + * the array `sample`, because each variate requires `num_colors` + * values.) + * long *sample + * The array that will hold the result. The length of this array must + * be the product of `num_colors` and `num_sample`. + * The array is not initialized; it is expected that array has been + * initialized with zeros when the function is called. + * + * Notes + * ----- + * Here's an example that demonstrates the idea of this algorithm. + * + * Suppose the urn contains red, green, blue and yellow marbles. + * Let nred be the number of red marbles, and define the quantities for + * the other colors similarly. The total number of marbles is + * + * total = nred + ngreen + nblue + nyellow. + * + * To generate a sample using rk_hypergeometric: + * + * red_sample = rk_hypergeometric(ngood=nred, nbad=total - nred, + * nsample=nsample) + * + * This gives us the number of red marbles in the sample. The number of + * marbles in the sample that are *not* red is nsample - red_sample. + * To figure out the distribution of those marbles, we again use + * rk_hypergeometric: + * + * green_sample = rk_hypergeometric(ngood=ngreen, + * nbad=total - nred - ngreen, + * nsample=nsample - red_sample) + * + * Similarly, + * + * blue_sample = rk_hypergeometric( + * ngood=nblue, + * nbad=total - nred - ngreen - nblue, + * nsample=nsample - red_sample - green_sample) + * + * Finally, + * + * yellow_sample = total - (red_sample + green_sample + blue_sample). + * + * The above sequence of steps is implemented as a loop for an arbitrary + * number of colors in the innermost loop in the code below. `remaining` + * is the value passed to `nbad`; it is `total - colors[0]` in the first + * call to rk_hypergeometric(), and then decreases by `colors[j]` in + * each iteration. `num_to_sample` is the `nsample` argument. It + * starts at this function's `nsample` input, and is decreased by the + * result of the call to rk_hypergeometric() in each iteration. + */ + +int mvhg_marginals(rk_state *state, + long total, + npy_intp num_colors, long *colors, + long nsample, + npy_intp num_sample, long *sample) +{ + npy_intp i; + int more_than_half; + + more_than_half = nsample > (total / 2); + if (more_than_half) { + nsample = total - nsample; + } + + for (i = 0; i < num_sample; i += num_colors) { + npy_intp num_to_sample, remaining, j; + num_to_sample = nsample; + remaining = total; + for (j = 0; j < num_colors - 1; ++j) { + long r; + if (num_to_sample <= 0) { + break; + } + remaining -= colors[j]; + r = hypergeometric(state, colors[j], remaining, num_to_sample); + sample[i + j] = r; + num_to_sample -= r; + } + + if (j == num_colors - 1) { + sample[i + j] = num_to_sample; + } + + if (more_than_half) { + for (j = 0; j < num_colors; ++j) { + sample[i + j] = colors[j] - sample[i + j]; + } + } + } + return 0; +} diff --git a/numpy/random/mtrand/mvhg_marginals.h b/numpy/random/mtrand/mvhg_marginals.h new file mode 100644 index 000000000000..37d3796bcfb7 --- /dev/null +++ b/numpy/random/mtrand/mvhg_marginals.h @@ -0,0 +1,13 @@ + +#ifndef MVHG_MARGINALS_H +#define MVHG_MARGINALS_H + +#include "randomkit.h" + +int mvhg_marginals(rk_state *state, + long total, + npy_intp num_colors, long *colors, + long nsample, + npy_intp num_sample, long *sample); + +#endif diff --git a/numpy/random/mtrand/util.c b/numpy/random/mtrand/util.c new file mode 100644 index 000000000000..5ffaff57c0f1 --- /dev/null +++ b/numpy/random/mtrand/util.c @@ -0,0 +1,25 @@ + +#include + + +/* + * Sum the values in the array `colors`. + * Return -1 if an overflow occurs. + * + * The values in *colors are assumed to be nonnegative. + */ + +long safe_sum_nonneg_long(npy_intp num_colors, long *colors) +{ + npy_intp i; + long sum; + + sum = 0; + for (i = 0; i < num_colors; ++i) { + if (colors[i] > NPY_MAX_LONG - sum) { + return -1; + } + sum += colors[i]; + } + return sum; +} diff --git a/numpy/random/mtrand/util.h b/numpy/random/mtrand/util.h new file mode 100644 index 000000000000..d7b53fbb8073 --- /dev/null +++ b/numpy/random/mtrand/util.h @@ -0,0 +1,6 @@ +#ifndef UTIL_H +#include + +long safe_sum_nonneg_long(npy_intp num_colors, long *colors); + +#endif diff --git a/numpy/random/setup.py b/numpy/random/setup.py index 394a70ead371..9b38d7b0b859 100644 --- a/numpy/random/setup.py +++ b/numpy/random/setup.py @@ -42,9 +42,12 @@ def generate_libraries(ext, build_dir): libs = [] # Configure mtrand config.add_extension('mtrand', - sources=[join('mtrand', x) for x in - ['mtrand.c', 'randomkit.c', 'initarray.c', - 'distributions.c']]+[generate_libraries], + sources=([join('mtrand', x) for x in + ['mtrand.c', 'randomkit.c', 'initarray.c', + 'distributions.c', + 'mvhg_count.c', 'mvhg_marginals.c', + 'logfactorial.c', 'util.c']] + + [generate_libraries]), libraries=libs, depends=[join('mtrand', '*.h'), join('mtrand', '*.pyx'), diff --git a/numpy/random/tests/test_random.py b/numpy/random/tests/test_random.py index d0bb92a73895..7422cf46030b 100644 --- a/numpy/random/tests/test_random.py +++ b/numpy/random/tests/test_random.py @@ -5,7 +5,7 @@ from numpy.testing import ( assert_, assert_raises, assert_equal, assert_warns, assert_no_warnings, assert_array_equal, assert_array_almost_equal, - suppress_warnings + assert_allclose, suppress_warnings ) from numpy import random import sys @@ -955,6 +955,135 @@ def test_zipf(self): assert_array_equal(actual, desired) +class TestMultivariateHypergeometric(object): + + def test_repeatability(self): + np.random.seed(1234567890) + sample = np.random.multivariate_hypergeometric([3, 4, 5], 5, size=5, + method='count') + expected = np.array([[2, 2, 1], + [0, 3, 2], + [2, 2, 1], + [1, 1, 3], + [1, 1, 3]]) + assert_array_equal(sample, expected) + + np.random.seed(1234567890) + sample = np.random.multivariate_hypergeometric([20, 30, 50], 50, + size=5, + method='marginals') + expected = np.array([[11, 18, 21], + [10, 17, 23], + [ 6, 15, 29], + [11, 16, 23], + [ 7, 10, 33]]) + assert_array_equal(sample, expected) + + np.random.seed(1234567890) + sample = np.random.multivariate_hypergeometric([20, 30, 50], 12, + size=5, + method='marginals') + expected = np.array([[3, 4, 5], + [2, 3, 7], + [0, 5, 7], + [1, 4, 7], + [3, 4, 5]]) + assert_array_equal(sample, expected) + + def test_argument_validation(self): + # Error cases... + + # `colors` must be a 1-d sequence + assert_raises(ValueError, np.random.multivariate_hypergeometric, + 10, 4) + + # Negative nsample + assert_raises(ValueError, np.random.multivariate_hypergeometric, + [2, 3, 4], -1) + + # Negative color + assert_raises(ValueError, np.random.multivariate_hypergeometric, + [-1, 2, 3], 2) + + # nsample exceeds sum(colors) + assert_raises(ValueError, np.random.multivariate_hypergeometric, + [2, 3, 4], 10) + + # nsample exceeds sum(colors) (edge case of empty colors) + assert_raises(ValueError, np.random.multivariate_hypergeometric, + [], 1) + + # Validation errors associated with very large values in colors. + assert_raises(ValueError, np.random.multivariate_hypergeometric, + [999999999, 101], 5, 1, 'marginals') + long_info = np.iinfo(np.int_) + max_long = long_info.max + max_long_index = max_long // long_info.dtype.itemsize + assert_raises(ValueError, np.random.multivariate_hypergeometric, + [max_long_index - 100, 101], 5, 1, 'count') + + def test_edge_cases(self): + for method in ['count', 'marginals']: + sample = np.random.multivariate_hypergeometric([0, 0, 0], 0, + method=method) + assert_array_equal(sample, [0, 0, 0]) + + sample = np.random.multivariate_hypergeometric([], 0, + method=method) + assert_array_equal(sample, []) + + sample = np.random.multivariate_hypergeometric([], 0, size=1, + method=method) + assert_array_equal(sample, np.empty((1, 0), dtype=np.int_)) + + sample = np.random.multivariate_hypergeometric([1, 2, 3], 0, + method=method) + assert_array_equal(sample, [0, 0, 0]) + + sample = np.random.multivariate_hypergeometric([9, 0, 0], 3, + method=method) + assert_array_equal(sample, [3, 0, 0]) + + sample = np.random.multivariate_hypergeometric([1, 1, 0, 1, 1], 4, + method=method) + assert_array_equal(sample, [1, 1, 0, 1, 1]) + + sample = np.random.multivariate_hypergeometric([3, 4, 5], 12, + size=3, + method=method) + assert_array_equal(sample, [[3, 4, 5]]*3) + + def test_typical_cases(self): + mvhypergeom = np.random.multivariate_hypergeometric + np.random.seed(20920107) + colors = np.array([10, 5, 20, 25]) + # Cases for nsample: + # nsample < 10 + # 10 <= nsample < colors.sum()/2 + # colors.sum()/2 < nsample < colors.sum() - 10 + # colors.sum() - 10 < nsample < colors.sum() + for nsample in [8, 25, 45, 55]: + for method in ['count', 'marginals']: + for size in [5, (2, 3), 100000]: + sample = mvhypergeom(colors, nsample, size, method=method) + if isinstance(size, int): + expected_shape = (size,) + colors.shape + else: + expected_shape = size + colors.shape + assert_equal(sample.shape, expected_shape) + assert_((sample >= 0).all()) + assert_((sample <= colors).all()) + assert_array_equal(sample.sum(axis=-1), + np.full(size, fill_value=nsample, + dtype=int)) + if isinstance(size, int) and size >= 100000: + # This sample is large enough to compare its mean to + # the expected values. + assert_allclose(sample.mean(axis=0), + nsample * colors / colors.sum(), + rtol=1e-3, atol=0.005) + + class TestBroadcast(object): # tests that functions that broadcast behave # correctly when presented with non-scalar arguments