Skip to content

ENH: random: Add the multivariate hypergeometric distribution. #13794

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 20, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/release/upcoming_changes/13794.new_function.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Multivariate hypergeometric distribution added to `numpy.random`
----------------------------------------------------------------
The method `multivariate_hypergeometric` has been added to the class
`numpy.random.Generator`. This method generates random variates from
the multivariate hypergeometric probability distribution.
1 change: 1 addition & 0 deletions doc/source/reference/random/generator.rst
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ Distributions
~numpy.random.Generator.lognormal
~numpy.random.Generator.logseries
~numpy.random.Generator.multinomial
~numpy.random.Generator.multivariate_hypergeometric
~numpy.random.Generator.multivariate_normal
~numpy.random.Generator.negative_binomial
~numpy.random.Generator.noncentral_chisquare
Expand Down
249 changes: 248 additions & 1 deletion numpy/random/_generator.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ from numpy.core.multiarray import normalize_axis_index

from libc cimport string
from libc.stdint cimport (uint8_t, uint16_t, uint32_t, uint64_t,
int32_t, int64_t)
int32_t, int64_t, INT64_MAX, SIZE_MAX)
from ._bounded_integers cimport (_rand_bool, _rand_int32, _rand_int64,
_rand_int16, _rand_int8, _rand_uint64, _rand_uint32, _rand_uint16,
_rand_uint8, _gen_mask)
Expand Down Expand Up @@ -126,9 +126,38 @@ cdef extern from "include/distributions.h":
void random_multinomial(bitgen_t *bitgen_state, int64_t n, int64_t *mnix,
double *pix, np.npy_intp d, binomial_t *binomial) nogil

int random_mvhg_count(bitgen_t *bitgen_state,
int64_t total,
size_t num_colors, int64_t *colors,
int64_t nsample,
size_t num_variates, int64_t *variates) nogil
void random_mvhg_marginals(bitgen_t *bitgen_state,
int64_t total,
size_t num_colors, int64_t *colors,
int64_t nsample,
size_t num_variates, int64_t *variates) nogil

np.import_array()


cdef int64_t _safe_sum_nonneg_int64(size_t num_colors, int64_t *colors):
"""
Sum the values in the array `colors`.

Return -1 if an overflow occurs.
The values in *colors are assumed to be nonnegative.
"""
cdef size_t i
cdef int64_t sum

sum = 0
for i in range(num_colors):
if colors[i] > INT64_MAX - sum:
return -1
sum += colors[i]
return sum


cdef bint _check_bit_generator(object bitgen):
"""Check if an object satisfies the BitGenerator interface.
"""
Expand Down Expand Up @@ -3241,6 +3270,8 @@ cdef class Generator:

See Also
--------
multivariate_hypergeometric : Draw samples from the multivariate
hypergeometric distribution.
scipy.stats.hypergeom : probability density function, distribution or
cumulative density function, etc.

Expand Down Expand Up @@ -3739,6 +3770,222 @@ cdef class Generator:

return multin

def multivariate_hypergeometric(self, object colors, object nsample,
size=None, method='marginals'):
"""
multivariate_hypergeometric(colors, nsample, size=None,
method='marginals')

Generate variates from a multivariate hypergeometric distribution.

The multivariate hypergeometric distribution is a generalization
of the hypergeometric distribution.

Choose ``nsample`` items at random without replacement from a
collection with ``N`` distinct types. ``N`` is the length of
``colors``, and the values in ``colors`` are the number of occurrences
of that type in the collection. The total number of items in the
collection is ``sum(colors)``. Each random variate generated by this
function is a vector of length ``N`` holding the counts of the
different types that occurred in the ``nsample`` items.

The name ``colors`` comes from a common description of the
distribution: it is the probability distribution of the number of
marbles of each color selected without replacement from an urn
containing marbles of different colors; ``colors[i]`` is the number
of marbles in the urn with color ``i``.

Parameters
----------
colors : sequence of integers
The number of each type of item in the collection from which
a sample is drawn. The values in ``colors`` must be nonnegative.
To avoid loss of precision in the algorithm, ``sum(colors)``
must be less than ``10**9`` when `method` is "marginals".
nsample : int
The number of items selected. ``nsample`` must not be greater
than ``sum(colors)``.
size : int or tuple of ints, optional
The number of variates to generate, either an integer or a tuple
holding the shape of the array of variates. If the given size is,
e.g., ``(k, m)``, then ``k * m`` variates are drawn, where one
variate is a vector of length ``len(colors)``, and the return value
has shape ``(k, m, len(colors))``. If `size` is an integer, the
output has shape ``(size, len(colors))``. Default is None, in
which case a single variate is returned as an array with shape
``(len(colors),)``.
method : string, optional
Specify the algorithm that is used to generate the variates.
Must be 'count' or 'marginals' (the default). See the Notes
for a description of the methods.

Returns
-------
variates : ndarray
Array of variates drawn from the multivariate hypergeometric
distribution.

See Also
--------
hypergeometric : Draw samples from the (univariate) hypergeometric
distribution.

Notes
-----
The two methods do not return the same sequence of variates.

The "count" algorithm is roughly equivalent to the following numpy
code::

choices = np.repeat(np.arange(len(colors)), colors)
selection = np.random.choice(choices, nsample, replace=False)
variate = np.bincount(selection, minlength=len(colors))

The "count" algorithm uses a temporary array of integers with length
``sum(colors)``.

The "marginals" algorithm generates a variate by using repeated
calls to the univariate hypergeometric sampler. It is roughly
equivalent to::

variate = np.zeros(len(colors), dtype=np.int64)
# `remaining` is the cumulative sum of `colors` from the last
# element to the first; e.g. if `colors` is [3, 1, 5], then
# `remaining` is [9, 6, 5].
remaining = np.cumsum(colors[::-1])[::-1]
for i in range(len(colors)-1):
if nsample < 1:
break
variate[i] = hypergeometric(colors[i], remaining[i+1],
nsample)
nsample -= variate[i]
variate[-1] = nsample

The default method is "marginals". For some cases (e.g. when
`colors` contains relatively small integers), the "count" method
can be significantly faster than the "marginals" method. If
performance of the algorithm is important, test the two methods
with typical inputs to decide which works best.

.. versionadded:: 1.18.0

Examples
--------
>>> colors = [16, 8, 4]
>>> seed = 4861946401452
>>> gen = np.random.Generator(np.random.PCG64(seed))
>>> gen.multivariate_hypergeometric(colors, 6)
array([5, 0, 1])
>>> gen.multivariate_hypergeometric(colors, 6, size=3)
array([[5, 0, 1],
[2, 2, 2],
[3, 3, 0]])
>>> gen.multivariate_hypergeometric(colors, 6, size=(2, 2))
array([[[3, 2, 1],
[3, 2, 1]],
[[4, 1, 1],
[3, 2, 1]]])
"""
cdef int64_t nsamp
cdef size_t num_colors
cdef int64_t total
cdef int64_t *colors_ptr
cdef int64_t max_index
cdef size_t num_variates
cdef int64_t *variates_ptr
cdef int result

if method not in ['count', 'marginals']:
raise ValueError('method must be "count" or "marginals".')

try:
operator.index(nsample)
except TypeError:
raise ValueError('nsample must be an integer')

if nsample < 0:
raise ValueError("nsample must be nonnegative.")
if nsample > INT64_MAX:
raise ValueError("nsample must not exceed %d" % INT64_MAX)
nsamp = nsample

# Validation of colors, a 1-d sequence of nonnegative integers.
invalid_colors = False
try:
colors = np.asarray(colors)
if colors.ndim != 1:
invalid_colors = True
elif colors.size > 0 and not np.issubdtype(colors.dtype,
np.integer):
invalid_colors = True
elif np.any((colors < 0) | (colors > INT64_MAX)):
invalid_colors = True
except ValueError:
invalid_colors = True
if invalid_colors:
raise ValueError('colors must be a one-dimensional sequence '
'of nonnegative integers not exceeding %d.' %
INT64_MAX)

colors = np.ascontiguousarray(colors, dtype=np.int64)
num_colors = colors.size

colors_ptr = <int64_t *> np.PyArray_DATA(colors)

total = _safe_sum_nonneg_int64(num_colors, colors_ptr)
if total == -1:
raise ValueError("sum(colors) must not exceed the maximum value "
"of a 64 bit signed integer (%d)" % INT64_MAX)

if method == 'marginals' and total >= 1000000000:
raise ValueError('When method is "marginals", sum(colors) must '
'be less than 1000000000.')

# The C code that implements the 'count' method will malloc an
# array of size total*sizeof(size_t). Here we ensure that that
# product does not overflow.
if SIZE_MAX > <uint64_t>INT64_MAX:
max_index = INT64_MAX // sizeof(size_t)
else:
max_index = SIZE_MAX // sizeof(size_t)
if method == 'count' and total > max_index:
raise ValueError("When method is 'count', sum(colors) must not "
"exceed %d" % max_index)
if nsamp > total:
raise ValueError("nsample > sum(colors)")

# Figure out the shape of the return array.
if size is None:
shape = (num_colors,)
elif np.isscalar(size):
shape = (size, num_colors)
else:
shape = tuple(size) + (num_colors,)
variates = np.zeros(shape, dtype=np.int64)

if num_colors == 0:
return variates

# One variate is a vector of length num_colors.
num_variates = variates.size // num_colors
variates_ptr = <int64_t *> np.PyArray_DATA(variates)

if method == 'count':
with self.lock, nogil:
result = random_mvhg_count(&self._bitgen, total,
num_colors, colors_ptr, nsamp,
num_variates, variates_ptr)
if result == -1:
raise MemoryError("Insufficent memory for multivariate_"
"hypergeometric with method='count' and "
"sum(colors)=%d" % total)
else:
with self.lock, nogil:
random_mvhg_marginals(&self._bitgen, total,
num_colors, colors_ptr, nsamp,
num_variates, variates_ptr)
return variates

def dirichlet(self, object alpha, size=None):
"""
dirichlet(alpha, size=None)
Expand Down
14 changes: 14 additions & 0 deletions numpy/random/include/distributions.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,20 @@ DECLDIR void random_bounded_bool_fill(bitgen_t *bitgen_state, npy_bool off,
DECLDIR void random_multinomial(bitgen_t *bitgen_state, RAND_INT_TYPE n, RAND_INT_TYPE *mnix,
double *pix, npy_intp d, binomial_t *binomial);

/* multivariate hypergeometric, "count" method */
DECLDIR int random_mvhg_count(bitgen_t *bitgen_state,
int64_t total,
size_t num_colors, int64_t *colors,
int64_t nsample,
size_t num_variates, int64_t *variates);

/* multivariate hypergeometric, "marginals" method */
DECLDIR void random_mvhg_marginals(bitgen_t *bitgen_state,
int64_t total,
size_t num_colors, int64_t *colors,
int64_t nsample,
size_t num_variates, int64_t *variates);

/* Common to legacy-distributions.c and distributions.c but not exported */

RAND_INT_TYPE random_binomial_btpe(bitgen_t *bitgen_state,
Expand Down
3 changes: 2 additions & 1 deletion numpy/random/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ def generate_libraries(ext, build_dir):
other_srcs = [
'src/distributions/logfactorial.c',
'src/distributions/distributions.c',
'src/distributions/random_mvhg_count.c',
'src/distributions/random_mvhg_marginals.c',
'src/distributions/random_hypergeometric.c',
]
for gen in ['_generator', '_bounded_integers']:
Expand All @@ -114,7 +116,6 @@ def generate_libraries(ext, build_dir):
define_macros=defs,
)
config.add_extension('mtrand',
# mtrand does not depend on random_hypergeometric.c.
sources=['mtrand.c',
'src/legacy/legacy-distributions.c',
'src/distributions/logfactorial.c',
Expand Down
Loading