From 2a7a5612cc8d1330396840c3ae2ec395eadae665 Mon Sep 17 00:00:00 2001
From: Warren Weckesser <warren.weckesser@gmail.com>
Date: Sat, 17 Sep 2016 10:13:22 -0400
Subject: [PATCH] ENH: random: Add multivariate_hypergeometric function.

---
 doc/source/reference/routines.random.rst |   1 +
 numpy/random/mtrand/logfactorial.c       | 154 ++++++++++
 numpy/random/mtrand/logfactorial.h       |   7 +
 numpy/random/mtrand/mtrand.pyx           | 220 +++++++++++++-
 numpy/random/mtrand/mvhg_count.c         | 121 ++++++++
 numpy/random/mtrand/mvhg_count.h         |  13 +
 numpy/random/mtrand/mvhg_marginals.c     | 364 +++++++++++++++++++++++
 numpy/random/mtrand/mvhg_marginals.h     |  13 +
 numpy/random/mtrand/util.c               |  25 ++
 numpy/random/mtrand/util.h               |   6 +
 numpy/random/setup.py                    |   9 +-
 numpy/random/tests/test_random.py        | 131 +++++++-
 12 files changed, 1059 insertions(+), 5 deletions(-)
 create mode 100644 numpy/random/mtrand/logfactorial.c
 create mode 100644 numpy/random/mtrand/logfactorial.h
 create mode 100644 numpy/random/mtrand/mvhg_count.c
 create mode 100644 numpy/random/mtrand/mvhg_count.h
 create mode 100644 numpy/random/mtrand/mvhg_marginals.c
 create mode 100644 numpy/random/mtrand/mvhg_marginals.h
 create mode 100644 numpy/random/mtrand/util.c
 create mode 100644 numpy/random/mtrand/util.h

diff --git a/doc/source/reference/routines.random.rst b/doc/source/reference/routines.random.rst
index c8b097d7d14c..dfe9ce7b23f2 100644
--- a/doc/source/reference/routines.random.rst
+++ b/doc/source/reference/routines.random.rst
@@ -50,6 +50,7 @@ Distributions
    logseries
    multinomial
    multivariate_normal
+   multivariate_hypergeometric
    negative_binomial
    noncentral_chisquare
    noncentral_f
diff --git a/numpy/random/mtrand/logfactorial.c b/numpy/random/mtrand/logfactorial.c
new file mode 100644
index 000000000000..9217e199ddeb
--- /dev/null
+++ b/numpy/random/mtrand/logfactorial.c
@@ -0,0 +1,154 @@
+
+#include <math.h>
+
+/*
+ *  logfact[k] holds log(k!) for k = 0, 1, 2, ..., 125.
+ */
+
+static const double logfact[] = {
+    0,
+    0,
+    0.69314718055994529,
+    1.791759469228055,
+    3.1780538303479458,
+    4.7874917427820458,
+    6.5792512120101012,
+    8.5251613610654147,
+    10.604602902745251,
+    12.801827480081469,
+    15.104412573075516,
+    17.502307845873887,
+    19.987214495661885,
+    22.552163853123425,
+    25.19122118273868,
+    27.89927138384089,
+    30.671860106080672,
+    33.505073450136891,
+    36.395445208033053,
+    39.339884187199495,
+    42.335616460753485,
+    45.380138898476908,
+    48.471181351835227,
+    51.606675567764377,
+    54.784729398112319,
+    58.003605222980518,
+    61.261701761002001,
+    64.557538627006338,
+    67.88974313718154,
+    71.257038967168015,
+    74.658236348830158,
+    78.092223553315307,
+    81.557959456115043,
+    85.054467017581516,
+    88.580827542197682,
+    92.136175603687093,
+    95.719694542143202,
+    99.330612454787428,
+    102.96819861451381,
+    106.63176026064346,
+    110.32063971475739,
+    114.03421178146171,
+    117.77188139974507,
+    121.53308151543864,
+    125.3172711493569,
+    129.12393363912722,
+    132.95257503561632,
+    136.80272263732635,
+    140.67392364823425,
+    144.5657439463449,
+    148.47776695177302,
+    152.40959258449735,
+    156.3608363030788,
+    160.3311282166309,
+    164.32011226319517,
+    168.32744544842765,
+    172.35279713916279,
+    176.39584840699735,
+    180.45629141754378,
+    184.53382886144948,
+    188.6281734236716,
+    192.7390472878449,
+    196.86618167289001,
+    201.00931639928152,
+    205.1681994826412,
+    209.34258675253685,
+    213.53224149456327,
+    217.73693411395422,
+    221.95644181913033,
+    226.1905483237276,
+    230.43904356577696,
+    234.70172344281826,
+    238.97838956183432,
+    243.26884900298271,
+    247.57291409618688,
+    251.89040220972319,
+    256.22113555000954,
+    260.56494097186322,
+    264.92164979855278,
+    269.29109765101981,
+    273.67312428569369,
+    278.06757344036612,
+    282.4742926876304,
+    286.89313329542699,
+    291.32395009427029,
+    295.76660135076065,
+    300.22094864701415,
+    304.68685676566872,
+    309.1641935801469,
+    313.65282994987905,
+    318.1526396202093,
+    322.66349912672615,
+    327.1852877037752,
+    331.71788719692847,
+    336.26118197919845,
+    340.81505887079902,
+    345.37940706226686,
+    349.95411804077025,
+    354.53908551944079,
+    359.1342053695754,
+    363.73937555556347,
+    368.35449607240474,
+    372.97946888568902,
+    377.61419787391867,
+    382.25858877306001,
+    386.91254912321756,
+    391.57598821732961,
+    396.24881705179155,
+    400.93094827891576,
+    405.6222961611449,
+    410.32277652693733,
+    415.03230672824964,
+    419.75080559954472,
+    424.47819341825709,
+    429.21439186665157,
+    433.95932399501481,
+    438.71291418612117,
+    443.47508812091894,
+    448.24577274538461,
+    453.02489623849613,
+    457.81238798127816,
+    462.60817852687489,
+    467.4121995716082,
+    472.22438392698058,
+    477.04466549258564,
+    481.87297922988796
+};
+
+/*
+ *  Compute log(k!)
+ */
+
+double logfactorial(unsigned long k)
+{
+    const double halfln2pi = 0.9189385332046728;
+
+    if (k < sizeof(logfact)/sizeof(logfact[0])) {
+        /* Use the lookup table. */
+        return logfact[k];
+    }
+
+    /*
+     *  Use the Stirling series, truncated at the 1/k**3 term.
+     */
+    return (k + 0.5)*log(k) - k + halfln2pi + (1.0/k)*(1/12.0 - 1/(360.0*k*k));
+}
diff --git a/numpy/random/mtrand/logfactorial.h b/numpy/random/mtrand/logfactorial.h
new file mode 100644
index 000000000000..25bd8a734b8e
--- /dev/null
+++ b/numpy/random/mtrand/logfactorial.h
@@ -0,0 +1,7 @@
+
+#ifndef LOGFACTORIAL_H
+#define LOGFACTORIAL_H
+
+double logfactorial(unsigned long k);
+
+#endif
diff --git a/numpy/random/mtrand/mtrand.pyx b/numpy/random/mtrand/mtrand.pyx
index 21bc73e544be..077eef642064 100644
--- a/numpy/random/mtrand/mtrand.pyx
+++ b/numpy/random/mtrand/mtrand.pyx
@@ -37,6 +37,7 @@ cdef extern from "math.h":
 
 cdef extern from "numpy/npy_math.h":
     int npy_isfinite(double x)
+    long NPY_MAX_LONG
 
 cdef extern from "mtrand_py_helper.h":
     object empty_py_bytes(npy_intp length, void **bytes)
@@ -84,7 +85,7 @@ cdef extern from "randomkit.h":
 
 
 cdef extern from "distributions.h":
-    # do not need the GIL, but they do need a lock on the state !! */
+    # do not need the GIL, but they do need a lock on the state !!
 
     double rk_normal(rk_state *state, double loc, double scale) nogil
     double rk_standard_exponential(rk_state *state) nogil
@@ -123,6 +124,32 @@ cdef extern from "distributions.h":
     long rk_hypergeometric(rk_state *state, long good, long bad, long sample) nogil
     long rk_logseries(rk_state *state, double p) nogil
 
+
+cdef extern from "util.h":
+
+    long safe_sum_nonneg_long(npy_intp num_colors, long *colors) nogil
+
+
+cdef extern from "mvhg_count.h":
+
+    # This functions need a lock on the state:
+    int mvhg_count(rk_state *state,
+                   long total,
+                   npy_intp num_colors, long *colors,
+                   long nsample,
+                   npy_intp num_sample, long *sample) nogil
+
+
+cdef extern from "mvhg_marginals.h":
+
+    # This functions need a lock on the state:
+    int mvhg_marginals(rk_state *state,
+                       long total,
+                       npy_intp num_colors, long *colors,
+                       long nsample,
+                       npy_intp num_sample, long *sample) nogil
+
+
 ctypedef double (* rk_cont0)(rk_state *state) nogil
 ctypedef double (* rk_cont1)(rk_state *state, double a) nogil
 ctypedef double (* rk_cont2)(rk_state *state, double a, double b) nogil
@@ -4193,6 +4220,8 @@ cdef class RandomState:
         --------
         scipy.stats.hypergeom : probability density function, distribution or
             cumulative density function, etc.
+        multivariate_hypergeometric : Draw samples from the multivariate
+            hypergeometric distribution
 
         Notes
         -----
@@ -4651,6 +4680,194 @@ cdef class RandomState:
 
         return multin
 
+    def multivariate_hypergeometric(self, object colors, npy_intp nsample,
+                                    size=None, method='marginals'):
+        """
+        multivariate_hypergeometric(colors, nsample, size=None,
+                                    method='marginals')
+
+        Draw samples from a multivariate hypergeometric distribution.
+
+        The multivariate hypergeometric distribution is a generalization
+        of the hypergeometric distribution.
+
+        Choose ``nsample`` items at random without replacement from a
+        collection with ``k`` distinct types.  ``k`` is the length of
+        ``colors``, and the values in ``colors`` are the number of
+        occurrences of that type in the collection.
+
+        The name ``colors`` comes from a common description of the
+        distribution: it is the probability distribution of the number of
+        marbles of each color selected without replacement from an urn
+        containing marbles of different colors; ``colors[i]`` is the number
+        of marbles in the urn with color ``i``.
+
+        Parameters
+        ----------
+        colors : sequence of integers
+            The number of each type of item in the collection from which
+            a sample is drawn.  The values in ``colors`` must be nonnegative.
+            To avoid loss of precision in the algorithm, ``sum(colors)``
+            must be less than 1000000000 when `method` is "marginals".
+        nsample : int
+            The number of items selected.  ``nsample`` must not be greater
+            than ``sum(colors)``.
+        size : int or tuple of ints, optional
+            Output shape.  If the given shape is, e.g., ``(m, n, k)``,
+            then ``m * n * k`` samples are drawn, and the return value has
+            shape ``(m, n, k, len(colors))``.  If `size` is an integer,
+            the output has shape ``(size, len(colors))``.  Default is None,
+            in which case a single sample is returned as an array with shape
+            ``(len(colors),)``.
+        method : string, optional
+            Specify the algorithm that is used to generate the sample.
+            Must be 'count' or 'marginals' (the default).  See the Notes
+            for a description of the methods.
+
+        Returns
+        -------
+        sample : ndarray
+            Sample drawn from the multivariate hypergeometric distribution.
+
+        See Also
+        --------
+        hypergeometric : Draw samples from the (univariate) hypergeometric
+            distribution.
+
+        Notes
+        -----
+        The two methods do not return the same sequence of samples.
+
+        The "count" algorithm is roughly equivalent to the following numpy
+        code::
+
+            choices = np.repeat(np.arange(len(colors)), colors)
+            selection = np.random.choice(choices, nsample, replace=False)
+            sample = np.bincount(selection, minlength=len(colors))
+
+        The "count" algorithm uses a temporary array of integers with length
+        ``sum(colors)``.
+
+        The "marginals" algorithm generates a single sample by using repeated
+        calls to the univariate hypergeometric sampler.  It is roughly
+        equivalent to::
+
+            sample = np.zeros(len(colors), dtype=np.int)
+            # `remaining` is the cumulative sum of `colors` from the last
+            # element to the first; e.g. if `colors` is [3, 1, 5], then
+            # `remaining` is [9, 6, 5].
+            remaining = np.cumsum(colors[::-1])[::-1]
+            for i in range(len(colors)-1):
+                if nsample < 1:
+                    break
+                sample[i] = np.random.hypergeometric(colors[i], remaining[i+1],
+                                                     nsample)
+                nsample -= sample[i]
+            sample[-1] = nsample
+
+        The default method is "marginals".  For some cases (e.g. when
+        `colors` contains relatively small integers), the "count" method
+        can be significantly faster than the "marginals" method.  If
+        performance of the algorithm is important, test the two methods
+        with typical inputs to decide which works best.
+
+        .. versionadded:: 1.16.0
+
+        Examples
+        --------
+        >>> colors = [16, 8, 4]
+        >>> np.random.seed(123)
+        >>> np.random.multivariate_hypergeometric(colors, 6)
+        array([4, 1, 1])
+        >>> np.random.multivariate_hypergeometric(colors, 6, size=3)
+        array([[3, 1, 2],
+               [3, 2, 1],
+               [2, 3, 1]])
+        >>> np.random.multivariate_hypergeometric(colors, 6, size=(2, 2))
+        array([[[4, 1, 1],
+                [4, 2, 0]],
+               [[3, 1, 2],
+                [4, 2, 0]]])
+
+        """
+        cdef npy_intp num_colors, sample_size
+        cdef long total
+        cdef long *colors_ptr
+        cdef int result
+
+        if method not in ['count', 'marginals']:
+            raise ValueError('method must be "count" or "marginals".')
+
+        if nsample < 0:
+            raise ValueError("nsample must be nonnegative.")
+
+        # Validation of colors, a 1-d sequence of nonnegative long integers.
+        invalid_colors = False
+        try:
+            colors = np.asarray(colors)
+            if colors.ndim != 1:
+                invalid_colors = True
+            elif colors.size > 0 and not np.issubdtype(colors.dtype,
+                                                       np.integer):
+                invalid_colors = True
+            elif np.any((colors < 0) | (colors > NPY_MAX_LONG)):
+                invalid_colors = True
+        except ValueError:
+            invalid_colors = True
+        if invalid_colors:
+            raise ValueError('colors must be a one-dimensional sequence '
+                             'of nonnegative integers less than %d.' %
+                             NPY_MAX_LONG)
+
+        colors = np.ascontiguousarray(colors, dtype=np.int_)
+        num_colors = colors.size
+
+        colors_ptr = <long*> PyArray_DATA(colors)
+
+        total = safe_sum_nonneg_long(num_colors, colors_ptr)
+        if total == -1:
+            raise ValueError("sum(colors) must not exceed the maximum value "
+                             "of a long integer (%d)" % NPY_MAX_LONG)
+
+        if method == 'marginals' and total >= 1000000000:
+            raise ValueError('When method is "marginals", sum(colors) must '
+                             'be less than 1000000000.')
+
+        # The C code that implements the 'count' method will malloc an
+        # array of size total*sizeof(npy_intp). Here we ensure that that
+        # product does not overflow.
+        max_long_index = NPY_MAX_LONG / sizeof(npy_intp)
+        if method == 'count' and total > max_long_index:
+            raise ValueError("When method is 'count', sum(colors) must not "
+                             "exceed %d" % max_long_index)
+        if nsample > total:
+            raise ValueError("nsample > sum(colors)")
+
+        shape = _shape_from_size(size, num_colors)
+        if num_colors == 0:
+            return np.empty(shape, dtype=np.int_)
+
+        sample = np.zeros(shape, dtype=np.int_)
+        sample_ptr = <long*> PyArray_DATA(sample)
+        sample_size = sample.size
+
+        if method == 'count':
+            with self.lock, nogil:
+                result = mvhg_count(self.internal_state, total,
+                                    num_colors, colors_ptr, nsample,
+                                    sample_size, sample_ptr)
+            if result == -1:
+                raise MemoryError("Insufficent memory for multivariate_"
+                                  "hypergeometric with method='count' and "
+                                  "sum(colors)=%d" % total)
+        else:
+            with self.lock, nogil:
+                result = mvhg_marginals(self.internal_state, total,
+                                        num_colors, colors_ptr, nsample,
+                                        sample_size, sample_ptr)
+        return sample
+
+
     def dirichlet(self, object alpha, size=None):
         """
         dirichlet(alpha, size=None)
@@ -4965,6 +5182,7 @@ logseries = _rand.logseries
 
 multivariate_normal = _rand.multivariate_normal
 multinomial = _rand.multinomial
+multivariate_hypergeometric = _rand.multivariate_hypergeometric
 dirichlet = _rand.dirichlet
 
 shuffle = _rand.shuffle
diff --git a/numpy/random/mtrand/mvhg_count.c b/numpy/random/mtrand/mvhg_count.c
new file mode 100644
index 000000000000..08e6b8d228e0
--- /dev/null
+++ b/numpy/random/mtrand/mvhg_count.c
@@ -0,0 +1,121 @@
+#include "randomkit.h"
+#include <stdlib.h>
+
+
+/*
+ *  mvhg_count
+ *
+ *  Draw variates from the multivariate hypergeometric distribution--
+ *  the "count" algorithm.
+ *
+ *  Parameters
+ *  ----------
+ *  rk_state *state
+ *      Pointer to a randomkit `rk_state` instance.
+ *  long total
+ *      The sum of the values in the array `colors`.  (This is redundant
+ *      information, but we know the caller has already computed it, so
+ *      we might as well use it.)
+ *  size_t num_colors
+ *      The length of the `colors` array.
+ *  long *colors
+ *      The array of colors (i.e. the number of each type in the collection
+ *      from which the random variate is drawn).
+ *  long nsample
+ *      The number of objects drawn without replacement for each variate.
+ *      `nsample` must not exceed sum(colors).  This condition is not checked;
+ *      it is assumed that the caller has already validated the value.
+ *  size_t num_sample
+ *      The number of random variates to generate.  (This is not the size of
+ *      the array `sample`, because each variate requires `num_colors`
+ *      values.)
+ *  long *sample
+ *      The array that will hold the result.  The length of this array must
+ *      be the product of `num_colors` and `num_sample`.
+ *      The array is not initialized; it is expected that array has been
+ *      initialized with zeros when the function is called.
+ *
+ *  Notes
+ *  -----
+ *  The "count" algorithm for drawing one variate is roughly equivalent to the
+ *  following numpy code:
+ *
+ *      choices = np.repeat(np.arange(len(colors)), colors)
+ *      selection = np.random.choice(choices, nsample, replace=False)
+ *      sample = np.bincount(selection, minlength=len(colors))
+ *
+ *  This function uses a temporary array with length sum(colors).
+ *
+ *  On input, it is assumed that:
+ *    * num_colors > 0
+ *    * colors[i] >= 0
+ *    * nsample <= sum(colors)
+ *
+ */
+
+int mvhg_count(rk_state *state,
+               long total,
+               npy_intp num_colors, long *colors,
+               long nsample,
+               npy_intp num_sample, long *sample)
+{
+    npy_intp i, j, k;
+    npy_intp *choices;
+    int more_than_half;
+
+    choices = malloc(total * (sizeof *choices));
+    if (choices == NULL) {
+        return -1;
+    }
+
+    /*
+     *  If colors contains, for example, [3 2 5], then choices
+     *  will contain [0 0 0 1 1 2 2 2 2 2].
+     */
+    k = 0;
+    for (i = 0; i < num_colors; ++i) {
+        for (j = 0; j < colors[i]; ++j) {
+            choices[k] = i;
+            ++k;
+        }
+    }
+
+    more_than_half = nsample > (total / 2);
+    if (more_than_half) {
+        nsample = total - nsample;
+    }
+
+    for (i = 0; i < num_sample; i += num_colors) {
+        /*
+         *  Fisher-Yates shuffle, but only loop through the first
+         *  `nsample` entries of `choices`.  After the loop,
+         *  choices[:nsample] contains a random sample from the
+         *  the full array.
+         */
+        for (j = 0; j < nsample; ++j) {
+            npy_intp tmp;
+            k = j + rk_interval(total - j - 1, state);
+            tmp = choices[k];
+            choices[k] = choices[j];
+            choices[j] = tmp;
+        }
+        /*
+         *  Count the number of occurrences of each value in choices[:nsample].
+         *  The result, stored in sample[i:i+num_colors], is the sample from
+         *  the multivariate hypergeometric distribution.
+         */
+        for (j = 0; j < nsample; ++j) {
+            sample[i + choices[j]] += 1;
+        }
+
+        if (more_than_half) {
+            for (j = 0; j < num_colors; ++j) {
+                sample[i + j] = colors[j] - sample[i + j];
+            }
+        }
+    }
+
+    free(choices);
+
+    return 0;
+}
diff --git a/numpy/random/mtrand/mvhg_count.h b/numpy/random/mtrand/mvhg_count.h
new file mode 100644
index 000000000000..f9ef5e2dc001
--- /dev/null
+++ b/numpy/random/mtrand/mvhg_count.h
@@ -0,0 +1,13 @@
+
+#ifndef MVHG_COUNT_H
+#define MVHG_COUNT_H
+
+#include "randomkit.h"
+
+int mvhg_count(rk_state *state,
+               long total,
+               npy_intp num_colors, long *colors,
+               long nsample,
+               npy_intp num_sample, long *sample);
+
+#endif
diff --git a/numpy/random/mtrand/mvhg_marginals.c b/numpy/random/mtrand/mvhg_marginals.c
new file mode 100644
index 000000000000..950160506b34
--- /dev/null
+++ b/numpy/random/mtrand/mvhg_marginals.c
@@ -0,0 +1,364 @@
+
+#include "randomkit.h"
+#include "logfactorial.h"
+#include <math.h>
+
+#ifndef min
+#define min(x,y) ((x<y)?x:y)
+#define max(x,y) ((x>y)?x:y)
+#endif
+
+
+/*
+ *  This is an alternative to rk_hypergeometric_hyp(), implemented
+ *  without using floating point.
+ *
+ *  It is assumed that when this function is called:
+ *    * good, bad and sample are nonnegative;
+ *    * the sum good+bad will not result in overflow; 
+ *    * sample <= good+bad.
+ */
+
+static long hypergeometric_sample(rk_state *state,
+                                  long good, long bad, long sample)
+{
+    long remaining_total, remaining_good, result, computed_sample;
+    long total = good + bad;
+
+    if (sample > total/2) {
+        computed_sample = total - sample;
+    }
+    else {
+        computed_sample = sample;
+    }
+
+    remaining_total = total;
+    remaining_good = good;
+
+    while ((computed_sample > 0) && (remaining_good > 0) &&
+           (remaining_total > remaining_good)) {
+        /*
+         *  rk_interval(max, state) returns an integer in [0, max] *inclusive*,
+         *  so we decrement remaining_total before passing it to rk_interval().
+         */
+        --remaining_total;
+        if ((long) rk_interval(remaining_total, state) < remaining_good) {
+            /* Selected a "good" one, so decrement remaining_good. */
+            --remaining_good;
+        }
+        --computed_sample;
+    }
+
+    if (remaining_total == remaining_good) {
+        /* Only "good" choices are left. */
+        remaining_good -= computed_sample;
+    }
+
+    if (sample > total/2) {
+        result = remaining_good;
+    }
+    else {
+        result = good - remaining_good;
+    }
+
+    return result;
+}
+
+
+/* D1 = 2*sqrt(2/e)      */
+/* D2 = 3 - 2*sqrt(3/e)  */
+#define D1 1.7155277699214135
+#define D2 0.8989161620588988
+
+/*
+ *  Generate variates from the hypergeometric distribution
+ *  using the ratio-of-uniforms method.
+ *
+ *  This is an alternative to rk_hypergeometric_hrua.
+ *  It fixes a mistake in that code, and uses logfactorial()
+ *  instead of loggam().
+ *
+ *  Variables have been renamed to match the original source.
+ *  In the code, the variable names a, b, c, g, h, m, p, q, K, T,
+ *  U and X match the names used in "Algorithm HRUA" beginning on
+ *  page 82 of Stadlober's 1989 thesis.
+ *
+ *  It is assumed that when this function is called:
+ *    * good, bad and sample are nonnegative;
+ *    * the sum good+bad will not result in overflow; 
+ *    * sample <= good+bad.
+ *
+ * References:
+ * -  Ernst Stadlober's thesis "Sampling from Poisson, Binomial and
+ *    Hypergeometric Distributions: Ratio of Uniforms as a Simple and
+ *    Fast Alternative" (1989)
+ * -  Ernst Stadlober, "The ratio of uniforms approach for generating
+ *    discrete random variates", Journal of Computational and Applied
+ *    Mathematics, 31, pp. 181-189 (1990).
+ */
+
+static long hypergeometric_hrua(rk_state *state,
+                                long good, long bad, long sample)
+{
+    long mingoodbad, maxgoodbad, popsize;
+    long computed_sample;
+    double p, q;
+    double mu, var;
+    double a, c, b, h, g;
+    long m, K;
+
+    popsize = good + bad;
+    computed_sample = min(sample, popsize - sample);
+    mingoodbad = min(good, bad);
+    maxgoodbad = max(good, bad);
+
+    /*
+     *  Variables that do not match Stadlober (1989)
+     *    Here               Stadlober
+     *    ----------------   ---------
+     *    mingoodbad            M
+     *    popsize               N
+     *    computed_sample       n
+     */
+
+    p = ((double) mingoodbad) / popsize;
+    q = ((double) maxgoodbad) / popsize;
+
+    /* mu is the mean of the distribution. */
+    mu = computed_sample * p;
+
+    a = mu + 0.5;
+
+    /* var is the variance of the distribution. */
+    var = ((double)(popsize - computed_sample) *
+           computed_sample * p * q / (popsize - 1));
+
+    c = sqrt(var + 0.5);
+
+    /*
+     *  h is 2*s_hat (See Stadlober's theses (1989), Eq. (5.17); or
+     *  Stadlober (1990), Eq. 8).  s_hat is the scale of the "table mountain"
+     *  function that dominates the scaled hypergeometric PMF ("scaled" means
+     *  normalized to have a maximum value of 1).
+     */
+    h = D1*c + D2;
+
+    m = (long)floor((double)(computed_sample + 1) * (mingoodbad + 1) /
+                            (popsize + 2));
+
+    g = (logfactorial(m) +
+         logfactorial(mingoodbad - m) +
+         logfactorial(computed_sample - m) +
+         logfactorial(maxgoodbad - computed_sample + m));
+
+    /*
+     *  b is the upper bound for random samples:
+     *  ... min(computed_sample, mingoodbad) + 1 is the length of the support.
+     *  ... floor(a + 16*c) is 16 standard deviations beyond the mean.
+     *
+     *  The idea behind the second upper bound is that values that far out in
+     *  the tail have negligible probabilities.
+     *
+     *  There is a comment in a previous version of this algorithm that says
+     *      "16 for 16-decimal-digit precision in D1 and D2",
+     *  but there is no documented justification for this value.  A lower value
+     *  might work just as well, but I've kept the value 16 here.
+     */
+    b = min(min(computed_sample, mingoodbad) + 1.0, floor(a + 16*c));
+
+    while (1) {
+        double U, V, X, T;
+        double gp;
+        U = rk_double(state);
+        V = rk_double(state);  /*  "U star" in Stadlober (1989) */
+        X = a + h*(V - 0.5) / U;
+
+        /* fast rejection: */
+        if ((X < 0.0) || (X >= b)) {
+            continue;
+        }
+
+        K = (long)floor(X);
+
+        gp = (logfactorial(K) +
+              logfactorial(mingoodbad - K) +
+              logfactorial(computed_sample - K) +
+              logfactorial(maxgoodbad - computed_sample + K));
+
+        T = g - gp;
+
+        /* fast acceptance: */
+        if ((U*(4.0 - U) - 3.0) <= T) {
+            break;
+        }
+
+        /* fast rejection: */
+        if (U*(U - T) >= 1) {
+            continue;
+        }
+
+        if (2.0*log(U) <= T) {
+            /* acceptance */
+            break;  
+        }
+    }
+
+    if (good > bad) {
+        K = computed_sample - K;
+    }
+
+    if (computed_sample < sample) {
+        K = good - K;
+    }
+
+    return K;
+}
+
+
+/*
+ *  Draw a sample from the hypergeometric distribution.
+ *
+ *  It is assumed that when this function is called:
+ *    * good, bad and sample are nonnegative;
+ *    * the sum good+bad will not result in overflow; 
+ *    * sample <= good+bad.
+ */
+
+static long hypergeometric(rk_state *state,
+                           long good, long bad, long sample)
+{
+    long r;
+
+    if ((sample >= 10) && (sample <= good + bad - 10)) {
+        /* This will use the ratio-of-uniforms method. */
+        r = hypergeometric_hrua(state, good, bad, sample);
+    }
+    else {
+        /* The simpler implementation is faster for small samples. */
+        r = hypergeometric_sample(state, good, bad, sample);
+    }
+    return r;
+}
+
+
+/*
+ *  mvhg_marginals
+ *
+ *  Draw samples from the multivariate hypergeometric distribution--
+ *  the "marginals" algorithm.
+ *
+ *  This version generates the sample by iteratively calling
+ *  hypergeometric().
+ *
+ *  Parameters
+ *  ----------
+ *  rk_state *state
+ *      Pointer to a randomkit `rk_state` instance.
+ *  long total
+ *      The sum of the values in the array `colors`.  (This is redundant
+ *      information, but we know the caller has already computed it, so
+ *      we might as well use it.)
+ *  size_t num_colors
+ *      The length of the `colors` array.
+ *  long *colors
+ *      The array of colors (i.e. the number of each type in the collection
+ *      from which the random variate is drawn).
+ *  long nsample
+ *      The number of objects drawn without replacement for each variate.
+ *      `nsample` must not exceed sum(colors).  This condition is not checked;
+ *      it is assumed that the caller has already validated the value.
+ *  size_t num_sample
+ *      The number of random variates to generate.  (This is not the size of
+ *      the array `sample`, because each variate requires `num_colors`
+ *      values.)
+ *  long *sample
+ *      The array that will hold the result.  The length of this array must
+ *      be the product of `num_colors` and `num_sample`.
+ *      The array is not initialized; it is expected that array has been
+ *      initialized with zeros when the function is called.
+ *
+ *  Notes
+ *  -----
+ *  Here's an example that demonstrates the idea of this algorithm.
+ *
+ *  Suppose the urn contains red, green, blue and yellow marbles.
+ *  Let nred be the number of red marbles, and define the quantities for
+ *  the other colors similarly.  The total number of marbles is
+ *
+ *      total = nred + ngreen + nblue + nyellow.
+ *
+ *  To generate a sample using rk_hypergeometric:
+ *
+ *     red_sample = rk_hypergeometric(ngood=nred, nbad=total - nred,
+ *                                    nsample=nsample)
+ *
+ *  This gives us the number of red marbles in the sample.  The number of
+ *  marbles in the sample that are *not* red is nsample - red_sample.
+ *  To figure out the distribution of those marbles, we again use
+ *  rk_hypergeometric:
+ *
+ *      green_sample = rk_hypergeometric(ngood=ngreen,
+ *                                       nbad=total - nred - ngreen,
+ *                                       nsample=nsample - red_sample)
+ *
+ *  Similarly,
+ *
+ *      blue_sample = rk_hypergeometric(
+ *                        ngood=nblue,
+ *                        nbad=total - nred - ngreen - nblue,
+ *                        nsample=nsample - red_sample - green_sample)
+ *
+ *  Finally,
+ *
+ *      yellow_sample = total - (red_sample + green_sample + blue_sample).
+ *
+ *  The above sequence of steps is implemented as a loop for an arbitrary
+ *  number of colors in the innermost loop in the code below.  `remaining`
+ *  is the value passed to `nbad`; it is `total - colors[0]` in the first
+ *  call to rk_hypergeometric(), and then decreases by `colors[j]` in
+ *  each iteration.  `num_to_sample` is the `nsample` argument.  It
+ *  starts at this function's `nsample` input, and is decreased by the
+ *  result of the call to rk_hypergeometric() in each iteration.
+ */
+
+int mvhg_marginals(rk_state *state,
+                   long total,
+                   npy_intp num_colors, long *colors,
+                   long nsample,
+                   npy_intp num_sample, long *sample)
+{
+    npy_intp i;
+    int more_than_half;
+
+    more_than_half = nsample > (total / 2);
+    if (more_than_half) {
+        nsample = total - nsample;
+    }
+
+    for (i = 0; i < num_sample; i += num_colors) {
+        npy_intp num_to_sample, remaining, j;
+        num_to_sample = nsample;
+        remaining = total;
+        for (j = 0; j < num_colors - 1; ++j) {
+            long r;
+            if (num_to_sample <= 0) {
+                break;
+            }
+            remaining -= colors[j];
+            r = hypergeometric(state, colors[j], remaining, num_to_sample);
+            sample[i + j] = r;
+            num_to_sample -= r;
+        }
+
+        if (j == num_colors - 1) {
+            sample[i + j] = num_to_sample;
+        }
+
+        if (more_than_half) {
+            for (j = 0; j < num_colors; ++j) {
+                sample[i + j] = colors[j] - sample[i + j];
+            }
+        }
+    }
+    return 0;
+}
diff --git a/numpy/random/mtrand/mvhg_marginals.h b/numpy/random/mtrand/mvhg_marginals.h
new file mode 100644
index 000000000000..37d3796bcfb7
--- /dev/null
+++ b/numpy/random/mtrand/mvhg_marginals.h
@@ -0,0 +1,13 @@
+
+#ifndef MVHG_MARGINALS_H
+#define MVHG_MARGINALS_H
+
+#include "randomkit.h"
+
+int mvhg_marginals(rk_state *state,
+                   long total,
+                   npy_intp num_colors, long *colors,
+                   long nsample,
+                   npy_intp num_sample, long *sample);
+
+#endif
diff --git a/numpy/random/mtrand/util.c b/numpy/random/mtrand/util.c
new file mode 100644
index 000000000000..5ffaff57c0f1
--- /dev/null
+++ b/numpy/random/mtrand/util.c
@@ -0,0 +1,25 @@
+
+#include <numpy/npy_common.h>
+
+
+/*
+ *  Sum the values in the array `colors`.
+ *  Return -1 if an overflow occurs.
+ *
+ *  The values in *colors are assumed to be nonnegative.
+ */
+
+long safe_sum_nonneg_long(npy_intp num_colors, long *colors)
+{
+    npy_intp i;
+    long sum;
+
+    sum = 0;
+    for (i = 0; i < num_colors; ++i) {
+        if (colors[i] > NPY_MAX_LONG - sum) {
+            return -1;
+        }
+        sum += colors[i];
+    }
+    return sum;
+}
diff --git a/numpy/random/mtrand/util.h b/numpy/random/mtrand/util.h
new file mode 100644
index 000000000000..d7b53fbb8073
--- /dev/null
+++ b/numpy/random/mtrand/util.h
@@ -0,0 +1,6 @@
+#ifndef UTIL_H
+#include <numpy/npy_common.h>
+
+long safe_sum_nonneg_long(npy_intp num_colors, long *colors);
+
+#endif
diff --git a/numpy/random/setup.py b/numpy/random/setup.py
index 394a70ead371..9b38d7b0b859 100644
--- a/numpy/random/setup.py
+++ b/numpy/random/setup.py
@@ -42,9 +42,12 @@ def generate_libraries(ext, build_dir):
     libs = []
     # Configure mtrand
     config.add_extension('mtrand',
-                         sources=[join('mtrand', x) for x in
-                                  ['mtrand.c', 'randomkit.c', 'initarray.c',
-                                   'distributions.c']]+[generate_libraries],
+                         sources=([join('mtrand', x) for x in
+                                    ['mtrand.c', 'randomkit.c', 'initarray.c',
+                                     'distributions.c',
+                                     'mvhg_count.c', 'mvhg_marginals.c',
+                                     'logfactorial.c', 'util.c']] +
+                                  [generate_libraries]),
                          libraries=libs,
                          depends=[join('mtrand', '*.h'),
                                   join('mtrand', '*.pyx'),
diff --git a/numpy/random/tests/test_random.py b/numpy/random/tests/test_random.py
index d0bb92a73895..7422cf46030b 100644
--- a/numpy/random/tests/test_random.py
+++ b/numpy/random/tests/test_random.py
@@ -5,7 +5,7 @@
 from numpy.testing import (
         assert_, assert_raises, assert_equal, assert_warns,
         assert_no_warnings, assert_array_equal, assert_array_almost_equal,
-        suppress_warnings
+        assert_allclose, suppress_warnings
         )
 from numpy import random
 import sys
@@ -955,6 +955,135 @@ def test_zipf(self):
         assert_array_equal(actual, desired)
 
 
+class TestMultivariateHypergeometric(object):
+
+    def test_repeatability(self):
+        np.random.seed(1234567890)
+        sample = np.random.multivariate_hypergeometric([3, 4, 5], 5, size=5,
+                                                       method='count')
+        expected = np.array([[2, 2, 1],
+                             [0, 3, 2],
+                             [2, 2, 1],
+                             [1, 1, 3],
+                             [1, 1, 3]])
+        assert_array_equal(sample, expected)
+
+        np.random.seed(1234567890)
+        sample = np.random.multivariate_hypergeometric([20, 30, 50], 50,
+                                                       size=5,
+                                                       method='marginals')
+        expected = np.array([[11, 18, 21],
+                             [10, 17, 23],
+                             [ 6, 15, 29],
+                             [11, 16, 23],
+                             [ 7, 10, 33]])
+        assert_array_equal(sample, expected)
+
+        np.random.seed(1234567890)
+        sample = np.random.multivariate_hypergeometric([20, 30, 50], 12,
+                                                       size=5,
+                                                       method='marginals')
+        expected = np.array([[3, 4, 5],
+                             [2, 3, 7],
+                             [0, 5, 7],
+                             [1, 4, 7],
+                             [3, 4, 5]])
+        assert_array_equal(sample, expected)
+
+    def test_argument_validation(self):
+        # Error cases...
+
+        # `colors` must be a 1-d sequence
+        assert_raises(ValueError, np.random.multivariate_hypergeometric,
+                      10, 4)
+
+        # Negative nsample
+        assert_raises(ValueError, np.random.multivariate_hypergeometric,
+                      [2, 3, 4], -1)
+
+        # Negative color
+        assert_raises(ValueError, np.random.multivariate_hypergeometric,
+                      [-1, 2, 3], 2)
+
+        # nsample exceeds sum(colors)
+        assert_raises(ValueError, np.random.multivariate_hypergeometric,
+                      [2, 3, 4], 10)
+
+        # nsample exceeds sum(colors) (edge case of empty colors)
+        assert_raises(ValueError, np.random.multivariate_hypergeometric,
+                      [], 1)
+
+        # Validation errors associated with very large values in colors.
+        assert_raises(ValueError, np.random.multivariate_hypergeometric,
+                      [999999999, 101], 5, 1, 'marginals')
+        long_info = np.iinfo(np.int_)
+        max_long = long_info.max
+        max_long_index = max_long // long_info.dtype.itemsize
+        assert_raises(ValueError, np.random.multivariate_hypergeometric,
+                      [max_long_index - 100, 101], 5, 1, 'count')
+
+    def test_edge_cases(self):
+        for method in ['count', 'marginals']:
+            sample = np.random.multivariate_hypergeometric([0, 0, 0], 0,
+                                                           method=method)
+            assert_array_equal(sample, [0, 0, 0])
+
+            sample = np.random.multivariate_hypergeometric([], 0,
+                                                           method=method)
+            assert_array_equal(sample, [])
+
+            sample = np.random.multivariate_hypergeometric([], 0, size=1,
+                                                           method=method)
+            assert_array_equal(sample, np.empty((1, 0), dtype=np.int_))
+
+            sample = np.random.multivariate_hypergeometric([1, 2, 3], 0,
+                                                           method=method)
+            assert_array_equal(sample, [0, 0, 0])
+
+            sample = np.random.multivariate_hypergeometric([9, 0, 0], 3,
+                                                           method=method)
+            assert_array_equal(sample, [3, 0, 0])
+
+            sample = np.random.multivariate_hypergeometric([1, 1, 0, 1, 1], 4,
+                                                           method=method)
+            assert_array_equal(sample, [1, 1, 0, 1, 1])
+
+            sample = np.random.multivariate_hypergeometric([3, 4, 5], 12,
+                                                           size=3,
+                                                           method=method)
+            assert_array_equal(sample, [[3, 4, 5]]*3)
+
+    def test_typical_cases(self):
+        mvhypergeom = np.random.multivariate_hypergeometric
+        np.random.seed(20920107)
+        colors = np.array([10, 5, 20, 25])
+        # Cases for nsample:
+        #     nsample < 10
+        #     10 <= nsample < colors.sum()/2
+        #     colors.sum()/2 < nsample < colors.sum() - 10
+        #     colors.sum() - 10 < nsample < colors.sum()
+        for nsample in [8, 25, 45, 55]:
+            for method in ['count', 'marginals']:
+                for size in [5, (2, 3), 100000]:
+                    sample = mvhypergeom(colors, nsample, size, method=method)
+                    if isinstance(size, int):
+                        expected_shape = (size,) + colors.shape
+                    else:
+                        expected_shape = size + colors.shape
+                    assert_equal(sample.shape, expected_shape)
+                    assert_((sample >= 0).all())
+                    assert_((sample <= colors).all())
+                    assert_array_equal(sample.sum(axis=-1),
+                                       np.full(size, fill_value=nsample,
+                                               dtype=int))
+                    if isinstance(size, int) and size >= 100000:
+                        # This sample is large enough to compare its mean to
+                        # the expected values.
+                        assert_allclose(sample.mean(axis=0),
+                                        nsample * colors / colors.sum(),
+                                        rtol=1e-3, atol=0.005)
+
+
 class TestBroadcast(object):
     # tests that functions that broadcast behave
     # correctly when presented with non-scalar arguments