Skip to content

Commit 888f2ea

Browse files
committed
Add array_sample() and array_shuffle() functions.
These are useful in Monte Carlo applications. Martin Kalcher, reviewed/adjusted by Daniel Gustafsson and myself Discussion: https://postgr.es/m/9d160a44-7675-51e8-60cf-6d64b76db831@aboutsource.net
1 parent cd82e5c commit 888f2ea

File tree

6 files changed

+284
-2
lines changed

6 files changed

+284
-2
lines changed

doc/src/sgml/func.sgml

+43-1
Original file line numberDiff line numberDiff line change
@@ -16053,7 +16053,7 @@ SELECT js,
1605316053
js IS JSON ARRAY "array?"
1605416054
FROM (VALUES
1605516055
('123'), ('"abc"'), ('{"a": "b"}'), ('[1,2]'),('abc')) foo(js);
16056-
js | json? | scalar? | object? | array?
16056+
js | json? | scalar? | object? | array?
1605716057
------------+-------+---------+---------+--------
1605816058
123 | t | t | f | f
1605916059
"abc" | t | t | f | f
@@ -18777,6 +18777,48 @@ SELECT NULLIF(value, '(none)') ...
1877718777
</para></entry>
1877818778
</row>
1877918779

18780+
<row>
18781+
<entry role="func_table_entry"><para role="func_signature">
18782+
<indexterm>
18783+
<primary>array_sample</primary>
18784+
</indexterm>
18785+
<function>array_sample</function> ( <parameter>array</parameter> <type>anyarray</type>, <parameter>n</parameter> <type>integer</type> )
18786+
<returnvalue>anyarray</returnvalue>
18787+
</para>
18788+
<para>
18789+
Returns an array of <parameter>n</parameter> items randomly selected
18790+
from <parameter>array</parameter>. <parameter>n</parameter> may not
18791+
exceed the length of <parameter>array</parameter>'s first dimension.
18792+
If <parameter>array</parameter> is multi-dimensional,
18793+
an <quote>item</quote> is a slice having a given first subscript.
18794+
</para>
18795+
<para>
18796+
<literal>array_sample(ARRAY[1,2,3,4,5,6], 3)</literal>
18797+
<returnvalue>{2,6,1}</returnvalue>
18798+
</para>
18799+
<para>
18800+
<literal>array_sample(ARRAY[[1,2],[3,4],[5,6]], 2)</literal>
18801+
<returnvalue>{{5,6},{1,2}}</returnvalue>
18802+
</para></entry>
18803+
</row>
18804+
18805+
<row>
18806+
<entry role="func_table_entry"><para role="func_signature">
18807+
<indexterm>
18808+
<primary>array_shuffle</primary>
18809+
</indexterm>
18810+
<function>array_shuffle</function> ( <type>anyarray</type> )
18811+
<returnvalue>anyarray</returnvalue>
18812+
</para>
18813+
<para>
18814+
Randomly shuffles the first dimension of the array.
18815+
</para>
18816+
<para>
18817+
<literal>array_shuffle(ARRAY[[1,2],[3,4],[5,6]])</literal>
18818+
<returnvalue>{{5,6},{1,2},{3,4}}</returnvalue>
18819+
</para></entry>
18820+
</row>
18821+
1878018822
<row>
1878118823
<entry role="func_table_entry"><para role="func_signature">
1878218824
<indexterm id="function-array-to-string">

src/backend/utils/adt/array_userfuncs.c

+166
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "catalog/pg_type.h"
1616
#include "libpq/pqformat.h"
1717
#include "common/int.h"
18+
#include "common/pg_prng.h"
1819
#include "port/pg_bitutils.h"
1920
#include "utils/array.h"
2021
#include "utils/datum.h"
@@ -1525,3 +1526,168 @@ array_positions(PG_FUNCTION_ARGS)
15251526

15261527
PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
15271528
}
1529+
1530+
/*
1531+
* array_shuffle_n
1532+
* Return a copy of array with n randomly chosen items.
1533+
*
1534+
* The number of items must not exceed the size of the first dimension of the
1535+
* array. We preserve the first dimension's lower bound if keep_lb,
1536+
* else it's set to 1. Lower-order dimensions are preserved in any case.
1537+
*
1538+
* NOTE: it would be cleaner to look up the elmlen/elmbval/elmalign info
1539+
* from the system catalogs, given only the elmtyp. However, the caller is
1540+
* in a better position to cache this info across multiple calls.
1541+
*/
1542+
static ArrayType *
1543+
array_shuffle_n(ArrayType *array, int n, bool keep_lb,
1544+
Oid elmtyp, TypeCacheEntry *typentry)
1545+
{
1546+
ArrayType *result;
1547+
int ndim,
1548+
*dims,
1549+
*lbs,
1550+
nelm,
1551+
nitem,
1552+
rdims[MAXDIM],
1553+
rlbs[MAXDIM];
1554+
int16 elmlen;
1555+
bool elmbyval;
1556+
char elmalign;
1557+
Datum *elms,
1558+
*ielms;
1559+
bool *nuls,
1560+
*inuls;
1561+
1562+
ndim = ARR_NDIM(array);
1563+
dims = ARR_DIMS(array);
1564+
lbs = ARR_LBOUND(array);
1565+
1566+
elmlen = typentry->typlen;
1567+
elmbyval = typentry->typbyval;
1568+
elmalign = typentry->typalign;
1569+
1570+
/* If the target array is empty, exit fast */
1571+
if (ndim < 1 || dims[0] < 1 || n < 1)
1572+
return construct_empty_array(elmtyp);
1573+
1574+
deconstruct_array(array, elmtyp, elmlen, elmbyval, elmalign,
1575+
&elms, &nuls, &nelm);
1576+
1577+
nitem = dims[0]; /* total number of items */
1578+
nelm /= nitem; /* number of elements per item */
1579+
1580+
Assert(n <= nitem); /* else it's caller error */
1581+
1582+
/*
1583+
* Shuffle array using Fisher-Yates algorithm. Scan the array and swap
1584+
* current item (nelm datums starting at ielms) with a randomly chosen
1585+
* later item (nelm datums starting at jelms) in each iteration. We can
1586+
* stop once we've done n iterations; then first n items are the result.
1587+
*/
1588+
ielms = elms;
1589+
inuls = nuls;
1590+
for (int i = 0; i < n; i++)
1591+
{
1592+
int j = (int) pg_prng_uint64_range(&pg_global_prng_state, i, nitem - 1) * nelm;
1593+
Datum *jelms = elms + j;
1594+
bool *jnuls = nuls + j;
1595+
1596+
/* Swap i'th and j'th items; advance ielms/inuls to next item */
1597+
for (int k = 0; k < nelm; k++)
1598+
{
1599+
Datum elm = *ielms;
1600+
bool nul = *inuls;
1601+
1602+
*ielms++ = *jelms;
1603+
*inuls++ = *jnuls;
1604+
*jelms++ = elm;
1605+
*jnuls++ = nul;
1606+
}
1607+
}
1608+
1609+
/* Set up dimensions of the result */
1610+
memcpy(rdims, dims, ndim * sizeof(int));
1611+
memcpy(rlbs, lbs, ndim * sizeof(int));
1612+
rdims[0] = n;
1613+
if (!keep_lb)
1614+
rlbs[0] = 1;
1615+
1616+
result = construct_md_array(elms, nuls, ndim, rdims, rlbs,
1617+
elmtyp, elmlen, elmbyval, elmalign);
1618+
1619+
pfree(elms);
1620+
pfree(nuls);
1621+
1622+
return result;
1623+
}
1624+
1625+
/*
1626+
* array_shuffle
1627+
*
1628+
* Returns an array with the same dimensions as the input array, with its
1629+
* first-dimension elements in random order.
1630+
*/
1631+
Datum
1632+
array_shuffle(PG_FUNCTION_ARGS)
1633+
{
1634+
ArrayType *array = PG_GETARG_ARRAYTYPE_P(0);
1635+
ArrayType *result;
1636+
Oid elmtyp;
1637+
TypeCacheEntry *typentry;
1638+
1639+
/*
1640+
* There is no point in shuffling empty arrays or arrays with less than
1641+
* two items.
1642+
*/
1643+
if (ARR_NDIM(array) < 1 || ARR_DIMS(array)[0] < 2)
1644+
PG_RETURN_ARRAYTYPE_P(array);
1645+
1646+
elmtyp = ARR_ELEMTYPE(array);
1647+
typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
1648+
if (typentry == NULL || typentry->type_id != elmtyp)
1649+
{
1650+
typentry = lookup_type_cache(elmtyp, 0);
1651+
fcinfo->flinfo->fn_extra = (void *) typentry;
1652+
}
1653+
1654+
result = array_shuffle_n(array, ARR_DIMS(array)[0], true, elmtyp, typentry);
1655+
1656+
PG_RETURN_ARRAYTYPE_P(result);
1657+
}
1658+
1659+
/*
1660+
* array_sample
1661+
*
1662+
* Returns an array of n randomly chosen first-dimension elements
1663+
* from the input array.
1664+
*/
1665+
Datum
1666+
array_sample(PG_FUNCTION_ARGS)
1667+
{
1668+
ArrayType *array = PG_GETARG_ARRAYTYPE_P(0);
1669+
int n = PG_GETARG_INT32(1);
1670+
ArrayType *result;
1671+
Oid elmtyp;
1672+
TypeCacheEntry *typentry;
1673+
int nitem;
1674+
1675+
nitem = (ARR_NDIM(array) < 1) ? 0 : ARR_DIMS(array)[0];
1676+
1677+
if (n < 0 || n > nitem)
1678+
ereport(ERROR,
1679+
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1680+
errmsg("sample size must be between 0 and %d", nitem)));
1681+
1682+
elmtyp = ARR_ELEMTYPE(array);
1683+
typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
1684+
if (typentry == NULL || typentry->type_id != elmtyp)
1685+
{
1686+
typentry = lookup_type_cache(elmtyp, 0);
1687+
fcinfo->flinfo->fn_extra = (void *) typentry;
1688+
}
1689+
1690+
result = array_shuffle_n(array, n, false, elmtyp, typentry);
1691+
1692+
PG_RETURN_ARRAYTYPE_P(result);
1693+
}

src/include/catalog/catversion.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,6 @@
5757
*/
5858

5959
/* yyyymmddN */
60-
#define CATALOG_VERSION_NO 202304051
60+
#define CATALOG_VERSION_NO 202304071
6161

6262
#endif

src/include/catalog/pg_proc.dat

+6
Original file line numberDiff line numberDiff line change
@@ -1717,6 +1717,12 @@
17171717
{ oid => '6172', descr => 'remove last N elements of array',
17181718
proname => 'trim_array', prorettype => 'anyarray',
17191719
proargtypes => 'anyarray int4', prosrc => 'trim_array' },
1720+
{ oid => '8464', descr => 'shuffle array',
1721+
proname => 'array_shuffle', provolatile => 'v', prorettype => 'anyarray',
1722+
proargtypes => 'anyarray', prosrc => 'array_shuffle' },
1723+
{ oid => '8465', descr => 'take samples from array',
1724+
proname => 'array_sample', provolatile => 'v', prorettype => 'anyarray',
1725+
proargtypes => 'anyarray int4', prosrc => 'array_sample' },
17201726
{ oid => '3816', descr => 'array typanalyze',
17211727
proname => 'array_typanalyze', provolatile => 's', prorettype => 'bool',
17221728
proargtypes => 'internal', prosrc => 'array_typanalyze' },

src/test/regress/expected/arrays.out

+54
Original file line numberDiff line numberDiff line change
@@ -2472,3 +2472,57 @@ SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail
24722472
ERROR: number of elements to trim must be between 0 and 3
24732473
SELECT trim_array(ARRAY[]::int[], 1); -- fail
24742474
ERROR: number of elements to trim must be between 0 and 0
2475+
-- array_shuffle
2476+
SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) <@ '{1,2,3,4,5,6}';
2477+
?column?
2478+
----------
2479+
t
2480+
(1 row)
2481+
2482+
SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) @> '{1,2,3,4,5,6}';
2483+
?column?
2484+
----------
2485+
t
2486+
(1 row)
2487+
2488+
SELECT array_dims(array_shuffle('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[]));
2489+
array_dims
2490+
-------------
2491+
[-1:2][2:3]
2492+
(1 row)
2493+
2494+
SELECT array_dims(array_shuffle('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[]));
2495+
array_dims
2496+
-----------------
2497+
[1:3][1:2][1:2]
2498+
(1 row)
2499+
2500+
-- array_sample
2501+
SELECT array_sample('{1,2,3,4,5,6}'::int[], 3) <@ '{1,2,3,4,5,6}';
2502+
?column?
2503+
----------
2504+
t
2505+
(1 row)
2506+
2507+
SELECT array_length(array_sample('{1,2,3,4,5,6}'::int[], 3), 1);
2508+
array_length
2509+
--------------
2510+
3
2511+
(1 row)
2512+
2513+
SELECT array_dims(array_sample('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[], 3));
2514+
array_dims
2515+
------------
2516+
[1:3][2:3]
2517+
(1 row)
2518+
2519+
SELECT array_dims(array_sample('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[], 2));
2520+
array_dims
2521+
-----------------
2522+
[1:2][1:2][1:2]
2523+
(1 row)
2524+
2525+
SELECT array_sample('{1,2,3,4,5,6}'::int[], -1); -- fail
2526+
ERROR: sample size must be between 0 and 6
2527+
SELECT array_sample('{1,2,3,4,5,6}'::int[], 7); --fail
2528+
ERROR: sample size must be between 0 and 6

src/test/regress/sql/arrays.sql

+14
Original file line numberDiff line numberDiff line change
@@ -761,3 +761,17 @@ FROM
761761
SELECT trim_array(ARRAY[1, 2, 3], -1); -- fail
762762
SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail
763763
SELECT trim_array(ARRAY[]::int[], 1); -- fail
764+
765+
-- array_shuffle
766+
SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) <@ '{1,2,3,4,5,6}';
767+
SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) @> '{1,2,3,4,5,6}';
768+
SELECT array_dims(array_shuffle('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[]));
769+
SELECT array_dims(array_shuffle('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[]));
770+
771+
-- array_sample
772+
SELECT array_sample('{1,2,3,4,5,6}'::int[], 3) <@ '{1,2,3,4,5,6}';
773+
SELECT array_length(array_sample('{1,2,3,4,5,6}'::int[], 3), 1);
774+
SELECT array_dims(array_sample('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[], 3));
775+
SELECT array_dims(array_sample('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[], 2));
776+
SELECT array_sample('{1,2,3,4,5,6}'::int[], -1); -- fail
777+
SELECT array_sample('{1,2,3,4,5,6}'::int[], 7); --fail

0 commit comments

Comments
 (0)