Skip to content

Commit e80252d

Browse files
committed
Add width_bucket(anyelement, anyarray).
This provides a convenient method of classifying input values into buckets that are not necessarily equal-width. It works on any sortable data type. The choice of function name is a bit debatable, perhaps, but showing that there's a relationship to the SQL standard's width_bucket() function seems more attractive than the other proposals. Petr Jelinek, reviewed by Pavel Stehule
1 parent 220bb39 commit e80252d

File tree

7 files changed

+458
-12
lines changed

7 files changed

+458
-12
lines changed

doc/src/sgml/func.sgml

+24-9
Original file line numberDiff line numberDiff line change
@@ -901,25 +901,40 @@
901901
<indexterm>
902902
<primary>width_bucket</primary>
903903
</indexterm>
904-
<literal><function>width_bucket(<parameter>op</parameter> <type>numeric</type>, <parameter>b1</parameter> <type>numeric</type>, <parameter>b2</parameter> <type>numeric</type>, <parameter>count</parameter> <type>int</type>)</function></literal>
905-
</entry>
904+
<literal><function>width_bucket(<parameter>operand</parameter> <type>dp</type>, <parameter>b1</parameter> <type>dp</type>, <parameter>b2</parameter> <type>dp</type>, <parameter>count</parameter> <type>int</type>)</function></literal></entry>
906905
<entry><type>int</type></entry>
907-
<entry>return the bucket to which <parameter>operand</> would
908-
be assigned in an equidepth histogram with <parameter>count</>
909-
buckets, in the range <parameter>b1</> to <parameter>b2</></entry>
906+
<entry>return the bucket number to which <parameter>operand</> would
907+
be assigned in a histogram having <parameter>count</> equal-width
908+
buckets spanning the range <parameter>b1</> to <parameter>b2</>;
909+
returns <literal>0</> or <literal><parameter>count</>+1</literal> for
910+
an input outside the range</entry>
910911
<entry><literal>width_bucket(5.35, 0.024, 10.06, 5)</literal></entry>
911912
<entry><literal>3</literal></entry>
912913
</row>
913914

914915
<row>
915-
<entry><literal><function>width_bucket(<parameter>op</parameter> <type>dp</type>, <parameter>b1</parameter> <type>dp</type>, <parameter>b2</parameter> <type>dp</type>, <parameter>count</parameter> <type>int</type>)</function></literal></entry>
916+
<entry><literal><function>width_bucket(<parameter>operand</parameter> <type>numeric</type>, <parameter>b1</parameter> <type>numeric</type>, <parameter>b2</parameter> <type>numeric</type>, <parameter>count</parameter> <type>int</type>)</function></literal></entry>
916917
<entry><type>int</type></entry>
917-
<entry>return the bucket to which <parameter>operand</> would
918-
be assigned in an equidepth histogram with <parameter>count</>
919-
buckets, in the range <parameter>b1</> to <parameter>b2</></entry>
918+
<entry>return the bucket number to which <parameter>operand</> would
919+
be assigned in a histogram having <parameter>count</> equal-width
920+
buckets spanning the range <parameter>b1</> to <parameter>b2</>;
921+
returns <literal>0</> or <literal><parameter>count</>+1</literal> for
922+
an input outside the range</entry>
920923
<entry><literal>width_bucket(5.35, 0.024, 10.06, 5)</literal></entry>
921924
<entry><literal>3</literal></entry>
922925
</row>
926+
927+
<row>
928+
<entry><literal><function>width_bucket(<parameter>operand</parameter> <type>anyelement</type>, <parameter>thresholds</parameter> <type>anyarray</type>)</function></literal></entry>
929+
<entry><type>int</type></entry>
930+
<entry>return the bucket number to which <parameter>operand</> would
931+
be assigned given an array listing the lower bounds of the buckets;
932+
returns <literal>0</> for an input less than the first lower bound;
933+
the <parameter>thresholds</> array <emphasis>must be sorted</>,
934+
smallest first, or unexpected results will be obtained</entry>
935+
<entry><literal>width_bucket(now(), array['yesterday', 'today', 'tomorrow']::timestamptz[])</literal></entry>
936+
<entry><literal>2</literal></entry>
937+
</row>
923938
</tbody>
924939
</tgroup>
925940
</table>

src/backend/utils/adt/arrayfuncs.c

+243
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@
1515
#include "postgres.h"
1616

1717
#include <ctype.h>
18+
#include <math.h>
1819

1920
#include "access/htup_details.h"
21+
#include "catalog/pg_type.h"
2022
#include "funcapi.h"
2123
#include "libpq/pqformat.h"
2224
#include "utils/array.h"
@@ -130,6 +132,15 @@ static ArrayType *array_replace_internal(ArrayType *array,
130132
Datum replace, bool replace_isnull,
131133
bool remove, Oid collation,
132134
FunctionCallInfo fcinfo);
135+
static int width_bucket_array_float8(Datum operand, ArrayType *thresholds);
136+
static int width_bucket_array_fixed(Datum operand,
137+
ArrayType *thresholds,
138+
Oid collation,
139+
TypeCacheEntry *typentry);
140+
static int width_bucket_array_variable(Datum operand,
141+
ArrayType *thresholds,
142+
Oid collation,
143+
TypeCacheEntry *typentry);
133144

134145

135146
/*
@@ -5502,3 +5513,235 @@ array_replace(PG_FUNCTION_ARGS)
55025513
fcinfo);
55035514
PG_RETURN_ARRAYTYPE_P(array);
55045515
}
5516+
5517+
/*
5518+
* Implements width_bucket(anyelement, anyarray).
5519+
*
5520+
* 'thresholds' is an array containing lower bound values for each bucket;
5521+
* these must be sorted from smallest to largest, or bogus results will be
5522+
* produced. If N thresholds are supplied, the output is from 0 to N:
5523+
* 0 is for inputs < first threshold, N is for inputs >= last threshold.
5524+
*/
5525+
Datum
5526+
width_bucket_array(PG_FUNCTION_ARGS)
5527+
{
5528+
Datum operand = PG_GETARG_DATUM(0);
5529+
ArrayType *thresholds = PG_GETARG_ARRAYTYPE_P(1);
5530+
Oid collation = PG_GET_COLLATION();
5531+
Oid element_type = ARR_ELEMTYPE(thresholds);
5532+
int result;
5533+
5534+
/* Check input */
5535+
if (ARR_NDIM(thresholds) > 1)
5536+
ereport(ERROR,
5537+
(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
5538+
errmsg("thresholds must be one-dimensional array")));
5539+
5540+
if (array_contains_nulls(thresholds))
5541+
ereport(ERROR,
5542+
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
5543+
errmsg("thresholds array must not contain NULLs")));
5544+
5545+
/* We have a dedicated implementation for float8 data */
5546+
if (element_type == FLOAT8OID)
5547+
result = width_bucket_array_float8(operand, thresholds);
5548+
else
5549+
{
5550+
TypeCacheEntry *typentry;
5551+
5552+
/* Cache information about the input type */
5553+
typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
5554+
if (typentry == NULL ||
5555+
typentry->type_id != element_type)
5556+
{
5557+
typentry = lookup_type_cache(element_type,
5558+
TYPECACHE_CMP_PROC_FINFO);
5559+
if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
5560+
ereport(ERROR,
5561+
(errcode(ERRCODE_UNDEFINED_FUNCTION),
5562+
errmsg("could not identify a comparison function for type %s",
5563+
format_type_be(element_type))));
5564+
fcinfo->flinfo->fn_extra = (void *) typentry;
5565+
}
5566+
5567+
/*
5568+
* We have separate implementation paths for fixed- and variable-width
5569+
* types, since indexing the array is a lot cheaper in the first case.
5570+
*/
5571+
if (typentry->typlen > 0)
5572+
result = width_bucket_array_fixed(operand, thresholds,
5573+
collation, typentry);
5574+
else
5575+
result = width_bucket_array_variable(operand, thresholds,
5576+
collation, typentry);
5577+
}
5578+
5579+
/* Avoid leaking memory when handed toasted input. */
5580+
PG_FREE_IF_COPY(thresholds, 1);
5581+
5582+
PG_RETURN_INT32(result);
5583+
}
5584+
5585+
/*
5586+
* width_bucket_array for float8 data.
5587+
*/
5588+
static int
5589+
width_bucket_array_float8(Datum operand, ArrayType *thresholds)
5590+
{
5591+
float8 op = DatumGetFloat8(operand);
5592+
float8 *thresholds_data;
5593+
int left;
5594+
int right;
5595+
5596+
/*
5597+
* Since we know the array contains no NULLs, we can just index it
5598+
* directly.
5599+
*/
5600+
thresholds_data = (float8 *) ARR_DATA_PTR(thresholds);
5601+
5602+
left = 0;
5603+
right = ArrayGetNItems(ARR_NDIM(thresholds), ARR_DIMS(thresholds));
5604+
5605+
/*
5606+
* If the probe value is a NaN, it's greater than or equal to all possible
5607+
* threshold values (including other NaNs), so we need not search. Note
5608+
* that this would give the same result as searching even if the array
5609+
* contains multiple NaNs (as long as they're correctly sorted), since the
5610+
* loop logic will find the rightmost of multiple equal threshold values.
5611+
*/
5612+
if (isnan(op))
5613+
return right;
5614+
5615+
/* Find the bucket */
5616+
while (left < right)
5617+
{
5618+
int mid = (left + right) / 2;
5619+
5620+
if (isnan(thresholds_data[mid]) || op < thresholds_data[mid])
5621+
right = mid;
5622+
else
5623+
left = mid + 1;
5624+
}
5625+
5626+
return left;
5627+
}
5628+
5629+
/*
5630+
* width_bucket_array for generic fixed-width data types.
5631+
*/
5632+
static int
5633+
width_bucket_array_fixed(Datum operand,
5634+
ArrayType *thresholds,
5635+
Oid collation,
5636+
TypeCacheEntry *typentry)
5637+
{
5638+
char *thresholds_data;
5639+
int typlen = typentry->typlen;
5640+
bool typbyval = typentry->typbyval;
5641+
FunctionCallInfoData locfcinfo;
5642+
int left;
5643+
int right;
5644+
5645+
/*
5646+
* Since we know the array contains no NULLs, we can just index it
5647+
* directly.
5648+
*/
5649+
thresholds_data = (char *) ARR_DATA_PTR(thresholds);
5650+
5651+
InitFunctionCallInfoData(locfcinfo, &typentry->cmp_proc_finfo, 2,
5652+
collation, NULL, NULL);
5653+
5654+
/* Find the bucket */
5655+
left = 0;
5656+
right = ArrayGetNItems(ARR_NDIM(thresholds), ARR_DIMS(thresholds));
5657+
while (left < right)
5658+
{
5659+
int mid = (left + right) / 2;
5660+
char *ptr;
5661+
int32 cmpresult;
5662+
5663+
ptr = thresholds_data + mid * typlen;
5664+
5665+
locfcinfo.arg[0] = operand;
5666+
locfcinfo.arg[1] = fetch_att(ptr, typbyval, typlen);
5667+
locfcinfo.argnull[0] = false;
5668+
locfcinfo.argnull[1] = false;
5669+
locfcinfo.isnull = false;
5670+
5671+
cmpresult = DatumGetInt32(FunctionCallInvoke(&locfcinfo));
5672+
5673+
if (cmpresult < 0)
5674+
right = mid;
5675+
else
5676+
left = mid + 1;
5677+
}
5678+
5679+
return left;
5680+
}
5681+
5682+
/*
5683+
* width_bucket_array for generic variable-width data types.
5684+
*/
5685+
static int
5686+
width_bucket_array_variable(Datum operand,
5687+
ArrayType *thresholds,
5688+
Oid collation,
5689+
TypeCacheEntry *typentry)
5690+
{
5691+
char *thresholds_data;
5692+
int typlen = typentry->typlen;
5693+
bool typbyval = typentry->typbyval;
5694+
char typalign = typentry->typalign;
5695+
FunctionCallInfoData locfcinfo;
5696+
int left;
5697+
int right;
5698+
5699+
thresholds_data = (char *) ARR_DATA_PTR(thresholds);
5700+
5701+
InitFunctionCallInfoData(locfcinfo, &typentry->cmp_proc_finfo, 2,
5702+
collation, NULL, NULL);
5703+
5704+
/* Find the bucket */
5705+
left = 0;
5706+
right = ArrayGetNItems(ARR_NDIM(thresholds), ARR_DIMS(thresholds));
5707+
while (left < right)
5708+
{
5709+
int mid = (left + right) / 2;
5710+
char *ptr;
5711+
int i;
5712+
int32 cmpresult;
5713+
5714+
/* Locate mid'th array element by advancing from left element */
5715+
ptr = thresholds_data;
5716+
for (i = left; i < mid; i++)
5717+
{
5718+
ptr = att_addlength_pointer(ptr, typlen, ptr);
5719+
ptr = (char *) att_align_nominal(ptr, typalign);
5720+
}
5721+
5722+
locfcinfo.arg[0] = operand;
5723+
locfcinfo.arg[1] = fetch_att(ptr, typbyval, typlen);
5724+
locfcinfo.argnull[0] = false;
5725+
locfcinfo.argnull[1] = false;
5726+
locfcinfo.isnull = false;
5727+
5728+
cmpresult = DatumGetInt32(FunctionCallInvoke(&locfcinfo));
5729+
5730+
if (cmpresult < 0)
5731+
right = mid;
5732+
else
5733+
{
5734+
left = mid + 1;
5735+
5736+
/*
5737+
* Move the thresholds pointer to match new "left" index, so we
5738+
* don't have to seek over those elements again. This trick
5739+
* ensures we do only O(N) array indexing work, not O(N^2).
5740+
*/
5741+
ptr = att_addlength_pointer(ptr, typlen, ptr);
5742+
thresholds_data = (char *) att_align_nominal(ptr, typalign);
5743+
}
5744+
}
5745+
5746+
return left;
5747+
}

src/include/catalog/catversion.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,6 @@
5353
*/
5454

5555
/* yyyymmddN */
56-
#define CATALOG_VERSION_NO 201408281
56+
#define CATALOG_VERSION_NO 201409091
5757

5858
#endif

src/include/catalog/pg_proc.h

+4-2
Original file line numberDiff line numberDiff line change
@@ -514,7 +514,7 @@ DATA(insert OID = 308 ( float84le PGNSP PGUID 12 1 0 0 0 f f f t t f i 2 0
514514
DATA(insert OID = 309 ( float84gt PGNSP PGUID 12 1 0 0 0 f f f t t f i 2 0 16 "701 700" _null_ _null_ _null_ _null_ float84gt _null_ _null_ _null_ ));
515515
DATA(insert OID = 310 ( float84ge PGNSP PGUID 12 1 0 0 0 f f f t t f i 2 0 16 "701 700" _null_ _null_ _null_ _null_ float84ge _null_ _null_ _null_ ));
516516
DATA(insert OID = 320 ( width_bucket PGNSP PGUID 12 1 0 0 0 f f f f t f i 4 0 23 "701 701 701 23" _null_ _null_ _null_ _null_ width_bucket_float8 _null_ _null_ _null_ ));
517-
DESCR("bucket number of operand in equidepth histogram");
517+
DESCR("bucket number of operand in equal-width histogram");
518518

519519
DATA(insert OID = 311 ( float8 PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 701 "700" _null_ _null_ _null_ _null_ ftod _null_ _null_ _null_ ));
520520
DESCR("convert float4 to float8");
@@ -885,6 +885,8 @@ DATA(insert OID = 2334 ( array_agg_finalfn PGNSP PGUID 12 1 0 0 0 f f f f f f
885885
DESCR("aggregate final function");
886886
DATA(insert OID = 2335 ( array_agg PGNSP PGUID 12 1 0 0 0 t f f f f f i 1 0 2277 "2283" _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
887887
DESCR("concatenate aggregate input into an array");
888+
DATA(insert OID = 3218 ( width_bucket PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 23 "2283 2277" _null_ _null_ _null_ _null_ width_bucket_array _null_ _null_ _null_ ));
889+
DESCR("bucket number of operand given a sorted array of bucket lower bounds");
888890
DATA(insert OID = 3816 ( array_typanalyze PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 16 "2281" _null_ _null_ _null_ _null_ array_typanalyze _null_ _null_ _null_ ));
889891
DESCR("array typanalyze");
890892
DATA(insert OID = 3817 ( arraycontsel PGNSP PGUID 12 1 0 0 0 f f f f t f s 4 0 701 "2281 26 2281 23" _null_ _null_ _null_ _null_ arraycontsel _null_ _null_ _null_ ));
@@ -2301,7 +2303,7 @@ DESCR("trunc(x/y)");
23012303
DATA(insert OID = 1980 ( numeric_div_trunc PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 1700 "1700 1700" _null_ _null_ _null_ _null_ numeric_div_trunc _null_ _null_ _null_ ));
23022304
DESCR("trunc(x/y)");
23032305
DATA(insert OID = 2170 ( width_bucket PGNSP PGUID 12 1 0 0 0 f f f f t f i 4 0 23 "1700 1700 1700 23" _null_ _null_ _null_ _null_ width_bucket_numeric _null_ _null_ _null_ ));
2304-
DESCR("bucket number of operand in equidepth histogram");
2306+
DESCR("bucket number of operand in equal-width histogram");
23052307

23062308
DATA(insert OID = 1747 ( time_pl_interval PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 1083 "1083 1186" _null_ _null_ _null_ _null_ time_pl_interval _null_ _null_ _null_ ));
23072309
DATA(insert OID = 1748 ( time_mi_interval PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 1083 "1083 1186" _null_ _null_ _null_ _null_ time_mi_interval _null_ _null_ _null_ ));

src/include/utils/array.h

+1
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,7 @@ extern Datum array_fill_with_lower_bounds(PG_FUNCTION_ARGS);
214214
extern Datum array_unnest(PG_FUNCTION_ARGS);
215215
extern Datum array_remove(PG_FUNCTION_ARGS);
216216
extern Datum array_replace(PG_FUNCTION_ARGS);
217+
extern Datum width_bucket_array(PG_FUNCTION_ARGS);
217218

218219
extern Datum array_ref(ArrayType *array, int nSubscripts, int *indx,
219220
int arraytyplen, int elmlen, bool elmbyval, char elmalign,

0 commit comments

Comments
 (0)