Skip to content

Commit 1fcd0ad

Browse files
committed
Add approximated Zipfian-distributed random generator to pgbench.
Generator helps to make close to real-world tests. Author: Alik Khilazhev Reviewed-By: Fabien COELHO Discussion: https://www.postgresql.org/message-id/flat/BF3B6F54-68C3-417A-BFAB-FB4D66F2B410@postgrespro.ru
1 parent 538d114 commit 1fcd0ad

File tree

5 files changed

+263
-3
lines changed

5 files changed

+263
-3
lines changed

doc/src/sgml/ref/pgbench.sgml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1092,6 +1092,14 @@ pgbench <optional> <replaceable>options</replaceable> </optional> <replaceable>d
10921092
<entry><literal>random_gaussian(1, 10, 2.5)</literal></entry>
10931093
<entry>an integer between <literal>1</literal> and <literal>10</literal></entry>
10941094
</row>
1095+
<row>
1096+
<entry><literal><function>random_zipfian(<replaceable>lb</replaceable>, <replaceable>ub</replaceable>, <replaceable>parameter</replaceable>)</function></literal></entry>
1097+
<entry>integer</entry>
1098+
<entry>Zipfian-distributed random integer in <literal>[lb, ub]</literal>,
1099+
see below</entry>
1100+
<entry><literal>random_zipfian(1, 10, 1.5)</literal></entry>
1101+
<entry>an integer between <literal>1</literal> and <literal>10</literal></entry>
1102+
</row>
10951103
<row>
10961104
<entry><literal><function>sqrt(<replaceable>x</replaceable>)</function></literal></entry>
10971105
<entry>double</entry>
@@ -1173,6 +1181,27 @@ f(x) = PHI(2.0 * parameter * (x - mu) / (max - min + 1)) /
11731181
of the Box-Muller transform.
11741182
</para>
11751183
</listitem>
1184+
<listitem>
1185+
<para>
1186+
<literal>random_zipfian</literal> generates an approximated bounded zipfian
1187+
distribution. For <replaceable>parameter</replaceable> in (0, 1), an
1188+
approximated algorithm is taken from
1189+
"Quickly Generating Billion-Record Synthetic Databases",
1190+
Jim Gray et al, SIGMOD 1994. For <replaceable>parameter</replaceable>
1191+
in (1, 1000), a rejection method is used, based on
1192+
"Non-Uniform Random Variate Generation", Luc Devroye, p. 550-551,
1193+
Springer 1986. The distribution is not defined when the parameter's
1194+
value is 1.0. The drawing performance is poor for parameter values
1195+
close and above 1.0 and on a small range.
1196+
</para>
1197+
<para>
1198+
<replaceable>parameter</replaceable>
1199+
defines how skewed the distribution is. The larger the <replaceable>parameter</replaceable>, the more
1200+
frequently values to the beginning of the interval are drawn.
1201+
The closer to 0 <replaceable>parameter</replaceable> is,
1202+
the flatter (more uniform) the access distribution.
1203+
</para>
1204+
</listitem>
11761205
</itemizedlist>
11771206

11781207
<para>

src/bin/pgbench/exprparse.y

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,9 @@ static const struct
191191
{
192192
"random_exponential", 3, PGBENCH_RANDOM_EXPONENTIAL
193193
},
194+
{
195+
"random_zipfian", 3, PGBENCH_RANDOM_ZIPFIAN
196+
},
194197
/* keep as last array element */
195198
{
196199
NULL, 0, 0

src/bin/pgbench/pgbench.c

Lines changed: 192 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,10 @@ static int pthread_join(pthread_t th, void **thread_return);
9595
#define LOG_STEP_SECONDS 5 /* seconds between log messages */
9696
#define DEFAULT_NXACTS 10 /* default nxacts */
9797

98+
#define ZIPF_CACHE_SIZE 15 /* cache cells number */
99+
98100
#define MIN_GAUSSIAN_PARAM 2.0 /* minimum parameter for gauss */
101+
#define MAX_ZIPFIAN_PARAM 1000 /* maximum parameter for zipfian */
99102

100103
int nxacts = 0; /* number of transactions per client */
101104
int duration = 0; /* duration in seconds */
@@ -330,6 +333,35 @@ typedef struct
330333
int ecnt; /* error count */
331334
} CState;
332335

336+
/*
337+
* Cache cell for zipfian_random call
338+
*/
339+
typedef struct
340+
{
341+
/* cell keys */
342+
double s; /* s - parameter of zipfan_random function */
343+
int64 n; /* number of elements in range (max - min + 1) */
344+
345+
double harmonicn; /* generalizedHarmonicNumber(n, s) */
346+
double alpha;
347+
double beta;
348+
double eta;
349+
350+
uint64 last_used; /* last used logical time */
351+
} ZipfCell;
352+
353+
/*
354+
* Zipf cache for zeta values
355+
*/
356+
typedef struct
357+
{
358+
uint64 current; /* counter for LRU cache replacement algorithm */
359+
360+
int nb_cells; /* number of filled cells */
361+
int overflowCount; /* number of cache overflows */
362+
ZipfCell cells[ZIPF_CACHE_SIZE];
363+
} ZipfCache;
364+
333365
/*
334366
* Thread state
335367
*/
@@ -342,6 +374,8 @@ typedef struct
342374
unsigned short random_state[3]; /* separate randomness for each thread */
343375
int64 throttle_trigger; /* previous/next throttling (us) */
344376
FILE *logfile; /* where to log, or NULL */
377+
ZipfCache zipf_cache; /* for thread-safe zipfian random number
378+
* generation */
345379

346380
/* per thread collected stats */
347381
instr_time start_time; /* thread start time */
@@ -746,6 +780,137 @@ getPoissonRand(TState *thread, int64 center)
746780
return (int64) (-log(uniform) * ((double) center) + 0.5);
747781
}
748782

783+
/* helper function for getZipfianRand */
784+
static double
785+
generalizedHarmonicNumber(int64 n, double s)
786+
{
787+
int i;
788+
double ans = 0.0;
789+
790+
for (i = n; i > 1; i--)
791+
ans += pow(i, -s);
792+
return ans + 1.0;
793+
}
794+
795+
/* set harmonicn and other parameters to cache cell */
796+
static void
797+
zipfSetCacheCell(ZipfCell * cell, int64 n, double s)
798+
{
799+
double harmonic2;
800+
801+
cell->n = n;
802+
cell->s = s;
803+
804+
harmonic2 = generalizedHarmonicNumber(2, s);
805+
cell->harmonicn = generalizedHarmonicNumber(n, s);
806+
807+
cell->alpha = 1.0 / (1.0 - s);
808+
cell->beta = pow(0.5, s);
809+
cell->eta = (1.0 - pow(2.0 / n, 1.0 - s)) / (1.0 - harmonic2 / cell->harmonicn);
810+
}
811+
812+
/*
813+
* search for cache cell with keys (n, s)
814+
* and create new cell if it does not exist
815+
*/
816+
static ZipfCell *
817+
zipfFindOrCreateCacheCell(ZipfCache * cache, int64 n, double s)
818+
{
819+
int i,
820+
least_recently_used = 0;
821+
ZipfCell *cell;
822+
823+
/* search cached cell for given parameters */
824+
for (i = 0; i < cache->nb_cells; i++)
825+
{
826+
cell = &cache->cells[i];
827+
if (cell->n == n && cell->s == s)
828+
return &cache->cells[i];
829+
830+
if (cell->last_used < cache->cells[least_recently_used].last_used)
831+
least_recently_used = i;
832+
}
833+
834+
/* create new one if it does not exist */
835+
if (cache->nb_cells < ZIPF_CACHE_SIZE)
836+
i = cache->nb_cells++;
837+
else
838+
{
839+
/* replace LRU cell if cache is full */
840+
i = least_recently_used;
841+
cache->overflowCount++;
842+
}
843+
844+
zipfSetCacheCell(&cache->cells[i], n, s);
845+
846+
cache->cells[i].last_used = cache->current++;
847+
return &cache->cells[i];
848+
}
849+
850+
/*
851+
* Computing zipfian using rejection method, based on
852+
* "Non-Uniform Random Variate Generation",
853+
* Luc Devroye, p. 550-551, Springer 1986.
854+
*/
855+
static int64
856+
computeIterativeZipfian(TState *thread, int64 n, double s)
857+
{
858+
double b = pow(2.0, s - 1.0);
859+
double x,
860+
t,
861+
u,
862+
v;
863+
864+
while (true)
865+
{
866+
/* random variates */
867+
u = pg_erand48(thread->random_state);
868+
v = pg_erand48(thread->random_state);
869+
870+
x = floor(pow(u, -1.0 / (s - 1.0)));
871+
872+
t = pow(1.0 + 1.0 / x, s - 1.0);
873+
/* reject if too large or out of bound */
874+
if (v * x * (t - 1.0) / (b - 1.0) <= t / b && x <= n)
875+
break;
876+
}
877+
return (int64) x;
878+
}
879+
880+
/*
881+
* Computing zipfian using harmonic numbers, based on algorithm described in
882+
* "Quickly Generating Billion-Record Synthetic Databases",
883+
* Jim Gray et al, SIGMOD 1994
884+
*/
885+
static int64
886+
computeHarmonicZipfian(TState *thread, int64 n, double s)
887+
{
888+
ZipfCell *cell = zipfFindOrCreateCacheCell(&thread->zipf_cache, n, s);
889+
double uniform = pg_erand48(thread->random_state);
890+
double uz = uniform * cell->harmonicn;
891+
892+
if (uz < 1.0)
893+
return 1;
894+
if (uz < 1.0 + cell->beta)
895+
return 2;
896+
return 1 + (int64) (cell->n * pow(cell->eta * uniform - cell->eta + 1.0, cell->alpha));
897+
}
898+
899+
/* random number generator: zipfian distribution from min to max inclusive */
900+
static int64
901+
getZipfianRand(TState *thread, int64 min, int64 max, double s)
902+
{
903+
int64 n = max - min + 1;
904+
905+
/* abort if parameter is invalid */
906+
Assert(s > 0.0 && s != 1.0 && s <= MAX_ZIPFIAN_PARAM);
907+
908+
909+
return min - 1 + ((s > 1)
910+
? computeIterativeZipfian(thread, n, s)
911+
: computeHarmonicZipfian(thread, n, s));
912+
}
913+
749914
/*
750915
* Initialize the given SimpleStats struct to all zeroes
751916
*/
@@ -1303,7 +1468,6 @@ coerceToDouble(PgBenchValue *pval, double *dval)
13031468
return true;
13041469
}
13051470
}
1306-
13071471
/* assign an integer value */
13081472
static void
13091473
setIntValue(PgBenchValue *pv, int64 ival)
@@ -1605,6 +1769,7 @@ evalFunc(TState *thread, CState *st,
16051769
case PGBENCH_RANDOM:
16061770
case PGBENCH_RANDOM_EXPONENTIAL:
16071771
case PGBENCH_RANDOM_GAUSSIAN:
1772+
case PGBENCH_RANDOM_ZIPFIAN:
16081773
{
16091774
int64 imin,
16101775
imax;
@@ -1655,6 +1820,18 @@ evalFunc(TState *thread, CState *st,
16551820
setIntValue(retval,
16561821
getGaussianRand(thread, imin, imax, param));
16571822
}
1823+
else if (func == PGBENCH_RANDOM_ZIPFIAN)
1824+
{
1825+
if (param <= 0.0 || param == 1.0 || param > MAX_ZIPFIAN_PARAM)
1826+
{
1827+
fprintf(stderr,
1828+
"zipfian parameter must be in range (0, 1) U (1, %d]"
1829+
" (got %f)\n", MAX_ZIPFIAN_PARAM, param);
1830+
return false;
1831+
}
1832+
setIntValue(retval,
1833+
getZipfianRand(thread, imin, imax, param));
1834+
}
16581835
else /* exponential */
16591836
{
16601837
if (param <= 0.0)
@@ -3683,6 +3860,8 @@ printResults(TState *threads, StatsData *total, instr_time total_time,
36833860
tps_include,
36843861
tps_exclude;
36853862
int64 ntx = total->cnt - total->skipped;
3863+
int i,
3864+
totalCacheOverflows = 0;
36863865

36873866
time_include = INSTR_TIME_GET_DOUBLE(total_time);
36883867

@@ -3710,6 +3889,15 @@ printResults(TState *threads, StatsData *total, instr_time total_time,
37103889
printf("number of transactions actually processed: " INT64_FORMAT "\n",
37113890
ntx);
37123891
}
3892+
/* Report zipfian cache overflow */
3893+
for (i = 0; i < nthreads; i++)
3894+
{
3895+
totalCacheOverflows += threads[i].zipf_cache.overflowCount;
3896+
}
3897+
if (totalCacheOverflows > 0)
3898+
{
3899+
printf("zipfian cache array overflowed %d time(s)\n", totalCacheOverflows);
3900+
}
37133901

37143902
/* Remaining stats are nonsensical if we failed to execute any xacts */
37153903
if (total->cnt <= 0)
@@ -4513,6 +4701,9 @@ main(int argc, char **argv)
45134701
thread->random_state[2] = random();
45144702
thread->logfile = NULL; /* filled in later */
45154703
thread->latency_late = 0;
4704+
thread->zipf_cache.nb_cells = 0;
4705+
thread->zipf_cache.current = 0;
4706+
thread->zipf_cache.overflowCount = 0;
45164707
initStats(&thread->stats, 0);
45174708

45184709
nclients_dealt += thread->nstate;

src/bin/pgbench/pgbench.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,8 @@ typedef enum PgBenchFunction
7575
PGBENCH_SQRT,
7676
PGBENCH_RANDOM,
7777
PGBENCH_RANDOM_GAUSSIAN,
78-
PGBENCH_RANDOM_EXPONENTIAL
78+
PGBENCH_RANDOM_EXPONENTIAL,
79+
PGBENCH_RANDOM_ZIPFIAN
7980
} PgBenchFunction;
8081

8182
typedef struct PgBenchExpr PgBenchExpr;

src/bin/pgbench/t/001_pgbench_with_server.pl

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,8 @@ sub pgbench
231231
qr{command=18.: double 18\b},
232232
qr{command=19.: double 19\b},
233233
qr{command=20.: double 20\b},
234-
qr{command=21.: int 9223372036854775807\b}, ],
234+
qr{command=21.: int 9223372036854775807\b},
235+
qr{command=23.: int [1-9]\b}, ],
235236
'pgbench expressions',
236237
{ '001_pgbench_expressions' => q{-- integer functions
237238
\set i1 debug(random(1, 100))
@@ -261,6 +262,8 @@ sub pgbench
261262
\set maxint debug(:minint - 1)
262263
-- reset a variable
263264
\set i1 0
265+
-- yet another integer function
266+
\set id debug(random_zipfian(1, 9, 1.3))
264267
} });
265268

266269
# backslash commands
@@ -371,6 +374,14 @@ sub pgbench
371374
0,
372375
[qr{exponential parameter must be greater }],
373376
q{\set i random_exponential(0, 10, 0.0)} ],
377+
[ 'set zipfian param to 1',
378+
0,
379+
[qr{zipfian parameter must be in range \(0, 1\) U \(1, \d+\]}],
380+
q{\set i random_zipfian(0, 10, 1)} ],
381+
[ 'set zipfian param too large',
382+
0,
383+
[qr{zipfian parameter must be in range \(0, 1\) U \(1, \d+\]}],
384+
q{\set i random_zipfian(0, 10, 1000000)} ],
374385
[ 'set non numeric value', 0,
375386
[qr{malformed variable "foo" value: "bla"}], q{\set i :foo + 1} ],
376387
[ 'set no expression', 1, [qr{syntax error}], q{\set i} ],
@@ -412,6 +423,31 @@ sub pgbench
412423
{ $n => $script });
413424
}
414425

426+
# zipfian cache array overflow
427+
pgbench(
428+
'-t 1', 0,
429+
[ qr{processed: 1/1}, qr{zipfian cache array overflowed 1 time\(s\)} ],
430+
[ qr{^} ],
431+
'pgbench zipfian array overflow on random_zipfian',
432+
{ '001_pgbench_random_zipfian' => q{
433+
\set i random_zipfian(1, 100, 0.5)
434+
\set i random_zipfian(2, 100, 0.5)
435+
\set i random_zipfian(3, 100, 0.5)
436+
\set i random_zipfian(4, 100, 0.5)
437+
\set i random_zipfian(5, 100, 0.5)
438+
\set i random_zipfian(6, 100, 0.5)
439+
\set i random_zipfian(7, 100, 0.5)
440+
\set i random_zipfian(8, 100, 0.5)
441+
\set i random_zipfian(9, 100, 0.5)
442+
\set i random_zipfian(10, 100, 0.5)
443+
\set i random_zipfian(11, 100, 0.5)
444+
\set i random_zipfian(12, 100, 0.5)
445+
\set i random_zipfian(13, 100, 0.5)
446+
\set i random_zipfian(14, 100, 0.5)
447+
\set i random_zipfian(15, 100, 0.5)
448+
\set i random_zipfian(16, 100, 0.5)
449+
} });
450+
415451
# throttling
416452
pgbench(
417453
'-t 100 -S --rate=100000 --latency-limit=1000000 -c 2 -n -r',

0 commit comments

Comments
 (0)