Skip to content

Commit cc721c4

Browse files
committed
HashAgg: use Bump allocator for hash TupleHashTable entries.
The entries aren't freed until the entire hash table is destroyed, so use the Bump allocator to improve allocation speed, avoid wasting space on the chunk header, and avoid wasting space due to the power-of-two allocations. Discussion: https://postgr.es/m/CAApHDvqv1aNB4cM36FzRwivXrEvBO_LsG_eQ3nqDXTjECaatOQ@mail.gmail.com Reviewed-by: David Rowley
1 parent cc43316 commit cc721c4

File tree

3 files changed

+104
-29
lines changed

3 files changed

+104
-29
lines changed

src/backend/executor/execUtils.c

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -322,19 +322,18 @@ CreateExprContext(EState *estate)
322322
ExprContext *
323323
CreateWorkExprContext(EState *estate)
324324
{
325-
Size minContextSize = ALLOCSET_DEFAULT_MINSIZE;
326-
Size initBlockSize = ALLOCSET_DEFAULT_INITSIZE;
327325
Size maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
328326

329-
/* choose the maxBlockSize to be no larger than 1/16 of work_mem */
330-
while (maxBlockSize > work_mem * (Size) 1024 / 16)
331-
maxBlockSize >>= 1;
327+
maxBlockSize = pg_prevpower2_size_t(work_mem * (Size) 1024 / 16);
332328

333-
if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE)
334-
maxBlockSize = ALLOCSET_DEFAULT_INITSIZE;
329+
/* But no bigger than ALLOCSET_DEFAULT_MAXSIZE */
330+
maxBlockSize = Min(maxBlockSize, ALLOCSET_DEFAULT_MAXSIZE);
335331

336-
return CreateExprContextInternal(estate, minContextSize,
337-
initBlockSize, maxBlockSize);
332+
/* and no smaller than ALLOCSET_DEFAULT_INITSIZE */
333+
maxBlockSize = Max(maxBlockSize, ALLOCSET_DEFAULT_INITSIZE);
334+
335+
return CreateExprContextInternal(estate, ALLOCSET_DEFAULT_MINSIZE,
336+
ALLOCSET_DEFAULT_INITSIZE, maxBlockSize);
338337
}
339338

340339
/* ----------------

src/backend/executor/nodeAgg.c

Lines changed: 93 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,7 @@ static void build_hash_tables(AggState *aggstate);
406406
static void build_hash_table(AggState *aggstate, int setno, long nbuckets);
407407
static void hashagg_recompile_expressions(AggState *aggstate, bool minslot,
408408
bool nullcheck);
409+
static void hash_create_memory(AggState *aggstate);
409410
static long hash_choose_num_buckets(double hashentrysize,
410411
long ngroups, Size memory);
411412
static int hash_choose_num_partitions(double input_groups,
@@ -1509,7 +1510,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
15091510
{
15101511
AggStatePerHash perhash = &aggstate->perhash[setno];
15111512
MemoryContext metacxt = aggstate->hash_metacxt;
1512-
MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory;
1513+
MemoryContext tablecxt = aggstate->hash_tablecxt;
15131514
MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory;
15141515
Size additionalsize;
15151516

@@ -1535,7 +1536,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
15351536
nbuckets,
15361537
additionalsize,
15371538
metacxt,
1538-
hashcxt,
1539+
tablecxt,
15391540
tmpcxt,
15401541
DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
15411542
}
@@ -1706,15 +1707,19 @@ hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace)
17061707
tupleWidth);
17071708
Size pergroupSize = numTrans * sizeof(AggStatePerGroupData);
17081709

1709-
tupleChunkSize = CHUNKHDRSZ + tupleSize;
1710-
1711-
if (pergroupSize > 0)
1712-
pergroupChunkSize = CHUNKHDRSZ + pergroupSize;
1713-
else
1714-
pergroupChunkSize = 0;
1710+
/*
1711+
* Entries use the Bump allocator, so the chunk sizes are the same as the
1712+
* requested sizes.
1713+
*/
1714+
tupleChunkSize = MAXALIGN(tupleSize);
1715+
pergroupChunkSize = pergroupSize;
17151716

1717+
/*
1718+
* Transition values use AllocSet, which has a chunk header and also uses
1719+
* power-of-two allocations.
1720+
*/
17161721
if (transitionSpace > 0)
1717-
transitionChunkSize = CHUNKHDRSZ + transitionSpace;
1722+
transitionChunkSize = CHUNKHDRSZ + pg_nextpower2_size_t(transitionSpace);
17181723
else
17191724
transitionChunkSize = 0;
17201725

@@ -1864,8 +1869,11 @@ hash_agg_check_limits(AggState *aggstate)
18641869
uint64 ngroups = aggstate->hash_ngroups_current;
18651870
Size meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt,
18661871
true);
1867-
Size hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
1868-
true);
1872+
Size entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt,
1873+
true);
1874+
Size tval_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
1875+
true);
1876+
Size total_mem = meta_mem + entry_mem + tval_mem;
18691877
bool do_spill = false;
18701878

18711879
#ifdef USE_INJECTION_POINTS
@@ -1884,7 +1892,7 @@ hash_agg_check_limits(AggState *aggstate)
18841892
* can be sure to make progress even in edge cases.
18851893
*/
18861894
if (aggstate->hash_ngroups_current > 0 &&
1887-
(meta_mem + hashkey_mem > aggstate->hash_mem_limit ||
1895+
(total_mem > aggstate->hash_mem_limit ||
18881896
ngroups > aggstate->hash_ngroups_limit))
18891897
{
18901898
do_spill = true;
@@ -1939,6 +1947,7 @@ static void
19391947
hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
19401948
{
19411949
Size meta_mem;
1950+
Size entry_mem;
19421951
Size hashkey_mem;
19431952
Size buffer_mem;
19441953
Size total_mem;
@@ -1950,7 +1959,10 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
19501959
/* memory for the hash table itself */
19511960
meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true);
19521961

1953-
/* memory for the group keys and transition states */
1962+
/* memory for hash entries */
1963+
entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, true);
1964+
1965+
/* memory for byref transition states */
19541966
hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true);
19551967

19561968
/* memory for read/write tape buffers, if spilled */
@@ -1959,7 +1971,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
19591971
buffer_mem += HASHAGG_READ_BUFFER_SIZE;
19601972

19611973
/* update peak mem */
1962-
total_mem = meta_mem + hashkey_mem + buffer_mem;
1974+
total_mem = meta_mem + entry_mem + hashkey_mem + buffer_mem;
19631975
if (total_mem > aggstate->hash_mem_peak)
19641976
aggstate->hash_mem_peak = total_mem;
19651977

@@ -1981,6 +1993,64 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
19811993
}
19821994
}
19831995

1996+
/*
1997+
* Create memory contexts used for hash aggregation.
1998+
*/
1999+
static void
2000+
hash_create_memory(AggState *aggstate)
2001+
{
2002+
Size maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
2003+
2004+
/*
2005+
* The hashcontext's per-tuple memory will be used for byref transition
2006+
* values and returned by AggCheckCallContext().
2007+
*/
2008+
aggstate->hashcontext = CreateWorkExprContext(aggstate->ss.ps.state);
2009+
2010+
/*
2011+
* The meta context will be used for the bucket array of
2012+
* TupleHashEntryData (or arrays, in the case of grouping sets). As the
2013+
* hash table grows, the bucket array will double in size and the old one
2014+
* will be freed, so an AllocSet is appropriate. For large bucket arrays,
2015+
* the large allocation path will be used, so it's not worth worrying
2016+
* about wasting space due to power-of-two allocations.
2017+
*/
2018+
aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt,
2019+
"HashAgg meta context",
2020+
ALLOCSET_DEFAULT_SIZES);
2021+
2022+
/*
2023+
* The hash entries themselves, which include the grouping key
2024+
* (firstTuple) and pergroup data, are stored in the table context. The
2025+
* bump allocator can be used because the entries are not freed until the
2026+
* entire hash table is reset. The bump allocator is faster for
2027+
* allocations and avoids wasting space on the chunk header or
2028+
* power-of-two allocations.
2029+
*
2030+
* Like CreateWorkExprContext(), use smaller sizings for smaller work_mem,
2031+
* to avoid large jumps in memory usage.
2032+
*/
2033+
2034+
/*
2035+
* Like CreateWorkExprContext(), use smaller sizings for smaller work_mem,
2036+
* to avoid large jumps in memory usage.
2037+
*/
2038+
maxBlockSize = pg_prevpower2_size_t(work_mem * (Size) 1024 / 16);
2039+
2040+
/* But no bigger than ALLOCSET_DEFAULT_MAXSIZE */
2041+
maxBlockSize = Min(maxBlockSize, ALLOCSET_DEFAULT_MAXSIZE);
2042+
2043+
/* and no smaller than ALLOCSET_DEFAULT_INITSIZE */
2044+
maxBlockSize = Max(maxBlockSize, ALLOCSET_DEFAULT_INITSIZE);
2045+
2046+
aggstate->hash_tablecxt = BumpContextCreate(aggstate->ss.ps.state->es_query_cxt,
2047+
"HashAgg table context",
2048+
ALLOCSET_DEFAULT_MINSIZE,
2049+
ALLOCSET_DEFAULT_INITSIZE,
2050+
maxBlockSize);
2051+
2052+
}
2053+
19842054
/*
19852055
* Choose a reasonable number of buckets for the initial hash table size.
19862056
*/
@@ -2642,6 +2712,7 @@ agg_refill_hash_table(AggState *aggstate)
26422712

26432713
/* free memory and reset hash tables */
26442714
ReScanExprContext(aggstate->hashcontext);
2715+
MemoryContextReset(aggstate->hash_tablecxt);
26452716
for (int setno = 0; setno < aggstate->num_hashes; setno++)
26462717
ResetTupleHashTable(aggstate->perhash[setno].hashtable);
26472718

@@ -3326,7 +3397,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
33263397
}
33273398

33283399
if (use_hashing)
3329-
aggstate->hashcontext = CreateWorkExprContext(estate);
3400+
hash_create_memory(aggstate);
33303401

33313402
ExecAssignExprContext(estate, &aggstate->ss.ps);
33323403

@@ -3621,9 +3692,6 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
36213692
Plan *outerplan = outerPlan(node);
36223693
uint64 totalGroups = 0;
36233694

3624-
aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt,
3625-
"HashAgg meta context",
3626-
ALLOCSET_DEFAULT_SIZES);
36273695
aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc,
36283696
&TTSOpsMinimalTuple);
36293697
aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc,
@@ -4368,6 +4436,12 @@ ExecEndAgg(AggState *node)
43684436
MemoryContextDelete(node->hash_metacxt);
43694437
node->hash_metacxt = NULL;
43704438
}
4439+
if (node->hash_tablecxt != NULL)
4440+
{
4441+
MemoryContextDelete(node->hash_tablecxt);
4442+
node->hash_tablecxt = NULL;
4443+
}
4444+
43714445

43724446
for (transno = 0; transno < node->numtrans; transno++)
43734447
{
@@ -4484,6 +4558,7 @@ ExecReScanAgg(AggState *node)
44844558
node->hash_ngroups_current = 0;
44854559

44864560
ReScanExprContext(node->hashcontext);
4561+
MemoryContextReset(node->hash_tablecxt);
44874562
/* Rebuild an empty hash table */
44884563
build_hash_tables(node);
44894564
node->table_filled = false;

src/include/nodes/execnodes.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2560,7 +2560,8 @@ typedef struct AggState
25602560
/* these fields are used in AGG_HASHED and AGG_MIXED modes: */
25612561
bool table_filled; /* hash table filled yet? */
25622562
int num_hashes;
2563-
MemoryContext hash_metacxt; /* memory for hash table itself */
2563+
MemoryContext hash_metacxt; /* memory for hash table bucket array */
2564+
MemoryContext hash_tablecxt; /* memory for hash table entries */
25642565
struct LogicalTapeSet *hash_tapeset; /* tape set for hash spill tapes */
25652566
struct HashAggSpill *hash_spills; /* HashAggSpill for each grouping set,
25662567
* exists only during first pass */
@@ -2586,7 +2587,7 @@ typedef struct AggState
25862587
* per-group pointers */
25872588

25882589
/* support for evaluation of agg input expressions: */
2589-
#define FIELDNO_AGGSTATE_ALL_PERGROUPS 53
2590+
#define FIELDNO_AGGSTATE_ALL_PERGROUPS 54
25902591
AggStatePerGroup *all_pergroups; /* array of first ->pergroups, than
25912592
* ->hash_pergroup */
25922593
SharedAggInfo *shared_info; /* one entry per worker */

0 commit comments

Comments
 (0)