Commit 93ea78b

Fix EXPLAIN ANALYZE output for Parallel Hash.

In a race case, EXPLAIN ANALYZE could fail to display correct nbatch and
size information.  Refactor so that participants report only on batches
they worked on rather than trying to report on all of them, and teach
explain.c to consider the HashInstrumentation object from all
participants instead of picking the first one it can find.  This should
fix an occasional build farm failure in the "join" regression test.

Author: Thomas Munro
Reviewed-By: Andres Freund
Discussion: https://postgr.es/m/30219.1514428346%40sss.pgh.pa.us

1 parent 6078770 commit 93ea78b
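
As a rough sketch of the merging rule this commit introduces (standalone C,
not the PostgreSQL sources: the struct mirrors HashInstrumentation's fields,
and the worker numbers are invented for illustration):

    #include <stdio.h>

    #define Max(a, b) ((a) > (b) ? (a) : (b))

    /* Mirrors the HashInstrumentation fields that explain.c merges. */
    typedef struct
    {
        int     nbuckets;
        int     nbuckets_original;
        int     nbatch;
        int     nbatch_original;
        size_t  space_peak;
    } WorkerHashStats;

    int
    main(void)
    {
        /*
         * Hypothetical results: worker 0 started late and saw no batches,
         * workers 1 and 2 attached to different subsets of batches.
         */
        WorkerHashStats workers[] = {
            {0, 0, 0, 0, 0},
            {4096, 4096, 4, 2, 300 * 1024},
            {4096, 4096, 8, 2, 420 * 1024},
        };
        WorkerHashStats merged = {0};

        for (int i = 0; i < 3; i++)
        {
            if (workers[i].nbatch > 0)  /* skip participants with no data */
            {
                /* All participants agree on buckets; just overwrite. */
                merged.nbuckets = workers[i].nbuckets;
                merged.nbuckets_original = workers[i].nbuckets_original;

                /* A late starter may lack the final nbatch; take the max. */
                merged.nbatch = Max(merged.nbatch, workers[i].nbatch);
                merged.nbatch_original = workers[i].nbatch_original;

                /* Report the highest peak seen by any participant. */
                merged.space_peak = Max(merged.space_peak,
                                        workers[i].space_peak);
            }
        }

        /* Prints: Buckets: 4096  Batches: 8  Memory Usage: 420kB */
        printf("Buckets: %d  Batches: %d  Memory Usage: %zukB\n",
               merged.nbuckets, merged.nbatch,
               (merged.space_peak + 1023) / 1024);
        return 0;
    }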

File tree

4 files changed: +62 −51 lines changed

src/backend/commands/explain.c

Lines changed: 52 additions & 27 deletions
@@ -2379,70 +2379,95 @@ show_sort_info(SortState *sortstate, ExplainState *es)
 static void
 show_hash_info(HashState *hashstate, ExplainState *es)
 {
-    HashInstrumentation *hinstrument = NULL;
+    HashInstrumentation hinstrument = {0};
 
     /*
      * In a parallel query, the leader process may or may not have run the
      * hash join, and even if it did it may not have built a hash table due to
      * timing (if it started late it might have seen no tuples in the outer
      * relation and skipped building the hash table).  Therefore we have to be
-     * prepared to get instrumentation data from a worker if there is no hash
-     * table.
+     * prepared to get instrumentation data from all participants.
      */
     if (hashstate->hashtable)
-    {
-        hinstrument = (HashInstrumentation *)
-            palloc(sizeof(HashInstrumentation));
-        ExecHashGetInstrumentation(hinstrument, hashstate->hashtable);
-    }
-    else if (hashstate->shared_info)
+        ExecHashGetInstrumentation(&hinstrument, hashstate->hashtable);
+
+    /*
+     * Merge results from workers.  In the parallel-oblivious case, the
+     * results from all participants should be identical, except where
+     * participants didn't run the join at all so have no data.  In the
+     * parallel-aware case, we need to consider all the results.  Each worker
+     * may have seen a different subset of batches and we want to find the
+     * highest memory usage for any one batch across all batches.
+     */
+    if (hashstate->shared_info)
     {
         SharedHashInfo *shared_info = hashstate->shared_info;
-        int         i;
+        int         i;
 
-        /* Find the first worker that built a hash table. */
         for (i = 0; i < shared_info->num_workers; ++i)
         {
-            if (shared_info->hinstrument[i].nbatch > 0)
+            HashInstrumentation *worker_hi = &shared_info->hinstrument[i];
+
+            if (worker_hi->nbatch > 0)
             {
-                hinstrument = &shared_info->hinstrument[i];
-                break;
+                /*
+                 * Every participant should agree on the buckets, so to be
+                 * sure we have a value we'll just overwrite each time.
+                 */
+                hinstrument.nbuckets = worker_hi->nbuckets;
+                hinstrument.nbuckets_original = worker_hi->nbuckets_original;
+
+                /*
+                 * Normally every participant should agree on the number of
+                 * batches too, but it's possible for a backend that started
+                 * late and missed the whole join not to have the final nbatch
+                 * number.  So we'll take the largest number.
+                 */
+                hinstrument.nbatch = Max(hinstrument.nbatch, worker_hi->nbatch);
+                hinstrument.nbatch_original = worker_hi->nbatch_original;
+
+                /*
+                 * In a parallel-aware hash join, for now we report the
+                 * maximum peak memory reported by any worker.
+                 */
+                hinstrument.space_peak =
+                    Max(hinstrument.space_peak, worker_hi->space_peak);
             }
         }
     }
 
-    if (hinstrument)
+    if (hinstrument.nbatch > 0)
     {
-        long        spacePeakKb = (hinstrument->space_peak + 1023) / 1024;
+        long        spacePeakKb = (hinstrument.space_peak + 1023) / 1024;
 
         if (es->format != EXPLAIN_FORMAT_TEXT)
         {
-            ExplainPropertyLong("Hash Buckets", hinstrument->nbuckets, es);
+            ExplainPropertyLong("Hash Buckets", hinstrument.nbuckets, es);
             ExplainPropertyLong("Original Hash Buckets",
-                                hinstrument->nbuckets_original, es);
-            ExplainPropertyLong("Hash Batches", hinstrument->nbatch, es);
+                                hinstrument.nbuckets_original, es);
+            ExplainPropertyLong("Hash Batches", hinstrument.nbatch, es);
             ExplainPropertyLong("Original Hash Batches",
-                                hinstrument->nbatch_original, es);
+                                hinstrument.nbatch_original, es);
             ExplainPropertyLong("Peak Memory Usage", spacePeakKb, es);
         }
-        else if (hinstrument->nbatch_original != hinstrument->nbatch ||
-                 hinstrument->nbuckets_original != hinstrument->nbuckets)
+        else if (hinstrument.nbatch_original != hinstrument.nbatch ||
+                 hinstrument.nbuckets_original != hinstrument.nbuckets)
         {
             appendStringInfoSpaces(es->str, es->indent * 2);
             appendStringInfo(es->str,
                              "Buckets: %d (originally %d)  Batches: %d (originally %d)  Memory Usage: %ldkB\n",
-                             hinstrument->nbuckets,
-                             hinstrument->nbuckets_original,
-                             hinstrument->nbatch,
-                             hinstrument->nbatch_original,
+                             hinstrument.nbuckets,
+                             hinstrument.nbuckets_original,
+                             hinstrument.nbatch,
+                             hinstrument.nbatch_original,
                              spacePeakKb);
         }
         else
         {
             appendStringInfoSpaces(es->str, es->indent * 2);
             appendStringInfo(es->str,
                              "Buckets: %d  Batches: %d  Memory Usage: %ldkB\n",
-                             hinstrument->nbuckets, hinstrument->nbatch,
+                             hinstrument.nbuckets, hinstrument.nbatch,
                              spacePeakKb);
         }
     }

src/backend/executor/nodeHash.c

Lines changed: 10 additions & 17 deletions
@@ -3090,7 +3090,16 @@ ExecHashTableDetachBatch(HashJoinTable hashtable)
             batch->buckets = InvalidDsaPointer;
         }
     }
-    ExecParallelHashUpdateSpacePeak(hashtable, curbatch);
+
+    /*
+     * Track the largest batch we've been attached to.  Though each
+     * backend might see a different subset of batches, explain.c will
+     * scan the results from all backends to find the largest value.
+     */
+    hashtable->spacePeak =
+        Max(hashtable->spacePeak,
+            batch->size + sizeof(dsa_pointer_atomic) * hashtable->nbuckets);
+
     /* Remember that we are not attached to a batch. */
     hashtable->curbatch = -1;
 }

@@ -3295,19 +3304,3 @@ ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size)
 
     return true;
 }
-
-/*
- * Update this backend's copy of hashtable->spacePeak to account for a given
- * batch.  This is called at the end of hashing for batch 0, and then for each
- * batch when it is done or discovered to be already done.  The result is used
- * for EXPLAIN output.
- */
-void
-ExecParallelHashUpdateSpacePeak(HashJoinTable hashtable, int batchno)
-{
-    size_t      size;
-
-    size = hashtable->batches[batchno].shared->size;
-    size += sizeof(dsa_pointer_atomic) * hashtable->nbuckets;
-    hashtable->spacePeak = Max(hashtable->spacePeak, size);
-}
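
For concreteness, the accounting now inlined at detach time charges a batch
for its tuple storage plus its bucket array of dsa_pointer_atomic slots.
A standalone sketch (the batch size and bucket count are invented, and
dsa_pointer_atomic is stubbed as a 64-bit integer, as on typical builds):

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t dsa_pointer_atomic;    /* stand-in for the real type */

    #define Max(a, b) ((a) > (b) ? (a) : (b))

    int
    main(void)
    {
        size_t  spacePeak = 0;
        size_t  batch_size = 384 * 1024;    /* hypothetical tuple storage */
        int     nbuckets = 4096;            /* hypothetical bucket count */

        /* The same expression the commit inlines at batch detach time. */
        spacePeak = Max(spacePeak,
                        batch_size + sizeof(dsa_pointer_atomic) * nbuckets);

        /* Prints: peak = 425984 bytes (416 kB) */
        printf("peak = %zu bytes (%zu kB)\n",
               spacePeak, (spacePeak + 1023) / 1024);
        return 0;
    }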

src/backend/executor/nodeHashjoin.c

Lines changed: 0 additions & 6 deletions
@@ -1186,12 +1186,6 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
                      * remain).
                      */
                     BarrierDetach(batch_barrier);
-
-                    /*
-                     * We didn't work on this batch, but we need to observe
-                     * its size for EXPLAIN.
-                     */
-                    ExecParallelHashUpdateSpacePeak(hashtable, batchno);
                     hashtable->batches[batchno].done = true;
                     hashtable->curbatch = -1;
                     break;

src/include/executor/nodeHash.h

Lines changed: 0 additions & 1 deletion
@@ -33,7 +33,6 @@ extern void ExecHashTableDetach(HashJoinTable hashtable);
 extern void ExecHashTableDetachBatch(HashJoinTable hashtable);
 extern void ExecParallelHashTableSetCurrentBatch(HashJoinTable hashtable,
                                                  int batchno);
-void ExecParallelHashUpdateSpacePeak(HashJoinTable hashtable, int batchno);
 
 extern void ExecHashTableInsert(HashJoinTable hashtable,
                                 TupleTableSlot *slot,
