Skip to content

Commit 0129c56

Browse files
committed
Fix race in Parallel Hash Join batch cleanup.
With very unlucky timing and parallel_leader_participation off, PHJ could attempt to access per-batch state just as it was being freed. There was code intended to prevent that by checking for a cleared pointer, but it was buggy. Fix, by introducing an extra barrier phase. The new phase PHJ_BUILD_RUNNING means that it's safe to access the per-batch state to find a batch to help with, and PHJ_BUILD_DONE means that it is too late. The last to detach will free the array of per-batch state as before, but now it will also atomically advance the phase at the same time, so that late attachers can avoid the hazard, without the data race. This mirrors the way per-batch hash tables are freed (see phases PHJ_BATCH_PROBING and PHJ_BATCH_DONE). Revealed by a one-off build farm failure, where BarrierAttach() failed a sanity check assertion, because the memory had been clobbered by dsa_free(). Back-patch to 11, where the code arrived. Reported-by: Michael Paquier <michael@paquier.xyz> Discussion: https://postgr.es/m/20200929061142.GA29096%40paquier.xyz
1 parent a2764d8 commit 0129c56

File tree

3 files changed

+58
-32
lines changed

3 files changed

+58
-32
lines changed

src/backend/executor/nodeHash.c

Lines changed: 32 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -331,14 +331,21 @@ MultiExecParallelHash(HashState *node)
331331
hashtable->nbuckets = pstate->nbuckets;
332332
hashtable->log2_nbuckets = my_log2(hashtable->nbuckets);
333333
hashtable->totalTuples = pstate->total_tuples;
334-
ExecParallelHashEnsureBatchAccessors(hashtable);
334+
335+
/*
336+
* Unless we're completely done and the batch state has been freed, make
337+
* sure we have accessors.
338+
*/
339+
if (BarrierPhase(build_barrier) < PHJ_BUILD_DONE)
340+
ExecParallelHashEnsureBatchAccessors(hashtable);
335341

336342
/*
337343
* The next synchronization point is in ExecHashJoin's HJ_BUILD_HASHTABLE
338-
* case, which will bring the build phase to PHJ_BUILD_DONE (if it isn't
344+
* case, which will bring the build phase to PHJ_BUILD_RUNNING (if it isn't
339345
* there already).
340346
*/
341347
Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER ||
348+
BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING ||
342349
BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
343350
}
344351

@@ -618,7 +625,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, bool keepNulls)
618625
/*
619626
* The next Parallel Hash synchronization point is in
620627
* MultiExecParallelHash(), which will progress it all the way to
621-
* PHJ_BUILD_DONE. The caller must not return control from this
628+
* PHJ_BUILD_RUNNING. The caller must not return control from this
622629
* executor node between now and then.
623630
*/
624631
}
@@ -3001,14 +3008,11 @@ ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable)
30013008
}
30023009

30033010
/*
3004-
* It's possible for a backend to start up very late so that the whole
3005-
* join is finished and the shm state for tracking batches has already
3006-
* been freed by ExecHashTableDetach(). In that case we'll just leave
3007-
* hashtable->batches as NULL so that ExecParallelHashJoinNewBatch() gives
3008-
* up early.
3011+
* We should never see a state where the batch-tracking array is freed,
3012+
* because we should have given up sooner if we join when the build barrier
3013+
* has reached the PHJ_BUILD_DONE phase.
30093014
*/
3010-
if (!DsaPointerIsValid(pstate->batches))
3011-
return;
3015+
Assert(DsaPointerIsValid(pstate->batches));
30123016

30133017
/* Use hash join memory context. */
30143018
oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
@@ -3128,9 +3132,17 @@ ExecHashTableDetachBatch(HashJoinTable hashtable)
31283132
void
31293133
ExecHashTableDetach(HashJoinTable hashtable)
31303134
{
3131-
if (hashtable->parallel_state)
3135+
ParallelHashJoinState *pstate = hashtable->parallel_state;
3136+
3137+
/*
3138+
* If we're involved in a parallel query, we must either have got all the
3139+
* way to PHJ_BUILD_RUNNING, or joined too late and be in PHJ_BUILD_DONE.
3140+
*/
3141+
Assert(!pstate ||
3142+
BarrierPhase(&pstate->build_barrier) >= PHJ_BUILD_RUNNING);
3143+
3144+
if (pstate && BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_RUNNING)
31323145
{
3133-
ParallelHashJoinState *pstate = hashtable->parallel_state;
31343146
int i;
31353147

31363148
/* Make sure any temporary files are closed. */
@@ -3146,17 +3158,22 @@ ExecHashTableDetach(HashJoinTable hashtable)
31463158
}
31473159

31483160
/* If we're last to detach, clean up shared memory. */
3149-
if (BarrierDetach(&pstate->build_barrier))
3161+
if (BarrierArriveAndDetach(&pstate->build_barrier))
31503162
{
3163+
/*
3164+
* Late joining processes will see this state and give up
3165+
* immediately.
3166+
*/
3167+
Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_DONE);
3168+
31513169
if (DsaPointerIsValid(pstate->batches))
31523170
{
31533171
dsa_free(hashtable->area, pstate->batches);
31543172
pstate->batches = InvalidDsaPointer;
31553173
}
31563174
}
3157-
3158-
hashtable->parallel_state = NULL;
31593175
}
3176+
hashtable->parallel_state = NULL;
31603177
}
31613178

31623179
/*

src/backend/executor/nodeHashjoin.c

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@
4545
* PHJ_BUILD_ALLOCATING -- one sets up the batches and table 0
4646
* PHJ_BUILD_HASHING_INNER -- all hash the inner rel
4747
* PHJ_BUILD_HASHING_OUTER -- (multi-batch only) all hash the outer
48-
* PHJ_BUILD_DONE -- building done, probing can begin
48+
* PHJ_BUILD_RUNNING -- building done, probing can begin
49+
* PHJ_BUILD_DONE -- all work complete, one frees batches
4950
*
5051
* While in the phase PHJ_BUILD_HASHING_INNER a separate pair of barriers may
5152
* be used repeatedly as required to coordinate expansions in the number of
@@ -73,7 +74,7 @@
7374
* batches whenever it encounters them while scanning and probing, which it
7475
* can do because it processes batches in serial order.
7576
*
76-
* Once PHJ_BUILD_DONE is reached, backends then split up and process
77+
* Once PHJ_BUILD_RUNNING is reached, backends then split up and process
7778
* different batches, or gang up and work together on probing batches if there
7879
* aren't enough to go around. For each batch there is a separate barrier
7980
* with the following phases:
@@ -95,11 +96,16 @@
9596
*
9697
* To avoid deadlocks, we never wait for any barrier unless it is known that
9798
* all other backends attached to it are actively executing the node or have
98-
* already arrived. Practically, that means that we never return a tuple
99-
* while attached to a barrier, unless the barrier has reached its final
100-
* state. In the slightly special case of the per-batch barrier, we return
101-
* tuples while in PHJ_BATCH_PROBING phase, but that's OK because we use
102-
* BarrierArriveAndDetach() to advance it to PHJ_BATCH_DONE without waiting.
99+
* finished. Practically, that means that we never emit a tuple while attached
100+
* to a barrier, unless the barrier has reached a phase that means that no
101+
* process will wait on it again. We emit tuples while attached to the build
102+
* barrier in phase PHJ_BUILD_RUNNING, and to a per-batch barrier in phase
103+
* PHJ_BATCH_PROBING. These are advanced to PHJ_BUILD_DONE and PHJ_BATCH_DONE
104+
* respectively without waiting, using BarrierArriveAndDetach(). The last to
105+
* detach receives a different return value so that it knows that it's safe to
106+
* clean up. Any straggler process that attaches after that phase is reached
107+
* will see that it's too late to participate or access the relevant shared
108+
* memory objects.
103109
*
104110
*-------------------------------------------------------------------------
105111
*/
@@ -316,6 +322,7 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
316322

317323
build_barrier = &parallel_state->build_barrier;
318324
Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER ||
325+
BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING ||
319326
BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
320327
if (BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER)
321328
{
@@ -328,9 +335,18 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
328335
BarrierArriveAndWait(build_barrier,
329336
WAIT_EVENT_HASH_BUILD_HASHING_OUTER);
330337
}
331-
Assert(BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
338+
else if (BarrierPhase(build_barrier) == PHJ_BUILD_DONE)
339+
{
340+
/*
341+
* If we attached so late that the job is finished and
342+
* the batch state has been freed, we can return
343+
* immediately.
344+
*/
345+
return NULL;
346+
}
332347

333348
/* Each backend should now select a batch to work on. */
349+
Assert(BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING);
334350
hashtable->curbatch = -1;
335351
node->hj_JoinState = HJ_NEED_NEW_BATCH;
336352

@@ -1103,14 +1119,6 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
11031119
int start_batchno;
11041120
int batchno;
11051121

1106-
/*
1107-
* If we started up so late that the batch tracking array has been freed
1108-
* already by ExecHashTableDetach(), then we are finished. See also
1109-
* ExecParallelHashEnsureBatchAccessors().
1110-
*/
1111-
if (hashtable->batches == NULL)
1112-
return false;
1113-
11141122
/*
11151123
* If we were already attached to a batch, remember not to bother checking
11161124
* it again, and detach from it (possibly freeing the hash table if we are

src/include/executor/hashjoin.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,8 @@ typedef struct ParallelHashJoinState
258258
#define PHJ_BUILD_ALLOCATING 1
259259
#define PHJ_BUILD_HASHING_INNER 2
260260
#define PHJ_BUILD_HASHING_OUTER 3
261-
#define PHJ_BUILD_DONE 4
261+
#define PHJ_BUILD_RUNNING 4
262+
#define PHJ_BUILD_DONE 5
262263

263264
/* The phases for probing each batch, used by for batch_barrier. */
264265
#define PHJ_BATCH_ELECTING 0

0 commit comments

Comments
 (0)