Skip to content

Commit 492f6e2

Browse files
committed
Revert "Fix race in Parallel Hash Join batch cleanup."
This reverts commit 0129c56. Discussion: https://postgr.es/m/CA%2BhUKGJmcqAE3MZeDCLLXa62cWM0AJbKmp2JrJYaJ86bz36LFA%40mail.gmail.com
1 parent 0129c56 commit 492f6e2

File tree

3 files changed

+32
-58
lines changed

3 files changed

+32
-58
lines changed

src/backend/executor/nodeHash.c

Lines changed: 15 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -331,21 +331,14 @@ MultiExecParallelHash(HashState *node)
331331
hashtable->nbuckets = pstate->nbuckets;
332332
hashtable->log2_nbuckets = my_log2(hashtable->nbuckets);
333333
hashtable->totalTuples = pstate->total_tuples;
334-
335-
/*
336-
* Unless we're completely done and the batch state has been freed, make
337-
* sure we have accessors.
338-
*/
339-
if (BarrierPhase(build_barrier) < PHJ_BUILD_DONE)
340-
ExecParallelHashEnsureBatchAccessors(hashtable);
334+
ExecParallelHashEnsureBatchAccessors(hashtable);
341335

342336
/*
343337
* The next synchronization point is in ExecHashJoin's HJ_BUILD_HASHTABLE
344-
* case, which will bring the build phase to PHJ_BUILD_RUNNING (if it isn't
338+
* case, which will bring the build phase to PHJ_BUILD_DONE (if it isn't
345339
* there already).
346340
*/
347341
Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER ||
348-
BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING ||
349342
BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
350343
}
351344

@@ -625,7 +618,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, bool keepNulls)
625618
/*
626619
* The next Parallel Hash synchronization point is in
627620
* MultiExecParallelHash(), which will progress it all the way to
628-
* PHJ_BUILD_RUNNING. The caller must not return control from this
621+
* PHJ_BUILD_DONE. The caller must not return control from this
629622
* executor node between now and then.
630623
*/
631624
}
@@ -3008,11 +3001,14 @@ ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable)
30083001
}
30093002

30103003
/*
3011-
* We should never see a state where the batch-tracking array is freed,
3012-
* because we should have given up sooner if we join when the build barrier
3013-
* has reached the PHJ_BUILD_DONE phase.
3004+
* It's possible for a backend to start up very late so that the whole
3005+
* join is finished and the shm state for tracking batches has already
3006+
* been freed by ExecHashTableDetach(). In that case we'll just leave
3007+
* hashtable->batches as NULL so that ExecParallelHashJoinNewBatch() gives
3008+
* up early.
30143009
*/
3015-
Assert(DsaPointerIsValid(pstate->batches));
3010+
if (!DsaPointerIsValid(pstate->batches))
3011+
return;
30163012

30173013
/* Use hash join memory context. */
30183014
oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
@@ -3132,17 +3128,9 @@ ExecHashTableDetachBatch(HashJoinTable hashtable)
31323128
void
31333129
ExecHashTableDetach(HashJoinTable hashtable)
31343130
{
3135-
ParallelHashJoinState *pstate = hashtable->parallel_state;
3136-
3137-
/*
3138-
* If we're involved in a parallel query, we must either have got all the
3139-
* way to PHJ_BUILD_RUNNING, or joined too late and be in PHJ_BUILD_DONE.
3140-
*/
3141-
Assert(!pstate ||
3142-
BarrierPhase(&pstate->build_barrier) >= PHJ_BUILD_RUNNING);
3143-
3144-
if (pstate && BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_RUNNING)
3131+
if (hashtable->parallel_state)
31453132
{
3133+
ParallelHashJoinState *pstate = hashtable->parallel_state;
31463134
int i;
31473135

31483136
/* Make sure any temporary files are closed. */
@@ -3158,22 +3146,17 @@ ExecHashTableDetach(HashJoinTable hashtable)
31583146
}
31593147

31603148
/* If we're last to detach, clean up shared memory. */
3161-
if (BarrierArriveAndDetach(&pstate->build_barrier))
3149+
if (BarrierDetach(&pstate->build_barrier))
31623150
{
3163-
/*
3164-
* Late joining processes will see this state and give up
3165-
* immediately.
3166-
*/
3167-
Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_DONE);
3168-
31693151
if (DsaPointerIsValid(pstate->batches))
31703152
{
31713153
dsa_free(hashtable->area, pstate->batches);
31723154
pstate->batches = InvalidDsaPointer;
31733155
}
31743156
}
3157+
3158+
hashtable->parallel_state = NULL;
31753159
}
3176-
hashtable->parallel_state = NULL;
31773160
}
31783161

31793162
/*

src/backend/executor/nodeHashjoin.c

Lines changed: 16 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,7 @@
4545
* PHJ_BUILD_ALLOCATING -- one sets up the batches and table 0
4646
* PHJ_BUILD_HASHING_INNER -- all hash the inner rel
4747
* PHJ_BUILD_HASHING_OUTER -- (multi-batch only) all hash the outer
48-
* PHJ_BUILD_RUNNING -- building done, probing can begin
49-
* PHJ_BUILD_DONE -- all work complete, one frees batches
48+
* PHJ_BUILD_DONE -- building done, probing can begin
5049
*
5150
* While in the phase PHJ_BUILD_HASHING_INNER a separate pair of barriers may
5251
* be used repeatedly as required to coordinate expansions in the number of
@@ -74,7 +73,7 @@
7473
* batches whenever it encounters them while scanning and probing, which it
7574
* can do because it processes batches in serial order.
7675
*
77-
* Once PHJ_BUILD_RUNNING is reached, backends then split up and process
76+
* Once PHJ_BUILD_DONE is reached, backends then split up and process
7877
* different batches, or gang up and work together on probing batches if there
7978
* aren't enough to go around. For each batch there is a separate barrier
8079
* with the following phases:
@@ -96,16 +95,11 @@
9695
*
9796
* To avoid deadlocks, we never wait for any barrier unless it is known that
9897
* all other backends attached to it are actively executing the node or have
99-
* finished. Practically, that means that we never emit a tuple while attached
100-
* to a barrier, unless the barrier has reached a phase that means that no
101-
* process will wait on it again. We emit tuples while attached to the build
102-
* barrier in phase PHJ_BUILD_RUNNING, and to a per-batch barrier in phase
103-
* PHJ_BATCH_PROBING. These are advanced to PHJ_BUILD_DONE and PHJ_BATCH_DONE
104-
* respectively without waiting, using BarrierArriveAndDetach(). The last to
105-
* detach receives a different return value so that it knows that it's safe to
106-
* clean up. Any straggler process that attaches after that phase is reached
107-
* will see that it's too late to participate or access the relevant shared
108-
* memory objects.
98+
* already arrived. Practically, that means that we never return a tuple
99+
* while attached to a barrier, unless the barrier has reached its final
100+
* state. In the slightly special case of the per-batch barrier, we return
101+
* tuples while in PHJ_BATCH_PROBING phase, but that's OK because we use
102+
* BarrierArriveAndDetach() to advance it to PHJ_BATCH_DONE without waiting.
109103
*
110104
*-------------------------------------------------------------------------
111105
*/
@@ -322,7 +316,6 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
322316

323317
build_barrier = &parallel_state->build_barrier;
324318
Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER ||
325-
BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING ||
326319
BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
327320
if (BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER)
328321
{
@@ -335,18 +328,9 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
335328
BarrierArriveAndWait(build_barrier,
336329
WAIT_EVENT_HASH_BUILD_HASHING_OUTER);
337330
}
338-
else if (BarrierPhase(build_barrier) == PHJ_BUILD_DONE)
339-
{
340-
/*
341-
* If we attached so late that the job is finished and
342-
* the batch state has been freed, we can return
343-
* immediately.
344-
*/
345-
return NULL;
346-
}
331+
Assert(BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
347332

348333
/* Each backend should now select a batch to work on. */
349-
Assert(BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING);
350334
hashtable->curbatch = -1;
351335
node->hj_JoinState = HJ_NEED_NEW_BATCH;
352336

@@ -1119,6 +1103,14 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
11191103
int start_batchno;
11201104
int batchno;
11211105

1106+
/*
1107+
* If we started up so late that the batch tracking array has been freed
1108+
* already by ExecHashTableDetach(), then we are finished. See also
1109+
* ExecParallelHashEnsureBatchAccessors().
1110+
*/
1111+
if (hashtable->batches == NULL)
1112+
return false;
1113+
11221114
/*
11231115
* If we were already attached to a batch, remember not to bother checking
11241116
* it again, and detach from it (possibly freeing the hash table if we are

src/include/executor/hashjoin.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -258,8 +258,7 @@ typedef struct ParallelHashJoinState
258258
#define PHJ_BUILD_ALLOCATING 1
259259
#define PHJ_BUILD_HASHING_INNER 2
260260
#define PHJ_BUILD_HASHING_OUTER 3
261-
#define PHJ_BUILD_RUNNING 4
262-
#define PHJ_BUILD_DONE 5
261+
#define PHJ_BUILD_DONE 4
263262

264263
/* The phases for probing each batch, used by for batch_barrier. */
265264
#define PHJ_BATCH_ELECTING 0

0 commit comments

Comments
 (0)