Skip to content

Commit 887feef

Browse files
committed
Don't CHECK_FOR_INTERRUPTS between WaitLatch and ResetLatch.
This coding pattern creates a race condition, because if an interesting interrupt happens after we've checked InterruptPending but before we reset our latch, the latch-setting done by the signal handler would get lost, and then we might block at WaitLatch in the next iteration without ever noticing the interrupt condition. You can put the CHECK_FOR_INTERRUPTS before WaitLatch or after ResetLatch, but not between them. Aside from fixing the bugs, add some explanatory comments to latch.h to perhaps forestall the next person from making the same mistake. In HEAD, also replace gather_readnext's direct call of HandleParallelMessages with CHECK_FOR_INTERRUPTS. It does not seem clean or useful for this one caller to bypass ProcessInterrupts and go straight to HandleParallelMessages; not least because that fails to consider the InterruptPending flag, resulting in useless work both here (if InterruptPending isn't set) and in the next CHECK_FOR_INTERRUPTS call (if it is). This thinko seems to have been introduced in the initial coding of storage/ipc/shm_mq.c (commit ec9037d), and then blindly copied into all the subsequent parallel-query support logic. Back-patch relevant hunks to 9.4 to extirpate the error everywhere. Discussion: <1661.1469996911@sss.pgh.pa.us>
1 parent dd5eb80 commit 887feef

File tree

6 files changed

+32
-17
lines changed

6 files changed

+32
-17
lines changed

src/backend/executor/nodeGather.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -330,8 +330,8 @@ gather_readnext(GatherState *gatherstate)
330330
HeapTuple tup;
331331
bool readerdone;
332332

333-
/* Make sure we've read all messages from workers. */
334-
HandleParallelMessages();
333+
/* Check for async events, particularly messages from workers. */
334+
CHECK_FOR_INTERRUPTS();
335335

336336
/* Attempt to read a tuple, but don't block if none is available. */
337337
reader = gatherstate->reader[gatherstate->nextreader];
@@ -388,7 +388,6 @@ gather_readnext(GatherState *gatherstate)
388388

389389
/* Nothing to do except wait for developments. */
390390
WaitLatch(MyLatch, WL_LATCH_SET, 0);
391-
CHECK_FOR_INTERRUPTS();
392391
ResetLatch(MyLatch);
393392
nvisited = 0;
394393
}

src/backend/libpq/pqmq.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,8 +172,8 @@ mq_putmessage(char msgtype, const char *s, size_t len)
172172
break;
173173

174174
WaitLatch(&MyProc->procLatch, WL_LATCH_SET, 0);
175-
CHECK_FOR_INTERRUPTS();
176175
ResetLatch(&MyProc->procLatch);
176+
CHECK_FOR_INTERRUPTS();
177177
}
178178

179179
pq_mq_busy = false;

src/backend/storage/ipc/shm_mq.c

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -896,11 +896,11 @@ shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, const void *data,
896896
*/
897897
WaitLatch(MyLatch, WL_LATCH_SET, 0);
898898

899-
/* An interrupt may have occurred while we were waiting. */
900-
CHECK_FOR_INTERRUPTS();
901-
902899
/* Reset the latch so we don't spin. */
903900
ResetLatch(MyLatch);
901+
902+
/* An interrupt may have occurred while we were waiting. */
903+
CHECK_FOR_INTERRUPTS();
904904
}
905905
else
906906
{
@@ -993,11 +993,11 @@ shm_mq_receive_bytes(shm_mq *mq, Size bytes_needed, bool nowait,
993993
*/
994994
WaitLatch(MyLatch, WL_LATCH_SET, 0);
995995

996-
/* An interrupt may have occurred while we were waiting. */
997-
CHECK_FOR_INTERRUPTS();
998-
999996
/* Reset the latch so we don't spin. */
1000997
ResetLatch(MyLatch);
998+
999+
/* An interrupt may have occurred while we were waiting. */
1000+
CHECK_FOR_INTERRUPTS();
10011001
}
10021002
}
10031003

@@ -1092,11 +1092,11 @@ shm_mq_wait_internal(volatile shm_mq *mq, PGPROC *volatile * ptr,
10921092
/* Wait to be signalled. */
10931093
WaitLatch(MyLatch, WL_LATCH_SET, 0);
10941094

1095-
/* An interrupt may have occurred while we were waiting. */
1096-
CHECK_FOR_INTERRUPTS();
1097-
10981095
/* Reset the latch so we don't spin. */
10991096
ResetLatch(MyLatch);
1097+
1098+
/* An interrupt may have occurred while we were waiting. */
1099+
CHECK_FOR_INTERRUPTS();
11001100
}
11011101

11021102
return result;

src/include/storage/latch.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,22 @@
5252
* do. Otherwise, if someone sets the latch between the check and the
5353
* ResetLatch call, you will miss it and Wait will incorrectly block.
5454
*
55+
* Another valid coding pattern looks like:
56+
*
57+
* for (;;)
58+
* {
59+
* if (work to do)
60+
* Do Stuff(); // in particular, exit loop if some condition satisfied
61+
* WaitLatch();
62+
* ResetLatch();
63+
* }
64+
*
65+
* This is useful to reduce latch traffic if it's expected that the loop's
66+
* termination condition will often be satisfied in the first iteration;
67+
* the cost is an extra loop iteration before blocking when it is not.
68+
* What must be avoided is placing any checks for asynchronous events after
69+
* WaitLatch and before ResetLatch, as that creates a race condition.
70+
*
5571
* To wake up the waiter, you must first set a global flag or something
5672
* else that the wait loop tests in the "if (work to do)" part, and call
5773
* SetLatch *after* that. SetLatch is designed to return quickly if the

src/test/modules/test_shm_mq/setup.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -281,11 +281,11 @@ wait_for_workers_to_become_ready(worker_state *wstate,
281281
/* Wait to be signalled. */
282282
WaitLatch(MyLatch, WL_LATCH_SET, 0);
283283

284-
/* An interrupt may have occurred while we were waiting. */
285-
CHECK_FOR_INTERRUPTS();
286-
287284
/* Reset the latch so we don't spin. */
288285
ResetLatch(MyLatch);
286+
287+
/* An interrupt may have occurred while we were waiting. */
288+
CHECK_FOR_INTERRUPTS();
289289
}
290290

291291
if (!result)

src/test/modules/test_shm_mq/test.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,8 +231,8 @@ test_shm_mq_pipelined(PG_FUNCTION_ARGS)
231231
* for us to do.
232232
*/
233233
WaitLatch(MyLatch, WL_LATCH_SET, 0);
234-
CHECK_FOR_INTERRUPTS();
235234
ResetLatch(MyLatch);
235+
CHECK_FOR_INTERRUPTS();
236236
}
237237
}
238238

0 commit comments

Comments
 (0)