Skip to content

Commit 18a4a62

Browse files
committed
Harden pmsignal.c against clobbered shared memory.
The postmaster is not supposed to do anything that depends fundamentally on shared memory contents, because that creates the risk that a backend crash that trashes shared memory will take the postmaster down with it, preventing automatic recovery. In commit 969d7cd I lost sight of this principle and coded AssignPostmasterChildSlot() in such a way that it could fail or even crash if the shared PMSignalState structure became corrupted. Remarkably, we've not seen field reports of such crashes; but I managed to induce one while testing the recent changes around palloc chunk headers. To fix, make a semi-duplicative state array inside the postmaster so that we need consult only local state while choosing a "child slot" for a new backend. Ensure that other postmaster-executed routines in pmsignal.c don't have critical dependencies on the shared state, either. Corruption of PMSignalState might now lead ReleasePostmasterChildSlot() to conclude that backend X failed, when actually backend Y was the one that trashed things. But that doesn't matter, because we'll force a cluster-wide reset regardless. Back-patch to all supported branches, since this is an old bug. Discussion: https://postgr.es/m/3436789.1665187055@sss.pgh.pa.us
1 parent b8f2687 commit 18a4a62

File tree

1 file changed

+44
-12
lines changed

1 file changed

+44
-12
lines changed

src/backend/storage/ipc/pmsignal.c

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "replication/walsender.h"
2727
#include "storage/pmsignal.h"
2828
#include "storage/shmem.h"
29+
#include "utils/memutils.h"
2930

3031

3132
/*
@@ -75,12 +76,21 @@ struct PMSignalData
7576
QuitSignalReason sigquit_reason; /* why SIGQUIT was sent */
7677
/* per-child-process flags */
7778
int num_child_flags; /* # of entries in PMChildFlags[] */
78-
int next_child_flag; /* next slot to try to assign */
7979
sig_atomic_t PMChildFlags[FLEXIBLE_ARRAY_MEMBER];
8080
};
8181

82+
/* PMSignalState pointer is valid in both postmaster and child processes */
8283
NON_EXEC_STATIC volatile PMSignalData *PMSignalState = NULL;
8384

85+
/*
86+
* These static variables are valid only in the postmaster. We keep a
87+
* duplicative private array so that we can trust its state even if some
88+
* failing child has clobbered the PMSignalData struct in shared memory.
89+
*/
90+
static int num_child_inuse; /* # of entries in PMChildInUse[] */
91+
static int next_child_inuse; /* next slot to try to assign */
92+
static bool *PMChildInUse; /* true if i'th flag slot is assigned */
93+
8494
/*
8595
* Signal handler to be notified if postmaster dies.
8696
*/
@@ -142,7 +152,25 @@ PMSignalShmemInit(void)
142152
{
143153
/* initialize all flags to zeroes */
144154
MemSet(unvolatize(PMSignalData *, PMSignalState), 0, PMSignalShmemSize());
145-
PMSignalState->num_child_flags = MaxLivePostmasterChildren();
155+
num_child_inuse = MaxLivePostmasterChildren();
156+
PMSignalState->num_child_flags = num_child_inuse;
157+
158+
/*
159+
* Also allocate postmaster's private PMChildInUse[] array. We
160+
* might've already done that in a previous shared-memory creation
161+
* cycle, in which case free the old array to avoid a leak. (Do it
162+
* like this to support the possibility that MaxLivePostmasterChildren
163+
* changed.) In a standalone backend, we do not need this.
164+
*/
165+
if (PostmasterContext != NULL)
166+
{
167+
if (PMChildInUse)
168+
pfree(PMChildInUse);
169+
PMChildInUse = (bool *)
170+
MemoryContextAllocZero(PostmasterContext,
171+
num_child_inuse * sizeof(bool));
172+
}
173+
next_child_inuse = 0;
146174
}
147175
}
148176

@@ -218,21 +246,24 @@ GetQuitSignalReason(void)
218246
int
219247
AssignPostmasterChildSlot(void)
220248
{
221-
int slot = PMSignalState->next_child_flag;
249+
int slot = next_child_inuse;
222250
int n;
223251

224252
/*
225-
* Scan for a free slot. We track the last slot assigned so as not to
226-
* waste time repeatedly rescanning low-numbered slots.
253+
* Scan for a free slot. Notice that we trust nothing about the contents
254+
* of PMSignalState, but use only postmaster-local data for this decision.
255+
* We track the last slot assigned so as not to waste time repeatedly
256+
* rescanning low-numbered slots.
227257
*/
228-
for (n = PMSignalState->num_child_flags; n > 0; n--)
258+
for (n = num_child_inuse; n > 0; n--)
229259
{
230260
if (--slot < 0)
231-
slot = PMSignalState->num_child_flags - 1;
232-
if (PMSignalState->PMChildFlags[slot] == PM_CHILD_UNUSED)
261+
slot = num_child_inuse - 1;
262+
if (!PMChildInUse[slot])
233263
{
264+
PMChildInUse[slot] = true;
234265
PMSignalState->PMChildFlags[slot] = PM_CHILD_ASSIGNED;
235-
PMSignalState->next_child_flag = slot;
266+
next_child_inuse = slot;
236267
return slot + 1;
237268
}
238269
}
@@ -254,7 +285,7 @@ ReleasePostmasterChildSlot(int slot)
254285
{
255286
bool result;
256287

257-
Assert(slot > 0 && slot <= PMSignalState->num_child_flags);
288+
Assert(slot > 0 && slot <= num_child_inuse);
258289
slot--;
259290

260291
/*
@@ -264,17 +295,18 @@ ReleasePostmasterChildSlot(int slot)
264295
*/
265296
result = (PMSignalState->PMChildFlags[slot] == PM_CHILD_ASSIGNED);
266297
PMSignalState->PMChildFlags[slot] = PM_CHILD_UNUSED;
298+
PMChildInUse[slot] = false;
267299
return result;
268300
}
269301

270302
/*
271303
* IsPostmasterChildWalSender - check if given slot is in use by a
272-
* walsender process.
304+
* walsender process. This is called only by the postmaster.
273305
*/
274306
bool
275307
IsPostmasterChildWalSender(int slot)
276308
{
277-
Assert(slot > 0 && slot <= PMSignalState->num_child_flags);
309+
Assert(slot > 0 && slot <= num_child_inuse);
278310
slot--;
279311

280312
if (PMSignalState->PMChildFlags[slot] == PM_CHILD_WALSENDER)

0 commit comments

Comments
 (0)