Skip to content

Commit 85de67b

Browse files
tglsfdcpull[bot]
authored andcommitted
Harden pmsignal.c against clobbered shared memory.
The postmaster is not supposed to do anything that depends fundamentally on shared memory contents, because that creates the risk that a backend crash that trashes shared memory will take the postmaster down with it, preventing automatic recovery. In commit 969d7cd I lost sight of this principle and coded AssignPostmasterChildSlot() in such a way that it could fail or even crash if the shared PMSignalState structure became corrupted. Remarkably, we've not seen field reports of such crashes; but I managed to induce one while testing the recent changes around palloc chunk headers. To fix, make a semi-duplicative state array inside the postmaster so that we need consult only local state while choosing a "child slot" for a new backend. Ensure that other postmaster-executed routines in pmsignal.c don't have critical dependencies on the shared state, either. Corruption of PMSignalState might now lead ReleasePostmasterChildSlot() to conclude that backend X failed, when actually backend Y was the one that trashed things. But that doesn't matter, because we'll force a cluster-wide reset regardless. Back-patch to all supported branches, since this is an old bug. Discussion: https://postgr.es/m/3436789.1665187055@sss.pgh.pa.us
1 parent 70a4de4 commit 85de67b

File tree

1 file changed

+44
-12
lines changed

1 file changed

+44
-12
lines changed

src/backend/storage/ipc/pmsignal.c

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "replication/walsender.h"
2727
#include "storage/pmsignal.h"
2828
#include "storage/shmem.h"
29+
#include "utils/memutils.h"
2930

3031

3132
/*
@@ -75,12 +76,21 @@ struct PMSignalData
7576
QuitSignalReason sigquit_reason; /* why SIGQUIT was sent */
7677
/* per-child-process flags */
7778
int num_child_flags; /* # of entries in PMChildFlags[] */
78-
int next_child_flag; /* next slot to try to assign */
7979
sig_atomic_t PMChildFlags[FLEXIBLE_ARRAY_MEMBER];
8080
};
8181

82+
/* PMSignalState pointer is valid in both postmaster and child processes */
8283
NON_EXEC_STATIC volatile PMSignalData *PMSignalState = NULL;
8384

85+
/*
86+
* These static variables are valid only in the postmaster. We keep a
87+
* duplicative private array so that we can trust its state even if some
88+
* failing child has clobbered the PMSignalData struct in shared memory.
89+
*/
90+
static int num_child_inuse; /* # of entries in PMChildInUse[] */
91+
static int next_child_inuse; /* next slot to try to assign */
92+
static bool *PMChildInUse; /* true if i'th flag slot is assigned */
93+
8494
/*
8595
* Signal handler to be notified if postmaster dies.
8696
*/
@@ -142,7 +152,25 @@ PMSignalShmemInit(void)
142152
{
143153
/* initialize all flags to zeroes */
144154
MemSet(unvolatize(PMSignalData *, PMSignalState), 0, PMSignalShmemSize());
145-
PMSignalState->num_child_flags = MaxLivePostmasterChildren();
155+
num_child_inuse = MaxLivePostmasterChildren();
156+
PMSignalState->num_child_flags = num_child_inuse;
157+
158+
/*
159+
* Also allocate postmaster's private PMChildInUse[] array. We
160+
* might've already done that in a previous shared-memory creation
161+
* cycle, in which case free the old array to avoid a leak. (Do it
162+
* like this to support the possibility that MaxLivePostmasterChildren
163+
* changed.) In a standalone backend, we do not need this.
164+
*/
165+
if (PostmasterContext != NULL)
166+
{
167+
if (PMChildInUse)
168+
pfree(PMChildInUse);
169+
PMChildInUse = (bool *)
170+
MemoryContextAllocZero(PostmasterContext,
171+
num_child_inuse * sizeof(bool));
172+
}
173+
next_child_inuse = 0;
146174
}
147175
}
148176

@@ -218,21 +246,24 @@ GetQuitSignalReason(void)
218246
int
219247
AssignPostmasterChildSlot(void)
220248
{
221-
int slot = PMSignalState->next_child_flag;
249+
int slot = next_child_inuse;
222250
int n;
223251

224252
/*
225-
* Scan for a free slot. We track the last slot assigned so as not to
226-
* waste time repeatedly rescanning low-numbered slots.
253+
* Scan for a free slot. Notice that we trust nothing about the contents
254+
* of PMSignalState, but use only postmaster-local data for this decision.
255+
* We track the last slot assigned so as not to waste time repeatedly
256+
* rescanning low-numbered slots.
227257
*/
228-
for (n = PMSignalState->num_child_flags; n > 0; n--)
258+
for (n = num_child_inuse; n > 0; n--)
229259
{
230260
if (--slot < 0)
231-
slot = PMSignalState->num_child_flags - 1;
232-
if (PMSignalState->PMChildFlags[slot] == PM_CHILD_UNUSED)
261+
slot = num_child_inuse - 1;
262+
if (!PMChildInUse[slot])
233263
{
264+
PMChildInUse[slot] = true;
234265
PMSignalState->PMChildFlags[slot] = PM_CHILD_ASSIGNED;
235-
PMSignalState->next_child_flag = slot;
266+
next_child_inuse = slot;
236267
return slot + 1;
237268
}
238269
}
@@ -254,7 +285,7 @@ ReleasePostmasterChildSlot(int slot)
254285
{
255286
bool result;
256287

257-
Assert(slot > 0 && slot <= PMSignalState->num_child_flags);
288+
Assert(slot > 0 && slot <= num_child_inuse);
258289
slot--;
259290

260291
/*
@@ -264,17 +295,18 @@ ReleasePostmasterChildSlot(int slot)
264295
*/
265296
result = (PMSignalState->PMChildFlags[slot] == PM_CHILD_ASSIGNED);
266297
PMSignalState->PMChildFlags[slot] = PM_CHILD_UNUSED;
298+
PMChildInUse[slot] = false;
267299
return result;
268300
}
269301

270302
/*
271303
* IsPostmasterChildWalSender - check if given slot is in use by a
272-
* walsender process.
304+
* walsender process. This is called only by the postmaster.
273305
*/
274306
bool
275307
IsPostmasterChildWalSender(int slot)
276308
{
277-
Assert(slot > 0 && slot <= PMSignalState->num_child_flags);
309+
Assert(slot > 0 && slot <= num_child_inuse);
278310
slot--;
279311

280312
if (PMSignalState->PMChildFlags[slot] == PM_CHILD_WALSENDER)

0 commit comments

Comments
 (0)