Skip to content

Commit 1a31d8c

Browse files
committed
Prevent excess SimpleLruTruncate() deletion.
Every core SLRU wraps around. With the exception of pg_notify, the wrap point can fall in the middle of a page. Account for this in the PagePrecedes callback specification and in SimpleLruTruncate()'s use of said callback. Update each callback implementation to fit the new specification. This changes SerialPagePrecedesLogically() from the style of asyncQueuePagePrecedes() to the style of CLOGPagePrecedes(). (Whereas pg_clog and pg_serial share a key space, pg_serial is nothing like pg_notify.) The bug fixed here has the same symptoms and user followup steps as 592a589. Back-patch to 9.5 (all supported versions). Reviewed by Andrey Borodin and (in earlier versions) by Tom Lane. Discussion: https://postgr.es/m/20190202083822.GC32531@gust.leadboat.com
1 parent fc6d08b commit 1a31d8c

File tree

8 files changed

+312
-79
lines changed

8 files changed

+312
-79
lines changed

src/backend/access/transam/clog.c

+19-8
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,7 @@ CLOGShmemInit(void)
457457
ClogCtl->PagePrecedes = CLOGPagePrecedes;
458458
SimpleLruInit(ClogCtl, "clog", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
459459
CLogControlLock, "pg_clog", LWTRANCHE_CLOG_BUFFERS);
460+
SlruPagePrecedesUnitTests(ClogCtl, CLOG_XACTS_PER_PAGE);
460461
}
461462

462463
/*
@@ -669,13 +670,22 @@ TruncateCLOG(TransactionId oldestXact)
669670

670671

671672
/*
672-
* Decide which of two CLOG page numbers is "older" for truncation purposes.
673+
* Decide whether a CLOG page number is "older" for truncation purposes.
673674
*
674675
* We need to use comparison of TransactionIds here in order to do the right
675-
* thing with wraparound XID arithmetic. However, if we are asked about
676-
* page number zero, we don't want to hand InvalidTransactionId to
677-
* TransactionIdPrecedes: it'll get weird about permanent xact IDs. So,
678-
* offset both xids by FirstNormalTransactionId to avoid that.
676+
* thing with wraparound XID arithmetic. However, TransactionIdPrecedes()
677+
* would get weird about permanent xact IDs. So, offset both such that xid1,
678+
* xid2, and xid2 + CLOG_XACTS_PER_PAGE - 1 are all normal XIDs; this offset
679+
* is relevant to page 0 and to the page preceding page 0.
680+
*
681+
* The page containing oldestXact-2^31 is the important edge case. The
682+
* portion of that page equaling or following oldestXact-2^31 is expendable,
683+
* but the portion preceding oldestXact-2^31 is not. When oldestXact-2^31 is
684+
* the first XID of a page and segment, the entire page and segment is
685+
* expendable, and we could truncate the segment. Recognizing that case would
686+
* require making oldestXact, not just the page containing oldestXact,
687+
* available to this callback. The benefit would be rare and small, so we
688+
* don't optimize that edge case.
679689
*/
680690
static bool
681691
CLOGPagePrecedes(int page1, int page2)
@@ -684,11 +694,12 @@ CLOGPagePrecedes(int page1, int page2)
684694
TransactionId xid2;
685695

686696
xid1 = ((TransactionId) page1) * CLOG_XACTS_PER_PAGE;
687-
xid1 += FirstNormalTransactionId;
697+
xid1 += FirstNormalTransactionId + 1;
688698
xid2 = ((TransactionId) page2) * CLOG_XACTS_PER_PAGE;
689-
xid2 += FirstNormalTransactionId;
699+
xid2 += FirstNormalTransactionId + 1;
690700

691-
return TransactionIdPrecedes(xid1, xid2);
701+
return (TransactionIdPrecedes(xid1, xid2) &&
702+
TransactionIdPrecedes(xid1, xid2 + CLOG_XACTS_PER_PAGE - 1));
692703
}
693704

694705

src/backend/access/transam/commit_ts.c

+25-9
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,7 @@ CommitTsShmemInit(void)
494494
SimpleLruInit(CommitTsCtl, "commit_timestamp", CommitTsShmemBuffers(), 0,
495495
CommitTsControlLock, "pg_commit_ts",
496496
LWTRANCHE_COMMITTS_BUFFERS);
497+
SlruPagePrecedesUnitTests(CommitTsCtl, COMMIT_TS_XACTS_PER_PAGE);
497498

498499
commitTsShared = ShmemInitStruct("CommitTs shared",
499500
sizeof(CommitTimestampShared),
@@ -870,13 +871,27 @@ AdvanceOldestCommitTsXid(TransactionId oldestXact)
870871

871872

872873
/*
873-
* Decide which of two CLOG page numbers is "older" for truncation purposes.
874+
* Decide whether a commitTS page number is "older" for truncation purposes.
875+
* Analogous to CLOGPagePrecedes().
874876
*
875-
* We need to use comparison of TransactionIds here in order to do the right
876-
* thing with wraparound XID arithmetic. However, if we are asked about
877-
* page number zero, we don't want to hand InvalidTransactionId to
878-
* TransactionIdPrecedes: it'll get weird about permanent xact IDs. So,
879-
* offset both xids by FirstNormalTransactionId to avoid that.
877+
* At default BLCKSZ, (1 << 31) % COMMIT_TS_XACTS_PER_PAGE == 128. This
878+
* introduces differences compared to CLOG and the other SLRUs having (1 <<
879+
* 31) % per_page == 0. This function never tests exactly
880+
* TransactionIdPrecedes(x-2^31, x). When the system reaches xidStopLimit,
881+
* there are two possible counts of page boundaries between oldestXact and the
882+
* latest XID assigned, depending on whether oldestXact is within the first
883+
* 128 entries of its page. Since this function doesn't know the location of
884+
* oldestXact within page2, it returns false for one page that actually is
885+
* expendable. This is a wider (yet still negligible) version of the
886+
* truncation opportunity that CLOGPagePrecedes() cannot recognize.
887+
*
888+
* For the sake of a worked example, number entries with decimal values such
889+
* that page1==1 entries range from 1.0 to 1.999. Let N+0.15 be the number of
890+
* pages that 2^31 entries will span (N is an integer). If oldestXact=N+2.1,
891+
* then the final safe XID assignment leaves newestXact=1.95. We keep page 2,
892+
* because entry=2.85 is the border that toggles whether entries precede the
893+
* last entry of the oldestXact page. While page 2 is expendable at
894+
* oldestXact=N+2.1, it would be precious at oldestXact=N+2.9.
880895
*/
881896
static bool
882897
CommitTsPagePrecedes(int page1, int page2)
@@ -885,11 +900,12 @@ CommitTsPagePrecedes(int page1, int page2)
885900
TransactionId xid2;
886901

887902
xid1 = ((TransactionId) page1) * COMMIT_TS_XACTS_PER_PAGE;
888-
xid1 += FirstNormalTransactionId;
903+
xid1 += FirstNormalTransactionId + 1;
889904
xid2 = ((TransactionId) page2) * COMMIT_TS_XACTS_PER_PAGE;
890-
xid2 += FirstNormalTransactionId;
905+
xid2 += FirstNormalTransactionId + 1;
891906

892-
return TransactionIdPrecedes(xid1, xid2);
907+
return (TransactionIdPrecedes(xid1, xid2) &&
908+
TransactionIdPrecedes(xid1, xid2 + COMMIT_TS_XACTS_PER_PAGE - 1));
893909
}
894910

895911

src/backend/access/transam/multixact.c

+24-14
Original file line numberDiff line numberDiff line change
@@ -1830,10 +1830,12 @@ MultiXactShmemInit(void)
18301830
"multixact_offset", NUM_MXACTOFFSET_BUFFERS, 0,
18311831
MultiXactOffsetControlLock, "pg_multixact/offsets",
18321832
LWTRANCHE_MXACTOFFSET_BUFFERS);
1833+
SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE);
18331834
SimpleLruInit(MultiXactMemberCtl,
18341835
"multixact_member", NUM_MXACTMEMBER_BUFFERS, 0,
18351836
MultiXactMemberControlLock, "pg_multixact/members",
18361837
LWTRANCHE_MXACTMEMBER_BUFFERS);
1838+
/* doesn't call SimpleLruTruncate() or meet criteria for unit tests */
18371839

18381840
/* Initialize our shared state struct */
18391841
MultiXactState = ShmemInitStruct("Shared MultiXact State",
@@ -2971,6 +2973,14 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
29712973
* truncate the members SLRU. So we first scan the directory to determine
29722974
* the earliest offsets page number that we can read without error.
29732975
*
2976+
* When nextMXact is less than one segment away from multiWrapLimit,
2977+
* SlruScanDirCbFindEarliest can find some early segment other than the
2978+
* actual earliest. (MultiXactOffsetPagePrecedes(EARLIEST, LATEST)
2979+
* returns false, because not all pairs of entries have the same answer.)
2980+
* That can also arise when an earlier truncation attempt failed unlink()
2981+
* or returned early from this function. The only consequence is
2982+
* returning early, which wastes space that we could have liberated.
2983+
*
29742984
* NB: It's also possible that the page that oldestMulti is on has already
29752985
* been truncated away, and we crashed before updating oldestMulti.
29762986
*/
@@ -3085,15 +3095,11 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
30853095
}
30863096

30873097
/*
3088-
* Decide which of two MultiXactOffset page numbers is "older" for truncation
3089-
* purposes.
3090-
*
3091-
* We need to use comparison of MultiXactId here in order to do the right
3092-
* thing with wraparound. However, if we are asked about page number zero, we
3093-
* don't want to hand InvalidMultiXactId to MultiXactIdPrecedes: it'll get
3094-
* weird. So, offset both multis by FirstMultiXactId to avoid that.
3095-
* (Actually, the current implementation doesn't do anything weird with
3096-
* InvalidMultiXactId, but there's no harm in leaving this code like this.)
3098+
* Decide whether a MultiXactOffset page number is "older" for truncation
3099+
* purposes. Analogous to CLOGPagePrecedes().
3100+
*
3101+
* Offsetting the values is optional, because MultiXactIdPrecedes() has
3102+
* translational symmetry.
30973103
*/
30983104
static bool
30993105
MultiXactOffsetPagePrecedes(int page1, int page2)
@@ -3102,15 +3108,17 @@ MultiXactOffsetPagePrecedes(int page1, int page2)
31023108
MultiXactId multi2;
31033109

31043110
multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE;
3105-
multi1 += FirstMultiXactId;
3111+
multi1 += FirstMultiXactId + 1;
31063112
multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE;
3107-
multi2 += FirstMultiXactId;
3113+
multi2 += FirstMultiXactId + 1;
31083114

3109-
return MultiXactIdPrecedes(multi1, multi2);
3115+
return (MultiXactIdPrecedes(multi1, multi2) &&
3116+
MultiXactIdPrecedes(multi1,
3117+
multi2 + MULTIXACT_OFFSETS_PER_PAGE - 1));
31103118
}
31113119

31123120
/*
3113-
* Decide which of two MultiXactMember page numbers is "older" for truncation
3121+
* Decide whether a MultiXactMember page number is "older" for truncation
31143122
* purposes. There is no "invalid offset number" so use the numbers verbatim.
31153123
*/
31163124
static bool
@@ -3122,7 +3130,9 @@ MultiXactMemberPagePrecedes(int page1, int page2)
31223130
offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE;
31233131
offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE;
31243132

3125-
return MultiXactOffsetPrecedes(offset1, offset2);
3133+
return (MultiXactOffsetPrecedes(offset1, offset2) &&
3134+
MultiXactOffsetPrecedes(offset1,
3135+
offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1));
31263136
}
31273137

31283138
/*

src/backend/access/transam/slru.c

+127-16
Original file line numberDiff line numberDiff line change
@@ -1173,11 +1173,6 @@ SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
11731173
SlruShared shared = ctl->shared;
11741174
int slotno;
11751175

1176-
/*
1177-
* The cutoff point is the start of the segment containing cutoffPage.
1178-
*/
1179-
cutoffPage -= cutoffPage % SLRU_PAGES_PER_SEGMENT;
1180-
11811176
/*
11821177
* Scan shared memory and remove any pages preceding the cutoff page, to
11831178
* ensure we won't rewrite them later. (Since this is normally called in
@@ -1190,9 +1185,7 @@ restart:;
11901185

11911186
/*
11921187
* While we are holding the lock, make an important safety check: the
1193-
* planned cutoff point must be <= the current endpoint page. Otherwise we
1194-
* have already wrapped around, and proceeding with the truncation would
1195-
* risk removing the current segment.
1188+
* current endpoint page must not be eligible for removal.
11961189
*/
11971190
if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage))
11981191
{
@@ -1224,8 +1217,11 @@ restart:;
12241217
* Hmm, we have (or may have) I/O operations acting on the page, so
12251218
* we've got to wait for them to finish and then start again. This is
12261219
* the same logic as in SlruSelectLRUPage. (XXX if page is dirty,
1227-
* wouldn't it be OK to just discard it without writing it? For now,
1228-
* keep the logic the same as it was.)
1220+
* wouldn't it be OK to just discard it without writing it?
1221+
* SlruMayDeleteSegment() uses a stricter qualification, so we might
1222+
* not delete this page in the end; even if we don't delete it, we
1223+
* won't have cause to read its data again. For now, keep the logic
1224+
* the same as it was.)
12291225
*/
12301226
if (shared->page_status[slotno] == SLRU_PAGE_VALID)
12311227
SlruInternalWritePage(ctl, slotno, NULL);
@@ -1315,19 +1311,134 @@ SlruDeleteSegment(SlruCtl ctl, int segno)
13151311
LWLockRelease(shared->ControlLock);
13161312
}
13171313

1314+
/*
1315+
* Determine whether a segment is okay to delete.
1316+
*
1317+
* segpage is the first page of the segment, and cutoffPage is the oldest (in
1318+
* PagePrecedes order) page in the SLRU containing still-useful data. Since
1319+
* every core PagePrecedes callback implements "wrap around", check the
1320+
* segment's first and last pages:
1321+
*
1322+
* first<cutoff && last<cutoff: yes
1323+
* first<cutoff && last>=cutoff: no; cutoff falls inside this segment
1324+
* first>=cutoff && last<cutoff: no; wrap point falls inside this segment
1325+
* first>=cutoff && last>=cutoff: no; every page of this segment is too young
1326+
*/
1327+
static bool
1328+
SlruMayDeleteSegment(SlruCtl ctl, int segpage, int cutoffPage)
1329+
{
1330+
int seg_last_page = segpage + SLRU_PAGES_PER_SEGMENT - 1;
1331+
1332+
Assert(segpage % SLRU_PAGES_PER_SEGMENT == 0);
1333+
1334+
return (ctl->PagePrecedes(segpage, cutoffPage) &&
1335+
ctl->PagePrecedes(seg_last_page, cutoffPage));
1336+
}
1337+
1338+
#ifdef USE_ASSERT_CHECKING
1339+
static void
1340+
SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset)
1341+
{
1342+
TransactionId lhs,
1343+
rhs;
1344+
int newestPage,
1345+
oldestPage;
1346+
TransactionId newestXact,
1347+
oldestXact;
1348+
1349+
/*
1350+
* Compare an XID pair having undefined order (see RFC 1982), a pair at
1351+
* "opposite ends" of the XID space. TransactionIdPrecedes() treats each
1352+
* as preceding the other. If RHS is oldestXact, LHS is the first XID we
1353+
* must not assign.
1354+
*/
1355+
lhs = per_page + offset; /* skip first page to avoid non-normal XIDs */
1356+
rhs = lhs + (1U << 31);
1357+
Assert(TransactionIdPrecedes(lhs, rhs));
1358+
Assert(TransactionIdPrecedes(rhs, lhs));
1359+
Assert(!TransactionIdPrecedes(lhs - 1, rhs));
1360+
Assert(TransactionIdPrecedes(rhs, lhs - 1));
1361+
Assert(TransactionIdPrecedes(lhs + 1, rhs));
1362+
Assert(!TransactionIdPrecedes(rhs, lhs + 1));
1363+
Assert(!TransactionIdFollowsOrEquals(lhs, rhs));
1364+
Assert(!TransactionIdFollowsOrEquals(rhs, lhs));
1365+
Assert(!ctl->PagePrecedes(lhs / per_page, lhs / per_page));
1366+
Assert(!ctl->PagePrecedes(lhs / per_page, rhs / per_page));
1367+
Assert(!ctl->PagePrecedes(rhs / per_page, lhs / per_page));
1368+
Assert(!ctl->PagePrecedes((lhs - per_page) / per_page, rhs / per_page));
1369+
Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page));
1370+
Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page));
1371+
Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page)
1372+
|| (1U << 31) % per_page != 0); /* See CommitTsPagePrecedes() */
1373+
Assert(ctl->PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page)
1374+
|| (1U << 31) % per_page != 0);
1375+
Assert(ctl->PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page));
1376+
Assert(ctl->PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page));
1377+
Assert(!ctl->PagePrecedes(rhs / per_page, (lhs + per_page) / per_page));
1378+
1379+
/*
1380+
* GetNewTransactionId() has assigned the last XID it can safely use, and
1381+
* that XID is in the *LAST* page of the second segment. We must not
1382+
* delete that segment.
1383+
*/
1384+
newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1;
1385+
newestXact = newestPage * per_page + offset;
1386+
Assert(newestXact / per_page == newestPage);
1387+
oldestXact = newestXact + 1;
1388+
oldestXact -= 1U << 31;
1389+
oldestPage = oldestXact / per_page;
1390+
Assert(!SlruMayDeleteSegment(ctl,
1391+
(newestPage -
1392+
newestPage % SLRU_PAGES_PER_SEGMENT),
1393+
oldestPage));
1394+
1395+
/*
1396+
* GetNewTransactionId() has assigned the last XID it can safely use, and
1397+
* that XID is in the *FIRST* page of the second segment. We must not
1398+
* delete that segment.
1399+
*/
1400+
newestPage = SLRU_PAGES_PER_SEGMENT;
1401+
newestXact = newestPage * per_page + offset;
1402+
Assert(newestXact / per_page == newestPage);
1403+
oldestXact = newestXact + 1;
1404+
oldestXact -= 1U << 31;
1405+
oldestPage = oldestXact / per_page;
1406+
Assert(!SlruMayDeleteSegment(ctl,
1407+
(newestPage -
1408+
newestPage % SLRU_PAGES_PER_SEGMENT),
1409+
oldestPage));
1410+
}
1411+
1412+
/*
1413+
* Unit-test a PagePrecedes function.
1414+
*
1415+
* This assumes every uint32 >= FirstNormalTransactionId is a valid key. It
1416+
* assumes each value occupies a contiguous, fixed-size region of SLRU bytes.
1417+
* (MultiXactMemberCtl separates flags from XIDs. AsyncCtl has
1418+
* variable-length entries, no keys, and no random access. These unit tests
1419+
* do not apply to them.)
1420+
*/
1421+
void
1422+
SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page)
1423+
{
1424+
/* Test first, middle and last entries of a page. */
1425+
SlruPagePrecedesTestOffset(ctl, per_page, 0);
1426+
SlruPagePrecedesTestOffset(ctl, per_page, per_page / 2);
1427+
SlruPagePrecedesTestOffset(ctl, per_page, per_page - 1);
1428+
}
1429+
#endif
1430+
13181431
/*
13191432
* SlruScanDirectory callback
1320-
* This callback reports true if there's any segment prior to the one
1321-
* containing the page passed as "data".
1433+
* This callback reports true if there's any segment wholly prior to the
1434+
* one containing the page passed as "data".
13221435
*/
13231436
bool
13241437
SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int segpage, void *data)
13251438
{
13261439
int cutoffPage = *(int *) data;
13271440

1328-
cutoffPage -= cutoffPage % SLRU_PAGES_PER_SEGMENT;
1329-
1330-
if (ctl->PagePrecedes(segpage, cutoffPage))
1441+
if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
13311442
return true; /* found one; don't iterate any more */
13321443

13331444
return false; /* keep going */
@@ -1342,7 +1453,7 @@ SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data)
13421453
{
13431454
int cutoffPage = *(int *) data;
13441455

1345-
if (ctl->PagePrecedes(segpage, cutoffPage))
1456+
if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
13461457
SlruInternalDeleteSegment(ctl, filename);
13471458

13481459
return false; /* keep going */

0 commit comments

Comments
 (0)