Skip to content

Commit e525770

Browse files
committed
Prevent concurrent SimpleLruTruncate() for any given SLRU.
The SimpleLruTruncate() header comment states the new coding rule. To achieve this, add locktype "frozenid" and two LWLocks. This closes a rare opportunity for data loss, which manifested as "apparent wraparound" or "could not access status of transaction" errors. Data loss is more likely in pg_multixact, due to released branches' thin margin between multiStopLimit and multiWrapLimit. If a user's physical replication primary logged ": apparent wraparound" messages, the user should rebuild standbys of that primary regardless of symptoms. At less risk is a cluster having emitted "not accepting commands" errors or "must be vacuumed" warnings at some point. One can test a cluster for this data loss by running VACUUM FREEZE in every database. Back-patch to 9.5 (all supported versions). Discussion: https://postgr.es/m/20190218073103.GA1434723@rfd.leadboat.com
1 parent dea0709 commit e525770

File tree

12 files changed

+119
-16
lines changed

12 files changed

+119
-16
lines changed

doc/src/sgml/catalogs.sgml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8886,7 +8886,8 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</>:<replaceable>&lt;salt&gt;<
88868886
and general database objects (identified by class OID and object OID,
88878887
in the same way as in <structname>pg_description</structname> or
88888888
<structname>pg_depend</structname>). Also, the right to extend a
8889-
relation is represented as a separate lockable object.
8889+
relation is represented as a separate lockable object, as is the right to
8890+
update <structname>pg_database</structname>.<structfield>datfrozenxid</structfield>.
88908891
Also, <quote>advisory</> locks can be taken on numbers that have
88918892
user-defined meanings.
88928893
</para>
@@ -8912,6 +8913,7 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</>:<replaceable>&lt;salt&gt;<
89128913
Type of the lockable object:
89138914
<literal>relation</>,
89148915
<literal>extend</>,
8916+
<literal>frozenid</literal>,
89158917
<literal>page</>,
89168918
<literal>tuple</>,
89178919
<literal>transactionid</>,

doc/src/sgml/monitoring.sgml

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -845,7 +845,7 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
845845

846846
<tbody>
847847
<row>
848-
<entry morerows="62"><literal>LWLock</></entry>
848+
<entry morerows="64"><literal>LWLock</></entry>
849849
<entry><literal>ShmemIndexLock</></entry>
850850
<entry>Waiting to find or allocate space in shared memory.</entry>
851851
</row>
@@ -1043,6 +1043,16 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
10431043
<entry>Waiting to execute <function>txid_status</function> or update
10441044
the oldest transaction id available to it.</entry>
10451045
</row>
1046+
<row>
1047+
<entry><literal>WrapLimitsVacuumLock</literal></entry>
1048+
<entry>Waiting to update limits on transaction id and multixact
1049+
consumption.</entry>
1050+
</row>
1051+
<row>
1052+
<entry><literal>NotifyQueueTailLock</literal></entry>
1053+
<entry>Waiting to update limit on notification message
1054+
storage.</entry>
1055+
</row>
10461056
<row>
10471057
<entry><literal>clog</></entry>
10481058
<entry>Waiting for I/O on a clog (transaction status) buffer.</entry>
@@ -1118,14 +1128,20 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
11181128
<entry>Waiting for TBM shared iterator lock.</entry>
11191129
</row>
11201130
<row>
1121-
<entry morerows="9"><literal>Lock</></entry>
1131+
<entry morerows="10"><literal>Lock</></entry>
11221132
<entry><literal>relation</></entry>
11231133
<entry>Waiting to acquire a lock on a relation.</entry>
11241134
</row>
11251135
<row>
11261136
<entry><literal>extend</></entry>
11271137
<entry>Waiting to extend a relation.</entry>
11281138
</row>
1139+
<row>
1140+
<entry><literal>frozenid</literal></entry>
1141+
<entry>Waiting to
1142+
update <structname>pg_database</structname>.<structfield>datfrozenxid</structfield>
1143+
and <structname>pg_database</structname>.<structfield>datminmxid</structfield>.</entry>
1144+
</row>
11291145
<row>
11301146
<entry><literal>page</></entry>
11311147
<entry>Waiting to acquire a lock on page of a relation.</entry>

src/backend/access/transam/slru.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1164,6 +1164,14 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
11641164

11651165
/*
11661166
* Remove all segments before the one holding the passed page number
1167+
*
1168+
* All SLRUs prevent concurrent calls to this function, either with an LWLock
1169+
* or by calling it only as part of a checkpoint. Mutual exclusion must begin
1170+
* before computing cutoffPage. Mutual exclusion must end after any limit
1171+
* update that would permit other backends to write fresh data into the
1172+
* segment immediately preceding the one containing cutoffPage. Otherwise,
1173+
* when the SLRU is quite full, SimpleLruTruncate() might delete that segment
1174+
* after it has accrued freshly-written data.
11671175
*/
11681176
void
11691177
SimpleLruTruncate(SlruCtl ctl, int cutoffPage)

src/backend/access/transam/subtrans.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -347,8 +347,8 @@ ExtendSUBTRANS(TransactionId newestXact)
347347
/*
348348
* Remove all SUBTRANS segments before the one holding the passed transaction ID
349349
*
350-
* This is normally called during checkpoint, with oldestXact being the
351-
* oldest TransactionXmin of any running transaction.
350+
* oldestXact is the oldest TransactionXmin of any running transaction. This
351+
* is called only during checkpoint.
352352
*/
353353
void
354354
TruncateSUBTRANS(TransactionId oldestXact)

src/backend/commands/async.c

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -224,19 +224,22 @@ typedef struct QueueBackendStatus
224224
/*
225225
* Shared memory state for LISTEN/NOTIFY (excluding its SLRU stuff)
226226
*
227-
* The AsyncQueueControl structure is protected by the AsyncQueueLock.
227+
* The AsyncQueueControl structure is protected by the AsyncQueueLock and
228+
* NotifyQueueTailLock.
228229
*
229-
* When holding the lock in SHARED mode, backends may only inspect their own
230-
* entries as well as the head and tail pointers. Consequently we can allow a
231-
* backend to update its own record while holding only SHARED lock (since no
232-
* other backend will inspect it).
230+
* When holding AsyncQueueLock in SHARED mode, backends may only inspect their
231+
* own entries as well as the head and tail pointers. Consequently we can
232+
* allow a backend to update its own record while holding only SHARED lock
233+
* (since no other backend will inspect it).
233234
*
234-
* When holding the lock in EXCLUSIVE mode, backends can inspect the entries
235-
* of other backends and also change the head and tail pointers.
235+
* When holding AsyncQueueLock in EXCLUSIVE mode, backends can inspect the
236+
* entries of other backends and also change the head pointer. When holding
237+
* both AsyncQueueLock and NotifyQueueTailLock in EXCLUSIVE mode, backends can
238+
* change the tail pointer.
236239
*
237240
* AsyncCtlLock is used as the control lock for the pg_notify SLRU buffers.
238-
* In order to avoid deadlocks, whenever we need both locks, we always first
239-
* get AsyncQueueLock and then AsyncCtlLock.
241+
* In order to avoid deadlocks, whenever we need multiple locks, we first get
242+
* NotifyQueueTailLock, then AsyncQueueLock, and lastly AsyncCtlLock.
240243
*
241244
* Each backend uses the backend[] array entry with index equal to its
242245
* BackendId (which can range from 1 to MaxBackends). We rely on this to make
@@ -2013,6 +2016,10 @@ asyncQueueAdvanceTail(void)
20132016
int newtailpage;
20142017
int boundary;
20152018

2019+
/* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
2020+
LWLockAcquire(NotifyQueueTailLock, LW_EXCLUSIVE);
2021+
2022+
/* Compute the new tail. */
20162023
LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE);
20172024
min = QUEUE_HEAD;
20182025
for (i = 1; i <= MaxBackends; i++)
@@ -2021,7 +2028,6 @@ asyncQueueAdvanceTail(void)
20212028
min = QUEUE_POS_MIN(min, QUEUE_BACKEND_POS(i));
20222029
}
20232030
oldtailpage = QUEUE_POS_PAGE(QUEUE_TAIL);
2024-
QUEUE_TAIL = min;
20252031
LWLockRelease(AsyncQueueLock);
20262032

20272033
/*
@@ -2041,6 +2047,17 @@ asyncQueueAdvanceTail(void)
20412047
*/
20422048
SimpleLruTruncate(AsyncCtl, newtailpage);
20432049
}
2050+
2051+
/*
2052+
* Advertise the new tail. This changes asyncQueueIsFull()'s verdict for
2053+
* the segment immediately prior to the new tail, allowing fresh data into
2054+
* that segment.
2055+
*/
2056+
LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE);
2057+
QUEUE_TAIL = min;
2058+
LWLockRelease(AsyncQueueLock);
2059+
2060+
LWLockRelease(NotifyQueueTailLock);
20442061
}
20452062

20462063
/*

src/backend/commands/vacuum.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -933,6 +933,14 @@ vac_update_datfrozenxid(void)
933933
bool bogus = false;
934934
bool dirty = false;
935935

936+
/*
937+
* Restrict this task to one backend per database. This avoids race
938+
* conditions that would move datfrozenxid or datminmxid backward. It
939+
* avoids calling vac_truncate_clog() with a datfrozenxid preceding a
940+
* datfrozenxid passed to an earlier vac_truncate_clog() call.
941+
*/
942+
LockDatabaseFrozenIds(ExclusiveLock);
943+
936944
/*
937945
* Initialize the "min" calculation with GetOldestXmin, which is a
938946
* reasonable approximation to the minimum relfrozenxid for not-yet-
@@ -1097,6 +1105,9 @@ vac_truncate_clog(TransactionId frozenXID,
10971105
bool bogus = false;
10981106
bool frozenAlreadyWrapped = false;
10991107

1108+
/* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
1109+
LWLockAcquire(WrapLimitsVacuumLock, LW_EXCLUSIVE);
1110+
11001111
/* init oldest datoids to sync with my frozenXID/minMulti values */
11011112
oldestxid_datoid = MyDatabaseId;
11021113
minmulti_datoid = MyDatabaseId;
@@ -1206,6 +1217,8 @@ vac_truncate_clog(TransactionId frozenXID,
12061217
*/
12071218
SetTransactionIdLimit(frozenXID, oldestxid_datoid);
12081219
SetMultiXactIdLimit(minMulti, minmulti_datoid, false);
1220+
1221+
LWLockRelease(WrapLimitsVacuumLock);
12091222
}
12101223

12111224

src/backend/storage/lmgr/lmgr.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,21 @@ UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
412412
LockRelease(&tag, lockmode, false);
413413
}
414414

415+
/*
416+
* LockDatabaseFrozenIds
417+
*
418+
* This allows one backend per database to execute vac_update_datfrozenxid().
419+
*/
420+
void
421+
LockDatabaseFrozenIds(LOCKMODE lockmode)
422+
{
423+
LOCKTAG tag;
424+
425+
SET_LOCKTAG_DATABASE_FROZEN_IDS(tag, MyDatabaseId);
426+
427+
(void) LockAcquire(&tag, lockmode, false, false);
428+
}
429+
415430
/*
416431
* LockPage
417432
*
@@ -1015,6 +1030,11 @@ DescribeLockTag(StringInfo buf, const LOCKTAG *tag)
10151030
tag->locktag_field2,
10161031
tag->locktag_field1);
10171032
break;
1033+
case LOCKTAG_DATABASE_FROZEN_IDS:
1034+
appendStringInfo(buf,
1035+
_("pg_database.datfrozenxid of database %u"),
1036+
tag->locktag_field1);
1037+
break;
10181038
case LOCKTAG_PAGE:
10191039
appendStringInfo(buf,
10201040
_("page %u of relation %u of database %u"),

src/backend/storage/lmgr/lwlock.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -495,7 +495,7 @@ RegisterLWLockTranches(void)
495495

496496
if (LWLockTrancheArray == NULL)
497497
{
498-
LWLockTranchesAllocated = 64;
498+
LWLockTranchesAllocated = 128;
499499
LWLockTrancheArray = (char **)
500500
MemoryContextAllocZero(TopMemoryContext,
501501
LWLockTranchesAllocated * sizeof(char *));

src/backend/storage/lmgr/lwlocknames.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,3 +50,5 @@ OldSnapshotTimeMapLock 42
5050
BackendRandomLock 43
5151
LogicalRepWorkerLock 44
5252
CLogTruncationLock 45
53+
WrapLimitsVacuumLock 46
54+
NotifyQueueTailLock 47

src/backend/utils/adt/lockfuncs.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
const char *const LockTagTypeNames[] = {
2727
"relation",
2828
"extend",
29+
"frozenid",
2930
"page",
3031
"tuple",
3132
"transactionid",
@@ -245,6 +246,17 @@ pg_lock_status(PG_FUNCTION_ARGS)
245246
nulls[8] = true;
246247
nulls[9] = true;
247248
break;
249+
case LOCKTAG_DATABASE_FROZEN_IDS:
250+
values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1);
251+
nulls[2] = true;
252+
nulls[3] = true;
253+
nulls[4] = true;
254+
nulls[5] = true;
255+
nulls[6] = true;
256+
nulls[7] = true;
257+
nulls[8] = true;
258+
nulls[9] = true;
259+
break;
248260
case LOCKTAG_PAGE:
249261
values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1);
250262
values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2);

src/include/storage/lmgr.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ extern bool ConditionalLockRelationForExtension(Relation relation,
5757
LOCKMODE lockmode);
5858
extern int RelationExtensionLockWaiterCount(Relation relation);
5959

60+
/* Lock to recompute pg_database.datfrozenxid in the current database */
61+
extern void LockDatabaseFrozenIds(LOCKMODE lockmode);
62+
6063
/* Lock a page (currently only used within indexes) */
6164
extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
6265
extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);

src/include/storage/lock.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,8 @@ typedef enum LockTagType
141141
/* ID info for a relation is DB OID + REL OID; DB OID = 0 if shared */
142142
LOCKTAG_RELATION_EXTEND, /* the right to extend a relation */
143143
/* same ID info as RELATION */
144+
LOCKTAG_DATABASE_FROZEN_IDS, /* pg_database.datfrozenxid */
145+
/* ID info for frozen IDs is DB OID */
144146
LOCKTAG_PAGE, /* one page of a relation */
145147
/* ID info for a page is RELATION info + BlockNumber */
146148
LOCKTAG_TUPLE, /* one physical tuple */
@@ -206,6 +208,14 @@ typedef struct LOCKTAG
206208
(locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \
207209
(locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
208210

211+
#define SET_LOCKTAG_DATABASE_FROZEN_IDS(locktag,dboid) \
212+
((locktag).locktag_field1 = (dboid), \
213+
(locktag).locktag_field2 = 0, \
214+
(locktag).locktag_field3 = 0, \
215+
(locktag).locktag_field4 = 0, \
216+
(locktag).locktag_type = LOCKTAG_DATABASE_FROZEN_IDS, \
217+
(locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
218+
209219
#define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \
210220
((locktag).locktag_field1 = (dboid), \
211221
(locktag).locktag_field2 = (reloid), \

0 commit comments

Comments
 (0)