Skip to content

Commit d4031d7

Browse files
committed
Prevent concurrent SimpleLruTruncate() for any given SLRU.
The SimpleLruTruncate() header comment states the new coding rule. To achieve this, add locktype "frozenid" and two LWLocks. This closes a rare opportunity for data loss, which manifested as "apparent wraparound" or "could not access status of transaction" errors. Data loss is more likely in pg_multixact, due to released branches' thin margin between multiStopLimit and multiWrapLimit. If a user's physical replication primary logged ": apparent wraparound" messages, the user should rebuild standbys of that primary regardless of symptoms. At less risk is a cluster having emitted "not accepting commands" errors or "must be vacuumed" warnings at some point. One can test a cluster for this data loss by running VACUUM FREEZE in every database. Back-patch to 9.5 (all supported versions). Discussion: https://postgr.es/m/20190218073103.GA1434723@rfd.leadboat.com
1 parent 9d472b5 commit d4031d7

File tree

11 files changed

+118
-15
lines changed

11 files changed

+118
-15
lines changed

doc/src/sgml/catalogs.sgml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9005,7 +9005,8 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
90059005
and general database objects (identified by class OID and object OID,
90069006
in the same way as in <structname>pg_description</structname> or
90079007
<structname>pg_depend</structname>). Also, the right to extend a
9008-
relation is represented as a separate lockable object.
9008+
relation is represented as a separate lockable object, as is the right to
9009+
update <structname>pg_database</structname>.<structfield>datfrozenxid</structfield>.
90099010
Also, <quote>advisory</quote> locks can be taken on numbers that have
90109011
user-defined meanings.
90119012
</para>
@@ -9031,6 +9032,7 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
90319032
Type of the lockable object:
90329033
<literal>relation</literal>,
90339034
<literal>extend</literal>,
9035+
<literal>frozenid</literal>,
90349036
<literal>page</literal>,
90359037
<literal>tuple</literal>,
90369038
<literal>transactionid</literal>,

doc/src/sgml/monitoring.sgml

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -849,7 +849,7 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
849849

850850
<tbody>
851851
<row>
852-
<entry morerows="64"><literal>LWLock</literal></entry>
852+
<entry morerows="66"><literal>LWLock</literal></entry>
853853
<entry><literal>ShmemIndexLock</literal></entry>
854854
<entry>Waiting to find or allocate space in shared memory.</entry>
855855
</row>
@@ -1047,6 +1047,16 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
10471047
<entry>Waiting to execute <function>txid_status</function> or update
10481048
the oldest transaction id available to it.</entry>
10491049
</row>
1050+
<row>
1051+
<entry><literal>WrapLimitsVacuumLock</literal></entry>
1052+
<entry>Waiting to update limits on transaction id and multixact
1053+
consumption.</entry>
1054+
</row>
1055+
<row>
1056+
<entry><literal>NotifyQueueTailLock</literal></entry>
1057+
<entry>Waiting to update limit on notification message
1058+
storage.</entry>
1059+
</row>
10501060
<row>
10511061
<entry><literal>clog</literal></entry>
10521062
<entry>Waiting for I/O on a clog (transaction status) buffer.</entry>
@@ -1132,14 +1142,20 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
11321142
counters during Parallel Hash plan execution.</entry>
11331143
</row>
11341144
<row>
1135-
<entry morerows="9"><literal>Lock</literal></entry>
1145+
<entry morerows="10"><literal>Lock</literal></entry>
11361146
<entry><literal>relation</literal></entry>
11371147
<entry>Waiting to acquire a lock on a relation.</entry>
11381148
</row>
11391149
<row>
11401150
<entry><literal>extend</literal></entry>
11411151
<entry>Waiting to extend a relation.</entry>
11421152
</row>
1153+
<row>
1154+
<entry><literal>frozenid</literal></entry>
1155+
<entry>Waiting to
1156+
update <structname>pg_database</structname>.<structfield>datfrozenxid</structfield>
1157+
and <structname>pg_database</structname>.<structfield>datminmxid</structfield>.</entry>
1158+
</row>
11431159
<row>
11441160
<entry><literal>page</literal></entry>
11451161
<entry>Waiting to acquire a lock on page of a relation.</entry>

src/backend/access/transam/slru.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1163,6 +1163,14 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
11631163

11641164
/*
11651165
* Remove all segments before the one holding the passed page number
1166+
*
1167+
* All SLRUs prevent concurrent calls to this function, either with an LWLock
1168+
* or by calling it only as part of a checkpoint. Mutual exclusion must begin
1169+
* before computing cutoffPage. Mutual exclusion must end after any limit
1170+
* update that would permit other backends to write fresh data into the
1171+
* segment immediately preceding the one containing cutoffPage. Otherwise,
1172+
* when the SLRU is quite full, SimpleLruTruncate() might delete that segment
1173+
* after it has accrued freshly-written data.
11661174
*/
11671175
void
11681176
SimpleLruTruncate(SlruCtl ctl, int cutoffPage)

src/backend/access/transam/subtrans.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -347,8 +347,8 @@ ExtendSUBTRANS(TransactionId newestXact)
347347
/*
348348
* Remove all SUBTRANS segments before the one holding the passed transaction ID
349349
*
350-
* This is normally called during checkpoint, with oldestXact being the
351-
* oldest TransactionXmin of any running transaction.
350+
* oldestXact is the oldest TransactionXmin of any running transaction. This
351+
* is called only during checkpoint.
352352
*/
353353
void
354354
TruncateSUBTRANS(TransactionId oldestXact)

src/backend/commands/async.c

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -224,19 +224,22 @@ typedef struct QueueBackendStatus
224224
/*
225225
* Shared memory state for LISTEN/NOTIFY (excluding its SLRU stuff)
226226
*
227-
* The AsyncQueueControl structure is protected by the AsyncQueueLock.
227+
* The AsyncQueueControl structure is protected by the AsyncQueueLock and
228+
* NotifyQueueTailLock.
228229
*
229-
* When holding the lock in SHARED mode, backends may only inspect their own
230-
* entries as well as the head and tail pointers. Consequently we can allow a
231-
* backend to update its own record while holding only SHARED lock (since no
232-
* other backend will inspect it).
230+
* When holding AsyncQueueLock in SHARED mode, backends may only inspect their
231+
* own entries as well as the head and tail pointers. Consequently we can
232+
* allow a backend to update its own record while holding only SHARED lock
233+
* (since no other backend will inspect it).
233234
*
234-
* When holding the lock in EXCLUSIVE mode, backends can inspect the entries
235-
* of other backends and also change the head and tail pointers.
235+
* When holding AsyncQueueLock in EXCLUSIVE mode, backends can inspect the
236+
* entries of other backends and also change the head pointer. When holding
237+
* both AsyncQueueLock and NotifyQueueTailLock in EXCLUSIVE mode, backends can
238+
* change the tail pointer.
236239
*
237240
* AsyncCtlLock is used as the control lock for the pg_notify SLRU buffers.
238-
* In order to avoid deadlocks, whenever we need both locks, we always first
239-
* get AsyncQueueLock and then AsyncCtlLock.
241+
* In order to avoid deadlocks, whenever we need multiple locks, we first get
242+
* NotifyQueueTailLock, then AsyncQueueLock, and lastly AsyncCtlLock.
240243
*
241244
* Each backend uses the backend[] array entry with index equal to its
242245
* BackendId (which can range from 1 to MaxBackends). We rely on this to make
@@ -2013,6 +2016,10 @@ asyncQueueAdvanceTail(void)
20132016
int newtailpage;
20142017
int boundary;
20152018

2019+
/* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
2020+
LWLockAcquire(NotifyQueueTailLock, LW_EXCLUSIVE);
2021+
2022+
/* Compute the new tail. */
20162023
LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE);
20172024
min = QUEUE_HEAD;
20182025
for (i = 1; i <= MaxBackends; i++)
@@ -2021,7 +2028,6 @@ asyncQueueAdvanceTail(void)
20212028
min = QUEUE_POS_MIN(min, QUEUE_BACKEND_POS(i));
20222029
}
20232030
oldtailpage = QUEUE_POS_PAGE(QUEUE_TAIL);
2024-
QUEUE_TAIL = min;
20252031
LWLockRelease(AsyncQueueLock);
20262032

20272033
/*
@@ -2041,6 +2047,17 @@ asyncQueueAdvanceTail(void)
20412047
*/
20422048
SimpleLruTruncate(AsyncCtl, newtailpage);
20432049
}
2050+
2051+
/*
2052+
* Advertise the new tail. This changes asyncQueueIsFull()'s verdict for
2053+
* the segment immediately prior to the new tail, allowing fresh data into
2054+
* that segment.
2055+
*/
2056+
LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE);
2057+
QUEUE_TAIL = min;
2058+
LWLockRelease(AsyncQueueLock);
2059+
2060+
LWLockRelease(NotifyQueueTailLock);
20442061
}
20452062

20462063
/*

src/backend/commands/vacuum.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1017,6 +1017,14 @@ vac_update_datfrozenxid(void)
10171017
bool bogus = false;
10181018
bool dirty = false;
10191019

1020+
/*
1021+
* Restrict this task to one backend per database. This avoids race
1022+
* conditions that would move datfrozenxid or datminmxid backward. It
1023+
* avoids calling vac_truncate_clog() with a datfrozenxid preceding a
1024+
* datfrozenxid passed to an earlier vac_truncate_clog() call.
1025+
*/
1026+
LockDatabaseFrozenIds(ExclusiveLock);
1027+
10201028
/*
10211029
* Initialize the "min" calculation with GetOldestXmin, which is a
10221030
* reasonable approximation to the minimum relfrozenxid for not-yet-
@@ -1181,6 +1189,9 @@ vac_truncate_clog(TransactionId frozenXID,
11811189
bool bogus = false;
11821190
bool frozenAlreadyWrapped = false;
11831191

1192+
/* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
1193+
LWLockAcquire(WrapLimitsVacuumLock, LW_EXCLUSIVE);
1194+
11841195
/* init oldest datoids to sync with my frozenXID/minMulti values */
11851196
oldestxid_datoid = MyDatabaseId;
11861197
minmulti_datoid = MyDatabaseId;
@@ -1290,6 +1301,8 @@ vac_truncate_clog(TransactionId frozenXID,
12901301
*/
12911302
SetTransactionIdLimit(frozenXID, oldestxid_datoid);
12921303
SetMultiXactIdLimit(minMulti, minmulti_datoid, false);
1304+
1305+
LWLockRelease(WrapLimitsVacuumLock);
12931306
}
12941307

12951308

src/backend/storage/lmgr/lmgr.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,21 @@ UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
412412
LockRelease(&tag, lockmode, false);
413413
}
414414

415+
/*
416+
* LockDatabaseFrozenIds
417+
*
418+
* This allows one backend per database to execute vac_update_datfrozenxid().
419+
*/
420+
void
421+
LockDatabaseFrozenIds(LOCKMODE lockmode)
422+
{
423+
LOCKTAG tag;
424+
425+
SET_LOCKTAG_DATABASE_FROZEN_IDS(tag, MyDatabaseId);
426+
427+
(void) LockAcquire(&tag, lockmode, false, false);
428+
}
429+
415430
/*
416431
* LockPage
417432
*
@@ -1015,6 +1030,11 @@ DescribeLockTag(StringInfo buf, const LOCKTAG *tag)
10151030
tag->locktag_field2,
10161031
tag->locktag_field1);
10171032
break;
1033+
case LOCKTAG_DATABASE_FROZEN_IDS:
1034+
appendStringInfo(buf,
1035+
_("pg_database.datfrozenxid of database %u"),
1036+
tag->locktag_field1);
1037+
break;
10181038
case LOCKTAG_PAGE:
10191039
appendStringInfo(buf,
10201040
_("page %u of relation %u of database %u"),

src/backend/storage/lmgr/lwlocknames.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,3 +50,5 @@ OldSnapshotTimeMapLock 42
5050
BackendRandomLock 43
5151
LogicalRepWorkerLock 44
5252
CLogTruncationLock 45
53+
WrapLimitsVacuumLock 46
54+
NotifyQueueTailLock 47

src/backend/utils/adt/lockfuncs.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
const char *const LockTagTypeNames[] = {
2727
"relation",
2828
"extend",
29+
"frozenid",
2930
"page",
3031
"tuple",
3132
"transactionid",
@@ -245,6 +246,17 @@ pg_lock_status(PG_FUNCTION_ARGS)
245246
nulls[8] = true;
246247
nulls[9] = true;
247248
break;
249+
case LOCKTAG_DATABASE_FROZEN_IDS:
250+
values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1);
251+
nulls[2] = true;
252+
nulls[3] = true;
253+
nulls[4] = true;
254+
nulls[5] = true;
255+
nulls[6] = true;
256+
nulls[7] = true;
257+
nulls[8] = true;
258+
nulls[9] = true;
259+
break;
248260
case LOCKTAG_PAGE:
249261
values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1);
250262
values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2);

src/include/storage/lmgr.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ extern bool ConditionalLockRelationForExtension(Relation relation,
5757
LOCKMODE lockmode);
5858
extern int RelationExtensionLockWaiterCount(Relation relation);
5959

60+
/* Lock to recompute pg_database.datfrozenxid in the current database */
61+
extern void LockDatabaseFrozenIds(LOCKMODE lockmode);
62+
6063
/* Lock a page (currently only used within indexes) */
6164
extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
6265
extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);

0 commit comments

Comments
 (0)