Skip to content

Commit 30e68a2

Browse files
committed
Prevent concurrent SimpleLruTruncate() for any given SLRU.
The SimpleLruTruncate() header comment states the new coding rule. To achieve this, add locktype "frozenid" and two LWLocks. This closes a rare opportunity for data loss, which manifested as "apparent wraparound" or "could not access status of transaction" errors. Data loss is more likely in pg_multixact, due to released branches' thin margin between multiStopLimit and multiWrapLimit. If a user's physical replication primary logged ": apparent wraparound" messages, the user should rebuild standbys of that primary regardless of symptoms. At less risk is a cluster having emitted "not accepting commands" errors or "must be vacuumed" warnings at some point. One can test a cluster for this data loss by running VACUUM FREEZE in every database. Back-patch to 9.5 (all supported versions). Discussion: https://postgr.es/m/20190218073103.GA1434723@rfd.leadboat.com
1 parent 912fb29 commit 30e68a2

File tree

11 files changed

+119
-15
lines changed

11 files changed

+119
-15
lines changed

doc/src/sgml/catalogs.sgml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9095,7 +9095,8 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
90959095
and general database objects (identified by class OID and object OID,
90969096
in the same way as in <structname>pg_description</structname> or
90979097
<structname>pg_depend</structname>). Also, the right to extend a
9098-
relation is represented as a separate lockable object.
9098+
relation is represented as a separate lockable object, as is the right to
9099+
update <structname>pg_database</structname>.<structfield>datfrozenxid</structfield>.
90999100
Also, <quote>advisory</quote> locks can be taken on numbers that have
91009101
user-defined meanings.
91019102
</para>
@@ -9121,6 +9122,7 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
91219122
Type of the lockable object:
91229123
<literal>relation</literal>,
91239124
<literal>extend</literal>,
9125+
<literal>frozenid</literal>,
91249126
<literal>page</literal>,
91259127
<literal>tuple</literal>,
91269128
<literal>transactionid</literal>,

doc/src/sgml/monitoring.sgml

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -885,7 +885,7 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
885885

886886
<tbody>
887887
<row>
888-
<entry morerows="64"><literal>LWLock</literal></entry>
888+
<entry morerows="66"><literal>LWLock</literal></entry>
889889
<entry><literal>ShmemIndexLock</literal></entry>
890890
<entry>Waiting to find or allocate space in shared memory.</entry>
891891
</row>
@@ -1079,6 +1079,16 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
10791079
<entry>Waiting to execute <function>txid_status</function> or update
10801080
the oldest transaction id available to it.</entry>
10811081
</row>
1082+
<row>
1083+
<entry><literal>WrapLimitsVacuumLock</literal></entry>
1084+
<entry>Waiting to update limits on transaction id and multixact
1085+
consumption.</entry>
1086+
</row>
1087+
<row>
1088+
<entry><literal>NotifyQueueTailLock</literal></entry>
1089+
<entry>Waiting to update limit on notification message
1090+
storage.</entry>
1091+
</row>
10821092
<row>
10831093
<entry><literal>clog</literal></entry>
10841094
<entry>Waiting for I/O on a clog (transaction status) buffer.</entry>
@@ -1169,14 +1179,20 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
11691179
counters during Parallel Hash plan execution.</entry>
11701180
</row>
11711181
<row>
1172-
<entry morerows="9"><literal>Lock</literal></entry>
1182+
<entry morerows="10"><literal>Lock</literal></entry>
11731183
<entry><literal>relation</literal></entry>
11741184
<entry>Waiting to acquire a lock on a relation.</entry>
11751185
</row>
11761186
<row>
11771187
<entry><literal>extend</literal></entry>
11781188
<entry>Waiting to extend a relation.</entry>
11791189
</row>
1190+
<row>
1191+
<entry><literal>frozenid</literal></entry>
1192+
<entry>Waiting to
1193+
update <structname>pg_database</structname>.<structfield>datfrozenxid</structfield>
1194+
and <structname>pg_database</structname>.<structfield>datminmxid</structfield>.</entry>
1195+
</row>
11801196
<row>
11811197
<entry><literal>page</literal></entry>
11821198
<entry>Waiting to acquire a lock on page of a relation.</entry>

src/backend/access/transam/slru.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1180,6 +1180,14 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
11801180

11811181
/*
11821182
* Remove all segments before the one holding the passed page number
1183+
*
1184+
* All SLRUs prevent concurrent calls to this function, either with an LWLock
1185+
* or by calling it only as part of a checkpoint. Mutual exclusion must begin
1186+
* before computing cutoffPage. Mutual exclusion must end after any limit
1187+
* update that would permit other backends to write fresh data into the
1188+
* segment immediately preceding the one containing cutoffPage. Otherwise,
1189+
* when the SLRU is quite full, SimpleLruTruncate() might delete that segment
1190+
* after it has accrued freshly-written data.
11831191
*/
11841192
void
11851193
SimpleLruTruncate(SlruCtl ctl, int cutoffPage)

src/backend/access/transam/subtrans.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -349,8 +349,8 @@ ExtendSUBTRANS(TransactionId newestXact)
349349
/*
350350
* Remove all SUBTRANS segments before the one holding the passed transaction ID
351351
*
352-
* This is normally called during checkpoint, with oldestXact being the
353-
* oldest TransactionXmin of any running transaction.
352+
* oldestXact is the oldest TransactionXmin of any running transaction. This
353+
* is called only during checkpoint.
354354
*/
355355
void
356356
TruncateSUBTRANS(TransactionId oldestXact)

src/backend/commands/async.c

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -223,19 +223,22 @@ typedef struct QueueBackendStatus
223223
/*
224224
* Shared memory state for LISTEN/NOTIFY (excluding its SLRU stuff)
225225
*
226-
* The AsyncQueueControl structure is protected by the AsyncQueueLock.
226+
* The AsyncQueueControl structure is protected by the AsyncQueueLock and
227+
* NotifyQueueTailLock.
227228
*
228-
* When holding the lock in SHARED mode, backends may only inspect their own
229-
* entries as well as the head and tail pointers. Consequently we can allow a
230-
* backend to update its own record while holding only SHARED lock (since no
231-
* other backend will inspect it).
229+
* When holding AsyncQueueLock in SHARED mode, backends may only inspect their
230+
* own entries as well as the head and tail pointers. Consequently we can
231+
* allow a backend to update its own record while holding only SHARED lock
232+
* (since no other backend will inspect it).
232233
*
233-
* When holding the lock in EXCLUSIVE mode, backends can inspect the entries
234-
* of other backends and also change the head and tail pointers.
234+
* When holding AsyncQueueLock in EXCLUSIVE mode, backends can inspect the
235+
* entries of other backends and also change the head pointer. When holding
236+
* both AsyncQueueLock and NotifyQueueTailLock in EXCLUSIVE mode, backends can
237+
* change the tail pointer.
235238
*
236239
* AsyncCtlLock is used as the control lock for the pg_notify SLRU buffers.
237-
* In order to avoid deadlocks, whenever we need both locks, we always first
238-
* get AsyncQueueLock and then AsyncCtlLock.
240+
* In order to avoid deadlocks, whenever we need multiple locks, we first get
241+
* NotifyQueueTailLock, then AsyncQueueLock, and lastly AsyncCtlLock.
239242
*
240243
* Each backend uses the backend[] array entry with index equal to its
241244
* BackendId (which can range from 1 to MaxBackends). We rely on this to make
@@ -2012,6 +2015,10 @@ asyncQueueAdvanceTail(void)
20122015
int newtailpage;
20132016
int boundary;
20142017

2018+
/* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
2019+
LWLockAcquire(NotifyQueueTailLock, LW_EXCLUSIVE);
2020+
2021+
/* Compute the new tail. */
20152022
LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE);
20162023
min = QUEUE_HEAD;
20172024
for (i = 1; i <= MaxBackends; i++)
@@ -2020,7 +2027,6 @@ asyncQueueAdvanceTail(void)
20202027
min = QUEUE_POS_MIN(min, QUEUE_BACKEND_POS(i));
20212028
}
20222029
oldtailpage = QUEUE_POS_PAGE(QUEUE_TAIL);
2023-
QUEUE_TAIL = min;
20242030
LWLockRelease(AsyncQueueLock);
20252031

20262032
/*
@@ -2040,6 +2046,17 @@ asyncQueueAdvanceTail(void)
20402046
*/
20412047
SimpleLruTruncate(AsyncCtl, newtailpage);
20422048
}
2049+
2050+
/*
2051+
* Advertise the new tail. This changes asyncQueueIsFull()'s verdict for
2052+
* the segment immediately prior to the new tail, allowing fresh data into
2053+
* that segment.
2054+
*/
2055+
LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE);
2056+
QUEUE_TAIL = min;
2057+
LWLockRelease(AsyncQueueLock);
2058+
2059+
LWLockRelease(NotifyQueueTailLock);
20432060
}
20442061

20452062
/*

src/backend/commands/vacuum.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1295,6 +1295,14 @@ vac_update_datfrozenxid(void)
12951295
bool bogus = false;
12961296
bool dirty = false;
12971297

1298+
/*
1299+
* Restrict this task to one backend per database. This avoids race
1300+
* conditions that would move datfrozenxid or datminmxid backward. It
1301+
* avoids calling vac_truncate_clog() with a datfrozenxid preceding a
1302+
* datfrozenxid passed to an earlier vac_truncate_clog() call.
1303+
*/
1304+
LockDatabaseFrozenIds(ExclusiveLock);
1305+
12981306
/*
12991307
* Initialize the "min" calculation with GetOldestXmin, which is a
13001308
* reasonable approximation to the minimum relfrozenxid for not-yet-
@@ -1484,6 +1492,9 @@ vac_truncate_clog(TransactionId frozenXID,
14841492
bool bogus = false;
14851493
bool frozenAlreadyWrapped = false;
14861494

1495+
/* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
1496+
LWLockAcquire(WrapLimitsVacuumLock, LW_EXCLUSIVE);
1497+
14871498
/* init oldest datoids to sync with my frozenXID/minMulti values */
14881499
oldestxid_datoid = MyDatabaseId;
14891500
minmulti_datoid = MyDatabaseId;
@@ -1593,6 +1604,8 @@ vac_truncate_clog(TransactionId frozenXID,
15931604
*/
15941605
SetTransactionIdLimit(frozenXID, oldestxid_datoid);
15951606
SetMultiXactIdLimit(minMulti, minmulti_datoid, false);
1607+
1608+
LWLockRelease(WrapLimitsVacuumLock);
15961609
}
15971610

15981611

src/backend/storage/lmgr/lmgr.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,21 @@ UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
460460
LockRelease(&tag, lockmode, false);
461461
}
462462

463+
/*
464+
* LockDatabaseFrozenIds
465+
*
466+
* This allows one backend per database to execute vac_update_datfrozenxid().
467+
*/
468+
void
469+
LockDatabaseFrozenIds(LOCKMODE lockmode)
470+
{
471+
LOCKTAG tag;
472+
473+
SET_LOCKTAG_DATABASE_FROZEN_IDS(tag, MyDatabaseId);
474+
475+
(void) LockAcquire(&tag, lockmode, false, false);
476+
}
477+
463478
/*
464479
* LockPage
465480
*
@@ -1098,6 +1113,11 @@ DescribeLockTag(StringInfo buf, const LOCKTAG *tag)
10981113
tag->locktag_field2,
10991114
tag->locktag_field1);
11001115
break;
1116+
case LOCKTAG_DATABASE_FROZEN_IDS:
1117+
appendStringInfo(buf,
1118+
_("pg_database.datfrozenxid of database %u"),
1119+
tag->locktag_field1);
1120+
break;
11011121
case LOCKTAG_PAGE:
11021122
appendStringInfo(buf,
11031123
_("page %u of relation %u of database %u"),

src/backend/storage/lmgr/lwlocknames.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,6 @@ MultiXactTruncationLock 41
4949
OldSnapshotTimeMapLock 42
5050
LogicalRepWorkerLock 43
5151
CLogTruncationLock 44
52+
# 45 was CLogTruncationLock until removal of BackendRandomLock
53+
WrapLimitsVacuumLock 46
54+
NotifyQueueTailLock 47

src/backend/utils/adt/lockfuncs.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
const char *const LockTagTypeNames[] = {
2727
"relation",
2828
"extend",
29+
"frozenid",
2930
"page",
3031
"tuple",
3132
"transactionid",
@@ -245,6 +246,17 @@ pg_lock_status(PG_FUNCTION_ARGS)
245246
nulls[8] = true;
246247
nulls[9] = true;
247248
break;
249+
case LOCKTAG_DATABASE_FROZEN_IDS:
250+
values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1);
251+
nulls[2] = true;
252+
nulls[3] = true;
253+
nulls[4] = true;
254+
nulls[5] = true;
255+
nulls[6] = true;
256+
nulls[7] = true;
257+
nulls[8] = true;
258+
nulls[9] = true;
259+
break;
248260
case LOCKTAG_PAGE:
249261
values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1);
250262
values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2);

src/include/storage/lmgr.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ extern bool ConditionalLockRelationForExtension(Relation relation,
5959
LOCKMODE lockmode);
6060
extern int RelationExtensionLockWaiterCount(Relation relation);
6161

62+
/* Lock to recompute pg_database.datfrozenxid in the current database */
63+
extern void LockDatabaseFrozenIds(LOCKMODE lockmode);
64+
6265
/* Lock a page (currently only used within indexes) */
6366
extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
6467
extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);

src/include/storage/lock.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ typedef enum LockTagType
139139
{
140140
LOCKTAG_RELATION, /* whole relation */
141141
LOCKTAG_RELATION_EXTEND, /* the right to extend a relation */
142+
LOCKTAG_DATABASE_FROZEN_IDS, /* pg_database.datfrozenxid */
142143
LOCKTAG_PAGE, /* one page of a relation */
143144
LOCKTAG_TUPLE, /* one physical tuple */
144145
LOCKTAG_TRANSACTION, /* transaction (for waiting for xact done) */
@@ -195,6 +196,15 @@ typedef struct LOCKTAG
195196
(locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \
196197
(locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
197198

199+
/* ID info for frozen IDs is DB OID */
200+
#define SET_LOCKTAG_DATABASE_FROZEN_IDS(locktag,dboid) \
201+
((locktag).locktag_field1 = (dboid), \
202+
(locktag).locktag_field2 = 0, \
203+
(locktag).locktag_field3 = 0, \
204+
(locktag).locktag_field4 = 0, \
205+
(locktag).locktag_type = LOCKTAG_DATABASE_FROZEN_IDS, \
206+
(locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
207+
198208
/* ID info for a page is RELATION info + BlockNumber */
199209
#define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \
200210
((locktag).locktag_field1 = (dboid), \

0 commit comments

Comments
 (0)