Skip to content

Commit 95e9f92

Browse files
committed
Fix longstanding recursion hazard in sinval message processing.
LockRelationOid and sibling routines supposed that, if our session already holds the lock they were asked to acquire, they could skip calling AcceptInvalidationMessages on the grounds that we must have already read any remote sinval messages issued against the relation being locked. This is normally true, but there's a critical special case where it's not: processing inside AcceptInvalidationMessages might attempt to access system relations, resulting in a recursive call to acquire a relation lock. Hence, if the outer call had acquired that same system catalog lock, we'd fall through, despite the possibility that there's an as-yet-unread sinval message for that system catalog. This could, for example, result in failure to access a system catalog or index that had just been processed by VACUUM FULL. This is the explanation for buildfarm failures we've been seeing intermittently for the past three months. The bug is far older than that, but commits a54e1f1 et al added a new recursion case within AcceptInvalidationMessages that is apparently easier to hit than any previous case. To fix this, we must not skip calling AcceptInvalidationMessages until we have *finished* a call to it since acquiring a relation lock, not merely acquired the lock. (There's already adequate logic inside AcceptInvalidationMessages to deal with being called recursively.) Fortunately, we can implement that at trivial cost, by adding a flag to LOCALLOCK hashtable entries that tracks whether we know we have completed such a call. There is an API hazard added by this patch for external callers of LockAcquire: if anything is testing for LOCKACQUIRE_ALREADY_HELD, it might be fooled by the new return code LOCKACQUIRE_ALREADY_CLEAR into thinking the lock wasn't already held. This should be a fail-soft condition, though, unless something very bizarre is being done in response to the test. Also, I added an additional output argument to LockAcquireExtended, assuming that that probably isn't called by any outside code given the very limited usefulness of its additional functionality. Back-patch to all supported branches. Discussion: https://postgr.es/m/12259.1532117714@sss.pgh.pa.us
1 parent 25ff97b commit 95e9f92

File tree

4 files changed

+92
-17
lines changed

4 files changed

+92
-17
lines changed

src/backend/storage/ipc/standby.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -395,8 +395,8 @@ ResolveRecoveryConflictWithLock(Oid dbOid, Oid relOid)
395395
ResolveRecoveryConflictWithVirtualXIDs(backends,
396396
PROCSIG_RECOVERY_CONFLICT_LOCK);
397397

398-
if (LockAcquireExtended(&locktag, AccessExclusiveLock, true, true, false)
399-
!= LOCKACQUIRE_NOT_AVAIL)
398+
if (LockAcquireExtended(&locktag, AccessExclusiveLock, true, true,
399+
false, NULL) != LOCKACQUIRE_NOT_AVAIL)
400400
lock_acquired = true;
401401
}
402402
}
@@ -631,8 +631,8 @@ StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
631631
*/
632632
SET_LOCKTAG_RELATION(locktag, newlock->dbOid, newlock->relOid);
633633

634-
if (LockAcquireExtended(&locktag, AccessExclusiveLock, true, true, false)
635-
== LOCKACQUIRE_NOT_AVAIL)
634+
if (LockAcquireExtended(&locktag, AccessExclusiveLock, true, true,
635+
false, NULL) == LOCKACQUIRE_NOT_AVAIL)
636636
ResolveRecoveryConflictWithLock(newlock->dbOid, newlock->relOid);
637637
}
638638

src/backend/storage/lmgr/lmgr.c

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -72,11 +72,12 @@ void
7272
LockRelationOid(Oid relid, LOCKMODE lockmode)
7373
{
7474
LOCKTAG tag;
75+
LOCALLOCK *locallock;
7576
LockAcquireResult res;
7677

7778
SetLocktagRelationOid(&tag, relid);
7879

79-
res = LockAcquire(&tag, lockmode, false, false);
80+
res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock);
8081

8182
/*
8283
* Now that we have the lock, check for invalidation messages, so that we
@@ -87,9 +88,18 @@ LockRelationOid(Oid relid, LOCKMODE lockmode)
8788
* relcache entry in an undesirable way. (In the case where our own xact
8889
* modifies the rel, the relcache update happens via
8990
* CommandCounterIncrement, not here.)
91+
*
92+
* However, in corner cases where code acts on tables (usually catalogs)
93+
* recursively, we might get here while still processing invalidation
94+
* messages in some outer execution of this function or a sibling. The
95+
* "cleared" status of the lock tells us whether we really are done
96+
* absorbing relevant inval messages.
9097
*/
91-
if (res != LOCKACQUIRE_ALREADY_HELD)
98+
if (res != LOCKACQUIRE_ALREADY_CLEAR)
99+
{
92100
AcceptInvalidationMessages();
101+
MarkLockClear(locallock);
102+
}
93103
}
94104

95105
/*
@@ -105,11 +115,12 @@ bool
105115
ConditionalLockRelationOid(Oid relid, LOCKMODE lockmode)
106116
{
107117
LOCKTAG tag;
118+
LOCALLOCK *locallock;
108119
LockAcquireResult res;
109120

110121
SetLocktagRelationOid(&tag, relid);
111122

112-
res = LockAcquire(&tag, lockmode, false, true);
123+
res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock);
113124

114125
if (res == LOCKACQUIRE_NOT_AVAIL)
115126
return false;
@@ -118,8 +129,11 @@ ConditionalLockRelationOid(Oid relid, LOCKMODE lockmode)
118129
* Now that we have the lock, check for invalidation messages; see notes
119130
* in LockRelationOid.
120131
*/
121-
if (res != LOCKACQUIRE_ALREADY_HELD)
132+
if (res != LOCKACQUIRE_ALREADY_CLEAR)
133+
{
122134
AcceptInvalidationMessages();
135+
MarkLockClear(locallock);
136+
}
123137

124138
return true;
125139
}
@@ -166,20 +180,24 @@ void
166180
LockRelation(Relation relation, LOCKMODE lockmode)
167181
{
168182
LOCKTAG tag;
183+
LOCALLOCK *locallock;
169184
LockAcquireResult res;
170185

171186
SET_LOCKTAG_RELATION(tag,
172187
relation->rd_lockInfo.lockRelId.dbId,
173188
relation->rd_lockInfo.lockRelId.relId);
174189

175-
res = LockAcquire(&tag, lockmode, false, false);
190+
res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock);
176191

177192
/*
178193
* Now that we have the lock, check for invalidation messages; see notes
179194
* in LockRelationOid.
180195
*/
181-
if (res != LOCKACQUIRE_ALREADY_HELD)
196+
if (res != LOCKACQUIRE_ALREADY_CLEAR)
197+
{
182198
AcceptInvalidationMessages();
199+
MarkLockClear(locallock);
200+
}
183201
}
184202

185203
/*
@@ -193,13 +211,14 @@ bool
193211
ConditionalLockRelation(Relation relation, LOCKMODE lockmode)
194212
{
195213
LOCKTAG tag;
214+
LOCALLOCK *locallock;
196215
LockAcquireResult res;
197216

198217
SET_LOCKTAG_RELATION(tag,
199218
relation->rd_lockInfo.lockRelId.dbId,
200219
relation->rd_lockInfo.lockRelId.relId);
201220

202-
res = LockAcquire(&tag, lockmode, false, true);
221+
res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock);
203222

204223
if (res == LOCKACQUIRE_NOT_AVAIL)
205224
return false;
@@ -208,8 +227,11 @@ ConditionalLockRelation(Relation relation, LOCKMODE lockmode)
208227
* Now that we have the lock, check for invalidation messages; see notes
209228
* in LockRelationOid.
210229
*/
211-
if (res != LOCKACQUIRE_ALREADY_HELD)
230+
if (res != LOCKACQUIRE_ALREADY_CLEAR)
231+
{
212232
AcceptInvalidationMessages();
233+
MarkLockClear(locallock);
234+
}
213235

214236
return true;
215237
}

src/backend/storage/lmgr/lock.c

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -658,6 +658,7 @@ LockHasWaiters(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
658658
* LOCKACQUIRE_NOT_AVAIL lock not available, and dontWait=true
659659
* LOCKACQUIRE_OK lock successfully acquired
660660
* LOCKACQUIRE_ALREADY_HELD incremented count for lock already held
661+
* LOCKACQUIRE_ALREADY_CLEAR incremented count for lock already clear
661662
*
662663
* In the normal case where dontWait=false and the caller doesn't need to
663664
* distinguish a freshly acquired lock from one already taken earlier in
@@ -674,7 +675,8 @@ LockAcquire(const LOCKTAG *locktag,
674675
bool sessionLock,
675676
bool dontWait)
676677
{
677-
return LockAcquireExtended(locktag, lockmode, sessionLock, dontWait, true);
678+
return LockAcquireExtended(locktag, lockmode, sessionLock, dontWait,
679+
true, NULL);
678680
}
679681

680682
/*
@@ -685,13 +687,17 @@ LockAcquire(const LOCKTAG *locktag,
685687
* caller to note that the lock table is full and then begin taking
686688
* extreme action to reduce the number of other lock holders before
687689
* retrying the action.
690+
*
691+
* If locallockp isn't NULL, *locallockp receives a pointer to the LOCALLOCK
692+
* table entry if a lock is successfully acquired, or NULL if not.
688693
*/
689694
LockAcquireResult
690695
LockAcquireExtended(const LOCKTAG *locktag,
691696
LOCKMODE lockmode,
692697
bool sessionLock,
693698
bool dontWait,
694-
bool reportMemoryError)
699+
bool reportMemoryError,
700+
LOCALLOCK **locallockp)
695701
{
696702
LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
697703
LockMethod lockMethodTable;
@@ -758,6 +764,7 @@ LockAcquireExtended(const LOCKTAG *locktag,
758764
locallock->numLockOwners = 0;
759765
locallock->maxLockOwners = 8;
760766
locallock->holdsStrongLockCount = FALSE;
767+
locallock->lockCleared = false;
761768
locallock->lockOwners = NULL; /* in case next line fails */
762769
locallock->lockOwners = (LOCALLOCKOWNER *)
763770
MemoryContextAlloc(TopMemoryContext,
@@ -778,13 +785,22 @@ LockAcquireExtended(const LOCKTAG *locktag,
778785
}
779786
hashcode = locallock->hashcode;
780787

788+
if (locallockp)
789+
*locallockp = locallock;
790+
781791
/*
782792
* If we already hold the lock, we can just increase the count locally.
793+
*
794+
* If lockCleared is already set, caller need not worry about absorbing
795+
* sinval messages related to the lock's object.
783796
*/
784797
if (locallock->nLocks > 0)
785798
{
786799
GrantLockLocal(locallock, owner);
787-
return LOCKACQUIRE_ALREADY_HELD;
800+
if (locallock->lockCleared)
801+
return LOCKACQUIRE_ALREADY_CLEAR;
802+
else
803+
return LOCKACQUIRE_ALREADY_HELD;
788804
}
789805

790806
/*
@@ -866,6 +882,10 @@ LockAcquireExtended(const LOCKTAG *locktag,
866882
hashcode))
867883
{
868884
AbortStrongLockAcquire();
885+
if (locallock->nLocks == 0)
886+
RemoveLocalLock(locallock);
887+
if (locallockp)
888+
*locallockp = NULL;
869889
if (reportMemoryError)
870890
ereport(ERROR,
871891
(errcode(ERRCODE_OUT_OF_MEMORY),
@@ -900,6 +920,10 @@ LockAcquireExtended(const LOCKTAG *locktag,
900920
{
901921
AbortStrongLockAcquire();
902922
LWLockRelease(partitionLock);
923+
if (locallock->nLocks == 0)
924+
RemoveLocalLock(locallock);
925+
if (locallockp)
926+
*locallockp = NULL;
903927
if (reportMemoryError)
904928
ereport(ERROR,
905929
(errcode(ERRCODE_OUT_OF_MEMORY),
@@ -965,6 +989,8 @@ LockAcquireExtended(const LOCKTAG *locktag,
965989
LWLockRelease(partitionLock);
966990
if (locallock->nLocks == 0)
967991
RemoveLocalLock(locallock);
992+
if (locallockp)
993+
*locallockp = NULL;
968994
return LOCKACQUIRE_NOT_AVAIL;
969995
}
970996

@@ -1567,6 +1593,20 @@ GrantAwaitedLock(void)
15671593
GrantLockLocal(awaitedLock, awaitedOwner);
15681594
}
15691595

1596+
/*
1597+
* MarkLockClear -- mark an acquired lock as "clear"
1598+
*
1599+
* This means that we know we have absorbed all sinval messages that other
1600+
* sessions generated before we acquired this lock, and so we can confidently
1601+
* assume we know about any catalog changes protected by this lock.
1602+
*/
1603+
void
1604+
MarkLockClear(LOCALLOCK *locallock)
1605+
{
1606+
Assert(locallock->nLocks > 0);
1607+
locallock->lockCleared = true;
1608+
}
1609+
15701610
/*
15711611
* WaitOnLock -- wait to acquire a lock
15721612
*
@@ -1834,6 +1874,15 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
18341874
if (locallock->nLocks > 0)
18351875
return TRUE;
18361876

1877+
/*
1878+
* At this point we can no longer suppose we are clear of invalidation
1879+
* messages related to this lock. Although we'll delete the LOCALLOCK
1880+
* object before any intentional return from this routine, it seems worth
1881+
* the trouble to explicitly reset lockCleared right now, just in case
1882+
* some error prevents us from deleting the LOCALLOCK.
1883+
*/
1884+
locallock->lockCleared = false;
1885+
18371886
/* Attempt fast release of any lock eligible for the fast path. */
18381887
if (EligibleForRelationFastPath(locktag, lockmode) &&
18391888
FastPathLocalUseCount > 0)

src/include/storage/lock.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,7 @@ typedef struct LOCALLOCK
425425
int numLockOwners; /* # of relevant ResourceOwners */
426426
int maxLockOwners; /* allocated size of array */
427427
bool holdsStrongLockCount; /* bumped FastPathStrongRelationLocks */
428+
bool lockCleared; /* we read all sinval msgs for lock */
428429
LOCALLOCKOWNER *lockOwners; /* dynamically resizable array */
429430
} LOCALLOCK;
430431

@@ -459,7 +460,8 @@ typedef enum
459460
{
460461
LOCKACQUIRE_NOT_AVAIL, /* lock not available, and dontWait=true */
461462
LOCKACQUIRE_OK, /* lock successfully acquired */
462-
LOCKACQUIRE_ALREADY_HELD /* incremented count for lock already held */
463+
LOCKACQUIRE_ALREADY_HELD, /* incremented count for lock already held */
464+
LOCKACQUIRE_ALREADY_CLEAR /* incremented count for lock already clear */
463465
} LockAcquireResult;
464466

465467
/* Deadlock states identified by DeadLockCheck() */
@@ -501,8 +503,10 @@ extern LockAcquireResult LockAcquireExtended(const LOCKTAG *locktag,
501503
LOCKMODE lockmode,
502504
bool sessionLock,
503505
bool dontWait,
504-
bool report_memory_error);
506+
bool reportMemoryError,
507+
LOCALLOCK **locallockp);
505508
extern void AbortStrongLockAcquire(void);
509+
extern void MarkLockClear(LOCALLOCK *locallock);
506510
extern bool LockRelease(const LOCKTAG *locktag,
507511
LOCKMODE lockmode, bool sessionLock);
508512
extern void LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks);

0 commit comments

Comments
 (0)