Skip to content

Commit bf91938

Browse files
committed
Fix longstanding recursion hazard in sinval message processing.
LockRelationOid and sibling routines supposed that, if our session already holds the lock they were asked to acquire, they could skip calling AcceptInvalidationMessages on the grounds that we must have already read any remote sinval messages issued against the relation being locked. This is normally true, but there's a critical special case where it's not: processing inside AcceptInvalidationMessages might attempt to access system relations, resulting in a recursive call to acquire a relation lock. Hence, if the outer call had acquired that same system catalog lock, we'd fall through, despite the possibility that there's an as-yet-unread sinval message for that system catalog. This could, for example, result in failure to access a system catalog or index that had just been processed by VACUUM FULL. This is the explanation for buildfarm failures we've been seeing intermittently for the past three months. The bug is far older than that, but commits a54e1f1 et al added a new recursion case within AcceptInvalidationMessages that is apparently easier to hit than any previous case. To fix this, we must not skip calling AcceptInvalidationMessages until we have *finished* a call to it since acquiring a relation lock, not merely acquired the lock. (There's already adequate logic inside AcceptInvalidationMessages to deal with being called recursively.) Fortunately, we can implement that at trivial cost, by adding a flag to LOCALLOCK hashtable entries that tracks whether we know we have completed such a call. There is an API hazard added by this patch for external callers of LockAcquire: if anything is testing for LOCKACQUIRE_ALREADY_HELD, it might be fooled by the new return code LOCKACQUIRE_ALREADY_CLEAR into thinking the lock wasn't already held. This should be a fail-soft condition, though, unless something very bizarre is being done in response to the test. Also, I added an additional output argument to LockAcquireExtended, assuming that that probably isn't called by any outside code given the very limited usefulness of its additional functionality. Back-patch to all supported branches. Discussion: https://postgr.es/m/12259.1532117714@sss.pgh.pa.us
1 parent d4ab396 commit bf91938

File tree

4 files changed

+92
-17
lines changed

4 files changed

+92
-17
lines changed

src/backend/storage/ipc/standby.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -395,8 +395,8 @@ ResolveRecoveryConflictWithLock(Oid dbOid, Oid relOid)
395395
ResolveRecoveryConflictWithVirtualXIDs(backends,
396396
PROCSIG_RECOVERY_CONFLICT_LOCK);
397397

398-
if (LockAcquireExtended(&locktag, AccessExclusiveLock, true, true, false)
399-
!= LOCKACQUIRE_NOT_AVAIL)
398+
if (LockAcquireExtended(&locktag, AccessExclusiveLock, true, true,
399+
false, NULL) != LOCKACQUIRE_NOT_AVAIL)
400400
lock_acquired = true;
401401
}
402402
}
@@ -631,8 +631,8 @@ StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
631631
*/
632632
SET_LOCKTAG_RELATION(locktag, newlock->dbOid, newlock->relOid);
633633

634-
if (LockAcquireExtended(&locktag, AccessExclusiveLock, true, true, false)
635-
== LOCKACQUIRE_NOT_AVAIL)
634+
if (LockAcquireExtended(&locktag, AccessExclusiveLock, true, true,
635+
false, NULL) == LOCKACQUIRE_NOT_AVAIL)
636636
ResolveRecoveryConflictWithLock(newlock->dbOid, newlock->relOid);
637637
}
638638

src/backend/storage/lmgr/lmgr.c

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -87,11 +87,12 @@ void
8787
LockRelationOid(Oid relid, LOCKMODE lockmode)
8888
{
8989
LOCKTAG tag;
90+
LOCALLOCK *locallock;
9091
LockAcquireResult res;
9192

9293
SetLocktagRelationOid(&tag, relid);
9394

94-
res = LockAcquire(&tag, lockmode, false, false);
95+
res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock);
9596

9697
/*
9798
* Now that we have the lock, check for invalidation messages, so that we
@@ -102,9 +103,18 @@ LockRelationOid(Oid relid, LOCKMODE lockmode)
102103
* relcache entry in an undesirable way. (In the case where our own xact
103104
* modifies the rel, the relcache update happens via
104105
* CommandCounterIncrement, not here.)
106+
*
107+
* However, in corner cases where code acts on tables (usually catalogs)
108+
* recursively, we might get here while still processing invalidation
109+
* messages in some outer execution of this function or a sibling. The
110+
* "cleared" status of the lock tells us whether we really are done
111+
* absorbing relevant inval messages.
105112
*/
106-
if (res != LOCKACQUIRE_ALREADY_HELD)
113+
if (res != LOCKACQUIRE_ALREADY_CLEAR)
114+
{
107115
AcceptInvalidationMessages();
116+
MarkLockClear(locallock);
117+
}
108118
}
109119

110120
/*
@@ -120,11 +130,12 @@ bool
120130
ConditionalLockRelationOid(Oid relid, LOCKMODE lockmode)
121131
{
122132
LOCKTAG tag;
133+
LOCALLOCK *locallock;
123134
LockAcquireResult res;
124135

125136
SetLocktagRelationOid(&tag, relid);
126137

127-
res = LockAcquire(&tag, lockmode, false, true);
138+
res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock);
128139

129140
if (res == LOCKACQUIRE_NOT_AVAIL)
130141
return false;
@@ -133,8 +144,11 @@ ConditionalLockRelationOid(Oid relid, LOCKMODE lockmode)
133144
* Now that we have the lock, check for invalidation messages; see notes
134145
* in LockRelationOid.
135146
*/
136-
if (res != LOCKACQUIRE_ALREADY_HELD)
147+
if (res != LOCKACQUIRE_ALREADY_CLEAR)
148+
{
137149
AcceptInvalidationMessages();
150+
MarkLockClear(locallock);
151+
}
138152

139153
return true;
140154
}
@@ -181,20 +195,24 @@ void
181195
LockRelation(Relation relation, LOCKMODE lockmode)
182196
{
183197
LOCKTAG tag;
198+
LOCALLOCK *locallock;
184199
LockAcquireResult res;
185200

186201
SET_LOCKTAG_RELATION(tag,
187202
relation->rd_lockInfo.lockRelId.dbId,
188203
relation->rd_lockInfo.lockRelId.relId);
189204

190-
res = LockAcquire(&tag, lockmode, false, false);
205+
res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock);
191206

192207
/*
193208
* Now that we have the lock, check for invalidation messages; see notes
194209
* in LockRelationOid.
195210
*/
196-
if (res != LOCKACQUIRE_ALREADY_HELD)
211+
if (res != LOCKACQUIRE_ALREADY_CLEAR)
212+
{
197213
AcceptInvalidationMessages();
214+
MarkLockClear(locallock);
215+
}
198216
}
199217

200218
/*
@@ -208,13 +226,14 @@ bool
208226
ConditionalLockRelation(Relation relation, LOCKMODE lockmode)
209227
{
210228
LOCKTAG tag;
229+
LOCALLOCK *locallock;
211230
LockAcquireResult res;
212231

213232
SET_LOCKTAG_RELATION(tag,
214233
relation->rd_lockInfo.lockRelId.dbId,
215234
relation->rd_lockInfo.lockRelId.relId);
216235

217-
res = LockAcquire(&tag, lockmode, false, true);
236+
res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock);
218237

219238
if (res == LOCKACQUIRE_NOT_AVAIL)
220239
return false;
@@ -223,8 +242,11 @@ ConditionalLockRelation(Relation relation, LOCKMODE lockmode)
223242
* Now that we have the lock, check for invalidation messages; see notes
224243
* in LockRelationOid.
225244
*/
226-
if (res != LOCKACQUIRE_ALREADY_HELD)
245+
if (res != LOCKACQUIRE_ALREADY_CLEAR)
246+
{
227247
AcceptInvalidationMessages();
248+
MarkLockClear(locallock);
249+
}
228250

229251
return true;
230252
}

src/backend/storage/lmgr/lock.c

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -658,6 +658,7 @@ LockHasWaiters(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
658658
* LOCKACQUIRE_NOT_AVAIL lock not available, and dontWait=true
659659
* LOCKACQUIRE_OK lock successfully acquired
660660
* LOCKACQUIRE_ALREADY_HELD incremented count for lock already held
661+
* LOCKACQUIRE_ALREADY_CLEAR incremented count for lock already clear
661662
*
662663
* In the normal case where dontWait=false and the caller doesn't need to
663664
* distinguish a freshly acquired lock from one already taken earlier in
@@ -674,7 +675,8 @@ LockAcquire(const LOCKTAG *locktag,
674675
bool sessionLock,
675676
bool dontWait)
676677
{
677-
return LockAcquireExtended(locktag, lockmode, sessionLock, dontWait, true);
678+
return LockAcquireExtended(locktag, lockmode, sessionLock, dontWait,
679+
true, NULL);
678680
}
679681

680682
/*
@@ -685,13 +687,17 @@ LockAcquire(const LOCKTAG *locktag,
685687
* caller to note that the lock table is full and then begin taking
686688
* extreme action to reduce the number of other lock holders before
687689
* retrying the action.
690+
*
691+
* If locallockp isn't NULL, *locallockp receives a pointer to the LOCALLOCK
692+
* table entry if a lock is successfully acquired, or NULL if not.
688693
*/
689694
LockAcquireResult
690695
LockAcquireExtended(const LOCKTAG *locktag,
691696
LOCKMODE lockmode,
692697
bool sessionLock,
693698
bool dontWait,
694-
bool reportMemoryError)
699+
bool reportMemoryError,
700+
LOCALLOCK **locallockp)
695701
{
696702
LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
697703
LockMethod lockMethodTable;
@@ -758,6 +764,7 @@ LockAcquireExtended(const LOCKTAG *locktag,
758764
locallock->numLockOwners = 0;
759765
locallock->maxLockOwners = 8;
760766
locallock->holdsStrongLockCount = FALSE;
767+
locallock->lockCleared = false;
761768
locallock->lockOwners = NULL; /* in case next line fails */
762769
locallock->lockOwners = (LOCALLOCKOWNER *)
763770
MemoryContextAlloc(TopMemoryContext,
@@ -778,13 +785,22 @@ LockAcquireExtended(const LOCKTAG *locktag,
778785
}
779786
hashcode = locallock->hashcode;
780787

788+
if (locallockp)
789+
*locallockp = locallock;
790+
781791
/*
782792
* If we already hold the lock, we can just increase the count locally.
793+
*
794+
* If lockCleared is already set, caller need not worry about absorbing
795+
* sinval messages related to the lock's object.
783796
*/
784797
if (locallock->nLocks > 0)
785798
{
786799
GrantLockLocal(locallock, owner);
787-
return LOCKACQUIRE_ALREADY_HELD;
800+
if (locallock->lockCleared)
801+
return LOCKACQUIRE_ALREADY_CLEAR;
802+
else
803+
return LOCKACQUIRE_ALREADY_HELD;
788804
}
789805

790806
/*
@@ -866,6 +882,10 @@ LockAcquireExtended(const LOCKTAG *locktag,
866882
hashcode))
867883
{
868884
AbortStrongLockAcquire();
885+
if (locallock->nLocks == 0)
886+
RemoveLocalLock(locallock);
887+
if (locallockp)
888+
*locallockp = NULL;
869889
if (reportMemoryError)
870890
ereport(ERROR,
871891
(errcode(ERRCODE_OUT_OF_MEMORY),
@@ -900,6 +920,10 @@ LockAcquireExtended(const LOCKTAG *locktag,
900920
{
901921
AbortStrongLockAcquire();
902922
LWLockRelease(partitionLock);
923+
if (locallock->nLocks == 0)
924+
RemoveLocalLock(locallock);
925+
if (locallockp)
926+
*locallockp = NULL;
903927
if (reportMemoryError)
904928
ereport(ERROR,
905929
(errcode(ERRCODE_OUT_OF_MEMORY),
@@ -965,6 +989,8 @@ LockAcquireExtended(const LOCKTAG *locktag,
965989
LWLockRelease(partitionLock);
966990
if (locallock->nLocks == 0)
967991
RemoveLocalLock(locallock);
992+
if (locallockp)
993+
*locallockp = NULL;
968994
return LOCKACQUIRE_NOT_AVAIL;
969995
}
970996

@@ -1566,6 +1592,20 @@ GrantAwaitedLock(void)
15661592
GrantLockLocal(awaitedLock, awaitedOwner);
15671593
}
15681594

1595+
/*
1596+
* MarkLockClear -- mark an acquired lock as "clear"
1597+
*
1598+
* This means that we know we have absorbed all sinval messages that other
1599+
* sessions generated before we acquired this lock, and so we can confidently
1600+
* assume we know about any catalog changes protected by this lock.
1601+
*/
1602+
void
1603+
MarkLockClear(LOCALLOCK *locallock)
1604+
{
1605+
Assert(locallock->nLocks > 0);
1606+
locallock->lockCleared = true;
1607+
}
1608+
15691609
/*
15701610
* WaitOnLock -- wait to acquire a lock
15711611
*
@@ -1833,6 +1873,15 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
18331873
if (locallock->nLocks > 0)
18341874
return TRUE;
18351875

1876+
/*
1877+
* At this point we can no longer suppose we are clear of invalidation
1878+
* messages related to this lock. Although we'll delete the LOCALLOCK
1879+
* object before any intentional return from this routine, it seems worth
1880+
* the trouble to explicitly reset lockCleared right now, just in case
1881+
* some error prevents us from deleting the LOCALLOCK.
1882+
*/
1883+
locallock->lockCleared = false;
1884+
18361885
/* Attempt fast release of any lock eligible for the fast path. */
18371886
if (EligibleForRelationFastPath(locktag, lockmode) &&
18381887
FastPathLocalUseCount > 0)

src/include/storage/lock.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,7 @@ typedef struct LOCALLOCK
425425
int numLockOwners; /* # of relevant ResourceOwners */
426426
int maxLockOwners; /* allocated size of array */
427427
bool holdsStrongLockCount; /* bumped FastPathStrongRelationLocks */
428+
bool lockCleared; /* we read all sinval msgs for lock */
428429
LOCALLOCKOWNER *lockOwners; /* dynamically resizable array */
429430
} LOCALLOCK;
430431

@@ -459,7 +460,8 @@ typedef enum
459460
{
460461
LOCKACQUIRE_NOT_AVAIL, /* lock not available, and dontWait=true */
461462
LOCKACQUIRE_OK, /* lock successfully acquired */
462-
LOCKACQUIRE_ALREADY_HELD /* incremented count for lock already held */
463+
LOCKACQUIRE_ALREADY_HELD, /* incremented count for lock already held */
464+
LOCKACQUIRE_ALREADY_CLEAR /* incremented count for lock already clear */
463465
} LockAcquireResult;
464466

465467
/* Deadlock states identified by DeadLockCheck() */
@@ -503,8 +505,10 @@ extern LockAcquireResult LockAcquireExtended(const LOCKTAG *locktag,
503505
LOCKMODE lockmode,
504506
bool sessionLock,
505507
bool dontWait,
506-
bool report_memory_error);
508+
bool reportMemoryError,
509+
LOCALLOCK **locallockp);
507510
extern void AbortStrongLockAcquire(void);
511+
extern void MarkLockClear(LOCALLOCK *locallock);
508512
extern bool LockRelease(const LOCKTAG *locktag,
509513
LOCKMODE lockmode, bool sessionLock);
510514
extern void LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks);

0 commit comments

Comments
 (0)