Skip to content

Commit 1986ca5

Browse files
committed
Fix race condition in multixact code: it's possible to try to read a
multixact's starting offset before the offset has been stored into the SLRU file. A simple fix would be to hold the MultiXactGenLock until the offset has been stored, but that looks like a big concurrency hit. Instead rely on knowledge that unset offsets will be zero, and loop when we see a zero. This requires a little extra hacking to ensure that zero is never a valid value for the offset. Problem reported by Matteo Beccati, fix ideas from Martijn van Oosterhout, Alvaro Herrera, and Tom Lane.
1 parent 21b748e commit 1986ca5

File tree

1 file changed

+110
-35
lines changed

1 file changed

+110
-35
lines changed

src/backend/access/transam/multixact.c

Lines changed: 110 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
4343
* Portions Copyright (c) 1994, Regents of the University of California
4444
*
45-
* $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.9 2005/10/15 02:49:09 momjian Exp $
45+
* $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.10 2005/10/28 17:27:29 tgl Exp $
4646
*
4747
*-------------------------------------------------------------------------
4848
*/
@@ -631,7 +631,16 @@ CreateMultiXactId(int nxids, TransactionId *xids)
631631
}
632632

633633
/*
634-
* OK, assign the MXID and offsets range to use
634+
* Critical section from here until we've written the data; we don't
635+
* want to error out with a partly written MultiXact structure.
636+
* (In particular, failing to write our start offset after advancing
637+
* nextMXact would effectively corrupt the previous MultiXact.)
638+
*/
639+
START_CRIT_SECTION();
640+
641+
/*
642+
* Assign the MXID and offsets range to use, and make sure there is
643+
* space in the OFFSETs and MEMBERs files.
635644
*/
636645
multi = GetNewMultiXactId(nxids, &offset);
637646

@@ -668,6 +677,9 @@ CreateMultiXactId(int nxids, TransactionId *xids)
668677
/* Now enter the information into the OFFSETs and MEMBERs logs */
669678
RecordNewMultiXact(multi, offset, nxids, xids);
670679

680+
/* Done with critical section */
681+
END_CRIT_SECTION();
682+
671683
/* Store the new MultiXactId in the local cache, too */
672684
mXactCachePut(multi, nxids, xids);
673685

@@ -761,6 +773,7 @@ static MultiXactId
761773
GetNewMultiXactId(int nxids, MultiXactOffset *offset)
762774
{
763775
MultiXactId result;
776+
MultiXactOffset nextOffset;
764777

765778
debug_elog3(DEBUG2, "GetNew: for %d xids", nxids);
766779

@@ -784,19 +797,28 @@ GetNewMultiXactId(int nxids, MultiXactOffset *offset)
784797
* Advance counter. As in GetNewTransactionId(), this must not happen
785798
* until after ExtendMultiXactOffset has succeeded!
786799
*
787-
* We don't care about MultiXactId wraparound here; it will be handled by the
788-
* next iteration. But note that nextMXact may be InvalidMultiXactId
800+
* We don't care about MultiXactId wraparound here; it will be handled by
801+
* the next iteration. But note that nextMXact may be InvalidMultiXactId
789802
* after this routine exits, so anyone else looking at the variable must
790803
* be prepared to deal with that.
791804
*/
792805
(MultiXactState->nextMXact)++;
793806

794807
/*
795-
* Reserve the members space. Same considerations as above.
808+
* Reserve the members space. Same considerations as above. Also, be
809+
* careful not to return zero as the starting offset for any multixact.
810+
* See GetMultiXactIdMembers() for motivation.
796811
*/
797-
*offset = MultiXactState->nextOffset;
812+
nextOffset = MultiXactState->nextOffset;
813+
if (nextOffset == 0)
814+
{
815+
*offset = 1;
816+
nxids++; /* allocate member slot 0 too */
817+
}
818+
else
819+
*offset = nextOffset;
798820

799-
ExtendMultiXactMember(*offset, nxids);
821+
ExtendMultiXactMember(nextOffset, nxids);
800822

801823
MultiXactState->nextOffset += nxids;
802824

@@ -824,6 +846,7 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
824846
MultiXactOffset *offptr;
825847
MultiXactOffset offset;
826848
int length;
849+
int truelength;
827850
int i;
828851
MultiXactId nextMXact;
829852
MultiXactId tmpMXact;
@@ -849,13 +872,13 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
849872
/*
850873
* We check known limits on MultiXact before resorting to the SLRU area.
851874
*
852-
* An ID older than our OldestVisibleMXactId[] entry can't possibly still be
853-
* running, and we'd run the risk of trying to read already-truncated SLRU
854-
* data if we did try to examine it.
875+
* An ID older than our OldestVisibleMXactId[] entry can't possibly still
876+
* be running, and we'd run the risk of trying to read already-truncated
877+
* SLRU data if we did try to examine it.
855878
*
856-
* Conversely, an ID >= nextMXact shouldn't ever be seen here; if it is seen,
857-
* it implies undetected ID wraparound has occurred. We just silently
858-
* assume that such an ID is no longer running.
879+
* Conversely, an ID >= nextMXact shouldn't ever be seen here; if it is
880+
* seen, it implies undetected ID wraparound has occurred. We just
881+
* silently assume that such an ID is no longer running.
859882
*
860883
* Shared lock is enough here since we aren't modifying any global state.
861884
* Also, we can examine our own OldestVisibleMXactId without the lock,
@@ -868,27 +891,58 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
868891
return -1;
869892
}
870893

894+
/*
895+
* Acquire the shared lock just long enough to grab the current counter
896+
* values. We may need both nextMXact and nextOffset; see below.
897+
*/
871898
LWLockAcquire(MultiXactGenLock, LW_SHARED);
872899

873-
if (!MultiXactIdPrecedes(multi, MultiXactState->nextMXact))
900+
nextMXact = MultiXactState->nextMXact;
901+
nextOffset = MultiXactState->nextOffset;
902+
903+
LWLockRelease(MultiXactGenLock);
904+
905+
if (!MultiXactIdPrecedes(multi, nextMXact))
874906
{
875-
LWLockRelease(MultiXactGenLock);
876907
debug_elog2(DEBUG2, "GetMembers: it's too new!");
877908
*xids = NULL;
878909
return -1;
879910
}
880911

881912
/*
882-
* Before releasing the lock, save the current counter values, because the
883-
* target MultiXactId may be just one less than nextMXact. We will need
884-
* to use nextOffset as the endpoint if so.
913+
* Find out the offset at which we need to start reading MultiXactMembers
914+
* and the number of members in the multixact. We determine the latter
915+
* as the difference between this multixact's starting offset and the
916+
* next one's. However, there are some corner cases to worry about:
917+
*
918+
* 1. This multixact may be the latest one created, in which case there
919+
* is no next one to look at. In this case the nextOffset value we just
920+
* saved is the correct endpoint.
921+
*
922+
* 2. The next multixact may still be in process of being filled in:
923+
* that is, another process may have done GetNewMultiXactId but not yet
924+
* written the offset entry for that ID. In that scenario, it is
925+
* guaranteed that the offset entry for that multixact exists (because
926+
* GetNewMultiXactId won't release MultiXactGenLock until it does)
927+
* but contains zero (because we are careful to pre-zero offset pages).
928+
* Because GetNewMultiXactId will never return zero as the starting offset
929+
* for a multixact, when we read zero as the next multixact's offset, we
930+
* know we have this case. We sleep for a bit and try again.
931+
*
932+
* 3. Because GetNewMultiXactId increments offset zero to offset one
933+
* to handle case #2, there is an ambiguity near the point of offset
934+
* wraparound. If we see next multixact's offset is one, is that our
935+
* multixact's actual endpoint, or did it end at zero with a subsequent
936+
* increment? We handle this using the knowledge that if the zero'th
937+
* member slot wasn't filled, it'll contain zero, and zero isn't a valid
938+
* transaction ID so it can't be a multixact member. Therefore, if we
939+
* read a zero from the members array, just ignore it.
940+
*
941+
* This is all pretty messy, but the mess occurs only in infrequent corner
942+
* cases, so it seems better than holding the MultiXactGenLock for a long
943+
* time on every multixact creation.
885944
*/
886-
nextMXact = MultiXactState->nextMXact;
887-
nextOffset = MultiXactState->nextOffset;
888-
889-
LWLockRelease(MultiXactGenLock);
890-
891-
/* Get the offset at which we need to start reading MultiXactMembers */
945+
retry:
892946
LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
893947

894948
pageno = MultiXactIdToOffsetPage(multi);
@@ -899,20 +953,23 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
899953
offptr += entryno;
900954
offset = *offptr;
901955

956+
Assert(offset != 0);
957+
902958
/*
903-
* How many members do we need to read? If we are at the end of the
904-
* assigned MultiXactIds, use the offset just saved above. Else we need
905-
* to check the MultiXactId following ours.
906-
*
907-
* Use the same increment rule as GetNewMultiXactId(), that is, don't handle
908-
* wraparound explicitly until needed.
959+
* Use the same increment rule as GetNewMultiXactId(), that is, don't
960+
* handle wraparound explicitly until needed.
909961
*/
910962
tmpMXact = multi + 1;
911963

912964
if (nextMXact == tmpMXact)
965+
{
966+
/* Corner case 1: there is no next multixact */
913967
length = nextOffset - offset;
968+
}
914969
else
915970
{
971+
MultiXactOffset nextMXOffset;
972+
916973
/* handle wraparound if needed */
917974
if (tmpMXact < FirstMultiXactId)
918975
tmpMXact = FirstMultiXactId;
@@ -927,7 +984,17 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
927984

928985
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
929986
offptr += entryno;
930-
length = *offptr - offset;
987+
nextMXOffset = *offptr;
988+
989+
if (nextMXOffset == 0)
990+
{
991+
/* Corner case 2: next multixact is still being filled in */
992+
LWLockRelease(MultiXactOffsetControlLock);
993+
pg_usleep(1000L);
994+
goto retry;
995+
}
996+
997+
length = nextMXOffset - offset;
931998
}
932999

9331000
LWLockRelease(MultiXactOffsetControlLock);
@@ -938,6 +1005,7 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
9381005
/* Now get the members themselves. */
9391006
LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
9401007

1008+
truelength = 0;
9411009
prev_pageno = -1;
9421010
for (i = 0; i < length; i++, offset++)
9431011
{
@@ -956,19 +1024,26 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
9561024
MultiXactMemberCtl->shared->page_buffer[slotno];
9571025
xactptr += entryno;
9581026

959-
ptr[i] = *xactptr;
1027+
if (!TransactionIdIsValid(*xactptr))
1028+
{
1029+
/* Corner case 3: we must be looking at unused slot zero */
1030+
Assert(offset == 0);
1031+
continue;
1032+
}
1033+
1034+
ptr[truelength++] = *xactptr;
9601035
}
9611036

9621037
LWLockRelease(MultiXactMemberControlLock);
9631038

9641039
/*
9651040
* Copy the result into the local cache.
9661041
*/
967-
mXactCachePut(multi, length, ptr);
1042+
mXactCachePut(multi, truelength, ptr);
9681043

9691044
debug_elog3(DEBUG2, "GetMembers: no cache for %s",
970-
mxid_to_string(multi, length, ptr));
971-
return length;
1045+
mxid_to_string(multi, truelength, ptr));
1046+
return truelength;
9721047
}
9731048

9741049
/*

0 commit comments

Comments
 (0)