Skip to content

Commit d872510

Browse files
committed
Replace buffer I/O locks with condition variables.
1. Backends waiting for buffer I/O are now interruptible. 2. If something goes wrong in a backend that is currently performing I/O, waiting backends no longer wake up until that backend reaches AbortBufferIO() and broadcasts on the CV. Previously, any waiters would wake up (because the I/O lock was automatically released) and then busy-loop until AbortBufferIO() cleared BM_IO_IN_PROGRESS. 3. LWLockMinimallyPadded is removed, as it would now be unused. Author: Robert Haas <robertmhaas@gmail.com> Reviewed-by: Thomas Munro <thomas.munro@gmail.com> Reviewed-by: Julien Rouhaud <rjuju123@gmail.com> Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> (earlier version, 2016) Discussion: https://postgr.es/m/CA%2BhUKGJ8nBFrjLuCTuqKN0pd2PQOwj9b_jnsiGFFMDvUxahj_A%40mail.gmail.com Discussion: https://postgr.es/m/CA+Tgmoaj2aPti0yho7FeEf2qt-JgQPRWb0gci_o1Hfr=C56Xng@mail.gmail.com
1 parent c3ffe34 commit d872510

File tree

11 files changed

+61
-116
lines changed

11 files changed

+61
-116
lines changed

doc/src/sgml/monitoring.sgml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1586,6 +1586,10 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
15861586
<entry>Waiting for the page number needed to continue a parallel B-tree
15871587
scan to become available.</entry>
15881588
</row>
1589+
<row>
1590+
<entry><literal>BufferIO</literal></entry>
1591+
<entry>Waiting for buffer I/O to complete.</entry>
1592+
</row>
15891593
<row>
15901594
<entry><literal>CheckpointDone</literal></entry>
15911595
<entry>Waiting for a checkpoint to complete.</entry>
@@ -1876,10 +1880,6 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
18761880
<entry><literal>BufferContent</literal></entry>
18771881
<entry>Waiting to access a data page in memory.</entry>
18781882
</row>
1879-
<row>
1880-
<entry><literal>BufferIO</literal></entry>
1881-
<entry>Waiting for I/O on a data page.</entry>
1882-
</row>
18831883
<row>
18841884
<entry><literal>BufferMapping</literal></entry>
18851885
<entry>Waiting to associate a data block with a buffer in the buffer

src/backend/postmaster/pgstat.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4010,6 +4010,9 @@ pgstat_get_wait_ipc(WaitEventIPC w)
40104010
case WAIT_EVENT_BTREE_PAGE:
40114011
event_name = "BtreePage";
40124012
break;
4013+
case WAIT_EVENT_BUFFER_IO:
4014+
event_name = "BufferIO";
4015+
break;
40134016
case WAIT_EVENT_CHECKPOINT_DONE:
40144017
event_name = "CheckpointDone";
40154018
break;

src/backend/storage/buffer/README

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -145,14 +145,11 @@ held within the buffer. Each buffer header also contains an LWLock, the
145145
"buffer content lock", that *does* represent the right to access the data
146146
in the buffer. It is used per the rules above.
147147

148-
There is yet another set of per-buffer LWLocks, the io_in_progress locks,
149-
that are used to wait for I/O on a buffer to complete. The process doing
150-
a read or write takes exclusive lock for the duration, and processes that
151-
need to wait for completion try to take shared locks (which they release
152-
immediately upon obtaining). XXX on systems where an LWLock represents
153-
nontrivial resources, it's fairly annoying to need so many locks. Possibly
154-
we could use per-backend LWLocks instead (a buffer header would then contain
155-
a field to show which backend is doing its I/O).
148+
* The BM_IO_IN_PROGRESS flag acts as a kind of lock, used to wait for I/O on a
149+
buffer to complete (and in releases before 14, it was accompanied by a
150+
per-buffer LWLock). The process doing a read or write sets the flag for the
151+
duration, and processes that need to wait for it to be cleared sleep on a
152+
condition variable.
156153

157154

158155
Normal Buffer Replacement Strategy

src/backend/storage/buffer/buf_init.c

Lines changed: 13 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
BufferDescPadded *BufferDescriptors;
2121
char *BufferBlocks;
22-
LWLockMinimallyPadded *BufferIOLWLockArray = NULL;
22+
ConditionVariableMinimallyPadded *BufferIOCVArray;
2323
WritebackContext BackendWritebackContext;
2424
CkptSortItem *CkptBufferIds;
2525

@@ -68,7 +68,7 @@ InitBufferPool(void)
6868
{
6969
bool foundBufs,
7070
foundDescs,
71-
foundIOLocks,
71+
foundIOCV,
7272
foundBufCkpt;
7373

7474
/* Align descriptors to a cacheline boundary. */
@@ -81,11 +81,11 @@ InitBufferPool(void)
8181
ShmemInitStruct("Buffer Blocks",
8282
NBuffers * (Size) BLCKSZ, &foundBufs);
8383

84-
/* Align lwlocks to cacheline boundary */
85-
BufferIOLWLockArray = (LWLockMinimallyPadded *)
86-
ShmemInitStruct("Buffer IO Locks",
87-
NBuffers * (Size) sizeof(LWLockMinimallyPadded),
88-
&foundIOLocks);
84+
/* Align condition variables to cacheline boundary. */
85+
BufferIOCVArray = (ConditionVariableMinimallyPadded *)
86+
ShmemInitStruct("Buffer IO Condition Variables",
87+
NBuffers * sizeof(ConditionVariableMinimallyPadded),
88+
&foundIOCV);
8989

9090
/*
9191
* The array used to sort to-be-checkpointed buffer ids is located in
@@ -98,10 +98,10 @@ InitBufferPool(void)
9898
ShmemInitStruct("Checkpoint BufferIds",
9999
NBuffers * sizeof(CkptSortItem), &foundBufCkpt);
100100

101-
if (foundDescs || foundBufs || foundIOLocks || foundBufCkpt)
101+
if (foundDescs || foundBufs || foundIOCV || foundBufCkpt)
102102
{
103103
/* should find all of these, or none of them */
104-
Assert(foundDescs && foundBufs && foundIOLocks && foundBufCkpt);
104+
Assert(foundDescs && foundBufs && foundIOCV && foundBufCkpt);
105105
/* note: this path is only taken in EXEC_BACKEND case */
106106
}
107107
else
@@ -131,8 +131,7 @@ InitBufferPool(void)
131131
LWLockInitialize(BufferDescriptorGetContentLock(buf),
132132
LWTRANCHE_BUFFER_CONTENT);
133133

134-
LWLockInitialize(BufferDescriptorGetIOLock(buf),
135-
LWTRANCHE_BUFFER_IO);
134+
ConditionVariableInit(BufferDescriptorGetIOCV(buf));
136135
}
137136

138137
/* Correct last entry of linked list */
@@ -169,16 +168,9 @@ BufferShmemSize(void)
169168
/* size of stuff controlled by freelist.c */
170169
size = add_size(size, StrategyShmemSize());
171170

172-
/*
173-
* It would be nice to include the I/O locks in the BufferDesc, but that
174-
* would increase the size of a BufferDesc to more than one cache line,
175-
* and benchmarking has shown that keeping every BufferDesc aligned on a
176-
* cache line boundary is important for performance. So, instead, the
177-
* array of I/O locks is allocated in a separate tranche. Because those
178-
* locks are not highly contended, we lay out the array with minimal
179-
* padding.
180-
*/
181-
size = add_size(size, mul_size(NBuffers, sizeof(LWLockMinimallyPadded)));
171+
/* size of I/O condition variables */
172+
size = add_size(size, mul_size(NBuffers,
173+
sizeof(ConditionVariableMinimallyPadded)));
182174
/* to allow aligning the above */
183175
size = add_size(size, PG_CACHE_LINE_SIZE);
184176

src/backend/storage/buffer/bufmgr.c

Lines changed: 18 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1352,9 +1352,10 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
13521352
LWLockRelease(newPartitionLock);
13531353

13541354
/*
1355-
* Buffer contents are currently invalid. Try to get the io_in_progress
1356-
* lock. If StartBufferIO returns false, then someone else managed to
1357-
* read it before we did, so there's nothing left for BufferAlloc() to do.
1355+
* Buffer contents are currently invalid. Try to obtain the right to
1356+
* start I/O. If StartBufferIO returns false, then someone else managed
1357+
* to read it before we did, so there's nothing left for BufferAlloc() to
1358+
* do.
13581359
*/
13591360
if (StartBufferIO(buf, true))
13601361
*foundPtr = false;
@@ -1777,9 +1778,8 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
17771778
*/
17781779
VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
17791780

1780-
/* I'd better not still hold any locks on the buffer */
1781+
/* I'd better not still hold the buffer content lock */
17811782
Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
1782-
Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
17831783

17841784
/*
17851785
* Decrement the shared reference count.
@@ -2742,9 +2742,9 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
27422742
uint32 buf_state;
27432743

27442744
/*
2745-
* Acquire the buffer's io_in_progress lock. If StartBufferIO returns
2746-
* false, then someone else flushed the buffer before we could, so we need
2747-
* not do anything.
2745+
* Try to start an I/O operation. If StartBufferIO returns false, then
2746+
* someone else flushed the buffer before we could, so we need not do
2747+
* anything.
27482748
*/
27492749
if (!StartBufferIO(buf, false))
27502750
return;
@@ -2800,7 +2800,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
28002800
/*
28012801
* Now it's safe to write buffer to disk. Note that no one else should
28022802
* have been able to write it while we were busy with log flushing because
2803-
* we have the io_in_progress lock.
2803+
* only one process at a time can set the BM_IO_IN_PROGRESS bit.
28042804
*/
28052805
bufBlock = BufHdrGetBlock(buf);
28062806

@@ -2835,7 +2835,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
28352835

28362836
/*
28372837
* Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
2838-
* end the io_in_progress state.
2838+
* end the BM_IO_IN_PROGRESS state.
28392839
*/
28402840
TerminateBufferIO(buf, true, 0);
28412841

@@ -4271,7 +4271,7 @@ IsBufferCleanupOK(Buffer buffer)
42714271
* Functions for buffer I/O handling
42724272
*
42734273
* Note: We assume that nested buffer I/O never occurs.
4274-
* i.e at most one io_in_progress lock is held per proc.
4274+
* i.e at most one BM_IO_IN_PROGRESS bit is set per proc.
42754275
*
42764276
* Also note that these are used only for shared buffers, not local ones.
42774277
*/
@@ -4282,13 +4282,9 @@ IsBufferCleanupOK(Buffer buffer)
42824282
static void
42834283
WaitIO(BufferDesc *buf)
42844284
{
4285-
/*
4286-
* Changed to wait until there's no IO - Inoue 01/13/2000
4287-
*
4288-
* Note this is *necessary* because an error abort in the process doing
4289-
* I/O could release the io_in_progress_lock prematurely. See
4290-
* AbortBufferIO.
4291-
*/
4285+
ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
4286+
4287+
ConditionVariablePrepareToSleep(cv);
42924288
for (;;)
42934289
{
42944290
uint32 buf_state;
@@ -4303,9 +4299,9 @@ WaitIO(BufferDesc *buf)
43034299

43044300
if (!(buf_state & BM_IO_IN_PROGRESS))
43054301
break;
4306-
LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
4307-
LWLockRelease(BufferDescriptorGetIOLock(buf));
4302+
ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
43084303
}
4304+
ConditionVariableCancelSleep();
43094305
}
43104306

43114307
/*
@@ -4317,7 +4313,7 @@ WaitIO(BufferDesc *buf)
43174313
* In some scenarios there are race conditions in which multiple backends
43184314
* could attempt the same I/O operation concurrently. If someone else
43194315
* has already started I/O on this buffer then we will block on the
4320-
* io_in_progress lock until he's done.
4316+
* I/O condition variable until he's done.
43214317
*
43224318
* Input operations are only attempted on buffers that are not BM_VALID,
43234319
* and output operations only on buffers that are BM_VALID and BM_DIRTY,
@@ -4335,25 +4331,11 @@ StartBufferIO(BufferDesc *buf, bool forInput)
43354331

43364332
for (;;)
43374333
{
4338-
/*
4339-
* Grab the io_in_progress lock so that other processes can wait for
4340-
* me to finish the I/O.
4341-
*/
4342-
LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
4343-
43444334
buf_state = LockBufHdr(buf);
43454335

43464336
if (!(buf_state & BM_IO_IN_PROGRESS))
43474337
break;
4348-
4349-
/*
4350-
* The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
4351-
* lock isn't held is if the process doing the I/O is recovering from
4352-
* an error (see AbortBufferIO). If that's the case, we must wait for
4353-
* him to get unwedged.
4354-
*/
43554338
UnlockBufHdr(buf, buf_state);
4356-
LWLockRelease(BufferDescriptorGetIOLock(buf));
43574339
WaitIO(buf);
43584340
}
43594341

@@ -4363,7 +4345,6 @@ StartBufferIO(BufferDesc *buf, bool forInput)
43634345
{
43644346
/* someone else already did the I/O */
43654347
UnlockBufHdr(buf, buf_state);
4366-
LWLockRelease(BufferDescriptorGetIOLock(buf));
43674348
return false;
43684349
}
43694350

@@ -4381,7 +4362,6 @@ StartBufferIO(BufferDesc *buf, bool forInput)
43814362
* (Assumptions)
43824363
* My process is executing IO for the buffer
43834364
* BM_IO_IN_PROGRESS bit is set for the buffer
4384-
* We hold the buffer's io_in_progress lock
43854365
* The buffer is Pinned
43864366
*
43874367
* If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
@@ -4413,7 +4393,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
44134393

44144394
InProgressBuf = NULL;
44154395

4416-
LWLockRelease(BufferDescriptorGetIOLock(buf));
4396+
ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
44174397
}
44184398

44194399
/*
@@ -4434,14 +4414,6 @@ AbortBufferIO(void)
44344414
{
44354415
uint32 buf_state;
44364416

4437-
/*
4438-
* Since LWLockReleaseAll has already been called, we're not holding
4439-
* the buffer's io_in_progress_lock. We have to re-acquire it so that
4440-
* we can use TerminateBufferIO. Anyone who's executing WaitIO on the
4441-
* buffer will be in a busy spin until we succeed in doing this.
4442-
*/
4443-
LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
4444-
44454417
buf_state = LockBufHdr(buf);
44464418
Assert(buf_state & BM_IO_IN_PROGRESS);
44474419
if (IsForInput)

src/backend/storage/lmgr/lwlock.c

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -146,8 +146,6 @@ static const char *const BuiltinTrancheNames[] = {
146146
"WALInsert",
147147
/* LWTRANCHE_BUFFER_CONTENT: */
148148
"BufferContent",
149-
/* LWTRANCHE_BUFFER_IO: */
150-
"BufferIO",
151149
/* LWTRANCHE_REPLICATION_ORIGIN_STATE: */
152150
"ReplicationOriginState",
153151
/* LWTRANCHE_REPLICATION_SLOT_IO: */
@@ -469,8 +467,7 @@ CreateLWLocks(void)
469467
StaticAssertStmt(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS,
470468
"MAX_BACKENDS too big for lwlock.c");
471469

472-
StaticAssertStmt(sizeof(LWLock) <= LWLOCK_MINIMAL_SIZE &&
473-
sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
470+
StaticAssertStmt(sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
474471
"Miscalculated LWLock padding");
475472

476473
if (!IsUnderPostmaster)

src/include/pgstat.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -971,6 +971,7 @@ typedef enum
971971
WAIT_EVENT_BGWORKER_SHUTDOWN,
972972
WAIT_EVENT_BGWORKER_STARTUP,
973973
WAIT_EVENT_BTREE_PAGE,
974+
WAIT_EVENT_BUFFER_IO,
974975
WAIT_EVENT_CHECKPOINT_DONE,
975976
WAIT_EVENT_CHECKPOINT_START,
976977
WAIT_EVENT_EXECUTE_GATHER,

src/include/storage/buf_internals.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "port/atomics.h"
1919
#include "storage/buf.h"
2020
#include "storage/bufmgr.h"
21+
#include "storage/condition_variable.h"
2122
#include "storage/latch.h"
2223
#include "storage/lwlock.h"
2324
#include "storage/shmem.h"
@@ -184,7 +185,6 @@ typedef struct BufferDesc
184185

185186
int wait_backend_pid; /* backend PID of pin-count waiter */
186187
int freeNext; /* link in freelist chain */
187-
188188
LWLock content_lock; /* to lock access to buffer contents */
189189
} BufferDesc;
190190

@@ -221,12 +221,12 @@ typedef union BufferDescPadded
221221

222222
#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
223223

224-
#define BufferDescriptorGetIOLock(bdesc) \
225-
(&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
224+
#define BufferDescriptorGetIOCV(bdesc) \
225+
(&(BufferIOCVArray[(bdesc)->buf_id]).cv)
226226
#define BufferDescriptorGetContentLock(bdesc) \
227227
((LWLock*) (&(bdesc)->content_lock))
228228

229-
extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
229+
extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray;
230230

231231
/*
232232
* The freeNext field is either the index of the next freelist entry,

src/include/storage/condition_variable.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,17 @@ typedef struct
3131
proclist_head wakeup; /* list of wake-able processes */
3232
} ConditionVariable;
3333

34+
/*
35+
* Pad a condition variable to a power-of-two size so that an array of
36+
* condition variables does not cross a cache line boundary.
37+
*/
38+
#define CV_MINIMAL_SIZE (sizeof(ConditionVariable) <= 16 ? 16 : 32)
39+
typedef union ConditionVariableMinimallyPadded
40+
{
41+
ConditionVariable cv;
42+
char pad[CV_MINIMAL_SIZE];
43+
} ConditionVariableMinimallyPadded;
44+
3445
/* Initialize a condition variable. */
3546
extern void ConditionVariableInit(ConditionVariable *cv);
3647

0 commit comments

Comments
 (0)