
Commit 73a3b1d

Backport fsync queue compaction logic to all supported branches.
This backports commit 7f242d8, except for the counter in pg_stat_bgwriter. The underlying problem (namely, that a full fsync request queue causes terrible checkpoint behavior) continues to be reported in the wild, and this code seems to be safe and robust enough to risk back-porting the fix.
1 parent d8bd584

2 files changed, 127 additions and 10 deletions

src/backend/postmaster/bgwriter.c

Lines changed: 120 additions & 9 deletions
@@ -179,6 +179,7 @@ static void CheckArchiveTimeout(void);
 static void BgWriterNap(void);
 static bool IsCheckpointOnSchedule(double progress);
 static bool ImmediateCheckpointRequested(void);
+static bool CompactBgwriterRequestQueue(void);
 
 /* Signal handlers */
 
@@ -1061,14 +1062,15 @@ RequestCheckpoint(int flags)
  * use high values for special flags; that's all internal to md.c, which
  * see for details.)
  *
- * If we are unable to pass over the request (at present, this can happen
- * if the shared memory queue is full), we return false.  That forces
- * the backend to do its own fsync.  We hope that will be even more seldom.
- *
- * Note: we presently make no attempt to eliminate duplicate requests
- * in the requests[] queue.  The bgwriter will have to eliminate dups
- * internally anyway, so we may as well avoid holding the lock longer
- * than we have to here.
+ * To avoid holding the lock for longer than necessary, we normally write
+ * to the requests[] queue without checking for duplicates.  The bgwriter
+ * will have to eliminate dups internally anyway.  However, if we discover
+ * that the queue is full, we make a pass over the entire queue to compact
+ * it.  This is somewhat expensive, but the alternative is for the backend
+ * to perform its own fsync, which is far more expensive in practice.  It
+ * is theoretically possible a backend fsync might still be necessary, if
+ * the queue is full and contains no duplicate entries.  In that case, we
+ * let the backend know by returning false.
  */
 bool
 ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
@@ -1086,8 +1088,15 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 	/* we count non-bgwriter writes even when the request queue overflows */
 	BgWriterShmem->num_backend_writes++;
 
+	/*
+	 * If the background writer isn't running or the request queue is full,
+	 * the backend will have to perform its own fsync request.  But before
+	 * forcing that to happen, we can try to compact the background writer
+	 * request queue.
+	 */
 	if (BgWriterShmem->bgwriter_pid == 0 ||
-		BgWriterShmem->num_requests >= BgWriterShmem->max_requests)
+		(BgWriterShmem->num_requests >= BgWriterShmem->max_requests
+		 && !CompactBgwriterRequestQueue()))
 	{
 		LWLockRelease(BgWriterCommLock);
 		return false;
@@ -1100,6 +1109,108 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 	return true;
 }
 
+/*
+ * CompactBgwriterRequestQueue
+ *		Remove duplicates from the request queue to avoid backend fsyncs.
+ *
+ * Although a full fsync request queue is not common, it can lead to severe
+ * performance problems when it does happen.  So far, this situation has
+ * only been observed to occur when the system is under heavy write load,
+ * and especially during the "sync" phase of a checkpoint.  Without this
+ * logic, each backend begins doing an fsync for every block written, which
+ * gets very expensive and can slow down the whole system.
+ *
+ * Trying to do this every time the queue is full could lose if there
+ * aren't any removable entries.  But that should be vanishingly rare in
+ * practice: there's one queue entry per shared buffer.
+ */
+static bool
+CompactBgwriterRequestQueue()
+{
+	struct BgWriterSlotMapping {
+		BgWriterRequest request;
+		int			slot;
+	};
+
+	int			n,
+				preserve_count;
+	int			num_skipped = 0;
+	HASHCTL		ctl;
+	HTAB	   *htab;
+	bool	   *skip_slot;
+
+	/* must hold BgWriterCommLock in exclusive mode */
+	Assert(LWLockHeldByMe(BgWriterCommLock));
+
+	/* Initialize temporary hash table */
+	MemSet(&ctl, 0, sizeof(ctl));
+	ctl.keysize = sizeof(BgWriterRequest);
+	ctl.entrysize = sizeof(struct BgWriterSlotMapping);
+	ctl.hash = tag_hash;
+	htab = hash_create("CompactBgwriterRequestQueue",
+					   BgWriterShmem->num_requests,
+					   &ctl,
+					   HASH_ELEM | HASH_FUNCTION);
+
+	/* Initialize skip_slot array */
+	skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
+
+	/*
+	 * The basic idea here is that a request can be skipped if it's followed
+	 * by a later, identical request.  It might seem more sensible to work
+	 * backwards from the end of the queue and check whether a request is
+	 * *preceded* by an earlier, identical request, in the hopes of doing less
+	 * copying.  But that might change the semantics, if there's an
+	 * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so
+	 * we do it this way.  It would be possible to be even smarter if we made
+	 * the code below understand the specific semantics of such requests (it
+	 * could blow away preceding entries that would end up being cancelled
+	 * anyhow), but it's not clear that the extra complexity would buy us
+	 * anything.
+	 */
+	for (n = 0; n < BgWriterShmem->num_requests; ++n)
+	{
+		BgWriterRequest *request;
+		struct BgWriterSlotMapping *slotmap;
+		bool		found;
+
+		request = &BgWriterShmem->requests[n];
+		slotmap = hash_search(htab, request, HASH_ENTER, &found);
+		if (found)
+		{
+			skip_slot[slotmap->slot] = true;
+			++num_skipped;
+		}
+		slotmap->slot = n;
+	}
+
+	/* Done with the hash table. */
+	hash_destroy(htab);
+
+	/* If no duplicates, we're out of luck. */
+	if (!num_skipped)
+	{
+		pfree(skip_slot);
+		return false;
+	}
+
+	/* We found some duplicates; remove them. */
+	for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n)
+	{
+		if (skip_slot[n])
+			continue;
+		BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n];
+	}
+	ereport(DEBUG1,
+			(errmsg("compacted fsync request queue from %d entries to %d entries",
+					BgWriterShmem->num_requests, preserve_count)));
+	BgWriterShmem->num_requests = preserve_count;
+
+	/* Cleanup. */
+	pfree(skip_slot);
+	return true;
+}
+
 /*
  * AbsorbFsyncRequests
  *		Retrieve queued fsync requests and pass them to local smgr.
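The heart of the patch is the keep-the-last-occurrence compaction in CompactBgwriterRequestQueue. Below is a minimal standalone sketch of the same idea; it is not PostgreSQL code — the Request fields are invented, and a simple O(n^2) duplicate scan stands in for the server's dynahash table — but it shows the same mark-then-slide structure.

/* compact_sketch.c: keep-the-last-duplicate queue compaction (illustrative only) */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

typedef struct
{
	int		rel;		/* stand-in for the RelFileNode */
	int		forknum;	/* stand-in for the ForkNumber */
	int		segno;		/* stand-in for the segment number */
} Request;

/* Drop every entry followed by an identical, later entry; return new length. */
static int
compact_queue(Request *queue, int n)
{
	bool	skip[64] = {false};		/* sketch assumes n <= 64 */
	int		kept = 0;

	for (int i = 0; i < n; i++)
		for (int j = i + 1; j < n; j++)
			if (memcmp(&queue[i], &queue[j], sizeof(Request)) == 0)
			{
				skip[i] = true;		/* a later duplicate exists; the later one survives */
				break;
			}

	for (int i = 0; i < n; i++)
		if (!skip[i])
			queue[kept++] = queue[i];	/* slide survivors down, preserving order */
	return kept;
}

int
main(void)
{
	Request	queue[] = {
		{1, 0, 10}, {2, 0, 5}, {1, 0, 10}, {3, 0, 7}, {2, 0, 5}
	};
	int		n = compact_queue(queue, 5);

	printf("compacted 5 entries to %d\n", n);	/* prints 3 */
	for (int i = 0; i < n; i++)
		printf("rel=%d fork=%d segno=%d\n",
			   queue[i].rel, queue[i].forknum, queue[i].segno);
	return 0;
}

In the actual function the duplicate check is a single hash_search() pass keyed on the whole BgWriterRequest, and the compaction only runs once the queue is already full, so the extra work is paid rarely.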

src/backend/storage/smgr/md.c

Lines changed: 7 additions & 1 deletion
@@ -33,7 +33,13 @@
 /* interval for calling AbsorbFsyncRequests in mdsync */
 #define FSYNCS_PER_ABSORB		10
 
-/* special values for the segno arg to RememberFsyncRequest */
+/*
+ * Special values for the segno arg to RememberFsyncRequest.
+ *
+ * Note that CompactBgwriterRequestQueue assumes that it's OK to remove an
+ * fsync request from the queue if an identical, subsequent request is found.
+ * See comments there before making changes here.
+ */
 #define FORGET_RELATION_FSYNC	(InvalidBlockNumber)
 #define FORGET_DATABASE_FSYNC	(InvalidBlockNumber-1)
 #define UNLINK_RELATION_REQUEST	(InvalidBlockNumber-2)
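The forward, keep-the-last direction matters because of the FORGET_* values defined above. A hypothetical queue (not taken from the commit) makes the hazard described in the CompactBgwriterRequestQueue comment concrete:

/*
 * Queue as absorbed, front to back:
 *   [0] fsync request for (rel A, segment 0)
 *   [1] FORGET_RELATION_FSYNC for rel A
 *   [2] fsync request for (rel A, segment 0)     -- duplicate of [0]
 *
 * Uncompacted, absorbing these leaves one pending fsync for rel A:
 * [1] cancels [0], then [2] registers the request again.
 *
 * Keep-the-last compaction drops [0], leaving [FORGET(A), fsync(A, 0)]:
 * still one pending fsync afterwards, so the outcome is unchanged.
 *
 * A keep-the-first pass would instead drop [2], leaving
 * [fsync(A, 0), FORGET(A)]: the FORGET now cancels the only remaining
 * request and the fsync is lost.  That is why the queue is scanned
 * front to back and the later occurrence is the one preserved.
 */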
