@@ -179,6 +179,7 @@ static void CheckArchiveTimeout(void);
 static void BgWriterNap(void);
 static bool IsCheckpointOnSchedule(double progress);
 static bool ImmediateCheckpointRequested(void);
+static bool CompactBgwriterRequestQueue(void);
 
 /* Signal handlers */
 
@@ -1061,14 +1062,15 @@ RequestCheckpoint(int flags)
  * use high values for special flags; that's all internal to md.c, which
  * see for details.)
  *
- * If we are unable to pass over the request (at present, this can happen
- * if the shared memory queue is full), we return false.  That forces
- * the backend to do its own fsync.  We hope that will be even more seldom.
- *
- * Note: we presently make no attempt to eliminate duplicate requests
- * in the requests[] queue.  The bgwriter will have to eliminate dups
- * internally anyway, so we may as well avoid holding the lock longer
- * than we have to here.
+ * To avoid holding the lock for longer than necessary, we normally write
+ * to the requests[] queue without checking for duplicates.  The bgwriter
+ * will have to eliminate dups internally anyway.  However, if we discover
+ * that the queue is full, we make a pass over the entire queue to compact
+ * it.  This is somewhat expensive, but the alternative is for the backend
+ * to perform its own fsync, which is far more expensive in practice.  It
+ * is theoretically possible a backend fsync might still be necessary, if
+ * the queue is full and contains no duplicate entries.  In that case, we
+ * let the backend know by returning false.
  */
 bool
 ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
@@ -1086,8 +1088,15 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
     /* we count non-bgwriter writes even when the request queue overflows */
     BgWriterShmem->num_backend_writes++;
 
+    /*
+     * If the background writer isn't running or the request queue is full,
+     * the backend will have to perform its own fsync request.  But before
+     * forcing that to happen, we can try to compact the background writer
+     * request queue.
+     */
     if (BgWriterShmem->bgwriter_pid == 0 ||
-        BgWriterShmem->num_requests >= BgWriterShmem->max_requests)
+        (BgWriterShmem->num_requests >= BgWriterShmem->max_requests
+         && !CompactBgwriterRequestQueue()))
     {
         LWLockRelease(BgWriterCommLock);
         return false;
@@ -1100,6 +1109,108 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
     return true;
 }
 
+/*
+ * CompactBgwriterRequestQueue
+ *        Remove duplicates from the request queue to avoid backend fsyncs.
+ *
+ * Although a full fsync request queue is not common, it can lead to severe
+ * performance problems when it does happen.  So far, this situation has
+ * only been observed to occur when the system is under heavy write load,
+ * and especially during the "sync" phase of a checkpoint.  Without this
+ * logic, each backend begins doing an fsync for every block written, which
+ * gets very expensive and can slow down the whole system.
+ *
+ * Trying to do this every time the queue is full could lose if there
+ * aren't any removable entries.  But that should be vanishingly rare in
+ * practice: there's one queue entry per shared buffer.
+ */
+static bool
+CompactBgwriterRequestQueue(void)
+{
+    struct BgWriterSlotMapping {
+        BgWriterRequest request;
+        int         slot;
+    };
+
+    int         n,
+                preserve_count;
+    int         num_skipped = 0;
+    HASHCTL     ctl;
+    HTAB       *htab;
+    bool       *skip_slot;
+
+    /* must hold BgWriterCommLock in exclusive mode */
+    Assert(LWLockHeldByMe(BgWriterCommLock));
+
+    /* Initialize temporary hash table */
+    MemSet(&ctl, 0, sizeof(ctl));
+    ctl.keysize = sizeof(BgWriterRequest);
+    ctl.entrysize = sizeof(struct BgWriterSlotMapping);
+    ctl.hash = tag_hash;
+    htab = hash_create("CompactBgwriterRequestQueue",
+                       BgWriterShmem->num_requests,
+                       &ctl,
+                       HASH_ELEM | HASH_FUNCTION);
+
+    /* Initialize skip_slot array */
+    skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
+
+    /*
+     * The basic idea here is that a request can be skipped if it's followed
+     * by a later, identical request.  It might seem more sensible to work
+     * backwards from the end of the queue and check whether a request is
+     * *preceded* by an earlier, identical request, in the hopes of doing less
+     * copying.  But that might change the semantics, if there's an
+     * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so
+     * we do it this way.  It would be possible to be even smarter if we made
+     * the code below understand the specific semantics of such requests (it
+     * could blow away preceding entries that would end up being cancelled
+     * anyhow), but it's not clear that the extra complexity would buy us
+     * anything.
+     */
+    for (n = 0; n < BgWriterShmem->num_requests; ++n)
+    {
+        BgWriterRequest *request;
+        struct BgWriterSlotMapping *slotmap;
+        bool        found;
+
+        request = &BgWriterShmem->requests[n];
+        slotmap = hash_search(htab, request, HASH_ENTER, &found);
+        if (found)
+        {
+            skip_slot[slotmap->slot] = true;
+            ++num_skipped;
+        }
+        slotmap->slot = n;
+    }
+
+    /* Done with the hash table. */
+    hash_destroy(htab);
+
+    /* If no duplicates, we're out of luck. */
+    if (!num_skipped)
+    {
+        pfree(skip_slot);
+        return false;
+    }
+
+    /* We found some duplicates; remove them. */
+    for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n)
+    {
+        if (skip_slot[n])
+            continue;
+        BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n];
+    }
+    ereport(DEBUG1,
+            (errmsg("compacted fsync request queue from %d entries to %d entries",
+                    BgWriterShmem->num_requests, preserve_count)));
+    BgWriterShmem->num_requests = preserve_count;
+
+    /* Cleanup. */
+    pfree(skip_slot);
+    return true;
+}
+
 /*
  * AbsorbFsyncRequests
  *        Retrieve queued fsync requests and pass them to local smgr.
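
The patch above does the compaction in a single pass by tracking each request's latest slot in a dynahash table. As a rough stand-alone illustration of the same rule -- drop any entry that is followed by a later, identical entry, keeping only the last occurrence -- the sketch below uses an invented Request struct, a simple O(n^2) scan instead of a hash table, and a made-up driver; none of it is part of the PostgreSQL sources.

/*
 * Illustrative only: stand-alone analogue of the compaction idea.
 * The Request type, compact_requests(), and main() are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

typedef struct Request
{
    int         relnode;        /* stand-in for RelFileNode */
    int         forknum;
    int         segno;
} Request;

/* Compact reqs[] in place, returning the new number of entries. */
static int
compact_requests(Request *reqs, int n)
{
    int         i,
                j,
                keep = 0;

    for (i = 0; i < n; i++)
    {
        bool        dup_later = false;

        /* skip this entry if an identical one appears later in the queue */
        for (j = i + 1; j < n; j++)
        {
            if (memcmp(&reqs[i], &reqs[j], sizeof(Request)) == 0)
            {
                dup_later = true;
                break;
            }
        }
        if (!dup_later)
            reqs[keep++] = reqs[i];
    }
    return keep;
}

int
main(void)
{
    Request     reqs[] = {
        {1, 0, 0}, {2, 0, 1}, {1, 0, 0}, {2, 0, 1}, {3, 0, 2}
    };
    int         n = compact_requests(reqs, 5);

    printf("compacted to %d entries\n", n);     /* prints 3 */
    return 0;
}

The real code avoids the quadratic scan: because it walks the queue front to back and remembers the most recent slot for each distinct request, any earlier duplicate can be marked for skipping the moment a later copy is seen, so one pass plus one copy loop suffices.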