29
29
30
30
#ifdef IOMETHOD_IO_URING_ENABLED
31
31
32
+ #include <sys/mman.h>
33
+ #include <unistd.h>
34
+
32
35
#include <liburing.h>
33
36
34
37
#include "miscadmin.h"
@@ -94,12 +97,32 @@ PgAioUringContext
94
97
struct io_uring io_uring_ring ;
95
98
} PgAioUringContext ;
96
99
100
+ /*
101
+ * Information about the capabilities that io_uring has.
102
+ *
103
+ * Depending on liburing and kernel version different features are
104
+ * supported. At least for the kernel a kernel version check does not suffice
105
+ * as various vendors do backport features to older kernels :(.
106
+ */
107
+ typedef struct PgAioUringCaps
108
+ {
109
+ bool checked ;
110
+ /* -1 if io_uring_queue_init_mem() is unsupported */
111
+ int mem_init_size ;
112
+ } PgAioUringCaps ;
113
+
114
+
97
115
/* PgAioUringContexts for all backends */
98
116
static PgAioUringContext * pgaio_uring_contexts ;
99
117
100
118
/* the current backend's context */
101
119
static PgAioUringContext * pgaio_my_uring_context ;
102
120
121
+ static PgAioUringCaps pgaio_uring_caps =
122
+ {
123
+ .checked = false,
124
+ .mem_init_size = -1 ,
125
+ };
103
126
104
127
static uint32
105
128
pgaio_uring_procs (void )
@@ -111,30 +134,184 @@ pgaio_uring_procs(void)
111
134
return MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS ;
112
135
}
113
136
114
- static Size
137
+ /*
138
+ * Initializes pgaio_uring_caps, unless that's already done.
139
+ */
140
+ static void
141
+ pgaio_uring_check_capabilities (void )
142
+ {
143
+ if (pgaio_uring_caps .checked )
144
+ return ;
145
+
146
+ /*
147
+ * By default io_uring creates a shared memory mapping for each io_uring
148
+ * instance, leading to a large number of memory mappings. Unfortunately a
149
+ * large number of memory mappings slows things down, backend exit is
150
+ * particularly affected. To address that, newer kernels (6.5) support
151
+ * using user-provided memory for the memory, by putting the relevant
152
+ * memory into shared memory we don't need any additional mappings.
153
+ *
154
+ * To know whether this is supported, we unfortunately need to probe the
155
+ * kernel by trying to create a ring with userspace-provided memory. This
156
+ * also has a secondary benefit: We can determine precisely how much
157
+ * memory we need for each io_uring instance.
158
+ */
159
+ #if defined(HAVE_LIBURING_QUEUE_INIT_MEM ) && defined(IORING_SETUP_NO_MMAP )
160
+ {
161
+ struct io_uring test_ring ;
162
+ size_t ring_size ;
163
+ void * ring_ptr ;
164
+ struct io_uring_params p = {0 };
165
+ int ret ;
166
+
167
+ /*
168
+ * Liburing does not yet provide an API to query how much memory a
169
+ * ring will need. So we over-estimate it here. As the memory is freed
170
+ * just below that's small temporary waste of memory.
171
+ *
172
+ * 1MB is more than enough for rings within io_max_concurrency's
173
+ * range.
174
+ */
175
+ ring_size = 1024 * 1024 ;
176
+
177
+ /*
178
+ * Hard to believe a system exists where 1MB would not be a multiple
179
+ * of the page size. But it's cheap to ensure...
180
+ */
181
+ ring_size -= ring_size % sysconf (_SC_PAGESIZE );
182
+
183
+ ring_ptr = mmap (NULL , ring_size , PROT_READ | PROT_WRITE , MAP_SHARED | MAP_ANONYMOUS , -1 , 0 );
184
+ if (ring_ptr == MAP_FAILED )
185
+ elog (ERROR ,
186
+ "mmap(%zu) to determine io_uring_queue_init_mem() support failed: %m" ,
187
+ ring_size );
188
+
189
+ ret = io_uring_queue_init_mem (io_max_concurrency , & test_ring , & p , ring_ptr , ring_size );
190
+ if (ret > 0 )
191
+ {
192
+ pgaio_uring_caps .mem_init_size = ret ;
193
+
194
+ elog (DEBUG1 ,
195
+ "can use combined memory mapping for io_uring, each ring needs %d bytes" ,
196
+ ret );
197
+
198
+ /* clean up the created ring, it was just for a test */
199
+ io_uring_queue_exit (& test_ring );
200
+ }
201
+ else
202
+ {
203
+ /*
204
+ * There are different reasons for ring creation to fail, but it's
205
+ * ok to treat that just as io_uring_queue_init_mem() not being
206
+ * supported. We'll report a more detailed error in
207
+ * pgaio_uring_shmem_init().
208
+ */
209
+ errno = - ret ;
210
+ elog (DEBUG1 ,
211
+ "cannot use combined memory mapping for io_uring, ring creation failed: %m" );
212
+
213
+ }
214
+
215
+ if (munmap (ring_ptr , ring_size ) != 0 )
216
+ elog (ERROR , "munmap() failed: %m" );
217
+ }
218
+ #else
219
+ {
220
+ elog (DEBUG1 ,
221
+ "can't use combined memory mapping for io_uring, kernel or liburing too old" );
222
+ }
223
+ #endif
224
+
225
+ pgaio_uring_caps .checked = true;
226
+ }
227
+
228
+ /*
229
+ * Memory for all PgAioUringContext instances
230
+ */
231
+ static size_t
115
232
pgaio_uring_context_shmem_size (void )
116
233
{
117
234
return mul_size (pgaio_uring_procs (), sizeof (PgAioUringContext ));
118
235
}
119
236
237
+ /*
238
+ * Memory for the combined memory used by io_uring instances. Returns 0 if
239
+ * that is not supported by kernel/liburing.
240
+ */
241
+ static size_t
242
+ pgaio_uring_ring_shmem_size (void )
243
+ {
244
+ size_t sz = 0 ;
245
+
246
+ if (pgaio_uring_caps .mem_init_size > 0 )
247
+ {
248
+ /*
249
+ * Memory for rings needs to be allocated to the page boundary,
250
+ * reserve space. Luckily it does not need to be aligned to hugepage
251
+ * boundaries, even if huge pages are used.
252
+ */
253
+ sz = add_size (sz , sysconf (_SC_PAGESIZE ));
254
+ sz = add_size (sz , mul_size (pgaio_uring_procs (),
255
+ pgaio_uring_caps .mem_init_size ));
256
+ }
257
+
258
+ return sz ;
259
+ }
260
+
120
261
/*
 * Total shared memory required by the io_uring IO method: the context array
 * plus, where supported, the combined per-ring memory.
 */
static size_t
pgaio_uring_shmem_size(void)
{
	/*
	 * Kernel and liburing support for various features influences how much
	 * shmem we need, perform the necessary checks.
	 */
	pgaio_uring_check_capabilities();

	return add_size(pgaio_uring_context_shmem_size(),
					pgaio_uring_ring_shmem_size());
}
125
277
126
278
static void
127
279
pgaio_uring_shmem_init (bool first_time )
128
280
{
129
281
int TotalProcs = pgaio_uring_procs ();
130
282
bool found ;
283
+ char * shmem ;
284
+ size_t ring_mem_remain = 0 ;
285
+ char * ring_mem_next = 0 ;
131
286
132
- pgaio_uring_contexts = (PgAioUringContext * )
133
- ShmemInitStruct ("AioUring" , pgaio_uring_shmem_size (), & found );
134
-
287
+ /*
288
+ * We allocate memory for all PgAioUringContext instances and, if
289
+ * supported, the memory required for each of the io_uring instances, in
290
+ * one ShmemInitStruct().
291
+ */
292
+ shmem = ShmemInitStruct ("AioUringContext" , pgaio_uring_shmem_size (), & found );
135
293
if (found )
136
294
return ;
137
295
296
+ pgaio_uring_contexts = (PgAioUringContext * ) shmem ;
297
+ shmem += pgaio_uring_context_shmem_size ();
298
+
299
+ /* if supported, handle memory alignment / sizing for io_uring memory */
300
+ if (pgaio_uring_caps .mem_init_size > 0 )
301
+ {
302
+ ring_mem_remain = pgaio_uring_ring_shmem_size ();
303
+ ring_mem_next = (char * ) shmem ;
304
+
305
+ /* align to page boundary, see also pgaio_uring_ring_shmem_size() */
306
+ ring_mem_next = (char * ) TYPEALIGN (sysconf (_SC_PAGESIZE ), ring_mem_next );
307
+
308
+ /* account for alignment */
309
+ ring_mem_remain -= ring_mem_next - shmem ;
310
+ shmem += ring_mem_next - shmem ;
311
+
312
+ shmem += ring_mem_remain ;
313
+ }
314
+
138
315
for (int contextno = 0 ; contextno < TotalProcs ; contextno ++ )
139
316
{
140
317
PgAioUringContext * context = & pgaio_uring_contexts [contextno ];
@@ -158,7 +335,28 @@ pgaio_uring_shmem_init(bool first_time)
158
335
* be worth using that - also need to evaluate if that causes
159
336
* noticeable additional contention?
160
337
*/
161
- ret = io_uring_queue_init (io_max_concurrency , & context -> io_uring_ring , 0 );
338
+
339
+ /*
340
+ * If supported (c.f. pgaio_uring_check_capabilities()), create ring
341
+ * with its data in shared memory. Otherwise fall back io_uring
342
+ * creating a memory mapping for each ring.
343
+ */
344
+ #if defined(HAVE_LIBURING_QUEUE_INIT_MEM ) && defined(IORING_SETUP_NO_MMAP )
345
+ if (pgaio_uring_caps .mem_init_size > 0 )
346
+ {
347
+ struct io_uring_params p = {0 };
348
+
349
+ ret = io_uring_queue_init_mem (io_max_concurrency , & context -> io_uring_ring , & p , ring_mem_next , ring_mem_remain );
350
+
351
+ ring_mem_remain -= ret ;
352
+ ring_mem_next += ret ;
353
+ }
354
+ else
355
+ #endif
356
+ {
357
+ ret = io_uring_queue_init (io_max_concurrency , & context -> io_uring_ring , 0 );
358
+ }
359
+
162
360
if (ret < 0 )
163
361
{
164
362
char * hint = NULL ;
0 commit comments