
Commit f54af9f

aio: Combine io_uring memory mappings, if supported
By default io_uring creates a shared memory mapping for each io_uring instance, leading to a large number of memory mappings. Unfortunately a large number of memory mappings slows things down; backend exit is particularly affected. To address that, newer kernels (6.5) support using user-provided memory for the rings. By putting the relevant memory into shared memory we don't need any additional mappings.

On a system with a new enough kernel and liburing, there is no discernible overhead when doing a pgbench -S -C anymore.

Reported-by: MARK CALLAGHAN <mdcallag@gmail.com>
Reviewed-by: "Burd, Greg" <greg@burd.me>
Reviewed-by: Jim Nasby <jnasby@upgrade.com>
Discussion: https://postgr.es/m/CAFbpF8OA44_UG+RYJcWH9WjF7E3GA6gka3gvH6nsrSnEe9H0NA@mail.gmail.com
Backpatch-through: 18
1 parent 55a780e commit f54af9f
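
The capability probe the commit performs at shared-memory sizing time can be reproduced outside the server with a few lines of liburing code. The following standalone sketch is not part of the commit; it assumes liburing 2.5 or newer (for io_uring_queue_init_mem()), a kernel with IORING_SETUP_NO_MMAP support (6.5+), an arbitrary queue depth of 64 standing in for io_max_concurrency, and a build command along the lines of "cc probe_init_mem.c -luring".

/* probe_init_mem.c -- standalone sketch, not part of the commit */
#include <liburing.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
    struct io_uring ring;
    struct io_uring_params p = {0};
    size_t      ring_size = 1024 * 1024;    /* generous over-estimate */
    void       *ring_ptr;
    int         ret;

    /* round the buffer down to a multiple of the page size */
    ring_size -= ring_size % sysconf(_SC_PAGESIZE);

    /* shared anonymous memory stands in for PostgreSQL's shmem segment */
    ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
                    MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (ring_ptr == MAP_FAILED)
    {
        perror("mmap");
        return 1;
    }

    ret = io_uring_queue_init_mem(64, &ring, &p, ring_ptr, ring_size);
    if (ret > 0)
    {
        /* success: ret is how many bytes of the buffer one ring consumes */
        printf("combined mapping supported, one ring needs %d bytes\n", ret);
        io_uring_queue_exit(&ring);
    }
    else
        printf("combined mapping not supported: %s\n", strerror(-ret));

    munmap(ring_ptr, ring_size);
    return 0;
}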

File tree

6 files changed: +238 −6 lines changed

configure

Lines changed: 17 additions & 0 deletions
@@ -13309,6 +13309,23 @@ fi
 
 fi
 
+if test "$with_liburing" = yes; then
+  _LIBS="$LIBS"
+  LIBS="$LIBURING_LIBS $LIBS"
+  for ac_func in io_uring_queue_init_mem
+do :
+  ac_fn_c_check_func "$LINENO" "io_uring_queue_init_mem" "ac_cv_func_io_uring_queue_init_mem"
+if test "x$ac_cv_func_io_uring_queue_init_mem" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_IO_URING_QUEUE_INIT_MEM 1
+_ACEOF
+
+fi
+done
+
+  LIBS="$_LIBS"
+fi
+
 if test "$with_lz4" = yes ; then
   { $as_echo "$as_me:${as_lineno-$LINENO}: checking for LZ4_compress_default in -llz4" >&5
 $as_echo_n "checking for LZ4_compress_default in -llz4... " >&6; }

configure.ac

Lines changed: 7 additions & 0 deletions
@@ -1420,6 +1420,13 @@ if test "$with_libxslt" = yes ; then
   AC_CHECK_LIB(xslt, xsltCleanupGlobals, [], [AC_MSG_ERROR([library 'xslt' is required for XSLT support])])
 fi
 
+if test "$with_liburing" = yes; then
+  _LIBS="$LIBS"
+  LIBS="$LIBURING_LIBS $LIBS"
+  AC_CHECK_FUNCS([io_uring_queue_init_mem])
+  LIBS="$_LIBS"
+fi
+
 if test "$with_lz4" = yes ; then
   AC_CHECK_LIB(lz4, LZ4_compress_default, [], [AC_MSG_ERROR([library 'lz4' is required for LZ4 support])])
 fi

meson.build

Lines changed: 6 additions & 0 deletions
@@ -995,6 +995,12 @@ liburingopt = get_option('liburing')
 liburing = dependency('liburing', required: liburingopt)
 if liburing.found()
   cdata.set('USE_LIBURING', 1)
+
+  if cc.has_function('io_uring_queue_init_mem',
+      dependencies: liburing, args: test_c_args)
+    cdata.set('HAVE_LIBURING_QUEUE_INIT_MEM', 1)
+  endif
+
 endif
 

src/backend/storage/aio/method_io_uring.c

Lines changed: 204 additions & 6 deletions
@@ -29,6 +29,9 @@
 
 #ifdef IOMETHOD_IO_URING_ENABLED
 
+#include <sys/mman.h>
+#include <unistd.h>
+
 #include <liburing.h>
 
 #include "miscadmin.h"
@@ -94,12 +97,32 @@ PgAioUringContext
     struct io_uring io_uring_ring;
 } PgAioUringContext;
 
+/*
+ * Information about the capabilities that io_uring has.
+ *
+ * Depending on liburing and kernel version, different features are
+ * supported. At least for the kernel, a version check does not suffice, as
+ * various vendors backport features to older kernels :(.
+ */
+typedef struct PgAioUringCaps
+{
+    bool        checked;
+    /* -1 if io_uring_queue_init_mem() is unsupported */
+    int         mem_init_size;
+} PgAioUringCaps;
+
+
 /* PgAioUringContexts for all backends */
 static PgAioUringContext *pgaio_uring_contexts;
 
 /* the current backend's context */
 static PgAioUringContext *pgaio_my_uring_context;
 
+static PgAioUringCaps pgaio_uring_caps =
+{
+    .checked = false,
+    .mem_init_size = -1,
+};
 
 static uint32
 pgaio_uring_procs(void)
@@ -111,30 +134,184 @@ pgaio_uring_procs(void)
     return MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS;
 }
 
-static Size
+/*
+ * Initializes pgaio_uring_caps, unless that's already done.
+ */
+static void
+pgaio_uring_check_capabilities(void)
+{
+    if (pgaio_uring_caps.checked)
+        return;
+
+    /*
+     * By default io_uring creates a shared memory mapping for each io_uring
+     * instance, leading to a large number of memory mappings. Unfortunately
+     * a large number of memory mappings slows things down; backend exit is
+     * particularly affected. To address that, newer kernels (6.5) support
+     * using user-provided memory for the rings. By putting the relevant
+     * memory into shared memory we don't need any additional mappings.
+     *
+     * To know whether this is supported, we unfortunately need to probe the
+     * kernel by trying to create a ring with userspace-provided memory. This
+     * also has a secondary benefit: We can determine precisely how much
+     * memory we need for each io_uring instance.
+     */
+#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
+    {
+        struct io_uring test_ring;
+        size_t      ring_size;
+        void       *ring_ptr;
+        struct io_uring_params p = {0};
+        int         ret;
+
+        /*
+         * Liburing does not yet provide an API to query how much memory a
+         * ring will need. So we over-estimate it here. As the memory is
+         * freed just below, that's a small temporary waste of memory.
+         *
+         * 1MB is more than enough for rings within io_max_concurrency's
+         * range.
+         */
+        ring_size = 1024 * 1024;
+
+        /*
+         * Hard to believe a system exists where 1MB would not be a multiple
+         * of the page size. But it's cheap to ensure...
+         */
+        ring_size -= ring_size % sysconf(_SC_PAGESIZE);
+
+        ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+        if (ring_ptr == MAP_FAILED)
+            elog(ERROR,
+                 "mmap(%zu) to determine io_uring_queue_init_mem() support failed: %m",
+                 ring_size);
+
+        ret = io_uring_queue_init_mem(io_max_concurrency, &test_ring, &p, ring_ptr, ring_size);
+        if (ret > 0)
+        {
+            pgaio_uring_caps.mem_init_size = ret;
+
+            elog(DEBUG1,
+                 "can use combined memory mapping for io_uring, each ring needs %d bytes",
+                 ret);
+
+            /* clean up the created ring, it was just for a test */
+            io_uring_queue_exit(&test_ring);
+        }
+        else
+        {
+            /*
+             * There are different reasons for ring creation to fail, but it's
+             * ok to treat that just as io_uring_queue_init_mem() not being
+             * supported. We'll report a more detailed error in
+             * pgaio_uring_shmem_init().
+             */
+            errno = -ret;
+            elog(DEBUG1,
+                 "cannot use combined memory mapping for io_uring, ring creation failed: %m");
+        }
+
+        if (munmap(ring_ptr, ring_size) != 0)
+            elog(ERROR, "munmap() failed: %m");
+    }
+#else
+    {
+        elog(DEBUG1,
+             "can't use combined memory mapping for io_uring, kernel or liburing too old");
+    }
+#endif
+
+    pgaio_uring_caps.checked = true;
+}
+
+/*
+ * Memory for all PgAioUringContext instances
+ */
+static size_t
 pgaio_uring_context_shmem_size(void)
 {
     return mul_size(pgaio_uring_procs(), sizeof(PgAioUringContext));
 }
 
+/*
+ * Memory for the combined memory used by io_uring instances. Returns 0 if
+ * that is not supported by kernel/liburing.
+ */
+static size_t
+pgaio_uring_ring_shmem_size(void)
+{
+    size_t      sz = 0;
+
+    if (pgaio_uring_caps.mem_init_size > 0)
+    {
+        /*
+         * Memory for rings needs to be allocated to the page boundary,
+         * reserve space. Luckily it does not need to be aligned to hugepage
+         * boundaries, even if huge pages are used.
+         */
+        sz = add_size(sz, sysconf(_SC_PAGESIZE));
+        sz = add_size(sz, mul_size(pgaio_uring_procs(),
+                                   pgaio_uring_caps.mem_init_size));
+    }
+
+    return sz;
+}
+
 static size_t
 pgaio_uring_shmem_size(void)
 {
-    return pgaio_uring_context_shmem_size();
+    size_t      sz;
+
+    /*
+     * Kernel and liburing support for various features influences how much
+     * shmem we need, so perform the necessary checks.
+     */
+    pgaio_uring_check_capabilities();
+
+    sz = pgaio_uring_context_shmem_size();
+    sz = add_size(sz, pgaio_uring_ring_shmem_size());
+
+    return sz;
 }
 
 static void
 pgaio_uring_shmem_init(bool first_time)
 {
     int         TotalProcs = pgaio_uring_procs();
     bool        found;
+    char       *shmem;
+    size_t      ring_mem_remain = 0;
+    char       *ring_mem_next = 0;
 
-    pgaio_uring_contexts = (PgAioUringContext *)
-        ShmemInitStruct("AioUring", pgaio_uring_shmem_size(), &found);
-
+    /*
+     * We allocate memory for all PgAioUringContext instances and, if
+     * supported, the memory required for each of the io_uring instances, in
+     * one ShmemInitStruct().
+     */
+    shmem = ShmemInitStruct("AioUringContext", pgaio_uring_shmem_size(), &found);
     if (found)
         return;
 
+    pgaio_uring_contexts = (PgAioUringContext *) shmem;
+    shmem += pgaio_uring_context_shmem_size();
+
+    /* if supported, handle memory alignment / sizing for io_uring memory */
+    if (pgaio_uring_caps.mem_init_size > 0)
+    {
+        ring_mem_remain = pgaio_uring_ring_shmem_size();
+        ring_mem_next = (char *) shmem;
+
+        /* align to page boundary, see also pgaio_uring_ring_shmem_size() */
+        ring_mem_next = (char *) TYPEALIGN(sysconf(_SC_PAGESIZE), ring_mem_next);
+
+        /* account for alignment */
+        ring_mem_remain -= ring_mem_next - shmem;
+        shmem += ring_mem_next - shmem;
+
+        shmem += ring_mem_remain;
+    }
+
     for (int contextno = 0; contextno < TotalProcs; contextno++)
     {
         PgAioUringContext *context = &pgaio_uring_contexts[contextno];
@@ -158,7 +335,28 @@ pgaio_uring_shmem_init(bool first_time)
          * be worth using that - also need to evaluate if that causes
          * noticeable additional contention?
          */
-        ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0);
+
+        /*
+         * If supported (cf. pgaio_uring_check_capabilities()), create the
+         * ring with its data in shared memory. Otherwise fall back to
+         * io_uring creating a memory mapping for each ring.
+         */
+#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
+        if (pgaio_uring_caps.mem_init_size > 0)
+        {
+            struct io_uring_params p = {0};
+
+            ret = io_uring_queue_init_mem(io_max_concurrency, &context->io_uring_ring, &p, ring_mem_next, ring_mem_remain);
+
+            ring_mem_remain -= ret;
+            ring_mem_next += ret;
+        }
+        else
+#endif
+        {
+            ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0);
+        }
+
         if (ret < 0)
         {
             char       *hint = NULL;
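
For illustration only (this sketch is not from the commit), the carving of the single shared allocation can be shown in isolation: the context array sits at the front, the ring memory starts at the next page boundary, and each ring takes the number of bytes io_uring_queue_init_mem() reported. NCONTEXTS and RING_MEM_PER_CONTEXT are hypothetical stand-ins for pgaio_uring_procs() and pgaio_uring_caps.mem_init_size, and plain malloc() stands in for ShmemInitStruct().

/* layout_sketch.c -- hypothetical illustration, not part of the commit */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define NCONTEXTS 8                 /* stand-in for pgaio_uring_procs() */
#define RING_MEM_PER_CONTEXT 4096   /* stand-in for pgaio_uring_caps.mem_init_size */

int
main(void)
{
    size_t      pagesz = sysconf(_SC_PAGESIZE);
    size_t      ctx_sz = NCONTEXTS * sizeof(void *);    /* pretend context array */
    size_t      ring_sz = pagesz + NCONTEXTS * RING_MEM_PER_CONTEXT;    /* alignment slack + rings */
    char       *shmem = malloc(ctx_sz + ring_sz);    /* the commit uses one ShmemInitStruct() */
    char       *ring_mem_next;

    if (shmem == NULL)
        return 1;

    /* the context array occupies the start of the allocation */
    void      **contexts = (void **) shmem;

    /* ring memory begins at the next page boundary after the contexts */
    ring_mem_next = shmem + ctx_sz;
    ring_mem_next = (char *) (((uintptr_t) ring_mem_next + pagesz - 1) & ~((uintptr_t) pagesz - 1));

    for (int i = 0; i < NCONTEXTS; i++)
    {
        /* each ring consumes the bytes io_uring_queue_init_mem() would report */
        printf("context %d: ring memory at offset %td\n", i, ring_mem_next - shmem);
        ring_mem_next += RING_MEM_PER_CONTEXT;
    }

    (void) contexts;
    free(shmem);
    return 0;
}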

src/include/pg_config.h.in

Lines changed: 3 additions & 0 deletions
@@ -229,6 +229,9 @@
 /* Define to 1 if you have the global variable 'int timezone'. */
 #undef HAVE_INT_TIMEZONE
 
+/* Define to 1 if you have the `io_uring_queue_init_mem' function. */
+#undef HAVE_IO_URING_QUEUE_INIT_MEM
+
 /* Define to 1 if __builtin_constant_p(x) implies "i"(x) acceptance. */
 #undef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P

src/tools/pgindent/typedefs.list

Lines changed: 1 addition & 0 deletions
@@ -2181,6 +2181,7 @@ PgAioReturn
 PgAioTargetData
 PgAioTargetID
 PgAioTargetInfo
+PgAioUringCaps
 PgAioUringContext
 PgAioWaitRef
 PgArchData
