Skip to content

Commit c325a76

Browse files
committed
aio: Add io_method=io_uring
Performing AIO using io_uring can be considerably faster than io_method=worker, particularly when lots of small IOs are issued, as a) the context-switch overhead for worker-based AIO becomes more significant, and b) the number of IO workers can become limiting. io_uring, however, is Linux-specific and requires an additional compile-time dependency (liburing). This implementation is fairly simple and there are substantial optimization opportunities. The description of the existing AIO_IO_COMPLETION wait event is updated to make the difference between it and the new AIO_IO_URING_EXECUTION clearer. Reviewed-by: Noah Misch <noah@leadboat.com> Reviewed-by: Jakub Wartak <jakub.wartak@enterprisedb.com> Discussion: https://postgr.es/m/uvrtrknj4kdytuboidbhwclo4gxhswwcpgadptsjvjqcluzmah%40brqs62irg4dt Discussion: https://postgr.es/m/20210223100344.llw5an2aklengrmn@alap3.anarazel.de Discussion: https://postgr.es/m/stj36ea6yyhoxtqkhpieia2z4krnam7qyetc57rfezgk4zgapf@gcnactj4z56m
1 parent 8eadd5c commit c325a76

File tree

14 files changed

+589
-2
lines changed

14 files changed

+589
-2
lines changed

.cirrus.tasks.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -493,11 +493,14 @@ task:
493493
# - Uses undefined behaviour and alignment sanitizers, sanitizer failures
494494
# are typically printed in the server log
495495
# - Test both 64bit and 32 bit builds
496+
# - uses io_method=io_uring
496497
- name: Linux - Debian Bookworm - Meson
497498

498499
env:
499500
CCACHE_MAXSIZE: "400M" # tests two different builds
500501
SANITIZER_FLAGS: -fsanitize=alignment,undefined
502+
PG_TEST_INITDB_EXTRA_OPTS: >-
503+
-c io_method=io_uring
501504
502505
configure_script: |
503506
su postgres <<-EOF

doc/src/sgml/config.sgml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2710,6 +2710,14 @@ include_dir 'conf.d'
27102710
<literal>worker</literal> (execute asynchronous I/O using worker processes)
27112711
</para>
27122712
</listitem>
2713+
<listitem>
2714+
<para>
2715+
<literal>io_uring</literal> (execute asynchronous I/O using
2716+
io_uring, requires a build with
2717+
<link linkend="configure-option-with-liburing"><option>--with-liburing</option></link> /
2718+
<link linkend="configure-with-liburing-meson"><option>-Dliburing</option></link>)
2719+
</para>
2720+
</listitem>
27132721
<listitem>
27142722
<para>
27152723
<literal>sync</literal> (execute asynchronous-eligible I/O synchronously)

src/backend/storage/aio/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ OBJS = \
1414
aio_init.o \
1515
aio_io.o \
1616
aio_target.o \
17+
method_io_uring.o \
1718
method_sync.o \
1819
method_worker.o \
1920
read_stream.o

src/backend/storage/aio/aio.c

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,9 @@ static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation);
6565
const struct config_enum_entry io_method_options[] = {
6666
{"sync", IOMETHOD_SYNC, false},
6767
{"worker", IOMETHOD_WORKER, false},
68+
#ifdef IOMETHOD_IO_URING_ENABLED
69+
{"io_uring", IOMETHOD_IO_URING, false},
70+
#endif
6871
{NULL, 0, false}
6972
};
7073

@@ -82,6 +85,9 @@ PgAioBackend *pgaio_my_backend;
8285
static const IoMethodOps *const pgaio_method_ops_table[] = {
8386
[IOMETHOD_SYNC] = &pgaio_sync_ops,
8487
[IOMETHOD_WORKER] = &pgaio_worker_ops,
88+
#ifdef IOMETHOD_IO_URING_ENABLED
89+
[IOMETHOD_IO_URING] = &pgaio_uring_ops,
90+
#endif
8591
};
8692

8793
/* callbacks for the configured io_method, set by assign_io_method */
@@ -1118,6 +1124,41 @@ pgaio_closing_fd(int fd)
11181124
* it's probably not worth it.
11191125
*/
11201126
pgaio_submit_staged();
1127+
1128+
/*
1129+
* If requested by the IO method, wait for all IOs that use the
1130+
* to-be-closed FD.
1131+
*/
1132+
if (pgaio_method_ops->wait_on_fd_before_close)
1133+
{
1134+
/*
1135+
* As waiting for one IO to complete may complete multiple IOs, we
1136+
* can't just use a mutable list iterator. The maximum number of
1137+
* in-flight IOs is fairly small, so just restart the loop after
1138+
* waiting for an IO.
1139+
*/
1140+
while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
1141+
{
1142+
dlist_iter iter;
1143+
PgAioHandle *ioh = NULL;
1144+
1145+
dclist_foreach(iter, &pgaio_my_backend->in_flight_ios)
1146+
{
1147+
ioh = dclist_container(PgAioHandle, node, iter.cur);
1148+
1149+
if (pgaio_io_uses_fd(ioh, fd))
1150+
break;
1151+
else
1152+
ioh = NULL;
1153+
}
1154+
1155+
if (!ioh)
1156+
break;
1157+
1158+
/* see comment in pgaio_io_wait_for_free() about raciness */
1159+
pgaio_io_wait(ioh, ioh->generation);
1160+
}
1161+
}
11211162
}
11221163

11231164
/*

src/backend/storage/aio/aio_io.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,3 +188,25 @@ pgaio_io_get_op_name(PgAioHandle *ioh)
188188

189189
return NULL; /* silence compiler */
190190
}
191+
192+
/*
193+
* Report whether the IO described by ioh operates on file descriptor fd.
* Used to determine if an IO needs to be waited upon before the file
194+
* descriptor can be closed.
*
* Returns true if ioh is a readv or writev operation whose fd equals the
* given fd, and false otherwise (including for PGAIO_OP_INVALID). The
* handle's state is asserted to be at least PGAIO_HS_DEFINED.
195+
*/
196+
bool
197+
pgaio_io_uses_fd(PgAioHandle *ioh, int fd)
198+
{
199+
Assert(ioh->state >= PGAIO_HS_DEFINED);
200+
201+
switch (ioh->op)
202+
{
203+
case PGAIO_OP_READV:
204+
return ioh->op_data.read.fd == fd;
205+
case PGAIO_OP_WRITEV:
206+
return ioh->op_data.write.fd == fd;
207+
case PGAIO_OP_INVALID:
208+
return false;
209+
}
210+
211+
return false; /* silence compiler */
212+
}

src/backend/storage/aio/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ backend_sources += files(
66
'aio_init.c',
77
'aio_io.c',
88
'aio_target.c',
9+
'method_io_uring.c',
910
'method_sync.c',
1011
'method_worker.c',
1112
'read_stream.c',

0 commit comments

Comments
 (0)