
Commit 221c5eb

io_uring: add support for IORING_OP_POLL
This is basically a direct port of bfe4037, which implements a one-shot poll command through aio. The description below is based on that commit as well.

However, instead of adding a POLL command and relying on io_cancel(2) to remove it, we mimic the epoll(2) interface of having a command to add a poll notification, IORING_OP_POLL_ADD, and one to remove it again, IORING_OP_POLL_REMOVE.

To poll for a file descriptor, the application should submit an sqe of type IORING_OP_POLL_ADD. It will poll the fd for the events specified in the poll_events field.

Unlike poll or epoll without EPOLLONESHOT, this interface always works in one-shot mode: once the sqe is completed, it has to be resubmitted.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Based-on-code-from: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
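
Purely as an illustration of the interface this commit adds (not part of the commit itself), the sketch below shows how an application might fill in a submission queue entry for IORING_OP_POLL_ADD. Ring setup, the SQ/CQ ring mmaps, and the io_uring_enter(2) call are elided, and prep_poll_add() is a hypothetical helper for this example only.

/*
 * Sketch only: fills an sqe the way io_poll_add() expects it
 * (addr/ioprio/off/len/buf_index left zero). Actually submitting it
 * requires a set-up ring, which is omitted here.
 */
#include <linux/io_uring.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>

static void prep_poll_add(struct io_uring_sqe *sqe, int fd, __u64 user_data)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_POLL_ADD;
	sqe->fd = fd;
	sqe->poll_events = POLLIN;	/* poll(2)-style event mask */
	sqe->user_data = user_data;	/* echoed back in the completion */
}

int main(void)
{
	struct io_uring_sqe sqe;

	prep_poll_add(&sqe, 0 /* stdin */, 0x1234);
	printf("opcode=%u poll_events=0x%x\n", sqe.opcode, sqe.poll_events);

	/*
	 * One-shot semantics: once the cqe for this poll arrives, a fresh
	 * POLL_ADD sqe has to be submitted to keep watching the fd.
	 */
	return 0;
}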

2 files changed: 265 additions & 1 deletion
fs/io_uring.c

Lines changed: 262 additions & 1 deletion
@@ -161,6 +161,7 @@ struct io_ring_ctx {
 	 * manipulate the list, hence no extra locking is needed there.
 	 */
 	struct list_head	poll_list;
+	struct list_head	cancel_list;
 } ____cacheline_aligned_in_smp;
 
 #if defined(CONFIG_UNIX)
@@ -176,8 +177,20 @@ struct sqe_submit {
 	bool				needs_fixed_file;
 };
 
+struct io_poll_iocb {
+	struct file			*file;
+	struct wait_queue_head		*head;
+	__poll_t			events;
+	bool				woken;
+	bool				canceled;
+	struct wait_queue_entry		wait;
+};
+
 struct io_kiocb {
-	struct kiocb		rw;
+	union {
+		struct kiocb		rw;
+		struct io_poll_iocb	poll;
+	};
 
 	struct sqe_submit	submit;
 
@@ -261,6 +274,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	init_waitqueue_head(&ctx->wait);
 	spin_lock_init(&ctx->completion_lock);
 	INIT_LIST_HEAD(&ctx->poll_list);
+	INIT_LIST_HEAD(&ctx->cancel_list);
 	return ctx;
 }
 
@@ -1058,6 +1072,246 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
+static void io_poll_remove_one(struct io_kiocb *req)
+{
+	struct io_poll_iocb *poll = &req->poll;
+
+	spin_lock(&poll->head->lock);
+	WRITE_ONCE(poll->canceled, true);
+	if (!list_empty(&poll->wait.entry)) {
+		list_del_init(&poll->wait.entry);
+		queue_work(req->ctx->sqo_wq, &req->work);
+	}
+	spin_unlock(&poll->head->lock);
+
+	list_del_init(&req->list);
+}
+
+static void io_poll_remove_all(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req;
+
+	spin_lock_irq(&ctx->completion_lock);
+	while (!list_empty(&ctx->cancel_list)) {
+		req = list_first_entry(&ctx->cancel_list, struct io_kiocb, list);
+		io_poll_remove_one(req);
+	}
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
+/*
+ * Find a running poll command that matches one specified in sqe->addr,
+ * and remove it if found.
+ */
+static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_kiocb *poll_req, *next;
+	int ret = -ENOENT;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
+	    sqe->poll_events)
+		return -EINVAL;
+
+	spin_lock_irq(&ctx->completion_lock);
+	list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
+		if (READ_ONCE(sqe->addr) == poll_req->user_data) {
+			io_poll_remove_one(poll_req);
+			ret = 0;
+			break;
+		}
+	}
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+	io_free_req(req);
+	return 0;
+}
+
+static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+{
+	io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
+	io_fput(req);
+	io_free_req(req);
+}
+
+static void io_poll_complete_work(struct work_struct *work)
+{
+	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+	struct io_poll_iocb *poll = &req->poll;
+	struct poll_table_struct pt = { ._key = poll->events };
+	struct io_ring_ctx *ctx = req->ctx;
+	__poll_t mask = 0;
+
+	if (!READ_ONCE(poll->canceled))
+		mask = vfs_poll(poll->file, &pt) & poll->events;
+
+	/*
+	 * Note that ->ki_cancel callers also delete iocb from active_reqs after
+	 * calling ->ki_cancel. We need the ctx_lock roundtrip here to
+	 * synchronize with them. In the cancellation case the list_del_init
+	 * itself is not actually needed, but harmless so we keep it in to
+	 * avoid further branches in the fast path.
+	 */
+	spin_lock_irq(&ctx->completion_lock);
+	if (!mask && !READ_ONCE(poll->canceled)) {
+		add_wait_queue(poll->head, &poll->wait);
+		spin_unlock_irq(&ctx->completion_lock);
+		return;
+	}
+	list_del_init(&req->list);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_poll_complete(req, mask);
+}
+
+static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+			void *key)
+{
+	struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
+							wait);
+	struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
+	struct io_ring_ctx *ctx = req->ctx;
+	__poll_t mask = key_to_poll(key);
+
+	poll->woken = true;
+
+	/* for instances that support it check for an event match first: */
+	if (mask) {
+		unsigned long flags;
+
+		if (!(mask & poll->events))
+			return 0;
+
+		/* try to complete the iocb inline if we can: */
+		if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+			list_del(&req->list);
+			spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+			list_del_init(&poll->wait.entry);
+			io_poll_complete(req, mask);
+			return 1;
+		}
+	}
+
+	list_del_init(&poll->wait.entry);
+	queue_work(ctx->sqo_wq, &req->work);
+	return 1;
+}
+
+struct io_poll_table {
+	struct poll_table_struct pt;
+	struct io_kiocb *req;
+	int error;
+};
+
+static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
+			       struct poll_table_struct *p)
+{
+	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+
+	if (unlikely(pt->req->poll.head)) {
+		pt->error = -EINVAL;
+		return;
+	}
+
+	pt->error = 0;
+	pt->req->poll.head = head;
+	add_wait_queue(head, &pt->req->poll.wait);
+}
+
+static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_poll_iocb *poll = &req->poll;
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_poll_table ipt;
+	unsigned flags;
+	__poll_t mask;
+	u16 events;
+	int fd;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+		return -EINVAL;
+
+	INIT_WORK(&req->work, io_poll_complete_work);
+	events = READ_ONCE(sqe->poll_events);
+	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+
+	flags = READ_ONCE(sqe->flags);
+	fd = READ_ONCE(sqe->fd);
+
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
+			return -EBADF;
+		poll->file = ctx->user_files[fd];
+		req->flags |= REQ_F_FIXED_FILE;
+	} else {
+		poll->file = fget(fd);
+	}
+	if (unlikely(!poll->file))
+		return -EBADF;
+
+	poll->head = NULL;
+	poll->woken = false;
+	poll->canceled = false;
+
+	ipt.pt._qproc = io_poll_queue_proc;
+	ipt.pt._key = poll->events;
+	ipt.req = req;
+	ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
+
+	/* initialized the list so that we can do list_empty checks */
+	INIT_LIST_HEAD(&poll->wait.entry);
+	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
+
+	/* one for removal from waitqueue, one for this function */
+	refcount_set(&req->refs, 2);
+
+	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
+	if (unlikely(!poll->head)) {
+		/* we did not manage to set up a waitqueue, done */
+		goto out;
+	}
+
+	spin_lock_irq(&ctx->completion_lock);
+	spin_lock(&poll->head->lock);
+	if (poll->woken) {
+		/* wake_up context handles the rest */
+		mask = 0;
+		ipt.error = 0;
+	} else if (mask || ipt.error) {
+		/* if we get an error or a mask we are done */
+		WARN_ON_ONCE(list_empty(&poll->wait.entry));
+		list_del_init(&poll->wait.entry);
+	} else {
+		/* actually waiting for an event */
+		list_add_tail(&req->list, &ctx->cancel_list);
+	}
+	spin_unlock(&poll->head->lock);
+	spin_unlock_irq(&ctx->completion_lock);
+
+out:
+	if (unlikely(ipt.error)) {
+		if (!(flags & IOSQE_FIXED_FILE))
+			fput(poll->file);
+		/*
+		 * Drop one of our refs to this req, __io_submit_sqe() will
+		 * drop the other one since we're returning an error.
+		 */
+		io_free_req(req);
+		return ipt.error;
+	}
+
+	if (mask)
+		io_poll_complete(req, mask);
+	io_free_req(req);
+	return 0;
+}
+
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			   const struct sqe_submit *s, bool force_nonblock,
 			   struct io_submit_state *state)
@@ -1093,6 +1347,12 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_FSYNC:
 		ret = io_fsync(req, s->sqe, force_nonblock);
 		break;
+	case IORING_OP_POLL_ADD:
+		ret = io_poll_add(req, s->sqe);
+		break;
+	case IORING_OP_POLL_REMOVE:
+		ret = io_poll_remove(req, s->sqe);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -2131,6 +2391,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	percpu_ref_kill(&ctx->refs);
 	mutex_unlock(&ctx->uring_lock);
 
+	io_poll_remove_all(ctx);
 	io_iopoll_reap_events(ctx);
 	wait_for_completion(&ctx->ctx_done);
 	io_ring_ctx_free(ctx);
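
As a companion to the earlier sketch (again hypothetical, not from this commit): io_poll_remove() above matches READ_ONCE(sqe->addr) against the user_data of a pending poll request, so a cancel sqe carries that value in addr. The completion posted for the remove has a result of 0 on success or -ENOENT if no matching poll was found.

/*
 * Sketch only: fills an IORING_OP_POLL_REMOVE sqe. 'target_user_data'
 * must be the user_data of the IORING_OP_POLL_ADD request to cancel;
 * sqe->fd is not consulted by the removal path in this patch.
 */
#include <linux/io_uring.h>
#include <string.h>

void prep_poll_remove(struct io_uring_sqe *sqe, __u64 target_user_data,
		      __u64 user_data)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_POLL_REMOVE;
	sqe->addr = target_user_data;	/* which poll request to remove */
	sqe->user_data = user_data;	/* identifies this remove's cqe */
}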

include/uapi/linux/io_uring.h

Lines changed: 3 additions & 0 deletions
@@ -25,6 +25,7 @@ struct io_uring_sqe {
 	union {
 		__kernel_rwf_t	rw_flags;
 		__u32		fsync_flags;
+		__u16		poll_events;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	union {
@@ -51,6 +52,8 @@ struct io_uring_sqe {
 #define IORING_OP_FSYNC		3
 #define IORING_OP_READ_FIXED	4
 #define IORING_OP_WRITE_FIXED	5
+#define IORING_OP_POLL_ADD	6
+#define IORING_OP_POLL_REMOVE	7
 
 /*
  * sqe->fsync_flags
