
Commit 8c83878

io_uring: fix poll races
This is a straight port of Al's fix for the aio poll implementation, since the io_uring version is heavily based on that. The description below is almost straight from that patch, just modified to fit the io_uring situation.

io_poll() has to cope with several unpleasant problems:

* requests that might stay around indefinitely need to be made visible for io_cancel(2); that must not be done to a request already completed, though.
* in cases when ->poll() has placed us on a waitqueue, wakeup might have happened (and the request completed) before ->poll() returns.
* worse, in some early wakeup cases the request might end up re-added into the queue later - we can't treat "woken up and currently not in the queue" as "it's not going to stick around indefinitely".
* ... moreover, ->poll() might have decided not to put it on any queues to start with, and that needs to be distinguished from the previous case.
* ->poll() might have tried to put us on more than one queue. Only the first will succeed for io poll, so we might end up missing wakeups. OTOH, we might very well notice that only after the wakeup hits and the request gets completed (all before ->poll() gets around to the second poll_wait()). In that case it's too late to decide that we have an error.

req->woken was an attempt to deal with that. Unfortunately, it was broken. What we need to keep track of is not that a wakeup has happened - the thing might come back after that. It's that the async reference is already gone and won't come back, so we can't (and needn't) put the request on the list of cancellables.

The easiest case is "request hadn't been put on any waitqueues"; we can tell by seeing a NULL apt.head, and in that case there won't be anything async. We should either complete the request ourselves (if vfs_poll() reports anything of interest) or return an error.

In all other cases we get exclusion with wakeups by grabbing the queue lock.

If the request is currently on the queue and we have something interesting from vfs_poll(), we can steal it and complete the request ourselves.

If it's on the queue and vfs_poll() has not reported anything interesting, we either put it on the cancellable list, or, if we know that it hadn't been put on all queues ->poll() wanted it on, we steal it and return an error.

If it's _not_ on the queue, it's either been already dealt with (in which case we do nothing), or there's io_poll_complete_work() about to be executed. In that case we either put it on the cancellable list, or, if we know it hadn't been put on all queues ->poll() wanted it on, simulate what cancel would've done.

Fixes: 221c5eb ("io_uring: add support for IORING_OP_POLL")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
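To make the case analysis above easier to follow against the diff, here is a minimal stand-alone C sketch of the decision io_poll_add() has to make once vfs_poll() returns. It is a model, not kernel code: arm_poll(), the enum values, and the boolean parameters are hypothetical stand-ins for apt.head, whether the wait entry is still on its waitqueue, the reported event mask, ipt.error, and poll->done; the real code makes the same choice under ctx->completion_lock and the waitqueue lock.

/*
 * Illustrative user-space model of the post-vfs_poll() decision described
 * above.  None of these identifiers come from the kernel source.
 */
#include <poll.h>
#include <stdbool.h>
#include <stdio.h>

enum poll_arm_outcome {
	ARM_COMPLETE_INLINE,	/* we stole the event; post the completion ourselves */
	ARM_RETURN_ERROR,	/* nothing usable was queued; report the error */
	ARM_MARK_CANCELED,	/* wakeup side owns the request; flag it canceled */
	ARM_ASYNC_WAITING,	/* armed; request goes on the cancellable list */
	ARM_ALREADY_DONE,	/* a wakeup already completed it; do nothing */
};

static enum poll_arm_outcome arm_poll(bool head_set, bool still_queued,
				      unsigned int mask, int queue_error,
				      bool done)
{
	bool cancel = false;

	if (!head_set) {
		/* ->poll() never queued us, so nothing async can ever happen */
		return mask ? ARM_COMPLETE_INLINE : ARM_RETURN_ERROR;
	}

	/* the kernel makes the rest of this choice under the waitqueue lock */
	if (!still_queued) {
		/*
		 * A wakeup already pulled us off the queue.  If queueing also
		 * failed we can no longer cancel by removal, so remember to
		 * flag the request canceled; either way the async side now
		 * owns completion, so forget our mask and error.
		 */
		if (queue_error)
			cancel = true;
		queue_error = 0;
		mask = 0;
	}

	if (mask)
		return ARM_COMPLETE_INLINE;	/* steal it while it is still queued */
	if (queue_error)
		return ARM_RETURN_ERROR;	/* dequeue and report the queueing error */
	if (cancel)
		return ARM_MARK_CANCELED;
	if (!done)
		return ARM_ASYNC_WAITING;	/* actually waiting for an event */
	return ARM_ALREADY_DONE;
}

int main(void)
{
	/* queued, no event yet: arm async and make it cancellable */
	printf("waiting: %d\n", arm_poll(true, true, 0, 0, false));
	/* event already pending while still queued: complete inline */
	printf("inline:  %d\n", arm_poll(true, true, POLLIN, 0, false));
	/* wakeup raced us off the queue and completed the request */
	printf("done:    %d\n", arm_poll(true, false, 0, 0, true));
	return 0;
}

Each return value maps to one branch of the description: complete inline, return an error, mark canceled, add to the cancellable list, or leave an already-completed request alone.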
1 parent 09bb839 commit 8c83878

File tree

1 file changed (+56, -55 lines)


fs/io_uring.c

Lines changed: 56 additions & 55 deletions
@@ -197,7 +197,7 @@ struct io_poll_iocb {
 	struct file			*file;
 	struct wait_queue_head		*head;
 	__poll_t			events;
-	bool				woken;
+	bool				done;
 	bool				canceled;
 	struct wait_queue_entry		wait;
 };
@@ -367,20 +367,25 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 	}
 }
 
-static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+	if (waitqueue_active(&ctx->wait))
+		wake_up(&ctx->wait);
+	if (waitqueue_active(&ctx->sqo_wait))
+		wake_up(&ctx->sqo_wait);
+}
+
+static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
 				long res, unsigned ev_flags)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&ctx->completion_lock, flags);
-	io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
+	io_cqring_fill_event(ctx, user_data, res, ev_flags);
 	io_commit_cqring(ctx);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-	if (waitqueue_active(&ctx->wait))
-		wake_up(&ctx->wait);
-	if (waitqueue_active(&ctx->sqo_wait))
-		wake_up(&ctx->sqo_wait);
+	io_cqring_ev_posted(ctx);
 }
 
 static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
@@ -1149,10 +1154,12 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			     __poll_t mask)
 {
-	io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
-	io_put_req(req);
+	req->poll.done = true;
+	io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask), 0);
+	io_commit_cqring(ctx);
 }
 
 static void io_poll_complete_work(struct work_struct *work)
@@ -1180,9 +1187,11 @@ static void io_poll_complete_work(struct work_struct *work)
 		return;
 	}
 	list_del_init(&req->list);
+	io_poll_complete(ctx, req, mask);
 	spin_unlock_irq(&ctx->completion_lock);
 
-	io_poll_complete(req, mask);
+	io_cqring_ev_posted(ctx);
+	io_put_req(req);
 }
 
 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -1193,29 +1202,25 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
 	struct io_ring_ctx *ctx = req->ctx;
 	__poll_t mask = key_to_poll(key);
-
-	poll->woken = true;
+	unsigned long flags;
 
 	/* for instances that support it check for an event match first: */
-	if (mask) {
-		unsigned long flags;
+	if (mask && !(mask & poll->events))
+		return 0;
 
-		if (!(mask & poll->events))
-			return 0;
+	list_del_init(&poll->wait.entry);
 
-		/* try to complete the iocb inline if we can: */
-		if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
-			list_del(&req->list);
-			spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+		list_del(&req->list);
+		io_poll_complete(ctx, req, mask);
+		spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-			list_del_init(&poll->wait.entry);
-			io_poll_complete(req, mask);
-			return 1;
-		}
+		io_cqring_ev_posted(ctx);
+		io_put_req(req);
+	} else {
+		queue_work(ctx->sqo_wq, &req->work);
 	}
 
-	list_del_init(&poll->wait.entry);
-	queue_work(ctx->sqo_wq, &req->work);
 	return 1;
 }
 
@@ -1245,6 +1250,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	struct io_poll_iocb *poll = &req->poll;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_poll_table ipt;
+	bool cancel = false;
 	__poll_t mask;
 	u16 events;
 
@@ -1260,7 +1266,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
 
 	poll->head = NULL;
-	poll->woken = false;
+	poll->done = false;
 	poll->canceled = false;
 
 	ipt.pt._qproc = io_poll_queue_proc;
@@ -1273,41 +1279,36 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 
 	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
-	if (unlikely(!poll->head)) {
-		/* we did not manage to set up a waitqueue, done */
-		goto out;
-	}
 
 	spin_lock_irq(&ctx->completion_lock);
-	spin_lock(&poll->head->lock);
-	if (poll->woken) {
-		/* wake_up context handles the rest */
-		mask = 0;
+	if (likely(poll->head)) {
+		spin_lock(&poll->head->lock);
+		if (unlikely(list_empty(&poll->wait.entry))) {
+			if (ipt.error)
+				cancel = true;
+			ipt.error = 0;
+			mask = 0;
+		}
+		if (mask || ipt.error)
+			list_del_init(&poll->wait.entry);
+		else if (cancel)
+			WRITE_ONCE(poll->canceled, true);
+		else if (!poll->done) /* actually waiting for an event */
+			list_add_tail(&req->list, &ctx->cancel_list);
+		spin_unlock(&poll->head->lock);
+	}
+	if (mask) { /* no async, we'd stolen it */
+		req->error = mangle_poll(mask);
 		ipt.error = 0;
-	} else if (mask || ipt.error) {
-		/* if we get an error or a mask we are done */
-		WARN_ON_ONCE(list_empty(&poll->wait.entry));
-		list_del_init(&poll->wait.entry);
-	} else {
-		/* actually waiting for an event */
-		list_add_tail(&req->list, &ctx->cancel_list);
+		io_poll_complete(ctx, req, mask);
 	}
-	spin_unlock(&poll->head->lock);
 	spin_unlock_irq(&ctx->completion_lock);
 
-out:
-	if (unlikely(ipt.error)) {
-		/*
-		 * Drop one of our refs to this req, __io_submit_sqe() will
-		 * drop the other one since we're returning an error.
-		 */
+	if (mask) {
+		io_cqring_ev_posted(ctx);
 		io_put_req(req);
-		return ipt.error;
 	}
-
-	if (mask)
-		io_poll_complete(req, mask);
-	return 0;
+	return ipt.error;
 }
 
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
