@@ -161,6 +161,7 @@ struct io_ring_ctx {
 	 * manipulate the list, hence no extra locking is needed there.
 	 */
 	struct list_head	poll_list;
+	struct list_head	cancel_list;
 } ____cacheline_aligned_in_smp;
 
 #if defined(CONFIG_UNIX)
@@ -176,8 +177,20 @@ struct sqe_submit {
 	bool				needs_fixed_file;
 };
 
+struct io_poll_iocb {
+	struct file			*file;
+	struct wait_queue_head		*head;
+	__poll_t			events;
+	bool				woken;
+	bool				canceled;
+	struct wait_queue_entry		wait;
+};
+
 struct io_kiocb {
-	struct kiocb		rw;
+	union {
+		struct kiocb		rw;
+		struct io_poll_iocb	poll;
+	};
 
 	struct sqe_submit	submit;
 
@@ -261,6 +274,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	init_waitqueue_head(&ctx->wait);
 	spin_lock_init(&ctx->completion_lock);
 	INIT_LIST_HEAD(&ctx->poll_list);
+	INIT_LIST_HEAD(&ctx->cancel_list);
 	return ctx;
 }
 
@@ -1058,6 +1072,246 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
+static void io_poll_remove_one(struct io_kiocb *req)
+{
+	struct io_poll_iocb *poll = &req->poll;
+
+	spin_lock(&poll->head->lock);
+	WRITE_ONCE(poll->canceled, true);
+	if (!list_empty(&poll->wait.entry)) {
+		list_del_init(&poll->wait.entry);
+		queue_work(req->ctx->sqo_wq, &req->work);
+	}
+	spin_unlock(&poll->head->lock);
+
+	list_del_init(&req->list);
+}
+
+static void io_poll_remove_all(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req;
+
+	spin_lock_irq(&ctx->completion_lock);
+	while (!list_empty(&ctx->cancel_list)) {
+		req = list_first_entry(&ctx->cancel_list, struct io_kiocb, list);
+		io_poll_remove_one(req);
+	}
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
+/*
+ * Find a running poll command that matches one specified in sqe->addr,
+ * and remove it if found.
+ */
+static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_kiocb *poll_req, *next;
+	int ret = -ENOENT;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
+	    sqe->poll_events)
+		return -EINVAL;
+
+	spin_lock_irq(&ctx->completion_lock);
+	list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
+		if (READ_ONCE(sqe->addr) == poll_req->user_data) {
+			io_poll_remove_one(poll_req);
+			ret = 0;
+			break;
+		}
+	}
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+	io_free_req(req);
+	return 0;
+}
+
+static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+{
+	io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
+	io_fput(req);
+	io_free_req(req);
+}
+
+static void io_poll_complete_work(struct work_struct *work)
+{
+	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+	struct io_poll_iocb *poll = &req->poll;
+	struct poll_table_struct pt = { ._key = poll->events };
+	struct io_ring_ctx *ctx = req->ctx;
+	__poll_t mask = 0;
+
+	if (!READ_ONCE(poll->canceled))
+		mask = vfs_poll(poll->file, &pt) & poll->events;
+
+	/*
+	 * Note that ->ki_cancel callers also delete iocb from active_reqs after
+	 * calling ->ki_cancel. We need the ctx_lock roundtrip here to
+	 * synchronize with them. In the cancellation case the list_del_init
+	 * itself is not actually needed, but harmless so we keep it in to
+	 * avoid further branches in the fast path.
+	 */
+	spin_lock_irq(&ctx->completion_lock);
+	if (!mask && !READ_ONCE(poll->canceled)) {
+		add_wait_queue(poll->head, &poll->wait);
+		spin_unlock_irq(&ctx->completion_lock);
+		return;
+	}
+	list_del_init(&req->list);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_poll_complete(req, mask);
+}
+
+static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+			void *key)
+{
+	struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
+							wait);
+	struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
+	struct io_ring_ctx *ctx = req->ctx;
+	__poll_t mask = key_to_poll(key);
+
+	poll->woken = true;
+
+	/* for instances that support it check for an event match first: */
+	if (mask) {
+		unsigned long flags;
+
+		if (!(mask & poll->events))
+			return 0;
+
+		/* try to complete the iocb inline if we can: */
+		if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+			list_del(&req->list);
+			spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+			list_del_init(&poll->wait.entry);
+			io_poll_complete(req, mask);
+			return 1;
+		}
+	}
+
+	list_del_init(&poll->wait.entry);
+	queue_work(ctx->sqo_wq, &req->work);
+	return 1;
+}
+
+struct io_poll_table {
+	struct poll_table_struct pt;
+	struct io_kiocb *req;
+	int error;
+};
+
+static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
+			       struct poll_table_struct *p)
+{
+	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+
+	if (unlikely(pt->req->poll.head)) {
+		pt->error = -EINVAL;
+		return;
+	}
+
+	pt->error = 0;
+	pt->req->poll.head = head;
+	add_wait_queue(head, &pt->req->poll.wait);
+}
+
+static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_poll_iocb *poll = &req->poll;
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_poll_table ipt;
+	unsigned flags;
+	__poll_t mask;
+	u16 events;
+	int fd;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+		return -EINVAL;
+
+	INIT_WORK(&req->work, io_poll_complete_work);
+	events = READ_ONCE(sqe->poll_events);
+	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+
+	flags = READ_ONCE(sqe->flags);
+	fd = READ_ONCE(sqe->fd);
+
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
+			return -EBADF;
+		poll->file = ctx->user_files[fd];
+		req->flags |= REQ_F_FIXED_FILE;
+	} else {
+		poll->file = fget(fd);
+	}
+	if (unlikely(!poll->file))
+		return -EBADF;
+
+	poll->head = NULL;
+	poll->woken = false;
+	poll->canceled = false;
+
+	ipt.pt._qproc = io_poll_queue_proc;
+	ipt.pt._key = poll->events;
+	ipt.req = req;
+	ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
+
+	/* initialized the list so that we can do list_empty checks */
+	INIT_LIST_HEAD(&poll->wait.entry);
+	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
+
+	/* one for removal from waitqueue, one for this function */
+	refcount_set(&req->refs, 2);
+
+	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
+	if (unlikely(!poll->head)) {
+		/* we did not manage to set up a waitqueue, done */
+		goto out;
+	}
+
+	spin_lock_irq(&ctx->completion_lock);
+	spin_lock(&poll->head->lock);
+	if (poll->woken) {
+		/* wake_up context handles the rest */
+		mask = 0;
+		ipt.error = 0;
+	} else if (mask || ipt.error) {
+		/* if we get an error or a mask we are done */
+		WARN_ON_ONCE(list_empty(&poll->wait.entry));
+		list_del_init(&poll->wait.entry);
+	} else {
+		/* actually waiting for an event */
+		list_add_tail(&req->list, &ctx->cancel_list);
+	}
+	spin_unlock(&poll->head->lock);
+	spin_unlock_irq(&ctx->completion_lock);
+
+out:
+	if (unlikely(ipt.error)) {
+		if (!(flags & IOSQE_FIXED_FILE))
+			fput(poll->file);
+		/*
+		 * Drop one of our refs to this req, __io_submit_sqe() will
+		 * drop the other one since we're returning an error.
+		 */
+		io_free_req(req);
+		return ipt.error;
+	}
+
+	if (mask)
+		io_poll_complete(req, mask);
+	io_free_req(req);
+	return 0;
+}
+
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			   const struct sqe_submit *s, bool force_nonblock,
 			   struct io_submit_state *state)
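
For reference, the cancellation contract implied by io_poll_remove() above: an IORING_OP_POLL_REMOVE request carries, in sqe->addr, the user_data value of the poll request it wants to cancel, and the removal itself completes with 0 on success or -ENOENT if no matching armed poll was found on cancel_list. The snippet below is a minimal userspace-side sketch of preparing such an SQE; the prep_poll_remove() helper name is hypothetical and only the opcode, addr and user_data fields matter to the kernel code shown here.

/* Hypothetical helper: fill an SQE that cancels the armed poll whose
 * submission used 'target_user_data' as its user_data. */
#include <string.h>
#include <linux/io_uring.h>	/* struct io_uring_sqe, IORING_OP_POLL_REMOVE */

static void prep_poll_remove(struct io_uring_sqe *sqe, __u64 target_user_data,
			     __u64 my_user_data)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_POLL_REMOVE;
	sqe->fd = -1;			/* not inspected by io_poll_remove() */
	sqe->addr = target_user_data;	/* matched against req->user_data */
	sqe->user_data = my_user_data;	/* echoed back in the removal's CQE */
}
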
@@ -1093,6 +1347,12 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_FSYNC:
 		ret = io_fsync(req, s->sqe, force_nonblock);
 		break;
+	case IORING_OP_POLL_ADD:
+		ret = io_poll_add(req, s->sqe);
+		break;
+	case IORING_OP_POLL_REMOVE:
+		ret = io_poll_remove(req, s->sqe);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
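
As a usage sketch (not part of the patch): with the two opcodes wired into __io_submit_sqe(), userspace can arm a one-shot poll on a file descriptor and reap its completion through the normal CQ ring. The example below assumes liburing's setup and submission helpers and a liburing build that knows IORING_OP_POLL_ADD; error handling is trimmed. The poll mask uses the classic poll(2) bits, which io_poll_add() converts with demangle_poll(), and the completion's res field carries mangle_poll(mask), i.e. the triggered events in the same userspace layout.

/* Sketch: wait for stdin to become readable via IORING_OP_POLL_ADD.
 * Assumes liburing; build with -luring. */
#include <poll.h>
#include <stdio.h>
#include <unistd.h>
#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_poll_add(sqe, STDIN_FILENO, POLLIN);
	io_uring_sqe_set_data(sqe, (void *)0x1234);	/* user_data; also the value a
							 * later POLL_REMOVE would put
							 * into sqe->addr */

	io_uring_submit(&ring);
	if (io_uring_wait_cqe(&ring, &cqe) == 0) {
		/* cqe->res is mangle_poll(mask): the events that fired */
		printf("user_data=%llx events=0x%x\n",
		       (unsigned long long)cqe->user_data, (unsigned)cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}
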
@@ -2131,6 +2391,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	percpu_ref_kill(&ctx->refs);
 	mutex_unlock(&ctx->uring_lock);
 
+	io_poll_remove_all(ctx);
 	io_iopoll_reap_events(ctx);
 	wait_for_completion(&ctx->ctx_done);
 	io_ring_ctx_free(ctx);