@@ -270,22 +270,180 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 	return ret;
 }
 
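+/* state shared between bio submission and the completion handler */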
+struct blkdev_dio {
+	union {
+		struct kiocb		*iocb;
+		struct task_struct	*waiter;
+	};
+	size_t			size;
+	atomic_t		ref;
+	bool			multi_bio : 1;
+	bool			should_dirty : 1;
+	bool			is_sync : 1;
+	struct bio		bio;
+};
+
+static struct bio_set *blkdev_dio_pool __read_mostly;
+
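+/*
+ * Runs once per bio.  For a multi-bio request only the final completion
+ * (tracked by dio->ref) finishes the request: it either wakes the waiting
+ * task (sync) or calls ->ki_complete() (aio).
+ */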
+static void blkdev_bio_end_io(struct bio *bio)
+{
+	struct blkdev_dio *dio = bio->bi_private;
+	bool should_dirty = dio->should_dirty;
+
+	if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) {
+		if (bio->bi_error && !dio->bio.bi_error)
+			dio->bio.bi_error = bio->bi_error;
+	} else {
+		if (!dio->is_sync) {
+			struct kiocb *iocb = dio->iocb;
+			ssize_t ret = dio->bio.bi_error;
+
+			if (likely(!ret)) {
+				ret = dio->size;
+				iocb->ki_pos += ret;
+			}
+
+			dio->iocb->ki_complete(iocb, ret, 0);
+			bio_put(&dio->bio);
+		} else {
+			struct task_struct *waiter = dio->waiter;
+
+			WRITE_ONCE(dio->waiter, NULL);
+			wake_up_process(waiter);
+		}
+	}
+
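+	/*
+	 * Reads into user pages must be redirtied on completion; otherwise
+	 * just drop the page references taken at submission time.
+	 */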
+	if (should_dirty) {
+		bio_check_pages_dirty(bio);
+	} else {
+		struct bio_vec *bvec;
+		int i;
+
+		bio_for_each_segment_all(bvec, bio, i)
+			put_page(bvec->bv_page);
+		bio_put(bio);
+	}
+}
+
 static ssize_t
-blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = bdev_file_inode(file);
+	struct block_device *bdev = I_BDEV(inode);
+	unsigned blkbits = blksize_bits(bdev_logical_block_size(bdev));
+	struct blkdev_dio *dio;
+	struct bio *bio;
+	bool is_read = (iov_iter_rw(iter) == READ);
+	loff_t pos = iocb->ki_pos;
+	blk_qc_t qc = BLK_QC_T_NONE;
+	int ret;
+
+	if ((pos | iov_iter_alignment(iter)) & ((1 << blkbits) - 1))
+		return -EINVAL;
+
+	bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, blkdev_dio_pool);
+	bio_get(bio); /* extra ref for the completion handler */
+
+	dio = container_of(bio, struct blkdev_dio, bio);
+	dio->is_sync = is_sync_kiocb(iocb);
+	if (dio->is_sync)
+		dio->waiter = current;
+	else
+		dio->iocb = iocb;
+
+	dio->size = 0;
+	dio->multi_bio = false;
+	dio->should_dirty = is_read && (iter->type == ITER_IOVEC);
+
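+	/*
+	 * Carve the iterator into bios of at most BIO_MAX_PAGES pages each,
+	 * submitting each one as soon as it is full.
+	 */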
+	for (;;) {
+		bio->bi_bdev = bdev;
+		bio->bi_iter.bi_sector = pos >> blkbits;
+		bio->bi_private = dio;
+		bio->bi_end_io = blkdev_bio_end_io;
+
+		ret = bio_iov_iter_get_pages(bio, iter);
+		if (unlikely(ret)) {
+			bio->bi_error = ret;
+			bio_endio(bio);
+			break;
+		}
+
+		if (is_read) {
+			bio->bi_opf = REQ_OP_READ;
+			if (dio->should_dirty)
+				bio_set_pages_dirty(bio);
+		} else {
+			bio->bi_opf = dio_bio_write_op(iocb);
+			task_io_account_write(bio->bi_iter.bi_size);
+		}
+
+		dio->size += bio->bi_iter.bi_size;
+		pos += bio->bi_iter.bi_size;
+
+		nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
+		if (!nr_pages) {
+			qc = submit_bio(bio);
+			break;
+		}
+
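+		/* another bio is needed: track all in-flight bios in dio->ref */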
+		if (!dio->multi_bio) {
+			dio->multi_bio = true;
+			atomic_set(&dio->ref, 2);
+		} else {
+			atomic_inc(&dio->ref);
+		}
+
+		submit_bio(bio);
+		bio = bio_alloc(GFP_KERNEL, nr_pages);
+	}
+
+	if (!dio->is_sync)
+		return -EIOCBQUEUED;
+
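+	/*
+	 * Synchronous case: sleep until the completion handler clears
+	 * dio->waiter, polling the queue first for HIPRI requests.
+	 */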
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!READ_ONCE(dio->waiter))
+			break;
+
+		if (!(iocb->ki_flags & IOCB_HIPRI) ||
+		    !blk_mq_poll(bdev_get_queue(bdev), qc))
+			io_schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+
+	ret = dio->bio.bi_error;
+	if (likely(!ret)) {
+		ret = dio->size;
+		iocb->ki_pos += ret;
+	}
+
+	bio_put(&dio->bio);
+	return ret;
+}
+
+static ssize_t
+blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
 	int nr_pages;
 
 	nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);
 	if (!nr_pages)
 		return 0;
 	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
 		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
-	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
-				blkdev_get_block, NULL, NULL,
-				DIO_SKIP_DIO_COUNT);
+
+	return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES));
+}
+
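+/* bio_set with front padding so each bio comes embedded in a blkdev_dio */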
+static __init int blkdev_init(void)
+{
+	blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio));
+	if (!blkdev_dio_pool)
+		return -ENOMEM;
+	return 0;
 }
+module_init(blkdev_init);
 
 int __sync_blockdev(struct block_device *bdev, int wait)
 {