Skip to content

Commit 542ff7b

Browse files
Christoph Hellwig authored and Jens Axboe (axboe) committed
block: new direct I/O implementation
Similar to the simple fast path, but we now need a dio structure to track multiple-bio completions. It's basically a cut-down version of the new iomap-based direct I/O code for filesystems, but without all the logic to call into the filesystem for extent lookup or allocation, and without the complex I/O completion workqueue handler for AIO - instead we just use the FUA bit on the bios to ensure data is flushed to stable storage. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@fb.com>
1 parent 78250c0 commit 542ff7b

File tree

1 file changed

+162
-4
lines changed

1 file changed

+162
-4
lines changed

fs/block_dev.c

Lines changed: 162 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -270,22 +270,180 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
270270
return ret;
271271
}
272272

273+
/*
 * Per-request state for the multi-bio block device direct I/O path.
 *
 * Allocated as front padding of the request's first bio (see
 * blkdev_dio_pool / blkdev_init below), so @bio must remain the last
 * member: container_of() on the embedded bio recovers the dio.
 */
struct blkdev_dio {
	union {
		struct kiocb *iocb;		/* async: completed via ->ki_complete() */
		struct task_struct *waiter;	/* sync: submitter to wake on completion */
	};
	size_t size;			/* total bytes submitted across all bios */
	atomic_t ref;			/* outstanding bios; used only when multi_bio */
	bool multi_bio : 1;		/* more than one bio was submitted */
	bool should_dirty : 1;		/* read into user pages: redirty on completion */
	bool is_sync : 1;		/* synchronous (non-AIO) request */
	struct bio bio;			/* first bio of the request; must be last */
};
285+
286+
/* bio_set whose front padding holds the struct blkdev_dio ahead of the bio. */
static struct bio_set *blkdev_dio_pool __read_mostly;
287+
288+
/*
 * Completion handler for bios submitted by __blkdev_direct_IO().
 *
 * For a multi-bio request only the bio that drops dio->ref to zero
 * finishes the request; earlier completions merely record the first
 * error in the parent dio's embedded bio.  Finishing either calls
 * ->ki_complete() (async) or clears ->waiter and wakes the submitting
 * task (sync).  Afterwards the completed bio's pages are released.
 */
static void blkdev_bio_end_io(struct bio *bio)
{
	struct blkdev_dio *dio = bio->bi_private;
	/* dio may be freed below for async requests; sample this first */
	bool should_dirty = dio->should_dirty;

	if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) {
		/* Not the last bio: just latch the first error seen. */
		if (bio->bi_error && !dio->bio.bi_error)
			dio->bio.bi_error = bio->bi_error;
	} else {
		if (!dio->is_sync) {
			struct kiocb *iocb = dio->iocb;
			ssize_t ret = dio->bio.bi_error;

			/* Success: report bytes transferred and advance the file position. */
			if (likely(!ret)) {
				ret = dio->size;
				iocb->ki_pos += ret;
			}

			dio->iocb->ki_complete(iocb, ret, 0);
			/* Drop the submitter's extra reference on the parent bio. */
			bio_put(&dio->bio);
		} else {
			struct task_struct *waiter = dio->waiter;

			/* Clearing ->waiter is the "done" signal; pairs with
			 * READ_ONCE() in the submitter's wait loop. */
			WRITE_ONCE(dio->waiter, NULL);
			wake_up_process(waiter);
		}
	}

	if (should_dirty) {
		/* NOTE(review): assumes bio_check_pages_dirty() also releases
		 * the bio — confirm against the bio layer. */
		bio_check_pages_dirty(bio);
	} else {
		struct bio_vec *bvec;
		int i;

		/* Drop the page references taken by bio_iov_iter_get_pages(). */
		bio_for_each_segment_all(bvec, bio, i)
			put_page(bvec->bv_page);
		bio_put(bio);
	}
}
327+
273328
static ssize_t
274-
blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
329+
__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
275330
{
276331
struct file *file = iocb->ki_filp;
277332
struct inode *inode = bdev_file_inode(file);
333+
struct block_device *bdev = I_BDEV(inode);
334+
unsigned blkbits = blksize_bits(bdev_logical_block_size(bdev));
335+
struct blkdev_dio *dio;
336+
struct bio *bio;
337+
bool is_read = (iov_iter_rw(iter) == READ);
338+
loff_t pos = iocb->ki_pos;
339+
blk_qc_t qc = BLK_QC_T_NONE;
340+
int ret;
341+
342+
if ((pos | iov_iter_alignment(iter)) & ((1 << blkbits) - 1))
343+
return -EINVAL;
344+
345+
bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, blkdev_dio_pool);
346+
bio_get(bio); /* extra ref for the completion handler */
347+
348+
dio = container_of(bio, struct blkdev_dio, bio);
349+
dio->is_sync = is_sync_kiocb(iocb);
350+
if (dio->is_sync)
351+
dio->waiter = current;
352+
else
353+
dio->iocb = iocb;
354+
355+
dio->size = 0;
356+
dio->multi_bio = false;
357+
dio->should_dirty = is_read && (iter->type == ITER_IOVEC);
358+
359+
for (;;) {
360+
bio->bi_bdev = bdev;
361+
bio->bi_iter.bi_sector = pos >> blkbits;
362+
bio->bi_private = dio;
363+
bio->bi_end_io = blkdev_bio_end_io;
364+
365+
ret = bio_iov_iter_get_pages(bio, iter);
366+
if (unlikely(ret)) {
367+
bio->bi_error = ret;
368+
bio_endio(bio);
369+
break;
370+
}
371+
372+
if (is_read) {
373+
bio->bi_opf = REQ_OP_READ;
374+
if (dio->should_dirty)
375+
bio_set_pages_dirty(bio);
376+
} else {
377+
bio->bi_opf = dio_bio_write_op(iocb);
378+
task_io_account_write(bio->bi_iter.bi_size);
379+
}
380+
381+
dio->size += bio->bi_iter.bi_size;
382+
pos += bio->bi_iter.bi_size;
383+
384+
nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
385+
if (!nr_pages) {
386+
qc = submit_bio(bio);
387+
break;
388+
}
389+
390+
if (!dio->multi_bio) {
391+
dio->multi_bio = true;
392+
atomic_set(&dio->ref, 2);
393+
} else {
394+
atomic_inc(&dio->ref);
395+
}
396+
397+
submit_bio(bio);
398+
bio = bio_alloc(GFP_KERNEL, nr_pages);
399+
}
400+
401+
if (!dio->is_sync)
402+
return -EIOCBQUEUED;
403+
404+
for (;;) {
405+
set_current_state(TASK_UNINTERRUPTIBLE);
406+
if (!READ_ONCE(dio->waiter))
407+
break;
408+
409+
if (!(iocb->ki_flags & IOCB_HIPRI) ||
410+
!blk_mq_poll(bdev_get_queue(bdev), qc))
411+
io_schedule();
412+
}
413+
__set_current_state(TASK_RUNNING);
414+
415+
ret = dio->bio.bi_error;
416+
if (likely(!ret)) {
417+
ret = dio->size;
418+
iocb->ki_pos += ret;
419+
}
420+
421+
bio_put(&dio->bio);
422+
return ret;
423+
}
424+
425+
static ssize_t
426+
blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
427+
{
278428
int nr_pages;
279429

280430
nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);
281431
if (!nr_pages)
282432
return 0;
283433
if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
284434
return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
285-
return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
286-
blkdev_get_block, NULL, NULL,
287-
DIO_SKIP_DIO_COUNT);
435+
436+
return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES));
437+
}
438+
439+
static __init int blkdev_init(void)
440+
{
441+
blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio));
442+
if (!blkdev_dio_pool)
443+
return -ENOMEM;
444+
return 0;
288445
}
446+
module_init(blkdev_init);
289447

290448
int __sync_blockdev(struct block_device *bdev, int wait)
291449
{

0 commit comments

Comments
 (0)