
Commit ff6a929

Christoph Hellwig authored and dchinner committed
iomap: implement direct I/O
This adds a full fledged direct I/O implementation using the iomap interface. Full fledged in this case means all features are supported: AIO, vectored I/O, any iov_iter type including kernel pointers, bvecs and pipes, support for hole filling and async appending writes. It does not mean supporting all the warts of the old generic code. We expect i_rwsem to be held over the duration of the call, we expect to maintain i_dio_count ourselves, and we pass on any kind of mapping to the file system for now.

The algorithm used is very simple: we use iomap_apply to iterate over the range of the I/O, and then use the new bio_iov_iter_get_pages helper to lock down the user range for the size of the extent. bio_iov_iter_get_pages can currently lock down twice as many pages as the old direct I/O code did, which means we will have a better batch factor for everything but overwrites of badly fragmented files.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Tested-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
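
For context, a minimal caller sketch (not part of this commit, names are made up): a filesystem that already supplies a struct iomap_ops could route O_DIRECT reads through the new helper roughly as follows, taking i_rwsem as the helper expects.

/*
 * Illustrative only: "example_iomap_ops" and "example_file_read_iter" are
 * hypothetical names; a real filesystem (e.g. XFS) wires this up its own way.
 */
static struct iomap_ops example_iomap_ops;	/* filled in by the filesystem */

static ssize_t example_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (!iov_iter_count(to))
		return 0;			/* nothing to do */

	inode_lock_shared(inode);		/* iomap_dio_rw() asserts i_rwsem is held */
	ret = iomap_dio_rw(iocb, to, &example_iomap_ops, NULL);
	inode_unlock_shared(inode);

	return ret;
}

Passing NULL for end_io keeps completion handling entirely inside iomap_dio_complete(); a write path would typically pass a callback instead, as sketched after the diff below.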
1 parent ec1b826 commit ff6a929

File tree

2 files changed, +384 -0 lines changed


fs/iomap.c

Lines changed: 373 additions & 0 deletions
@@ -24,6 +24,7 @@
 #include <linux/uio.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/dax.h>
 #include "internal.h"

@@ -584,3 +585,375 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
 	return 0;
 }
 EXPORT_SYMBOL_GPL(iomap_fiemap);
+
+/*
+ * Private flags for iomap_dio, must not overlap with the public ones in
+ * iomap.h:
+ */
+#define IOMAP_DIO_WRITE		(1 << 30)
+#define IOMAP_DIO_DIRTY		(1 << 31)
+
+struct iomap_dio {
+	struct kiocb		*iocb;
+	iomap_dio_end_io_t	*end_io;
+	loff_t			i_size;
+	loff_t			size;
+	atomic_t		ref;
+	unsigned		flags;
+	int			error;
+
+	union {
+		/* used during submission and for synchronous completion: */
+		struct {
+			struct iov_iter		*iter;
+			struct task_struct	*waiter;
+			struct request_queue	*last_queue;
+			blk_qc_t		cookie;
+		} submit;
+
+		/* used for aio completion: */
+		struct {
+			struct work_struct	work;
+		} aio;
+	};
+};
+
+static ssize_t iomap_dio_complete(struct iomap_dio *dio)
+{
+	struct kiocb *iocb = dio->iocb;
+	ssize_t ret;
+
+	if (dio->end_io) {
+		ret = dio->end_io(iocb,
+				dio->error ? dio->error : dio->size,
+				dio->flags);
+	} else {
+		ret = dio->error;
+	}
+
+	if (likely(!ret)) {
+		ret = dio->size;
+		/* check for short read */
+		if (iocb->ki_pos + ret > dio->i_size &&
+		    !(dio->flags & IOMAP_DIO_WRITE))
+			ret = dio->i_size - iocb->ki_pos;
+		iocb->ki_pos += ret;
+	}
+
+	inode_dio_end(file_inode(iocb->ki_filp));
+	kfree(dio);
+
+	return ret;
+}
+
+static void iomap_dio_complete_work(struct work_struct *work)
+{
+	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
+	struct kiocb *iocb = dio->iocb;
+	bool is_write = (dio->flags & IOMAP_DIO_WRITE);
+	ssize_t ret;
+
+	ret = iomap_dio_complete(dio);
+	if (is_write && ret > 0)
+		ret = generic_write_sync(iocb, ret);
+	iocb->ki_complete(iocb, ret, 0);
+}
+
+/*
+ * Set an error in the dio if none is set yet.  We have to use cmpxchg
+ * as the submission context and the completion context(s) can race to
+ * update the error.
+ */
+static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
+{
+	cmpxchg(&dio->error, 0, ret);
+}
+
+static void iomap_dio_bio_end_io(struct bio *bio)
+{
+	struct iomap_dio *dio = bio->bi_private;
+	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+
+	if (bio->bi_error)
+		iomap_dio_set_error(dio, bio->bi_error);
+
+	if (atomic_dec_and_test(&dio->ref)) {
+		if (is_sync_kiocb(dio->iocb)) {
+			struct task_struct *waiter = dio->submit.waiter;
+
+			WRITE_ONCE(dio->submit.waiter, NULL);
+			wake_up_process(waiter);
+		} else if (dio->flags & IOMAP_DIO_WRITE) {
+			struct inode *inode = file_inode(dio->iocb->ki_filp);
+
+			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
+		} else {
+			iomap_dio_complete_work(&dio->aio.work);
+		}
+	}
+
+	if (should_dirty) {
+		bio_check_pages_dirty(bio);
+	} else {
+		struct bio_vec *bvec;
+		int i;
+
+		bio_for_each_segment_all(bvec, bio, i)
+			put_page(bvec->bv_page);
+		bio_put(bio);
+	}
+}
+
+static blk_qc_t
+iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
+		unsigned len)
+{
+	struct page *page = ZERO_PAGE(0);
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_KERNEL, 1);
+	bio->bi_bdev = iomap->bdev;
+	bio->bi_iter.bi_sector =
+		iomap->blkno + ((pos - iomap->offset) >> 9);
+	bio->bi_private = dio;
+	bio->bi_end_io = iomap_dio_bio_end_io;
+
+	get_page(page);
+	if (bio_add_page(bio, page, len, 0) != len)
+		BUG();
+	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_ODIRECT);
+
+	atomic_inc(&dio->ref);
+	return submit_bio(bio);
+}
+
+static loff_t
+iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
+		void *data, struct iomap *iomap)
+{
+	struct iomap_dio *dio = data;
+	unsigned blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
+	unsigned fs_block_size = (1 << inode->i_blkbits), pad;
+	unsigned align = iov_iter_alignment(dio->submit.iter);
+	struct iov_iter iter;
+	struct bio *bio;
+	bool need_zeroout = false;
+	int nr_pages, ret;
+
+	if ((pos | length | align) & ((1 << blkbits) - 1))
+		return -EINVAL;
+
+	switch (iomap->type) {
+	case IOMAP_HOLE:
+		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
+			return -EIO;
+		/*FALLTHRU*/
+	case IOMAP_UNWRITTEN:
+		if (!(dio->flags & IOMAP_DIO_WRITE)) {
+			iov_iter_zero(length, dio->submit.iter);
+			dio->size += length;
+			return length;
+		}
+		dio->flags |= IOMAP_DIO_UNWRITTEN;
+		need_zeroout = true;
+		break;
+	case IOMAP_MAPPED:
+		if (iomap->flags & IOMAP_F_SHARED)
+			dio->flags |= IOMAP_DIO_COW;
+		if (iomap->flags & IOMAP_F_NEW)
+			need_zeroout = true;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return -EIO;
+	}
+
+	/*
+	 * Operate on a partial iter trimmed to the extent we were called for.
+	 * We'll update the iter in the dio once we're done with this extent.
+	 */
+	iter = *dio->submit.iter;
+	iov_iter_truncate(&iter, length);
+
+	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
+	if (nr_pages <= 0)
+		return nr_pages;
+
+	if (need_zeroout) {
+		/* zero out from the start of the block to the write offset */
+		pad = pos & (fs_block_size - 1);
+		if (pad)
+			iomap_dio_zero(dio, iomap, pos - pad, pad);
+	}
+
+	do {
+		if (dio->error)
+			return 0;
+
+		bio = bio_alloc(GFP_KERNEL, nr_pages);
+		bio->bi_bdev = iomap->bdev;
+		bio->bi_iter.bi_sector =
+			iomap->blkno + ((pos - iomap->offset) >> 9);
+		bio->bi_private = dio;
+		bio->bi_end_io = iomap_dio_bio_end_io;
+
+		ret = bio_iov_iter_get_pages(bio, &iter);
+		if (unlikely(ret)) {
+			bio_put(bio);
+			return ret;
+		}
+
+		if (dio->flags & IOMAP_DIO_WRITE) {
+			bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_ODIRECT);
+			task_io_account_write(bio->bi_iter.bi_size);
+		} else {
+			bio_set_op_attrs(bio, REQ_OP_READ, 0);
+			if (dio->flags & IOMAP_DIO_DIRTY)
+				bio_set_pages_dirty(bio);
+		}
+
+		dio->size += bio->bi_iter.bi_size;
+		pos += bio->bi_iter.bi_size;
+
+		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
+
+		atomic_inc(&dio->ref);
+
+		dio->submit.last_queue = bdev_get_queue(iomap->bdev);
+		dio->submit.cookie = submit_bio(bio);
+	} while (nr_pages);
+
+	if (need_zeroout) {
+		/* zero out from the end of the write to the end of the block */
+		pad = pos & (fs_block_size - 1);
+		if (pad)
+			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
+	}
+
+	iov_iter_advance(dio->submit.iter, length);
+	return length;
+}
+
+ssize_t
+iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops,
+		iomap_dio_end_io_t end_io)
+{
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
+	struct inode *inode = file_inode(iocb->ki_filp);
+	size_t count = iov_iter_count(iter);
+	loff_t pos = iocb->ki_pos, end = iocb->ki_pos + count - 1, ret = 0;
+	unsigned int flags = IOMAP_DIRECT;
+	struct blk_plug plug;
+	struct iomap_dio *dio;
+
+	lockdep_assert_held(&inode->i_rwsem);
+
+	if (!count)
+		return 0;
+
+	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
+	if (!dio)
+		return -ENOMEM;
+
+	dio->iocb = iocb;
+	atomic_set(&dio->ref, 1);
+	dio->size = 0;
+	dio->i_size = i_size_read(inode);
+	dio->end_io = end_io;
+	dio->error = 0;
+	dio->flags = 0;
+
+	dio->submit.iter = iter;
+	if (is_sync_kiocb(iocb)) {
+		dio->submit.waiter = current;
+		dio->submit.cookie = BLK_QC_T_NONE;
+		dio->submit.last_queue = NULL;
+	}
+
+	if (iov_iter_rw(iter) == READ) {
+		if (pos >= dio->i_size)
+			goto out_free_dio;
+
+		if (iter->type == ITER_IOVEC)
+			dio->flags |= IOMAP_DIO_DIRTY;
+	} else {
+		dio->flags |= IOMAP_DIO_WRITE;
+		flags |= IOMAP_WRITE;
+	}
+
+	if (mapping->nrpages) {
+		ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
+		if (ret)
+			goto out_free_dio;
+
+		ret = invalidate_inode_pages2_range(mapping,
+				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+		WARN_ON_ONCE(ret);
+		ret = 0;
+	}
+
+	inode_dio_begin(inode);
+
+	blk_start_plug(&plug);
+	do {
+		ret = iomap_apply(inode, pos, count, flags, ops, dio,
+				iomap_dio_actor);
+		if (ret <= 0) {
+			/* magic error code to fall back to buffered I/O */
+			if (ret == -ENOTBLK)
+				ret = 0;
+			break;
+		}
+		pos += ret;
+	} while ((count = iov_iter_count(iter)) > 0);
+	blk_finish_plug(&plug);
+
+	if (ret < 0)
+		iomap_dio_set_error(dio, ret);
+
+	if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
+			!inode->i_sb->s_dio_done_wq) {
+		ret = sb_init_dio_done_wq(inode->i_sb);
+		if (ret < 0)
+			iomap_dio_set_error(dio, ret);
+	}
+
+	if (!atomic_dec_and_test(&dio->ref)) {
+		if (!is_sync_kiocb(iocb))
+			return -EIOCBQUEUED;
+
+		for (;;) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (!READ_ONCE(dio->submit.waiter))
+				break;
+
+			if (!(iocb->ki_flags & IOCB_HIPRI) ||
+			    !dio->submit.last_queue ||
+			    !blk_poll(dio->submit.last_queue,
+					dio->submit.cookie))
+				io_schedule();
+		}
+		__set_current_state(TASK_RUNNING);
+	}
+
+	/*
+	 * Try again to invalidate clean pages which might have been cached by
+	 * non-direct readahead, or faulted in by get_user_pages() if the source
+	 * of the write was an mmap'ed region of the file we're writing.  Either
+	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
+	 * this invalidation fails, tough, the write still worked...
+	 */
+	if (iov_iter_rw(iter) == WRITE && mapping->nrpages) {
+		ret = invalidate_inode_pages2_range(mapping,
+				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+		WARN_ON_ONCE(ret);
+	}
+
+	return iomap_dio_complete(dio);
+
+out_free_dio:
+	kfree(dio);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_dio_rw);
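
As an aside (not part of this commit): the end_io hook's exact prototype lives in the include/linux/iomap.h half of this change, which is not shown here, but from the call site in iomap_dio_complete() it receives the iocb, the transferred byte count or a negative error, and the dio flags. A hypothetical callback that converts unwritten extents after a successful write might look roughly like the sketch below; example_dio_end_io and example_convert_unwritten are made-up names.

/* Hypothetical completion callback; prototype assumed from the call site above. */
static int example_convert_unwritten(struct inode *inode, loff_t offset, ssize_t size);

static int example_dio_end_io(struct kiocb *iocb, ssize_t size, unsigned flags)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (size <= 0)
		return size;		/* pass errors (or a zero-length I/O) straight back */

	/*
	 * ki_pos has not been advanced yet (iomap_dio_complete does that after
	 * calling end_io), so it is still the start offset of this I/O.
	 * iomap_dio_actor sets IOMAP_DIO_UNWRITTEN for writes over unwritten extents.
	 */
	if (flags & IOMAP_DIO_UNWRITTEN)
		return example_convert_unwritten(inode, iocb->ki_pos, size);

	return 0;
}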
