Skip to content

Commit b8a6176

Browse files
jankara authored and djbw committed
ext4: Support for synchronous DAX faults
We return the IOMAP_F_DIRTY flag from ext4_iomap_begin() when asked to prepare blocks for writing and the inode has some uncommitted metadata changes. In the fault handler ext4_dax_fault() we then detect this case (through the VM_FAULT_NEEDDSYNC return value) and call the helper dax_finish_sync_fault() to flush metadata changes and insert the page table entry. Note that this will also dirty the corresponding radix tree entry, which is what we want - fsync(2) will still provide data integrity guarantees for applications not using userspace flushing. And applications using userspace flushing can avoid calling fsync(2) and thus avoid the performance overhead.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
1 parent 497f692 commit b8a6176

File tree

4 files changed

+47
-1
lines changed

4 files changed

+47
-1
lines changed

fs/ext4/file.c

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include <linux/quotaops.h>
2727
#include <linux/pagevec.h>
2828
#include <linux/uio.h>
29+
#include <linux/mman.h>
2930
#include "ext4.h"
3031
#include "ext4_jbd2.h"
3132
#include "xattr.h"
@@ -295,6 +296,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
295296
*/
296297
bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
297298
(vmf->vma->vm_flags & VM_SHARED);
299+
pfn_t pfn;
298300

299301
if (write) {
300302
sb_start_pagefault(sb);
@@ -310,9 +312,12 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
310312
} else {
311313
down_read(&EXT4_I(inode)->i_mmap_sem);
312314
}
313-
result = dax_iomap_fault(vmf, pe_size, NULL, &ext4_iomap_ops);
315+
result = dax_iomap_fault(vmf, pe_size, &pfn, &ext4_iomap_ops);
314316
if (write) {
315317
ext4_journal_stop(handle);
318+
/* Handling synchronous page fault? */
319+
if (result & VM_FAULT_NEEDDSYNC)
320+
result = dax_finish_sync_fault(vmf, pe_size, pfn);
316321
up_read(&EXT4_I(inode)->i_mmap_sem);
317322
sb_end_pagefault(sb);
318323
} else {
@@ -350,6 +355,13 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
350355
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
351356
return -EIO;
352357

358+
/*
359+
* We don't support synchronous mappings for non-DAX files. At least
360+
* until someone comes up with a sensible use case.
361+
*/
362+
if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
363+
return -EOPNOTSUPP;
364+
353365
file_accessed(file);
354366
if (IS_DAX(file_inode(file))) {
355367
vma->vm_ops = &ext4_dax_vm_ops;
@@ -719,6 +731,7 @@ const struct file_operations ext4_file_operations = {
719731
.compat_ioctl = ext4_compat_ioctl,
720732
#endif
721733
.mmap = ext4_file_mmap,
734+
.mmap_supported_flags = MAP_SYNC,
722735
.open = ext4_file_open,
723736
.release = ext4_release_file,
724737
.fsync = ext4_sync_file,

fs/ext4/inode.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3394,6 +3394,19 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
33943394
}
33953395

33963396
#ifdef CONFIG_FS_DAX
3397+
/*
 * Report whether @inode has metadata changes that are not yet committed.
 * ext4_iomap_begin() uses the result to set IOMAP_F_DIRTY on write
 * mappings.
 *
 * With a journal, the inode is dirty while the transaction recorded in
 * its i_datasync_tid has not committed.  Without a journal, it is dirty
 * when the mapping's private_list is non-empty or I_DIRTY_DATASYNC is
 * set in i_state.
 */
static bool ext4_inode_datasync_dirty(struct inode *inode)
3398+
{
3399+
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
3400+
3401+
/* Journalled case: dirty until the i_datasync_tid transaction commits. */
if (journal)
3402+
return !jbd2_transaction_committed(journal,
3403+
EXT4_I(inode)->i_datasync_tid);
3404+
/* No journal: any metadata buffers to write? */
3405+
if (!list_empty(&inode->i_mapping->private_list))
3406+
return true;
3407+
/* Otherwise the answer is the inode's I_DIRTY_DATASYNC state bit. */
return inode->i_state & I_DIRTY_DATASYNC;
3408+
}
3409+
33973410
static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
33983411
unsigned flags, struct iomap *iomap)
33993412
{
@@ -3466,6 +3479,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
34663479
}
34673480

34683481
iomap->flags = 0;
3482+
if ((flags & IOMAP_WRITE) && ext4_inode_datasync_dirty(inode))
3483+
iomap->flags |= IOMAP_F_DIRTY;
34693484
iomap->bdev = inode->i_sb->s_bdev;
34703485
iomap->dax_dev = sbi->s_daxdev;
34713486
iomap->offset = first_block << blkbits;

fs/jbd2/journal.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -738,6 +738,23 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
738738
return err;
739739
}
740740

741+
/*
 * Return 1 when transaction with given tid has already committed.
 *
 * The check compares @tid only against the journal's current running and
 * committing transactions, with j_state_lock held for reading.  It returns
 * 0 if @tid matches either of them and 1 for every other tid.
 */
742+
int jbd2_transaction_committed(journal_t *journal, tid_t tid)
743+
{
744+
int ret = 1;
745+
746+
read_lock(&journal->j_state_lock);
747+
/* @tid is the running transaction: not committed. */
if (journal->j_running_transaction &&
748+
journal->j_running_transaction->t_tid == tid)
749+
ret = 0;
750+
/* @tid is the transaction being committed right now: not committed. */
if (journal->j_committing_transaction &&
751+
journal->j_committing_transaction->t_tid == tid)
752+
ret = 0;
753+
read_unlock(&journal->j_state_lock);
754+
return ret;
755+
}
756+
EXPORT_SYMBOL(jbd2_transaction_committed);
757+
741758
/*
742759
* When this function returns the transaction corresponding to tid
743760
* will be completed. If the transaction is currently running, start

include/linux/jbd2.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1367,6 +1367,7 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid);
13671367
int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
13681368
int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
13691369
int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
1370+
int jbd2_transaction_committed(journal_t *journal, tid_t tid);
13701371
int jbd2_complete_transaction(journal_t *journal, tid_t tid);
13711372
int jbd2_log_do_checkpoint(journal_t *journal);
13721373
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);

0 commit comments

Comments
 (0)