Skip to content

Commit e7c5809

Browse files
mjkravetztorvalds
authored andcommitted
hugetlbfs: revert "Use i_mmap_rwsem to fix page fault/truncate race"
This reverts c86aa7b The reverted commit caused ABBA deadlocks when file migration raced with file eviction for specific hugetlbfs files. This was discovered with a modified version of the LTP move_pages12 test. The purpose of the reverted patch was to close a long existing race between hugetlbfs file truncation and page faults. After more analysis of the patch and impacted code, it was determined that i_mmap_rwsem can not be used for all required synchronization. Therefore, revert this patch while working an another approach to the underlying issue. Link: http://lkml.kernel.org/r/20190103235452.29335-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com> Reported-by: Jan Stancek <jstancek@redhat.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Hugh Dickins <hughd@google.com> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Prakash Sangappa <prakash.sangappa@oracle.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 8ab88c7 commit e7c5809

File tree

2 files changed

+44
-38
lines changed

2 files changed

+44
-38
lines changed

fs/hugetlbfs/inode.c

Lines changed: 33 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -383,16 +383,17 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
383383
* truncation is indicated by end of range being LLONG_MAX
384384
* In this case, we first scan the range and release found pages.
385385
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
386-
* maps and global counts.
386+
* maps and global counts. Page faults can not race with truncation
387+
* in this routine. hugetlb_no_page() prevents page faults in the
388+
* truncated range. It checks i_size before allocation, and again after
389+
* with the page table lock for the page held. The same lock must be
390+
* acquired to unmap a page.
387391
* hole punch is indicated if end is not LLONG_MAX
388392
* In the hole punch case we scan the range and release found pages.
389393
* Only when releasing a page is the associated region/reserv map
390394
* deleted. The region/reserv map for ranges without associated
391-
* pages are not modified.
392-
*
393-
* Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
394-
* races with page faults.
395-
*
395+
* pages are not modified. Page faults can race with hole punch.
396+
* This is indicated if we find a mapped page.
396397
* Note: If the passed end of range value is beyond the end of file, but
397398
* not LLONG_MAX this routine still performs a hole punch operation.
398399
*/
@@ -422,14 +423,32 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
422423

423424
for (i = 0; i < pagevec_count(&pvec); ++i) {
424425
struct page *page = pvec.pages[i];
426+
u32 hash;
425427

426428
index = page->index;
429+
hash = hugetlb_fault_mutex_hash(h, current->mm,
430+
&pseudo_vma,
431+
mapping, index, 0);
432+
mutex_lock(&hugetlb_fault_mutex_table[hash]);
433+
427434
/*
428-
* A mapped page is impossible as callers should unmap
429-
* all references before calling. And, i_mmap_rwsem
430-
* prevents the creation of additional mappings.
435+
* If page is mapped, it was faulted in after being
436+
* unmapped in caller. Unmap (again) now after taking
437+
* the fault mutex. The mutex will prevent faults
438+
* until we finish removing the page.
439+
*
440+
* This race can only happen in the hole punch case.
441+
* Getting here in a truncate operation is a bug.
431442
*/
432-
VM_BUG_ON(page_mapped(page));
443+
if (unlikely(page_mapped(page))) {
444+
BUG_ON(truncate_op);
445+
446+
i_mmap_lock_write(mapping);
447+
hugetlb_vmdelete_list(&mapping->i_mmap,
448+
index * pages_per_huge_page(h),
449+
(index + 1) * pages_per_huge_page(h));
450+
i_mmap_unlock_write(mapping);
451+
}
433452

434453
lock_page(page);
435454
/*
@@ -451,6 +470,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
451470
}
452471

453472
unlock_page(page);
473+
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
454474
}
455475
huge_pagevec_release(&pvec);
456476
cond_resched();
@@ -462,20 +482,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
462482

463483
static void hugetlbfs_evict_inode(struct inode *inode)
464484
{
465-
struct address_space *mapping = inode->i_mapping;
466485
struct resv_map *resv_map;
467486

468-
/*
469-
* The vfs layer guarantees that there are no other users of this
470-
* inode. Therefore, it would be safe to call remove_inode_hugepages
471-
* without holding i_mmap_rwsem. We acquire and hold here to be
472-
* consistent with other callers. Since there will be no contention
473-
* on the semaphore, overhead is negligible.
474-
*/
475-
i_mmap_lock_write(mapping);
476487
remove_inode_hugepages(inode, 0, LLONG_MAX);
477-
i_mmap_unlock_write(mapping);
478-
479488
resv_map = (struct resv_map *)inode->i_mapping->private_data;
480489
/* root inode doesn't have the resv_map, so we should check it */
481490
if (resv_map)
@@ -496,8 +505,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
496505
i_mmap_lock_write(mapping);
497506
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
498507
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
499-
remove_inode_hugepages(inode, offset, LLONG_MAX);
500508
i_mmap_unlock_write(mapping);
509+
remove_inode_hugepages(inode, offset, LLONG_MAX);
501510
return 0;
502511
}
503512

@@ -531,8 +540,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
531540
hugetlb_vmdelete_list(&mapping->i_mmap,
532541
hole_start >> PAGE_SHIFT,
533542
hole_end >> PAGE_SHIFT);
534-
remove_inode_hugepages(inode, hole_start, hole_end);
535543
i_mmap_unlock_write(mapping);
544+
remove_inode_hugepages(inode, hole_start, hole_end);
536545
inode_unlock(inode);
537546
}
538547

@@ -615,11 +624,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
615624
/* addr is the offset within the file (zero based) */
616625
addr = index * hpage_size;
617626

618-
/*
619-
* fault mutex taken here, protects against fault path
620-
* and hole punch. inode_lock previously taken protects
621-
* against truncation.
622-
*/
627+
/* mutex taken here, fault path and hole punch */
623628
hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
624629
index, addr);
625630
mutex_lock(&hugetlb_fault_mutex_table[hash]);

mm/hugetlb.c

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3755,16 +3755,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
37553755
}
37563756

37573757
/*
3758-
* We can not race with truncation due to holding i_mmap_rwsem.
3759-
* Check once here for faults beyond end of file.
3758+
* Use page lock to guard against racing truncation
3759+
* before we get page_table_lock.
37603760
*/
3761-
size = i_size_read(mapping->host) >> huge_page_shift(h);
3762-
if (idx >= size)
3763-
goto out;
3764-
37653761
retry:
37663762
page = find_lock_page(mapping, idx);
37673763
if (!page) {
3764+
size = i_size_read(mapping->host) >> huge_page_shift(h);
3765+
if (idx >= size)
3766+
goto out;
3767+
37683768
/*
37693769
* Check for page in userfault range
37703770
*/
@@ -3854,6 +3854,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
38543854
}
38553855

38563856
ptl = huge_pte_lock(h, mm, ptep);
3857+
size = i_size_read(mapping->host) >> huge_page_shift(h);
3858+
if (idx >= size)
3859+
goto backout;
38573860

38583861
ret = 0;
38593862
if (!huge_pte_none(huge_ptep_get(ptep)))
@@ -3956,10 +3959,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
39563959

39573960
/*
39583961
* Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
3959-
* until finished with ptep. This serves two purposes:
3960-
* 1) It prevents huge_pmd_unshare from being called elsewhere
3961-
* and making the ptep no longer valid.
3962-
* 2) It synchronizes us with file truncation.
3962+
* until finished with ptep. This prevents huge_pmd_unshare from
3963+
* being called elsewhere and making the ptep no longer valid.
39633964
*
39643965
* ptep could have already be assigned via huge_pte_offset. That
39653966
* is OK, as huge_pte_alloc will return the same value unless

0 commit comments

Comments
 (0)