
Commit 785373b

Revert "rmap: do not call mmu_notifier_invalidate_page() under ptl"
This reverts commit aac2fea. It turns out that that patch was complete
and utter garbage, and broke KVM, resulting in odd oopses.

Quoting Andrea Arcangeli:
 "The aforementioned commit has 3 bugs.

  1) mmu_notifier_invalidate_range cannot be used in replacement of
     mmu_notifier_invalidate_range_start/end. For KVM
     mmu_notifier_invalidate_range is a noop and rightfully so. A MMU
     notifier implementation has to implement either ->invalidate_range
     method or the invalidate_range_start/end methods, not both. And if
     you implement invalidate_range_start/end like KVM is forced to do,
     calling mmu_notifier_invalidate_range in common code is a noop for
     KVM.

     For those MMU notifiers that can get away only implementing
     ->invalidate_range, the ->invalidate_range is implicitly called by
     mmu_notifier_invalidate_range_end(). And only those secondary MMUs
     that share the same pagetable with the primary MMU (like AMD
     iommuv2) can get away only implementing ->invalidate_range.

     So all cases (THP on/off) are broken right now. To fix this is
     enough to replace mmu_notifier_invalidate_range with
     mmu_notifier_invalidate_range_start;mmu_notifier_invalidate_range_end.
     Either that or call multiple mmu_notifier_invalidate_page like
     before.

  2) address + (1UL << compound_order(page) is buggy, it should be
     PAGE_SIZE << compound_order(page), it's bytes not pages, 2M not 512.

  3) The whole invalidate_range thing was an attempt to call a single
     invalidate while walking multiple 4k ptes that maps the same THP
     (after a pmd virtual split without physical compound page THP
     split). It's unclear if the rmap_walk will always provide an
     address that is 2M aligned as parameter to try_to_unmap_one, in
     presence of THP. I think it needs also an
     address &= (PAGE_SIZE << compound_order(page)) - 1 to be safe"

In general, we should stop making excuses for horrible MMU notifier
users. It's much more important that the core VM is sane and safe, than
letting MMU notifiers sleep. So if some MMU notifier is sleeping under
a spinlock, we need to fix the notifier, not try to make excuses for
that garbage in the core VM.

Reported-and-tested-by: Bernhard Held <berny156@gmx.de>
Reported-and-tested-by: Adam Borowski <kilobyte@angband.pl>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: axie <axie@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
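
To make bugs 2 and 3 above concrete, here is a minimal userspace sketch
(not kernel code; it assumes a 4 KiB page size, a 2 MiB THP with
compound_order 9, and a made-up address) of the byte-sized, THP-aligned
range an invalidation of the whole huge page would actually need:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL    /* assumed 4 KiB base page size */

    int main(void)
    {
        unsigned long order = 9;                         /* 2 MiB THP = 512 x 4 KiB pages */
        unsigned long thp_size = PAGE_SIZE << order;     /* 0x200000 bytes = 2 MiB */
        unsigned long address = 0x7f123000UL;            /* hypothetical pte address inside the THP */

        unsigned long buggy_len = 1UL << order;          /* 512 *bytes*, less than one page */
        unsigned long start = address & ~(thp_size - 1); /* align down to the 2 MiB boundary */

        printf("buggy range : [%#lx, %#lx)  %lu bytes\n",
               address, address + buggy_len, buggy_len);
        printf("needed range: [%#lx, %#lx)  %lu bytes\n",
               start, start + thp_size, thp_size);
        return 0;
    }

With those assumed values, 1UL << 9 is only 512 bytes, so the code being
reverted invalidated less than a single 4 KiB page, while the THP
actually spans PAGE_SIZE << 9 = 2 MiB starting at the aligned-down
address.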
1 parent 9c3a815 commit 785373b

File tree

1 file changed: 22 additions and 30 deletions

mm/rmap.c

Lines changed: 22 additions & 30 deletions
@@ -888,22 +888,22 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
 		.flags = PVMW_SYNC,
 	};
 	int *cleaned = arg;
-	bool invalidation_needed = false;
 
 	while (page_vma_mapped_walk(&pvmw)) {
 		int ret = 0;
+		address = pvmw.address;
 		if (pvmw.pte) {
 			pte_t entry;
 			pte_t *pte = pvmw.pte;
 
 			if (!pte_dirty(*pte) && !pte_write(*pte))
 				continue;
 
-			flush_cache_page(vma, pvmw.address, pte_pfn(*pte));
-			entry = ptep_clear_flush(vma, pvmw.address, pte);
+			flush_cache_page(vma, address, pte_pfn(*pte));
+			entry = ptep_clear_flush(vma, address, pte);
 			entry = pte_wrprotect(entry);
 			entry = pte_mkclean(entry);
-			set_pte_at(vma->vm_mm, pvmw.address, pte, entry);
+			set_pte_at(vma->vm_mm, address, pte, entry);
 			ret = 1;
 		} else {
 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
@@ -913,11 +913,11 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
 			if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
 				continue;
 
-			flush_cache_page(vma, pvmw.address, page_to_pfn(page));
-			entry = pmdp_huge_clear_flush(vma, pvmw.address, pmd);
+			flush_cache_page(vma, address, page_to_pfn(page));
+			entry = pmdp_huge_clear_flush(vma, address, pmd);
 			entry = pmd_wrprotect(entry);
 			entry = pmd_mkclean(entry);
-			set_pmd_at(vma->vm_mm, pvmw.address, pmd, entry);
+			set_pmd_at(vma->vm_mm, address, pmd, entry);
 			ret = 1;
 #else
 			/* unexpected pmd-mapped page? */
@@ -926,16 +926,11 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
 		}
 
 		if (ret) {
+			mmu_notifier_invalidate_page(vma->vm_mm, address);
 			(*cleaned)++;
-			invalidation_needed = true;
 		}
 	}
 
-	if (invalidation_needed) {
-		mmu_notifier_invalidate_range(vma->vm_mm, address,
-				address + (1UL << compound_order(page)));
-	}
-
 	return true;
 }
 
@@ -1328,7 +1323,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	};
 	pte_t pteval;
 	struct page *subpage;
-	bool ret = true, invalidation_needed = false;
+	bool ret = true;
 	enum ttu_flags flags = (enum ttu_flags)arg;
 
 	/* munlock has nothing to gain from examining un-locked vmas */
@@ -1368,9 +1363,11 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		VM_BUG_ON_PAGE(!pvmw.pte, page);
 
 		subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+		address = pvmw.address;
+
 
 		if (!(flags & TTU_IGNORE_ACCESS)) {
-			if (ptep_clear_flush_young_notify(vma, pvmw.address,
+			if (ptep_clear_flush_young_notify(vma, address,
						pvmw.pte)) {
 				ret = false;
 				page_vma_mapped_walk_done(&pvmw);
@@ -1379,7 +1376,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		}
 
 		/* Nuke the page table entry. */
-		flush_cache_page(vma, pvmw.address, pte_pfn(*pvmw.pte));
+		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
 		if (should_defer_flush(mm, flags)) {
 			/*
 			 * We clear the PTE but do not flush so potentially
@@ -1389,12 +1386,11 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			 * transition on a cached TLB entry is written through
 			 * and traps if the PTE is unmapped.
 			 */
-			pteval = ptep_get_and_clear(mm, pvmw.address,
-						    pvmw.pte);
+			pteval = ptep_get_and_clear(mm, address, pvmw.pte);
 
 			set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
 		} else {
-			pteval = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
+			pteval = ptep_clear_flush(vma, address, pvmw.pte);
 		}
 
 		/* Move the dirty bit to the page. Now the pte is gone. */
@@ -1409,12 +1405,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			if (PageHuge(page)) {
 				int nr = 1 << compound_order(page);
 				hugetlb_count_sub(nr, mm);
-				set_huge_swap_pte_at(mm, pvmw.address,
+				set_huge_swap_pte_at(mm, address,
						     pvmw.pte, pteval,
						     vma_mmu_pagesize(vma));
 			} else {
 				dec_mm_counter(mm, mm_counter(page));
-				set_pte_at(mm, pvmw.address, pvmw.pte, pteval);
+				set_pte_at(mm, address, pvmw.pte, pteval);
 			}
 
 		} else if (pte_unused(pteval)) {
@@ -1438,7 +1434,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			swp_pte = swp_entry_to_pte(entry);
 			if (pte_soft_dirty(pteval))
 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
-			set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+			set_pte_at(mm, address, pvmw.pte, swp_pte);
 		} else if (PageAnon(page)) {
 			swp_entry_t entry = { .val = page_private(subpage) };
 			pte_t swp_pte;
@@ -1464,15 +1460,15 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 				 * If the page was redirtied, it cannot be
 				 * discarded. Remap the page to page table.
 				 */
-				set_pte_at(mm, pvmw.address, pvmw.pte, pteval);
+				set_pte_at(mm, address, pvmw.pte, pteval);
 				SetPageSwapBacked(page);
 				ret = false;
 				page_vma_mapped_walk_done(&pvmw);
 				break;
 			}
 
 			if (swap_duplicate(entry) < 0) {
-				set_pte_at(mm, pvmw.address, pvmw.pte, pteval);
+				set_pte_at(mm, address, pvmw.pte, pteval);
 				ret = false;
 				page_vma_mapped_walk_done(&pvmw);
 				break;
@@ -1488,18 +1484,14 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			swp_pte = swp_entry_to_pte(entry);
 			if (pte_soft_dirty(pteval))
 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
-			set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+			set_pte_at(mm, address, pvmw.pte, swp_pte);
 		} else
 			dec_mm_counter(mm, mm_counter_file(page));
 discard:
 		page_remove_rmap(subpage, PageHuge(page));
 		put_page(page);
-		invalidation_needed = true;
+		mmu_notifier_invalidate_page(mm, address);
 	}
-
-	if (invalidation_needed)
-		mmu_notifier_invalidate_range(mm, address,
-				address + (1UL << compound_order(page)));
 	return ret;
 }
 