Skip to content

Commit 2084140

Browse files
Rik van Riel and torvalds
authored and committed
mm: fix TLB flush race between migration, and change_protection_range
There are a few subtle races, between change_protection_range (used by mprotect and change_prot_numa) on one side, and NUMA page migration and compaction on the other side.

The basic race is that there is a time window between when the PTE gets made non-present (PROT_NONE or NUMA), and the TLB is flushed.

During that time, a CPU may continue writing to the page.

This is fine most of the time, however compaction or the NUMA migration code may come in, and migrate the page away.

When that happens, the CPU may continue writing, through the cached translation, to what is no longer the current memory location of the process.

This only affects x86, which has a somewhat optimistic pte_accessible. All other architectures appear to be safe, and will either always flush, or flush whenever there is a valid mapping, even with no permissions (SPARC).

The basic race looks like this:

CPU A                     CPU B                        CPU C

                                                       load TLB entry
make entry PTE/PMD_NUMA
                          fault on entry
                                                       read/write old page
                          start migrating page
                          change PTE/PMD to new page
                                                       read/write old page [*]
flush TLB
                                                       reload TLB from new entry
                                                       read/write new page
                                                       lose data

[*] the old page may belong to a new user at this point!

The obvious fix is to flush remote TLB entries, by making sure that pte_accessible is aware of the fact that PROT_NONE and PROT_NUMA memory may still be accessible if there is a TLB flush pending for the mm.

This should fix both NUMA migration and compaction.

[mgorman@suse.de: fix build]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent de466bd commit 2084140

File tree

8 files changed

+69
-7
lines changed

8 files changed

+69
-7
lines changed

arch/sparc/include/asm/pgtable_64.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -619,7 +619,7 @@ static inline unsigned long pte_present(pte_t pte)
619619
}
620620

621621
#define pte_accessible pte_accessible
622-
static inline unsigned long pte_accessible(pte_t a)
622+
static inline unsigned long pte_accessible(struct mm_struct *mm, pte_t a)
623623
{
624624
return pte_val(a) & _PAGE_VALID;
625625
}
@@ -847,7 +847,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
847847
* SUN4V NOTE: _PAGE_VALID is the same value in both the SUN4U
848848
* and SUN4V pte layout, so this inline test is fine.
849849
*/
850-
if (likely(mm != &init_mm) && pte_accessible(orig))
850+
if (likely(mm != &init_mm) && pte_accessible(mm, orig))
851851
tlb_batch_add(mm, addr, ptep, orig, fullmm);
852852
}
853853

arch/x86/include/asm/pgtable.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -452,9 +452,16 @@ static inline int pte_present(pte_t a)
452452
}
453453

454454
#define pte_accessible pte_accessible
455-
static inline int pte_accessible(pte_t a)
455+
static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
456456
{
457-
return pte_flags(a) & _PAGE_PRESENT;
457+
if (pte_flags(a) & _PAGE_PRESENT)
458+
return true;
459+
460+
if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) &&
461+
mm_tlb_flush_pending(mm))
462+
return true;
463+
464+
return false;
458465
}
459466

460467
static inline int pte_hidden(pte_t pte)

include/asm-generic/pgtable.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
217217
#endif
218218

219219
#ifndef pte_accessible
220-
# define pte_accessible(pte) ((void)(pte),1)
220+
# define pte_accessible(mm, pte) ((void)(pte), 1)
221221
#endif
222222

223223
#ifndef flush_tlb_fix_spurious_fault

include/linux/mm_types.h

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,14 @@ struct mm_struct {
442442

443443
/* numa_scan_seq prevents two threads setting pte_numa */
444444
int numa_scan_seq;
445+
#endif
446+
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
447+
/*
448+
* An operation with batched TLB flushing is going on. Anything that
449+
* can move process memory needs to flush the TLB when moving a
450+
* PROT_NONE or PROT_NUMA mapped page.
451+
*/
452+
bool tlb_flush_pending;
445453
#endif
446454
struct uprobes_state uprobes_state;
447455
};
@@ -459,4 +467,40 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
459467
return mm->cpu_vm_mask_var;
460468
}
461469

470+
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
471+
/*
472+
* Memory barriers to keep this state in sync are graciously provided by
473+
* the page table locks, outside of which no page table modifications happen.
474+
* The barriers below prevent the compiler from re-ordering the instructions
475+
* around the memory barriers that are already present in the code.
476+
*/
477+
static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
478+
{
479+
barrier();
480+
return mm->tlb_flush_pending;
481+
}
482+
static inline void set_tlb_flush_pending(struct mm_struct *mm)
483+
{
484+
mm->tlb_flush_pending = true;
485+
barrier();
486+
}
487+
/* Clearing is done after a TLB flush, which also provides a barrier. */
488+
static inline void clear_tlb_flush_pending(struct mm_struct *mm)
489+
{
490+
barrier();
491+
mm->tlb_flush_pending = false;
492+
}
493+
#else
494+
static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
495+
{
496+
return false;
497+
}
498+
static inline void set_tlb_flush_pending(struct mm_struct *mm)
499+
{
500+
}
501+
static inline void clear_tlb_flush_pending(struct mm_struct *mm)
502+
{
503+
}
504+
#endif
505+
462506
#endif /* _LINUX_MM_TYPES_H */

kernel/fork.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
537537
spin_lock_init(&mm->page_table_lock);
538538
mm_init_aio(mm);
539539
mm_init_owner(mm, p);
540+
clear_tlb_flush_pending(mm);
540541

541542
if (likely(!mm_alloc_pgd(mm))) {
542543
mm->def_flags = 0;

mm/huge_memory.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1376,6 +1376,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
13761376
goto clear_pmdnuma;
13771377
}
13781378

1379+
/*
1380+
* The page_table_lock above provides a memory barrier
1381+
* with change_protection_range.
1382+
*/
1383+
if (mm_tlb_flush_pending(mm))
1384+
flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
1385+
13791386
/*
13801387
* Migrate the THP to the requested node, returns with page unlocked
13811388
* and pmd_numa cleared.

mm/mprotect.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
188188
BUG_ON(addr >= end);
189189
pgd = pgd_offset(mm, addr);
190190
flush_cache_range(vma, addr, end);
191+
set_tlb_flush_pending(mm);
191192
do {
192193
next = pgd_addr_end(addr, end);
193194
if (pgd_none_or_clear_bad(pgd))
@@ -199,6 +200,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
199200
/* Only flush the TLB if we actually modified any entries: */
200201
if (pages)
201202
flush_tlb_range(vma, start, end);
203+
clear_tlb_flush_pending(mm);
202204

203205
return pages;
204206
}

mm/pgtable-generic.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,10 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
110110
pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
111111
pte_t *ptep)
112112
{
113+
struct mm_struct *mm = (vma)->vm_mm;
113114
pte_t pte;
114-
pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
115-
if (pte_accessible(pte))
115+
pte = ptep_get_and_clear(mm, address, ptep);
116+
if (pte_accessible(mm, pte))
116117
flush_tlb_page(vma, address);
117118
return pte;
118119
}

0 commit comments

Comments
 (0)