Skip to content

Commit 53e857f

Browse files
author
Martin Schwidefsky
committed
s390/mm,tlb: race of lazy TLB flush vs. recreation of TLB entries
Git commit 050eef3 "[S390] fix tlb flushing vs. concurrent /proc accesses" introduced the attach counter to avoid using the mm_users value to decide between IPTE for every PTE and lazy TLB flushing with IDTE. That fixed the problem with mm_users but it introduced another subtle race, fortunately one that is very hard to hit. The background is the requirement of the architecture that a valid PTE may not be changed while it can be used concurrently by another cpu. The decision between IPTE and lazy TLB flushing needs to be done while the PTE is still valid. Now if the virtual cpu is temporarily stopped after the decision to use lazy TLB flushing but before the invalid bit of the PTE has been set, another cpu can attach the mm, find that flush_mm is set, do the IDTE, return to userspace, and recreate a TLB entry that uses the PTE in question. When the first, stopped cpu continues it will change the PTE while it is attached on another cpu. The first cpu will do another IDTE shortly after the modification of the PTE which makes the race window quite short. To fix this race the CPU that wants to attach the address space of a user space thread needs to wait for the end of the PTE modification. The number of concurrent TLB flushers for an mm is tracked in the upper 16 bits of the attach_count and finish_arch_post_lock_switch is used to wait for the end of the flush operation if required. Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
1 parent a53efe5 commit 53e857f

File tree

5 files changed

+85
-34
lines changed

5 files changed

+85
-34
lines changed

arch/s390/include/asm/mmu_context.h

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,42 @@ static inline void update_mm(struct mm_struct *mm, struct task_struct *tsk)
4848
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
4949
struct task_struct *tsk)
5050
{
51-
cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
52-
update_mm(next, tsk);
51+
int cpu = smp_processor_id();
52+
53+
if (prev == next)
54+
return;
55+
if (atomic_inc_return(&next->context.attach_count) >> 16) {
56+
/* Delay update_mm until all TLB flushes are done. */
57+
set_tsk_thread_flag(tsk, TIF_TLB_WAIT);
58+
} else {
59+
cpumask_set_cpu(cpu, mm_cpumask(next));
60+
update_mm(next, tsk);
61+
if (next->context.flush_mm)
62+
/* Flush pending TLBs */
63+
__tlb_flush_mm(next);
64+
}
5365
atomic_dec(&prev->context.attach_count);
5466
WARN_ON(atomic_read(&prev->context.attach_count) < 0);
55-
atomic_inc(&next->context.attach_count);
56-
/* Check for TLBs not flushed yet */
57-
__tlb_flush_mm_lazy(next);
67+
}
68+
69+
#define finish_arch_post_lock_switch finish_arch_post_lock_switch
70+
static inline void finish_arch_post_lock_switch(void)
71+
{
72+
struct task_struct *tsk = current;
73+
struct mm_struct *mm = tsk->mm;
74+
75+
if (!test_tsk_thread_flag(tsk, TIF_TLB_WAIT))
76+
return;
77+
preempt_disable();
78+
clear_tsk_thread_flag(tsk, TIF_TLB_WAIT);
79+
while (atomic_read(&mm->context.attach_count) >> 16)
80+
cpu_relax();
81+
82+
cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
83+
update_mm(mm, tsk);
84+
if (mm->context.flush_mm)
85+
__tlb_flush_mm(mm);
86+
preempt_enable();
5887
}
5988

6089
#define enter_lazy_tlb(mm,tsk) do { } while (0)

arch/s390/include/asm/pgtable.h

Lines changed: 37 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1034,30 +1034,41 @@ static inline int ptep_test_and_clear_user_young(struct mm_struct *mm,
10341034

10351035
static inline void __ptep_ipte(unsigned long address, pte_t *ptep)
10361036
{
1037-
if (!(pte_val(*ptep) & _PAGE_INVALID)) {
1037+
unsigned long pto = (unsigned long) ptep;
1038+
10381039
#ifndef CONFIG_64BIT
1039-
/* pto must point to the start of the segment table */
1040-
pte_t *pto = (pte_t *) (((unsigned long) ptep) & 0x7ffffc00);
1041-
#else
1042-
/* ipte in zarch mode can do the math */
1043-
pte_t *pto = ptep;
1040+
/* pto in ESA mode must point to the start of the segment table */
1041+
pto &= 0x7ffffc00;
10441042
#endif
1045-
asm volatile(
1046-
" ipte %2,%3"
1047-
: "=m" (*ptep) : "m" (*ptep),
1048-
"a" (pto), "a" (address));
1049-
}
1043+
/* Invalidation + global TLB flush for the pte */
1044+
asm volatile(
1045+
" ipte %2,%3"
1046+
: "=m" (*ptep) : "m" (*ptep), "a" (pto), "a" (address));
1047+
}
1048+
1049+
static inline void ptep_flush_direct(struct mm_struct *mm,
1050+
unsigned long address, pte_t *ptep)
1051+
{
1052+
if (pte_val(*ptep) & _PAGE_INVALID)
1053+
return;
1054+
__ptep_ipte(address, ptep);
10501055
}
10511056

10521057
static inline void ptep_flush_lazy(struct mm_struct *mm,
10531058
unsigned long address, pte_t *ptep)
10541059
{
1055-
int active = (mm == current->active_mm) ? 1 : 0;
1060+
int active, count;
10561061

1057-
if (atomic_read(&mm->context.attach_count) > active)
1058-
__ptep_ipte(address, ptep);
1059-
else
1062+
if (pte_val(*ptep) & _PAGE_INVALID)
1063+
return;
1064+
active = (mm == current->active_mm) ? 1 : 0;
1065+
count = atomic_add_return(0x10000, &mm->context.attach_count);
1066+
if ((count & 0xffff) <= active) {
1067+
pte_val(*ptep) |= _PAGE_INVALID;
10601068
mm->context.flush_mm = 1;
1069+
} else
1070+
__ptep_ipte(address, ptep);
1071+
atomic_sub(0x10000, &mm->context.attach_count);
10611072
}
10621073

10631074
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
@@ -1074,7 +1085,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
10741085
}
10751086

10761087
pte = *ptep;
1077-
__ptep_ipte(addr, ptep);
1088+
ptep_flush_direct(vma->vm_mm, addr, ptep);
10781089
young = pte_young(pte);
10791090
pte = pte_mkold(pte);
10801091

@@ -1145,7 +1156,6 @@ static inline pte_t ptep_modify_prot_start(struct mm_struct *mm,
11451156

11461157
pte = *ptep;
11471158
ptep_flush_lazy(mm, address, ptep);
1148-
pte_val(*ptep) |= _PAGE_INVALID;
11491159

11501160
if (mm_has_pgste(mm)) {
11511161
pgste = pgste_update_all(&pte, pgste);
@@ -1182,7 +1192,7 @@ static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
11821192
}
11831193

11841194
pte = *ptep;
1185-
__ptep_ipte(address, ptep);
1195+
ptep_flush_direct(vma->vm_mm, address, ptep);
11861196
pte_val(*ptep) = _PAGE_INVALID;
11871197

11881198
if (mm_has_pgste(vma->vm_mm)) {
@@ -1263,7 +1273,7 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
12631273
pgste = pgste_ipte_notify(vma->vm_mm, address, ptep, pgste);
12641274
}
12651275

1266-
__ptep_ipte(address, ptep);
1276+
ptep_flush_direct(vma->vm_mm, address, ptep);
12671277

12681278
if (mm_has_pgste(vma->vm_mm)) {
12691279
pgste_set_pte(ptep, entry);
@@ -1447,12 +1457,16 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
14471457
static inline void pmdp_flush_lazy(struct mm_struct *mm,
14481458
unsigned long address, pmd_t *pmdp)
14491459
{
1450-
int active = (mm == current->active_mm) ? 1 : 0;
1460+
int active, count;
14511461

1452-
if ((atomic_read(&mm->context.attach_count) & 0xffff) > active)
1453-
__pmd_idte(address, pmdp);
1454-
else
1462+
active = (mm == current->active_mm) ? 1 : 0;
1463+
count = atomic_add_return(0x10000, &mm->context.attach_count);
1464+
if ((count & 0xffff) <= active) {
1465+
pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID;
14551466
mm->context.flush_mm = 1;
1467+
} else
1468+
__pmd_idte(address, pmdp);
1469+
atomic_sub(0x10000, &mm->context.attach_count);
14561470
}
14571471

14581472
#ifdef CONFIG_TRANSPARENT_HUGEPAGE

arch/s390/include/asm/thread_info.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ static inline struct thread_info *current_thread_info(void)
8181
#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */
8282
#define TIF_SIGPENDING 2 /* signal pending */
8383
#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
84+
#define TIF_TLB_WAIT 4 /* wait for TLB flush completion */
8485
#define TIF_PER_TRAP 6 /* deliver sigtrap on return to user */
8586
#define TIF_MCCK_PENDING 7 /* machine check handling is pending */
8687
#define TIF_SYSCALL_TRACE 8 /* syscall trace active */
@@ -96,6 +97,7 @@ static inline struct thread_info *current_thread_info(void)
9697
#define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
9798
#define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
9899
#define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
100+
#define _TIF_TLB_WAIT (1<<TIF_TLB_WAIT)
99101
#define _TIF_PER_TRAP (1<<TIF_PER_TRAP)
100102
#define _TIF_MCCK_PENDING (1<<TIF_MCCK_PENDING)
101103
#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)

arch/s390/kernel/entry.S

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ _TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
4343
_TIF_MCCK_PENDING)
4444
_TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \
4545
_TIF_SYSCALL_TRACEPOINT)
46+
_TIF_TRANSFER = (_TIF_MCCK_PENDING | _TIF_TLB_WAIT)
4647

4748
STACK_SHIFT = PAGE_SHIFT + THREAD_ORDER
4849
STACK_SIZE = 1 << STACK_SHIFT
@@ -159,10 +160,12 @@ ENTRY(__switch_to)
159160
lctl %c4,%c4,__TASK_pid(%r3) # load pid to control reg. 4
160161
mvc __LC_CURRENT_PID(4,%r0),__TASK_pid(%r3) # store pid of next
161162
l %r15,__THREAD_ksp(%r3) # load kernel stack of next
162-
tm __TI_flags+3(%r4),_TIF_MCCK_PENDING # machine check pending?
163+
lhi %r6,_TIF_TRANSFER # transfer TIF bits
164+
n %r6,__TI_flags(%r4) # isolate TIF bits
163165
jz 0f
164-
ni __TI_flags+3(%r4),255-_TIF_MCCK_PENDING # clear flag in prev
165-
oi __TI_flags+3(%r5),_TIF_MCCK_PENDING # set it in next
166+
o %r6,__TI_flags(%r5) # set TIF bits of next
167+
st %r6,__TI_flags(%r5)
168+
ni __TI_flags+3(%r4),255-_TIF_TRANSFER # clear TIF bits of prev
166169
0: lm %r6,%r15,__SF_GPRS(%r15) # load gprs of next task
167170
br %r14
168171

arch/s390/kernel/entry64.S

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ _TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
4848
_TIF_MCCK_PENDING)
4949
_TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \
5050
_TIF_SYSCALL_TRACEPOINT)
51+
_TIF_TRANSFER = (_TIF_MCCK_PENDING | _TIF_TLB_WAIT)
5152

5253
#define BASED(name) name-system_call(%r13)
5354

@@ -189,10 +190,12 @@ ENTRY(__switch_to)
189190
lctl %c4,%c4,__TASK_pid(%r3) # load pid to control reg. 4
190191
mvc __LC_CURRENT_PID+4(4,%r0),__TASK_pid(%r3) # store pid of next
191192
lg %r15,__THREAD_ksp(%r3) # load kernel stack of next
192-
tm __TI_flags+7(%r4),_TIF_MCCK_PENDING # machine check pending?
193+
llill %r6,_TIF_TRANSFER # transfer TIF bits
194+
ng %r6,__TI_flags(%r4) # isolate TIF bits
193195
jz 0f
194-
ni __TI_flags+7(%r4),255-_TIF_MCCK_PENDING # clear flag in prev
195-
oi __TI_flags+7(%r5),_TIF_MCCK_PENDING # set it in next
196+
og %r6,__TI_flags(%r5) # set TIF bits of next
197+
stg %r6,__TI_flags(%r5)
198+
ni __TI_flags+7(%r4),255-_TIF_TRANSFER # clear TIF bits of prev
196199
0: lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task
197200
br %r14
198201

0 commit comments

Comments
 (0)