Skip to content

Commit a25bd72

Browse files
ozbenhmpe
authored and committed
powerpc/mm/radix: Workaround prefetch issue with KVM
There's a somewhat architectural issue with Radix MMU and KVM. When coming out of a guest with AIL (Alternate Interrupt Location, i.e., MMU enabled), we start executing hypervisor code with the PID register still containing whatever the guest has been using. The problem is that the CPU can (and will) then start prefetching or speculatively loading from whatever host context has that same PID (if any), thus bringing translations for that context into the TLB, which Linux doesn't know about. This can cause stale translations and subsequent crashes. Fixing this in a way that is neither racy nor a huge performance impact is difficult. We could just make the host invalidations always use broadcast forms but that would hurt single-threaded programs for example. We chose to fix it instead by partitioning the PID space between guest and host. This is possible because today Linux only uses 19 out of the 20 bits of PID space, so existing guests will work if we make the host use the top half of the 20-bit space. We additionally add support for a property to indicate to Linux the size of the PID register which will be useful if we eventually have processors with a larger PID space available. There is still an issue with malicious guests purposefully setting the PID register to a value in the host's PID range. Hopefully future HW can prevent that, but in the meantime, we handle it with a pair of kludges: - On the way out of a guest, before we clear the current VCPU in the PACA, we check the PID and if it's outside of the permitted range we flush the TLB for that PID. - When context switching, if the mm is "new" on that CPU (the corresponding bit was set for the first time in the mm cpumask), we check if any sibling thread is in KVM (has a non-NULL VCPU pointer in the PACA). If that is the case, we also flush the PID for that CPU (core). 
This second part is needed to handle the case where a process is migrated (or starts a new pthread) on a sibling thread of the CPU coming out of KVM, as there's a window where stale translations can exist before we detect it and flush them out. A future optimization could be added by keeping track of whether the PID has ever been used and avoiding doing that for completely fresh PIDs. We could similarly mark PIDs that have been the subject of a global invalidation as "fresh". But for now this will do. Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> [mpe: Rework the asm to build with CONFIG_PPC_RADIX_MMU=n, drop unneeded include of kvm_book3s_asm.h] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
1 parent 029d925 commit a25bd72

File tree

6 files changed

+154
-22
lines changed

6 files changed

+154
-22
lines changed

arch/powerpc/include/asm/book3s/64/mmu.h

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,14 @@ extern struct patb_entry *partition_tb;
5959
#define PRTS_MASK 0x1f /* process table size field */
6060
#define PRTB_MASK 0x0ffffffffffff000UL
6161

62-
/*
63-
* Limit process table to PAGE_SIZE table. This
64-
* also limit the max pid we can support.
65-
* MAX_USER_CONTEXT * 16 bytes of space.
66-
*/
67-
#define PRTB_SIZE_SHIFT (CONTEXT_BITS + 4)
68-
#define PRTB_ENTRIES (1ul << CONTEXT_BITS)
62+
/* Number of supported PID bits */
63+
extern unsigned int mmu_pid_bits;
64+
65+
/* Base PID to allocate from */
66+
extern unsigned int mmu_base_pid;
67+
68+
#define PRTB_SIZE_SHIFT (mmu_pid_bits + 4)
69+
#define PRTB_ENTRIES (1ul << mmu_pid_bits)
6970

7071
/*
7172
* Power9 currently only support 64K partition table size.

arch/powerpc/include/asm/mmu_context.h

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ extern void set_context(unsigned long id, pgd_t *pgd);
4545

4646
#ifdef CONFIG_PPC_BOOK3S_64
4747
extern void radix__switch_mmu_context(struct mm_struct *prev,
48-
struct mm_struct *next);
48+
struct mm_struct *next);
4949
static inline void switch_mmu_context(struct mm_struct *prev,
5050
struct mm_struct *next,
5151
struct task_struct *tsk)
@@ -67,6 +67,12 @@ extern void __destroy_context(unsigned long context_id);
6767
extern void mmu_context_init(void);
6868
#endif
6969

70+
#if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && defined(CONFIG_PPC_RADIX_MMU)
71+
extern void radix_kvm_prefetch_workaround(struct mm_struct *mm);
72+
#else
73+
static inline void radix_kvm_prefetch_workaround(struct mm_struct *mm) { }
74+
#endif
75+
7076
extern void switch_cop(struct mm_struct *next);
7177
extern int use_cop(unsigned long acop, struct mm_struct *mm);
7278
extern void drop_cop(unsigned long acop, struct mm_struct *mm);
@@ -79,9 +85,13 @@ static inline void switch_mm_irqs_off(struct mm_struct *prev,
7985
struct mm_struct *next,
8086
struct task_struct *tsk)
8187
{
88+
bool new_on_cpu = false;
89+
8290
/* Mark this context has been used on the new CPU */
83-
if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next)))
91+
if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) {
8492
cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
93+
new_on_cpu = true;
94+
}
8595

8696
/* 32-bit keeps track of the current PGDIR in the thread struct */
8797
#ifdef CONFIG_PPC32
@@ -109,6 +119,10 @@ static inline void switch_mm_irqs_off(struct mm_struct *prev,
109119
if (cpu_has_feature(CPU_FTR_ALTIVEC))
110120
asm volatile ("dssall");
111121
#endif /* CONFIG_ALTIVEC */
122+
123+
if (new_on_cpu)
124+
radix_kvm_prefetch_workaround(next);
125+
112126
/*
113127
* The actual HW switching method differs between the various
114128
* sub architectures. Out of line for now

arch/powerpc/kvm/book3s_hv_rmhandlers.S

Lines changed: 51 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1443,12 +1443,14 @@ mc_cont:
14431443
ori r6,r6,1
14441444
mtspr SPRN_CTRLT,r6
14451445
4:
1446-
/* Read the guest SLB and save it away */
1446+
/* Check if we are running hash or radix and store it in cr2 */
14471447
ld r5, VCPU_KVM(r9)
14481448
lbz r0, KVM_RADIX(r5)
1449-
cmpwi r0, 0
1449+
cmpwi cr2,r0,0
1450+
1451+
/* Read the guest SLB and save it away */
14501452
li r5, 0
1451-
bne 3f /* for radix, save 0 entries */
1453+
bne cr2, 3f /* for radix, save 0 entries */
14521454
lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */
14531455
mtctr r0
14541456
li r6,0
@@ -1712,11 +1714,6 @@ BEGIN_FTR_SECTION_NESTED(96)
17121714
END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
17131715
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
17141716
22:
1715-
/* Clear out SLB */
1716-
li r5,0
1717-
slbmte r5,r5
1718-
slbia
1719-
ptesync
17201717

17211718
/* Restore host values of some registers */
17221719
BEGIN_FTR_SECTION
@@ -1737,10 +1734,56 @@ BEGIN_FTR_SECTION
17371734
mtspr SPRN_PID, r7
17381735
mtspr SPRN_IAMR, r8
17391736
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
1737+
1738+
#ifdef CONFIG_PPC_RADIX_MMU
1739+
/*
1740+
* Are we running hash or radix ?
1741+
*/
1742+
beq cr2,3f
1743+
1744+
/* Radix: Handle the case where the guest used an illegal PID */
1745+
LOAD_REG_ADDR(r4, mmu_base_pid)
1746+
lwz r3, VCPU_GUEST_PID(r9)
1747+
lwz r5, 0(r4)
1748+
cmpw cr0,r3,r5
1749+
blt 2f
1750+
1751+
/*
1752+
* Illegal PID, the HW might have prefetched and cached in the TLB
1753+
* some translations for the LPID 0 / guest PID combination which
1754+
* Linux doesn't know about, so we need to flush that PID out of
1755+
* the TLB. First we need to set LPIDR to 0 so tlbiel applies to
1756+
* the right context.
1757+
*/
1758+
li r0,0
1759+
mtspr SPRN_LPID,r0
1760+
isync
1761+
1762+
/* Then do a congruence class local flush */
1763+
ld r6,VCPU_KVM(r9)
1764+
lwz r0,KVM_TLB_SETS(r6)
1765+
mtctr r0
1766+
li r7,0x400 /* IS field = 0b01 */
1767+
ptesync
1768+
sldi r0,r3,32 /* RS has PID */
1769+
1: PPC_TLBIEL(7,0,2,1,1) /* RIC=2, PRS=1, R=1 */
1770+
addi r7,r7,0x1000
1771+
bdnz 1b
1772+
ptesync
1773+
1774+
2: /* Flush the ERAT on radix P9 DD1 guest exit */
17401775
BEGIN_FTR_SECTION
17411776
PPC_INVALIDATE_ERAT
17421777
END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
1778+
b 4f
1779+
#endif /* CONFIG_PPC_RADIX_MMU */
17431780

1781+
/* Hash: clear out SLB */
1782+
3: li r5,0
1783+
slbmte r5,r5
1784+
slbia
1785+
ptesync
1786+
4:
17441787
/*
17451788
* POWER7/POWER8 guest -> host partition switch code.
17461789
* We don't have to lock against tlbies but we do

arch/powerpc/mm/mmu_context_book3s64.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,10 @@ static int hash__init_new_context(struct mm_struct *mm)
126126
static int radix__init_new_context(struct mm_struct *mm)
127127
{
128128
unsigned long rts_field;
129-
int index;
129+
int index, max_id;
130130

131-
index = alloc_context_id(1, PRTB_ENTRIES - 1);
131+
max_id = (1 << mmu_pid_bits) - 1;
132+
index = alloc_context_id(mmu_base_pid, max_id);
132133
if (index < 0)
133134
return index;
134135

arch/powerpc/mm/pgtable-radix.c

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@
2525

2626
#include <trace/events/thp.h>
2727

28+
unsigned int mmu_pid_bits;
29+
unsigned int mmu_base_pid;
30+
2831
static int native_register_process_table(unsigned long base, unsigned long pg_sz,
2932
unsigned long table_size)
3033
{
@@ -261,11 +264,34 @@ static void __init radix_init_pgtable(void)
261264
for_each_memblock(memory, reg)
262265
WARN_ON(create_physical_mapping(reg->base,
263266
reg->base + reg->size));
267+
268+
/* Find out how many PID bits are supported */
269+
if (cpu_has_feature(CPU_FTR_HVMODE)) {
270+
if (!mmu_pid_bits)
271+
mmu_pid_bits = 20;
272+
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
273+
/*
274+
* When KVM is possible, we only use the top half of the
275+
* PID space to avoid collisions between host and guest PIDs
276+
* which can cause problems due to prefetch when exiting the
277+
* guest with AIL=3
278+
*/
279+
mmu_base_pid = 1 << (mmu_pid_bits - 1);
280+
#else
281+
mmu_base_pid = 1;
282+
#endif
283+
} else {
284+
/* The guest uses the bottom half of the PID space */
285+
if (!mmu_pid_bits)
286+
mmu_pid_bits = 19;
287+
mmu_base_pid = 1;
288+
}
289+
264290
/*
265291
* Allocate Partition table and process table for the
266292
* host.
267293
*/
268-
BUILD_BUG_ON_MSG((PRTB_SIZE_SHIFT > 36), "Process table size too large.");
294+
BUG_ON(PRTB_SIZE_SHIFT > 36);
269295
process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT);
270296
/*
271297
* Fill in the process table.
@@ -339,6 +365,12 @@ static int __init radix_dt_scan_page_sizes(unsigned long node,
339365
if (type == NULL || strcmp(type, "cpu") != 0)
340366
return 0;
341367

368+
/* Find MMU PID size */
369+
prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
370+
if (prop && size == 4)
371+
mmu_pid_bits = be32_to_cpup(prop);
372+
373+
/* Grab page size encodings */
342374
prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
343375
if (!prop)
344376
return 0;

arch/powerpc/mm/tlb-radix.c

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,12 @@
1212
#include <linux/mm.h>
1313
#include <linux/hugetlb.h>
1414
#include <linux/memblock.h>
15-
#include <asm/ppc-opcode.h>
1615

16+
#include <asm/ppc-opcode.h>
1717
#include <asm/tlb.h>
1818
#include <asm/tlbflush.h>
1919
#include <asm/trace.h>
20-
20+
#include <asm/cputhreads.h>
2121

2222
#define RIC_FLUSH_TLB 0
2323
#define RIC_FLUSH_PWC 1
@@ -454,3 +454,44 @@ void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm,
454454
else
455455
radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize);
456456
}
457+
458+
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
459+
extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
460+
{
461+
unsigned int pid = mm->context.id;
462+
463+
if (unlikely(pid == MMU_NO_CONTEXT))
464+
return;
465+
466+
/*
467+
* If this context hasn't run on that CPU before and KVM is
468+
* around, there's a slim chance that the guest on another
469+
* CPU just brought in obsolete translation into the TLB of
470+
* this CPU due to a bad prefetch using the guest PID on
471+
* the way into the hypervisor.
472+
*
473+
* We work around this here. If KVM is possible, we check if
474+
* any sibling thread is in KVM. If it is, the window may exist
475+
* and thus we flush that PID from the core.
476+
*
477+
* A potential future improvement would be to mark which PIDs
478+
* have never been used on the system and avoid it if the PID
479+
* is new and the process has no other cpumask bit set.
480+
*/
481+
if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) {
482+
int cpu = smp_processor_id();
483+
int sib = cpu_first_thread_sibling(cpu);
484+
bool flush = false;
485+
486+
for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
487+
if (sib == cpu)
488+
continue;
489+
if (paca[sib].kvm_hstate.kvm_vcpu)
490+
flush = true;
491+
}
492+
if (flush)
493+
_tlbiel_pid(pid, RIC_FLUSH_ALL);
494+
}
495+
}
496+
EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
497+
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */

0 commit comments

Comments
 (0)