Commit 6fd166a

Peter Zijlstra authored and Ingo Molnar committed
x86/mm: Use/Fix PCID to optimize user/kernel switches
We can use PCID to retain the TLBs across CR3 switches, including those now part of the user/kernel switch. This increases performance of kernel entry/exit at the cost of more expensive/complicated TLB flushing.

Now that we have two address spaces, one for the kernel and one for user space, we need two PCIDs per mm. We use the top PCID bit to indicate a user PCID (just like we use the PFN LSB for the PGD). Since we do TLB invalidation from kernel space, the existing code will only invalidate the kernel PCID; we augment that by marking the corresponding user PCID invalid, and upon switching back to userspace, use a flushing CR3 write for the switch.

In order to access the user_pcid_flush_mask we use PER_CPU storage, which means the previously established SWAPGS vs CR3 ordering is now mandatory.

Having to do this memory access does require additional registers; most sites have a functioning stack and we can spill one (RAX), while sites without a functional stack need to otherwise provide the second scratch register.

Note: PCID is generally available on Intel Sandybridge and later CPUs.
Note: Up until this point TLB flushing was broken in this series.

Based-on-code-from: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
1 parent 48e1119 commit 6fd166a

9 files changed (+162, -33 lines)
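Before the per-file diffs, a rough illustration of the CR3 layout the commit message above relies on. This is not part of the patch; the bit positions come from the diffs below (PAGE_SHIFT, X86_CR3_PTI_SWITCH_BIT, X86_CR3_PCID_NOFLUSH_BIT), while the helper name user_cr3_from_kernel() is invented for this sketch.

#include <stdint.h>

/* Bit positions as introduced/used by this patch (see the diffs below). */
#define PTI_USER_PGTABLE_BIT	12	/* PAGE_SHIFT: selects the user half of the 8k PGD      */
#define PTI_USER_PCID_BIT	11	/* X86_CR3_PTI_SWITCH_BIT: top PCID bit marks a user ASID */
#define CR3_NOFLUSH_BIT		63	/* X86_CR3_PCID_NOFLUSH_BIT: keep TLB entries on write   */

/* Hypothetical helper: derive the user CR3 of an mm from its kernel CR3. */
static inline uint64_t user_cr3_from_kernel(uint64_t kernel_cr3, int need_flush)
{
	uint64_t cr3 = kernel_cr3;

	cr3 |= 1ULL << PTI_USER_PGTABLE_BIT;	/* point at the user page tables            */
	cr3 |= 1ULL << PTI_USER_PCID_BIT;	/* select this mm's user PCID               */
	if (!need_flush)
		cr3 |= 1ULL << CR3_NOFLUSH_BIT;	/* retain TLB entries tagged with that PCID */
	return cr3;
}

This is roughly the decision the SWITCH_TO_USER_CR3 macros below make in assembly.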

arch/x86/entry/calling.h

Lines changed: 59 additions & 13 deletions
@@ -3,6 +3,9 @@
 #include <asm/unwind_hints.h>
 #include <asm/cpufeatures.h>
 #include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
 
 /*
@@ -191,17 +194,21 @@ For 32-bit we have the following conventions - kernel is built with
 
 #ifdef CONFIG_PAGE_TABLE_ISOLATION
 
-/* PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two halves: */
-#define PTI_SWITCH_MASK (1<<PAGE_SHIFT)
+/*
+ * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
+ * halves:
+ */
+#define PTI_SWITCH_PGTABLES_MASK	(1<<PAGE_SHIFT)
+#define PTI_SWITCH_MASK		(PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
 
-.macro ADJUST_KERNEL_CR3 reg:req
-	/* Clear "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
-	andq	$(~PTI_SWITCH_MASK), \reg
+.macro SET_NOFLUSH_BIT	reg:req
+	bts	$X86_CR3_PCID_NOFLUSH_BIT, \reg
 .endm
 
-.macro ADJUST_USER_CR3 reg:req
-	/* Move CR3 up a page to the user page tables: */
-	orq	$(PTI_SWITCH_MASK), \reg
+.macro ADJUST_KERNEL_CR3 reg:req
+	ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
+	/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+	andq	$(~PTI_SWITCH_MASK), \reg
 .endm
 
 .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
@@ -212,21 +219,58 @@ For 32-bit we have the following conventions - kernel is built with
 .Lend_\@:
 .endm
 
-.macro SWITCH_TO_USER_CR3 scratch_reg:req
+#define THIS_CPU_user_pcid_flush_mask	\
+	PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
+
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
 	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
 	mov	%cr3, \scratch_reg
-	ADJUST_USER_CR3 \scratch_reg
+
+	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+	/*
+	 * Test if the ASID needs a flush.
+	 */
+	movq	\scratch_reg, \scratch_reg2
+	andq	$(0x7FF), \scratch_reg		/* mask ASID */
+	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jnc	.Lnoflush_\@
+
+	/* Flush needed, clear the bit */
+	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	movq	\scratch_reg2, \scratch_reg
+	jmp	.Lwrcr3_\@
+
+.Lnoflush_\@:
+	movq	\scratch_reg2, \scratch_reg
+	SET_NOFLUSH_BIT	\scratch_reg
+
+.Lwrcr3_\@:
+	/* Flip the PGD and ASID to the user version */
+	orq	$(PTI_SWITCH_MASK), \scratch_reg
 	mov	\scratch_reg, %cr3
 .Lend_\@:
 .endm
 
+.macro SWITCH_TO_USER_CR3_STACK	scratch_reg:req
+	pushq	%rax
+	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
+	popq	%rax
+.endm
+
 .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
 	ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
 	movq	%cr3, \scratch_reg
 	movq	\scratch_reg, \save_reg
 	/*
-	 * Is the switch bit zero? This means the address is
-	 * up in real PAGE_TABLE_ISOLATION patches in a moment.
+	 * Is the "switch mask" all zero? That means that both of
+	 * these are zero:
+	 *
+	 *	1. The user/kernel PCID bit, and
+	 *	2. The user/kernel "bit" that points CR3 to the
+	 *	   bottom half of the 8k PGD
+	 *
+	 * That indicates a kernel CR3 value, not a user CR3.
	 */
 	testq	$(PTI_SWITCH_MASK), \scratch_reg
 	jz	.Ldone_\@
@@ -251,7 +295,9 @@ For 32-bit we have the following conventions - kernel is built with
 
 .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
 .endm
-.macro SWITCH_TO_USER_CR3 scratch_reg:req
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+.endm
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
 .endm
 .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
 .endm
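For readers who do not speak GAS macros, here is a C rendering of what SWITCH_TO_USER_CR3_NOSTACK above does on a PCID-capable CPU. This is a sketch only: user_pcid_flush_mask, PTI_SWITCH_MASK and X86_CR3_PCID_NOFLUSH match the patch, but build_user_cr3() and the explicit mask parameter are invented for the example (the real mask is per-CPU and the real code is assembly).

#include <stdint.h>

#define PTI_SWITCH_MASK		((1UL << 12) | (1UL << 11))	/* user PGD half + user PCID bit */
#define X86_CR3_PCID_NOFLUSH	(1ULL << 63)

/* Exit path sketch: decide between a flushing and a non-flushing CR3 write. */
static inline uint64_t build_user_cr3(uint64_t kernel_cr3, uint16_t *user_pcid_flush_mask)
{
	uint64_t cr3 = kernel_cr3;
	unsigned int pcid = cr3 & 0x7FF;		/* low 11 bits: this mm's kernel PCID   */

	if (*user_pcid_flush_mask & (1u << pcid))	/* bt  ... THIS_CPU_user_pcid_flush_mask */
		*user_pcid_flush_mask &= ~(1u << pcid);	/* btr ...: flushing write, consume bit  */
	else
		cr3 |= X86_CR3_PCID_NOFLUSH;		/* SET_NOFLUSH_BIT: keep the user TLB    */

	return cr3 | PTI_SWITCH_MASK;			/* flip to the user PGD half and PCID    */
}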

arch/x86/entry/entry_64.S

Lines changed: 5 additions & 4 deletions
@@ -23,7 +23,6 @@
 #include <asm/segment.h>
 #include <asm/cache.h>
 #include <asm/errno.h>
-#include "calling.h"
 #include <asm/asm-offsets.h>
 #include <asm/msr.h>
 #include <asm/unistd.h>
@@ -40,6 +39,8 @@
 #include <asm/frame.h>
 #include <linux/err.h>
 
+#include "calling.h"
+
 .code64
 .section .entry.text, "ax"
 
@@ -406,7 +407,7 @@ syscall_return_via_sysret:
	 * We are on the trampoline stack. All regs except RDI are live.
	 * We can do future final exit work right here.
	 */
-	SWITCH_TO_USER_CR3 scratch_reg=%rdi
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
 
	popq	%rdi
	popq	%rsp
@@ -744,7 +745,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
	 * We can do future final exit work right here.
	 */
 
-	SWITCH_TO_USER_CR3 scratch_reg=%rdi
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
 
	/* Restore RDI. */
	popq	%rdi
@@ -857,7 +858,7 @@ native_irq_return_ldt:
	 */
	orq	PER_CPU_VAR(espfix_stack), %rax
 
-	SWITCH_TO_USER_CR3 scratch_reg=%rdi	/* to user CR3 */
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
	SWAPGS					/* to user GS */
	popq	%rdi				/* Restore user RDI */

arch/x86/entry/entry_64_compat.S

Lines changed: 2 additions & 2 deletions
@@ -275,9 +275,9 @@ sysret32_from_system_call:
	 * switch until after after the last reference to the process
	 * stack.
	 *
-	 * %r8 is zeroed before the sysret, thus safe to clobber.
+	 * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
	 */
-	SWITCH_TO_USER_CR3 scratch_reg=%r8
+	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
 
	xorq	%r8, %r8
	xorq	%r9, %r9

arch/x86/include/asm/processor-flags.h

Lines changed: 5 additions & 0 deletions
@@ -38,6 +38,11 @@
 #define CR3_ADDR_MASK	__sme_clr(0x7FFFFFFFFFFFF000ull)
 #define CR3_PCID_MASK	0xFFFull
 #define CR3_NOFLUSH	BIT_ULL(63)
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define X86_CR3_PTI_SWITCH_BIT	11
+#endif
+
 #else
 /*
  * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save

arch/x86/include/asm/tlbflush.h

Lines changed: 79 additions & 12 deletions
@@ -10,6 +10,8 @@
 #include <asm/special_insns.h>
 #include <asm/smp.h>
 #include <asm/invpcid.h>
+#include <asm/pti.h>
+#include <asm/processor-flags.h>
 
 static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 {
@@ -24,24 +26,54 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 
 /* There are 12 bits of space for ASIDS in CR3 */
 #define CR3_HW_ASID_BITS		12
+
 /*
  * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
  * user/kernel switches
  */
-#define PTI_CONSUMED_ASID_BITS		0
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define PTI_CONSUMED_PCID_BITS	1
+#else
+# define PTI_CONSUMED_PCID_BITS	0
+#endif
+
+#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
 
-#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
 /*
  * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
  * for them being zero-based. Another -1 is because ASID 0 is reserved for
  * use by non-PCID-aware users.
  */
-#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
+
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in two cache
+ * lines.
+ */
+#define TLB_NR_DYN_ASIDS	6
 
 static inline u16 kern_pcid(u16 asid)
 {
 	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	/*
+	 * Make sure that the dynamic ASID space does not confict with the
+	 * bit we are using to switch between user and kernel ASIDs.
+	 */
+	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
+
 	/*
+	 * The ASID being passed in here should have respected the
+	 * MAX_ASID_AVAILABLE and thus never have the switch bit set.
+	 */
+	VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
+#endif
+	/*
+	 * The dynamically-assigned ASIDs that get passed in are small
+	 * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
+	 * so do not bother to clear it.
+	 *
 	 * If PCID is on, ASID-aware code paths put the ASID+1 into the
 	 * PCID bits. This serves two purposes. It prevents a nasty
 	 * situation in which PCID-unaware code saves CR3, loads some other
@@ -95,12 +127,6 @@ static inline bool tlb_defer_switch_to_init_mm(void)
 	return !static_cpu_has(X86_FEATURE_PCID);
 }
 
-/*
- * 6 because 6 should be plenty and struct tlb_state will fit in
- * two cache lines.
- */
-#define TLB_NR_DYN_ASIDS	6
-
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -145,6 +171,13 @@ struct tlb_state {
	 */
	bool invalidate_other;
 
+	/*
+	 * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
+	 * the corresponding user PCID needs a flush next time we
+	 * switch to it; see SWITCH_TO_USER_CR3.
+	 */
+	unsigned short user_pcid_flush_mask;
+
	/*
	 * Access to this CR4 shadow and to H/W CR4 is protected by
	 * disabling interrupts when modifying either one.
@@ -249,15 +282,42 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
 
 extern void initialize_tlbstate_and_flush(void);
 
+/*
+ * Given an ASID, flush the corresponding user ASID. We can delay this
+ * until the next time we switch to it.
+ *
+ * See SWITCH_TO_USER_CR3.
+ */
+static inline void invalidate_user_asid(u16 asid)
+{
+	/* There is no user ASID if address space separation is off */
+	if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		return;
+
+	/*
+	 * We only have a single ASID if PCID is off and the CR3
+	 * write will have flushed it.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_PCID))
+		return;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	__set_bit(kern_pcid(asid),
+		  (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
+}
+
 /*
  * flush the entire current user mapping
  */
 static inline void __native_flush_tlb(void)
 {
+	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
	/*
-	 * If current->mm == NULL then we borrow a mm which may change during a
-	 * task switch and therefore we must not be preempted while we write CR3
-	 * back:
+	 * If current->mm == NULL then we borrow a mm which may change
+	 * during a task switch and therefore we must not be preempted
+	 * while we write CR3 back:
	 */
	preempt_disable();
	native_write_cr3(__native_read_cr3());
@@ -301,7 +361,14 @@ static inline void __native_flush_tlb_global(void)
 */
 static inline void __native_flush_tlb_single(unsigned long addr)
 {
+	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	invalidate_user_asid(loaded_mm_asid);
 }
 
 /*
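To make the ASID arithmetic above concrete, here is a small standalone program reproducing it with PTI and PCID enabled. The kernel-PCID mapping (ASID+1) is stated in the kern_pcid() comment above; the user-PCID column follows the commit message's "top PCID bit indicates a user PCID" and is an assumption, since this diff does not spell out a user_pcid() helper.

#include <assert.h>
#include <stdio.h>

/* Constants reproduced from the hunks above (PTI enabled). */
#define X86_CR3_PCID_BITS	12
#define PTI_CONSUMED_PCID_BITS	1
#define CR3_AVAIL_PCID_BITS	(X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
#define MAX_ASID_AVAILABLE	((1 << CR3_AVAIL_PCID_BITS) - 2)
#define TLB_NR_DYN_ASIDS	6
#define X86_CR3_PTI_SWITCH_BIT	11

int main(void)
{
	/* Mirrors the BUILD_BUG_ON: dynamic ASIDs must stay below the switch bit. */
	assert(TLB_NR_DYN_ASIDS < (1 << X86_CR3_PTI_SWITCH_BIT));

	printf("valid ASIDs: 0..%d\n", MAX_ASID_AVAILABLE);	/* 0..2046 */
	for (int asid = 0; asid < TLB_NR_DYN_ASIDS; asid++)
		printf("ASID %d -> kernel PCID %d, user PCID %#x\n",
		       asid, asid + 1,			/* kern_pcid(): ASID+1, PCID 0 stays reserved */
		       (asid + 1) | (1 << X86_CR3_PTI_SWITCH_BIT));
	return 0;
}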

arch/x86/include/uapi/asm/processor-flags.h

Lines changed: 6 additions & 1 deletion
@@ -78,7 +78,12 @@
 #define X86_CR3_PWT		_BITUL(X86_CR3_PWT_BIT)
 #define X86_CR3_PCD_BIT		4 /* Page Cache Disable */
 #define X86_CR3_PCD		_BITUL(X86_CR3_PCD_BIT)
-#define X86_CR3_PCID_MASK	_AC(0x00000fff,UL) /* PCID Mask */
+
+#define X86_CR3_PCID_BITS	12
+#define X86_CR3_PCID_MASK	(_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
+
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH	_BITULL(X86_CR3_PCID_NOFLUSH_BIT)
 
 /*
  * Intel CPU features in CR4

arch/x86/kernel/asm-offsets.c

Lines changed: 4 additions & 0 deletions
@@ -17,6 +17,7 @@
 #include <asm/sigframe.h>
 #include <asm/bootparam.h>
 #include <asm/suspend.h>
+#include <asm/tlbflush.h>
 
 #ifdef CONFIG_XEN
 #include <xen/interface/xen.h>
@@ -94,6 +95,9 @@ void common(void) {
	BLANK();
	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
 
+	/* TLB state for the entry code */
+	OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
+
	/* Layout info for cpu_entry_area */
	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);

arch/x86/mm/init.c

Lines changed: 1 addition & 1 deletion
@@ -855,7 +855,7 @@ void __init zone_sizes_init(void)
	free_area_init_nodes(max_zone_pfns);
 }
 
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
	.loaded_mm = &init_mm,
	.next_asid = 1,
	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */

arch/x86/mm/tlb.c

Lines changed: 1 addition & 0 deletions
@@ -105,6 +105,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
	unsigned long new_mm_cr3;
 
	if (need_flush) {
+		invalidate_user_asid(new_asid);
		new_mm_cr3 = build_cr3(pgdir, new_asid);
	} else {
		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
