
Commit caf9a82

Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 PTI preparatory patches from Thomas Gleixner:
 "Today's Advent calendar window contains twenty-four easy-to-digest
  patches. The original plan was to have twenty-three, matching the
  date, but a late fixup made that moot.

   - Move the cpu_entry_area mapping out of the fixmap into a separate
     address space. That's necessary because the fixmap becomes too big
     with NR_CPUS=8192 and this already caused subtle and hard-to-diagnose
     failures.

     The topmost patch is fresh from today and cures a brain slip of
     that tall grumpy German greybeard, who ignored the intricacies of
     32-bit wraparounds.

   - Limit the number of CPUs on 32-bit to 64. That's insanely big
     already, but at least it's small enough to prevent the address-space
     issues with the cpu_entry_area map which had been observed and
     debugged with the fixmap code.

   - A few TLB-flush fixes in various places, plus documentation of
     which TLB functions should be used for what.

   - Rename the SYSENTER stack to CPU_ENTRY_AREA stack, as it is used
     for more than SYSENTER now and keeping the old name makes
     backtraces confusing.

   - Prevent LDT inheritance on exec() by moving it to arch_dup_mmap(),
     which is only invoked on fork().

   - Make vsyscall more robust.

   - A few fixes and cleanups of the debug_pagetables code: check
     PAGE_PRESENT instead of checking the PTE for 0, and clean up the
     C89 initialization of the address hint array, which was already
     out of sync with the index enums.

   - Move the ESPFIX init to a different place to prepare for PTI.

   - Several code moves with no functional change to make PTI
     integration simpler and header files less convoluted.

   - Documentation fixes and clarifications"

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits)
  x86/cpu_entry_area: Prevent wraparound in setup_cpu_entry_area_ptes() on 32bit
  init: Invoke init_espfix_bsp() from mm_init()
  x86/cpu_entry_area: Move it out of the fixmap
  x86/cpu_entry_area: Move it to a separate unit
  x86/mm: Create asm/invpcid.h
  x86/mm: Put MMU to hardware ASID translation in one place
  x86/mm: Remove hard-coded ASID limit checks
  x86/mm: Move the CR3 construction functions to tlbflush.h
  x86/mm: Add comments to clarify which TLB-flush functions are supposed to flush what
  x86/mm: Remove superfluous barriers
  x86/mm: Use __flush_tlb_one() for kernel memory
  x86/microcode: Dont abuse the TLB-flush interface
  x86/uv: Use the right TLB-flush API
  x86/entry: Rename SYSENTER_stack to CPU_ENTRY_AREA_entry_stack
  x86/doc: Remove obvious weirdnesses from the x86 MM layout documentation
  x86/mm/64: Improve the memory map documentation
  x86/ldt: Prevent LDT inheritance on exec
  x86/ldt: Rework locking
  arch, mm: Allow arch_dup_mmap() to fail
  x86/vsyscall/64: Warn and fail vsyscall emulation in NATIVE mode
  ...
2 parents 9c294ec + f6c4fd5 commit caf9a82

44 files changed: +626 −458 lines

Documentation/x86/x86_64/mm.txt

Lines changed: 11 additions & 13 deletions
@@ -1,6 +1,4 @@
 
-<previous description obsolete, deleted>
-
 Virtual memory map with 4 level page tables:
 
 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
@@ -14,13 +12,15 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start]   (~1526 MB) module mapping space (variable)
+[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 
 Virtual memory map with 5 level page tables:
@@ -36,19 +36,22 @@ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
 ... unused hole ...
 ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
 ... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start]   (~1526 MB) module mapping space
+[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 
 Architecture defines a 64-bit virtual address. Implementations can support
 less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
-through to the most-significant implemented bit are set to either all ones
-or all zero. This causes hole between user space and kernel addresses.
+through to the most-significant implemented bit are sign extended.
+This causes hole between user space and kernel addresses if you interpret them
+as unsigned.
 
 The direct mapping covers all memory in the system up to the highest
 memory address (this means in some cases it can also include PCI memory
@@ -58,9 +61,6 @@ vmalloc space is lazily synchronized into the different PML4/PML5 pages of
 the processes using the page fault handler, with init_top_pgt as
 reference.
 
-Current X86-64 implementations support up to 46 bits of address space (64 TB),
-which is our current limit. This expands into MBZ space in the page tables.
-
 We map EFI runtime services in the 'efi_pgd' PGD in a 64Gb large virtual
 memory window (this size is arbitrary, it can be raised later if needed).
 The mappings are not part of any other kernel PGD and are only available
@@ -72,5 +72,3 @@ following fixmap section.
 Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
 physical memory, vmalloc/ioremap space and virtual memory map are randomized.
 Their order is preserved but their base will be offset early at boot time.
-
--Andi Kleen, Jul 2004

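The "sign extended" wording above is the canonical-address rule: bits 63 down to the most-significant implemented bit must all copy that bit, which is what opens the hole between the user and kernel halves. A small illustrative check for 48-bit (4-level) addresses; this is a sketch, not code from the kernel tree:

#include <stdbool.h>
#include <stdint.h>

/* Illustrative only: canonical-address test for 48-bit virtual addresses. */
static bool is_canonical_48(uint64_t addr)
{
	/* Shift the top 16 bits out, then back in with sign extension of bit 47. */
	return (uint64_t)((int64_t)(addr << 16) >> 16) == addr;
}

/*
 * 0x00007fffffffffff -> canonical (top of user space)
 * 0x0000800000000000 -> non-canonical (start of the hole)
 * 0xffff800000000000 -> canonical (bottom of the kernel range)
 */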
arch/powerpc/include/asm/mmu_context.h

Lines changed: 3 additions & 2 deletions
@@ -160,9 +160,10 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
 #endif
 }
 
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
-				 struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+				struct mm_struct *mm)
 {
+	return 0;
 }
 
 #ifndef CONFIG_PPC_BOOK3S_64

arch/um/include/asm/mmu_context.h

Lines changed: 2 additions & 1 deletion
@@ -15,9 +15,10 @@ extern void uml_setup_stubs(struct mm_struct *mm);
 /*
  * Needed since we do not use the asm-generic/mm_hooks.h:
  */
-static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 {
 	uml_setup_stubs(mm);
+	return 0;
 }
 extern void arch_exit_mmap(struct mm_struct *mm);
 static inline void arch_unmap(struct mm_struct *mm,

arch/unicore32/include/asm/mmu_context.h

Lines changed: 3 additions & 2 deletions
@@ -81,9 +81,10 @@ do { \
 	} \
 } while (0)
 
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
-				 struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+				struct mm_struct *mm)
 {
+	return 0;
 }
 
 static inline void arch_unmap(struct mm_struct *mm,

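All three stubs above change only the signature; the reason arch_dup_mmap() now returns int is the "arch, mm: Allow arch_dup_mmap() to fail" patch in the series, which lets x86 refuse a fork when duplicating the LDT fails. A hedged sketch of the caller side, loosely modelled on the dup_mmap() path in kernel/fork.c (that file is among the hidden changes, so details may differ):

/*
 * Sketch only: how a dup_mmap()-style caller can propagate an
 * arch_dup_mmap() failure instead of ignoring it.  Simplified and
 * hypothetical; the real fork path does much more work.
 */
static int example_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
	int retval;

	/* ... copy VMAs and page tables from oldmm into mm ... */

	retval = arch_dup_mmap(oldmm, mm);	/* was void before this merge */
	if (retval)
		return retval;			/* e.g. -ENOMEM from an LDT copy */

	return 0;
}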
arch/x86/Kconfig

Lines changed: 2 additions & 1 deletion
@@ -926,7 +926,8 @@ config MAXSMP
 config NR_CPUS
 	int "Maximum number of CPUs" if SMP && !MAXSMP
 	range 2 8 if SMP && X86_32 && !X86_BIGSMP
-	range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK
+	range 2 64 if SMP && X86_32 && X86_BIGSMP
+	range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64
 	range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64
 	default "1" if !SMP
 	default "8192" if MAXSMP

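The new 64-CPU cap for 32-bit BIGSMP follows directly from the cpu_entry_area layout introduced below: the mapped range grows linearly with NR_CPUS and has to fit into the scarce 32-bit kernel address space. A rough, illustrative calculation (the per-CPU size here is an assumption, not a number from this merge):

/*
 * Illustrative arithmetic only.  Assume each CPU's entry area needs
 * roughly five 4K pages (GDT, entry stack, TSS, trampoline):
 *
 *   NR_CPUS = 8192  ->  8192 * 5 * 4 KB  ~= 160 MB of kernel VA
 *   NR_CPUS =   64  ->    64 * 5 * 4 KB  ~= 1.25 MB
 *
 * 160 MB is hopeless inside the ~1 GB 32-bit kernel mapping; 1.25 MB
 * is trivial, hence "range 2 64" for X86_32 && X86_BIGSMP.
 */
#define EXAMPLE_CEA_PAGES_PER_CPU	5	/* assumed rough size */
#define EXAMPLE_CEA_VA_NEEDED(ncpus)	\
	((unsigned long)(ncpus) * EXAMPLE_CEA_PAGES_PER_CPU * PAGE_SIZE)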
arch/x86/entry/entry_32.S

Lines changed: 6 additions & 6 deletions
@@ -942,9 +942,9 @@ ENTRY(debug)
 
 	/* Are we currently on the SYSENTER stack? */
 	movl	PER_CPU_VAR(cpu_entry_area), %ecx
-	addl	$CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
-	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
-	cmpl	$SIZEOF_SYSENTER_stack, %ecx
+	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+	subl	%eax, %ecx	/* ecx = (end of entry_stack) - esp */
+	cmpl	$SIZEOF_entry_stack, %ecx
 	jb	.Ldebug_from_sysenter_stack
 
 	TRACE_IRQS_OFF
@@ -986,9 +986,9 @@ ENTRY(nmi)
 
 	/* Are we currently on the SYSENTER stack? */
 	movl	PER_CPU_VAR(cpu_entry_area), %ecx
-	addl	$CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
-	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
-	cmpl	$SIZEOF_SYSENTER_stack, %ecx
+	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+	subl	%eax, %ecx	/* ecx = (end of entry_stack) - esp */
+	cmpl	$SIZEOF_entry_stack, %ecx
 	jb	.Lnmi_from_sysenter_stack
 
 	/* Not on SYSENTER stack. */

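Only the symbol names change here, but the check itself deserves a note: a single unsigned comparison tests whether the saved %esp lies inside the per-CPU entry stack. A C sketch of the same logic, using hypothetical helper names (the authoritative version is the assembly above):

/*
 * Sketch of the entry-stack range check from entry_32.S.  One unsigned
 * compare covers both bounds: if esp is below the stack, end - esp is
 * larger than the stack size; if esp is above the end, the subtraction
 * wraps around to a huge unsigned value.
 */
static bool example_on_entry_stack(unsigned long esp,
				   struct cpu_entry_area *cea)
{
	unsigned long end = (unsigned long)&cea->entry_stack_page +
			    sizeof(cea->entry_stack_page);

	return (end - esp) < sizeof(cea->entry_stack_page);
}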
arch/x86/entry/entry_64.S

Lines changed: 2 additions & 2 deletions
@@ -158,8 +158,8 @@ END(native_usergs_sysret64)
 	_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
 
 /* The top word of the SYSENTER stack is hot and is usable as scratch space. */
-#define RSP_SCRATCH	CPU_ENTRY_AREA_SYSENTER_stack + \
-			SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
+#define RSP_SCRATCH	CPU_ENTRY_AREA_entry_stack + \
+			SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
 
 ENTRY(entry_SYSCALL_64_trampoline)
 	UNWIND_HINT_EMPTY

arch/x86/entry/vsyscall/vsyscall_64.c

Lines changed: 37 additions & 1 deletion
@@ -37,6 +37,7 @@
 #include <asm/unistd.h>
 #include <asm/fixmap.h>
 #include <asm/traps.h>
+#include <asm/paravirt.h>
 
 #define CREATE_TRACE_POINTS
 #include "vsyscall_trace.h"
@@ -138,6 +139,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 
 	WARN_ON_ONCE(address != regs->ip);
 
+	/* This should be unreachable in NATIVE mode. */
+	if (WARN_ON(vsyscall_mode == NATIVE))
+		return false;
+
 	if (vsyscall_mode == NONE) {
 		warn_bad_vsyscall(KERN_INFO, regs,
 				  "vsyscall attempted with vsyscall=none");
@@ -329,16 +334,47 @@ int in_gate_area_no_mm(unsigned long addr)
 	return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
 }
 
+/*
+ * The VSYSCALL page is the only user-accessible page in the kernel address
+ * range. Normally, the kernel page tables can have _PAGE_USER clear, but
+ * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
+ * are enabled.
+ *
+ * Some day we may create a "minimal" vsyscall mode in which we emulate
+ * vsyscalls but leave the page not present. If so, we skip calling
+ * this.
+ */
+static void __init set_vsyscall_pgtable_user_bits(void)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pgd = pgd_offset_k(VSYSCALL_ADDR);
+	set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
+	p4d = p4d_offset(pgd, VSYSCALL_ADDR);
+#if CONFIG_PGTABLE_LEVELS >= 5
+	p4d->p4d |= _PAGE_USER;
+#endif
+	pud = pud_offset(p4d, VSYSCALL_ADDR);
+	set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
+	pmd = pmd_offset(pud, VSYSCALL_ADDR);
+	set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
+}
+
 void __init map_vsyscall(void)
 {
 	extern char __vsyscall_page;
 	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
 
-	if (vsyscall_mode != NONE)
+	if (vsyscall_mode != NONE) {
 		__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
 			     vsyscall_mode == NATIVE
 			     ? PAGE_KERNEL_VSYSCALL
 			     : PAGE_KERNEL_VVAR);
+		set_vsyscall_pgtable_user_bits();
+	}
 
 	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
 		     (unsigned long)VSYSCALL_ADDR);

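A side note on why set_vsyscall_pgtable_user_bits() has to touch every level: x86 only permits a user-mode access when the U/S bit is set in the entry at each paging level along the walk, not just in the final PTE. An illustrative predicate (not kernel code):

/*
 * Illustrative only: user access to a virtual address requires
 * _PAGE_USER at every level of the walk, which is why the new
 * function sets it on the PGD, P4D, PUD and PMD covering
 * VSYSCALL_ADDR (the PTE-level mapping is handled by __set_fixmap()).
 */
static bool example_user_accessible(u64 pgd_e, u64 p4d_e, u64 pud_e,
				    u64 pmd_e, u64 pte_e)
{
	return (pgd_e & p4d_e & pud_e & pmd_e & pte_e & _PAGE_USER) != 0;
}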
arch/x86/include/asm/cpu_entry_area.h

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef _ASM_X86_CPU_ENTRY_AREA_H
+#define _ASM_X86_CPU_ENTRY_AREA_H
+
+#include <linux/percpu-defs.h>
+#include <asm/processor.h>
+
+/*
+ * cpu_entry_area is a percpu region that contains things needed by the CPU
+ * and early entry/exit code. Real types aren't used for all fields here
+ * to avoid circular header dependencies.
+ *
+ * Every field is a virtual alias of some other allocated backing store.
+ * There is no direct allocation of a struct cpu_entry_area.
+ */
+struct cpu_entry_area {
+	char gdt[PAGE_SIZE];
+
+	/*
+	 * The GDT is just below entry_stack and thus serves (on x86_64) as
+	 * a a read-only guard page.
+	 */
+	struct entry_stack_page entry_stack_page;
+
+	/*
+	 * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
+	 * we need task switches to work, and task switches write to the TSS.
+	 */
+	struct tss_struct tss;
+
+	char entry_trampoline[PAGE_SIZE];
+
+#ifdef CONFIG_X86_64
+	/*
+	 * Exception stacks used for IST entries.
+	 *
+	 * In the future, this should have a separate slot for each stack
+	 * with guard pages between them.
+	 */
+	char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
+#endif
+};
+
+#define CPU_ENTRY_AREA_SIZE	(sizeof(struct cpu_entry_area))
+#define CPU_ENTRY_AREA_TOT_SIZE	(CPU_ENTRY_AREA_SIZE * NR_CPUS)
+
+DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
+
+extern void setup_cpu_entry_areas(void);
+extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
+
+#define CPU_ENTRY_AREA_RO_IDT		CPU_ENTRY_AREA_BASE
+#define CPU_ENTRY_AREA_PER_CPU		(CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)
+
+#define CPU_ENTRY_AREA_RO_IDT_VADDR	((void *)CPU_ENTRY_AREA_RO_IDT)
+
+#define CPU_ENTRY_AREA_MAP_SIZE			\
+	(CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE)
+
+extern struct cpu_entry_area *get_cpu_entry_area(int cpu);
+
+static inline struct entry_stack *cpu_entry_stack(int cpu)
+{
+	return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
+}
+
+#endif

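Given these constants, looking up a CPU's entry area is pure address arithmetic: the per-CPU slots start at CPU_ENTRY_AREA_PER_CPU and are CPU_ENTRY_AREA_SIZE apart. A sketch of what get_cpu_entry_area() can look like; the real definition lives in the new arch/x86/mm/cpu_entry_area.c, which is hidden in this commit view and may differ in detail:

/* Sketch only -- see arch/x86/mm/cpu_entry_area.c for the real code. */
static struct cpu_entry_area *example_get_cpu_entry_area(int cpu)
{
	unsigned long va = CPU_ENTRY_AREA_PER_CPU +
			   (unsigned long)cpu * CPU_ENTRY_AREA_SIZE;

	/* Each area must be a whole number of pages for the aliasing to work. */
	BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);

	return (struct cpu_entry_area *)va;
}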
arch/x86/include/asm/desc.h

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@
 #include <asm/mmu.h>
 #include <asm/fixmap.h>
 #include <asm/irq_vectors.h>
+#include <asm/cpu_entry_area.h>
 
 #include <linux/smp.h>
 #include <linux/percpu.h>

arch/x86/include/asm/espfix.h

Lines changed: 4 additions & 3 deletions
@@ -2,7 +2,7 @@
 #ifndef _ASM_X86_ESPFIX_H
 #define _ASM_X86_ESPFIX_H
 
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_ESPFIX64
 
 #include <asm/percpu.h>
 
@@ -11,7 +11,8 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
 
 extern void init_espfix_bsp(void);
 extern void init_espfix_ap(int cpu);
-
-#endif /* CONFIG_X86_64 */
+#else
+static inline void init_espfix_ap(int cpu) { }
+#endif
 
 #endif /* _ASM_X86_ESPFIX_H */

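The !CONFIG_X86_ESPFIX64 stub exists so that callers can drop their preprocessor guards; the series pairs it with "init: Invoke init_espfix_bsp() from mm_init()", which moves the BSP-side setup into generic init code. A hedged sketch of the calling pattern this enables (the actual call sites are in files hidden from this view):

/*
 * Sketch only: with a no-op init_espfix_ap() stub for builds without
 * espfix64, CPU-bringup code can call it unconditionally.
 * Hypothetical, simplified example.
 */
static void __init example_bringup_cpu(int cpu)
{
	init_espfix_ap(cpu);	/* compiles to nothing without ESPFIX64 */

	/* ... remainder of AP bringup ... */
}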