Commit f55f050

amluto authored and Ingo Molnar committed
x86/pti: Put the LDT in its own PGD if PTI is on
With PTI enabled, the LDT must be mapped in the usermode tables somewhere. The LDT is per process, i.e. per mm.

An earlier approach mapped the LDT on context switch into a fixmap area, but that's a big overhead and exhausted the fixmap space when NR_CPUS got big.

Take advantage of the fact that there is an address space hole which provides a completely unused pgd. Use this pgd to manage per-mm LDT mappings.

This has a down side: the LDT isn't (currently) randomized, and an attack that can write the LDT is instant root due to call gates (thanks, AMD, for leaving call gates in AMD64 but designing them wrong so they're only useful for exploits). This can be mitigated by making the LDT read-only or randomizing the mapping, either of which is straightforward on top of this patch.

This will significantly slow down LDT users, but that shouldn't matter for important workloads -- the LDT is only used by DOSEMU(2), Wine, and very old libc implementations.

[ tglx: Cleaned it up. ]

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>

1 parent 9f44977 commit f55f050

File tree: 6 files changed, +220 -17 lines

Documentation/x86/x86_64/mm.txt
Lines changed: 2 additions & 1 deletion

@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
 fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
@@ -29,7 +30,7 @@ Virtual memory map with 5 level page tables:
 hole caused by [56:63] sign extension
 ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
 ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff9fffffffffffff (=52 bits) hole
+ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
 ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
 ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
 ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)

arch/x86/include/asm/mmu_context.h
Lines changed: 53 additions & 6 deletions

@@ -50,10 +50,33 @@ struct ldt_struct {
         * call gates.  On native, we could merge the ldt_struct and LDT
         * allocations, but it's not worth trying to optimize.
         */
-       struct desc_struct *entries;
-       unsigned int nr_entries;
+       struct desc_struct      *entries;
+       unsigned int            nr_entries;
+
+       /*
+        * If PTI is in use, then the entries array is not mapped while we're
+        * in user mode.  The whole array will be aliased at the addressed
+        * given by ldt_slot_va(slot).  We use two slots so that we can allocate
+        * and map, and enable a new LDT without invalidating the mapping
+        * of an older, still-in-use LDT.
+        *
+        * slot will be -1 if this LDT doesn't have an alias mapping.
+        */
+       int                     slot;
 };
 
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static inline void *ldt_slot_va(int slot)
+{
+#ifdef CONFIG_X86_64
+       return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+#else
+       BUG();
+#endif
+}
+
 /*
  * Used for LDT copy/destruction.
  */
@@ -64,14 +87,16 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
 }
 int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
 void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
 #else  /* CONFIG_MODIFY_LDT_SYSCALL */
 static inline void init_new_context_ldt(struct mm_struct *mm) { }
 static inline int ldt_dup_context(struct mm_struct *oldmm,
                                   struct mm_struct *mm)
 {
        return 0;
 }
-static inline void destroy_context_ldt(struct mm_struct *mm) {}
+static inline void destroy_context_ldt(struct mm_struct *mm) { }
+static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
 #endif
 
 static inline void load_mm_ldt(struct mm_struct *mm)
@@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
         * that we can see.
         */
 
-       if (unlikely(ldt))
-               set_ldt(ldt->entries, ldt->nr_entries);
-       else
+       if (unlikely(ldt)) {
+               if (static_cpu_has(X86_FEATURE_PTI)) {
+                       if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+                               /*
+                                * Whoops -- either the new LDT isn't mapped
+                                * (if slot == -1) or is mapped into a bogus
+                                * slot (if slot > 1).
+                                */
+                               clear_LDT();
+                               return;
+                       }
+
+                       /*
+                        * If page table isolation is enabled, ldt->entries
+                        * will not be mapped in the userspace pagetables.
+                        * Tell the CPU to access the LDT through the alias
+                        * at ldt_slot_va(ldt->slot).
+                        */
+                       set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+               } else {
+                       set_ldt(ldt->entries, ldt->nr_entries);
+               }
+       } else {
                clear_LDT();
+       }
 #else
        clear_LDT();
 #endif
@@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
        paravirt_arch_exit_mmap(mm);
+       ldt_arch_exit_mmap(mm);
 }
 
 #ifdef CONFIG_X86_64

arch/x86/include/asm/pgtable_64_types.h
Lines changed: 4 additions & 0 deletions

@@ -82,10 +82,14 @@ typedef struct { pteval_t pte; } pte_t;
 # define VMALLOC_SIZE_TB       _AC(12800, UL)
 # define __VMALLOC_BASE        _AC(0xffa0000000000000, UL)
 # define __VMEMMAP_BASE        _AC(0xffd4000000000000, UL)
+# define LDT_PGD_ENTRY         _AC(-112, UL)
+# define LDT_BASE_ADDR         (LDT_PGD_ENTRY << PGDIR_SHIFT)
 #else
 # define VMALLOC_SIZE_TB       _AC(32, UL)
 # define __VMALLOC_BASE        _AC(0xffffc90000000000, UL)
 # define __VMEMMAP_BASE        _AC(0xffffea0000000000, UL)
+# define LDT_PGD_ENTRY         _AC(-4, UL)
+# define LDT_BASE_ADDR         (LDT_PGD_ENTRY << PGDIR_SHIFT)
 #endif
 
 #ifdef CONFIG_RANDOMIZE_MEMORY
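A quick way to sanity-check these constants against the mm.txt hunk above is to redo the arithmetic in userspace. The sketch below hard-codes PGDIR_SHIFT (39 for 4-level paging, 48 for 5-level) and the LDT geometry (LDT_ENTRIES = 8192, LDT_ENTRY_SIZE = 8) as assumptions rather than pulling them from kernel headers; it is an illustration, not kernel code:

#include <stdio.h>

/* Assumed values, mirroring (not including) the kernel definitions. */
#define PGDIR_SHIFT_4LVL  39                   /* one PGD entry covers 512 GiB */
#define PGDIR_SHIFT_5LVL  48                   /* one PGD entry covers 256 TiB */
#define LDT_SLOT_STRIDE   (8192UL * 8)         /* LDT_ENTRIES * LDT_ENTRY_SIZE = 64 KiB */

int main(void)
{
        /* LDT_PGD_ENTRY is a negative index counted down from the top of the address space. */
        unsigned long base_4lvl = (unsigned long)-4L   << PGDIR_SHIFT_4LVL;
        unsigned long base_5lvl = (unsigned long)-112L << PGDIR_SHIFT_5LVL;

        printf("4-level LDT_BASE_ADDR: 0x%016lx\n", base_4lvl);  /* 0xfffffe0000000000 */
        printf("5-level LDT_BASE_ADDR: 0x%016lx\n", base_5lvl);  /* 0xff90000000000000 */
        printf("slot 0 alias offset: +0x%lx, slot 1 alias offset: +0x%lx\n",
               0UL, LDT_SLOT_STRIDE);
        return 0;
}

Both results land exactly on the "LDT remap for PTI" ranges documented in Documentation/x86/x86_64/mm.txt above, and the two 64 KiB slots use only a tiny fraction of the otherwise unused PGD entry.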

arch/x86/include/asm/processor.h
Lines changed: 16 additions & 7 deletions

@@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x)
 
 #else
 /*
- * User space process size. 47bits minus one guard page.  The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously.  We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
+ * User space process size.  This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything executable
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen.  This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
  */
 #define TASK_SIZE_MAX  ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
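For reference, the TASK_SIZE_MAX definition kept above works out to exactly one page below the canonical boundary. A tiny userspace sketch, with the __VIRTUAL_MASK_SHIFT values (47 for 4-level paging, 56 for 5-level) hard-coded here as assumptions rather than taken from kernel headers:

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
        /* 4-level paging: __VIRTUAL_MASK_SHIFT == 47 */
        printf("TASK_SIZE_MAX (4-level): 0x%016lx\n", (1UL << 47) - PAGE_SIZE);
        /* 5-level paging: __VIRTUAL_MASK_SHIFT == 56 */
        printf("TASK_SIZE_MAX (5-level): 0x%016lx\n", (1UL << 56) - PAGE_SIZE);
        return 0;
}

The guard page keeps both the Intel SYSRET problem and the Ryzen speculation problem described in the comment away from the last canonical page.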

arch/x86/kernel/ldt.c
Lines changed: 136 additions & 3 deletions

@@ -24,6 +24,7 @@
 #include <linux/uaccess.h>
 
 #include <asm/ldt.h>
+#include <asm/tlb.h>
 #include <asm/desc.h>
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
@@ -51,13 +52,11 @@ static void refresh_ldt_segments(void)
 static void flush_ldt(void *__mm)
 {
        struct mm_struct *mm = __mm;
-       mm_context_t *pc;
 
        if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
                return;
 
-       pc = &mm->context;
-       set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+       load_mm_ldt(mm);
 
        refresh_ldt_segments();
 }
@@ -94,10 +93,121 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
                return NULL;
        }
 
+       /* The new LDT isn't aliased for PTI yet. */
+       new_ldt->slot = -1;
+
        new_ldt->nr_entries = num_entries;
        return new_ldt;
 }
 
+/*
+ * If PTI is enabled, this maps the LDT into the kernelmode and
+ * usermode tables for the given mm.
+ *
+ * There is no corresponding unmap function.  Even if the LDT is freed, we
+ * leave the PTEs around until the slot is reused or the mm is destroyed.
+ * This is harmless: the LDT is always in ordinary memory, and no one will
+ * access the freed slot.
+ *
+ * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
+ * it useful, and the flush would slow down modify_ldt().
+ */
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+       bool is_vmalloc, had_top_level_entry;
+       unsigned long va;
+       spinlock_t *ptl;
+       pgd_t *pgd;
+       int i;
+
+       if (!static_cpu_has(X86_FEATURE_PTI))
+               return 0;
+
+       /*
+        * Any given ldt_struct should have map_ldt_struct() called at most
+        * once.
+        */
+       WARN_ON(ldt->slot != -1);
+
+       /*
+        * Did we already have the top level entry allocated?  We can't
+        * use pgd_none() for this because it doens't do anything on
+        * 4-level page table kernels.
+        */
+       pgd = pgd_offset(mm, LDT_BASE_ADDR);
+       had_top_level_entry = (pgd->pgd != 0);
+
+       is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+       for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
+               unsigned long offset = i << PAGE_SHIFT;
+               const void *src = (char *)ldt->entries + offset;
+               unsigned long pfn;
+               pte_t pte, *ptep;
+
+               va = (unsigned long)ldt_slot_va(slot) + offset;
+               pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+                       page_to_pfn(virt_to_page(src));
+               /*
+                * Treat the PTI LDT range as a *userspace* range.
+                * get_locked_pte() will allocate all needed pagetables
+                * and account for them in this mm.
+                */
+               ptep = get_locked_pte(mm, va, &ptl);
+               if (!ptep)
+                       return -ENOMEM;
+               pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL));
+               set_pte_at(mm, va, ptep, pte);
+               pte_unmap_unlock(ptep, ptl);
+       }
+
+       if (mm->context.ldt) {
+               /*
+                * We already had an LDT.  The top-level entry should already
+                * have been allocated and synchronized with the usermode
+                * tables.
+                */
+               WARN_ON(!had_top_level_entry);
+               if (static_cpu_has(X86_FEATURE_PTI))
+                       WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
+       } else {
+               /*
+                * This is the first time we're mapping an LDT for this process.
+                * Sync the pgd to the usermode tables.
+                */
+               WARN_ON(had_top_level_entry);
+               if (static_cpu_has(X86_FEATURE_PTI)) {
+                       WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
+                       set_pgd(kernel_to_user_pgdp(pgd), *pgd);
+               }
+       }
+
+       va = (unsigned long)ldt_slot_va(slot);
+       flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
+
+       ldt->slot = slot;
+#endif
+       return 0;
+}
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+       struct mmu_gather tlb;
+       unsigned long start = LDT_BASE_ADDR;
+       unsigned long end = start + (1UL << PGDIR_SHIFT);
+
+       if (!static_cpu_has(X86_FEATURE_PTI))
+               return;
+
+       tlb_gather_mmu(&tlb, mm, start, end);
+       free_pgd_range(&tlb, start, end, start, end);
+       tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
 /* After calling this, the LDT is immutable. */
 static void finalize_ldt_struct(struct ldt_struct *ldt)
 {
@@ -156,6 +266,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
               new_ldt->nr_entries * LDT_ENTRY_SIZE);
        finalize_ldt_struct(new_ldt);
 
+       retval = map_ldt_struct(mm, new_ldt, 0);
+       if (retval) {
+               free_ldt_pgtables(mm);
+               free_ldt_struct(new_ldt);
+               goto out_unlock;
+       }
        mm->context.ldt = new_ldt;
 
 out_unlock:
@@ -174,6 +290,11 @@ void destroy_context_ldt(struct mm_struct *mm)
        mm->context.ldt = NULL;
 }
 
+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+       free_ldt_pgtables(mm);
+}
+
 static int read_ldt(void __user *ptr, unsigned long bytecount)
 {
        struct mm_struct *mm = current->mm;
@@ -287,6 +408,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
        new_ldt->entries[ldt_info.entry_number] = ldt;
        finalize_ldt_struct(new_ldt);
 
+       /*
+        * If we are using PTI, map the new LDT into the userspace pagetables.
+        * If there is already an LDT, use the other slot so that other CPUs
+        * will continue to use the old LDT until install_ldt() switches
+        * them over to the new LDT.
+        */
+       error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+       if (error) {
+               free_ldt_struct(old_ldt);
+               goto out_unlock;
+       }
+
        install_ldt(mm, new_ldt);
        free_ldt_struct(old_ldt);
        error = 0;
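The mapping code above is only reachable through the modify_ldt() syscall. As a rough illustration of how the DOSEMU/Wine-style workloads mentioned in the commit message end up in write_ldt() and map_ldt_struct(), here is a minimal userspace sketch; the descriptor contents are arbitrary example values, not anything taken from this patch:

#include <asm/ldt.h>            /* struct user_desc */
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct user_desc desc;

        memset(&desc, 0, sizeof(desc));
        desc.entry_number   = 0;        /* first LDT slot */
        desc.base_addr      = 0;        /* flat data segment, purely illustrative */
        desc.limit          = 0xfffff;
        desc.seg_32bit      = 1;
        desc.limit_in_pages = 1;
        desc.useable        = 1;

        /* func == 1 writes one LDT entry and goes through write_ldt() above. */
        if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) != 0) {
                perror("modify_ldt");
                return 1;
        }

        puts("LDT entry installed; with PTI enabled this set up the LDT alias mapping");
        return 0;
}

Each subsequent call alternates between the two alias slots, which is why write_ldt() passes old_ldt ? !old_ldt->slot : 0 to map_ldt_struct().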

arch/x86/mm/dump_pagetables.c
Lines changed: 9 additions & 0 deletions

@@ -52,11 +52,17 @@ enum address_markers_idx {
        USER_SPACE_NR = 0,
        KERNEL_SPACE_NR,
        LOW_KERNEL_NR,
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
+       LDT_NR,
+#endif
        VMALLOC_START_NR,
        VMEMMAP_START_NR,
 #ifdef CONFIG_KASAN
        KASAN_SHADOW_START_NR,
        KASAN_SHADOW_END_NR,
+#endif
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
+       LDT_NR,
 #endif
        CPU_ENTRY_AREA_NR,
 #ifdef CONFIG_X86_ESPFIX64
@@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = {
 #ifdef CONFIG_KASAN
        [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
        [KASAN_SHADOW_END_NR]   = { KASAN_SHADOW_END,   "KASAN shadow end" },
+#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+       [LDT_NR]                = { LDT_BASE_ADDR,      "LDT remap" },
 #endif
        [CPU_ENTRY_AREA_NR]     = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
 #ifdef CONFIG_X86_ESPFIX64
