Skip to content

Commit 6c690ee

Browse files
amluto authored and Ingo Molnar committed
x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()
The kernel has several code paths that read CR3. Most of them assume that CR3 contains the PGD's physical address, whereas some of them awkwardly use PHYSICAL_PAGE_MASK to mask off low bits.

Add explicit mask macros for CR3 and convert all of the CR3 readers. This will keep them from breaking when PCID is enabled.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: xen-devel <xen-devel@lists.xen.org>
Link: http://lkml.kernel.org/r/883f8fb121f4616c1c1427ad87350bb2f5ffeca1.1497288170.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
1 parent 3f365cf commit 6c690ee

File tree

20 files changed

+79
-29
lines changed

20 files changed

+79
-29
lines changed

arch/x86/boot/compressed/pagetable.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ void initialize_identity_maps(void)
9292
* and we must append to the existing area instead of entirely
9393
* overwriting it.
9494
*/
95-
level4p = read_cr3();
95+
level4p = read_cr3_pa();
9696
if (level4p == (unsigned long)_pgtable) {
9797
debug_putstr("booted via startup_32()\n");
9898
pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;

arch/x86/include/asm/efi.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ struct efi_scratch {
7474
__kernel_fpu_begin(); \
7575
\
7676
if (efi_scratch.use_pgd) { \
77-
efi_scratch.prev_cr3 = read_cr3(); \
77+
efi_scratch.prev_cr3 = __read_cr3(); \
7878
write_cr3((unsigned long)efi_scratch.efi_pgt); \
7979
__flush_tlb_all(); \
8080
} \

arch/x86/include/asm/mmu_context.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
269269

270270
/*
271271
* This can be used from process context to figure out what the value of
272-
* CR3 is without needing to do a (slow) read_cr3().
272+
* CR3 is without needing to do a (slow) __read_cr3().
273273
*
274274
* It's intended to be used for code like KVM that sneakily changes CR3
275275
* and needs to restore it. It needs to be used very carefully.
@@ -281,7 +281,7 @@ static inline unsigned long __get_current_cr3_fast(void)
281281
/* For now, be very restrictive about when this can be called. */
282282
VM_WARN_ON(in_nmi() || !in_atomic());
283283

284-
VM_BUG_ON(cr3 != read_cr3());
284+
VM_BUG_ON(cr3 != __read_cr3());
285285
return cr3;
286286
}
287287

arch/x86/include/asm/paravirt.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
6161
PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
6262
}
6363

64-
static inline unsigned long read_cr3(void)
64+
static inline unsigned long __read_cr3(void)
6565
{
6666
return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
6767
}

arch/x86/include/asm/processor-flags.h

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,40 @@
88
#else
99
#define X86_VM_MASK 0 /* No VM86 support */
1010
#endif
11+
12+
/*
13+
* CR3's layout varies depending on several things.
14+
*
15+
* If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
16+
* If PAE is enabled, then CR3[11:5] is part of the PDPT address
17+
* (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
18+
* Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
19+
* CR3[2:0] and CR3[11:5] are ignored.
20+
*
21+
* In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
22+
*
23+
* CR3[63] is always read as zero. If CR4.PCIDE is set, then CR3[63] may be
24+
* written as 1 to prevent the write to CR3 from flushing the TLB.
25+
*
26+
* On systems with SME, one bit (in a variable position!) is stolen to indicate
27+
* that the top-level paging structure is encrypted.
28+
*
29+
* All of the remaining bits indicate the physical address of the top-level
30+
* paging structure.
31+
*
32+
* CR3_ADDR_MASK is the mask used by read_cr3_pa().
33+
*/
34+
#ifdef CONFIG_X86_64
35+
/* Mask off the address space ID bits. */
36+
#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
37+
#define CR3_PCID_MASK 0xFFFull
38+
#else
39+
/*
40+
* CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
41+
* a tiny bit of code size by setting all the bits.
42+
*/
43+
#define CR3_ADDR_MASK 0xFFFFFFFFull
44+
#define CR3_PCID_MASK 0ull
45+
#endif
46+
1147
#endif /* _ASM_X86_PROCESSOR_FLAGS_H */

arch/x86/include/asm/processor.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,14 @@ native_cpuid_reg(ebx)
231231
native_cpuid_reg(ecx)
232232
native_cpuid_reg(edx)
233233

234+
/*
235+
* Friendlier CR3 helpers.
236+
*/
237+
static inline unsigned long read_cr3_pa(void)
238+
{
239+
return __read_cr3() & CR3_ADDR_MASK;
240+
}
241+
234242
static inline void load_cr3(pgd_t *pgdir)
235243
{
236244
write_cr3(__pa(pgdir));

arch/x86/include/asm/special_insns.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val)
3939
asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
4040
}
4141

42-
static inline unsigned long native_read_cr3(void)
42+
static inline unsigned long __native_read_cr3(void)
4343
{
4444
unsigned long val;
4545
asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
@@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x)
159159
native_write_cr2(x);
160160
}
161161

162-
static inline unsigned long read_cr3(void)
162+
/*
163+
* Careful! CR3 contains more than just an address. You probably want
164+
* read_cr3_pa() instead.
165+
*/
166+
static inline unsigned long __read_cr3(void)
163167
{
164-
return native_read_cr3();
168+
return __native_read_cr3();
165169
}
166170

167171
static inline void write_cr3(unsigned long x)

arch/x86/include/asm/tlbflush.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ static inline void __native_flush_tlb(void)
156156
* back:
157157
*/
158158
preempt_disable();
159-
native_write_cr3(native_read_cr3());
159+
native_write_cr3(__native_read_cr3());
160160
preempt_enable();
161161
}
162162

@@ -264,7 +264,7 @@ static inline void reset_lazy_tlbstate(void)
264264
this_cpu_write(cpu_tlbstate.state, 0);
265265
this_cpu_write(cpu_tlbstate.loaded_mm, &init_mm);
266266

267-
WARN_ON(read_cr3() != __pa_symbol(swapper_pg_dir));
267+
WARN_ON(read_cr3_pa() != __pa_symbol(swapper_pg_dir));
268268
}
269269

270270
static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,

arch/x86/kernel/head64.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,8 @@ int __init early_make_pgtable(unsigned long address)
5555
pmdval_t pmd, *pmd_p;
5656

5757
/* Invalid address or early pgt is done ? */
58-
if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
58+
if (physaddr >= MAXMEM ||
59+
read_cr3_pa() != __pa_nodebug(early_level4_pgt))
5960
return -1;
6061

6162
again:

arch/x86/kernel/paravirt.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
391391

392392
.read_cr2 = native_read_cr2,
393393
.write_cr2 = native_write_cr2,
394-
.read_cr3 = native_read_cr3,
394+
.read_cr3 = __native_read_cr3,
395395
.write_cr3 = native_write_cr3,
396396

397397
.flush_tlb_user = native_flush_tlb,

arch/x86/kernel/process_32.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
9292

9393
cr0 = read_cr0();
9494
cr2 = read_cr2();
95-
cr3 = read_cr3();
95+
cr3 = __read_cr3();
9696
cr4 = __read_cr4();
9797
printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
9898
cr0, cr2, cr3, cr4);

arch/x86/kernel/process_64.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all)
104104

105105
cr0 = read_cr0();
106106
cr2 = read_cr2();
107-
cr3 = read_cr3();
107+
cr3 = __read_cr3();
108108
cr4 = __read_cr4();
109109

110110
printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",

arch/x86/kvm/vmx.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5024,7 +5024,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
50245024
* Save the most likely value for this task's CR3 in the VMCS.
50255025
* We can't use __get_current_cr3_fast() because we're not atomic.
50265026
*/
5027-
cr3 = read_cr3();
5027+
cr3 = __read_cr3();
50285028
vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
50295029
vmx->host_state.vmcs_host_cr3 = cr3;
50305030

arch/x86/mm/fault.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address)
346346
* Do _not_ use "current" here. We might be inside
347347
* an interrupt in the middle of a task switch..
348348
*/
349-
pgd_paddr = read_cr3();
349+
pgd_paddr = read_cr3_pa();
350350
pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
351351
if (!pmd_k)
352352
return -1;
@@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn)
388388

389389
static void dump_pagetable(unsigned long address)
390390
{
391-
pgd_t *base = __va(read_cr3());
391+
pgd_t *base = __va(read_cr3_pa());
392392
pgd_t *pgd = &base[pgd_index(address)];
393393
p4d_t *p4d;
394394
pud_t *pud;
@@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address)
451451
* happen within a race in page table update. In the later
452452
* case just flush:
453453
*/
454-
pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address);
454+
pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
455455
pgd_ref = pgd_offset_k(address);
456456
if (pgd_none(*pgd_ref))
457457
return -1;
@@ -555,7 +555,7 @@ static int bad_address(void *p)
555555

556556
static void dump_pagetable(unsigned long address)
557557
{
558-
pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
558+
pgd_t *base = __va(read_cr3_pa());
559559
pgd_t *pgd = base + pgd_index(address);
560560
p4d_t *p4d;
561561
pud_t *pud;
@@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
700700
pgd_t *pgd;
701701
pte_t *pte;
702702

703-
pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
703+
pgd = __va(read_cr3_pa());
704704
pgd += pgd_index(address);
705705

706706
pte = lookup_address_in_pgd(pgd, address, &level);

arch/x86/mm/ioremap.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
424424
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
425425
{
426426
/* Don't assume we're using swapper_pg_dir at this point */
427-
pgd_t *base = __va(read_cr3());
427+
pgd_t *base = __va(read_cr3_pa());
428428
pgd_t *pgd = &base[pgd_index(addr)];
429429
p4d_t *p4d = p4d_offset(pgd, addr);
430430
pud_t *pud = pud_offset(p4d, addr);

arch/x86/platform/efi/efi_64.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ pgd_t * __init efi_call_phys_prolog(void)
8080
int n_pgds, i, j;
8181

8282
if (!efi_enabled(EFI_OLD_MEMMAP)) {
83-
save_pgd = (pgd_t *)read_cr3();
83+
save_pgd = (pgd_t *)__read_cr3();
8484
write_cr3((unsigned long)efi_scratch.efi_pgt);
8585
goto out;
8686
}
@@ -646,7 +646,7 @@ efi_status_t efi_thunk_set_virtual_address_map(
646646
efi_sync_low_kernel_mappings();
647647
local_irq_save(flags);
648648

649-
efi_scratch.prev_cr3 = read_cr3();
649+
efi_scratch.prev_cr3 = __read_cr3();
650650
write_cr3((unsigned long)efi_scratch.efi_pgt);
651651
__flush_tlb_all();
652652

arch/x86/platform/olpc/olpc-xo1-pm.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state)
7777

7878
asmlinkage __visible int xo1_do_sleep(u8 sleep_state)
7979
{
80-
void *pgd_addr = __va(read_cr3());
80+
void *pgd_addr = __va(read_cr3_pa());
8181

8282
/* Program wakeup mask (using dword access to CS5536_PM1_EN) */
8383
outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);

arch/x86/power/cpu.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt)
129129
*/
130130
ctxt->cr0 = read_cr0();
131131
ctxt->cr2 = read_cr2();
132-
ctxt->cr3 = read_cr3();
132+
ctxt->cr3 = __read_cr3();
133133
ctxt->cr4 = __read_cr4();
134134
#ifdef CONFIG_X86_64
135135
ctxt->cr8 = read_cr8();

arch/x86/power/hibernate_64.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,8 @@ static int relocate_restore_code(void)
150150
memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);
151151

152152
/* Make the page containing the relocated code executable */
153-
pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code);
153+
pgd = (pgd_t *)__va(read_cr3_pa()) +
154+
pgd_index(relocated_restore_code);
154155
p4d = p4d_offset(pgd, relocated_restore_code);
155156
if (p4d_large(*p4d)) {
156157
set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));

arch/x86/xen/mmu_pv.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2017,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
20172017
pmd_t pmd;
20182018
pte_t pte;
20192019

2020-
pa = read_cr3();
2020+
pa = read_cr3_pa();
20212021
pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
20222022
sizeof(pgd)));
20232023
if (!pgd_present(pgd))
@@ -2097,7 +2097,7 @@ void __init xen_relocate_p2m(void)
20972097
pt_phys = pmd_phys + PFN_PHYS(n_pmd);
20982098
p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
20992099

2100-
pgd = __va(read_cr3());
2100+
pgd = __va(read_cr3_pa());
21012101
new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
21022102
idx_p4d = 0;
21032103
save_pud = n_pud;
@@ -2204,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
22042204
{
22052205
unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
22062206

2207-
BUG_ON(read_cr3() != __pa(initial_page_table));
2207+
BUG_ON(read_cr3_pa() != __pa(initial_page_table));
22082208
BUG_ON(cr3 != __pa(swapper_pg_dir));
22092209

22102210
/*

0 commit comments

Comments
 (0)