Skip to content

Commit b7ffc44

Browse files
amluto authored and bonzini committed
x86/kvm/vmx: Defer TR reload after VM exit
Intel's VMX is daft and resets the hidden TSS limit register to 0x67 on VMX reload, and the 0x67 is not configurable. KVM currently reloads TR using the LTR instruction on every exit, but this is quite slow because LTR is serializing. The 0x67 limit is entirely harmless unless ioperm() is in use, so defer the reload until a task using ioperm() is actually running. Here's some poorly done benchmarking using kvm-unit-tests: Before: cpuid 1313 vmcall 1195 mov_from_cr8 11 mov_to_cr8 17 inl_from_pmtimer 6770 inl_from_qemu 6856 inl_from_kernel 2435 outl_to_kernel 1402 After: cpuid 1291 vmcall 1181 mov_from_cr8 11 mov_to_cr8 16 inl_from_pmtimer 6457 inl_from_qemu 6209 inl_from_kernel 2339 outl_to_kernel 1391 Signed-off-by: Andy Lutomirski <luto@kernel.org> [Force-reload TR in invalidate_tss_limit. - Paolo] Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
1 parent d3273de commit b7ffc44

File tree

4 files changed

+72
-14
lines changed

4 files changed

+72
-14
lines changed

arch/x86/include/asm/desc.h

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,54 @@ static inline void native_load_tr_desc(void)
205205
asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
206206
}
207207

208+
static inline void force_reload_TR(void)
209+
{
210+
struct desc_struct *d = get_cpu_gdt_table(smp_processor_id());
211+
tss_desc tss;
212+
213+
memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc));
214+
215+
/*
216+
* LTR requires an available TSS, and the TSS is currently
217+
* busy. Make it be available so that LTR will work.
218+
*/
219+
tss.type = DESC_TSS;
220+
write_gdt_entry(d, GDT_ENTRY_TSS, &tss, DESC_TSS);
221+
222+
load_TR_desc();
223+
}
224+
225+
DECLARE_PER_CPU(bool, need_tr_refresh);
226+
227+
static inline void refresh_TR(void)
228+
{
229+
DEBUG_LOCKS_WARN_ON(preemptible());
230+
231+
if (unlikely(this_cpu_read(need_tr_refresh))) {
232+
force_reload_TR();
233+
this_cpu_write(need_tr_refresh, false);
234+
}
235+
}
236+
237+
/*
238+
* If you do something evil that corrupts the cached TSS limit (I'm looking
239+
* at you, VMX exits), call this function.
240+
*
241+
* The optimization here is that the TSS limit only matters for Linux if the
242+
* IO bitmap is in use. If the TSS limit gets forced to its minimum value,
243+
* everything works except that IO bitmap will be ignored and all CPL 3 IO
244+
* instructions will #GP, which is exactly what we want for normal tasks.
245+
*/
246+
static inline void invalidate_tss_limit(void)
247+
{
248+
DEBUG_LOCKS_WARN_ON(preemptible());
249+
250+
if (unlikely(test_thread_flag(TIF_IO_BITMAP)))
251+
force_reload_TR();
252+
else
253+
this_cpu_write(need_tr_refresh, true);
254+
}
255+
208256
static inline void native_load_gdt(const struct desc_ptr *dtr)
209257
{
210258
asm volatile("lgdt %0"::"m" (*dtr));

arch/x86/kernel/ioport.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <linux/syscalls.h>
1717
#include <linux/bitmap.h>
1818
#include <asm/syscalls.h>
19+
#include <asm/desc.h>
1920

2021
/*
2122
* this changes the io permissions bitmap in the current task.
@@ -45,6 +46,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
4546
memset(bitmap, 0xff, IO_BITMAP_BYTES);
4647
t->io_bitmap_ptr = bitmap;
4748
set_thread_flag(TIF_IO_BITMAP);
49+
50+
preempt_disable();
51+
refresh_TR();
52+
preempt_enable();
4853
}
4954

5055
/*

arch/x86/kernel/process.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include <asm/mce.h>
3333
#include <asm/vm86.h>
3434
#include <asm/switch_to.h>
35+
#include <asm/desc.h>
3536

3637
/*
3738
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -64,6 +65,9 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
6465
};
6566
EXPORT_PER_CPU_SYMBOL(cpu_tss);
6667

68+
DEFINE_PER_CPU(bool, need_tr_refresh);
69+
EXPORT_PER_CPU_SYMBOL_GPL(need_tr_refresh);
70+
6771
/*
6872
* this gets called so that we can store lazy state into memory and copy the
6973
* current task into the new thread.
@@ -209,6 +213,12 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
209213
*/
210214
memcpy(tss->io_bitmap, next->io_bitmap_ptr,
211215
max(prev->io_bitmap_max, next->io_bitmap_max));
216+
217+
/*
218+
* Make sure that the TSS limit is correct for the CPU
219+
* to notice the IO bitmap.
220+
*/
221+
refresh_TR();
212222
} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
213223
/*
214224
* Clear any possible leftover bits:

arch/x86/kvm/vmx.c

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1990,19 +1990,6 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
19901990
m->host[i].value = host_val;
19911991
}
19921992

1993-
static void reload_tss(void)
1994-
{
1995-
/*
1996-
* VT restores TR but not its size. Useless.
1997-
*/
1998-
struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
1999-
struct desc_struct *descs;
2000-
2001-
descs = (void *)gdt->address;
2002-
descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
2003-
load_TR_desc();
2004-
}
2005-
20061993
static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
20071994
{
20081995
u64 guest_efer = vmx->vcpu.arch.efer;
@@ -2172,7 +2159,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
21722159
loadsegment(es, vmx->host_state.es_sel);
21732160
}
21742161
#endif
2175-
reload_tss();
2162+
invalidate_tss_limit();
21762163
#ifdef CONFIG_X86_64
21772164
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
21782165
#endif
@@ -2293,6 +2280,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
22932280
(unsigned long)this_cpu_ptr(&cpu_tss));
22942281
vmcs_writel(HOST_GDTR_BASE, gdt->address);
22952282

2283+
/*
2284+
* VM exits change the host TR limit to 0x67 after a VM
2285+
* exit. This is okay, since 0x67 covers everything except
2286+
* the IO bitmap and we have code to handle the IO bitmap
2287+
* being lost after a VM exit.
2288+
*/
2289+
BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
2290+
22962291
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
22972292
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
22982293

0 commit comments

Comments
 (0)