Skip to content

Commit 8fcc4b5

Browse files
jsmattsonjrbonzini
authored andcommitted
kvm: nVMX: Introduce KVM_CAP_NESTED_STATE
For nested virtualization L0 KVM is managing a bit of state for L2 guests, this state can not be captured through the currently available IOCTLs. In fact the state captured through all of these IOCTLs is usually a mix of L1 and L2 state. It is also dependent on whether the L2 guest was running at the moment when the process was interrupted to save its state. With this capability, there are two new vcpu ioctls: KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE. These can be used for saving and restoring a VM that is in VMX operation. Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Radim Krčmář <rkrcmar@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Jim Mattson <jmattson@google.com> [karahmed@ - rename structs and functions and make them ready for AMD and address previous comments. - handle nested.smm state. - rebase & a bit of refactoring. - Merge 7/8 and 8/8 into one patch. ] Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
1 parent 7f7f1ba commit 8fcc4b5

File tree

6 files changed

+330
-2
lines changed

6 files changed

+330
-2
lines changed

Documentation/virtual/kvm/api.txt

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3561,6 +3561,62 @@ Returns: 0 on success,
35613561
-ENOENT on deassign if the conn_id isn't registered
35623562
-EEXIST on assign if the conn_id is already registered
35633563

3564+
4.114 KVM_GET_NESTED_STATE
3565+
3566+
Capability: KVM_CAP_NESTED_STATE
3567+
Architectures: x86
3568+
Type: vcpu ioctl
3569+
Parameters: struct kvm_nested_state (in/out)
3570+
Returns: 0 on success, -1 on error
3571+
Errors:
3572+
E2BIG: the total state size (including the fixed-size part of struct
3573+
kvm_nested_state) exceeds the value of 'size' specified by
3574+
the user; the size required will be written into size.
3575+
3576+
struct kvm_nested_state {
3577+
__u16 flags;
3578+
__u16 format;
3579+
__u32 size;
3580+
union {
3581+
struct kvm_vmx_nested_state vmx;
3582+
struct kvm_svm_nested_state svm;
3583+
__u8 pad[120];
3584+
};
3585+
__u8 data[0];
3586+
};
3587+
3588+
#define KVM_STATE_NESTED_GUEST_MODE 0x00000001
3589+
#define KVM_STATE_NESTED_RUN_PENDING 0x00000002
3590+
3591+
#define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001
3592+
#define KVM_STATE_NESTED_SMM_VMXON 0x00000002
3593+
3594+
struct kvm_vmx_nested_state {
3595+
__u64 vmxon_pa;
3596+
__u64 vmcs_pa;
3597+
3598+
struct {
3599+
__u16 flags;
3600+
} smm;
3601+
};
3602+
3603+
This ioctl copies the vcpu's nested virtualization state from the kernel to
3604+
userspace.
3605+
3606+
The maximum size of the state, including the fixed-size part of struct
3607+
kvm_nested_state, can be retrieved by passing KVM_CAP_NESTED_STATE to
3608+
the KVM_CHECK_EXTENSION ioctl().
3609+
3610+
4.115 KVM_SET_NESTED_STATE
3611+
3612+
Capability: KVM_CAP_NESTED_STATE
3613+
Architectures: x86
3614+
Type: vcpu ioctl
3615+
Parameters: struct kvm_nested_state (in)
3616+
Returns: 0 on success, -1 on error
3617+
3618+
This copies the vcpu's kvm_nested_state struct from userspace to the kernel. For
3619+
the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE.
35643620

35653621
5. The kvm_run structure
35663622
------------------------

arch/x86/include/asm/kvm_host.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1086,6 +1086,12 @@ struct kvm_x86_ops {
10861086

10871087
void (*setup_mce)(struct kvm_vcpu *vcpu);
10881088

1089+
int (*get_nested_state)(struct kvm_vcpu *vcpu,
1090+
struct kvm_nested_state __user *user_kvm_nested_state,
1091+
unsigned user_data_size);
1092+
int (*set_nested_state)(struct kvm_vcpu *vcpu,
1093+
struct kvm_nested_state __user *user_kvm_nested_state,
1094+
struct kvm_nested_state *kvm_state);
10891095
void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
10901096

10911097
int (*smi_allowed)(struct kvm_vcpu *vcpu);

arch/x86/include/uapi/asm/kvm.h

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,4 +378,41 @@ struct kvm_sync_regs {
378378
#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
379379
#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1)
380380

381+
#define KVM_STATE_NESTED_GUEST_MODE 0x00000001
382+
#define KVM_STATE_NESTED_RUN_PENDING 0x00000002
383+
384+
#define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001
385+
#define KVM_STATE_NESTED_SMM_VMXON 0x00000002
386+
387+
struct kvm_vmx_nested_state {
388+
__u64 vmxon_pa;
389+
__u64 vmcs_pa;
390+
391+
struct {
392+
__u16 flags;
393+
} smm;
394+
};
395+
396+
/* for KVM_CAP_NESTED_STATE */
397+
struct kvm_nested_state {
398+
/* KVM_STATE_* flags */
399+
__u16 flags;
400+
401+
/* 0 for VMX, 1 for SVM. */
402+
__u16 format;
403+
404+
/* 128 for SVM, 128 + VMCS size for VMX. */
405+
__u32 size;
406+
407+
union {
408+
/* VMXON, VMCS */
409+
struct kvm_vmx_nested_state vmx;
410+
411+
/* Pad the header to 128 bytes. */
412+
__u8 pad[120];
413+
};
414+
415+
__u8 data[0];
416+
};
417+
381418
#endif /* _ASM_X86_KVM_H */

arch/x86/kvm/vmx.c

Lines changed: 173 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7589,6 +7589,11 @@ static __init int hardware_setup(void)
75897589
else
75907590
kvm_disable_tdp();
75917591

7592+
if (!nested) {
7593+
kvm_x86_ops->get_nested_state = NULL;
7594+
kvm_x86_ops->set_nested_state = NULL;
7595+
}
7596+
75927597
/*
75937598
* Only enable PML when hardware supports PML feature, and both EPT
75947599
* and EPT A/D bit features are enabled -- PML depends on them to work.
@@ -11775,8 +11780,8 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
1177511780
}
1177611781

1177711782
/*
11778-
* If exit_qual is NULL, this is being called from RSM.
11779-
* Otherwise it's called from vmlaunch/vmresume.
11783+
* If exit_qual is NULL, this is being called from state restore (either RSM
11784+
* or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
1178011785
*/
1178111786
static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
1178211787
{
@@ -13016,6 +13021,170 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
1301613021
return 0;
1301713022
}
1301813023

13024+
static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
13025+
struct kvm_nested_state __user *user_kvm_nested_state,
13026+
u32 user_data_size)
13027+
{
13028+
struct vcpu_vmx *vmx;
13029+
struct vmcs12 *vmcs12;
13030+
struct kvm_nested_state kvm_state = {
13031+
.flags = 0,
13032+
.format = 0,
13033+
.size = sizeof(kvm_state),
13034+
.vmx.vmxon_pa = -1ull,
13035+
.vmx.vmcs_pa = -1ull,
13036+
};
13037+
13038+
if (!vcpu)
13039+
return kvm_state.size + 2 * VMCS12_SIZE;
13040+
13041+
vmx = to_vmx(vcpu);
13042+
vmcs12 = get_vmcs12(vcpu);
13043+
if (nested_vmx_allowed(vcpu) &&
13044+
(vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
13045+
kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
13046+
kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
13047+
13048+
if (vmx->nested.current_vmptr != -1ull)
13049+
kvm_state.size += VMCS12_SIZE;
13050+
13051+
if (vmx->nested.smm.vmxon)
13052+
kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
13053+
13054+
if (vmx->nested.smm.guest_mode)
13055+
kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
13056+
13057+
if (is_guest_mode(vcpu)) {
13058+
kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
13059+
13060+
if (vmx->nested.nested_run_pending)
13061+
kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
13062+
}
13063+
}
13064+
13065+
if (user_data_size < kvm_state.size)
13066+
goto out;
13067+
13068+
if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
13069+
return -EFAULT;
13070+
13071+
if (vmx->nested.current_vmptr == -1ull)
13072+
goto out;
13073+
13074+
/*
13075+
* When running L2, the authoritative vmcs12 state is in the
13076+
* vmcs02. When running L1, the authoritative vmcs12 state is
13077+
* in the shadow vmcs linked to vmcs01, unless
13078+
* sync_shadow_vmcs is set, in which case, the authoritative
13079+
* vmcs12 state is in the vmcs12 already.
13080+
*/
13081+
if (is_guest_mode(vcpu))
13082+
sync_vmcs12(vcpu, vmcs12);
13083+
else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
13084+
copy_shadow_to_vmcs12(vmx);
13085+
13086+
if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12)))
13087+
return -EFAULT;
13088+
13089+
out:
13090+
return kvm_state.size;
13091+
}
13092+
13093+
static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
13094+
struct kvm_nested_state __user *user_kvm_nested_state,
13095+
struct kvm_nested_state *kvm_state)
13096+
{
13097+
struct vcpu_vmx *vmx = to_vmx(vcpu);
13098+
struct vmcs12 *vmcs12;
13099+
u32 exit_qual;
13100+
int ret;
13101+
13102+
if (kvm_state->format != 0)
13103+
return -EINVAL;
13104+
13105+
if (!nested_vmx_allowed(vcpu))
13106+
return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
13107+
13108+
if (kvm_state->vmx.vmxon_pa == -1ull) {
13109+
if (kvm_state->vmx.smm.flags)
13110+
return -EINVAL;
13111+
13112+
if (kvm_state->vmx.vmcs_pa != -1ull)
13113+
return -EINVAL;
13114+
13115+
vmx_leave_nested(vcpu);
13116+
return 0;
13117+
}
13118+
13119+
if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa))
13120+
return -EINVAL;
13121+
13122+
if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
13123+
return -EINVAL;
13124+
13125+
if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
13126+
!page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
13127+
return -EINVAL;
13128+
13129+
if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
13130+
(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
13131+
return -EINVAL;
13132+
13133+
if (kvm_state->vmx.smm.flags &
13134+
~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
13135+
return -EINVAL;
13136+
13137+
if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
13138+
!(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
13139+
return -EINVAL;
13140+
13141+
vmx_leave_nested(vcpu);
13142+
if (kvm_state->vmx.vmxon_pa == -1ull)
13143+
return 0;
13144+
13145+
vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa;
13146+
ret = enter_vmx_operation(vcpu);
13147+
if (ret)
13148+
return ret;
13149+
13150+
set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
13151+
13152+
if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
13153+
vmx->nested.smm.vmxon = true;
13154+
vmx->nested.vmxon = false;
13155+
13156+
if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
13157+
vmx->nested.smm.guest_mode = true;
13158+
}
13159+
13160+
vmcs12 = get_vmcs12(vcpu);
13161+
if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12)))
13162+
return -EFAULT;
13163+
13164+
if (vmcs12->revision_id != VMCS12_REVISION)
13165+
return -EINVAL;
13166+
13167+
if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
13168+
return 0;
13169+
13170+
vmx->nested.nested_run_pending =
13171+
!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
13172+
13173+
if (check_vmentry_prereqs(vcpu, vmcs12) ||
13174+
check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
13175+
return -EINVAL;
13176+
13177+
if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING)
13178+
vmx->nested.nested_run_pending = 1;
13179+
13180+
vmx->nested.dirty_vmcs12 = true;
13181+
ret = enter_vmx_non_root_mode(vcpu, NULL);
13182+
if (ret)
13183+
return -EINVAL;
13184+
13185+
return 0;
13186+
}
13187+
1301913188
static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
1302013189
.cpu_has_kvm_support = cpu_has_kvm_support,
1302113190
.disabled_by_bios = vmx_disabled_by_bios,
@@ -13150,6 +13319,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
1315013319

1315113320
.setup_mce = vmx_setup_mce,
1315213321

13322+
.get_nested_state = vmx_get_nested_state,
13323+
.set_nested_state = vmx_set_nested_state,
1315313324
.get_vmcs12_pages = nested_get_vmcs12_pages,
1315413325

1315513326
.smi_allowed = vmx_smi_allowed,

arch/x86/kvm/x86.c

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2947,6 +2947,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
29472947
case KVM_CAP_X2APIC_API:
29482948
r = KVM_X2APIC_API_VALID_FLAGS;
29492949
break;
2950+
case KVM_CAP_NESTED_STATE:
2951+
r = kvm_x86_ops->get_nested_state ?
2952+
kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0;
2953+
break;
29502954
default:
29512955
break;
29522956
}
@@ -3963,6 +3967,56 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
39633967
r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
39643968
break;
39653969
}
3970+
case KVM_GET_NESTED_STATE: {
3971+
struct kvm_nested_state __user *user_kvm_nested_state = argp;
3972+
u32 user_data_size;
3973+
3974+
r = -EINVAL;
3975+
if (!kvm_x86_ops->get_nested_state)
3976+
break;
3977+
3978+
BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
3979+
if (get_user(user_data_size, &user_kvm_nested_state->size))
3980+
return -EFAULT;
3981+
3982+
r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
3983+
user_data_size);
3984+
if (r < 0)
3985+
return r;
3986+
3987+
if (r > user_data_size) {
3988+
if (put_user(r, &user_kvm_nested_state->size))
3989+
return -EFAULT;
3990+
return -E2BIG;
3991+
}
3992+
r = 0;
3993+
break;
3994+
}
3995+
case KVM_SET_NESTED_STATE: {
3996+
struct kvm_nested_state __user *user_kvm_nested_state = argp;
3997+
struct kvm_nested_state kvm_state;
3998+
3999+
r = -EINVAL;
4000+
if (!kvm_x86_ops->set_nested_state)
4001+
break;
4002+
4003+
if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
4004+
return -EFAULT;
4005+
4006+
if (kvm_state.size < sizeof(kvm_state))
4007+
return -EINVAL;
4008+
4009+
if (kvm_state.flags &
4010+
~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE))
4011+
return -EINVAL;
4012+
4013+
/* nested_run_pending implies guest_mode. */
4014+
if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING)
4015+
return -EINVAL;
4016+
4017+
r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
4018+
break;
4019+
}
39664020
default:
39674021
r = -EINVAL;
39684022
}

0 commit comments

Comments
 (0)