// SPDX-License-Identifier: GPL-2.0

#include <linux/objtool.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "trace.h"
#include "vmx.h"
#include "x86.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

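/*
 * CC() wraps a single VM-Enter consistency check: it evaluates the check,
 * emits the kvm_nested_vmenter_failed tracepoint with the stringified
 * expression when the check fails, and returns the result so callers can
 * chain checks with ||.
 */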
#define CC(consistency_check)						\
({									\
	bool failed = (consistency_check);				\
	if (failed)							\
		trace_kvm_nested_vmenter_failed(#consistency_check, 0);	\
	failed;								\
})

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

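/*
 * VMREAD/VMWRITE bitmaps for shadow VMCS support: a set bit means the
 * corresponding VMREAD/VMWRITE from L1 is intercepted (causes a VM-exit)
 * instead of being satisfied by the shadow VMCS.  init_vmcs_shadow_fields()
 * clears the bits for the fields KVM chooses to shadow.
 */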
enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16 encoding;
	u16 offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

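/*
 * Build the VMREAD/VMWRITE bitmaps (default to intercepting everything, then
 * clear the bits for shadowed fields) and compact the shadow field tables,
 * dropping read/write fields that can't be shadowed because the underlying
 * hardware doesn't support them (PML index, preemption timer value, guest
 * interrupt status).  Odd encodings are the high halves of 64-bit fields:
 * on 64-bit kernels they are skipped (the full field is accessed directly),
 * on 32-bit kernels the vmcs12 offset is bumped to point at the high u32.
 */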
static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force a shadow sync because
	 * VM_INSTRUCTION_ERROR is not shadowed
	 */
	return kvm_skip_emulated_instruction(vcpu);
}

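/*
 * Per the SDM, a VMX instruction that fails while a current VMCS exists
 * signals VMfailValid and stores the error number in the VM-instruction
 * error field; without a current VMCS it signals VMfailInvalid.  An active
 * enlightened VMCS counts as a current VMCS here.
 */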
static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
		return nested_vmx_failInvalid(vcpu);

	return nested_vmx_failValid(vcpu, vm_instruction_error);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: don't simply reset the guest here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, -1ull);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->nested.hv_evmcs)
		return;

	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
	vmx->nested.hv_evmcs_vmptr = 0;
	vmx->nested.hv_evmcs = NULL;
}

static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

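/*
 * Make @vmcs the vCPU's loaded VMCS on the current pCPU, carrying the cached
 * host state over from the previous loaded_vmcs and resetting the register
 * cache, since cached register values may now be served by a different VMCS.
 */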
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu, prev);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vmx_register_cache_reset(vcpu);
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = -1ull;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
	/* Unpin physical memory we referred to in the vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_clean(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vcpu_put(vcpu);
}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vm_exit_reason;
	unsigned long exit_qualification = vcpu->arch.exit_qualification;

	if (vmx->nested.pml_full) {
		vm_exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;
		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
	} else if (fault->error_code & PFERR_RSVD_MASK)
		vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
	else
		vm_exit_reason = EXIT_REASON_EPT_VIOLATION;

	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

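/*
 * Point the vCPU at guest_mmu, configured as a shadow-EPT MMU that walks
 * L1's EPT tables (EPTP taken from vmcs12).  Faults encountered on behalf of
 * L2 that are L1's responsibility are forwarded to L1 as EPT violation,
 * EPT misconfig or PML-full VM-exits via nested_ept_inject_page_fault().
 * walk_mmu is switched to nested_mmu for translating L2 virtual addresses.
 */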
static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	kvm_init_shadow_ept_mmu(vcpu,
				to_vmx(vcpu)->nested.msrs.ept_caps &
				VMX_EPT_EXECUTE_ONLY_BIT,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_eptp(vcpu));
	vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

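/*
 * Mirror the SDM's #PF filtering: with the PF bit set in the exception
 * bitmap, a page fault is reflected to L1 only if
 * (error_code & mask) == match; with the bit clear, only if the comparison
 * fails.  The XOR below implements both cases.
 */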
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}


/*
 * KVM wants to inject the page faults it intercepted into the guest. This
 * function checks whether, for a nested guest, they should be injected into
 * L1 or L2.
 */
static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned int nr = vcpu->arch.exception.nr;
	bool has_payload = vcpu->arch.exception.has_payload;
	unsigned long payload = vcpu->arch.exception.payload;

	if (nr == PF_VECTOR) {
		if (vcpu->arch.exception.nested_apf) {
			*exit_qual = vcpu->arch.apf.nested_apf_token;
			return 1;
		}
		if (nested_vmx_is_page_fault_vmexit(vmcs12,
						    vcpu->arch.exception.error_code)) {
			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
			return 1;
		}
	} else if (vmcs12->exception_bitmap & (1u << nr)) {
		if (nr == DB_VECTOR) {
			if (!has_payload) {
				payload = vcpu->arch.dr6;
				payload &= ~(DR6_FIXED_1 | DR6_BT);
				payload ^= DR6_RTM;
			}
			*exit_qual = payload;
		} else
			*exit_qual = 0;
		return 1;
	}

	return 0;
}


static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	WARN_ON(!is_guest_mode(vcpu));

	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
	    !to_vmx(vcpu)->nested.nested_run_pending) {
		vmcs12->vm_exit_intr_error_code = fault->error_code;
		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
				  fault->address);
	} else {
		kvm_inject_page_fault(vcpu, fault);
	}
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	return 0;
}

/*
 * Check if an MSR write is intercepted in the L01 MSR bitmap.
 */
static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
{
	unsigned long *msr_bitmap;
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return true;

	msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;

	if (msr <= 0x1fff) {
		return !!test_bit(msr, msr_bitmap + 0x800 / f);
	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
	}

	return true;
}

/*
 * If an MSR is allowed by L0, we should check whether it is allowed by L1.
 * The corresponding bit will be cleared unless both L0 and L1 allow it.
 */
static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
						 unsigned long *msr_bitmap_nested,
						 u32 msr, int type)
{
	int f = sizeof(unsigned long);

	/*
	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
	 * have the write-low and read-high bitmap offsets the wrong way round.
	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
	 */
	if (msr <= 0x1fff) {
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
			/* read-low */
			__clear_bit(msr, msr_bitmap_nested + 0x000 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
			/* write-low */
			__clear_bit(msr, msr_bitmap_nested + 0x800 / f);

	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
			/* read-high */
			__clear_bit(msr, msr_bitmap_nested + 0x400 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
			/* write-high */
			__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);

	}
}

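/*
 * Set both the read and write intercept bits for every MSR in the x2APIC
 * range (0x800 - 0x8ff).  The read-low bits live at offset 0x000 of the
 * bitmap page and the write-low bits at offset 0x800, hence the two stores
 * per word.
 */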
static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}

/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
	struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map->hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively disable them below.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/* KVM unconditionally exposes the FS/GS base MSRs to L1. */
#ifdef CONFIG_X86_64
	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif

	/*
	 * Checking the L0->L1 bitmap is trying to verify two things:
	 *
	 * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
	 *    ensures that we do not accidentally generate an L02 MSR bitmap
	 *    from the L12 MSR bitmap that is too permissive.
	 * 2. That L1 or L2s have actually used the MSR. This avoids
	 *    unnecessary merging of the bitmap if the MSR is unused. This
	 *    works properly because we only update the L01 MSR bitmap lazily.
	 *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
	 *    updated to reflect this when L1 (or its L2s) actually write to
	 *    the MSR.
	 */
	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_SPEC_CTRL,
					MSR_TYPE_R | MSR_TYPE_W);

	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_PRED_CMD,
					MSR_TYPE_W);

	kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);

	return true;
}

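/*
 * Copy the shadow vmcs12 from/to L1 memory at vmcs12->vmcs_link_pointer.
 * Only relevant when L1 enabled VMCS shadowing and supplied a valid link
 * pointer; otherwise these are no-ops.
 */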
static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct kvm_host_map map;
	struct vmcs12 *shadow;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	shadow = get_shadow_vmcs12(vcpu);

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
		return;

	memcpy(shadow, map.hva, VMCS12_SIZE);
	kvm_vcpu_unmap(vcpu, &map, false);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
			get_shadow_vmcs12(vcpu), VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has been already checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	    (CC(!nested_cpu_has_vid(vmcs12)) ||
	     CC(!nested_exit_intr_ack_set(vcpu)) ||
	     CC((vmcs12->posted_intr_nv & 0xff00)) ||
	     CC((vmcs12->posted_intr_desc_addr & 0x3f)) ||
	     CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	int maxphyaddr;

	if (count == 0)
		return 0;
	maxphyaddr = cpuid_maxphyaddr(vcpu);
	if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
	    (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						       struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							 struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							  struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

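/*
 * Bits 27:25 of IA32_VMX_MISC report the recommended maximum number of MSRs
 * in a VM-entry/VM-exit MSR load/store list: 512 * (N + 1) per the SDM,
 * i.e. (vmx_misc_max_msr() + 1) * VMX_MISC_MSR_LIST_MULTIPLIER here.
 */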
static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Load guest's/host's msr at nested entry/exit.
 * return 0 for success, entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity. To maintain compatibility with hardware as
 * much as possible, process all valid entries before failing rather than
 * prechecking for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
	 */
	if (msr_index == MSR_IA32_TSC) {
		int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
						    MSR_IA32_TSC);

		if (i >= 0) {
			u64 val = vmx->msr_autostore.guest.val[i].value;

			*data = kvm_read_l1_tsc(vcpu, val);
			return true;
		}
	}

	if (kvm_get_msr(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
				     msr_index);
		return false;
	}
	return true;
}

static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
				     struct vmx_msr_entry *e)
{
	if (kvm_vcpu_read_guest(vcpu,
				gpa + i * sizeof(*e),
				e, 2 * sizeof(u32))) {
		pr_debug_ratelimited(
			"%s cannot read MSR entry (%u, 0x%08llx)\n",
			__func__, i, gpa + i * sizeof(*e));
		return false;
	}
	if (nested_vmx_store_msr_check(vcpu, e)) {
		pr_debug_ratelimited(
			"%s check failed (%u, 0x%x, 0x%x)\n",
			__func__, i, e->index, e->reserved);
		return false;
	}
	return true;
}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			return -EINVAL;

		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return -EINVAL;

		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
			return -EINVAL;

		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
						offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 count = vmcs12->vm_exit_msr_store_count;
	u64 gpa = vmcs12->vm_exit_msr_store_addr;
	struct vmx_msr_entry e;
	u32 i;

	for (i = 0; i < count; i++) {
		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return false;

		if (e.index == msr_index)
			return true;
	}
	return false;
}

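/*
 * Keep @msr_index in the vmcs02 VM-exit MSR-store area if and only if it
 * appears in L1's VM-exit MSR-store list: add it when it's missing, and
 * remove it (by swapping in the last entry) when L1 no longer wants it.
 */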
static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
					   u32 msr_index)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
	bool in_vmcs12_store_list;
	int msr_autostore_slot;
	bool in_autostore_list;
	int last;

	msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
	in_autostore_list = msr_autostore_slot >= 0;
	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);

	if (in_vmcs12_store_list && !in_autostore_list) {
		if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
			/*
			 * Emulated VMEntry does not fail here. Instead a less
			 * accurate value will be returned by
			 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
			 * instead of reading the value from the vmcs02 VMExit
			 * MSR-store area.
			 */
			pr_warn_ratelimited(
				"Not enough msr entries in msr_autostore. Can't add msr %x\n",
				msr_index);
			return;
		}
		last = autostore->nr++;
		autostore->val[last].index = msr_index;
	} else if (!in_vmcs12_store_list && in_autostore_list) {
		last = --autostore->nr;
		autostore->val[msr_autostore_slot] = autostore->val[last];
	}
}

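/*
 * A nested CR3 value is only legal if it doesn't set any bits above the
 * guest's MAXPHYADDR as reported by CPUID.
 */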
static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
	unsigned long invalid_mask;

	invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
	return (val & invalid_mask) == 0;
}

1076*4882a593Smuzhiyun /*
1077*4882a593Smuzhiyun * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit.
1078*4882a593Smuzhiyun * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't
1079*4882a593Smuzhiyun * enable VPID for L2 (implying it expects a TLB flush on VMX transitions).
1080*4882a593Smuzhiyun * Here's why.
1081*4882a593Smuzhiyun *
1082*4882a593Smuzhiyun * If EPT is enabled by L0 a sync is never needed:
1083*4882a593Smuzhiyun * - if it is disabled by L1, then L0 is not shadowing L1 or L2 PTEs, there
1084*4882a593Smuzhiyun * cannot be unsync'd SPTEs for either L1 or L2.
1085*4882a593Smuzhiyun *
1086*4882a593Smuzhiyun * - if it is also enabled by L1, then L0 doesn't need to sync on VM-Enter
1087*4882a593Smuzhiyun * VM-Enter as VM-Enter isn't required to invalidate guest-physical mappings
1088*4882a593Smuzhiyun * (irrespective of VPID), i.e. L1 can't rely on the (virtual) CPU to flush
1089*4882a593Smuzhiyun * stale guest-physical mappings for L2 from the TLB. And as above, L0 isn't
1090*4882a593Smuzhiyun * shadowing L1 PTEs so there are no unsync'd SPTEs to sync on VM-Exit.
1091*4882a593Smuzhiyun *
1092*4882a593Smuzhiyun * If EPT is disabled by L0:
1093*4882a593Smuzhiyun * - if VPID is enabled by L1 (for L2), the situation is similar to when L1
1094*4882a593Smuzhiyun * enables EPT: L0 doesn't need to sync as VM-Enter and VM-Exit aren't
1095*4882a593Smuzhiyun * required to invalidate linear mappings (EPT is disabled so there are
1096*4882a593Smuzhiyun * no combined or guest-physical mappings), i.e. L1 can't rely on the
1097*4882a593Smuzhiyun * (virtual) CPU to flush stale linear mappings for either L2 or itself (L1).
1098*4882a593Smuzhiyun *
1099*4882a593Smuzhiyun * - however if VPID is disabled by L1, then a sync is needed as L1 expects all
1100*4882a593Smuzhiyun * linear mappings (EPT is disabled so there are no combined or guest-physical
1101*4882a593Smuzhiyun * mappings) to be invalidated on both VM-Enter and VM-Exit.
1102*4882a593Smuzhiyun *
1103*4882a593Smuzhiyun * Note, this logic is subtly different than nested_has_guest_tlb_tag(), which
1104*4882a593Smuzhiyun * additionally checks that L2 has been assigned a VPID (when EPT is disabled).
1105*4882a593Smuzhiyun * Whether or not L2 has been assigned a VPID by L0 is irrelevant with respect
1106*4882a593Smuzhiyun * to L1's expectations, e.g. L0 needs to invalidate hardware TLB entries if L2
1107*4882a593Smuzhiyun * doesn't have a unique VPID to prevent reusing L1's entries (assuming L1 has
1108*4882a593Smuzhiyun  * been assigned a VPID), but L0 doesn't need to do an MMU sync because L1
1109*4882a593Smuzhiyun * doesn't expect stale (virtual) TLB entries to be flushed, i.e. L1 doesn't
1110*4882a593Smuzhiyun * know that L0 will flush the TLB and so L1 will do INVVPID as needed to flush
1111*4882a593Smuzhiyun * stale TLB entries, at which point L0 will sync L2's MMU.
1112*4882a593Smuzhiyun */
1113*4882a593Smuzhiyun static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu)
1114*4882a593Smuzhiyun {
1115*4882a593Smuzhiyun return !enable_ept && !nested_cpu_has_vpid(get_vmcs12(vcpu));
1116*4882a593Smuzhiyun }
1117*4882a593Smuzhiyun
1118*4882a593Smuzhiyun /*
1119*4882a593Smuzhiyun * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
1120*4882a593Smuzhiyun * emulating VM-Entry into a guest with EPT enabled. On failure, the expected
1121*4882a593Smuzhiyun * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
1122*4882a593Smuzhiyun * @entry_failure_code.
1123*4882a593Smuzhiyun */
1124*4882a593Smuzhiyun static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
1125*4882a593Smuzhiyun enum vm_entry_failure_code *entry_failure_code)
1126*4882a593Smuzhiyun {
1127*4882a593Smuzhiyun if (CC(!nested_cr3_valid(vcpu, cr3))) {
1128*4882a593Smuzhiyun *entry_failure_code = ENTRY_FAIL_DEFAULT;
1129*4882a593Smuzhiyun return -EINVAL;
1130*4882a593Smuzhiyun }
1131*4882a593Smuzhiyun
1132*4882a593Smuzhiyun /*
1133*4882a593Smuzhiyun * If PAE paging and EPT are both on, CR3 is not used by the CPU and
1134*4882a593Smuzhiyun * must not be dereferenced.
1135*4882a593Smuzhiyun */
1136*4882a593Smuzhiyun if (!nested_ept && is_pae_paging(vcpu) &&
1137*4882a593Smuzhiyun (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
1138*4882a593Smuzhiyun if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
1139*4882a593Smuzhiyun *entry_failure_code = ENTRY_FAIL_PDPTE;
1140*4882a593Smuzhiyun return -EINVAL;
1141*4882a593Smuzhiyun }
1142*4882a593Smuzhiyun }
1143*4882a593Smuzhiyun
1144*4882a593Smuzhiyun /*
1145*4882a593Smuzhiyun * Unconditionally skip the TLB flush on fast CR3 switch, all TLB
1146*4882a593Smuzhiyun * flushes are handled by nested_vmx_transition_tlb_flush().
1147*4882a593Smuzhiyun */
1148*4882a593Smuzhiyun if (!nested_ept) {
1149*4882a593Smuzhiyun kvm_mmu_new_pgd(vcpu, cr3, true, true);
1150*4882a593Smuzhiyun
1151*4882a593Smuzhiyun /*
1152*4882a593Smuzhiyun * A TLB flush on VM-Enter/VM-Exit flushes all linear mappings
1153*4882a593Smuzhiyun * across all PCIDs, i.e. all PGDs need to be synchronized.
1154*4882a593Smuzhiyun * See nested_vmx_transition_mmu_sync() for more details.
1155*4882a593Smuzhiyun */
1156*4882a593Smuzhiyun if (nested_vmx_transition_mmu_sync(vcpu))
1157*4882a593Smuzhiyun kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
1158*4882a593Smuzhiyun }
1159*4882a593Smuzhiyun
1160*4882a593Smuzhiyun vcpu->arch.cr3 = cr3;
1161*4882a593Smuzhiyun kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
1162*4882a593Smuzhiyun
1163*4882a593Smuzhiyun kvm_init_mmu(vcpu, false);
1164*4882a593Smuzhiyun
1165*4882a593Smuzhiyun return 0;
1166*4882a593Smuzhiyun }
1167*4882a593Smuzhiyun
1168*4882a593Smuzhiyun /*
1169*4882a593Smuzhiyun  * Returns true if KVM is able to configure the CPU to tag TLB entries
1170*4882a593Smuzhiyun  * populated by L2 differently than TLB entries populated by L1.
1172*4882a593Smuzhiyun *
1173*4882a593Smuzhiyun * If L0 uses EPT, L1 and L2 run with different EPTP because
1174*4882a593Smuzhiyun * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
1175*4882a593Smuzhiyun * are tagged with different EPTP.
1176*4882a593Smuzhiyun *
1177*4882a593Smuzhiyun * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
1178*4882a593Smuzhiyun * with different VPID (L1 entries are tagged with vmx->vpid
1179*4882a593Smuzhiyun * while L2 entries are tagged with vmx->nested.vpid02).
1180*4882a593Smuzhiyun */
1181*4882a593Smuzhiyun static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
1182*4882a593Smuzhiyun {
1183*4882a593Smuzhiyun struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1184*4882a593Smuzhiyun
1185*4882a593Smuzhiyun return enable_ept ||
1186*4882a593Smuzhiyun (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
1187*4882a593Smuzhiyun }
1188*4882a593Smuzhiyun
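/*
 * Request the TLB flushes architecturally expected by L1 on a nested VM-Enter
 * (@is_vmenter true) or VM-Exit, based on vmcs12's use of VPID and on whether
 * L2 has its own TLB tag.
 */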
1189*4882a593Smuzhiyun static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
1190*4882a593Smuzhiyun struct vmcs12 *vmcs12,
1191*4882a593Smuzhiyun bool is_vmenter)
1192*4882a593Smuzhiyun {
1193*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
1194*4882a593Smuzhiyun
1195*4882a593Smuzhiyun /*
1196*4882a593Smuzhiyun * If VPID is disabled, linear and combined mappings are flushed on
1197*4882a593Smuzhiyun * VM-Enter/VM-Exit, and guest-physical mappings are valid only for
1198*4882a593Smuzhiyun * their associated EPTP.
1199*4882a593Smuzhiyun */
1200*4882a593Smuzhiyun if (!enable_vpid)
1201*4882a593Smuzhiyun return;
1202*4882a593Smuzhiyun
1203*4882a593Smuzhiyun /*
1204*4882a593Smuzhiyun * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
1205*4882a593Smuzhiyun * for *all* contexts to be flushed on VM-Enter/VM-Exit.
1206*4882a593Smuzhiyun *
1207*4882a593Smuzhiyun  * If VPID is enabled and used by vmcs12, but L2 does not have a unique
1208*4882a593Smuzhiyun * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
1209*4882a593Smuzhiyun * a VPID for L2, flush the current context as the effective ASID is
1210*4882a593Smuzhiyun * common to both L1 and L2.
1211*4882a593Smuzhiyun *
1212*4882a593Smuzhiyun * Defer the flush so that it runs after vmcs02.EPTP has been set by
1213*4882a593Smuzhiyun * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
1214*4882a593Smuzhiyun * redundant flushes further down the nested pipeline.
1215*4882a593Smuzhiyun *
1216*4882a593Smuzhiyun * If a TLB flush isn't required due to any of the above, and vpid12 is
1217*4882a593Smuzhiyun  * changing, then the new "virtual" VPID (vpid12) will reuse the same
1218*4882a593Smuzhiyun * "real" VPID (vpid02), and so needs to be sync'd. There is no direct
1219*4882a593Smuzhiyun * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
1220*4882a593Smuzhiyun * all nested vCPUs.
1221*4882a593Smuzhiyun */
1222*4882a593Smuzhiyun if (!nested_cpu_has_vpid(vmcs12)) {
1223*4882a593Smuzhiyun kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1224*4882a593Smuzhiyun } else if (!nested_has_guest_tlb_tag(vcpu)) {
1225*4882a593Smuzhiyun kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
1226*4882a593Smuzhiyun } else if (is_vmenter &&
1227*4882a593Smuzhiyun vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
1228*4882a593Smuzhiyun vmx->nested.last_vpid = vmcs12->virtual_processor_id;
1229*4882a593Smuzhiyun vpid_sync_context(nested_get_vpid02(vcpu));
1230*4882a593Smuzhiyun }
1231*4882a593Smuzhiyun }
1232*4882a593Smuzhiyun
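/*
 * Return true if, considering only the bits in @mask, every bit set in
 * @subset is also set in @superset.
 */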
1233*4882a593Smuzhiyun static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
1234*4882a593Smuzhiyun {
1235*4882a593Smuzhiyun superset &= mask;
1236*4882a593Smuzhiyun subset &= mask;
1237*4882a593Smuzhiyun
1238*4882a593Smuzhiyun return (superset | subset) == superset;
1239*4882a593Smuzhiyun }
1240*4882a593Smuzhiyun
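/*
 * Userspace restore of MSR_IA32_VMX_BASIC: the new value may only clear
 * feature/reserved bits relative to KVM's reported value, must keep the VMCS
 * revision ID, must not report a smaller VMCS region size, and must not set
 * bit 48 (32-bit physical addresses for VMX structures).
 */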
1241*4882a593Smuzhiyun static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
1242*4882a593Smuzhiyun {
1243*4882a593Smuzhiyun const u64 feature_and_reserved =
1244*4882a593Smuzhiyun /* feature (except bit 48; see below) */
1245*4882a593Smuzhiyun BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
1246*4882a593Smuzhiyun /* reserved */
1247*4882a593Smuzhiyun BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
1248*4882a593Smuzhiyun u64 vmx_basic = vmcs_config.nested.basic;
1249*4882a593Smuzhiyun
1250*4882a593Smuzhiyun if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
1251*4882a593Smuzhiyun return -EINVAL;
1252*4882a593Smuzhiyun
1253*4882a593Smuzhiyun /*
1254*4882a593Smuzhiyun * KVM does not emulate a version of VMX that constrains physical
1255*4882a593Smuzhiyun * addresses of VMX structures (e.g. VMCS) to 32-bits.
1256*4882a593Smuzhiyun */
1257*4882a593Smuzhiyun if (data & BIT_ULL(48))
1258*4882a593Smuzhiyun return -EINVAL;
1259*4882a593Smuzhiyun
1260*4882a593Smuzhiyun if (vmx_basic_vmcs_revision_id(vmx_basic) !=
1261*4882a593Smuzhiyun vmx_basic_vmcs_revision_id(data))
1262*4882a593Smuzhiyun return -EINVAL;
1263*4882a593Smuzhiyun
1264*4882a593Smuzhiyun if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
1265*4882a593Smuzhiyun return -EINVAL;
1266*4882a593Smuzhiyun
1267*4882a593Smuzhiyun vmx->nested.msrs.basic = data;
1268*4882a593Smuzhiyun return 0;
1269*4882a593Smuzhiyun }
1270*4882a593Smuzhiyun
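/*
 * Return pointers to the low and high 32-bit halves of the VMX control
 * capability MSR identified by @msr_index within @msrs.
 */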
1271*4882a593Smuzhiyun static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
1272*4882a593Smuzhiyun u32 **low, u32 **high)
1273*4882a593Smuzhiyun {
1274*4882a593Smuzhiyun switch (msr_index) {
1275*4882a593Smuzhiyun case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1276*4882a593Smuzhiyun *low = &msrs->pinbased_ctls_low;
1277*4882a593Smuzhiyun *high = &msrs->pinbased_ctls_high;
1278*4882a593Smuzhiyun break;
1279*4882a593Smuzhiyun case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1280*4882a593Smuzhiyun *low = &msrs->procbased_ctls_low;
1281*4882a593Smuzhiyun *high = &msrs->procbased_ctls_high;
1282*4882a593Smuzhiyun break;
1283*4882a593Smuzhiyun case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1284*4882a593Smuzhiyun *low = &msrs->exit_ctls_low;
1285*4882a593Smuzhiyun *high = &msrs->exit_ctls_high;
1286*4882a593Smuzhiyun break;
1287*4882a593Smuzhiyun case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1288*4882a593Smuzhiyun *low = &msrs->entry_ctls_low;
1289*4882a593Smuzhiyun *high = &msrs->entry_ctls_high;
1290*4882a593Smuzhiyun break;
1291*4882a593Smuzhiyun case MSR_IA32_VMX_PROCBASED_CTLS2:
1292*4882a593Smuzhiyun *low = &msrs->secondary_ctls_low;
1293*4882a593Smuzhiyun *high = &msrs->secondary_ctls_high;
1294*4882a593Smuzhiyun break;
1295*4882a593Smuzhiyun default:
1296*4882a593Smuzhiyun BUG();
1297*4882a593Smuzhiyun }
1298*4882a593Smuzhiyun }
1299*4882a593Smuzhiyun
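/*
 * Userspace restore of a "true" VMX control capability MSR: the new value
 * must keep every must-be-1 bit set and must not set any bit that KVM does
 * not allow to be 1.
 */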
1300*4882a593Smuzhiyun static int
1301*4882a593Smuzhiyun vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1302*4882a593Smuzhiyun {
1303*4882a593Smuzhiyun u32 *lowp, *highp;
1304*4882a593Smuzhiyun u64 supported;
1305*4882a593Smuzhiyun
1306*4882a593Smuzhiyun vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);
1307*4882a593Smuzhiyun
1308*4882a593Smuzhiyun supported = vmx_control_msr(*lowp, *highp);
1309*4882a593Smuzhiyun
1310*4882a593Smuzhiyun /* Check must-be-1 bits are still 1. */
1311*4882a593Smuzhiyun if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
1312*4882a593Smuzhiyun return -EINVAL;
1313*4882a593Smuzhiyun
1314*4882a593Smuzhiyun /* Check must-be-0 bits are still 0. */
1315*4882a593Smuzhiyun if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
1316*4882a593Smuzhiyun return -EINVAL;
1317*4882a593Smuzhiyun
1318*4882a593Smuzhiyun vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
1319*4882a593Smuzhiyun *lowp = data;
1320*4882a593Smuzhiyun *highp = data >> 32;
1321*4882a593Smuzhiyun return 0;
1322*4882a593Smuzhiyun }
1323*4882a593Smuzhiyun
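/*
 * Userspace restore of MSR_IA32_VMX_MISC: the value must not advertise
 * capabilities beyond KVM's (CR3-target count, MSR-list size) and must keep
 * the preemption timer rate and MSEG revision ID unchanged.
 */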
1324*4882a593Smuzhiyun static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
1325*4882a593Smuzhiyun {
1326*4882a593Smuzhiyun const u64 feature_and_reserved_bits =
1327*4882a593Smuzhiyun /* feature */
1328*4882a593Smuzhiyun BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
1329*4882a593Smuzhiyun BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
1330*4882a593Smuzhiyun /* reserved */
1331*4882a593Smuzhiyun GENMASK_ULL(13, 9) | BIT_ULL(31);
1332*4882a593Smuzhiyun u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
1333*4882a593Smuzhiyun vmcs_config.nested.misc_high);
1334*4882a593Smuzhiyun
1335*4882a593Smuzhiyun if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
1336*4882a593Smuzhiyun return -EINVAL;
1337*4882a593Smuzhiyun
1338*4882a593Smuzhiyun if ((vmx->nested.msrs.pinbased_ctls_high &
1339*4882a593Smuzhiyun PIN_BASED_VMX_PREEMPTION_TIMER) &&
1340*4882a593Smuzhiyun vmx_misc_preemption_timer_rate(data) !=
1341*4882a593Smuzhiyun vmx_misc_preemption_timer_rate(vmx_misc))
1342*4882a593Smuzhiyun return -EINVAL;
1343*4882a593Smuzhiyun
1344*4882a593Smuzhiyun if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
1345*4882a593Smuzhiyun return -EINVAL;
1346*4882a593Smuzhiyun
1347*4882a593Smuzhiyun if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
1348*4882a593Smuzhiyun return -EINVAL;
1349*4882a593Smuzhiyun
1350*4882a593Smuzhiyun if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
1351*4882a593Smuzhiyun return -EINVAL;
1352*4882a593Smuzhiyun
1353*4882a593Smuzhiyun vmx->nested.msrs.misc_low = data;
1354*4882a593Smuzhiyun vmx->nested.msrs.misc_high = data >> 32;
1355*4882a593Smuzhiyun
1356*4882a593Smuzhiyun return 0;
1357*4882a593Smuzhiyun }
1358*4882a593Smuzhiyun
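/*
 * Userspace restore of MSR_IA32_VMX_EPT_VPID_CAP: every bit set in @data must
 * also be reported as supported by KVM.
 */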
1359*4882a593Smuzhiyun static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
1360*4882a593Smuzhiyun {
1361*4882a593Smuzhiyun u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
1362*4882a593Smuzhiyun vmcs_config.nested.vpid_caps);
1363*4882a593Smuzhiyun
1364*4882a593Smuzhiyun /* Every bit is either reserved or a feature bit. */
1365*4882a593Smuzhiyun if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1366*4882a593Smuzhiyun return -EINVAL;
1367*4882a593Smuzhiyun
1368*4882a593Smuzhiyun vmx->nested.msrs.ept_caps = data;
1369*4882a593Smuzhiyun vmx->nested.msrs.vpid_caps = data >> 32;
1370*4882a593Smuzhiyun return 0;
1371*4882a593Smuzhiyun }
1372*4882a593Smuzhiyun
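/* Return a pointer to the CR0/CR4 FIXED0 value in @msrs selected by @msr_index. */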
1373*4882a593Smuzhiyun static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
1374*4882a593Smuzhiyun {
1375*4882a593Smuzhiyun switch (msr_index) {
1376*4882a593Smuzhiyun case MSR_IA32_VMX_CR0_FIXED0:
1377*4882a593Smuzhiyun return &msrs->cr0_fixed0;
1378*4882a593Smuzhiyun case MSR_IA32_VMX_CR4_FIXED0:
1379*4882a593Smuzhiyun return &msrs->cr4_fixed0;
1380*4882a593Smuzhiyun default:
1381*4882a593Smuzhiyun BUG();
1382*4882a593Smuzhiyun }
1383*4882a593Smuzhiyun }
1384*4882a593Smuzhiyun
1385*4882a593Smuzhiyun static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1386*4882a593Smuzhiyun {
1387*4882a593Smuzhiyun const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);
1388*4882a593Smuzhiyun
1389*4882a593Smuzhiyun /*
1390*4882a593Smuzhiyun  * 1 bits (which indicate bits that "must-be-1" during VMX operation)
1391*4882a593Smuzhiyun * must be 1 in the restored value.
1392*4882a593Smuzhiyun */
1393*4882a593Smuzhiyun if (!is_bitwise_subset(data, *msr, -1ULL))
1394*4882a593Smuzhiyun return -EINVAL;
1395*4882a593Smuzhiyun
1396*4882a593Smuzhiyun *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
1397*4882a593Smuzhiyun return 0;
1398*4882a593Smuzhiyun }
1399*4882a593Smuzhiyun
1400*4882a593Smuzhiyun /*
1401*4882a593Smuzhiyun * Called when userspace is restoring VMX MSRs.
1402*4882a593Smuzhiyun *
1403*4882a593Smuzhiyun * Returns 0 on success, non-0 otherwise.
1404*4882a593Smuzhiyun */
1405*4882a593Smuzhiyun int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1406*4882a593Smuzhiyun {
1407*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
1408*4882a593Smuzhiyun
1409*4882a593Smuzhiyun /*
1410*4882a593Smuzhiyun * Don't allow changes to the VMX capability MSRs while the vCPU
1411*4882a593Smuzhiyun * is in VMX operation.
1412*4882a593Smuzhiyun */
1413*4882a593Smuzhiyun if (vmx->nested.vmxon)
1414*4882a593Smuzhiyun return -EBUSY;
1415*4882a593Smuzhiyun
1416*4882a593Smuzhiyun switch (msr_index) {
1417*4882a593Smuzhiyun case MSR_IA32_VMX_BASIC:
1418*4882a593Smuzhiyun return vmx_restore_vmx_basic(vmx, data);
1419*4882a593Smuzhiyun case MSR_IA32_VMX_PINBASED_CTLS:
1420*4882a593Smuzhiyun case MSR_IA32_VMX_PROCBASED_CTLS:
1421*4882a593Smuzhiyun case MSR_IA32_VMX_EXIT_CTLS:
1422*4882a593Smuzhiyun case MSR_IA32_VMX_ENTRY_CTLS:
1423*4882a593Smuzhiyun /*
1424*4882a593Smuzhiyun * The "non-true" VMX capability MSRs are generated from the
1425*4882a593Smuzhiyun * "true" MSRs, so we do not support restoring them directly.
1426*4882a593Smuzhiyun *
1427*4882a593Smuzhiyun * If userspace wants to emulate VMX_BASIC[55]=0, userspace
1428*4882a593Smuzhiyun * should restore the "true" MSRs with the must-be-1 bits
1429*4882a593Smuzhiyun * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1430*4882a593Smuzhiyun * DEFAULT SETTINGS".
1431*4882a593Smuzhiyun */
1432*4882a593Smuzhiyun return -EINVAL;
1433*4882a593Smuzhiyun case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1434*4882a593Smuzhiyun case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1435*4882a593Smuzhiyun case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1436*4882a593Smuzhiyun case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1437*4882a593Smuzhiyun case MSR_IA32_VMX_PROCBASED_CTLS2:
1438*4882a593Smuzhiyun return vmx_restore_control_msr(vmx, msr_index, data);
1439*4882a593Smuzhiyun case MSR_IA32_VMX_MISC:
1440*4882a593Smuzhiyun return vmx_restore_vmx_misc(vmx, data);
1441*4882a593Smuzhiyun case MSR_IA32_VMX_CR0_FIXED0:
1442*4882a593Smuzhiyun case MSR_IA32_VMX_CR4_FIXED0:
1443*4882a593Smuzhiyun return vmx_restore_fixed0_msr(vmx, msr_index, data);
1444*4882a593Smuzhiyun case MSR_IA32_VMX_CR0_FIXED1:
1445*4882a593Smuzhiyun case MSR_IA32_VMX_CR4_FIXED1:
1446*4882a593Smuzhiyun /*
1447*4882a593Smuzhiyun * These MSRs are generated based on the vCPU's CPUID, so we
1448*4882a593Smuzhiyun * do not support restoring them directly.
1449*4882a593Smuzhiyun */
1450*4882a593Smuzhiyun return -EINVAL;
1451*4882a593Smuzhiyun case MSR_IA32_VMX_EPT_VPID_CAP:
1452*4882a593Smuzhiyun return vmx_restore_vmx_ept_vpid_cap(vmx, data);
1453*4882a593Smuzhiyun case MSR_IA32_VMX_VMCS_ENUM:
1454*4882a593Smuzhiyun vmx->nested.msrs.vmcs_enum = data;
1455*4882a593Smuzhiyun return 0;
1456*4882a593Smuzhiyun case MSR_IA32_VMX_VMFUNC:
1457*4882a593Smuzhiyun if (data & ~vmcs_config.nested.vmfunc_controls)
1458*4882a593Smuzhiyun return -EINVAL;
1459*4882a593Smuzhiyun vmx->nested.msrs.vmfunc_controls = data;
1460*4882a593Smuzhiyun return 0;
1461*4882a593Smuzhiyun default:
1462*4882a593Smuzhiyun /*
1463*4882a593Smuzhiyun * The rest of the VMX capability MSRs do not support restore.
1464*4882a593Smuzhiyun */
1465*4882a593Smuzhiyun return -EINVAL;
1466*4882a593Smuzhiyun }
1467*4882a593Smuzhiyun }
1468*4882a593Smuzhiyun
1469*4882a593Smuzhiyun /* Returns 0 on success, non-0 otherwise. */
1470*4882a593Smuzhiyun int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
1471*4882a593Smuzhiyun {
1472*4882a593Smuzhiyun switch (msr_index) {
1473*4882a593Smuzhiyun case MSR_IA32_VMX_BASIC:
1474*4882a593Smuzhiyun *pdata = msrs->basic;
1475*4882a593Smuzhiyun break;
1476*4882a593Smuzhiyun case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1477*4882a593Smuzhiyun case MSR_IA32_VMX_PINBASED_CTLS:
1478*4882a593Smuzhiyun *pdata = vmx_control_msr(
1479*4882a593Smuzhiyun msrs->pinbased_ctls_low,
1480*4882a593Smuzhiyun msrs->pinbased_ctls_high);
1481*4882a593Smuzhiyun if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
1482*4882a593Smuzhiyun *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1483*4882a593Smuzhiyun break;
1484*4882a593Smuzhiyun case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1485*4882a593Smuzhiyun case MSR_IA32_VMX_PROCBASED_CTLS:
1486*4882a593Smuzhiyun *pdata = vmx_control_msr(
1487*4882a593Smuzhiyun msrs->procbased_ctls_low,
1488*4882a593Smuzhiyun msrs->procbased_ctls_high);
1489*4882a593Smuzhiyun if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
1490*4882a593Smuzhiyun *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1491*4882a593Smuzhiyun break;
1492*4882a593Smuzhiyun case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1493*4882a593Smuzhiyun case MSR_IA32_VMX_EXIT_CTLS:
1494*4882a593Smuzhiyun *pdata = vmx_control_msr(
1495*4882a593Smuzhiyun msrs->exit_ctls_low,
1496*4882a593Smuzhiyun msrs->exit_ctls_high);
1497*4882a593Smuzhiyun if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
1498*4882a593Smuzhiyun *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
1499*4882a593Smuzhiyun break;
1500*4882a593Smuzhiyun case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1501*4882a593Smuzhiyun case MSR_IA32_VMX_ENTRY_CTLS:
1502*4882a593Smuzhiyun *pdata = vmx_control_msr(
1503*4882a593Smuzhiyun msrs->entry_ctls_low,
1504*4882a593Smuzhiyun msrs->entry_ctls_high);
1505*4882a593Smuzhiyun if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
1506*4882a593Smuzhiyun *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
1507*4882a593Smuzhiyun break;
1508*4882a593Smuzhiyun case MSR_IA32_VMX_MISC:
1509*4882a593Smuzhiyun *pdata = vmx_control_msr(
1510*4882a593Smuzhiyun msrs->misc_low,
1511*4882a593Smuzhiyun msrs->misc_high);
1512*4882a593Smuzhiyun break;
1513*4882a593Smuzhiyun case MSR_IA32_VMX_CR0_FIXED0:
1514*4882a593Smuzhiyun *pdata = msrs->cr0_fixed0;
1515*4882a593Smuzhiyun break;
1516*4882a593Smuzhiyun case MSR_IA32_VMX_CR0_FIXED1:
1517*4882a593Smuzhiyun *pdata = msrs->cr0_fixed1;
1518*4882a593Smuzhiyun break;
1519*4882a593Smuzhiyun case MSR_IA32_VMX_CR4_FIXED0:
1520*4882a593Smuzhiyun *pdata = msrs->cr4_fixed0;
1521*4882a593Smuzhiyun break;
1522*4882a593Smuzhiyun case MSR_IA32_VMX_CR4_FIXED1:
1523*4882a593Smuzhiyun *pdata = msrs->cr4_fixed1;
1524*4882a593Smuzhiyun break;
1525*4882a593Smuzhiyun case MSR_IA32_VMX_VMCS_ENUM:
1526*4882a593Smuzhiyun *pdata = msrs->vmcs_enum;
1527*4882a593Smuzhiyun break;
1528*4882a593Smuzhiyun case MSR_IA32_VMX_PROCBASED_CTLS2:
1529*4882a593Smuzhiyun *pdata = vmx_control_msr(
1530*4882a593Smuzhiyun msrs->secondary_ctls_low,
1531*4882a593Smuzhiyun msrs->secondary_ctls_high);
1532*4882a593Smuzhiyun break;
1533*4882a593Smuzhiyun case MSR_IA32_VMX_EPT_VPID_CAP:
1534*4882a593Smuzhiyun *pdata = msrs->ept_caps |
1535*4882a593Smuzhiyun ((u64)msrs->vpid_caps << 32);
1536*4882a593Smuzhiyun break;
1537*4882a593Smuzhiyun case MSR_IA32_VMX_VMFUNC:
1538*4882a593Smuzhiyun *pdata = msrs->vmfunc_controls;
1539*4882a593Smuzhiyun break;
1540*4882a593Smuzhiyun default:
1541*4882a593Smuzhiyun return 1;
1542*4882a593Smuzhiyun }
1543*4882a593Smuzhiyun
1544*4882a593Smuzhiyun return 0;
1545*4882a593Smuzhiyun }
1546*4882a593Smuzhiyun
1547*4882a593Smuzhiyun /*
1548*4882a593Smuzhiyun * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
1549*4882a593Smuzhiyun * been modified by the L1 guest. Note, "writable" in this context means
1550*4882a593Smuzhiyun * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
1551*4882a593Smuzhiyun * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
1552*4882a593Smuzhiyun * VM-exit information fields (which are actually writable if the vCPU is
1553*4882a593Smuzhiyun * configured to support "VMWRITE to any supported field in the VMCS").
1554*4882a593Smuzhiyun */
1555*4882a593Smuzhiyun static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1556*4882a593Smuzhiyun {
1557*4882a593Smuzhiyun struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1558*4882a593Smuzhiyun struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1559*4882a593Smuzhiyun struct shadow_vmcs_field field;
1560*4882a593Smuzhiyun unsigned long val;
1561*4882a593Smuzhiyun int i;
1562*4882a593Smuzhiyun
1563*4882a593Smuzhiyun if (WARN_ON(!shadow_vmcs))
1564*4882a593Smuzhiyun return;
1565*4882a593Smuzhiyun
1566*4882a593Smuzhiyun preempt_disable();
1567*4882a593Smuzhiyun
1568*4882a593Smuzhiyun vmcs_load(shadow_vmcs);
1569*4882a593Smuzhiyun
1570*4882a593Smuzhiyun for (i = 0; i < max_shadow_read_write_fields; i++) {
1571*4882a593Smuzhiyun field = shadow_read_write_fields[i];
1572*4882a593Smuzhiyun val = __vmcs_readl(field.encoding);
1573*4882a593Smuzhiyun vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
1574*4882a593Smuzhiyun }
1575*4882a593Smuzhiyun
1576*4882a593Smuzhiyun vmcs_clear(shadow_vmcs);
1577*4882a593Smuzhiyun vmcs_load(vmx->loaded_vmcs->vmcs);
1578*4882a593Smuzhiyun
1579*4882a593Smuzhiyun preempt_enable();
1580*4882a593Smuzhiyun }
1581*4882a593Smuzhiyun
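/*
 * Copy all shadowed fields (both the RW and RO sets) from the cached vmcs12
 * into the shadow VMCS so that L1 reads current values via VMREAD.
 */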
1582*4882a593Smuzhiyun static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1583*4882a593Smuzhiyun {
1584*4882a593Smuzhiyun const struct shadow_vmcs_field *fields[] = {
1585*4882a593Smuzhiyun shadow_read_write_fields,
1586*4882a593Smuzhiyun shadow_read_only_fields
1587*4882a593Smuzhiyun };
1588*4882a593Smuzhiyun const int max_fields[] = {
1589*4882a593Smuzhiyun max_shadow_read_write_fields,
1590*4882a593Smuzhiyun max_shadow_read_only_fields
1591*4882a593Smuzhiyun };
1592*4882a593Smuzhiyun struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1593*4882a593Smuzhiyun struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1594*4882a593Smuzhiyun struct shadow_vmcs_field field;
1595*4882a593Smuzhiyun unsigned long val;
1596*4882a593Smuzhiyun int i, q;
1597*4882a593Smuzhiyun
1598*4882a593Smuzhiyun if (WARN_ON(!shadow_vmcs))
1599*4882a593Smuzhiyun return;
1600*4882a593Smuzhiyun
1601*4882a593Smuzhiyun vmcs_load(shadow_vmcs);
1602*4882a593Smuzhiyun
1603*4882a593Smuzhiyun for (q = 0; q < ARRAY_SIZE(fields); q++) {
1604*4882a593Smuzhiyun for (i = 0; i < max_fields[q]; i++) {
1605*4882a593Smuzhiyun field = fields[q][i];
1606*4882a593Smuzhiyun val = vmcs12_read_any(vmcs12, field.encoding,
1607*4882a593Smuzhiyun field.offset);
1608*4882a593Smuzhiyun __vmcs_writel(field.encoding, val);
1609*4882a593Smuzhiyun }
1610*4882a593Smuzhiyun }
1611*4882a593Smuzhiyun
1612*4882a593Smuzhiyun vmcs_clear(shadow_vmcs);
1613*4882a593Smuzhiyun vmcs_load(vmx->loaded_vmcs->vmcs);
1614*4882a593Smuzhiyun }
1615*4882a593Smuzhiyun
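/*
 * Copy fields from the enlightened VMCS into the cached vmcs12, skipping the
 * field groups the guest has marked clean in hv_clean_fields.
 */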
1616*4882a593Smuzhiyun static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
1617*4882a593Smuzhiyun {
1618*4882a593Smuzhiyun struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1619*4882a593Smuzhiyun struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1620*4882a593Smuzhiyun
1621*4882a593Smuzhiyun /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
1622*4882a593Smuzhiyun vmcs12->tpr_threshold = evmcs->tpr_threshold;
1623*4882a593Smuzhiyun vmcs12->guest_rip = evmcs->guest_rip;
1624*4882a593Smuzhiyun
1625*4882a593Smuzhiyun if (unlikely(!(evmcs->hv_clean_fields &
1626*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
1627*4882a593Smuzhiyun vmcs12->guest_rsp = evmcs->guest_rsp;
1628*4882a593Smuzhiyun vmcs12->guest_rflags = evmcs->guest_rflags;
1629*4882a593Smuzhiyun vmcs12->guest_interruptibility_info =
1630*4882a593Smuzhiyun evmcs->guest_interruptibility_info;
1631*4882a593Smuzhiyun }
1632*4882a593Smuzhiyun
1633*4882a593Smuzhiyun if (unlikely(!(evmcs->hv_clean_fields &
1634*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
1635*4882a593Smuzhiyun vmcs12->cpu_based_vm_exec_control =
1636*4882a593Smuzhiyun evmcs->cpu_based_vm_exec_control;
1637*4882a593Smuzhiyun }
1638*4882a593Smuzhiyun
1639*4882a593Smuzhiyun if (unlikely(!(evmcs->hv_clean_fields &
1640*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
1641*4882a593Smuzhiyun vmcs12->exception_bitmap = evmcs->exception_bitmap;
1642*4882a593Smuzhiyun }
1643*4882a593Smuzhiyun
1644*4882a593Smuzhiyun if (unlikely(!(evmcs->hv_clean_fields &
1645*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
1646*4882a593Smuzhiyun vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
1647*4882a593Smuzhiyun }
1648*4882a593Smuzhiyun
1649*4882a593Smuzhiyun if (unlikely(!(evmcs->hv_clean_fields &
1650*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
1651*4882a593Smuzhiyun vmcs12->vm_entry_intr_info_field =
1652*4882a593Smuzhiyun evmcs->vm_entry_intr_info_field;
1653*4882a593Smuzhiyun vmcs12->vm_entry_exception_error_code =
1654*4882a593Smuzhiyun evmcs->vm_entry_exception_error_code;
1655*4882a593Smuzhiyun vmcs12->vm_entry_instruction_len =
1656*4882a593Smuzhiyun evmcs->vm_entry_instruction_len;
1657*4882a593Smuzhiyun }
1658*4882a593Smuzhiyun
1659*4882a593Smuzhiyun if (unlikely(!(evmcs->hv_clean_fields &
1660*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
1661*4882a593Smuzhiyun vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
1662*4882a593Smuzhiyun vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
1663*4882a593Smuzhiyun vmcs12->host_cr0 = evmcs->host_cr0;
1664*4882a593Smuzhiyun vmcs12->host_cr3 = evmcs->host_cr3;
1665*4882a593Smuzhiyun vmcs12->host_cr4 = evmcs->host_cr4;
1666*4882a593Smuzhiyun vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
1667*4882a593Smuzhiyun vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
1668*4882a593Smuzhiyun vmcs12->host_rip = evmcs->host_rip;
1669*4882a593Smuzhiyun vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
1670*4882a593Smuzhiyun vmcs12->host_es_selector = evmcs->host_es_selector;
1671*4882a593Smuzhiyun vmcs12->host_cs_selector = evmcs->host_cs_selector;
1672*4882a593Smuzhiyun vmcs12->host_ss_selector = evmcs->host_ss_selector;
1673*4882a593Smuzhiyun vmcs12->host_ds_selector = evmcs->host_ds_selector;
1674*4882a593Smuzhiyun vmcs12->host_fs_selector = evmcs->host_fs_selector;
1675*4882a593Smuzhiyun vmcs12->host_gs_selector = evmcs->host_gs_selector;
1676*4882a593Smuzhiyun vmcs12->host_tr_selector = evmcs->host_tr_selector;
1677*4882a593Smuzhiyun }
1678*4882a593Smuzhiyun
1679*4882a593Smuzhiyun if (unlikely(!(evmcs->hv_clean_fields &
1680*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
1681*4882a593Smuzhiyun vmcs12->pin_based_vm_exec_control =
1682*4882a593Smuzhiyun evmcs->pin_based_vm_exec_control;
1683*4882a593Smuzhiyun vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
1684*4882a593Smuzhiyun vmcs12->secondary_vm_exec_control =
1685*4882a593Smuzhiyun evmcs->secondary_vm_exec_control;
1686*4882a593Smuzhiyun }
1687*4882a593Smuzhiyun
1688*4882a593Smuzhiyun if (unlikely(!(evmcs->hv_clean_fields &
1689*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
1690*4882a593Smuzhiyun vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
1691*4882a593Smuzhiyun vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
1692*4882a593Smuzhiyun }
1693*4882a593Smuzhiyun
1694*4882a593Smuzhiyun if (unlikely(!(evmcs->hv_clean_fields &
1695*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
1696*4882a593Smuzhiyun vmcs12->msr_bitmap = evmcs->msr_bitmap;
1697*4882a593Smuzhiyun }
1698*4882a593Smuzhiyun
1699*4882a593Smuzhiyun if (unlikely(!(evmcs->hv_clean_fields &
1700*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
1701*4882a593Smuzhiyun vmcs12->guest_es_base = evmcs->guest_es_base;
1702*4882a593Smuzhiyun vmcs12->guest_cs_base = evmcs->guest_cs_base;
1703*4882a593Smuzhiyun vmcs12->guest_ss_base = evmcs->guest_ss_base;
1704*4882a593Smuzhiyun vmcs12->guest_ds_base = evmcs->guest_ds_base;
1705*4882a593Smuzhiyun vmcs12->guest_fs_base = evmcs->guest_fs_base;
1706*4882a593Smuzhiyun vmcs12->guest_gs_base = evmcs->guest_gs_base;
1707*4882a593Smuzhiyun vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
1708*4882a593Smuzhiyun vmcs12->guest_tr_base = evmcs->guest_tr_base;
1709*4882a593Smuzhiyun vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
1710*4882a593Smuzhiyun vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
1711*4882a593Smuzhiyun vmcs12->guest_es_limit = evmcs->guest_es_limit;
1712*4882a593Smuzhiyun vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
1713*4882a593Smuzhiyun vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
1714*4882a593Smuzhiyun vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
1715*4882a593Smuzhiyun vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
1716*4882a593Smuzhiyun vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
1717*4882a593Smuzhiyun vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
1718*4882a593Smuzhiyun vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
1719*4882a593Smuzhiyun vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
1720*4882a593Smuzhiyun vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
1721*4882a593Smuzhiyun vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
1722*4882a593Smuzhiyun vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
1723*4882a593Smuzhiyun vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
1724*4882a593Smuzhiyun vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
1725*4882a593Smuzhiyun vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
1726*4882a593Smuzhiyun vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
1727*4882a593Smuzhiyun vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
1728*4882a593Smuzhiyun vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
1729*4882a593Smuzhiyun vmcs12->guest_es_selector = evmcs->guest_es_selector;
1730*4882a593Smuzhiyun vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
1731*4882a593Smuzhiyun vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
1732*4882a593Smuzhiyun vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
1733*4882a593Smuzhiyun vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
1734*4882a593Smuzhiyun vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
1735*4882a593Smuzhiyun vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
1736*4882a593Smuzhiyun vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
1737*4882a593Smuzhiyun }
1738*4882a593Smuzhiyun
1739*4882a593Smuzhiyun if (unlikely(!(evmcs->hv_clean_fields &
1740*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
1741*4882a593Smuzhiyun vmcs12->tsc_offset = evmcs->tsc_offset;
1742*4882a593Smuzhiyun vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
1743*4882a593Smuzhiyun vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
1744*4882a593Smuzhiyun }
1745*4882a593Smuzhiyun
1746*4882a593Smuzhiyun if (unlikely(!(evmcs->hv_clean_fields &
1747*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
1748*4882a593Smuzhiyun vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
1749*4882a593Smuzhiyun vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
1750*4882a593Smuzhiyun vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
1751*4882a593Smuzhiyun vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
1752*4882a593Smuzhiyun vmcs12->guest_cr0 = evmcs->guest_cr0;
1753*4882a593Smuzhiyun vmcs12->guest_cr3 = evmcs->guest_cr3;
1754*4882a593Smuzhiyun vmcs12->guest_cr4 = evmcs->guest_cr4;
1755*4882a593Smuzhiyun vmcs12->guest_dr7 = evmcs->guest_dr7;
1756*4882a593Smuzhiyun }
1757*4882a593Smuzhiyun
1758*4882a593Smuzhiyun if (unlikely(!(evmcs->hv_clean_fields &
1759*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
1760*4882a593Smuzhiyun vmcs12->host_fs_base = evmcs->host_fs_base;
1761*4882a593Smuzhiyun vmcs12->host_gs_base = evmcs->host_gs_base;
1762*4882a593Smuzhiyun vmcs12->host_tr_base = evmcs->host_tr_base;
1763*4882a593Smuzhiyun vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
1764*4882a593Smuzhiyun vmcs12->host_idtr_base = evmcs->host_idtr_base;
1765*4882a593Smuzhiyun vmcs12->host_rsp = evmcs->host_rsp;
1766*4882a593Smuzhiyun }
1767*4882a593Smuzhiyun
1768*4882a593Smuzhiyun if (unlikely(!(evmcs->hv_clean_fields &
1769*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
1770*4882a593Smuzhiyun vmcs12->ept_pointer = evmcs->ept_pointer;
1771*4882a593Smuzhiyun vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
1772*4882a593Smuzhiyun }
1773*4882a593Smuzhiyun
1774*4882a593Smuzhiyun if (unlikely(!(evmcs->hv_clean_fields &
1775*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
1776*4882a593Smuzhiyun vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
1777*4882a593Smuzhiyun vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
1778*4882a593Smuzhiyun vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
1779*4882a593Smuzhiyun vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
1780*4882a593Smuzhiyun vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
1781*4882a593Smuzhiyun vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
1782*4882a593Smuzhiyun vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
1783*4882a593Smuzhiyun vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
1784*4882a593Smuzhiyun vmcs12->guest_pending_dbg_exceptions =
1785*4882a593Smuzhiyun evmcs->guest_pending_dbg_exceptions;
1786*4882a593Smuzhiyun vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
1787*4882a593Smuzhiyun vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
1788*4882a593Smuzhiyun vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
1789*4882a593Smuzhiyun vmcs12->guest_activity_state = evmcs->guest_activity_state;
1790*4882a593Smuzhiyun vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
1791*4882a593Smuzhiyun }
1792*4882a593Smuzhiyun
1793*4882a593Smuzhiyun /*
1794*4882a593Smuzhiyun * Not used?
1795*4882a593Smuzhiyun * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1796*4882a593Smuzhiyun * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1797*4882a593Smuzhiyun * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
1798*4882a593Smuzhiyun * vmcs12->page_fault_error_code_mask =
1799*4882a593Smuzhiyun * evmcs->page_fault_error_code_mask;
1800*4882a593Smuzhiyun * vmcs12->page_fault_error_code_match =
1801*4882a593Smuzhiyun * evmcs->page_fault_error_code_match;
1802*4882a593Smuzhiyun * vmcs12->cr3_target_count = evmcs->cr3_target_count;
1803*4882a593Smuzhiyun * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1804*4882a593Smuzhiyun * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1805*4882a593Smuzhiyun * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1806*4882a593Smuzhiyun */
1807*4882a593Smuzhiyun
1808*4882a593Smuzhiyun /*
1809*4882a593Smuzhiyun * Read only fields:
1810*4882a593Smuzhiyun * vmcs12->guest_physical_address = evmcs->guest_physical_address;
1811*4882a593Smuzhiyun * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1812*4882a593Smuzhiyun * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1813*4882a593Smuzhiyun * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1814*4882a593Smuzhiyun * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1815*4882a593Smuzhiyun * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1816*4882a593Smuzhiyun * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1817*4882a593Smuzhiyun * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1818*4882a593Smuzhiyun * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1819*4882a593Smuzhiyun * vmcs12->exit_qualification = evmcs->exit_qualification;
1820*4882a593Smuzhiyun * vmcs12->guest_linear_address = evmcs->guest_linear_address;
1821*4882a593Smuzhiyun *
1822*4882a593Smuzhiyun * Not present in struct vmcs12:
1823*4882a593Smuzhiyun * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1824*4882a593Smuzhiyun * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1825*4882a593Smuzhiyun * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1826*4882a593Smuzhiyun * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1827*4882a593Smuzhiyun */
1828*4882a593Smuzhiyun
1829*4882a593Smuzhiyun return 0;
1830*4882a593Smuzhiyun }
1831*4882a593Smuzhiyun
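/*
 * Copy the vmcs12 fields that KVM may have modified (guest state and VM-exit
 * information) back to the enlightened VMCS; fields owned by L1, listed in
 * the comment below, are not written.
 */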
1832*4882a593Smuzhiyun static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
1833*4882a593Smuzhiyun {
1834*4882a593Smuzhiyun struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1835*4882a593Smuzhiyun struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1836*4882a593Smuzhiyun
1837*4882a593Smuzhiyun /*
1838*4882a593Smuzhiyun * Should not be changed by KVM:
1839*4882a593Smuzhiyun *
1840*4882a593Smuzhiyun * evmcs->host_es_selector = vmcs12->host_es_selector;
1841*4882a593Smuzhiyun * evmcs->host_cs_selector = vmcs12->host_cs_selector;
1842*4882a593Smuzhiyun * evmcs->host_ss_selector = vmcs12->host_ss_selector;
1843*4882a593Smuzhiyun * evmcs->host_ds_selector = vmcs12->host_ds_selector;
1844*4882a593Smuzhiyun * evmcs->host_fs_selector = vmcs12->host_fs_selector;
1845*4882a593Smuzhiyun * evmcs->host_gs_selector = vmcs12->host_gs_selector;
1846*4882a593Smuzhiyun * evmcs->host_tr_selector = vmcs12->host_tr_selector;
1847*4882a593Smuzhiyun * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1848*4882a593Smuzhiyun * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1849*4882a593Smuzhiyun * evmcs->host_cr0 = vmcs12->host_cr0;
1850*4882a593Smuzhiyun * evmcs->host_cr3 = vmcs12->host_cr3;
1851*4882a593Smuzhiyun * evmcs->host_cr4 = vmcs12->host_cr4;
1852*4882a593Smuzhiyun * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1853*4882a593Smuzhiyun * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1854*4882a593Smuzhiyun * evmcs->host_rip = vmcs12->host_rip;
1855*4882a593Smuzhiyun * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1856*4882a593Smuzhiyun * evmcs->host_fs_base = vmcs12->host_fs_base;
1857*4882a593Smuzhiyun * evmcs->host_gs_base = vmcs12->host_gs_base;
1858*4882a593Smuzhiyun * evmcs->host_tr_base = vmcs12->host_tr_base;
1859*4882a593Smuzhiyun * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1860*4882a593Smuzhiyun * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1861*4882a593Smuzhiyun * evmcs->host_rsp = vmcs12->host_rsp;
1862*4882a593Smuzhiyun * sync_vmcs02_to_vmcs12() doesn't read these:
1863*4882a593Smuzhiyun * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1864*4882a593Smuzhiyun * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1865*4882a593Smuzhiyun * evmcs->msr_bitmap = vmcs12->msr_bitmap;
1866*4882a593Smuzhiyun * evmcs->ept_pointer = vmcs12->ept_pointer;
1867*4882a593Smuzhiyun * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1868*4882a593Smuzhiyun * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1869*4882a593Smuzhiyun * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1870*4882a593Smuzhiyun * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
1871*4882a593Smuzhiyun * evmcs->tpr_threshold = vmcs12->tpr_threshold;
1872*4882a593Smuzhiyun * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1873*4882a593Smuzhiyun * evmcs->exception_bitmap = vmcs12->exception_bitmap;
1874*4882a593Smuzhiyun * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1875*4882a593Smuzhiyun * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1876*4882a593Smuzhiyun * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1877*4882a593Smuzhiyun * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1878*4882a593Smuzhiyun * evmcs->page_fault_error_code_mask =
1879*4882a593Smuzhiyun * vmcs12->page_fault_error_code_mask;
1880*4882a593Smuzhiyun * evmcs->page_fault_error_code_match =
1881*4882a593Smuzhiyun * vmcs12->page_fault_error_code_match;
1882*4882a593Smuzhiyun * evmcs->cr3_target_count = vmcs12->cr3_target_count;
1883*4882a593Smuzhiyun * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1884*4882a593Smuzhiyun * evmcs->tsc_offset = vmcs12->tsc_offset;
1885*4882a593Smuzhiyun * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1886*4882a593Smuzhiyun * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1887*4882a593Smuzhiyun * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1888*4882a593Smuzhiyun * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1889*4882a593Smuzhiyun * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1890*4882a593Smuzhiyun * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1891*4882a593Smuzhiyun * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1892*4882a593Smuzhiyun * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1893*4882a593Smuzhiyun *
1894*4882a593Smuzhiyun * Not present in struct vmcs12:
1895*4882a593Smuzhiyun * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1896*4882a593Smuzhiyun * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1897*4882a593Smuzhiyun * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1898*4882a593Smuzhiyun * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
1899*4882a593Smuzhiyun */
1900*4882a593Smuzhiyun
1901*4882a593Smuzhiyun evmcs->guest_es_selector = vmcs12->guest_es_selector;
1902*4882a593Smuzhiyun evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
1903*4882a593Smuzhiyun evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
1904*4882a593Smuzhiyun evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
1905*4882a593Smuzhiyun evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
1906*4882a593Smuzhiyun evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
1907*4882a593Smuzhiyun evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
1908*4882a593Smuzhiyun evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
1909*4882a593Smuzhiyun
1910*4882a593Smuzhiyun evmcs->guest_es_limit = vmcs12->guest_es_limit;
1911*4882a593Smuzhiyun evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
1912*4882a593Smuzhiyun evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
1913*4882a593Smuzhiyun evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
1914*4882a593Smuzhiyun evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
1915*4882a593Smuzhiyun evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
1916*4882a593Smuzhiyun evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
1917*4882a593Smuzhiyun evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
1918*4882a593Smuzhiyun evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
1919*4882a593Smuzhiyun evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
1920*4882a593Smuzhiyun
1921*4882a593Smuzhiyun evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
1922*4882a593Smuzhiyun evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
1923*4882a593Smuzhiyun evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
1924*4882a593Smuzhiyun evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
1925*4882a593Smuzhiyun evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
1926*4882a593Smuzhiyun evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
1927*4882a593Smuzhiyun evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
1928*4882a593Smuzhiyun evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
1929*4882a593Smuzhiyun
1930*4882a593Smuzhiyun evmcs->guest_es_base = vmcs12->guest_es_base;
1931*4882a593Smuzhiyun evmcs->guest_cs_base = vmcs12->guest_cs_base;
1932*4882a593Smuzhiyun evmcs->guest_ss_base = vmcs12->guest_ss_base;
1933*4882a593Smuzhiyun evmcs->guest_ds_base = vmcs12->guest_ds_base;
1934*4882a593Smuzhiyun evmcs->guest_fs_base = vmcs12->guest_fs_base;
1935*4882a593Smuzhiyun evmcs->guest_gs_base = vmcs12->guest_gs_base;
1936*4882a593Smuzhiyun evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
1937*4882a593Smuzhiyun evmcs->guest_tr_base = vmcs12->guest_tr_base;
1938*4882a593Smuzhiyun evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
1939*4882a593Smuzhiyun evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
1940*4882a593Smuzhiyun
1941*4882a593Smuzhiyun evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
1942*4882a593Smuzhiyun evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
1943*4882a593Smuzhiyun
1944*4882a593Smuzhiyun evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
1945*4882a593Smuzhiyun evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
1946*4882a593Smuzhiyun evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
1947*4882a593Smuzhiyun evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
1948*4882a593Smuzhiyun
1949*4882a593Smuzhiyun evmcs->guest_pending_dbg_exceptions =
1950*4882a593Smuzhiyun vmcs12->guest_pending_dbg_exceptions;
1951*4882a593Smuzhiyun evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
1952*4882a593Smuzhiyun evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
1953*4882a593Smuzhiyun
1954*4882a593Smuzhiyun evmcs->guest_activity_state = vmcs12->guest_activity_state;
1955*4882a593Smuzhiyun evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
1956*4882a593Smuzhiyun
1957*4882a593Smuzhiyun evmcs->guest_cr0 = vmcs12->guest_cr0;
1958*4882a593Smuzhiyun evmcs->guest_cr3 = vmcs12->guest_cr3;
1959*4882a593Smuzhiyun evmcs->guest_cr4 = vmcs12->guest_cr4;
1960*4882a593Smuzhiyun evmcs->guest_dr7 = vmcs12->guest_dr7;
1961*4882a593Smuzhiyun
1962*4882a593Smuzhiyun evmcs->guest_physical_address = vmcs12->guest_physical_address;
1963*4882a593Smuzhiyun
1964*4882a593Smuzhiyun evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
1965*4882a593Smuzhiyun evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
1966*4882a593Smuzhiyun evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
1967*4882a593Smuzhiyun evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
1968*4882a593Smuzhiyun evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
1969*4882a593Smuzhiyun evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
1970*4882a593Smuzhiyun evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
1971*4882a593Smuzhiyun evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
1972*4882a593Smuzhiyun
1973*4882a593Smuzhiyun evmcs->exit_qualification = vmcs12->exit_qualification;
1974*4882a593Smuzhiyun
1975*4882a593Smuzhiyun evmcs->guest_linear_address = vmcs12->guest_linear_address;
1976*4882a593Smuzhiyun evmcs->guest_rsp = vmcs12->guest_rsp;
1977*4882a593Smuzhiyun evmcs->guest_rflags = vmcs12->guest_rflags;
1978*4882a593Smuzhiyun
1979*4882a593Smuzhiyun evmcs->guest_interruptibility_info =
1980*4882a593Smuzhiyun vmcs12->guest_interruptibility_info;
1981*4882a593Smuzhiyun evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
1982*4882a593Smuzhiyun evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
1983*4882a593Smuzhiyun evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
1984*4882a593Smuzhiyun evmcs->vm_entry_exception_error_code =
1985*4882a593Smuzhiyun vmcs12->vm_entry_exception_error_code;
1986*4882a593Smuzhiyun evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
1987*4882a593Smuzhiyun
1988*4882a593Smuzhiyun evmcs->guest_rip = vmcs12->guest_rip;
1989*4882a593Smuzhiyun
1990*4882a593Smuzhiyun evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
1991*4882a593Smuzhiyun
1992*4882a593Smuzhiyun return 0;
1993*4882a593Smuzhiyun }
1994*4882a593Smuzhiyun
1995*4882a593Smuzhiyun /*
1996*4882a593Smuzhiyun  * This is the equivalent of the nested hypervisor executing the vmptrld
1997*4882a593Smuzhiyun * instruction.
1998*4882a593Smuzhiyun */
1999*4882a593Smuzhiyun static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
2000*4882a593Smuzhiyun struct kvm_vcpu *vcpu, bool from_launch)
2001*4882a593Smuzhiyun {
2002*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
2003*4882a593Smuzhiyun bool evmcs_gpa_changed = false;
2004*4882a593Smuzhiyun u64 evmcs_gpa;
2005*4882a593Smuzhiyun
2006*4882a593Smuzhiyun if (likely(!vmx->nested.enlightened_vmcs_enabled))
2007*4882a593Smuzhiyun return EVMPTRLD_DISABLED;
2008*4882a593Smuzhiyun
2009*4882a593Smuzhiyun if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
2010*4882a593Smuzhiyun return EVMPTRLD_DISABLED;
2011*4882a593Smuzhiyun
2012*4882a593Smuzhiyun if (unlikely(!vmx->nested.hv_evmcs ||
2013*4882a593Smuzhiyun evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
2014*4882a593Smuzhiyun if (!vmx->nested.hv_evmcs)
2015*4882a593Smuzhiyun vmx->nested.current_vmptr = -1ull;
2016*4882a593Smuzhiyun
2017*4882a593Smuzhiyun nested_release_evmcs(vcpu);
2018*4882a593Smuzhiyun
2019*4882a593Smuzhiyun if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
2020*4882a593Smuzhiyun &vmx->nested.hv_evmcs_map))
2021*4882a593Smuzhiyun return EVMPTRLD_ERROR;
2022*4882a593Smuzhiyun
2023*4882a593Smuzhiyun vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
2024*4882a593Smuzhiyun
2025*4882a593Smuzhiyun /*
2026*4882a593Smuzhiyun * Currently, KVM only supports eVMCS version 1
2027*4882a593Smuzhiyun 		 * (== KVM_EVMCS_VERSION) and thus we expect the guest to set
2028*4882a593Smuzhiyun 		 * the first u32 field of the eVMCS, which specifies the eVMCS
2029*4882a593Smuzhiyun 		 * VersionNumber, to this value.
2030*4882a593Smuzhiyun 		 *
2031*4882a593Smuzhiyun 		 * The guest should learn which eVMCS versions the host supports
2032*4882a593Smuzhiyun 		 * by examining CPUID.0x4000000A.EAX[0:15]. The host userspace
2033*4882a593Smuzhiyun 		 * VMM is expected to set this CPUID leaf according to the value
2034*4882a593Smuzhiyun 		 * returned in vmcs_version from nested_enable_evmcs().
2035*4882a593Smuzhiyun 		 *
2036*4882a593Smuzhiyun 		 * However, it turns out that Microsoft Hyper-V fails to comply
2037*4882a593Smuzhiyun 		 * with its own invented interface: when Hyper-V uses eVMCS, it
2038*4882a593Smuzhiyun 		 * sets the first u32 field of the eVMCS to the revision_id
2039*4882a593Smuzhiyun 		 * specified in MSR_IA32_VMX_BASIC instead of an eVMCS version
2040*4882a593Smuzhiyun 		 * number, which would be one of the supported versions specified
2041*4882a593Smuzhiyun 		 * in CPUID.0x4000000A.EAX[0:15].
2042*4882a593Smuzhiyun 		 *
2043*4882a593Smuzhiyun 		 * To work around this Hyper-V bug, accept either a supported
2044*4882a593Smuzhiyun 		 * eVMCS version or the VMCS12 revision_id as valid values for
2045*4882a593Smuzhiyun 		 * the first u32 field of the eVMCS.
2046*4882a593Smuzhiyun */
2047*4882a593Smuzhiyun if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
2048*4882a593Smuzhiyun (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
2049*4882a593Smuzhiyun nested_release_evmcs(vcpu);
2050*4882a593Smuzhiyun return EVMPTRLD_VMFAIL;
2051*4882a593Smuzhiyun }
2052*4882a593Smuzhiyun
2053*4882a593Smuzhiyun vmx->nested.dirty_vmcs12 = true;
2054*4882a593Smuzhiyun vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
2055*4882a593Smuzhiyun
2056*4882a593Smuzhiyun evmcs_gpa_changed = true;
2057*4882a593Smuzhiyun /*
2058*4882a593Smuzhiyun * Unlike normal vmcs12, enlightened vmcs12 is not fully
2059*4882a593Smuzhiyun * reloaded from guest's memory (read only fields, fields not
2060*4882a593Smuzhiyun * present in struct hv_enlightened_vmcs, ...). Make sure there
2061*4882a593Smuzhiyun * are no leftovers.
2062*4882a593Smuzhiyun */
2063*4882a593Smuzhiyun if (from_launch) {
2064*4882a593Smuzhiyun struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2065*4882a593Smuzhiyun memset(vmcs12, 0, sizeof(*vmcs12));
2066*4882a593Smuzhiyun vmcs12->hdr.revision_id = VMCS12_REVISION;
2067*4882a593Smuzhiyun }
2068*4882a593Smuzhiyun
2069*4882a593Smuzhiyun }
2070*4882a593Smuzhiyun
2071*4882a593Smuzhiyun /*
2072*4882a593Smuzhiyun 	 * Clean fields data can't be used on VMLAUNCH or when we switch
2073*4882a593Smuzhiyun * between different L2 guests as KVM keeps a single VMCS12 per L1.
2074*4882a593Smuzhiyun */
2075*4882a593Smuzhiyun if (from_launch || evmcs_gpa_changed)
2076*4882a593Smuzhiyun vmx->nested.hv_evmcs->hv_clean_fields &=
2077*4882a593Smuzhiyun ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2078*4882a593Smuzhiyun
2079*4882a593Smuzhiyun return EVMPTRLD_SUCCEEDED;
2080*4882a593Smuzhiyun }
2081*4882a593Smuzhiyun
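/*
 * Flush the cached vmcs12 back to the structure L1 actually reads: the
 * enlightened VMCS when eVMCS is in use, the shadow VMCS otherwise.
 */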
2082*4882a593Smuzhiyun void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
2083*4882a593Smuzhiyun {
2084*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
2085*4882a593Smuzhiyun
2086*4882a593Smuzhiyun if (vmx->nested.hv_evmcs) {
2087*4882a593Smuzhiyun copy_vmcs12_to_enlightened(vmx);
2088*4882a593Smuzhiyun /* All fields are clean */
2089*4882a593Smuzhiyun vmx->nested.hv_evmcs->hv_clean_fields |=
2090*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2091*4882a593Smuzhiyun } else {
2092*4882a593Smuzhiyun copy_vmcs12_to_shadow(vmx);
2093*4882a593Smuzhiyun }
2094*4882a593Smuzhiyun
2095*4882a593Smuzhiyun vmx->nested.need_vmcs12_to_shadow_sync = false;
2096*4882a593Smuzhiyun }
2097*4882a593Smuzhiyun
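/*
 * hrtimer callback for the emulated VMX preemption timer: mark the timer as
 * expired and kick the vCPU so the pending expiry is handled.
 */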
2098*4882a593Smuzhiyun static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
2099*4882a593Smuzhiyun {
2100*4882a593Smuzhiyun struct vcpu_vmx *vmx =
2101*4882a593Smuzhiyun container_of(timer, struct vcpu_vmx, nested.preemption_timer);
2102*4882a593Smuzhiyun
2103*4882a593Smuzhiyun vmx->nested.preemption_timer_expired = true;
2104*4882a593Smuzhiyun kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
2105*4882a593Smuzhiyun kvm_vcpu_kick(&vmx->vcpu);
2106*4882a593Smuzhiyun
2107*4882a593Smuzhiyun return HRTIMER_NORESTART;
2108*4882a593Smuzhiyun }
2109*4882a593Smuzhiyun
2110*4882a593Smuzhiyun static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
2111*4882a593Smuzhiyun {
2112*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
2113*4882a593Smuzhiyun struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2114*4882a593Smuzhiyun
2115*4882a593Smuzhiyun u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
2116*4882a593Smuzhiyun VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2117*4882a593Smuzhiyun
2118*4882a593Smuzhiyun if (!vmx->nested.has_preemption_timer_deadline) {
2119*4882a593Smuzhiyun vmx->nested.preemption_timer_deadline =
2120*4882a593Smuzhiyun vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
2121*4882a593Smuzhiyun vmx->nested.has_preemption_timer_deadline = true;
2122*4882a593Smuzhiyun }
2123*4882a593Smuzhiyun return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
2124*4882a593Smuzhiyun }
2125*4882a593Smuzhiyun
2126*4882a593Smuzhiyun static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
2127*4882a593Smuzhiyun u64 preemption_timeout)
2128*4882a593Smuzhiyun {
2129*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
2130*4882a593Smuzhiyun
2131*4882a593Smuzhiyun /*
2132*4882a593Smuzhiyun * A timer value of zero is architecturally guaranteed to cause
2133*4882a593Smuzhiyun * a VMExit prior to executing any instructions in the guest.
2134*4882a593Smuzhiyun */
2135*4882a593Smuzhiyun if (preemption_timeout == 0) {
2136*4882a593Smuzhiyun vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
2137*4882a593Smuzhiyun return;
2138*4882a593Smuzhiyun }
2139*4882a593Smuzhiyun
2140*4882a593Smuzhiyun if (vcpu->arch.virtual_tsc_khz == 0)
2141*4882a593Smuzhiyun return;
2142*4882a593Smuzhiyun
2143*4882a593Smuzhiyun preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2144*4882a593Smuzhiyun preemption_timeout *= 1000000;
2145*4882a593Smuzhiyun do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
2146*4882a593Smuzhiyun hrtimer_start(&vmx->nested.preemption_timer,
2147*4882a593Smuzhiyun ktime_add_ns(ktime_get(), preemption_timeout),
2148*4882a593Smuzhiyun HRTIMER_MODE_ABS_PINNED);
2149*4882a593Smuzhiyun }
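
/*
 * Worked example of the conversion above (the numbers are hypothetical,
 * for illustration only): with VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE = 5,
 * one preemption-timer tick is 2^5 = 32 TSC cycles. For a guest with
 * virtual_tsc_khz = 2,000,000 (a 2 GHz TSC) and a vmcs12 timer value of
 * 62500 ticks:
 *
 *	cycles = 62500 << 5                     = 2,000,000
 *	ns     = cycles * 1,000,000 / 2,000,000 = 1,000,000
 *
 * so the hrtimer fires roughly 1 ms after the nested VM-entry.
 */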
2150*4882a593Smuzhiyun
2151*4882a593Smuzhiyun static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2152*4882a593Smuzhiyun {
2153*4882a593Smuzhiyun if (vmx->nested.nested_run_pending &&
2154*4882a593Smuzhiyun (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
2155*4882a593Smuzhiyun return vmcs12->guest_ia32_efer;
2156*4882a593Smuzhiyun else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
2157*4882a593Smuzhiyun return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
2158*4882a593Smuzhiyun else
2159*4882a593Smuzhiyun return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
2160*4882a593Smuzhiyun }
2161*4882a593Smuzhiyun
2162*4882a593Smuzhiyun static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
2163*4882a593Smuzhiyun {
2164*4882a593Smuzhiyun /*
2165*4882a593Smuzhiyun * If vmcs02 hasn't been initialized, set the constant vmcs02 state
2166*4882a593Smuzhiyun * according to L0's settings (vmcs12 is irrelevant here). Host
2167*4882a593Smuzhiyun * fields that come from L0 and are not constant, e.g. HOST_CR3,
2168*4882a593Smuzhiyun * will be set as needed prior to VMLAUNCH/VMRESUME.
2169*4882a593Smuzhiyun */
2170*4882a593Smuzhiyun if (vmx->nested.vmcs02_initialized)
2171*4882a593Smuzhiyun return;
2172*4882a593Smuzhiyun vmx->nested.vmcs02_initialized = true;
2173*4882a593Smuzhiyun
2174*4882a593Smuzhiyun /*
2175*4882a593Smuzhiyun * We don't care what the EPTP value is; we just need to guarantee
2176*4882a593Smuzhiyun * it's valid so that we don't get a false positive when doing early
2177*4882a593Smuzhiyun * consistency checks.
2178*4882a593Smuzhiyun */
2179*4882a593Smuzhiyun if (enable_ept && nested_early_check)
2180*4882a593Smuzhiyun vmcs_write64(EPT_POINTER,
2181*4882a593Smuzhiyun construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
2182*4882a593Smuzhiyun
2183*4882a593Smuzhiyun /* All VMFUNCs are currently emulated through L0 vmexits. */
2184*4882a593Smuzhiyun if (cpu_has_vmx_vmfunc())
2185*4882a593Smuzhiyun vmcs_write64(VM_FUNCTION_CONTROL, 0);
2186*4882a593Smuzhiyun
2187*4882a593Smuzhiyun if (cpu_has_vmx_posted_intr())
2188*4882a593Smuzhiyun vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
2189*4882a593Smuzhiyun
2190*4882a593Smuzhiyun if (cpu_has_vmx_msr_bitmap())
2191*4882a593Smuzhiyun vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
2192*4882a593Smuzhiyun
2193*4882a593Smuzhiyun /*
2194*4882a593Smuzhiyun * The PML address never changes, so it is constant in vmcs02.
2195*4882a593Smuzhiyun * Conceptually we want to copy the PML index from vmcs01 here,
2196*4882a593Smuzhiyun * and then back to vmcs01 on nested vmexit. But since we flush
2197*4882a593Smuzhiyun * the log and reset GUEST_PML_INDEX on each vmexit, the PML
2198*4882a593Smuzhiyun * index is also effectively constant in vmcs02.
2199*4882a593Smuzhiyun */
2200*4882a593Smuzhiyun if (enable_pml) {
2201*4882a593Smuzhiyun vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
2202*4882a593Smuzhiyun vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
2203*4882a593Smuzhiyun }
2204*4882a593Smuzhiyun
2205*4882a593Smuzhiyun if (cpu_has_vmx_encls_vmexit())
2206*4882a593Smuzhiyun vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
2207*4882a593Smuzhiyun
2208*4882a593Smuzhiyun /*
2209*4882a593Smuzhiyun * Set the MSR load/store lists to match L0's settings. Only the
2210*4882a593Smuzhiyun * addresses are constant (for vmcs02); the counts can change based
2211*4882a593Smuzhiyun * on L2's behavior, e.g. switching to/from long mode.
2212*4882a593Smuzhiyun */
2213*4882a593Smuzhiyun vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
2214*4882a593Smuzhiyun vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2215*4882a593Smuzhiyun vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
2216*4882a593Smuzhiyun
2217*4882a593Smuzhiyun vmx_set_constant_host_state(vmx);
2218*4882a593Smuzhiyun }
2219*4882a593Smuzhiyun
2220*4882a593Smuzhiyun static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
2221*4882a593Smuzhiyun struct vmcs12 *vmcs12)
2222*4882a593Smuzhiyun {
2223*4882a593Smuzhiyun prepare_vmcs02_constant_state(vmx);
2224*4882a593Smuzhiyun
2225*4882a593Smuzhiyun vmcs_write64(VMCS_LINK_POINTER, -1ull);
2226*4882a593Smuzhiyun
2227*4882a593Smuzhiyun if (enable_vpid) {
2228*4882a593Smuzhiyun if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
2229*4882a593Smuzhiyun vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
2230*4882a593Smuzhiyun else
2231*4882a593Smuzhiyun vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2232*4882a593Smuzhiyun }
2233*4882a593Smuzhiyun }
2234*4882a593Smuzhiyun
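/*
 * Rough shape of the control merging done below (an informal summary, not
 * a precise rule; several bits are special-cased in the code):
 *
 *	vmcs02 controls = (vmcs01 controls that L0 needs for itself)
 *			| (vmcs12 controls requested by L1)
 *
 * with bits that KVM emulates or owns (e.g. the VMX preemption timer,
 * MSR bitmaps, PML, VMCS shadowing) masked off or recomputed rather than
 * taken verbatim from vmcs12.
 */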
2235*4882a593Smuzhiyun static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
2236*4882a593Smuzhiyun struct vmcs12 *vmcs12)
2237*4882a593Smuzhiyun {
2238*4882a593Smuzhiyun u32 exec_control, vmcs12_exec_ctrl;
2239*4882a593Smuzhiyun u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
2240*4882a593Smuzhiyun
2241*4882a593Smuzhiyun if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
2242*4882a593Smuzhiyun prepare_vmcs02_early_rare(vmx, vmcs12);
2243*4882a593Smuzhiyun
2244*4882a593Smuzhiyun /*
2245*4882a593Smuzhiyun * PIN CONTROLS
2246*4882a593Smuzhiyun */
2247*4882a593Smuzhiyun exec_control = __pin_controls_get(vmcs01);
2248*4882a593Smuzhiyun exec_control |= (vmcs12->pin_based_vm_exec_control &
2249*4882a593Smuzhiyun ~PIN_BASED_VMX_PREEMPTION_TIMER);
2250*4882a593Smuzhiyun
2251*4882a593Smuzhiyun /* Posted interrupts setting is only taken from vmcs12. */
2252*4882a593Smuzhiyun vmx->nested.pi_pending = false;
2253*4882a593Smuzhiyun if (nested_cpu_has_posted_intr(vmcs12))
2254*4882a593Smuzhiyun vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
2255*4882a593Smuzhiyun else
2256*4882a593Smuzhiyun exec_control &= ~PIN_BASED_POSTED_INTR;
2257*4882a593Smuzhiyun pin_controls_set(vmx, exec_control);
2258*4882a593Smuzhiyun
2259*4882a593Smuzhiyun /*
2260*4882a593Smuzhiyun * EXEC CONTROLS
2261*4882a593Smuzhiyun */
2262*4882a593Smuzhiyun exec_control = __exec_controls_get(vmcs01); /* L0's desires */
2263*4882a593Smuzhiyun exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
2264*4882a593Smuzhiyun exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
2265*4882a593Smuzhiyun exec_control &= ~CPU_BASED_TPR_SHADOW;
2266*4882a593Smuzhiyun exec_control |= vmcs12->cpu_based_vm_exec_control;
2267*4882a593Smuzhiyun
2268*4882a593Smuzhiyun vmx->nested.l1_tpr_threshold = -1;
2269*4882a593Smuzhiyun if (exec_control & CPU_BASED_TPR_SHADOW)
2270*4882a593Smuzhiyun vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
2271*4882a593Smuzhiyun #ifdef CONFIG_X86_64
2272*4882a593Smuzhiyun else
2273*4882a593Smuzhiyun exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2274*4882a593Smuzhiyun CPU_BASED_CR8_STORE_EXITING;
2275*4882a593Smuzhiyun #endif
2276*4882a593Smuzhiyun
2277*4882a593Smuzhiyun /*
2278*4882a593Smuzhiyun * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
2279*4882a593Smuzhiyun * for I/O port accesses.
2280*4882a593Smuzhiyun */
2281*4882a593Smuzhiyun exec_control |= CPU_BASED_UNCOND_IO_EXITING;
2282*4882a593Smuzhiyun exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2283*4882a593Smuzhiyun
2284*4882a593Smuzhiyun /*
2285*4882a593Smuzhiyun * This bit will be computed in nested_get_vmcs12_pages, because
2286*4882a593Smuzhiyun * we do not have access to L1's MSR bitmap yet. For now, keep
2287*4882a593Smuzhiyun * the same bit as before, hoping to avoid multiple VMWRITEs that
2288*4882a593Smuzhiyun * only set/clear this bit.
2289*4882a593Smuzhiyun */
2290*4882a593Smuzhiyun exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
2291*4882a593Smuzhiyun exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
2292*4882a593Smuzhiyun
2293*4882a593Smuzhiyun exec_controls_set(vmx, exec_control);
2294*4882a593Smuzhiyun
2295*4882a593Smuzhiyun /*
2296*4882a593Smuzhiyun * SECONDARY EXEC CONTROLS
2297*4882a593Smuzhiyun */
2298*4882a593Smuzhiyun if (cpu_has_secondary_exec_ctrls()) {
2299*4882a593Smuzhiyun exec_control = __secondary_exec_controls_get(vmcs01);
2300*4882a593Smuzhiyun
2301*4882a593Smuzhiyun /* Take the following fields only from vmcs12 */
2302*4882a593Smuzhiyun exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2303*4882a593Smuzhiyun SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2304*4882a593Smuzhiyun SECONDARY_EXEC_ENABLE_INVPCID |
2305*4882a593Smuzhiyun SECONDARY_EXEC_ENABLE_RDTSCP |
2306*4882a593Smuzhiyun SECONDARY_EXEC_XSAVES |
2307*4882a593Smuzhiyun SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
2308*4882a593Smuzhiyun SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2309*4882a593Smuzhiyun SECONDARY_EXEC_APIC_REGISTER_VIRT |
2310*4882a593Smuzhiyun SECONDARY_EXEC_ENABLE_VMFUNC |
2311*4882a593Smuzhiyun SECONDARY_EXEC_DESC);
2312*4882a593Smuzhiyun
2313*4882a593Smuzhiyun if (nested_cpu_has(vmcs12,
2314*4882a593Smuzhiyun CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
2315*4882a593Smuzhiyun vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
2316*4882a593Smuzhiyun ~SECONDARY_EXEC_ENABLE_PML;
2317*4882a593Smuzhiyun exec_control |= vmcs12_exec_ctrl;
2318*4882a593Smuzhiyun }
2319*4882a593Smuzhiyun
2320*4882a593Smuzhiyun /* VMCS shadowing for L2 is emulated for now */
2321*4882a593Smuzhiyun exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2322*4882a593Smuzhiyun
2323*4882a593Smuzhiyun /*
2324*4882a593Smuzhiyun * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
2325*4882a593Smuzhiyun * will not have to rewrite the controls just for this bit.
2326*4882a593Smuzhiyun */
2327*4882a593Smuzhiyun if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
2328*4882a593Smuzhiyun (vmcs12->guest_cr4 & X86_CR4_UMIP))
2329*4882a593Smuzhiyun exec_control |= SECONDARY_EXEC_DESC;
2330*4882a593Smuzhiyun
2331*4882a593Smuzhiyun if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2332*4882a593Smuzhiyun vmcs_write16(GUEST_INTR_STATUS,
2333*4882a593Smuzhiyun vmcs12->guest_intr_status);
2334*4882a593Smuzhiyun
2335*4882a593Smuzhiyun if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
2336*4882a593Smuzhiyun exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2337*4882a593Smuzhiyun
2338*4882a593Smuzhiyun secondary_exec_controls_set(vmx, exec_control);
2339*4882a593Smuzhiyun }
2340*4882a593Smuzhiyun
2341*4882a593Smuzhiyun /*
2342*4882a593Smuzhiyun * ENTRY CONTROLS
2343*4882a593Smuzhiyun *
2344*4882a593Smuzhiyun * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
2345*4882a593Smuzhiyun * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
2346*4882a593Smuzhiyun * on the related bits (if supported by the CPU) in the hope that
2347*4882a593Smuzhiyun * we can avoid VMWrites during vmx_set_efer().
2348*4882a593Smuzhiyun *
2349*4882a593Smuzhiyun * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is
2350*4882a593Smuzhiyun * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to
2351*4882a593Smuzhiyun * do the same for L2.
2352*4882a593Smuzhiyun */
2353*4882a593Smuzhiyun exec_control = __vm_entry_controls_get(vmcs01);
2354*4882a593Smuzhiyun exec_control |= (vmcs12->vm_entry_controls &
2355*4882a593Smuzhiyun ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
2356*4882a593Smuzhiyun exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
2357*4882a593Smuzhiyun if (cpu_has_load_ia32_efer()) {
2358*4882a593Smuzhiyun if (guest_efer & EFER_LMA)
2359*4882a593Smuzhiyun exec_control |= VM_ENTRY_IA32E_MODE;
2360*4882a593Smuzhiyun if (guest_efer != host_efer)
2361*4882a593Smuzhiyun exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2362*4882a593Smuzhiyun }
2363*4882a593Smuzhiyun vm_entry_controls_set(vmx, exec_control);
2364*4882a593Smuzhiyun
2365*4882a593Smuzhiyun /*
2366*4882a593Smuzhiyun * EXIT CONTROLS
2367*4882a593Smuzhiyun *
2368*4882a593Smuzhiyun * L2->L1 exit controls are emulated - the hardware exit is to L0 so
2369*4882a593Smuzhiyun * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
2370*4882a593Smuzhiyun * bits may be modified by vmx_set_efer() in prepare_vmcs02().
2371*4882a593Smuzhiyun */
2372*4882a593Smuzhiyun exec_control = __vm_exit_controls_get(vmcs01);
2373*4882a593Smuzhiyun if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
2374*4882a593Smuzhiyun exec_control |= VM_EXIT_LOAD_IA32_EFER;
2375*4882a593Smuzhiyun else
2376*4882a593Smuzhiyun exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
2377*4882a593Smuzhiyun vm_exit_controls_set(vmx, exec_control);
2378*4882a593Smuzhiyun
2379*4882a593Smuzhiyun /*
2380*4882a593Smuzhiyun * Interrupt/Exception Fields
2381*4882a593Smuzhiyun */
2382*4882a593Smuzhiyun if (vmx->nested.nested_run_pending) {
2383*4882a593Smuzhiyun vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2384*4882a593Smuzhiyun vmcs12->vm_entry_intr_info_field);
2385*4882a593Smuzhiyun vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2386*4882a593Smuzhiyun vmcs12->vm_entry_exception_error_code);
2387*4882a593Smuzhiyun vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2388*4882a593Smuzhiyun vmcs12->vm_entry_instruction_len);
2389*4882a593Smuzhiyun vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2390*4882a593Smuzhiyun vmcs12->guest_interruptibility_info);
2391*4882a593Smuzhiyun vmx->loaded_vmcs->nmi_known_unmasked =
2392*4882a593Smuzhiyun !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
2393*4882a593Smuzhiyun } else {
2394*4882a593Smuzhiyun vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
2395*4882a593Smuzhiyun }
2396*4882a593Smuzhiyun }
2397*4882a593Smuzhiyun
2398*4882a593Smuzhiyun static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2399*4882a593Smuzhiyun {
2400*4882a593Smuzhiyun struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2401*4882a593Smuzhiyun
2402*4882a593Smuzhiyun if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2403*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2404*4882a593Smuzhiyun vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
2405*4882a593Smuzhiyun vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
2406*4882a593Smuzhiyun vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
2407*4882a593Smuzhiyun vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
2408*4882a593Smuzhiyun vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
2409*4882a593Smuzhiyun vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
2410*4882a593Smuzhiyun vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
2411*4882a593Smuzhiyun vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
2412*4882a593Smuzhiyun vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
2413*4882a593Smuzhiyun vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
2414*4882a593Smuzhiyun vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
2415*4882a593Smuzhiyun vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
2416*4882a593Smuzhiyun vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
2417*4882a593Smuzhiyun vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
2418*4882a593Smuzhiyun vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
2419*4882a593Smuzhiyun vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2420*4882a593Smuzhiyun vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2421*4882a593Smuzhiyun vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
2422*4882a593Smuzhiyun vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2423*4882a593Smuzhiyun vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
2424*4882a593Smuzhiyun vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2425*4882a593Smuzhiyun vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2426*4882a593Smuzhiyun vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
2427*4882a593Smuzhiyun vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
2428*4882a593Smuzhiyun vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
2429*4882a593Smuzhiyun vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
2430*4882a593Smuzhiyun vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
2431*4882a593Smuzhiyun vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
2432*4882a593Smuzhiyun vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
2433*4882a593Smuzhiyun vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
2434*4882a593Smuzhiyun vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
2435*4882a593Smuzhiyun vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
2436*4882a593Smuzhiyun vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
2437*4882a593Smuzhiyun vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
2438*4882a593Smuzhiyun vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
2439*4882a593Smuzhiyun vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
2440*4882a593Smuzhiyun
2441*4882a593Smuzhiyun vmx->segment_cache.bitmask = 0;
2442*4882a593Smuzhiyun }
2443*4882a593Smuzhiyun
2444*4882a593Smuzhiyun if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2445*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
2446*4882a593Smuzhiyun vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2447*4882a593Smuzhiyun vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2448*4882a593Smuzhiyun vmcs12->guest_pending_dbg_exceptions);
2449*4882a593Smuzhiyun vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2450*4882a593Smuzhiyun vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2451*4882a593Smuzhiyun
2452*4882a593Smuzhiyun /*
2453*4882a593Smuzhiyun * L1 may access L2's PDPTRs, so save them in order to construct
2454*4882a593Smuzhiyun * vmcs12.
2455*4882a593Smuzhiyun */
2456*4882a593Smuzhiyun if (enable_ept) {
2457*4882a593Smuzhiyun vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2458*4882a593Smuzhiyun vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2459*4882a593Smuzhiyun vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2460*4882a593Smuzhiyun vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2461*4882a593Smuzhiyun }
2462*4882a593Smuzhiyun
2463*4882a593Smuzhiyun if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
2464*4882a593Smuzhiyun (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2465*4882a593Smuzhiyun vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
2466*4882a593Smuzhiyun }
2467*4882a593Smuzhiyun
2468*4882a593Smuzhiyun if (nested_cpu_has_xsaves(vmcs12))
2469*4882a593Smuzhiyun vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2470*4882a593Smuzhiyun
2471*4882a593Smuzhiyun /*
2472*4882a593Smuzhiyun * Whether page-faults are trapped is determined by a combination of
2473*4882a593Smuzhiyun * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
2474*4882a593Smuzhiyun * doesn't care about page faults then we should set all of these to
2475*4882a593Smuzhiyun * L1's desires. However, if L0 does care about (some) page faults, it
2476*4882a593Smuzhiyun * is not easy (if at all possible) to merge L0's and L1's desires, so
2477*4882a593Smuzhiyun * we simply ask to exit on each and every L2 page fault. This is done
2478*4882a593Smuzhiyun * by setting MASK=MATCH=0 and (see below) EB.PF=1.
2479*4882a593Smuzhiyun * Note that below we don't need special code to set EB.PF beyond the
2480*4882a593Smuzhiyun * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept is
2481*4882a593Smuzhiyun * set, vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and
2482*4882a593Smuzhiyun * when !enable_ept, EB.PF is 1, so the "or" will always be 1.
2483*4882a593Smuzhiyun */
2484*4882a593Smuzhiyun if (vmx_need_pf_intercept(&vmx->vcpu)) {
2485*4882a593Smuzhiyun /*
2486*4882a593Smuzhiyun * TODO: if both L0 and L1 need the same MASK and MATCH,
2487*4882a593Smuzhiyun * go ahead and use it?
2488*4882a593Smuzhiyun */
2489*4882a593Smuzhiyun vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
2490*4882a593Smuzhiyun vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
2491*4882a593Smuzhiyun } else {
2492*4882a593Smuzhiyun vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
2493*4882a593Smuzhiyun vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
2494*4882a593Smuzhiyun }
2495*4882a593Smuzhiyun
2496*4882a593Smuzhiyun if (cpu_has_vmx_apicv()) {
2497*4882a593Smuzhiyun vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2498*4882a593Smuzhiyun vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2499*4882a593Smuzhiyun vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2500*4882a593Smuzhiyun vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2501*4882a593Smuzhiyun }
2502*4882a593Smuzhiyun
2503*4882a593Smuzhiyun /*
2504*4882a593Smuzhiyun * Make sure the msr_autostore list is up to date before we set the
2505*4882a593Smuzhiyun * count in the vmcs02.
2506*4882a593Smuzhiyun */
2507*4882a593Smuzhiyun prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
2508*4882a593Smuzhiyun
2509*4882a593Smuzhiyun vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
2510*4882a593Smuzhiyun vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2511*4882a593Smuzhiyun vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2512*4882a593Smuzhiyun
2513*4882a593Smuzhiyun set_cr4_guest_host_mask(vmx);
2514*4882a593Smuzhiyun }
2515*4882a593Smuzhiyun
2516*4882a593Smuzhiyun /*
2517*4882a593Smuzhiyun * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2518*4882a593Smuzhiyun * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2519*4882a593Smuzhiyun * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2520*4882a593Smuzhiyun * guest in a way that is appropriate both to L1's requests and to our own
2521*4882a593Smuzhiyun * needs. In addition to modifying the active vmcs (which is vmcs02), this
2522*4882a593Smuzhiyun * function also has necessary side effects, like setting various
2523*4882a593Smuzhiyun * vcpu->arch fields.
2524*4882a593Smuzhiyun * Returns 0 on success and -EINVAL on failure; the invalid-state exit
2525*4882a593Smuzhiyun * qualification code is assigned to *entry_failure_code on failure.
2526*4882a593Smuzhiyun */
2527*4882a593Smuzhiyun static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2528*4882a593Smuzhiyun enum vm_entry_failure_code *entry_failure_code)
2529*4882a593Smuzhiyun {
2530*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
2531*4882a593Smuzhiyun struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2532*4882a593Smuzhiyun bool load_guest_pdptrs_vmcs12 = false;
2533*4882a593Smuzhiyun
2534*4882a593Smuzhiyun if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
2535*4882a593Smuzhiyun prepare_vmcs02_rare(vmx, vmcs12);
2536*4882a593Smuzhiyun vmx->nested.dirty_vmcs12 = false;
2537*4882a593Smuzhiyun
2538*4882a593Smuzhiyun load_guest_pdptrs_vmcs12 = !hv_evmcs ||
2539*4882a593Smuzhiyun !(hv_evmcs->hv_clean_fields &
2540*4882a593Smuzhiyun HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
2541*4882a593Smuzhiyun }
2542*4882a593Smuzhiyun
2543*4882a593Smuzhiyun if (vmx->nested.nested_run_pending &&
2544*4882a593Smuzhiyun (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2545*4882a593Smuzhiyun kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2546*4882a593Smuzhiyun vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2547*4882a593Smuzhiyun } else {
2548*4882a593Smuzhiyun kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2549*4882a593Smuzhiyun vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
2550*4882a593Smuzhiyun }
2551*4882a593Smuzhiyun if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2552*4882a593Smuzhiyun !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2553*4882a593Smuzhiyun vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
2554*4882a593Smuzhiyun vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2555*4882a593Smuzhiyun
2556*4882a593Smuzhiyun /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2557*4882a593Smuzhiyun * bitwise-or of what L1 wants to trap for L2, and what we want to
2558*4882a593Smuzhiyun * trap. Note that CR0.TS also needs updating - we do this later.
2559*4882a593Smuzhiyun */
2560*4882a593Smuzhiyun update_exception_bitmap(vcpu);
2561*4882a593Smuzhiyun vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2562*4882a593Smuzhiyun vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2563*4882a593Smuzhiyun
2564*4882a593Smuzhiyun if (vmx->nested.nested_run_pending &&
2565*4882a593Smuzhiyun (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2566*4882a593Smuzhiyun vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2567*4882a593Smuzhiyun vcpu->arch.pat = vmcs12->guest_ia32_pat;
2568*4882a593Smuzhiyun } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2569*4882a593Smuzhiyun vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2570*4882a593Smuzhiyun }
2571*4882a593Smuzhiyun
2572*4882a593Smuzhiyun vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2573*4882a593Smuzhiyun
2574*4882a593Smuzhiyun if (kvm_has_tsc_control)
2575*4882a593Smuzhiyun decache_tsc_multiplier(vmx);
2576*4882a593Smuzhiyun
2577*4882a593Smuzhiyun nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
2578*4882a593Smuzhiyun
2579*4882a593Smuzhiyun if (nested_cpu_has_ept(vmcs12))
2580*4882a593Smuzhiyun nested_ept_init_mmu_context(vcpu);
2581*4882a593Smuzhiyun
2582*4882a593Smuzhiyun /*
2583*4882a593Smuzhiyun * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
2584*4882a593Smuzhiyun * bits which we consider mandatory enabled.
2585*4882a593Smuzhiyun * The CR0_READ_SHADOW is what L2 should have expected to read given
2586*4882a593Smuzhiyun * the specifications by L1; it's not enough to take
2587*4882a593Smuzhiyun * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may have
2588*4882a593Smuzhiyun * more bits set than L1 expected.
2589*4882a593Smuzhiyun */
2590*4882a593Smuzhiyun vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2591*4882a593Smuzhiyun vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2592*4882a593Smuzhiyun
2593*4882a593Smuzhiyun vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2594*4882a593Smuzhiyun vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2595*4882a593Smuzhiyun
2596*4882a593Smuzhiyun vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2597*4882a593Smuzhiyun /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2598*4882a593Smuzhiyun vmx_set_efer(vcpu, vcpu->arch.efer);
2599*4882a593Smuzhiyun
2600*4882a593Smuzhiyun /*
2601*4882a593Smuzhiyun * Guest state is invalid and unrestricted guest is disabled,
2602*4882a593Smuzhiyun * which means L1 attempted VMEntry to L2 with invalid state.
2603*4882a593Smuzhiyun * Fail the VMEntry.
2604*4882a593Smuzhiyun */
2605*4882a593Smuzhiyun if (CC(!vmx_guest_state_valid(vcpu))) {
2606*4882a593Smuzhiyun *entry_failure_code = ENTRY_FAIL_DEFAULT;
2607*4882a593Smuzhiyun return -EINVAL;
2608*4882a593Smuzhiyun }
2609*4882a593Smuzhiyun
2610*4882a593Smuzhiyun /* Shadow L2's page tables, using either EPT or shadow page tables. */
2611*4882a593Smuzhiyun if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2612*4882a593Smuzhiyun entry_failure_code))
2613*4882a593Smuzhiyun return -EINVAL;
2614*4882a593Smuzhiyun
2615*4882a593Smuzhiyun /*
2616*4882a593Smuzhiyun * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
2617*4882a593Smuzhiyun * on nested VM-Exit, which can occur without actually running L2 and
2618*4882a593Smuzhiyun * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
2619*4882a593Smuzhiyun * vmcs12.GUEST_ACTIVITY_STATE=HLT, in which case KVM will intercept the
2620*4882a593Smuzhiyun * transition to HLT instead of running L2.
2621*4882a593Smuzhiyun */
2622*4882a593Smuzhiyun if (enable_ept)
2623*4882a593Smuzhiyun vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
2624*4882a593Smuzhiyun
2625*4882a593Smuzhiyun /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2626*4882a593Smuzhiyun if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2627*4882a593Smuzhiyun is_pae_paging(vcpu)) {
2628*4882a593Smuzhiyun vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2629*4882a593Smuzhiyun vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2630*4882a593Smuzhiyun vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2631*4882a593Smuzhiyun vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2632*4882a593Smuzhiyun }
2633*4882a593Smuzhiyun
2634*4882a593Smuzhiyun if (!enable_ept)
2635*4882a593Smuzhiyun vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
2636*4882a593Smuzhiyun
2637*4882a593Smuzhiyun if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2638*4882a593Smuzhiyun WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
2639*4882a593Smuzhiyun vmcs12->guest_ia32_perf_global_ctrl))) {
2640*4882a593Smuzhiyun *entry_failure_code = ENTRY_FAIL_DEFAULT;
2641*4882a593Smuzhiyun return -EINVAL;
2642*4882a593Smuzhiyun }
2643*4882a593Smuzhiyun
2644*4882a593Smuzhiyun kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2645*4882a593Smuzhiyun kvm_rip_write(vcpu, vmcs12->guest_rip);
2646*4882a593Smuzhiyun return 0;
2647*4882a593Smuzhiyun }
2648*4882a593Smuzhiyun
2649*4882a593Smuzhiyun static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2650*4882a593Smuzhiyun {
2651*4882a593Smuzhiyun if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
2652*4882a593Smuzhiyun nested_cpu_has_virtual_nmis(vmcs12)))
2653*4882a593Smuzhiyun return -EINVAL;
2654*4882a593Smuzhiyun
2655*4882a593Smuzhiyun if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
2656*4882a593Smuzhiyun nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
2657*4882a593Smuzhiyun return -EINVAL;
2658*4882a593Smuzhiyun
2659*4882a593Smuzhiyun return 0;
2660*4882a593Smuzhiyun }
2661*4882a593Smuzhiyun
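/*
 * For reference, the EPTP bit layout that the checks below validate
 * (summarized from the Intel SDM; shown here only as an aid, the SDM is
 * authoritative):
 *
 *	bits  2:0   EPT paging-structure memory type (0 = UC, 6 = WB)
 *	bits  5:3   EPT page-walk length minus 1 (3 -> 4-level, 4 -> 5-level)
 *	bit   6     enable accessed/dirty flags
 *	bits 11:7   reserved here (must be zero for this check)
 *	bits 63:12  physical address of the root EPT paging structure
 */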
2662*4882a593Smuzhiyun static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
2663*4882a593Smuzhiyun {
2664*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
2665*4882a593Smuzhiyun int maxphyaddr = cpuid_maxphyaddr(vcpu);
2666*4882a593Smuzhiyun
2667*4882a593Smuzhiyun /* Check for memory type validity */
2668*4882a593Smuzhiyun switch (new_eptp & VMX_EPTP_MT_MASK) {
2669*4882a593Smuzhiyun case VMX_EPTP_MT_UC:
2670*4882a593Smuzhiyun if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
2671*4882a593Smuzhiyun return false;
2672*4882a593Smuzhiyun break;
2673*4882a593Smuzhiyun case VMX_EPTP_MT_WB:
2674*4882a593Smuzhiyun if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
2675*4882a593Smuzhiyun return false;
2676*4882a593Smuzhiyun break;
2677*4882a593Smuzhiyun default:
2678*4882a593Smuzhiyun return false;
2679*4882a593Smuzhiyun }
2680*4882a593Smuzhiyun
2681*4882a593Smuzhiyun /* Page-walk levels validity. */
2682*4882a593Smuzhiyun switch (new_eptp & VMX_EPTP_PWL_MASK) {
2683*4882a593Smuzhiyun case VMX_EPTP_PWL_5:
2684*4882a593Smuzhiyun if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT)))
2685*4882a593Smuzhiyun return false;
2686*4882a593Smuzhiyun break;
2687*4882a593Smuzhiyun case VMX_EPTP_PWL_4:
2688*4882a593Smuzhiyun if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT)))
2689*4882a593Smuzhiyun return false;
2690*4882a593Smuzhiyun break;
2691*4882a593Smuzhiyun default:
2692*4882a593Smuzhiyun return false;
2693*4882a593Smuzhiyun }
2694*4882a593Smuzhiyun
2695*4882a593Smuzhiyun /* Reserved bits should not be set */
2696*4882a593Smuzhiyun if (CC(new_eptp >> maxphyaddr || ((new_eptp >> 7) & 0x1f)))
2697*4882a593Smuzhiyun return false;
2698*4882a593Smuzhiyun
2699*4882a593Smuzhiyun /* AD, if set, should be supported */
2700*4882a593Smuzhiyun if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) {
2701*4882a593Smuzhiyun if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
2702*4882a593Smuzhiyun return false;
2703*4882a593Smuzhiyun }
2704*4882a593Smuzhiyun
2705*4882a593Smuzhiyun return true;
2706*4882a593Smuzhiyun }
2707*4882a593Smuzhiyun
2708*4882a593Smuzhiyun /*
2709*4882a593Smuzhiyun * Checks related to VM-Execution Control Fields
2710*4882a593Smuzhiyun */
2711*4882a593Smuzhiyun static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2712*4882a593Smuzhiyun struct vmcs12 *vmcs12)
2713*4882a593Smuzhiyun {
2714*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
2715*4882a593Smuzhiyun
2716*4882a593Smuzhiyun if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2717*4882a593Smuzhiyun vmx->nested.msrs.pinbased_ctls_low,
2718*4882a593Smuzhiyun vmx->nested.msrs.pinbased_ctls_high)) ||
2719*4882a593Smuzhiyun CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2720*4882a593Smuzhiyun vmx->nested.msrs.procbased_ctls_low,
2721*4882a593Smuzhiyun vmx->nested.msrs.procbased_ctls_high)))
2722*4882a593Smuzhiyun return -EINVAL;
2723*4882a593Smuzhiyun
2724*4882a593Smuzhiyun if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2725*4882a593Smuzhiyun CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
2726*4882a593Smuzhiyun vmx->nested.msrs.secondary_ctls_low,
2727*4882a593Smuzhiyun vmx->nested.msrs.secondary_ctls_high)))
2728*4882a593Smuzhiyun return -EINVAL;
2729*4882a593Smuzhiyun
2730*4882a593Smuzhiyun if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
2731*4882a593Smuzhiyun nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2732*4882a593Smuzhiyun nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2733*4882a593Smuzhiyun nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2734*4882a593Smuzhiyun nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2735*4882a593Smuzhiyun nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2736*4882a593Smuzhiyun nested_vmx_check_nmi_controls(vmcs12) ||
2737*4882a593Smuzhiyun nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2738*4882a593Smuzhiyun nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2739*4882a593Smuzhiyun nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2740*4882a593Smuzhiyun nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
2741*4882a593Smuzhiyun CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
2742*4882a593Smuzhiyun return -EINVAL;
2743*4882a593Smuzhiyun
2744*4882a593Smuzhiyun if (!nested_cpu_has_preemption_timer(vmcs12) &&
2745*4882a593Smuzhiyun nested_cpu_has_save_preemption_timer(vmcs12))
2746*4882a593Smuzhiyun return -EINVAL;
2747*4882a593Smuzhiyun
2748*4882a593Smuzhiyun if (nested_cpu_has_ept(vmcs12) &&
2749*4882a593Smuzhiyun CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer)))
2750*4882a593Smuzhiyun return -EINVAL;
2751*4882a593Smuzhiyun
2752*4882a593Smuzhiyun if (nested_cpu_has_vmfunc(vmcs12)) {
2753*4882a593Smuzhiyun if (CC(vmcs12->vm_function_control &
2754*4882a593Smuzhiyun ~vmx->nested.msrs.vmfunc_controls))
2755*4882a593Smuzhiyun return -EINVAL;
2756*4882a593Smuzhiyun
2757*4882a593Smuzhiyun if (nested_cpu_has_eptp_switching(vmcs12)) {
2758*4882a593Smuzhiyun if (CC(!nested_cpu_has_ept(vmcs12)) ||
2759*4882a593Smuzhiyun CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
2760*4882a593Smuzhiyun return -EINVAL;
2761*4882a593Smuzhiyun }
2762*4882a593Smuzhiyun }
2763*4882a593Smuzhiyun
2764*4882a593Smuzhiyun return 0;
2765*4882a593Smuzhiyun }
2766*4882a593Smuzhiyun
2767*4882a593Smuzhiyun /*
2768*4882a593Smuzhiyun * Checks related to VM-Exit Control Fields
2769*4882a593Smuzhiyun */
2770*4882a593Smuzhiyun static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2771*4882a593Smuzhiyun struct vmcs12 *vmcs12)
2772*4882a593Smuzhiyun {
2773*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
2774*4882a593Smuzhiyun
2775*4882a593Smuzhiyun if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
2776*4882a593Smuzhiyun vmx->nested.msrs.exit_ctls_low,
2777*4882a593Smuzhiyun vmx->nested.msrs.exit_ctls_high)) ||
2778*4882a593Smuzhiyun CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
2779*4882a593Smuzhiyun return -EINVAL;
2780*4882a593Smuzhiyun
2781*4882a593Smuzhiyun return 0;
2782*4882a593Smuzhiyun }
2783*4882a593Smuzhiyun
2784*4882a593Smuzhiyun /*
2785*4882a593Smuzhiyun * Checks related to VM-Entry Control Fields
2786*4882a593Smuzhiyun */
2787*4882a593Smuzhiyun static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2788*4882a593Smuzhiyun struct vmcs12 *vmcs12)
2789*4882a593Smuzhiyun {
2790*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
2791*4882a593Smuzhiyun
2792*4882a593Smuzhiyun if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
2793*4882a593Smuzhiyun vmx->nested.msrs.entry_ctls_low,
2794*4882a593Smuzhiyun vmx->nested.msrs.entry_ctls_high)))
2795*4882a593Smuzhiyun return -EINVAL;
2796*4882a593Smuzhiyun
2797*4882a593Smuzhiyun /*
2798*4882a593Smuzhiyun * From the Intel SDM, volume 3:
2799*4882a593Smuzhiyun * Fields relevant to VM-entry event injection must be set properly.
2800*4882a593Smuzhiyun * These fields are the VM-entry interruption-information field, the
2801*4882a593Smuzhiyun * VM-entry exception error code, and the VM-entry instruction length.
2802*4882a593Smuzhiyun */
2803*4882a593Smuzhiyun if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
2804*4882a593Smuzhiyun u32 intr_info = vmcs12->vm_entry_intr_info_field;
2805*4882a593Smuzhiyun u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
2806*4882a593Smuzhiyun u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
2807*4882a593Smuzhiyun bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
2808*4882a593Smuzhiyun bool should_have_error_code;
2809*4882a593Smuzhiyun bool urg = nested_cpu_has2(vmcs12,
2810*4882a593Smuzhiyun SECONDARY_EXEC_UNRESTRICTED_GUEST);
2811*4882a593Smuzhiyun bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
2812*4882a593Smuzhiyun
2813*4882a593Smuzhiyun /* VM-entry interruption-info field: interruption type */
2814*4882a593Smuzhiyun if (CC(intr_type == INTR_TYPE_RESERVED) ||
2815*4882a593Smuzhiyun CC(intr_type == INTR_TYPE_OTHER_EVENT &&
2816*4882a593Smuzhiyun !nested_cpu_supports_monitor_trap_flag(vcpu)))
2817*4882a593Smuzhiyun return -EINVAL;
2818*4882a593Smuzhiyun
2819*4882a593Smuzhiyun /* VM-entry interruption-info field: vector */
2820*4882a593Smuzhiyun if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
2821*4882a593Smuzhiyun CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
2822*4882a593Smuzhiyun CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
2823*4882a593Smuzhiyun return -EINVAL;
2824*4882a593Smuzhiyun
2825*4882a593Smuzhiyun /* VM-entry interruption-info field: deliver error code */
2826*4882a593Smuzhiyun should_have_error_code =
2827*4882a593Smuzhiyun intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
2828*4882a593Smuzhiyun x86_exception_has_error_code(vector);
2829*4882a593Smuzhiyun if (CC(has_error_code != should_have_error_code))
2830*4882a593Smuzhiyun return -EINVAL;
2831*4882a593Smuzhiyun
2832*4882a593Smuzhiyun /* VM-entry exception error code */
2833*4882a593Smuzhiyun if (CC(has_error_code &&
2834*4882a593Smuzhiyun vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
2835*4882a593Smuzhiyun return -EINVAL;
2836*4882a593Smuzhiyun
2837*4882a593Smuzhiyun /* VM-entry interruption-info field: reserved bits */
2838*4882a593Smuzhiyun if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
2839*4882a593Smuzhiyun return -EINVAL;
2840*4882a593Smuzhiyun
2841*4882a593Smuzhiyun /* VM-entry instruction length */
2842*4882a593Smuzhiyun switch (intr_type) {
2843*4882a593Smuzhiyun case INTR_TYPE_SOFT_EXCEPTION:
2844*4882a593Smuzhiyun case INTR_TYPE_SOFT_INTR:
2845*4882a593Smuzhiyun case INTR_TYPE_PRIV_SW_EXCEPTION:
2846*4882a593Smuzhiyun if (CC(vmcs12->vm_entry_instruction_len > 15) ||
2847*4882a593Smuzhiyun CC(vmcs12->vm_entry_instruction_len == 0 &&
2848*4882a593Smuzhiyun CC(!nested_cpu_has_zero_length_injection(vcpu))))
2849*4882a593Smuzhiyun return -EINVAL;
2850*4882a593Smuzhiyun }
2851*4882a593Smuzhiyun }
2852*4882a593Smuzhiyun
2853*4882a593Smuzhiyun if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
2854*4882a593Smuzhiyun return -EINVAL;
2855*4882a593Smuzhiyun
2856*4882a593Smuzhiyun return 0;
2857*4882a593Smuzhiyun }
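
/*
 * Example of a vmcs12 event-injection encoding that satisfies the checks
 * above (the values are illustrative, not taken from any real guest):
 * injecting a page fault (#PF, vector 14) with error code 0x2 into a
 * protected-mode L2 would look like
 *
 *	vm_entry_intr_info_field      = INTR_INFO_VALID_MASK |
 *					INTR_TYPE_HARD_EXCEPTION |
 *					INTR_INFO_DELIVER_CODE_MASK |
 *					PF_VECTOR;
 *	vm_entry_exception_error_code = 0x2;
 *	vm_entry_instruction_len      = 0;	(unused for hard exceptions)
 *
 * Omitting INTR_INFO_DELIVER_CODE_MASK here would fail the
 * "deliver error code" consistency check.
 */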
2858*4882a593Smuzhiyun
2859*4882a593Smuzhiyun static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
2860*4882a593Smuzhiyun struct vmcs12 *vmcs12)
2861*4882a593Smuzhiyun {
2862*4882a593Smuzhiyun if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2863*4882a593Smuzhiyun nested_check_vm_exit_controls(vcpu, vmcs12) ||
2864*4882a593Smuzhiyun nested_check_vm_entry_controls(vcpu, vmcs12))
2865*4882a593Smuzhiyun return -EINVAL;
2866*4882a593Smuzhiyun
2867*4882a593Smuzhiyun if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled)
2868*4882a593Smuzhiyun return nested_evmcs_check_controls(vmcs12);
2869*4882a593Smuzhiyun
2870*4882a593Smuzhiyun return 0;
2871*4882a593Smuzhiyun }
2872*4882a593Smuzhiyun
2873*4882a593Smuzhiyun static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
2874*4882a593Smuzhiyun struct vmcs12 *vmcs12)
2875*4882a593Smuzhiyun {
2876*4882a593Smuzhiyun #ifdef CONFIG_X86_64
2877*4882a593Smuzhiyun if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) !=
2878*4882a593Smuzhiyun !!(vcpu->arch.efer & EFER_LMA)))
2879*4882a593Smuzhiyun return -EINVAL;
2880*4882a593Smuzhiyun #endif
2881*4882a593Smuzhiyun return 0;
2882*4882a593Smuzhiyun }
2883*4882a593Smuzhiyun
2884*4882a593Smuzhiyun static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
2885*4882a593Smuzhiyun struct vmcs12 *vmcs12)
2886*4882a593Smuzhiyun {
2887*4882a593Smuzhiyun bool ia32e;
2888*4882a593Smuzhiyun
2889*4882a593Smuzhiyun if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
2890*4882a593Smuzhiyun CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
2891*4882a593Smuzhiyun CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3)))
2892*4882a593Smuzhiyun return -EINVAL;
2893*4882a593Smuzhiyun
2894*4882a593Smuzhiyun if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
2895*4882a593Smuzhiyun CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
2896*4882a593Smuzhiyun return -EINVAL;
2897*4882a593Smuzhiyun
2898*4882a593Smuzhiyun if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
2899*4882a593Smuzhiyun CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
2900*4882a593Smuzhiyun return -EINVAL;
2901*4882a593Smuzhiyun
2902*4882a593Smuzhiyun if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2903*4882a593Smuzhiyun CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
2904*4882a593Smuzhiyun vmcs12->host_ia32_perf_global_ctrl)))
2905*4882a593Smuzhiyun return -EINVAL;
2906*4882a593Smuzhiyun
2907*4882a593Smuzhiyun #ifdef CONFIG_X86_64
2908*4882a593Smuzhiyun ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
2909*4882a593Smuzhiyun #else
2910*4882a593Smuzhiyun ia32e = false;
2911*4882a593Smuzhiyun #endif
2912*4882a593Smuzhiyun
2913*4882a593Smuzhiyun if (ia32e) {
2914*4882a593Smuzhiyun if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
2915*4882a593Smuzhiyun return -EINVAL;
2916*4882a593Smuzhiyun } else {
2917*4882a593Smuzhiyun if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
2918*4882a593Smuzhiyun CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
2919*4882a593Smuzhiyun CC((vmcs12->host_rip) >> 32))
2920*4882a593Smuzhiyun return -EINVAL;
2921*4882a593Smuzhiyun }
2922*4882a593Smuzhiyun
2923*4882a593Smuzhiyun if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2924*4882a593Smuzhiyun CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2925*4882a593Smuzhiyun CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2926*4882a593Smuzhiyun CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2927*4882a593Smuzhiyun CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2928*4882a593Smuzhiyun CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2929*4882a593Smuzhiyun CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2930*4882a593Smuzhiyun CC(vmcs12->host_cs_selector == 0) ||
2931*4882a593Smuzhiyun CC(vmcs12->host_tr_selector == 0) ||
2932*4882a593Smuzhiyun CC(vmcs12->host_ss_selector == 0 && !ia32e))
2933*4882a593Smuzhiyun return -EINVAL;
2934*4882a593Smuzhiyun
2935*4882a593Smuzhiyun if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
2936*4882a593Smuzhiyun CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
2937*4882a593Smuzhiyun CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
2938*4882a593Smuzhiyun CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
2939*4882a593Smuzhiyun CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
2940*4882a593Smuzhiyun CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
2941*4882a593Smuzhiyun return -EINVAL;
2942*4882a593Smuzhiyun
2943*4882a593Smuzhiyun /*
2944*4882a593Smuzhiyun * If the load IA32_EFER VM-exit control is 1, bits reserved in the
2945*4882a593Smuzhiyun * IA32_EFER MSR must be 0 in the field for that register. In addition,
2946*4882a593Smuzhiyun * the values of the LMA and LME bits in the field must each be that of
2947*4882a593Smuzhiyun * the host address-space size VM-exit control.
2948*4882a593Smuzhiyun */
2949*4882a593Smuzhiyun if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
2950*4882a593Smuzhiyun if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
2951*4882a593Smuzhiyun CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
2952*4882a593Smuzhiyun CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
2953*4882a593Smuzhiyun return -EINVAL;
2954*4882a593Smuzhiyun }
2955*4882a593Smuzhiyun
2956*4882a593Smuzhiyun return 0;
2957*4882a593Smuzhiyun }
2958*4882a593Smuzhiyun
2959*4882a593Smuzhiyun static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
2960*4882a593Smuzhiyun struct vmcs12 *vmcs12)
2961*4882a593Smuzhiyun {
2962*4882a593Smuzhiyun int r = 0;
2963*4882a593Smuzhiyun struct vmcs12 *shadow;
2964*4882a593Smuzhiyun struct kvm_host_map map;
2965*4882a593Smuzhiyun
2966*4882a593Smuzhiyun if (vmcs12->vmcs_link_pointer == -1ull)
2967*4882a593Smuzhiyun return 0;
2968*4882a593Smuzhiyun
2969*4882a593Smuzhiyun if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
2970*4882a593Smuzhiyun return -EINVAL;
2971*4882a593Smuzhiyun
2972*4882a593Smuzhiyun if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)))
2973*4882a593Smuzhiyun return -EINVAL;
2974*4882a593Smuzhiyun
2975*4882a593Smuzhiyun shadow = map.hva;
2976*4882a593Smuzhiyun
2977*4882a593Smuzhiyun if (CC(shadow->hdr.revision_id != VMCS12_REVISION) ||
2978*4882a593Smuzhiyun CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
2979*4882a593Smuzhiyun r = -EINVAL;
2980*4882a593Smuzhiyun
2981*4882a593Smuzhiyun kvm_vcpu_unmap(vcpu, &map, false);
2982*4882a593Smuzhiyun return r;
2983*4882a593Smuzhiyun }
2984*4882a593Smuzhiyun
2985*4882a593Smuzhiyun /*
2986*4882a593Smuzhiyun * Checks related to Guest Non-register State
2987*4882a593Smuzhiyun */
2988*4882a593Smuzhiyun static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
2989*4882a593Smuzhiyun {
2990*4882a593Smuzhiyun if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
2991*4882a593Smuzhiyun vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT))
2992*4882a593Smuzhiyun return -EINVAL;
2993*4882a593Smuzhiyun
2994*4882a593Smuzhiyun return 0;
2995*4882a593Smuzhiyun }
2996*4882a593Smuzhiyun
2997*4882a593Smuzhiyun static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
2998*4882a593Smuzhiyun struct vmcs12 *vmcs12,
2999*4882a593Smuzhiyun enum vm_entry_failure_code *entry_failure_code)
3000*4882a593Smuzhiyun {
3001*4882a593Smuzhiyun bool ia32e;
3002*4882a593Smuzhiyun
3003*4882a593Smuzhiyun *entry_failure_code = ENTRY_FAIL_DEFAULT;
3004*4882a593Smuzhiyun
3005*4882a593Smuzhiyun if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
3006*4882a593Smuzhiyun CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
3007*4882a593Smuzhiyun return -EINVAL;
3008*4882a593Smuzhiyun
3009*4882a593Smuzhiyun if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
3010*4882a593Smuzhiyun CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
3011*4882a593Smuzhiyun return -EINVAL;
3012*4882a593Smuzhiyun
3013*4882a593Smuzhiyun if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
3014*4882a593Smuzhiyun CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
3015*4882a593Smuzhiyun return -EINVAL;
3016*4882a593Smuzhiyun
3017*4882a593Smuzhiyun if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
3018*4882a593Smuzhiyun *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
3019*4882a593Smuzhiyun return -EINVAL;
3020*4882a593Smuzhiyun }
3021*4882a593Smuzhiyun
3022*4882a593Smuzhiyun if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
3023*4882a593Smuzhiyun CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
3024*4882a593Smuzhiyun vmcs12->guest_ia32_perf_global_ctrl)))
3025*4882a593Smuzhiyun return -EINVAL;
3026*4882a593Smuzhiyun
3027*4882a593Smuzhiyun /*
3028*4882a593Smuzhiyun * If the load IA32_EFER VM-entry control is 1, the following checks
3029*4882a593Smuzhiyun * are performed on the field for the IA32_EFER MSR:
3030*4882a593Smuzhiyun * - Bits reserved in the IA32_EFER MSR must be 0.
3031*4882a593Smuzhiyun * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
3032*4882a593Smuzhiyun * the IA-32e mode guest VM-exit control. It must also be identical
3033*4882a593Smuzhiyun * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
3034*4882a593Smuzhiyun * CR0.PG) is 1.
3035*4882a593Smuzhiyun */
3036*4882a593Smuzhiyun if (to_vmx(vcpu)->nested.nested_run_pending &&
3037*4882a593Smuzhiyun (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
3038*4882a593Smuzhiyun ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
3039*4882a593Smuzhiyun if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
3040*4882a593Smuzhiyun CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
3041*4882a593Smuzhiyun CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
3042*4882a593Smuzhiyun ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
3043*4882a593Smuzhiyun return -EINVAL;
3044*4882a593Smuzhiyun }
3045*4882a593Smuzhiyun
3046*4882a593Smuzhiyun if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
3047*4882a593Smuzhiyun (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
3048*4882a593Smuzhiyun CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
3049*4882a593Smuzhiyun return -EINVAL;
3050*4882a593Smuzhiyun
3051*4882a593Smuzhiyun if (nested_check_guest_non_reg_state(vmcs12))
3052*4882a593Smuzhiyun return -EINVAL;
3053*4882a593Smuzhiyun
3054*4882a593Smuzhiyun return 0;
3055*4882a593Smuzhiyun }
3056*4882a593Smuzhiyun
3057*4882a593Smuzhiyun static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
3058*4882a593Smuzhiyun {
3059*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
3060*4882a593Smuzhiyun unsigned long cr3, cr4;
3061*4882a593Smuzhiyun bool vm_fail;
3062*4882a593Smuzhiyun
3063*4882a593Smuzhiyun if (!nested_early_check)
3064*4882a593Smuzhiyun return 0;
3065*4882a593Smuzhiyun
3066*4882a593Smuzhiyun if (vmx->msr_autoload.host.nr)
3067*4882a593Smuzhiyun vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
3068*4882a593Smuzhiyun if (vmx->msr_autoload.guest.nr)
3069*4882a593Smuzhiyun vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
3070*4882a593Smuzhiyun
3071*4882a593Smuzhiyun preempt_disable();
3072*4882a593Smuzhiyun
3073*4882a593Smuzhiyun vmx_prepare_switch_to_guest(vcpu);
3074*4882a593Smuzhiyun
3075*4882a593Smuzhiyun /*
3076*4882a593Smuzhiyun * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
3077*4882a593Smuzhiyun * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
3078*4882a593Smuzhiyun * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
3079*4882a593Smuzhiyun * there is no need to preserve other bits or save/restore the field.
3080*4882a593Smuzhiyun */
3081*4882a593Smuzhiyun vmcs_writel(GUEST_RFLAGS, 0);
3082*4882a593Smuzhiyun
3083*4882a593Smuzhiyun cr3 = __get_current_cr3_fast();
3084*4882a593Smuzhiyun if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
3085*4882a593Smuzhiyun vmcs_writel(HOST_CR3, cr3);
3086*4882a593Smuzhiyun vmx->loaded_vmcs->host_state.cr3 = cr3;
3087*4882a593Smuzhiyun }
3088*4882a593Smuzhiyun
3089*4882a593Smuzhiyun cr4 = cr4_read_shadow();
3090*4882a593Smuzhiyun if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
3091*4882a593Smuzhiyun vmcs_writel(HOST_CR4, cr4);
3092*4882a593Smuzhiyun vmx->loaded_vmcs->host_state.cr4 = cr4;
3093*4882a593Smuzhiyun }
3094*4882a593Smuzhiyun
3095*4882a593Smuzhiyun vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
3096*4882a593Smuzhiyun __vmx_vcpu_run_flags(vmx));
3097*4882a593Smuzhiyun
3098*4882a593Smuzhiyun if (vmx->msr_autoload.host.nr)
3099*4882a593Smuzhiyun vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
3100*4882a593Smuzhiyun if (vmx->msr_autoload.guest.nr)
3101*4882a593Smuzhiyun vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
3102*4882a593Smuzhiyun
3103*4882a593Smuzhiyun if (vm_fail) {
3104*4882a593Smuzhiyun u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
3105*4882a593Smuzhiyun
3106*4882a593Smuzhiyun preempt_enable();
3107*4882a593Smuzhiyun
3108*4882a593Smuzhiyun trace_kvm_nested_vmenter_failed(
3109*4882a593Smuzhiyun "early hardware check VM-instruction error: ", error);
3110*4882a593Smuzhiyun WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3111*4882a593Smuzhiyun return 1;
3112*4882a593Smuzhiyun }
3113*4882a593Smuzhiyun
3114*4882a593Smuzhiyun /*
3115*4882a593Smuzhiyun * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
3116*4882a593Smuzhiyun */
3117*4882a593Smuzhiyun if (hw_breakpoint_active())
3118*4882a593Smuzhiyun set_debugreg(__this_cpu_read(cpu_dr7), 7);
3119*4882a593Smuzhiyun local_irq_enable();
3120*4882a593Smuzhiyun preempt_enable();
3121*4882a593Smuzhiyun
3122*4882a593Smuzhiyun /*
3123*4882a593Smuzhiyun * A non-failing VMEntry means we somehow entered guest mode with
3124*4882a593Smuzhiyun * an illegal RIP, and that's just the tip of the iceberg. There
3125*4882a593Smuzhiyun * is no telling what memory has been modified or what state has
3126*4882a593Smuzhiyun * been exposed to unknown code. Hitting this all but guarantees
3127*4882a593Smuzhiyun * a (very critical) hardware issue.
3128*4882a593Smuzhiyun */
3129*4882a593Smuzhiyun WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
3130*4882a593Smuzhiyun VMX_EXIT_REASONS_FAILED_VMENTRY));
3131*4882a593Smuzhiyun
3132*4882a593Smuzhiyun return 0;
3133*4882a593Smuzhiyun }
3134*4882a593Smuzhiyun
3135*4882a593Smuzhiyun static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
3136*4882a593Smuzhiyun {
3137*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
3138*4882a593Smuzhiyun
3139*4882a593Smuzhiyun /*
3140*4882a593Smuzhiyun 	 * hv_evmcs may end up not being mapped after migration (when
3141*4882a593Smuzhiyun * L2 was running), map it here to make sure vmcs12 changes are
3142*4882a593Smuzhiyun * properly reflected.
3143*4882a593Smuzhiyun */
3144*4882a593Smuzhiyun if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) {
3145*4882a593Smuzhiyun enum nested_evmptrld_status evmptrld_status =
3146*4882a593Smuzhiyun nested_vmx_handle_enlightened_vmptrld(vcpu, false);
3147*4882a593Smuzhiyun
3148*4882a593Smuzhiyun if (evmptrld_status == EVMPTRLD_VMFAIL ||
3149*4882a593Smuzhiyun evmptrld_status == EVMPTRLD_ERROR)
3150*4882a593Smuzhiyun return false;
3151*4882a593Smuzhiyun }
3152*4882a593Smuzhiyun
3153*4882a593Smuzhiyun return true;
3154*4882a593Smuzhiyun }
3155*4882a593Smuzhiyun
3156*4882a593Smuzhiyun static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
3157*4882a593Smuzhiyun {
3158*4882a593Smuzhiyun struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3159*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
3160*4882a593Smuzhiyun struct kvm_host_map *map;
3161*4882a593Smuzhiyun struct page *page;
3162*4882a593Smuzhiyun u64 hpa;
3163*4882a593Smuzhiyun
3164*4882a593Smuzhiyun if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3165*4882a593Smuzhiyun /*
3166*4882a593Smuzhiyun * Translate L1 physical address to host physical
3167*4882a593Smuzhiyun * address for vmcs02. Keep the page pinned, so this
3168*4882a593Smuzhiyun * physical address remains valid. We keep a reference
3169*4882a593Smuzhiyun * to it so we can release it later.
3170*4882a593Smuzhiyun */
3171*4882a593Smuzhiyun if (vmx->nested.apic_access_page) { /* shouldn't happen */
3172*4882a593Smuzhiyun kvm_release_page_clean(vmx->nested.apic_access_page);
3173*4882a593Smuzhiyun vmx->nested.apic_access_page = NULL;
3174*4882a593Smuzhiyun }
3175*4882a593Smuzhiyun page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
3176*4882a593Smuzhiyun if (!is_error_page(page)) {
3177*4882a593Smuzhiyun vmx->nested.apic_access_page = page;
3178*4882a593Smuzhiyun hpa = page_to_phys(vmx->nested.apic_access_page);
3179*4882a593Smuzhiyun vmcs_write64(APIC_ACCESS_ADDR, hpa);
3180*4882a593Smuzhiyun } else {
3181*4882a593Smuzhiyun pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n",
3182*4882a593Smuzhiyun __func__);
3183*4882a593Smuzhiyun vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3184*4882a593Smuzhiyun vcpu->run->internal.suberror =
3185*4882a593Smuzhiyun KVM_INTERNAL_ERROR_EMULATION;
3186*4882a593Smuzhiyun vcpu->run->internal.ndata = 0;
3187*4882a593Smuzhiyun return false;
3188*4882a593Smuzhiyun }
3189*4882a593Smuzhiyun }
3190*4882a593Smuzhiyun
3191*4882a593Smuzhiyun if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3192*4882a593Smuzhiyun map = &vmx->nested.virtual_apic_map;
3193*4882a593Smuzhiyun
3194*4882a593Smuzhiyun if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
3195*4882a593Smuzhiyun vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
3196*4882a593Smuzhiyun } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
3197*4882a593Smuzhiyun nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
3198*4882a593Smuzhiyun !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3199*4882a593Smuzhiyun /*
3200*4882a593Smuzhiyun * The processor will never use the TPR shadow, simply
3201*4882a593Smuzhiyun * clear the bit from the execution control. Such a
3202*4882a593Smuzhiyun * configuration is useless, but it happens in tests.
3203*4882a593Smuzhiyun * For any other configuration, failing the vm entry is
3204*4882a593Smuzhiyun * _not_ what the processor does but it's basically the
3205*4882a593Smuzhiyun * only possibility we have.
3206*4882a593Smuzhiyun */
3207*4882a593Smuzhiyun exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
3208*4882a593Smuzhiyun } else {
3209*4882a593Smuzhiyun /*
3210*4882a593Smuzhiyun * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
3211*4882a593Smuzhiyun * force VM-Entry to fail.
3212*4882a593Smuzhiyun */
3213*4882a593Smuzhiyun vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
3214*4882a593Smuzhiyun }
3215*4882a593Smuzhiyun }
3216*4882a593Smuzhiyun
3217*4882a593Smuzhiyun if (nested_cpu_has_posted_intr(vmcs12)) {
3218*4882a593Smuzhiyun map = &vmx->nested.pi_desc_map;
3219*4882a593Smuzhiyun
3220*4882a593Smuzhiyun if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
3221*4882a593Smuzhiyun vmx->nested.pi_desc =
3222*4882a593Smuzhiyun (struct pi_desc *)(((void *)map->hva) +
3223*4882a593Smuzhiyun offset_in_page(vmcs12->posted_intr_desc_addr));
3224*4882a593Smuzhiyun vmcs_write64(POSTED_INTR_DESC_ADDR,
3225*4882a593Smuzhiyun pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
3226*4882a593Smuzhiyun }
3227*4882a593Smuzhiyun }
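	/*
	 * Use the merged MSR bitmap only if it was successfully built;
	 * otherwise intercept all MSR accesses from L2.
	 */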
3228*4882a593Smuzhiyun if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
3229*4882a593Smuzhiyun exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3230*4882a593Smuzhiyun else
3231*4882a593Smuzhiyun exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3232*4882a593Smuzhiyun
3233*4882a593Smuzhiyun return true;
3234*4882a593Smuzhiyun }
3235*4882a593Smuzhiyun
3236*4882a593Smuzhiyun static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
3237*4882a593Smuzhiyun {
3238*4882a593Smuzhiyun if (!nested_get_evmcs_page(vcpu)) {
3239*4882a593Smuzhiyun pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
3240*4882a593Smuzhiyun __func__);
3241*4882a593Smuzhiyun vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3242*4882a593Smuzhiyun vcpu->run->internal.suberror =
3243*4882a593Smuzhiyun KVM_INTERNAL_ERROR_EMULATION;
3244*4882a593Smuzhiyun vcpu->run->internal.ndata = 0;
3245*4882a593Smuzhiyun
3246*4882a593Smuzhiyun return false;
3247*4882a593Smuzhiyun }
3248*4882a593Smuzhiyun
3249*4882a593Smuzhiyun if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
3250*4882a593Smuzhiyun return false;
3251*4882a593Smuzhiyun
3252*4882a593Smuzhiyun return true;
3253*4882a593Smuzhiyun }
3254*4882a593Smuzhiyun
3255*4882a593Smuzhiyun static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
3256*4882a593Smuzhiyun {
3257*4882a593Smuzhiyun struct vmcs12 *vmcs12;
3258*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
3259*4882a593Smuzhiyun gpa_t dst;
3260*4882a593Smuzhiyun
3261*4882a593Smuzhiyun if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
3262*4882a593Smuzhiyun return 0;
3263*4882a593Smuzhiyun
3264*4882a593Smuzhiyun if (WARN_ON_ONCE(vmx->nested.pml_full))
3265*4882a593Smuzhiyun return 1;
3266*4882a593Smuzhiyun
3267*4882a593Smuzhiyun /*
3268*4882a593Smuzhiyun * Check if PML is enabled for the nested guest. Whether eptp bit 6 is
3269*4882a593Smuzhiyun * set is already checked as part of A/D emulation.
3270*4882a593Smuzhiyun */
3271*4882a593Smuzhiyun vmcs12 = get_vmcs12(vcpu);
3272*4882a593Smuzhiyun if (!nested_cpu_has_pml(vmcs12))
3273*4882a593Smuzhiyun return 0;
3274*4882a593Smuzhiyun
3275*4882a593Smuzhiyun if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
3276*4882a593Smuzhiyun vmx->nested.pml_full = true;
3277*4882a593Smuzhiyun return 1;
3278*4882a593Smuzhiyun }
3279*4882a593Smuzhiyun
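	/*
	 * Emulate PML for L1: record the 4KiB-aligned GPA in the next free
	 * slot of vmcs12's PML buffer, decrementing the index just like
	 * hardware does.
	 */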
3280*4882a593Smuzhiyun gpa &= ~0xFFFull;
3281*4882a593Smuzhiyun dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
3282*4882a593Smuzhiyun
3283*4882a593Smuzhiyun if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
3284*4882a593Smuzhiyun offset_in_page(dst), sizeof(gpa)))
3285*4882a593Smuzhiyun return 0;
3286*4882a593Smuzhiyun
3287*4882a593Smuzhiyun vmcs12->guest_pml_index--;
3288*4882a593Smuzhiyun
3289*4882a593Smuzhiyun return 0;
3290*4882a593Smuzhiyun }
3291*4882a593Smuzhiyun
3292*4882a593Smuzhiyun /*
3293*4882a593Smuzhiyun * Intel's VMX Instruction Reference specifies a common set of prerequisites
3294*4882a593Smuzhiyun * for running VMX instructions (except VMXON, whose prerequisites are
3295*4882a593Smuzhiyun * slightly different). It also specifies what exception to inject otherwise.
3296*4882a593Smuzhiyun * Note that many of these exceptions have priority over VM exits, so they
3297*4882a593Smuzhiyun * don't have to be checked again here.
3298*4882a593Smuzhiyun */
3299*4882a593Smuzhiyun static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
3300*4882a593Smuzhiyun {
3301*4882a593Smuzhiyun if (!to_vmx(vcpu)->nested.vmxon) {
3302*4882a593Smuzhiyun kvm_queue_exception(vcpu, UD_VECTOR);
3303*4882a593Smuzhiyun return 0;
3304*4882a593Smuzhiyun }
3305*4882a593Smuzhiyun
3306*4882a593Smuzhiyun if (vmx_get_cpl(vcpu)) {
3307*4882a593Smuzhiyun kvm_inject_gp(vcpu, 0);
3308*4882a593Smuzhiyun return 0;
3309*4882a593Smuzhiyun }
3310*4882a593Smuzhiyun
3311*4882a593Smuzhiyun return 1;
3312*4882a593Smuzhiyun }
3313*4882a593Smuzhiyun
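/*
 * Return whether the highest-priority pending virtual interrupt (RVI)
 * outranks the in-service priority (VPPR), i.e. whether APICv has a
 * deliverable virtual interrupt.
 */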
3314*4882a593Smuzhiyun static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
3315*4882a593Smuzhiyun {
3316*4882a593Smuzhiyun u8 rvi = vmx_get_rvi();
3317*4882a593Smuzhiyun u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
3318*4882a593Smuzhiyun
3319*4882a593Smuzhiyun return ((rvi & 0xf0) > (vppr & 0xf0));
3320*4882a593Smuzhiyun }
3321*4882a593Smuzhiyun
3322*4882a593Smuzhiyun static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3323*4882a593Smuzhiyun struct vmcs12 *vmcs12);
3324*4882a593Smuzhiyun
3325*4882a593Smuzhiyun /*
3326*4882a593Smuzhiyun * If from_vmentry is false, this is being called from state restore (either RSM
3327*4882a593Smuzhiyun * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
3328*4882a593Smuzhiyun *
3329*4882a593Smuzhiyun * Returns:
3330*4882a593Smuzhiyun * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
3331*4882a593Smuzhiyun * NVMX_VMENTRY_VMFAIL: Consistency check VMFail
3332*4882a593Smuzhiyun * NVMX_VMENTRY_VMEXIT: Consistency check VMExit
3333*4882a593Smuzhiyun * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
3334*4882a593Smuzhiyun */
3335*4882a593Smuzhiyun enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
3336*4882a593Smuzhiyun bool from_vmentry)
3337*4882a593Smuzhiyun {
3338*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
3339*4882a593Smuzhiyun struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3340*4882a593Smuzhiyun enum vm_entry_failure_code entry_failure_code;
3341*4882a593Smuzhiyun bool evaluate_pending_interrupts;
3342*4882a593Smuzhiyun union vmx_exit_reason exit_reason = {
3343*4882a593Smuzhiyun .basic = EXIT_REASON_INVALID_STATE,
3344*4882a593Smuzhiyun .failed_vmentry = 1,
3345*4882a593Smuzhiyun };
3346*4882a593Smuzhiyun u32 failed_index;
3347*4882a593Smuzhiyun
3348*4882a593Smuzhiyun if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
3349*4882a593Smuzhiyun kvm_vcpu_flush_tlb_current(vcpu);
3350*4882a593Smuzhiyun
3351*4882a593Smuzhiyun evaluate_pending_interrupts = exec_controls_get(vmx) &
3352*4882a593Smuzhiyun (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
3353*4882a593Smuzhiyun if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
3354*4882a593Smuzhiyun evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
3355*4882a593Smuzhiyun
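	/*
	 * Cache L1's DEBUGCTL and BNDCFGS from vmcs01 unless a pending
	 * VM-Entry will load fresh values from vmcs12; the cached values are
	 * used to populate vmcs02 and to restore L1 state on VM-Exit.
	 */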
3356*4882a593Smuzhiyun if (!vmx->nested.nested_run_pending ||
3357*4882a593Smuzhiyun !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
3358*4882a593Smuzhiyun vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
3359*4882a593Smuzhiyun if (kvm_mpx_supported() &&
3360*4882a593Smuzhiyun (!vmx->nested.nested_run_pending ||
3361*4882a593Smuzhiyun !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
3362*4882a593Smuzhiyun vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3363*4882a593Smuzhiyun
3364*4882a593Smuzhiyun /*
3365*4882a593Smuzhiyun * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3366*4882a593Smuzhiyun * nested early checks are disabled. In the event of a "late" VM-Fail,
3367*4882a593Smuzhiyun * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3368*4882a593Smuzhiyun * software model to the pre-VMEntry host state. When EPT is disabled,
3369*4882a593Smuzhiyun * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3370*4882a593Smuzhiyun * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
3371*4882a593Smuzhiyun * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3372*4882a593Smuzhiyun * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
3373*4882a593Smuzhiyun * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3374*4882a593Smuzhiyun * guaranteed to be overwritten with a shadow CR3 prior to re-entering
3375*4882a593Smuzhiyun * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3376*4882a593Smuzhiyun * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3377*4882a593Smuzhiyun * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3378*4882a593Smuzhiyun * path would need to manually save/restore vmcs01.GUEST_CR3.
3379*4882a593Smuzhiyun */
3380*4882a593Smuzhiyun if (!enable_ept && !nested_early_check)
3381*4882a593Smuzhiyun vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3382*4882a593Smuzhiyun
3383*4882a593Smuzhiyun vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3384*4882a593Smuzhiyun
3385*4882a593Smuzhiyun prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
3386*4882a593Smuzhiyun
3387*4882a593Smuzhiyun if (from_vmentry) {
3388*4882a593Smuzhiyun if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
3389*4882a593Smuzhiyun vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3390*4882a593Smuzhiyun return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
3391*4882a593Smuzhiyun }
3392*4882a593Smuzhiyun
3393*4882a593Smuzhiyun if (nested_vmx_check_vmentry_hw(vcpu)) {
3394*4882a593Smuzhiyun vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3395*4882a593Smuzhiyun return NVMX_VMENTRY_VMFAIL;
3396*4882a593Smuzhiyun }
3397*4882a593Smuzhiyun
3398*4882a593Smuzhiyun if (nested_vmx_check_guest_state(vcpu, vmcs12,
3399*4882a593Smuzhiyun &entry_failure_code)) {
3400*4882a593Smuzhiyun exit_reason.basic = EXIT_REASON_INVALID_STATE;
3401*4882a593Smuzhiyun vmcs12->exit_qualification = entry_failure_code;
3402*4882a593Smuzhiyun goto vmentry_fail_vmexit;
3403*4882a593Smuzhiyun }
3404*4882a593Smuzhiyun }
3405*4882a593Smuzhiyun
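	/*
	 * From here on, KVM's software model treats the vCPU as running L2;
	 * any failure below must unwind through the
	 * vmentry_fail_vmexit_guest_mode path.
	 */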
3406*4882a593Smuzhiyun enter_guest_mode(vcpu);
3407*4882a593Smuzhiyun if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
3408*4882a593Smuzhiyun vcpu->arch.tsc_offset += vmcs12->tsc_offset;
3409*4882a593Smuzhiyun
3410*4882a593Smuzhiyun if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) {
3411*4882a593Smuzhiyun exit_reason.basic = EXIT_REASON_INVALID_STATE;
3412*4882a593Smuzhiyun vmcs12->exit_qualification = entry_failure_code;
3413*4882a593Smuzhiyun goto vmentry_fail_vmexit_guest_mode;
3414*4882a593Smuzhiyun }
3415*4882a593Smuzhiyun
3416*4882a593Smuzhiyun if (from_vmentry) {
3417*4882a593Smuzhiyun failed_index = nested_vmx_load_msr(vcpu,
3418*4882a593Smuzhiyun vmcs12->vm_entry_msr_load_addr,
3419*4882a593Smuzhiyun vmcs12->vm_entry_msr_load_count);
3420*4882a593Smuzhiyun if (failed_index) {
3421*4882a593Smuzhiyun exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
3422*4882a593Smuzhiyun vmcs12->exit_qualification = failed_index;
3423*4882a593Smuzhiyun goto vmentry_fail_vmexit_guest_mode;
3424*4882a593Smuzhiyun }
3425*4882a593Smuzhiyun } else {
3426*4882a593Smuzhiyun /*
3427*4882a593Smuzhiyun * The MMU is not initialized to point at the right entities yet and
3428*4882a593Smuzhiyun * "get pages" would need to read data from the guest (i.e. we will
3429*4882a593Smuzhiyun * need to perform gpa to hpa translation). Request a call
3430*4882a593Smuzhiyun * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
3431*4882a593Smuzhiyun * have already been set at vmentry time and should not be reset.
3432*4882a593Smuzhiyun */
3433*4882a593Smuzhiyun kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
3434*4882a593Smuzhiyun }
3435*4882a593Smuzhiyun
3436*4882a593Smuzhiyun /*
3437*4882a593Smuzhiyun * If L1 had a pending IRQ/NMI until it executed
3438*4882a593Smuzhiyun * VMLAUNCH/VMRESUME which wasn't delivered because it was
3439*4882a593Smuzhiyun * disallowed (e.g. interrupts disabled), L0 needs to
3440*4882a593Smuzhiyun * evaluate if this pending event should cause an exit from L2
3441*4882a593Smuzhiyun 	 * to L1 or delivered directly to L2 (e.g. in case L1 doesn't
3442*4882a593Smuzhiyun 	 * intercept EXTERNAL_INTERRUPT).
3443*4882a593Smuzhiyun *
3444*4882a593Smuzhiyun * Usually this would be handled by the processor noticing an
3445*4882a593Smuzhiyun * IRQ/NMI window request, or checking RVI during evaluation of
3446*4882a593Smuzhiyun * pending virtual interrupts. However, this setting was done
3447*4882a593Smuzhiyun * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
3448*4882a593Smuzhiyun * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
3449*4882a593Smuzhiyun */
3450*4882a593Smuzhiyun if (unlikely(evaluate_pending_interrupts))
3451*4882a593Smuzhiyun kvm_make_request(KVM_REQ_EVENT, vcpu);
3452*4882a593Smuzhiyun
3453*4882a593Smuzhiyun /*
3454*4882a593Smuzhiyun * Do not start the preemption timer hrtimer until after we know
3455*4882a593Smuzhiyun * we are successful, so that only nested_vmx_vmexit needs to cancel
3456*4882a593Smuzhiyun * the timer.
3457*4882a593Smuzhiyun */
3458*4882a593Smuzhiyun vmx->nested.preemption_timer_expired = false;
3459*4882a593Smuzhiyun if (nested_cpu_has_preemption_timer(vmcs12)) {
3460*4882a593Smuzhiyun u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
3461*4882a593Smuzhiyun vmx_start_preemption_timer(vcpu, timer_value);
3462*4882a593Smuzhiyun }
3463*4882a593Smuzhiyun
3464*4882a593Smuzhiyun /*
3465*4882a593Smuzhiyun * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3466*4882a593Smuzhiyun * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3467*4882a593Smuzhiyun * returned as far as L1 is concerned. It will only return (and set
3468*4882a593Smuzhiyun * the success flag) when L2 exits (see nested_vmx_vmexit()).
3469*4882a593Smuzhiyun */
3470*4882a593Smuzhiyun return NVMX_VMENTRY_SUCCESS;
3471*4882a593Smuzhiyun
3472*4882a593Smuzhiyun /*
3473*4882a593Smuzhiyun * A failed consistency check that leads to a VMExit during L1's
3474*4882a593Smuzhiyun * VMEnter to L2 is a variation of a normal VMexit, as explained in
3475*4882a593Smuzhiyun * 26.7 "VM-entry failures during or after loading guest state".
3476*4882a593Smuzhiyun */
3477*4882a593Smuzhiyun vmentry_fail_vmexit_guest_mode:
3478*4882a593Smuzhiyun if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
3479*4882a593Smuzhiyun vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3480*4882a593Smuzhiyun leave_guest_mode(vcpu);
3481*4882a593Smuzhiyun
3482*4882a593Smuzhiyun vmentry_fail_vmexit:
3483*4882a593Smuzhiyun vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3484*4882a593Smuzhiyun
3485*4882a593Smuzhiyun if (!from_vmentry)
3486*4882a593Smuzhiyun return NVMX_VMENTRY_VMEXIT;
3487*4882a593Smuzhiyun
3488*4882a593Smuzhiyun load_vmcs12_host_state(vcpu, vmcs12);
3489*4882a593Smuzhiyun vmcs12->vm_exit_reason = exit_reason.full;
3490*4882a593Smuzhiyun if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
3491*4882a593Smuzhiyun vmx->nested.need_vmcs12_to_shadow_sync = true;
3492*4882a593Smuzhiyun return NVMX_VMENTRY_VMEXIT;
3493*4882a593Smuzhiyun }
3494*4882a593Smuzhiyun
3495*4882a593Smuzhiyun /*
3496*4882a593Smuzhiyun * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3497*4882a593Smuzhiyun * for running an L2 nested guest.
3498*4882a593Smuzhiyun */
3499*4882a593Smuzhiyun static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3500*4882a593Smuzhiyun {
3501*4882a593Smuzhiyun struct vmcs12 *vmcs12;
3502*4882a593Smuzhiyun enum nvmx_vmentry_status status;
3503*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
3504*4882a593Smuzhiyun u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
3505*4882a593Smuzhiyun enum nested_evmptrld_status evmptrld_status;
3506*4882a593Smuzhiyun
3507*4882a593Smuzhiyun if (!nested_vmx_check_permission(vcpu))
3508*4882a593Smuzhiyun return 1;
3509*4882a593Smuzhiyun
3510*4882a593Smuzhiyun evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
3511*4882a593Smuzhiyun if (evmptrld_status == EVMPTRLD_ERROR) {
3512*4882a593Smuzhiyun kvm_queue_exception(vcpu, UD_VECTOR);
3513*4882a593Smuzhiyun return 1;
3514*4882a593Smuzhiyun } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) {
3515*4882a593Smuzhiyun return nested_vmx_failInvalid(vcpu);
3516*4882a593Smuzhiyun }
3517*4882a593Smuzhiyun
3518*4882a593Smuzhiyun if (CC(!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull))
3519*4882a593Smuzhiyun return nested_vmx_failInvalid(vcpu);
3520*4882a593Smuzhiyun
3521*4882a593Smuzhiyun vmcs12 = get_vmcs12(vcpu);
3522*4882a593Smuzhiyun
3523*4882a593Smuzhiyun /*
3524*4882a593Smuzhiyun * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3525*4882a593Smuzhiyun * that there *is* a valid VMCS pointer, RFLAGS.CF is set
3526*4882a593Smuzhiyun * rather than RFLAGS.ZF, and no error number is stored to the
3527*4882a593Smuzhiyun * VM-instruction error field.
3528*4882a593Smuzhiyun */
3529*4882a593Smuzhiyun if (CC(vmcs12->hdr.shadow_vmcs))
3530*4882a593Smuzhiyun return nested_vmx_failInvalid(vcpu);
3531*4882a593Smuzhiyun
3532*4882a593Smuzhiyun if (vmx->nested.hv_evmcs) {
3533*4882a593Smuzhiyun copy_enlightened_to_vmcs12(vmx);
3534*4882a593Smuzhiyun /* Enlightened VMCS doesn't have launch state */
3535*4882a593Smuzhiyun vmcs12->launch_state = !launch;
3536*4882a593Smuzhiyun } else if (enable_shadow_vmcs) {
3537*4882a593Smuzhiyun copy_shadow_to_vmcs12(vmx);
3538*4882a593Smuzhiyun }
3539*4882a593Smuzhiyun
3540*4882a593Smuzhiyun /*
3541*4882a593Smuzhiyun * The nested entry process starts with enforcing various prerequisites
3542*4882a593Smuzhiyun 	 * on vmcs12 as required by the Intel SDM, and acting appropriately when
3543*4882a593Smuzhiyun 	 * they fail: as the SDM explains, some conditions should cause the
3544*4882a593Smuzhiyun * instruction to fail, while others will cause the instruction to seem
3545*4882a593Smuzhiyun * to succeed, but return an EXIT_REASON_INVALID_STATE.
3546*4882a593Smuzhiyun * To speed up the normal (success) code path, we should avoid checking
3547*4882a593Smuzhiyun * for misconfigurations which will anyway be caught by the processor
3548*4882a593Smuzhiyun * when using the merged vmcs02.
3549*4882a593Smuzhiyun */
3550*4882a593Smuzhiyun if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
3551*4882a593Smuzhiyun return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
3552*4882a593Smuzhiyun
3553*4882a593Smuzhiyun if (CC(vmcs12->launch_state == launch))
3554*4882a593Smuzhiyun return nested_vmx_fail(vcpu,
3555*4882a593Smuzhiyun launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3556*4882a593Smuzhiyun : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3557*4882a593Smuzhiyun
3558*4882a593Smuzhiyun if (nested_vmx_check_controls(vcpu, vmcs12))
3559*4882a593Smuzhiyun return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3560*4882a593Smuzhiyun
3561*4882a593Smuzhiyun if (nested_vmx_check_address_space_size(vcpu, vmcs12))
3562*4882a593Smuzhiyun return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3563*4882a593Smuzhiyun
3564*4882a593Smuzhiyun if (nested_vmx_check_host_state(vcpu, vmcs12))
3565*4882a593Smuzhiyun return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3566*4882a593Smuzhiyun
3567*4882a593Smuzhiyun /*
3568*4882a593Smuzhiyun * We're finally done with prerequisite checking, and can start with
3569*4882a593Smuzhiyun * the nested entry.
3570*4882a593Smuzhiyun */
3571*4882a593Smuzhiyun vmx->nested.nested_run_pending = 1;
3572*4882a593Smuzhiyun vmx->nested.has_preemption_timer_deadline = false;
3573*4882a593Smuzhiyun status = nested_vmx_enter_non_root_mode(vcpu, true);
3574*4882a593Smuzhiyun if (unlikely(status != NVMX_VMENTRY_SUCCESS))
3575*4882a593Smuzhiyun goto vmentry_failed;
3576*4882a593Smuzhiyun
3577*4882a593Smuzhiyun /* Emulate processing of posted interrupts on VM-Enter. */
3578*4882a593Smuzhiyun if (nested_cpu_has_posted_intr(vmcs12) &&
3579*4882a593Smuzhiyun kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
3580*4882a593Smuzhiyun vmx->nested.pi_pending = true;
3581*4882a593Smuzhiyun kvm_make_request(KVM_REQ_EVENT, vcpu);
3582*4882a593Smuzhiyun kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
3583*4882a593Smuzhiyun }
3584*4882a593Smuzhiyun
3585*4882a593Smuzhiyun /* Hide L1D cache contents from the nested guest. */
3586*4882a593Smuzhiyun vmx->vcpu.arch.l1tf_flush_l1d = true;
3587*4882a593Smuzhiyun
3588*4882a593Smuzhiyun /*
3589*4882a593Smuzhiyun * Must happen outside of nested_vmx_enter_non_root_mode() as it will
3590*4882a593Smuzhiyun * also be used as part of restoring nVMX state for
3591*4882a593Smuzhiyun * snapshot restore (migration).
3592*4882a593Smuzhiyun *
3593*4882a593Smuzhiyun * In this flow, it is assumed that vmcs12 cache was
3594*4882a593Smuzhiyun 	 * transferred as part of captured nVMX state and should
3595*4882a593Smuzhiyun * therefore not be read from guest memory (which may not
3596*4882a593Smuzhiyun * exist on destination host yet).
3597*4882a593Smuzhiyun */
3598*4882a593Smuzhiyun nested_cache_shadow_vmcs12(vcpu, vmcs12);
3599*4882a593Smuzhiyun
3600*4882a593Smuzhiyun /*
3601*4882a593Smuzhiyun * If we're entering a halted L2 vcpu and the L2 vcpu won't be
3602*4882a593Smuzhiyun * awakened by event injection or by an NMI-window VM-exit or
3603*4882a593Smuzhiyun * by an interrupt-window VM-exit, halt the vcpu.
3604*4882a593Smuzhiyun */
3605*4882a593Smuzhiyun if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
3606*4882a593Smuzhiyun !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3607*4882a593Smuzhiyun !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_NMI_WINDOW_EXITING) &&
3608*4882a593Smuzhiyun !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_INTR_WINDOW_EXITING) &&
3609*4882a593Smuzhiyun (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3610*4882a593Smuzhiyun vmx->nested.nested_run_pending = 0;
3611*4882a593Smuzhiyun return kvm_vcpu_halt(vcpu);
3612*4882a593Smuzhiyun }
3613*4882a593Smuzhiyun return 1;
3614*4882a593Smuzhiyun
3615*4882a593Smuzhiyun vmentry_failed:
3616*4882a593Smuzhiyun vmx->nested.nested_run_pending = 0;
3617*4882a593Smuzhiyun if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
3618*4882a593Smuzhiyun return 0;
3619*4882a593Smuzhiyun if (status == NVMX_VMENTRY_VMEXIT)
3620*4882a593Smuzhiyun return 1;
3621*4882a593Smuzhiyun WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
3622*4882a593Smuzhiyun return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3623*4882a593Smuzhiyun }
3624*4882a593Smuzhiyun
3625*4882a593Smuzhiyun /*
3626*4882a593Smuzhiyun * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3627*4882a593Smuzhiyun * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
3628*4882a593Smuzhiyun * This function returns the new value we should put in vmcs12.guest_cr0.
3629*4882a593Smuzhiyun * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3630*4882a593Smuzhiyun * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3631*4882a593Smuzhiyun * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3632*4882a593Smuzhiyun * didn't trap the bit, because if L1 did, so would L0).
3633*4882a593Smuzhiyun * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3634*4882a593Smuzhiyun * been modified by L2, and L1 knows it. So just leave the old value of
3635*4882a593Smuzhiyun * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3636*4882a593Smuzhiyun * isn't relevant, because if L0 traps this bit it can set it to anything.
3637*4882a593Smuzhiyun * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3638*4882a593Smuzhiyun * changed these bits, and therefore they need to be updated, but L0
3639*4882a593Smuzhiyun * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3640*4882a593Smuzhiyun * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3641*4882a593Smuzhiyun */
3642*4882a593Smuzhiyun static inline unsigned long
3643*4882a593Smuzhiyun vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3644*4882a593Smuzhiyun {
3645*4882a593Smuzhiyun return
3646*4882a593Smuzhiyun /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3647*4882a593Smuzhiyun /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3648*4882a593Smuzhiyun /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3649*4882a593Smuzhiyun vcpu->arch.cr0_guest_owned_bits));
3650*4882a593Smuzhiyun }
3651*4882a593Smuzhiyun
3652*4882a593Smuzhiyun static inline unsigned long
3653*4882a593Smuzhiyun vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3654*4882a593Smuzhiyun {
3655*4882a593Smuzhiyun return
3656*4882a593Smuzhiyun /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3657*4882a593Smuzhiyun /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3658*4882a593Smuzhiyun /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3659*4882a593Smuzhiyun vcpu->arch.cr4_guest_owned_bits));
3660*4882a593Smuzhiyun }
3661*4882a593Smuzhiyun
3662*4882a593Smuzhiyun static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
3663*4882a593Smuzhiyun struct vmcs12 *vmcs12,
3664*4882a593Smuzhiyun u32 vm_exit_reason, u32 exit_intr_info)
3665*4882a593Smuzhiyun {
3666*4882a593Smuzhiyun u32 idt_vectoring;
3667*4882a593Smuzhiyun unsigned int nr;
3668*4882a593Smuzhiyun
3669*4882a593Smuzhiyun /*
3670*4882a593Smuzhiyun * Per the SDM, VM-Exits due to double and triple faults are never
3671*4882a593Smuzhiyun * considered to occur during event delivery, even if the double/triple
3672*4882a593Smuzhiyun * fault is the result of an escalating vectoring issue.
3673*4882a593Smuzhiyun *
3674*4882a593Smuzhiyun * Note, the SDM qualifies the double fault behavior with "The original
3675*4882a593Smuzhiyun * event results in a double-fault exception". It's unclear why the
3676*4882a593Smuzhiyun * qualification exists since exits due to double fault can occur only
3677*4882a593Smuzhiyun * while vectoring a different exception (injected events are never
3678*4882a593Smuzhiyun * subject to interception), i.e. there's _always_ an original event.
3679*4882a593Smuzhiyun *
3680*4882a593Smuzhiyun * The SDM also uses NMI as a confusing example for the "original event
3681*4882a593Smuzhiyun * causes the VM exit directly" clause. NMI isn't special in any way,
3682*4882a593Smuzhiyun * the same rule applies to all events that cause an exit directly.
3683*4882a593Smuzhiyun * NMI is an odd choice for the example because NMIs can only occur on
3684*4882a593Smuzhiyun * instruction boundaries, i.e. they _can't_ occur during vectoring.
3685*4882a593Smuzhiyun */
3686*4882a593Smuzhiyun if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
3687*4882a593Smuzhiyun ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
3688*4882a593Smuzhiyun is_double_fault(exit_intr_info))) {
3689*4882a593Smuzhiyun vmcs12->idt_vectoring_info_field = 0;
3690*4882a593Smuzhiyun } else if (vcpu->arch.exception.injected) {
3691*4882a593Smuzhiyun nr = vcpu->arch.exception.nr;
3692*4882a593Smuzhiyun idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3693*4882a593Smuzhiyun
3694*4882a593Smuzhiyun if (kvm_exception_is_soft(nr)) {
3695*4882a593Smuzhiyun vmcs12->vm_exit_instruction_len =
3696*4882a593Smuzhiyun vcpu->arch.event_exit_inst_len;
3697*4882a593Smuzhiyun idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
3698*4882a593Smuzhiyun } else
3699*4882a593Smuzhiyun idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
3700*4882a593Smuzhiyun
3701*4882a593Smuzhiyun if (vcpu->arch.exception.has_error_code) {
3702*4882a593Smuzhiyun idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
3703*4882a593Smuzhiyun vmcs12->idt_vectoring_error_code =
3704*4882a593Smuzhiyun vcpu->arch.exception.error_code;
3705*4882a593Smuzhiyun }
3706*4882a593Smuzhiyun
3707*4882a593Smuzhiyun vmcs12->idt_vectoring_info_field = idt_vectoring;
3708*4882a593Smuzhiyun } else if (vcpu->arch.nmi_injected) {
3709*4882a593Smuzhiyun vmcs12->idt_vectoring_info_field =
3710*4882a593Smuzhiyun INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
3711*4882a593Smuzhiyun } else if (vcpu->arch.interrupt.injected) {
3712*4882a593Smuzhiyun nr = vcpu->arch.interrupt.nr;
3713*4882a593Smuzhiyun idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3714*4882a593Smuzhiyun
3715*4882a593Smuzhiyun if (vcpu->arch.interrupt.soft) {
3716*4882a593Smuzhiyun idt_vectoring |= INTR_TYPE_SOFT_INTR;
3717*4882a593Smuzhiyun vmcs12->vm_entry_instruction_len =
3718*4882a593Smuzhiyun vcpu->arch.event_exit_inst_len;
3719*4882a593Smuzhiyun } else
3720*4882a593Smuzhiyun idt_vectoring |= INTR_TYPE_EXT_INTR;
3721*4882a593Smuzhiyun
3722*4882a593Smuzhiyun vmcs12->idt_vectoring_info_field = idt_vectoring;
3723*4882a593Smuzhiyun } else {
3724*4882a593Smuzhiyun vmcs12->idt_vectoring_info_field = 0;
3725*4882a593Smuzhiyun }
3726*4882a593Smuzhiyun }
3727*4882a593Smuzhiyun
3728*4882a593Smuzhiyun
3729*4882a593Smuzhiyun void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
3730*4882a593Smuzhiyun {
3731*4882a593Smuzhiyun struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3732*4882a593Smuzhiyun gfn_t gfn;
3733*4882a593Smuzhiyun
3734*4882a593Smuzhiyun /*
3735*4882a593Smuzhiyun * Don't need to mark the APIC access page dirty; it is never
3736*4882a593Smuzhiyun * written to by the CPU during APIC virtualization.
3737*4882a593Smuzhiyun */
3738*4882a593Smuzhiyun
3739*4882a593Smuzhiyun if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3740*4882a593Smuzhiyun gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3741*4882a593Smuzhiyun kvm_vcpu_mark_page_dirty(vcpu, gfn);
3742*4882a593Smuzhiyun }
3743*4882a593Smuzhiyun
3744*4882a593Smuzhiyun if (nested_cpu_has_posted_intr(vmcs12)) {
3745*4882a593Smuzhiyun gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3746*4882a593Smuzhiyun kvm_vcpu_mark_page_dirty(vcpu, gfn);
3747*4882a593Smuzhiyun }
3748*4882a593Smuzhiyun }
3749*4882a593Smuzhiyun
3750*4882a593Smuzhiyun static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3751*4882a593Smuzhiyun {
3752*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
3753*4882a593Smuzhiyun int max_irr;
3754*4882a593Smuzhiyun void *vapic_page;
3755*4882a593Smuzhiyun u16 status;
3756*4882a593Smuzhiyun
3757*4882a593Smuzhiyun if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
3758*4882a593Smuzhiyun return;
3759*4882a593Smuzhiyun
3760*4882a593Smuzhiyun vmx->nested.pi_pending = false;
3761*4882a593Smuzhiyun if (!pi_test_and_clear_on(vmx->nested.pi_desc))
3762*4882a593Smuzhiyun return;
3763*4882a593Smuzhiyun
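	/*
	 * Emulate posted-interrupt processing in software: find the highest
	 * vector pending in the PIR, merge the PIR into L2's virtual APIC
	 * page, and raise RVI (GUEST_INTR_STATUS) if it is now higher.
	 */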
3764*4882a593Smuzhiyun max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
3765*4882a593Smuzhiyun if (max_irr != 256) {
3766*4882a593Smuzhiyun vapic_page = vmx->nested.virtual_apic_map.hva;
3767*4882a593Smuzhiyun if (!vapic_page)
3768*4882a593Smuzhiyun return;
3769*4882a593Smuzhiyun
3770*4882a593Smuzhiyun __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
3771*4882a593Smuzhiyun vapic_page, &max_irr);
3772*4882a593Smuzhiyun status = vmcs_read16(GUEST_INTR_STATUS);
3773*4882a593Smuzhiyun if ((u8)max_irr > ((u8)status & 0xff)) {
3774*4882a593Smuzhiyun status &= ~0xff;
3775*4882a593Smuzhiyun status |= (u8)max_irr;
3776*4882a593Smuzhiyun vmcs_write16(GUEST_INTR_STATUS, status);
3777*4882a593Smuzhiyun }
3778*4882a593Smuzhiyun }
3779*4882a593Smuzhiyun
3780*4882a593Smuzhiyun nested_mark_vmcs12_pages_dirty(vcpu);
3781*4882a593Smuzhiyun }
3782*4882a593Smuzhiyun
3783*4882a593Smuzhiyun static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3784*4882a593Smuzhiyun unsigned long exit_qual)
3785*4882a593Smuzhiyun {
3786*4882a593Smuzhiyun struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3787*4882a593Smuzhiyun unsigned int nr = vcpu->arch.exception.nr;
3788*4882a593Smuzhiyun u32 intr_info = nr | INTR_INFO_VALID_MASK;
3789*4882a593Smuzhiyun
3790*4882a593Smuzhiyun if (vcpu->arch.exception.has_error_code) {
3791*4882a593Smuzhiyun /*
3792*4882a593Smuzhiyun * Intel CPUs do not generate error codes with bits 31:16 set,
3793*4882a593Smuzhiyun * and more importantly VMX disallows setting bits 31:16 in the
3794*4882a593Smuzhiyun * injected error code for VM-Entry. Drop the bits to mimic
3795*4882a593Smuzhiyun * hardware and avoid inducing failure on nested VM-Entry if L1
3796*4882a593Smuzhiyun * chooses to inject the exception back to L2. AMD CPUs _do_
3797*4882a593Smuzhiyun * generate "full" 32-bit error codes, so KVM allows userspace
3798*4882a593Smuzhiyun * to inject exception error codes with bits 31:16 set.
3799*4882a593Smuzhiyun */
3800*4882a593Smuzhiyun vmcs12->vm_exit_intr_error_code = (u16)vcpu->arch.exception.error_code;
3801*4882a593Smuzhiyun intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3802*4882a593Smuzhiyun }
3803*4882a593Smuzhiyun
3804*4882a593Smuzhiyun if (kvm_exception_is_soft(nr))
3805*4882a593Smuzhiyun intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3806*4882a593Smuzhiyun else
3807*4882a593Smuzhiyun intr_info |= INTR_TYPE_HARD_EXCEPTION;
3808*4882a593Smuzhiyun
3809*4882a593Smuzhiyun if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3810*4882a593Smuzhiyun vmx_get_nmi_mask(vcpu))
3811*4882a593Smuzhiyun intr_info |= INTR_INFO_UNBLOCK_NMI;
3812*4882a593Smuzhiyun
3813*4882a593Smuzhiyun nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3814*4882a593Smuzhiyun }
3815*4882a593Smuzhiyun
3816*4882a593Smuzhiyun /*
3817*4882a593Smuzhiyun * Returns true if a debug trap is pending delivery.
3818*4882a593Smuzhiyun *
3819*4882a593Smuzhiyun * In KVM, debug traps bear an exception payload. As such, the class of a #DB
3820*4882a593Smuzhiyun * exception may be inferred from the presence of an exception payload.
3821*4882a593Smuzhiyun */
3822*4882a593Smuzhiyun static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
3823*4882a593Smuzhiyun {
3824*4882a593Smuzhiyun return vcpu->arch.exception.pending &&
3825*4882a593Smuzhiyun vcpu->arch.exception.nr == DB_VECTOR &&
3826*4882a593Smuzhiyun vcpu->arch.exception.payload;
3827*4882a593Smuzhiyun }
3828*4882a593Smuzhiyun
3829*4882a593Smuzhiyun /*
3830*4882a593Smuzhiyun * Certain VM-exits set the 'pending debug exceptions' field to indicate a
3831*4882a593Smuzhiyun * recognized #DB (data or single-step) that has yet to be delivered. Since KVM
3832*4882a593Smuzhiyun * represents these debug traps with a payload that is said to be compatible
3833*4882a593Smuzhiyun * with the 'pending debug exceptions' field, write the payload to the VMCS
3834*4882a593Smuzhiyun * field if a VM-exit is delivered before the debug trap.
3835*4882a593Smuzhiyun */
3836*4882a593Smuzhiyun static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
3837*4882a593Smuzhiyun {
3838*4882a593Smuzhiyun if (vmx_pending_dbg_trap(vcpu))
3839*4882a593Smuzhiyun vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
3840*4882a593Smuzhiyun vcpu->arch.exception.payload);
3841*4882a593Smuzhiyun }
3842*4882a593Smuzhiyun
3843*4882a593Smuzhiyun static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
3844*4882a593Smuzhiyun {
3845*4882a593Smuzhiyun return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
3846*4882a593Smuzhiyun to_vmx(vcpu)->nested.preemption_timer_expired;
3847*4882a593Smuzhiyun }
3848*4882a593Smuzhiyun
3849*4882a593Smuzhiyun static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
3850*4882a593Smuzhiyun {
3851*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
3852*4882a593Smuzhiyun unsigned long exit_qual;
3853*4882a593Smuzhiyun bool block_nested_events =
3854*4882a593Smuzhiyun vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
3855*4882a593Smuzhiyun bool mtf_pending = vmx->nested.mtf_pending;
3856*4882a593Smuzhiyun struct kvm_lapic *apic = vcpu->arch.apic;
3857*4882a593Smuzhiyun
3858*4882a593Smuzhiyun /*
3859*4882a593Smuzhiyun * Clear the MTF state. If a higher priority VM-exit is delivered first,
3860*4882a593Smuzhiyun * this state is discarded.
3861*4882a593Smuzhiyun */
3862*4882a593Smuzhiyun if (!block_nested_events)
3863*4882a593Smuzhiyun vmx->nested.mtf_pending = false;
3864*4882a593Smuzhiyun
3865*4882a593Smuzhiyun if (lapic_in_kernel(vcpu) &&
3866*4882a593Smuzhiyun test_bit(KVM_APIC_INIT, &apic->pending_events)) {
3867*4882a593Smuzhiyun if (block_nested_events)
3868*4882a593Smuzhiyun return -EBUSY;
3869*4882a593Smuzhiyun nested_vmx_update_pending_dbg(vcpu);
3870*4882a593Smuzhiyun clear_bit(KVM_APIC_INIT, &apic->pending_events);
3871*4882a593Smuzhiyun nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
3872*4882a593Smuzhiyun return 0;
3873*4882a593Smuzhiyun }
3874*4882a593Smuzhiyun
3875*4882a593Smuzhiyun /*
3876*4882a593Smuzhiyun * Process any exceptions that are not debug traps before MTF.
3877*4882a593Smuzhiyun */
3878*4882a593Smuzhiyun if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
3879*4882a593Smuzhiyun if (block_nested_events)
3880*4882a593Smuzhiyun return -EBUSY;
3881*4882a593Smuzhiyun if (!nested_vmx_check_exception(vcpu, &exit_qual))
3882*4882a593Smuzhiyun goto no_vmexit;
3883*4882a593Smuzhiyun nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3884*4882a593Smuzhiyun return 0;
3885*4882a593Smuzhiyun }
3886*4882a593Smuzhiyun
3887*4882a593Smuzhiyun if (mtf_pending) {
3888*4882a593Smuzhiyun if (block_nested_events)
3889*4882a593Smuzhiyun return -EBUSY;
3890*4882a593Smuzhiyun nested_vmx_update_pending_dbg(vcpu);
3891*4882a593Smuzhiyun nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
3892*4882a593Smuzhiyun return 0;
3893*4882a593Smuzhiyun }
3894*4882a593Smuzhiyun
3895*4882a593Smuzhiyun if (vcpu->arch.exception.pending) {
3896*4882a593Smuzhiyun if (block_nested_events)
3897*4882a593Smuzhiyun return -EBUSY;
3898*4882a593Smuzhiyun if (!nested_vmx_check_exception(vcpu, &exit_qual))
3899*4882a593Smuzhiyun goto no_vmexit;
3900*4882a593Smuzhiyun nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3901*4882a593Smuzhiyun return 0;
3902*4882a593Smuzhiyun }
3903*4882a593Smuzhiyun
3904*4882a593Smuzhiyun if (nested_vmx_preemption_timer_pending(vcpu)) {
3905*4882a593Smuzhiyun if (block_nested_events)
3906*4882a593Smuzhiyun return -EBUSY;
3907*4882a593Smuzhiyun nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
3908*4882a593Smuzhiyun return 0;
3909*4882a593Smuzhiyun }
3910*4882a593Smuzhiyun
3911*4882a593Smuzhiyun if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
3912*4882a593Smuzhiyun if (block_nested_events)
3913*4882a593Smuzhiyun return -EBUSY;
3914*4882a593Smuzhiyun goto no_vmexit;
3915*4882a593Smuzhiyun }
3916*4882a593Smuzhiyun
3917*4882a593Smuzhiyun if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
3918*4882a593Smuzhiyun if (block_nested_events)
3919*4882a593Smuzhiyun return -EBUSY;
3920*4882a593Smuzhiyun if (!nested_exit_on_nmi(vcpu))
3921*4882a593Smuzhiyun goto no_vmexit;
3922*4882a593Smuzhiyun
3923*4882a593Smuzhiyun nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
3924*4882a593Smuzhiyun NMI_VECTOR | INTR_TYPE_NMI_INTR |
3925*4882a593Smuzhiyun INTR_INFO_VALID_MASK, 0);
3926*4882a593Smuzhiyun /*
3927*4882a593Smuzhiyun * The NMI-triggered VM exit counts as injection:
3928*4882a593Smuzhiyun * clear this one and block further NMIs.
3929*4882a593Smuzhiyun */
3930*4882a593Smuzhiyun vcpu->arch.nmi_pending = 0;
3931*4882a593Smuzhiyun vmx_set_nmi_mask(vcpu, true);
3932*4882a593Smuzhiyun return 0;
3933*4882a593Smuzhiyun }
3934*4882a593Smuzhiyun
3935*4882a593Smuzhiyun if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
3936*4882a593Smuzhiyun if (block_nested_events)
3937*4882a593Smuzhiyun return -EBUSY;
3938*4882a593Smuzhiyun if (!nested_exit_on_intr(vcpu))
3939*4882a593Smuzhiyun goto no_vmexit;
3940*4882a593Smuzhiyun nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
3941*4882a593Smuzhiyun return 0;
3942*4882a593Smuzhiyun }
3943*4882a593Smuzhiyun
3944*4882a593Smuzhiyun no_vmexit:
3945*4882a593Smuzhiyun vmx_complete_nested_posted_interrupt(vcpu);
3946*4882a593Smuzhiyun return 0;
3947*4882a593Smuzhiyun }
3948*4882a593Smuzhiyun
3949*4882a593Smuzhiyun static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
3950*4882a593Smuzhiyun {
3951*4882a593Smuzhiyun ktime_t remaining =
3952*4882a593Smuzhiyun hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
3953*4882a593Smuzhiyun u64 value;
3954*4882a593Smuzhiyun
3955*4882a593Smuzhiyun if (ktime_to_ns(remaining) <= 0)
3956*4882a593Smuzhiyun return 0;
3957*4882a593Smuzhiyun
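	/*
	 * Convert the remaining host time to guest TSC cycles and then to
	 * preemption-timer ticks, which count at the TSC rate divided by
	 * 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE.
	 */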
3958*4882a593Smuzhiyun value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
3959*4882a593Smuzhiyun do_div(value, 1000000);
3960*4882a593Smuzhiyun return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
3961*4882a593Smuzhiyun }
3962*4882a593Smuzhiyun
3963*4882a593Smuzhiyun static bool is_vmcs12_ext_field(unsigned long field)
3964*4882a593Smuzhiyun {
3965*4882a593Smuzhiyun switch (field) {
3966*4882a593Smuzhiyun case GUEST_ES_SELECTOR:
3967*4882a593Smuzhiyun case GUEST_CS_SELECTOR:
3968*4882a593Smuzhiyun case GUEST_SS_SELECTOR:
3969*4882a593Smuzhiyun case GUEST_DS_SELECTOR:
3970*4882a593Smuzhiyun case GUEST_FS_SELECTOR:
3971*4882a593Smuzhiyun case GUEST_GS_SELECTOR:
3972*4882a593Smuzhiyun case GUEST_LDTR_SELECTOR:
3973*4882a593Smuzhiyun case GUEST_TR_SELECTOR:
3974*4882a593Smuzhiyun case GUEST_ES_LIMIT:
3975*4882a593Smuzhiyun case GUEST_CS_LIMIT:
3976*4882a593Smuzhiyun case GUEST_SS_LIMIT:
3977*4882a593Smuzhiyun case GUEST_DS_LIMIT:
3978*4882a593Smuzhiyun case GUEST_FS_LIMIT:
3979*4882a593Smuzhiyun case GUEST_GS_LIMIT:
3980*4882a593Smuzhiyun case GUEST_LDTR_LIMIT:
3981*4882a593Smuzhiyun case GUEST_TR_LIMIT:
3982*4882a593Smuzhiyun case GUEST_GDTR_LIMIT:
3983*4882a593Smuzhiyun case GUEST_IDTR_LIMIT:
3984*4882a593Smuzhiyun case GUEST_ES_AR_BYTES:
3985*4882a593Smuzhiyun case GUEST_DS_AR_BYTES:
3986*4882a593Smuzhiyun case GUEST_FS_AR_BYTES:
3987*4882a593Smuzhiyun case GUEST_GS_AR_BYTES:
3988*4882a593Smuzhiyun case GUEST_LDTR_AR_BYTES:
3989*4882a593Smuzhiyun case GUEST_TR_AR_BYTES:
3990*4882a593Smuzhiyun case GUEST_ES_BASE:
3991*4882a593Smuzhiyun case GUEST_CS_BASE:
3992*4882a593Smuzhiyun case GUEST_SS_BASE:
3993*4882a593Smuzhiyun case GUEST_DS_BASE:
3994*4882a593Smuzhiyun case GUEST_FS_BASE:
3995*4882a593Smuzhiyun case GUEST_GS_BASE:
3996*4882a593Smuzhiyun case GUEST_LDTR_BASE:
3997*4882a593Smuzhiyun case GUEST_TR_BASE:
3998*4882a593Smuzhiyun case GUEST_GDTR_BASE:
3999*4882a593Smuzhiyun case GUEST_IDTR_BASE:
4000*4882a593Smuzhiyun case GUEST_PENDING_DBG_EXCEPTIONS:
4001*4882a593Smuzhiyun case GUEST_BNDCFGS:
4002*4882a593Smuzhiyun return true;
4003*4882a593Smuzhiyun default:
4004*4882a593Smuzhiyun break;
4005*4882a593Smuzhiyun }
4006*4882a593Smuzhiyun
4007*4882a593Smuzhiyun return false;
4008*4882a593Smuzhiyun }
4009*4882a593Smuzhiyun
4010*4882a593Smuzhiyun static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4011*4882a593Smuzhiyun struct vmcs12 *vmcs12)
4012*4882a593Smuzhiyun {
4013*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
4014*4882a593Smuzhiyun
4015*4882a593Smuzhiyun vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
4016*4882a593Smuzhiyun vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
4017*4882a593Smuzhiyun vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
4018*4882a593Smuzhiyun vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
4019*4882a593Smuzhiyun vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
4020*4882a593Smuzhiyun vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
4021*4882a593Smuzhiyun vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
4022*4882a593Smuzhiyun vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
4023*4882a593Smuzhiyun vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
4024*4882a593Smuzhiyun vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
4025*4882a593Smuzhiyun vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
4026*4882a593Smuzhiyun vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
4027*4882a593Smuzhiyun vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
4028*4882a593Smuzhiyun vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
4029*4882a593Smuzhiyun vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
4030*4882a593Smuzhiyun vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
4031*4882a593Smuzhiyun vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
4032*4882a593Smuzhiyun vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
4033*4882a593Smuzhiyun vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
4034*4882a593Smuzhiyun vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
4035*4882a593Smuzhiyun vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
4036*4882a593Smuzhiyun vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
4037*4882a593Smuzhiyun vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
4038*4882a593Smuzhiyun vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
4039*4882a593Smuzhiyun vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
4040*4882a593Smuzhiyun vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
4041*4882a593Smuzhiyun vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
4042*4882a593Smuzhiyun vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
4043*4882a593Smuzhiyun vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
4044*4882a593Smuzhiyun vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
4045*4882a593Smuzhiyun vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
4046*4882a593Smuzhiyun vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
4047*4882a593Smuzhiyun vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
4048*4882a593Smuzhiyun vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
4049*4882a593Smuzhiyun vmcs12->guest_pending_dbg_exceptions =
4050*4882a593Smuzhiyun vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
4051*4882a593Smuzhiyun if (kvm_mpx_supported())
4052*4882a593Smuzhiyun vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
4053*4882a593Smuzhiyun
4054*4882a593Smuzhiyun vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
4055*4882a593Smuzhiyun }
4056*4882a593Smuzhiyun
4057*4882a593Smuzhiyun static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4058*4882a593Smuzhiyun struct vmcs12 *vmcs12)
4059*4882a593Smuzhiyun {
4060*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
4061*4882a593Smuzhiyun int cpu;
4062*4882a593Smuzhiyun
4063*4882a593Smuzhiyun if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
4064*4882a593Smuzhiyun return;
4065*4882a593Smuzhiyun
4066*4882a593Smuzhiyun
4067*4882a593Smuzhiyun WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
4068*4882a593Smuzhiyun
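	/*
	 * Temporarily load vmcs02 so the rarely-used guest fields can be read
	 * with VMREAD, then switch back to vmcs01. Preemption is disabled
	 * across the switch (get_cpu()/put_cpu()).
	 */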
4069*4882a593Smuzhiyun cpu = get_cpu();
4070*4882a593Smuzhiyun vmx->loaded_vmcs = &vmx->nested.vmcs02;
4071*4882a593Smuzhiyun vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01);
4072*4882a593Smuzhiyun
4073*4882a593Smuzhiyun sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4074*4882a593Smuzhiyun
4075*4882a593Smuzhiyun vmx->loaded_vmcs = &vmx->vmcs01;
4076*4882a593Smuzhiyun vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02);
4077*4882a593Smuzhiyun put_cpu();
4078*4882a593Smuzhiyun }
4079*4882a593Smuzhiyun
4080*4882a593Smuzhiyun /*
4081*4882a593Smuzhiyun * Update the guest state fields of vmcs12 to reflect changes that
4082*4882a593Smuzhiyun * occurred while L2 was running. (The "IA-32e mode guest" bit of the
4083*4882a593Smuzhiyun * VM-entry controls is also updated, since this is really a guest
4084*4882a593Smuzhiyun * state bit.)
4085*4882a593Smuzhiyun */
4086*4882a593Smuzhiyun static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
4087*4882a593Smuzhiyun {
4088*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
4089*4882a593Smuzhiyun
4090*4882a593Smuzhiyun if (vmx->nested.hv_evmcs)
4091*4882a593Smuzhiyun sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4092*4882a593Smuzhiyun
4093*4882a593Smuzhiyun vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
4094*4882a593Smuzhiyun
4095*4882a593Smuzhiyun vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
4096*4882a593Smuzhiyun vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
4097*4882a593Smuzhiyun
4098*4882a593Smuzhiyun vmcs12->guest_rsp = kvm_rsp_read(vcpu);
4099*4882a593Smuzhiyun vmcs12->guest_rip = kvm_rip_read(vcpu);
4100*4882a593Smuzhiyun vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
4101*4882a593Smuzhiyun
4102*4882a593Smuzhiyun vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
4103*4882a593Smuzhiyun vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
4104*4882a593Smuzhiyun
4105*4882a593Smuzhiyun vmcs12->guest_interruptibility_info =
4106*4882a593Smuzhiyun vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
4107*4882a593Smuzhiyun
4108*4882a593Smuzhiyun if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
4109*4882a593Smuzhiyun vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
4110*4882a593Smuzhiyun else
4111*4882a593Smuzhiyun vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
4112*4882a593Smuzhiyun
4113*4882a593Smuzhiyun if (nested_cpu_has_preemption_timer(vmcs12) &&
4114*4882a593Smuzhiyun vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER &&
4115*4882a593Smuzhiyun !vmx->nested.nested_run_pending)
4116*4882a593Smuzhiyun vmcs12->vmx_preemption_timer_value =
4117*4882a593Smuzhiyun vmx_get_preemption_timer_value(vcpu);
4118*4882a593Smuzhiyun
4119*4882a593Smuzhiyun /*
4120*4882a593Smuzhiyun * In some cases (usually, nested EPT), L2 is allowed to change its
4121*4882a593Smuzhiyun * own CR3 without exiting. If it has changed it, we must keep it.
4122*4882a593Smuzhiyun * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
4123*4882a593Smuzhiyun * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
4124*4882a593Smuzhiyun *
4125*4882a593Smuzhiyun * Additionally, restore L2's PDPTR to vmcs12.
4126*4882a593Smuzhiyun */
4127*4882a593Smuzhiyun if (enable_ept) {
4128*4882a593Smuzhiyun vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
4129*4882a593Smuzhiyun if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
4130*4882a593Smuzhiyun vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
4131*4882a593Smuzhiyun vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
4132*4882a593Smuzhiyun vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
4133*4882a593Smuzhiyun vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
4134*4882a593Smuzhiyun }
4135*4882a593Smuzhiyun }
4136*4882a593Smuzhiyun
4137*4882a593Smuzhiyun vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
4138*4882a593Smuzhiyun
4139*4882a593Smuzhiyun if (nested_cpu_has_vid(vmcs12))
4140*4882a593Smuzhiyun vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
4141*4882a593Smuzhiyun
4142*4882a593Smuzhiyun vmcs12->vm_entry_controls =
4143*4882a593Smuzhiyun (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
4144*4882a593Smuzhiyun (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
4145*4882a593Smuzhiyun
4146*4882a593Smuzhiyun if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
4147*4882a593Smuzhiyun kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
4148*4882a593Smuzhiyun
4149*4882a593Smuzhiyun if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
4150*4882a593Smuzhiyun vmcs12->guest_ia32_efer = vcpu->arch.efer;
4151*4882a593Smuzhiyun }
4152*4882a593Smuzhiyun
4153*4882a593Smuzhiyun /*
4154*4882a593Smuzhiyun * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
4155*4882a593Smuzhiyun * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
4156*4882a593Smuzhiyun * and this function updates it to reflect the changes to the guest state while
4157*4882a593Smuzhiyun * L2 was running (and perhaps made some exits which were handled directly by L0
4158*4882a593Smuzhiyun * without going back to L1), and to reflect the exit reason.
4159*4882a593Smuzhiyun  * Note that we do not have to copy all VMCS fields here, just those that
4160*4882a593Smuzhiyun  * could have been changed by the L2 guest or the exit - i.e., the guest-state and
4161*4882a593Smuzhiyun * exit-information fields only. Other fields are modified by L1 with VMWRITE,
4162*4882a593Smuzhiyun * which already writes to vmcs12 directly.
4163*4882a593Smuzhiyun */
4164*4882a593Smuzhiyun static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
4165*4882a593Smuzhiyun u32 vm_exit_reason, u32 exit_intr_info,
4166*4882a593Smuzhiyun unsigned long exit_qualification)
4167*4882a593Smuzhiyun {
4168*4882a593Smuzhiyun /* update exit information fields: */
4169*4882a593Smuzhiyun vmcs12->vm_exit_reason = vm_exit_reason;
4170*4882a593Smuzhiyun vmcs12->exit_qualification = exit_qualification;
4171*4882a593Smuzhiyun
4172*4882a593Smuzhiyun /*
4173*4882a593Smuzhiyun * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
4174*4882a593Smuzhiyun  * and only EXIT_REASON and EXIT_QUALIFICATION are updated; all other
4175*4882a593Smuzhiyun * exit info fields are unmodified.
4176*4882a593Smuzhiyun */
4177*4882a593Smuzhiyun if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
4178*4882a593Smuzhiyun vmcs12->launch_state = 1;
4179*4882a593Smuzhiyun
4180*4882a593Smuzhiyun /* vm_entry_intr_info_field is cleared on exit. Emulate this
4181*4882a593Smuzhiyun * instead of reading the real value. */
4182*4882a593Smuzhiyun vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
4183*4882a593Smuzhiyun
4184*4882a593Smuzhiyun /*
4185*4882a593Smuzhiyun 	 * Transfer the event that L0 or L1 may have wanted to inject into
4186*4882a593Smuzhiyun * L2 to IDT_VECTORING_INFO_FIELD.
4187*4882a593Smuzhiyun */
4188*4882a593Smuzhiyun vmcs12_save_pending_event(vcpu, vmcs12,
4189*4882a593Smuzhiyun vm_exit_reason, exit_intr_info);
4190*4882a593Smuzhiyun
4191*4882a593Smuzhiyun vmcs12->vm_exit_intr_info = exit_intr_info;
4192*4882a593Smuzhiyun vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4193*4882a593Smuzhiyun vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4194*4882a593Smuzhiyun
4195*4882a593Smuzhiyun /*
4196*4882a593Smuzhiyun * According to spec, there's no need to store the guest's
4197*4882a593Smuzhiyun * MSRs if the exit is due to a VM-entry failure that occurs
4198*4882a593Smuzhiyun * during or after loading the guest state. Since this exit
4199*4882a593Smuzhiyun * does not fall in that category, we need to save the MSRs.
4200*4882a593Smuzhiyun */
4201*4882a593Smuzhiyun if (nested_vmx_store_msr(vcpu,
4202*4882a593Smuzhiyun vmcs12->vm_exit_msr_store_addr,
4203*4882a593Smuzhiyun vmcs12->vm_exit_msr_store_count))
4204*4882a593Smuzhiyun nested_vmx_abort(vcpu,
4205*4882a593Smuzhiyun VMX_ABORT_SAVE_GUEST_MSR_FAIL);
4206*4882a593Smuzhiyun }
4207*4882a593Smuzhiyun }
4208*4882a593Smuzhiyun
4209*4882a593Smuzhiyun /*
4210*4882a593Smuzhiyun  * A part of what we need to do when the nested L2 guest exits and we want to
4211*4882a593Smuzhiyun  * run its L1 parent is to reset L1's guest state to the host state specified
4212*4882a593Smuzhiyun * in vmcs12.
4213*4882a593Smuzhiyun * This function is to be called not only on normal nested exit, but also on
4214*4882a593Smuzhiyun * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
4215*4882a593Smuzhiyun * Failures During or After Loading Guest State").
4216*4882a593Smuzhiyun * This function should be called when the active VMCS is L1's (vmcs01).
4217*4882a593Smuzhiyun */
4218*4882a593Smuzhiyun static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
4219*4882a593Smuzhiyun struct vmcs12 *vmcs12)
4220*4882a593Smuzhiyun {
4221*4882a593Smuzhiyun enum vm_entry_failure_code ignored;
4222*4882a593Smuzhiyun struct kvm_segment seg;
4223*4882a593Smuzhiyun
4224*4882a593Smuzhiyun if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
4225*4882a593Smuzhiyun vcpu->arch.efer = vmcs12->host_ia32_efer;
4226*4882a593Smuzhiyun else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4227*4882a593Smuzhiyun vcpu->arch.efer |= (EFER_LMA | EFER_LME);
4228*4882a593Smuzhiyun else
4229*4882a593Smuzhiyun vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
4230*4882a593Smuzhiyun vmx_set_efer(vcpu, vcpu->arch.efer);
4231*4882a593Smuzhiyun
4232*4882a593Smuzhiyun kvm_rsp_write(vcpu, vmcs12->host_rsp);
4233*4882a593Smuzhiyun kvm_rip_write(vcpu, vmcs12->host_rip);
4234*4882a593Smuzhiyun vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
4235*4882a593Smuzhiyun vmx_set_interrupt_shadow(vcpu, 0);
4236*4882a593Smuzhiyun
4237*4882a593Smuzhiyun /*
4238*4882a593Smuzhiyun * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
4239*4882a593Smuzhiyun * actually changed, because vmx_set_cr0 refers to efer set above.
4240*4882a593Smuzhiyun *
4241*4882a593Smuzhiyun * CR0_GUEST_HOST_MASK is already set in the original vmcs01
4242*4882a593Smuzhiyun * (KVM doesn't change it);
4243*4882a593Smuzhiyun */
4244*4882a593Smuzhiyun vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
4245*4882a593Smuzhiyun vmx_set_cr0(vcpu, vmcs12->host_cr0);
4246*4882a593Smuzhiyun
4247*4882a593Smuzhiyun /* Same as above - no reason to call set_cr4_guest_host_mask(). */
4248*4882a593Smuzhiyun vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4249*4882a593Smuzhiyun vmx_set_cr4(vcpu, vmcs12->host_cr4);
4250*4882a593Smuzhiyun
4251*4882a593Smuzhiyun nested_ept_uninit_mmu_context(vcpu);
4252*4882a593Smuzhiyun
4253*4882a593Smuzhiyun /*
4254*4882a593Smuzhiyun * Only PDPTE load can fail as the value of cr3 was checked on entry and
4255*4882a593Smuzhiyun * couldn't have changed.
4256*4882a593Smuzhiyun */
4257*4882a593Smuzhiyun if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
4258*4882a593Smuzhiyun nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
4259*4882a593Smuzhiyun
4260*4882a593Smuzhiyun if (!enable_ept)
4261*4882a593Smuzhiyun vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
4262*4882a593Smuzhiyun
4263*4882a593Smuzhiyun nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
4264*4882a593Smuzhiyun
4265*4882a593Smuzhiyun vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
4266*4882a593Smuzhiyun vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
4267*4882a593Smuzhiyun vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
4268*4882a593Smuzhiyun vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
4269*4882a593Smuzhiyun vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
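	/* Per Intel SDM 27.5.2, the IDTR and GDTR limits are set to 0xFFFF on VM-exit. */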
4270*4882a593Smuzhiyun vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
4271*4882a593Smuzhiyun vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
4272*4882a593Smuzhiyun
4273*4882a593Smuzhiyun /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
4274*4882a593Smuzhiyun if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
4275*4882a593Smuzhiyun vmcs_write64(GUEST_BNDCFGS, 0);
4276*4882a593Smuzhiyun
4277*4882a593Smuzhiyun if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
4278*4882a593Smuzhiyun vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
4279*4882a593Smuzhiyun vcpu->arch.pat = vmcs12->host_ia32_pat;
4280*4882a593Smuzhiyun }
4281*4882a593Smuzhiyun if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
4282*4882a593Smuzhiyun WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
4283*4882a593Smuzhiyun vmcs12->host_ia32_perf_global_ctrl));
4284*4882a593Smuzhiyun
4285*4882a593Smuzhiyun /* Set L1 segment info according to Intel SDM
4286*4882a593Smuzhiyun 27.5.2 Loading Host Segment and Descriptor-Table Registers */
4287*4882a593Smuzhiyun seg = (struct kvm_segment) {
4288*4882a593Smuzhiyun .base = 0,
4289*4882a593Smuzhiyun .limit = 0xFFFFFFFF,
4290*4882a593Smuzhiyun .selector = vmcs12->host_cs_selector,
4291*4882a593Smuzhiyun .type = 11,
4292*4882a593Smuzhiyun .present = 1,
4293*4882a593Smuzhiyun .s = 1,
4294*4882a593Smuzhiyun .g = 1
4295*4882a593Smuzhiyun };
4296*4882a593Smuzhiyun if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4297*4882a593Smuzhiyun seg.l = 1;
4298*4882a593Smuzhiyun else
4299*4882a593Smuzhiyun seg.db = 1;
4300*4882a593Smuzhiyun vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
4301*4882a593Smuzhiyun seg = (struct kvm_segment) {
4302*4882a593Smuzhiyun .base = 0,
4303*4882a593Smuzhiyun .limit = 0xFFFFFFFF,
4304*4882a593Smuzhiyun .type = 3,
4305*4882a593Smuzhiyun .present = 1,
4306*4882a593Smuzhiyun .s = 1,
4307*4882a593Smuzhiyun .db = 1,
4308*4882a593Smuzhiyun .g = 1
4309*4882a593Smuzhiyun };
4310*4882a593Smuzhiyun seg.selector = vmcs12->host_ds_selector;
4311*4882a593Smuzhiyun vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
4312*4882a593Smuzhiyun seg.selector = vmcs12->host_es_selector;
4313*4882a593Smuzhiyun vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
4314*4882a593Smuzhiyun seg.selector = vmcs12->host_ss_selector;
4315*4882a593Smuzhiyun vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
4316*4882a593Smuzhiyun seg.selector = vmcs12->host_fs_selector;
4317*4882a593Smuzhiyun seg.base = vmcs12->host_fs_base;
4318*4882a593Smuzhiyun vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
4319*4882a593Smuzhiyun seg.selector = vmcs12->host_gs_selector;
4320*4882a593Smuzhiyun seg.base = vmcs12->host_gs_base;
4321*4882a593Smuzhiyun vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
4322*4882a593Smuzhiyun seg = (struct kvm_segment) {
4323*4882a593Smuzhiyun .base = vmcs12->host_tr_base,
4324*4882a593Smuzhiyun .limit = 0x67,
4325*4882a593Smuzhiyun .selector = vmcs12->host_tr_selector,
4326*4882a593Smuzhiyun .type = 11,
4327*4882a593Smuzhiyun .present = 1
4328*4882a593Smuzhiyun };
4329*4882a593Smuzhiyun vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
4330*4882a593Smuzhiyun
4331*4882a593Smuzhiyun kvm_set_dr(vcpu, 7, 0x400);
4332*4882a593Smuzhiyun vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4333*4882a593Smuzhiyun
4334*4882a593Smuzhiyun if (cpu_has_vmx_msr_bitmap())
4335*4882a593Smuzhiyun vmx_update_msr_bitmap(vcpu);
4336*4882a593Smuzhiyun
4337*4882a593Smuzhiyun if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
4338*4882a593Smuzhiyun vmcs12->vm_exit_msr_load_count))
4339*4882a593Smuzhiyun nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4340*4882a593Smuzhiyun }
4341*4882a593Smuzhiyun
4342*4882a593Smuzhiyun static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
4343*4882a593Smuzhiyun {
4344*4882a593Smuzhiyun struct vmx_uret_msr *efer_msr;
4345*4882a593Smuzhiyun unsigned int i;
4346*4882a593Smuzhiyun
4347*4882a593Smuzhiyun if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
4348*4882a593Smuzhiyun return vmcs_read64(GUEST_IA32_EFER);
4349*4882a593Smuzhiyun
4350*4882a593Smuzhiyun if (cpu_has_load_ia32_efer())
4351*4882a593Smuzhiyun return host_efer;
4352*4882a593Smuzhiyun
4353*4882a593Smuzhiyun for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
4354*4882a593Smuzhiyun if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
4355*4882a593Smuzhiyun return vmx->msr_autoload.guest.val[i].value;
4356*4882a593Smuzhiyun }
4357*4882a593Smuzhiyun
4358*4882a593Smuzhiyun efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
4359*4882a593Smuzhiyun if (efer_msr)
4360*4882a593Smuzhiyun return efer_msr->data;
4361*4882a593Smuzhiyun
4362*4882a593Smuzhiyun return host_efer;
4363*4882a593Smuzhiyun }
4364*4882a593Smuzhiyun
4365*4882a593Smuzhiyun static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
4366*4882a593Smuzhiyun {
4367*4882a593Smuzhiyun struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4368*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
4369*4882a593Smuzhiyun struct vmx_msr_entry g, h;
4370*4882a593Smuzhiyun gpa_t gpa;
4371*4882a593Smuzhiyun u32 i, j;
4372*4882a593Smuzhiyun
4373*4882a593Smuzhiyun vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
4374*4882a593Smuzhiyun
4375*4882a593Smuzhiyun if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
4376*4882a593Smuzhiyun /*
4377*4882a593Smuzhiyun * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
4378*4882a593Smuzhiyun * as vmcs01.GUEST_DR7 contains a userspace defined value
4379*4882a593Smuzhiyun * and vcpu->arch.dr7 is not squirreled away before the
4380*4882a593Smuzhiyun * nested VMENTER (not worth adding a variable in nested_vmx).
4381*4882a593Smuzhiyun */
4382*4882a593Smuzhiyun if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
4383*4882a593Smuzhiyun kvm_set_dr(vcpu, 7, DR7_FIXED_1);
4384*4882a593Smuzhiyun else
4385*4882a593Smuzhiyun WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
4386*4882a593Smuzhiyun }
4387*4882a593Smuzhiyun
4388*4882a593Smuzhiyun /*
4389*4882a593Smuzhiyun * Note that calling vmx_set_{efer,cr0,cr4} is important as they
4390*4882a593Smuzhiyun * handle a variety of side effects to KVM's software model.
4391*4882a593Smuzhiyun */
4392*4882a593Smuzhiyun vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
4393*4882a593Smuzhiyun
4394*4882a593Smuzhiyun vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
4395*4882a593Smuzhiyun vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
4396*4882a593Smuzhiyun
4397*4882a593Smuzhiyun vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4398*4882a593Smuzhiyun vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
4399*4882a593Smuzhiyun
4400*4882a593Smuzhiyun nested_ept_uninit_mmu_context(vcpu);
4401*4882a593Smuzhiyun vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
4402*4882a593Smuzhiyun kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
4403*4882a593Smuzhiyun
4404*4882a593Smuzhiyun /*
4405*4882a593Smuzhiyun * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
4406*4882a593Smuzhiyun * from vmcs01 (if necessary). The PDPTRs are not loaded on
4407*4882a593Smuzhiyun 	 * VMFail; like everything else, we just need to ensure our
4408*4882a593Smuzhiyun * software model is up-to-date.
4409*4882a593Smuzhiyun */
4410*4882a593Smuzhiyun if (enable_ept && is_pae_paging(vcpu))
4411*4882a593Smuzhiyun ept_save_pdptrs(vcpu);
4412*4882a593Smuzhiyun
4413*4882a593Smuzhiyun kvm_mmu_reset_context(vcpu);
4414*4882a593Smuzhiyun
4415*4882a593Smuzhiyun if (cpu_has_vmx_msr_bitmap())
4416*4882a593Smuzhiyun vmx_update_msr_bitmap(vcpu);
4417*4882a593Smuzhiyun
4418*4882a593Smuzhiyun /*
4419*4882a593Smuzhiyun * This nasty bit of open coding is a compromise between blindly
4420*4882a593Smuzhiyun * loading L1's MSRs using the exit load lists (incorrect emulation
4421*4882a593Smuzhiyun * of VMFail), leaving the nested VM's MSRs in the software model
4422*4882a593Smuzhiyun * (incorrect behavior) and snapshotting the modified MSRs (too
4423*4882a593Smuzhiyun * expensive since the lists are unbound by hardware). For each
4424*4882a593Smuzhiyun * MSR that was (prematurely) loaded from the nested VMEntry load
4425*4882a593Smuzhiyun * list, reload it from the exit load list if it exists and differs
4426*4882a593Smuzhiyun * from the guest value. The intent is to stuff host state as
4427*4882a593Smuzhiyun * silently as possible, not to fully process the exit load list.
4428*4882a593Smuzhiyun */
4429*4882a593Smuzhiyun for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
4430*4882a593Smuzhiyun gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
4431*4882a593Smuzhiyun if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
4432*4882a593Smuzhiyun pr_debug_ratelimited(
4433*4882a593Smuzhiyun "%s read MSR index failed (%u, 0x%08llx)\n",
4434*4882a593Smuzhiyun __func__, i, gpa);
4435*4882a593Smuzhiyun goto vmabort;
4436*4882a593Smuzhiyun }
4437*4882a593Smuzhiyun
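		/*
		 * Look for a matching entry in the VM-exit MSR load list; if its
		 * (host) value differs from the already-loaded guest value,
		 * restore the host value.
		 */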
4438*4882a593Smuzhiyun for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
4439*4882a593Smuzhiyun gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
4440*4882a593Smuzhiyun if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
4441*4882a593Smuzhiyun pr_debug_ratelimited(
4442*4882a593Smuzhiyun "%s read MSR failed (%u, 0x%08llx)\n",
4443*4882a593Smuzhiyun __func__, j, gpa);
4444*4882a593Smuzhiyun goto vmabort;
4445*4882a593Smuzhiyun }
4446*4882a593Smuzhiyun if (h.index != g.index)
4447*4882a593Smuzhiyun continue;
4448*4882a593Smuzhiyun if (h.value == g.value)
4449*4882a593Smuzhiyun break;
4450*4882a593Smuzhiyun
4451*4882a593Smuzhiyun if (nested_vmx_load_msr_check(vcpu, &h)) {
4452*4882a593Smuzhiyun pr_debug_ratelimited(
4453*4882a593Smuzhiyun "%s check failed (%u, 0x%x, 0x%x)\n",
4454*4882a593Smuzhiyun __func__, j, h.index, h.reserved);
4455*4882a593Smuzhiyun goto vmabort;
4456*4882a593Smuzhiyun }
4457*4882a593Smuzhiyun
4458*4882a593Smuzhiyun if (kvm_set_msr(vcpu, h.index, h.value)) {
4459*4882a593Smuzhiyun pr_debug_ratelimited(
4460*4882a593Smuzhiyun "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
4461*4882a593Smuzhiyun __func__, j, h.index, h.value);
4462*4882a593Smuzhiyun goto vmabort;
4463*4882a593Smuzhiyun }
4464*4882a593Smuzhiyun }
4465*4882a593Smuzhiyun }
4466*4882a593Smuzhiyun
4467*4882a593Smuzhiyun return;
4468*4882a593Smuzhiyun
4469*4882a593Smuzhiyun vmabort:
4470*4882a593Smuzhiyun nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4471*4882a593Smuzhiyun }
4472*4882a593Smuzhiyun
4473*4882a593Smuzhiyun /*
4474*4882a593Smuzhiyun * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
4475*4882a593Smuzhiyun * and modify vmcs12 to make it see what it would expect to see there if
4476*4882a593Smuzhiyun * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
4477*4882a593Smuzhiyun */
4478*4882a593Smuzhiyun void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
4479*4882a593Smuzhiyun u32 exit_intr_info, unsigned long exit_qualification)
4480*4882a593Smuzhiyun {
4481*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
4482*4882a593Smuzhiyun struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4483*4882a593Smuzhiyun
4484*4882a593Smuzhiyun /* trying to cancel vmlaunch/vmresume is a bug */
4485*4882a593Smuzhiyun WARN_ON_ONCE(vmx->nested.nested_run_pending);
4486*4882a593Smuzhiyun
4487*4882a593Smuzhiyun if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
4488*4882a593Smuzhiyun /*
4489*4882a593Smuzhiyun * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
4490*4882a593Smuzhiyun * Enlightened VMCS after migration and we still need to
4491*4882a593Smuzhiyun * do that when something is forcing L2->L1 exit prior to
4492*4882a593Smuzhiyun * the first L2 run.
4493*4882a593Smuzhiyun */
4494*4882a593Smuzhiyun (void)nested_get_evmcs_page(vcpu);
4495*4882a593Smuzhiyun }
4496*4882a593Smuzhiyun
4497*4882a593Smuzhiyun /* Service the TLB flush request for L2 before switching to L1. */
4498*4882a593Smuzhiyun if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
4499*4882a593Smuzhiyun kvm_vcpu_flush_tlb_current(vcpu);
4500*4882a593Smuzhiyun
4501*4882a593Smuzhiyun /*
4502*4882a593Smuzhiyun * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
4503*4882a593Smuzhiyun * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are
4504*4882a593Smuzhiyun * up-to-date before switching to L1.
4505*4882a593Smuzhiyun */
4506*4882a593Smuzhiyun if (enable_ept && is_pae_paging(vcpu))
4507*4882a593Smuzhiyun vmx_ept_load_pdptrs(vcpu);
4508*4882a593Smuzhiyun
4509*4882a593Smuzhiyun leave_guest_mode(vcpu);
4510*4882a593Smuzhiyun
4511*4882a593Smuzhiyun if (nested_cpu_has_preemption_timer(vmcs12))
4512*4882a593Smuzhiyun hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
4513*4882a593Smuzhiyun
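	/* Undo the L2 TSC offset that was applied at nested VM-entry. */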
4514*4882a593Smuzhiyun if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
4515*4882a593Smuzhiyun vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
4516*4882a593Smuzhiyun
4517*4882a593Smuzhiyun if (likely(!vmx->fail)) {
4518*4882a593Smuzhiyun sync_vmcs02_to_vmcs12(vcpu, vmcs12);
4519*4882a593Smuzhiyun
4520*4882a593Smuzhiyun if (vm_exit_reason != -1)
4521*4882a593Smuzhiyun prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
4522*4882a593Smuzhiyun exit_intr_info, exit_qualification);
4523*4882a593Smuzhiyun
4524*4882a593Smuzhiyun /*
4525*4882a593Smuzhiyun * Must happen outside of sync_vmcs02_to_vmcs12() as it will
4526*4882a593Smuzhiyun * also be used to capture vmcs12 cache as part of
4527*4882a593Smuzhiyun * capturing nVMX state for snapshot (migration).
4528*4882a593Smuzhiyun *
4529*4882a593Smuzhiyun * Otherwise, this flush will dirty guest memory at a
4530*4882a593Smuzhiyun * point it is already assumed by user-space to be
4531*4882a593Smuzhiyun * immutable.
4532*4882a593Smuzhiyun */
4533*4882a593Smuzhiyun nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
4534*4882a593Smuzhiyun } else {
4535*4882a593Smuzhiyun /*
4536*4882a593Smuzhiyun * The only expected VM-instruction error is "VM entry with
4537*4882a593Smuzhiyun * invalid control field(s)." Anything else indicates a
4538*4882a593Smuzhiyun * problem with L0. And we should never get here with a
4539*4882a593Smuzhiyun * VMFail of any type if early consistency checks are enabled.
4540*4882a593Smuzhiyun */
4541*4882a593Smuzhiyun WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
4542*4882a593Smuzhiyun VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4543*4882a593Smuzhiyun WARN_ON_ONCE(nested_early_check);
4544*4882a593Smuzhiyun }
4545*4882a593Smuzhiyun
4546*4882a593Smuzhiyun /*
4547*4882a593Smuzhiyun * Drop events/exceptions that were queued for re-injection to L2
4548*4882a593Smuzhiyun * (picked up via vmx_complete_interrupts()), as well as exceptions
4549*4882a593Smuzhiyun * that were pending for L2. Note, this must NOT be hoisted above
4550*4882a593Smuzhiyun 	 * prepare_vmcs12(), as events/exceptions queued for re-injection need to
4551*4882a593Smuzhiyun * be captured in vmcs12 (see vmcs12_save_pending_event()).
4552*4882a593Smuzhiyun */
4553*4882a593Smuzhiyun vcpu->arch.nmi_injected = false;
4554*4882a593Smuzhiyun kvm_clear_exception_queue(vcpu);
4555*4882a593Smuzhiyun kvm_clear_interrupt_queue(vcpu);
4556*4882a593Smuzhiyun
4557*4882a593Smuzhiyun vmx_switch_vmcs(vcpu, &vmx->vmcs01);
4558*4882a593Smuzhiyun
4559*4882a593Smuzhiyun /* Update any VMCS fields that might have changed while L2 ran */
4560*4882a593Smuzhiyun vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
4561*4882a593Smuzhiyun vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
4562*4882a593Smuzhiyun vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
4563*4882a593Smuzhiyun if (vmx->nested.l1_tpr_threshold != -1)
4564*4882a593Smuzhiyun vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
4565*4882a593Smuzhiyun
4566*4882a593Smuzhiyun if (kvm_has_tsc_control)
4567*4882a593Smuzhiyun decache_tsc_multiplier(vmx);
4568*4882a593Smuzhiyun
4569*4882a593Smuzhiyun if (vmx->nested.change_vmcs01_virtual_apic_mode) {
4570*4882a593Smuzhiyun vmx->nested.change_vmcs01_virtual_apic_mode = false;
4571*4882a593Smuzhiyun vmx_set_virtual_apic_mode(vcpu);
4572*4882a593Smuzhiyun }
4573*4882a593Smuzhiyun
4574*4882a593Smuzhiyun /* Unpin physical memory we referred to in vmcs02 */
4575*4882a593Smuzhiyun if (vmx->nested.apic_access_page) {
4576*4882a593Smuzhiyun kvm_release_page_clean(vmx->nested.apic_access_page);
4577*4882a593Smuzhiyun vmx->nested.apic_access_page = NULL;
4578*4882a593Smuzhiyun }
4579*4882a593Smuzhiyun kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
4580*4882a593Smuzhiyun kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
4581*4882a593Smuzhiyun vmx->nested.pi_desc = NULL;
4582*4882a593Smuzhiyun
4583*4882a593Smuzhiyun if (vmx->nested.reload_vmcs01_apic_access_page) {
4584*4882a593Smuzhiyun vmx->nested.reload_vmcs01_apic_access_page = false;
4585*4882a593Smuzhiyun kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4586*4882a593Smuzhiyun }
4587*4882a593Smuzhiyun
4588*4882a593Smuzhiyun if ((vm_exit_reason != -1) &&
4589*4882a593Smuzhiyun (enable_shadow_vmcs || vmx->nested.hv_evmcs))
4590*4882a593Smuzhiyun vmx->nested.need_vmcs12_to_shadow_sync = true;
4591*4882a593Smuzhiyun
4592*4882a593Smuzhiyun /* in case we halted in L2 */
4593*4882a593Smuzhiyun vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4594*4882a593Smuzhiyun
4595*4882a593Smuzhiyun if (likely(!vmx->fail)) {
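		/*
		 * If L1 requested "acknowledge interrupt on exit", acknowledge the
		 * pending external interrupt and record its vector in the VM-exit
		 * interruption-information field.
		 */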
4596*4882a593Smuzhiyun if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
4597*4882a593Smuzhiyun nested_exit_intr_ack_set(vcpu)) {
4598*4882a593Smuzhiyun int irq = kvm_cpu_get_interrupt(vcpu);
4599*4882a593Smuzhiyun WARN_ON(irq < 0);
4600*4882a593Smuzhiyun vmcs12->vm_exit_intr_info = irq |
4601*4882a593Smuzhiyun INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
4602*4882a593Smuzhiyun }
4603*4882a593Smuzhiyun
4604*4882a593Smuzhiyun if (vm_exit_reason != -1)
4605*4882a593Smuzhiyun trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
4606*4882a593Smuzhiyun vmcs12->exit_qualification,
4607*4882a593Smuzhiyun vmcs12->idt_vectoring_info_field,
4608*4882a593Smuzhiyun vmcs12->vm_exit_intr_info,
4609*4882a593Smuzhiyun vmcs12->vm_exit_intr_error_code,
4610*4882a593Smuzhiyun KVM_ISA_VMX);
4611*4882a593Smuzhiyun
4612*4882a593Smuzhiyun load_vmcs12_host_state(vcpu, vmcs12);
4613*4882a593Smuzhiyun
4614*4882a593Smuzhiyun return;
4615*4882a593Smuzhiyun }
4616*4882a593Smuzhiyun
4617*4882a593Smuzhiyun /*
4618*4882a593Smuzhiyun * After an early L2 VM-entry failure, we're now back
4619*4882a593Smuzhiyun * in L1 which thinks it just finished a VMLAUNCH or
4620*4882a593Smuzhiyun * VMRESUME instruction, so we need to set the failure
4621*4882a593Smuzhiyun * flag and the VM-instruction error field of the VMCS
4622*4882a593Smuzhiyun * accordingly, and skip the emulated instruction.
4623*4882a593Smuzhiyun */
4624*4882a593Smuzhiyun (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4625*4882a593Smuzhiyun
4626*4882a593Smuzhiyun /*
4627*4882a593Smuzhiyun * Restore L1's host state to KVM's software model. We're here
4628*4882a593Smuzhiyun * because a consistency check was caught by hardware, which
4629*4882a593Smuzhiyun * means some amount of guest state has been propagated to KVM's
4630*4882a593Smuzhiyun * model and needs to be unwound to the host's state.
4631*4882a593Smuzhiyun */
4632*4882a593Smuzhiyun nested_vmx_restore_host_state(vcpu);
4633*4882a593Smuzhiyun
4634*4882a593Smuzhiyun vmx->fail = 0;
4635*4882a593Smuzhiyun }
4636*4882a593Smuzhiyun
4637*4882a593Smuzhiyun /*
4638*4882a593Smuzhiyun * Decode the memory-address operand of a vmx instruction, as recorded on an
4639*4882a593Smuzhiyun * exit caused by such an instruction (run by a guest hypervisor).
4640*4882a593Smuzhiyun * On success, returns 0. When the operand is invalid, returns 1 and throws
4641*4882a593Smuzhiyun * #UD, #GP, or #SS.
4642*4882a593Smuzhiyun */
4643*4882a593Smuzhiyun int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
4644*4882a593Smuzhiyun u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
4645*4882a593Smuzhiyun {
4646*4882a593Smuzhiyun gva_t off;
4647*4882a593Smuzhiyun bool exn;
4648*4882a593Smuzhiyun struct kvm_segment s;
4649*4882a593Smuzhiyun
4650*4882a593Smuzhiyun /*
4651*4882a593Smuzhiyun * According to Vol. 3B, "Information for VM Exits Due to Instruction
4652*4882a593Smuzhiyun * Execution", on an exit, vmx_instruction_info holds most of the
4653*4882a593Smuzhiyun * addressing components of the operand. Only the displacement part
4654*4882a593Smuzhiyun * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4655*4882a593Smuzhiyun * For how an actual address is calculated from all these components,
4656*4882a593Smuzhiyun * refer to Vol. 1, "Operand Addressing".
4657*4882a593Smuzhiyun */
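	/*
	 * Bits 22 and 27 are "invalid" flags, i.e. set when the index/base
	 * register is *not* encoded, hence the negations below.
	 */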
4658*4882a593Smuzhiyun int scaling = vmx_instruction_info & 3;
4659*4882a593Smuzhiyun int addr_size = (vmx_instruction_info >> 7) & 7;
4660*4882a593Smuzhiyun bool is_reg = vmx_instruction_info & (1u << 10);
4661*4882a593Smuzhiyun int seg_reg = (vmx_instruction_info >> 15) & 7;
4662*4882a593Smuzhiyun int index_reg = (vmx_instruction_info >> 18) & 0xf;
4663*4882a593Smuzhiyun bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4664*4882a593Smuzhiyun int base_reg = (vmx_instruction_info >> 23) & 0xf;
4665*4882a593Smuzhiyun bool base_is_valid = !(vmx_instruction_info & (1u << 27));
4666*4882a593Smuzhiyun
4667*4882a593Smuzhiyun if (is_reg) {
4668*4882a593Smuzhiyun kvm_queue_exception(vcpu, UD_VECTOR);
4669*4882a593Smuzhiyun return 1;
4670*4882a593Smuzhiyun }
4671*4882a593Smuzhiyun
4672*4882a593Smuzhiyun /* Addr = segment_base + offset */
4673*4882a593Smuzhiyun /* offset = base + [index * scale] + displacement */
4674*4882a593Smuzhiyun off = exit_qualification; /* holds the displacement */
4675*4882a593Smuzhiyun if (addr_size == 1)
4676*4882a593Smuzhiyun off = (gva_t)sign_extend64(off, 31);
4677*4882a593Smuzhiyun else if (addr_size == 0)
4678*4882a593Smuzhiyun off = (gva_t)sign_extend64(off, 15);
4679*4882a593Smuzhiyun if (base_is_valid)
4680*4882a593Smuzhiyun off += kvm_register_readl(vcpu, base_reg);
4681*4882a593Smuzhiyun if (index_is_valid)
4682*4882a593Smuzhiyun off += kvm_register_readl(vcpu, index_reg) << scaling;
4683*4882a593Smuzhiyun vmx_get_segment(vcpu, &s, seg_reg);
4684*4882a593Smuzhiyun
4685*4882a593Smuzhiyun /*
4686*4882a593Smuzhiyun * The effective address, i.e. @off, of a memory operand is truncated
4687*4882a593Smuzhiyun * based on the address size of the instruction. Note that this is
4688*4882a593Smuzhiyun * the *effective address*, i.e. the address prior to accounting for
4689*4882a593Smuzhiyun * the segment's base.
4690*4882a593Smuzhiyun */
4691*4882a593Smuzhiyun if (addr_size == 1) /* 32 bit */
4692*4882a593Smuzhiyun off &= 0xffffffff;
4693*4882a593Smuzhiyun else if (addr_size == 0) /* 16 bit */
4694*4882a593Smuzhiyun off &= 0xffff;
4695*4882a593Smuzhiyun
4696*4882a593Smuzhiyun /* Checks for #GP/#SS exceptions. */
4697*4882a593Smuzhiyun exn = false;
4698*4882a593Smuzhiyun if (is_long_mode(vcpu)) {
4699*4882a593Smuzhiyun /*
4700*4882a593Smuzhiyun * The virtual/linear address is never truncated in 64-bit
4701*4882a593Smuzhiyun * mode, e.g. a 32-bit address size can yield a 64-bit virtual
4702*4882a593Smuzhiyun * address when using FS/GS with a non-zero base.
4703*4882a593Smuzhiyun */
4704*4882a593Smuzhiyun if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
4705*4882a593Smuzhiyun *ret = s.base + off;
4706*4882a593Smuzhiyun else
4707*4882a593Smuzhiyun *ret = off;
4708*4882a593Smuzhiyun
4709*4882a593Smuzhiyun /* Long mode: #GP(0)/#SS(0) if the memory address is in a
4710*4882a593Smuzhiyun * non-canonical form. This is the only check on the memory
4711*4882a593Smuzhiyun * destination for long mode!
4712*4882a593Smuzhiyun */
4713*4882a593Smuzhiyun exn = is_noncanonical_address(*ret, vcpu);
4714*4882a593Smuzhiyun } else {
4715*4882a593Smuzhiyun /*
4716*4882a593Smuzhiyun * When not in long mode, the virtual/linear address is
4717*4882a593Smuzhiyun * unconditionally truncated to 32 bits regardless of the
4718*4882a593Smuzhiyun * address size.
4719*4882a593Smuzhiyun */
4720*4882a593Smuzhiyun *ret = (s.base + off) & 0xffffffff;
4721*4882a593Smuzhiyun
4722*4882a593Smuzhiyun /* Protected mode: apply checks for segment validity in the
4723*4882a593Smuzhiyun * following order:
4724*4882a593Smuzhiyun * - segment type check (#GP(0) may be thrown)
4725*4882a593Smuzhiyun * - usability check (#GP(0)/#SS(0))
4726*4882a593Smuzhiyun * - limit check (#GP(0)/#SS(0))
4727*4882a593Smuzhiyun */
4728*4882a593Smuzhiyun if (wr)
4729*4882a593Smuzhiyun /* #GP(0) if the destination operand is located in a
4730*4882a593Smuzhiyun * read-only data segment or any code segment.
4731*4882a593Smuzhiyun */
4732*4882a593Smuzhiyun exn = ((s.type & 0xa) == 0 || (s.type & 8));
4733*4882a593Smuzhiyun else
4734*4882a593Smuzhiyun /* #GP(0) if the source operand is located in an
4735*4882a593Smuzhiyun * execute-only code segment
4736*4882a593Smuzhiyun */
4737*4882a593Smuzhiyun exn = ((s.type & 0xa) == 8);
4738*4882a593Smuzhiyun if (exn) {
4739*4882a593Smuzhiyun kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4740*4882a593Smuzhiyun return 1;
4741*4882a593Smuzhiyun }
4742*4882a593Smuzhiyun /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
4743*4882a593Smuzhiyun */
4744*4882a593Smuzhiyun exn = (s.unusable != 0);
4745*4882a593Smuzhiyun
4746*4882a593Smuzhiyun /*
4747*4882a593Smuzhiyun * Protected mode: #GP(0)/#SS(0) if the memory operand is
4748*4882a593Smuzhiyun * outside the segment limit. All CPUs that support VMX ignore
4749*4882a593Smuzhiyun * limit checks for flat segments, i.e. segments with base==0,
4750*4882a593Smuzhiyun * limit==0xffffffff and of type expand-up data or code.
4751*4882a593Smuzhiyun */
4752*4882a593Smuzhiyun if (!(s.base == 0 && s.limit == 0xffffffff &&
4753*4882a593Smuzhiyun ((s.type & 8) || !(s.type & 4))))
4754*4882a593Smuzhiyun exn = exn || ((u64)off + len - 1 > s.limit);
4755*4882a593Smuzhiyun }
4756*4882a593Smuzhiyun if (exn) {
4757*4882a593Smuzhiyun kvm_queue_exception_e(vcpu,
4758*4882a593Smuzhiyun seg_reg == VCPU_SREG_SS ?
4759*4882a593Smuzhiyun SS_VECTOR : GP_VECTOR,
4760*4882a593Smuzhiyun 0);
4761*4882a593Smuzhiyun return 1;
4762*4882a593Smuzhiyun }
4763*4882a593Smuzhiyun
4764*4882a593Smuzhiyun return 0;
4765*4882a593Smuzhiyun }
4766*4882a593Smuzhiyun
4767*4882a593Smuzhiyun void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
4768*4882a593Smuzhiyun {
4769*4882a593Smuzhiyun struct vcpu_vmx *vmx;
4770*4882a593Smuzhiyun
4771*4882a593Smuzhiyun if (!nested_vmx_allowed(vcpu))
4772*4882a593Smuzhiyun return;
4773*4882a593Smuzhiyun
4774*4882a593Smuzhiyun vmx = to_vmx(vcpu);
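	/*
	 * Advertise the PERF_GLOBAL_CTRL VM-entry/VM-exit load controls to L1
	 * only if the vPMU exposes MSR_CORE_PERF_GLOBAL_CTRL.
	 */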
4775*4882a593Smuzhiyun if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
4776*4882a593Smuzhiyun vmx->nested.msrs.entry_ctls_high |=
4777*4882a593Smuzhiyun VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4778*4882a593Smuzhiyun vmx->nested.msrs.exit_ctls_high |=
4779*4882a593Smuzhiyun VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4780*4882a593Smuzhiyun } else {
4781*4882a593Smuzhiyun vmx->nested.msrs.entry_ctls_high &=
4782*4882a593Smuzhiyun ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4783*4882a593Smuzhiyun vmx->nested.msrs.exit_ctls_high &=
4784*4882a593Smuzhiyun ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4785*4882a593Smuzhiyun }
4786*4882a593Smuzhiyun }
4787*4882a593Smuzhiyun
4788*4882a593Smuzhiyun static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
4789*4882a593Smuzhiyun int *ret)
4790*4882a593Smuzhiyun {
4791*4882a593Smuzhiyun gva_t gva;
4792*4882a593Smuzhiyun struct x86_exception e;
4793*4882a593Smuzhiyun int r;
4794*4882a593Smuzhiyun
4795*4882a593Smuzhiyun if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
4796*4882a593Smuzhiyun vmcs_read32(VMX_INSTRUCTION_INFO), false,
4797*4882a593Smuzhiyun sizeof(*vmpointer), &gva)) {
4798*4882a593Smuzhiyun *ret = 1;
4799*4882a593Smuzhiyun return -EINVAL;
4800*4882a593Smuzhiyun }
4801*4882a593Smuzhiyun
4802*4882a593Smuzhiyun r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
4803*4882a593Smuzhiyun if (r != X86EMUL_CONTINUE) {
4804*4882a593Smuzhiyun *ret = kvm_handle_memory_failure(vcpu, r, &e);
4805*4882a593Smuzhiyun return -EINVAL;
4806*4882a593Smuzhiyun }
4807*4882a593Smuzhiyun
4808*4882a593Smuzhiyun return 0;
4809*4882a593Smuzhiyun }
4810*4882a593Smuzhiyun
4811*4882a593Smuzhiyun /*
4812*4882a593Smuzhiyun * Allocate a shadow VMCS and associate it with the currently loaded
4813*4882a593Smuzhiyun * VMCS, unless such a shadow VMCS already exists. The newly allocated
4814*4882a593Smuzhiyun * VMCS is also VMCLEARed, so that it is ready for use.
4815*4882a593Smuzhiyun */
4816*4882a593Smuzhiyun static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
4817*4882a593Smuzhiyun {
4818*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
4819*4882a593Smuzhiyun struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
4820*4882a593Smuzhiyun
4821*4882a593Smuzhiyun /*
4822*4882a593Smuzhiyun * We should allocate a shadow vmcs for vmcs01 only when L1
4823*4882a593Smuzhiyun * executes VMXON and free it when L1 executes VMXOFF.
4824*4882a593Smuzhiyun * As it is invalid to execute VMXON twice, we shouldn't reach
4825*4882a593Smuzhiyun 	 * here when vmcs01 already has an allocated shadow vmcs.
4826*4882a593Smuzhiyun */
4827*4882a593Smuzhiyun WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
4828*4882a593Smuzhiyun
4829*4882a593Smuzhiyun if (!loaded_vmcs->shadow_vmcs) {
4830*4882a593Smuzhiyun loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
4831*4882a593Smuzhiyun if (loaded_vmcs->shadow_vmcs)
4832*4882a593Smuzhiyun vmcs_clear(loaded_vmcs->shadow_vmcs);
4833*4882a593Smuzhiyun }
4834*4882a593Smuzhiyun return loaded_vmcs->shadow_vmcs;
4835*4882a593Smuzhiyun }
4836*4882a593Smuzhiyun
4837*4882a593Smuzhiyun static int enter_vmx_operation(struct kvm_vcpu *vcpu)
4838*4882a593Smuzhiyun {
4839*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
4840*4882a593Smuzhiyun int r;
4841*4882a593Smuzhiyun
4842*4882a593Smuzhiyun r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
4843*4882a593Smuzhiyun if (r < 0)
4844*4882a593Smuzhiyun goto out_vmcs02;
4845*4882a593Smuzhiyun
4846*4882a593Smuzhiyun vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4847*4882a593Smuzhiyun if (!vmx->nested.cached_vmcs12)
4848*4882a593Smuzhiyun goto out_cached_vmcs12;
4849*4882a593Smuzhiyun
4850*4882a593Smuzhiyun vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4851*4882a593Smuzhiyun if (!vmx->nested.cached_shadow_vmcs12)
4852*4882a593Smuzhiyun goto out_cached_shadow_vmcs12;
4853*4882a593Smuzhiyun
4854*4882a593Smuzhiyun if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
4855*4882a593Smuzhiyun goto out_shadow_vmcs;
4856*4882a593Smuzhiyun
4857*4882a593Smuzhiyun hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
4858*4882a593Smuzhiyun HRTIMER_MODE_ABS_PINNED);
4859*4882a593Smuzhiyun vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
4860*4882a593Smuzhiyun
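	/* vpid02 is the VPID used while running L2, kept distinct from L1's VPID. */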
4861*4882a593Smuzhiyun vmx->nested.vpid02 = allocate_vpid();
4862*4882a593Smuzhiyun
4863*4882a593Smuzhiyun vmx->nested.vmcs02_initialized = false;
4864*4882a593Smuzhiyun vmx->nested.vmxon = true;
4865*4882a593Smuzhiyun
4866*4882a593Smuzhiyun if (vmx_pt_mode_is_host_guest()) {
4867*4882a593Smuzhiyun vmx->pt_desc.guest.ctl = 0;
4868*4882a593Smuzhiyun pt_update_intercept_for_msr(vcpu);
4869*4882a593Smuzhiyun }
4870*4882a593Smuzhiyun
4871*4882a593Smuzhiyun return 0;
4872*4882a593Smuzhiyun
4873*4882a593Smuzhiyun out_shadow_vmcs:
4874*4882a593Smuzhiyun kfree(vmx->nested.cached_shadow_vmcs12);
4875*4882a593Smuzhiyun
4876*4882a593Smuzhiyun out_cached_shadow_vmcs12:
4877*4882a593Smuzhiyun kfree(vmx->nested.cached_vmcs12);
4878*4882a593Smuzhiyun
4879*4882a593Smuzhiyun out_cached_vmcs12:
4880*4882a593Smuzhiyun free_loaded_vmcs(&vmx->nested.vmcs02);
4881*4882a593Smuzhiyun
4882*4882a593Smuzhiyun out_vmcs02:
4883*4882a593Smuzhiyun return -ENOMEM;
4884*4882a593Smuzhiyun }
4885*4882a593Smuzhiyun
4886*4882a593Smuzhiyun /*
4887*4882a593Smuzhiyun * Emulate the VMXON instruction.
4888*4882a593Smuzhiyun * Currently, we just remember that VMX is active, and do not save or even
4889*4882a593Smuzhiyun * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4890*4882a593Smuzhiyun * do not currently need to store anything in that guest-allocated memory
4891*4882a593Smuzhiyun  * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
4892*4882a593Smuzhiyun * argument is different from the VMXON pointer (which the spec says they do).
4893*4882a593Smuzhiyun */
4894*4882a593Smuzhiyun static int handle_vmon(struct kvm_vcpu *vcpu)
4895*4882a593Smuzhiyun {
4896*4882a593Smuzhiyun int ret;
4897*4882a593Smuzhiyun gpa_t vmptr;
4898*4882a593Smuzhiyun uint32_t revision;
4899*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
4900*4882a593Smuzhiyun const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
4901*4882a593Smuzhiyun | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
4902*4882a593Smuzhiyun
4903*4882a593Smuzhiyun /*
4904*4882a593Smuzhiyun * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks
4905*4882a593Smuzhiyun * that have higher priority than VM-Exit (see Intel SDM's pseudocode
4906*4882a593Smuzhiyun * for VMXON), as KVM must load valid CR0/CR4 values into hardware while
4907*4882a593Smuzhiyun * running the guest, i.e. KVM needs to check the _guest_ values.
4908*4882a593Smuzhiyun *
4909*4882a593Smuzhiyun * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and
4910*4882a593Smuzhiyun * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real
4911*4882a593Smuzhiyun * Mode, but KVM will never take the guest out of those modes.
4912*4882a593Smuzhiyun */
4913*4882a593Smuzhiyun if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
4914*4882a593Smuzhiyun !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
4915*4882a593Smuzhiyun kvm_queue_exception(vcpu, UD_VECTOR);
4916*4882a593Smuzhiyun return 1;
4917*4882a593Smuzhiyun }
4918*4882a593Smuzhiyun
4919*4882a593Smuzhiyun /*
4920*4882a593Smuzhiyun * CPL=0 and all other checks that are lower priority than VM-Exit must
4921*4882a593Smuzhiyun * be checked manually.
4922*4882a593Smuzhiyun */
4923*4882a593Smuzhiyun if (vmx_get_cpl(vcpu)) {
4924*4882a593Smuzhiyun kvm_inject_gp(vcpu, 0);
4925*4882a593Smuzhiyun return 1;
4926*4882a593Smuzhiyun }
4927*4882a593Smuzhiyun
4928*4882a593Smuzhiyun if (vmx->nested.vmxon)
4929*4882a593Smuzhiyun return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
4930*4882a593Smuzhiyun
4931*4882a593Smuzhiyun if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
4932*4882a593Smuzhiyun != VMXON_NEEDED_FEATURES) {
4933*4882a593Smuzhiyun kvm_inject_gp(vcpu, 0);
4934*4882a593Smuzhiyun return 1;
4935*4882a593Smuzhiyun }
4936*4882a593Smuzhiyun
4937*4882a593Smuzhiyun if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret))
4938*4882a593Smuzhiyun return ret;
4939*4882a593Smuzhiyun
4940*4882a593Smuzhiyun /*
4941*4882a593Smuzhiyun * SDM 3: 24.11.5
4942*4882a593Smuzhiyun 	 * The first 4 bytes of the VMXON region contain the supported
4943*4882a593Smuzhiyun * VMCS revision identifier
4944*4882a593Smuzhiyun *
4945*4882a593Smuzhiyun 	 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case,
4946*4882a593Smuzhiyun 	 * as setting that bit would limit physical addresses to 32 bits.
4947*4882a593Smuzhiyun */
4948*4882a593Smuzhiyun if (!page_address_valid(vcpu, vmptr))
4949*4882a593Smuzhiyun return nested_vmx_failInvalid(vcpu);
4950*4882a593Smuzhiyun
4951*4882a593Smuzhiyun if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
4952*4882a593Smuzhiyun revision != VMCS12_REVISION)
4953*4882a593Smuzhiyun return nested_vmx_failInvalid(vcpu);
4954*4882a593Smuzhiyun
4955*4882a593Smuzhiyun vmx->nested.vmxon_ptr = vmptr;
4956*4882a593Smuzhiyun ret = enter_vmx_operation(vcpu);
4957*4882a593Smuzhiyun if (ret)
4958*4882a593Smuzhiyun return ret;
4959*4882a593Smuzhiyun
4960*4882a593Smuzhiyun return nested_vmx_succeed(vcpu);
4961*4882a593Smuzhiyun }
4962*4882a593Smuzhiyun
4963*4882a593Smuzhiyun static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
4964*4882a593Smuzhiyun {
4965*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
4966*4882a593Smuzhiyun
4967*4882a593Smuzhiyun if (vmx->nested.current_vmptr == -1ull)
4968*4882a593Smuzhiyun return;
4969*4882a593Smuzhiyun
4970*4882a593Smuzhiyun copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
4971*4882a593Smuzhiyun
4972*4882a593Smuzhiyun if (enable_shadow_vmcs) {
4973*4882a593Smuzhiyun /* copy to memory all shadowed fields in case
4974*4882a593Smuzhiyun they were modified */
4975*4882a593Smuzhiyun copy_shadow_to_vmcs12(vmx);
4976*4882a593Smuzhiyun vmx_disable_shadow_vmcs(vmx);
4977*4882a593Smuzhiyun }
4978*4882a593Smuzhiyun vmx->nested.posted_intr_nv = -1;
4979*4882a593Smuzhiyun
4980*4882a593Smuzhiyun /* Flush VMCS12 to guest memory */
4981*4882a593Smuzhiyun kvm_vcpu_write_guest_page(vcpu,
4982*4882a593Smuzhiyun vmx->nested.current_vmptr >> PAGE_SHIFT,
4983*4882a593Smuzhiyun vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
4984*4882a593Smuzhiyun
4985*4882a593Smuzhiyun kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
4986*4882a593Smuzhiyun
4987*4882a593Smuzhiyun vmx->nested.current_vmptr = -1ull;
4988*4882a593Smuzhiyun }
4989*4882a593Smuzhiyun
4990*4882a593Smuzhiyun /* Emulate the VMXOFF instruction */
4991*4882a593Smuzhiyun static int handle_vmoff(struct kvm_vcpu *vcpu)
4992*4882a593Smuzhiyun {
4993*4882a593Smuzhiyun if (!nested_vmx_check_permission(vcpu))
4994*4882a593Smuzhiyun return 1;
4995*4882a593Smuzhiyun
4996*4882a593Smuzhiyun free_nested(vcpu);
4997*4882a593Smuzhiyun
4998*4882a593Smuzhiyun /* Process a latched INIT during time CPU was in VMX operation */
4999*4882a593Smuzhiyun kvm_make_request(KVM_REQ_EVENT, vcpu);
5000*4882a593Smuzhiyun
5001*4882a593Smuzhiyun return nested_vmx_succeed(vcpu);
5002*4882a593Smuzhiyun }
5003*4882a593Smuzhiyun
5004*4882a593Smuzhiyun /* Emulate the VMCLEAR instruction */
5005*4882a593Smuzhiyun static int handle_vmclear(struct kvm_vcpu *vcpu)
5006*4882a593Smuzhiyun {
5007*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
5008*4882a593Smuzhiyun u32 zero = 0;
5009*4882a593Smuzhiyun gpa_t vmptr;
5010*4882a593Smuzhiyun u64 evmcs_gpa;
5011*4882a593Smuzhiyun int r;
5012*4882a593Smuzhiyun
5013*4882a593Smuzhiyun if (!nested_vmx_check_permission(vcpu))
5014*4882a593Smuzhiyun return 1;
5015*4882a593Smuzhiyun
5016*4882a593Smuzhiyun if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5017*4882a593Smuzhiyun return r;
5018*4882a593Smuzhiyun
5019*4882a593Smuzhiyun if (!page_address_valid(vcpu, vmptr))
5020*4882a593Smuzhiyun return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
5021*4882a593Smuzhiyun
5022*4882a593Smuzhiyun if (vmptr == vmx->nested.vmxon_ptr)
5023*4882a593Smuzhiyun return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
5024*4882a593Smuzhiyun
5025*4882a593Smuzhiyun /*
5026*4882a593Smuzhiyun * When Enlightened VMEntry is enabled on the calling CPU we treat
5027*4882a593Smuzhiyun 	 * memory area pointed to by vmptr as an Enlightened VMCS (as there's no good
5028*4882a593Smuzhiyun * way to distinguish it from VMCS12) and we must not corrupt it by
5029*4882a593Smuzhiyun * writing to the non-existent 'launch_state' field. The area doesn't
5030*4882a593Smuzhiyun * have to be the currently active EVMCS on the calling CPU and there's
5031*4882a593Smuzhiyun * nothing KVM has to do to transition it from 'active' to 'non-active'
5032*4882a593Smuzhiyun * state. It is possible that the area will stay mapped as
5033*4882a593Smuzhiyun * vmx->nested.hv_evmcs but this shouldn't be a problem.
5034*4882a593Smuzhiyun */
5035*4882a593Smuzhiyun if (likely(!vmx->nested.enlightened_vmcs_enabled ||
5036*4882a593Smuzhiyun !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
5037*4882a593Smuzhiyun if (vmptr == vmx->nested.current_vmptr)
5038*4882a593Smuzhiyun nested_release_vmcs12(vcpu);
5039*4882a593Smuzhiyun
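		/* Clear launch_state so that L1 must VMLAUNCH (not VMRESUME) this VMCS. */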
5040*4882a593Smuzhiyun kvm_vcpu_write_guest(vcpu,
5041*4882a593Smuzhiyun vmptr + offsetof(struct vmcs12,
5042*4882a593Smuzhiyun launch_state),
5043*4882a593Smuzhiyun &zero, sizeof(zero));
5044*4882a593Smuzhiyun }
5045*4882a593Smuzhiyun
5046*4882a593Smuzhiyun return nested_vmx_succeed(vcpu);
5047*4882a593Smuzhiyun }
5048*4882a593Smuzhiyun
5049*4882a593Smuzhiyun /* Emulate the VMLAUNCH instruction */
5050*4882a593Smuzhiyun static int handle_vmlaunch(struct kvm_vcpu *vcpu)
5051*4882a593Smuzhiyun {
5052*4882a593Smuzhiyun return nested_vmx_run(vcpu, true);
5053*4882a593Smuzhiyun }
5054*4882a593Smuzhiyun
5055*4882a593Smuzhiyun /* Emulate the VMRESUME instruction */
5056*4882a593Smuzhiyun static int handle_vmresume(struct kvm_vcpu *vcpu)
5057*4882a593Smuzhiyun {
5058*4882a593Smuzhiyun
5059*4882a593Smuzhiyun return nested_vmx_run(vcpu, false);
5060*4882a593Smuzhiyun }
5061*4882a593Smuzhiyun
5062*4882a593Smuzhiyun static int handle_vmread(struct kvm_vcpu *vcpu)
5063*4882a593Smuzhiyun {
5064*4882a593Smuzhiyun struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5065*4882a593Smuzhiyun : get_vmcs12(vcpu);
5066*4882a593Smuzhiyun unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5067*4882a593Smuzhiyun u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5068*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
5069*4882a593Smuzhiyun struct x86_exception e;
5070*4882a593Smuzhiyun unsigned long field;
5071*4882a593Smuzhiyun u64 value;
5072*4882a593Smuzhiyun gva_t gva = 0;
5073*4882a593Smuzhiyun short offset;
5074*4882a593Smuzhiyun int len, r;
5075*4882a593Smuzhiyun
5076*4882a593Smuzhiyun if (!nested_vmx_check_permission(vcpu))
5077*4882a593Smuzhiyun return 1;
5078*4882a593Smuzhiyun
5079*4882a593Smuzhiyun /*
5080*4882a593Smuzhiyun * In VMX non-root operation, when the VMCS-link pointer is -1ull,
5081*4882a593Smuzhiyun * any VMREAD sets the ALU flags for VMfailInvalid.
5082*4882a593Smuzhiyun */
5083*4882a593Smuzhiyun if (vmx->nested.current_vmptr == -1ull ||
5084*4882a593Smuzhiyun (is_guest_mode(vcpu) &&
5085*4882a593Smuzhiyun get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
5086*4882a593Smuzhiyun return nested_vmx_failInvalid(vcpu);
5087*4882a593Smuzhiyun
5088*4882a593Smuzhiyun /* Decode instruction info and find the field to read */
5089*4882a593Smuzhiyun field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
5090*4882a593Smuzhiyun
5091*4882a593Smuzhiyun offset = vmcs_field_to_offset(field);
5092*4882a593Smuzhiyun if (offset < 0)
5093*4882a593Smuzhiyun return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5094*4882a593Smuzhiyun
5095*4882a593Smuzhiyun if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
5096*4882a593Smuzhiyun copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5097*4882a593Smuzhiyun
5098*4882a593Smuzhiyun /* Read the field, zero-extended to a u64 value */
5099*4882a593Smuzhiyun value = vmcs12_read_any(vmcs12, field, offset);
5100*4882a593Smuzhiyun
5101*4882a593Smuzhiyun /*
5102*4882a593Smuzhiyun * Now copy part of this value to register or memory, as requested.
5103*4882a593Smuzhiyun * Note that the number of bits actually copied is 32 or 64 depending
5104*4882a593Smuzhiyun * on the guest's mode (32 or 64 bit), not on the given field's length.
5105*4882a593Smuzhiyun */
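	/* Bit 10 of the instruction info is set for a register (vs. memory) destination. */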
5106*4882a593Smuzhiyun if (instr_info & BIT(10)) {
5107*4882a593Smuzhiyun kvm_register_writel(vcpu, (((instr_info) >> 3) & 0xf), value);
5108*4882a593Smuzhiyun } else {
5109*4882a593Smuzhiyun len = is_64_bit_mode(vcpu) ? 8 : 4;
5110*4882a593Smuzhiyun if (get_vmx_mem_address(vcpu, exit_qualification,
5111*4882a593Smuzhiyun instr_info, true, len, &gva))
5112*4882a593Smuzhiyun return 1;
5113*4882a593Smuzhiyun /* _system ok, nested_vmx_check_permission has verified cpl=0 */
5114*4882a593Smuzhiyun r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
5115*4882a593Smuzhiyun if (r != X86EMUL_CONTINUE)
5116*4882a593Smuzhiyun return kvm_handle_memory_failure(vcpu, r, &e);
5117*4882a593Smuzhiyun }
5118*4882a593Smuzhiyun
5119*4882a593Smuzhiyun return nested_vmx_succeed(vcpu);
5120*4882a593Smuzhiyun }
5121*4882a593Smuzhiyun
5122*4882a593Smuzhiyun static bool is_shadow_field_rw(unsigned long field)
5123*4882a593Smuzhiyun {
5124*4882a593Smuzhiyun switch (field) {
5125*4882a593Smuzhiyun #define SHADOW_FIELD_RW(x, y) case x:
5126*4882a593Smuzhiyun #include "vmcs_shadow_fields.h"
5127*4882a593Smuzhiyun return true;
5128*4882a593Smuzhiyun default:
5129*4882a593Smuzhiyun break;
5130*4882a593Smuzhiyun }
5131*4882a593Smuzhiyun return false;
5132*4882a593Smuzhiyun }
5133*4882a593Smuzhiyun
5134*4882a593Smuzhiyun static bool is_shadow_field_ro(unsigned long field)
5135*4882a593Smuzhiyun {
5136*4882a593Smuzhiyun switch (field) {
5137*4882a593Smuzhiyun #define SHADOW_FIELD_RO(x, y) case x:
5138*4882a593Smuzhiyun #include "vmcs_shadow_fields.h"
5139*4882a593Smuzhiyun return true;
5140*4882a593Smuzhiyun default:
5141*4882a593Smuzhiyun break;
5142*4882a593Smuzhiyun }
5143*4882a593Smuzhiyun return false;
5144*4882a593Smuzhiyun }
5145*4882a593Smuzhiyun
5146*4882a593Smuzhiyun static int handle_vmwrite(struct kvm_vcpu *vcpu)
5147*4882a593Smuzhiyun {
5148*4882a593Smuzhiyun struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5149*4882a593Smuzhiyun : get_vmcs12(vcpu);
5150*4882a593Smuzhiyun unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5151*4882a593Smuzhiyun u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5152*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
5153*4882a593Smuzhiyun struct x86_exception e;
5154*4882a593Smuzhiyun unsigned long field;
5155*4882a593Smuzhiyun short offset;
5156*4882a593Smuzhiyun gva_t gva;
5157*4882a593Smuzhiyun int len, r;
5158*4882a593Smuzhiyun
5159*4882a593Smuzhiyun /*
5160*4882a593Smuzhiyun * The value to write might be 32 or 64 bits, depending on L1's long
5161*4882a593Smuzhiyun * mode, and eventually we need to write that into a field of several
5162*4882a593Smuzhiyun * possible lengths. The code below first zero-extends the value to 64
5163*4882a593Smuzhiyun * bit (value), and then copies only the appropriate number of
5164*4882a593Smuzhiyun * bits into the vmcs12 field.
5165*4882a593Smuzhiyun */
5166*4882a593Smuzhiyun u64 value = 0;
5167*4882a593Smuzhiyun
5168*4882a593Smuzhiyun if (!nested_vmx_check_permission(vcpu))
5169*4882a593Smuzhiyun return 1;
5170*4882a593Smuzhiyun
5171*4882a593Smuzhiyun /*
5172*4882a593Smuzhiyun * In VMX non-root operation, when the VMCS-link pointer is -1ull,
5173*4882a593Smuzhiyun * any VMWRITE sets the ALU flags for VMfailInvalid.
5174*4882a593Smuzhiyun */
5175*4882a593Smuzhiyun if (vmx->nested.current_vmptr == -1ull ||
5176*4882a593Smuzhiyun (is_guest_mode(vcpu) &&
5177*4882a593Smuzhiyun get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
5178*4882a593Smuzhiyun return nested_vmx_failInvalid(vcpu);
5179*4882a593Smuzhiyun
5180*4882a593Smuzhiyun if (instr_info & BIT(10))
5181*4882a593Smuzhiyun value = kvm_register_readl(vcpu, (((instr_info) >> 3) & 0xf));
5182*4882a593Smuzhiyun else {
5183*4882a593Smuzhiyun len = is_64_bit_mode(vcpu) ? 8 : 4;
5184*4882a593Smuzhiyun if (get_vmx_mem_address(vcpu, exit_qualification,
5185*4882a593Smuzhiyun instr_info, false, len, &gva))
5186*4882a593Smuzhiyun return 1;
5187*4882a593Smuzhiyun r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
5188*4882a593Smuzhiyun if (r != X86EMUL_CONTINUE)
5189*4882a593Smuzhiyun return kvm_handle_memory_failure(vcpu, r, &e);
5190*4882a593Smuzhiyun }
5191*4882a593Smuzhiyun
5192*4882a593Smuzhiyun field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
5193*4882a593Smuzhiyun
5194*4882a593Smuzhiyun offset = vmcs_field_to_offset(field);
5195*4882a593Smuzhiyun if (offset < 0)
5196*4882a593Smuzhiyun return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5197*4882a593Smuzhiyun
5198*4882a593Smuzhiyun /*
5199*4882a593Smuzhiyun * If the vCPU supports "VMWRITE to any supported field in the
5200*4882a593Smuzhiyun * VMCS," then the "read-only" fields are actually read/write.
5201*4882a593Smuzhiyun */
5202*4882a593Smuzhiyun if (vmcs_field_readonly(field) &&
5203*4882a593Smuzhiyun !nested_cpu_has_vmwrite_any_field(vcpu))
5204*4882a593Smuzhiyun return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
5205*4882a593Smuzhiyun
5206*4882a593Smuzhiyun /*
5207*4882a593Smuzhiyun * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
5208*4882a593Smuzhiyun 	 * vmcs12, else we may clobber a field or consume a stale value.
5209*4882a593Smuzhiyun */
5210*4882a593Smuzhiyun if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
5211*4882a593Smuzhiyun copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5212*4882a593Smuzhiyun
5213*4882a593Smuzhiyun /*
5214*4882a593Smuzhiyun * Some Intel CPUs intentionally drop the reserved bits of the AR byte
5215*4882a593Smuzhiyun * fields on VMWRITE. Emulate this behavior to ensure consistent KVM
5216*4882a593Smuzhiyun * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
5217*4882a593Smuzhiyun * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
5218*4882a593Smuzhiyun * from L1 will return a different value than VMREAD from L2 (L1 sees
5219*4882a593Smuzhiyun * the stripped down value, L2 sees the full value as stored by KVM).
5220*4882a593Smuzhiyun */
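/*
 * 0x1f0ff keeps the segment type, S, DPL, P, AVL, L, D/B, G and
 * "unusable" bits and clears the architecturally reserved AR bits.
 */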
5221*4882a593Smuzhiyun if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
5222*4882a593Smuzhiyun value &= 0x1f0ff;
5223*4882a593Smuzhiyun
5224*4882a593Smuzhiyun vmcs12_write_any(vmcs12, field, offset, value);
5225*4882a593Smuzhiyun
5226*4882a593Smuzhiyun /*
5227*4882a593Smuzhiyun  * Do not track vmcs12 dirty-state if in guest-mode, as we actually
5228*4882a593Smuzhiyun  * dirty the shadow vmcs12 instead of vmcs12.  Fields that can be updated
5229*4882a593Smuzhiyun  * by L1 without a vmexit are always updated in the vmcs02, i.e. they don't
5230*4882a593Smuzhiyun  * "dirty" vmcs12; all others go down the prepare_vmcs02() slow path.
5231*4882a593Smuzhiyun */
5232*4882a593Smuzhiyun if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
5233*4882a593Smuzhiyun /*
5234*4882a593Smuzhiyun  * L1 can read these fields without exiting, so ensure the
5235*4882a593Smuzhiyun * shadow VMCS is up-to-date.
5236*4882a593Smuzhiyun */
5237*4882a593Smuzhiyun if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
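/*
 * Temporarily load vmcs01's shadow VMCS so the field can be written
 * directly; preemption is disabled so the CPU's current-VMCS pointer
 * cannot be switched out from under us.
 */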
5238*4882a593Smuzhiyun preempt_disable();
5239*4882a593Smuzhiyun vmcs_load(vmx->vmcs01.shadow_vmcs);
5240*4882a593Smuzhiyun
5241*4882a593Smuzhiyun __vmcs_writel(field, value);
5242*4882a593Smuzhiyun
5243*4882a593Smuzhiyun vmcs_clear(vmx->vmcs01.shadow_vmcs);
5244*4882a593Smuzhiyun vmcs_load(vmx->loaded_vmcs->vmcs);
5245*4882a593Smuzhiyun preempt_enable();
5246*4882a593Smuzhiyun }
5247*4882a593Smuzhiyun vmx->nested.dirty_vmcs12 = true;
5248*4882a593Smuzhiyun }
5249*4882a593Smuzhiyun
5250*4882a593Smuzhiyun return nested_vmx_succeed(vcpu);
5251*4882a593Smuzhiyun }
5252*4882a593Smuzhiyun
5253*4882a593Smuzhiyun static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
5254*4882a593Smuzhiyun {
5255*4882a593Smuzhiyun vmx->nested.current_vmptr = vmptr;
5256*4882a593Smuzhiyun if (enable_shadow_vmcs) {
5257*4882a593Smuzhiyun secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
5258*4882a593Smuzhiyun vmcs_write64(VMCS_LINK_POINTER,
5259*4882a593Smuzhiyun __pa(vmx->vmcs01.shadow_vmcs));
5260*4882a593Smuzhiyun vmx->nested.need_vmcs12_to_shadow_sync = true;
5261*4882a593Smuzhiyun }
5262*4882a593Smuzhiyun vmx->nested.dirty_vmcs12 = true;
5263*4882a593Smuzhiyun }
5264*4882a593Smuzhiyun
5265*4882a593Smuzhiyun /* Emulate the VMPTRLD instruction */
5266*4882a593Smuzhiyun static int handle_vmptrld(struct kvm_vcpu *vcpu)
5267*4882a593Smuzhiyun {
5268*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
5269*4882a593Smuzhiyun gpa_t vmptr;
5270*4882a593Smuzhiyun int r;
5271*4882a593Smuzhiyun
5272*4882a593Smuzhiyun if (!nested_vmx_check_permission(vcpu))
5273*4882a593Smuzhiyun return 1;
5274*4882a593Smuzhiyun
5275*4882a593Smuzhiyun if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5276*4882a593Smuzhiyun return r;
5277*4882a593Smuzhiyun
5278*4882a593Smuzhiyun if (!page_address_valid(vcpu, vmptr))
5279*4882a593Smuzhiyun return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
5280*4882a593Smuzhiyun
5281*4882a593Smuzhiyun if (vmptr == vmx->nested.vmxon_ptr)
5282*4882a593Smuzhiyun return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
5283*4882a593Smuzhiyun
5284*4882a593Smuzhiyun /* Forbid normal VMPTRLD if Enlightened version was used */
5285*4882a593Smuzhiyun if (vmx->nested.hv_evmcs)
5286*4882a593Smuzhiyun return 1;
5287*4882a593Smuzhiyun
5288*4882a593Smuzhiyun if (vmx->nested.current_vmptr != vmptr) {
5289*4882a593Smuzhiyun struct kvm_host_map map;
5290*4882a593Smuzhiyun struct vmcs12 *new_vmcs12;
5291*4882a593Smuzhiyun
5292*4882a593Smuzhiyun if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
5293*4882a593Smuzhiyun /*
5294*4882a593Smuzhiyun * Reads from an unbacked page return all 1s,
5295*4882a593Smuzhiyun * which means that the 32 bits located at the
5296*4882a593Smuzhiyun * given physical address won't match the required
5297*4882a593Smuzhiyun * VMCS12_REVISION identifier.
5298*4882a593Smuzhiyun */
5299*4882a593Smuzhiyun return nested_vmx_fail(vcpu,
5300*4882a593Smuzhiyun VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5301*4882a593Smuzhiyun }
5302*4882a593Smuzhiyun
5303*4882a593Smuzhiyun new_vmcs12 = map.hva;
5304*4882a593Smuzhiyun
5305*4882a593Smuzhiyun if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
5306*4882a593Smuzhiyun (new_vmcs12->hdr.shadow_vmcs &&
5307*4882a593Smuzhiyun !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
5308*4882a593Smuzhiyun kvm_vcpu_unmap(vcpu, &map, false);
5309*4882a593Smuzhiyun return nested_vmx_fail(vcpu,
5310*4882a593Smuzhiyun VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5311*4882a593Smuzhiyun }
5312*4882a593Smuzhiyun
5313*4882a593Smuzhiyun nested_release_vmcs12(vcpu);
5314*4882a593Smuzhiyun
5315*4882a593Smuzhiyun /*
5316*4882a593Smuzhiyun * Load VMCS12 from guest memory since it is not already
5317*4882a593Smuzhiyun * cached.
5318*4882a593Smuzhiyun */
5319*4882a593Smuzhiyun memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
5320*4882a593Smuzhiyun kvm_vcpu_unmap(vcpu, &map, false);
5321*4882a593Smuzhiyun
5322*4882a593Smuzhiyun set_current_vmptr(vmx, vmptr);
5323*4882a593Smuzhiyun }
5324*4882a593Smuzhiyun
5325*4882a593Smuzhiyun return nested_vmx_succeed(vcpu);
5326*4882a593Smuzhiyun }
5327*4882a593Smuzhiyun
5328*4882a593Smuzhiyun /* Emulate the VMPTRST instruction */
5329*4882a593Smuzhiyun static int handle_vmptrst(struct kvm_vcpu *vcpu)
5330*4882a593Smuzhiyun {
5331*4882a593Smuzhiyun unsigned long exit_qual = vmx_get_exit_qual(vcpu);
5332*4882a593Smuzhiyun u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5333*4882a593Smuzhiyun gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
5334*4882a593Smuzhiyun struct x86_exception e;
5335*4882a593Smuzhiyun gva_t gva;
5336*4882a593Smuzhiyun int r;
5337*4882a593Smuzhiyun
5338*4882a593Smuzhiyun if (!nested_vmx_check_permission(vcpu))
5339*4882a593Smuzhiyun return 1;
5340*4882a593Smuzhiyun
5341*4882a593Smuzhiyun if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
5342*4882a593Smuzhiyun return 1;
5343*4882a593Smuzhiyun
5344*4882a593Smuzhiyun if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
5345*4882a593Smuzhiyun true, sizeof(gpa_t), &gva))
5346*4882a593Smuzhiyun return 1;
5347*4882a593Smuzhiyun /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
5348*4882a593Smuzhiyun 	r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
5349*4882a593Smuzhiyun sizeof(gpa_t), &e);
5350*4882a593Smuzhiyun if (r != X86EMUL_CONTINUE)
5351*4882a593Smuzhiyun return kvm_handle_memory_failure(vcpu, r, &e);
5352*4882a593Smuzhiyun
5353*4882a593Smuzhiyun return nested_vmx_succeed(vcpu);
5354*4882a593Smuzhiyun }
5355*4882a593Smuzhiyun
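/* Bits 51:12 of an EPTP hold the physical address of the root EPT table. */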
5356*4882a593Smuzhiyun #define EPTP_PA_MASK GENMASK_ULL(51, 12)
5357*4882a593Smuzhiyun
5358*4882a593Smuzhiyun static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
5359*4882a593Smuzhiyun {
5360*4882a593Smuzhiyun return VALID_PAGE(root_hpa) &&
5361*4882a593Smuzhiyun ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
5362*4882a593Smuzhiyun }
5363*4882a593Smuzhiyun
5364*4882a593Smuzhiyun /* Emulate the INVEPT instruction */
5365*4882a593Smuzhiyun static int handle_invept(struct kvm_vcpu *vcpu)
5366*4882a593Smuzhiyun {
5367*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
5368*4882a593Smuzhiyun u32 vmx_instruction_info, types;
5369*4882a593Smuzhiyun unsigned long type, roots_to_free;
5370*4882a593Smuzhiyun struct kvm_mmu *mmu;
5371*4882a593Smuzhiyun gva_t gva;
5372*4882a593Smuzhiyun struct x86_exception e;
5373*4882a593Smuzhiyun struct {
5374*4882a593Smuzhiyun u64 eptp, gpa;
5375*4882a593Smuzhiyun } operand;
5376*4882a593Smuzhiyun int i, r;
5377*4882a593Smuzhiyun
5378*4882a593Smuzhiyun if (!(vmx->nested.msrs.secondary_ctls_high &
5379*4882a593Smuzhiyun SECONDARY_EXEC_ENABLE_EPT) ||
5380*4882a593Smuzhiyun !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
5381*4882a593Smuzhiyun kvm_queue_exception(vcpu, UD_VECTOR);
5382*4882a593Smuzhiyun return 1;
5383*4882a593Smuzhiyun }
5384*4882a593Smuzhiyun
5385*4882a593Smuzhiyun if (!nested_vmx_check_permission(vcpu))
5386*4882a593Smuzhiyun return 1;
5387*4882a593Smuzhiyun
5388*4882a593Smuzhiyun vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5389*4882a593Smuzhiyun type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
5390*4882a593Smuzhiyun
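/*
 * ept_caps advertises which INVEPT types are supported; shifting by
 * VMX_EPT_EXTENT_SHIFT lines bit N up with type N, and "& 6" keeps only
 * the single-context (1) and all-context (2) type bits.
 */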
5391*4882a593Smuzhiyun types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
5392*4882a593Smuzhiyun
5393*4882a593Smuzhiyun if (type >= 32 || !(types & (1 << type)))
5394*4882a593Smuzhiyun return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5395*4882a593Smuzhiyun
5396*4882a593Smuzhiyun /* According to the Intel VMX instruction reference, the memory
5397*4882a593Smuzhiyun * operand is read even if it isn't needed (e.g., for type==global)
5398*4882a593Smuzhiyun */
5399*4882a593Smuzhiyun if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5400*4882a593Smuzhiyun vmx_instruction_info, false, sizeof(operand), &gva))
5401*4882a593Smuzhiyun return 1;
5402*4882a593Smuzhiyun r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
5403*4882a593Smuzhiyun if (r != X86EMUL_CONTINUE)
5404*4882a593Smuzhiyun return kvm_handle_memory_failure(vcpu, r, &e);
5405*4882a593Smuzhiyun
5406*4882a593Smuzhiyun /*
5407*4882a593Smuzhiyun * Nested EPT roots are always held through guest_mmu,
5408*4882a593Smuzhiyun * not root_mmu.
5409*4882a593Smuzhiyun */
5410*4882a593Smuzhiyun mmu = &vcpu->arch.guest_mmu;
5411*4882a593Smuzhiyun
5412*4882a593Smuzhiyun switch (type) {
5413*4882a593Smuzhiyun case VMX_EPT_EXTENT_CONTEXT:
5414*4882a593Smuzhiyun if (!nested_vmx_check_eptp(vcpu, operand.eptp))
5415*4882a593Smuzhiyun return nested_vmx_fail(vcpu,
5416*4882a593Smuzhiyun VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5417*4882a593Smuzhiyun
5418*4882a593Smuzhiyun roots_to_free = 0;
5419*4882a593Smuzhiyun if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd,
5420*4882a593Smuzhiyun operand.eptp))
5421*4882a593Smuzhiyun roots_to_free |= KVM_MMU_ROOT_CURRENT;
5422*4882a593Smuzhiyun
5423*4882a593Smuzhiyun for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5424*4882a593Smuzhiyun if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
5425*4882a593Smuzhiyun mmu->prev_roots[i].pgd,
5426*4882a593Smuzhiyun operand.eptp))
5427*4882a593Smuzhiyun roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
5428*4882a593Smuzhiyun }
5429*4882a593Smuzhiyun break;
5430*4882a593Smuzhiyun case VMX_EPT_EXTENT_GLOBAL:
5431*4882a593Smuzhiyun roots_to_free = KVM_MMU_ROOTS_ALL;
5432*4882a593Smuzhiyun break;
5433*4882a593Smuzhiyun default:
5434*4882a593Smuzhiyun BUG();
5435*4882a593Smuzhiyun break;
5436*4882a593Smuzhiyun }
5437*4882a593Smuzhiyun
5438*4882a593Smuzhiyun if (roots_to_free)
5439*4882a593Smuzhiyun kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
5440*4882a593Smuzhiyun
5441*4882a593Smuzhiyun return nested_vmx_succeed(vcpu);
5442*4882a593Smuzhiyun }
5443*4882a593Smuzhiyun
5444*4882a593Smuzhiyun static int handle_invvpid(struct kvm_vcpu *vcpu)
5445*4882a593Smuzhiyun {
5446*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
5447*4882a593Smuzhiyun u32 vmx_instruction_info;
5448*4882a593Smuzhiyun unsigned long type, types;
5449*4882a593Smuzhiyun gva_t gva;
5450*4882a593Smuzhiyun struct x86_exception e;
5451*4882a593Smuzhiyun struct {
5452*4882a593Smuzhiyun u64 vpid;
5453*4882a593Smuzhiyun u64 gla;
5454*4882a593Smuzhiyun } operand;
5455*4882a593Smuzhiyun u16 vpid02;
5456*4882a593Smuzhiyun int r;
5457*4882a593Smuzhiyun
5458*4882a593Smuzhiyun if (!(vmx->nested.msrs.secondary_ctls_high &
5459*4882a593Smuzhiyun SECONDARY_EXEC_ENABLE_VPID) ||
5460*4882a593Smuzhiyun !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
5461*4882a593Smuzhiyun kvm_queue_exception(vcpu, UD_VECTOR);
5462*4882a593Smuzhiyun return 1;
5463*4882a593Smuzhiyun }
5464*4882a593Smuzhiyun
5465*4882a593Smuzhiyun if (!nested_vmx_check_permission(vcpu))
5466*4882a593Smuzhiyun return 1;
5467*4882a593Smuzhiyun
5468*4882a593Smuzhiyun vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5469*4882a593Smuzhiyun type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
5470*4882a593Smuzhiyun
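/*
 * vpid_caps advertises the supported INVVPID types; the ">> 8" lines
 * bit N up with type N (individual-address, single-context, all-context,
 * single-context-retaining-globals).
 */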
5471*4882a593Smuzhiyun types = (vmx->nested.msrs.vpid_caps &
5472*4882a593Smuzhiyun VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
5473*4882a593Smuzhiyun
5474*4882a593Smuzhiyun if (type >= 32 || !(types & (1 << type)))
5475*4882a593Smuzhiyun return nested_vmx_fail(vcpu,
5476*4882a593Smuzhiyun VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5477*4882a593Smuzhiyun
5478*4882a593Smuzhiyun 	/* According to the Intel VMX instruction reference, the memory
5479*4882a593Smuzhiyun * operand is read even if it isn't needed (e.g., for type==global)
5480*4882a593Smuzhiyun */
5481*4882a593Smuzhiyun if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5482*4882a593Smuzhiyun vmx_instruction_info, false, sizeof(operand), &gva))
5483*4882a593Smuzhiyun return 1;
5484*4882a593Smuzhiyun r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
5485*4882a593Smuzhiyun if (r != X86EMUL_CONTINUE)
5486*4882a593Smuzhiyun return kvm_handle_memory_failure(vcpu, r, &e);
5487*4882a593Smuzhiyun
5488*4882a593Smuzhiyun if (operand.vpid >> 16)
5489*4882a593Smuzhiyun return nested_vmx_fail(vcpu,
5490*4882a593Smuzhiyun VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5491*4882a593Smuzhiyun
5492*4882a593Smuzhiyun vpid02 = nested_get_vpid02(vcpu);
5493*4882a593Smuzhiyun switch (type) {
5494*4882a593Smuzhiyun case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
5495*4882a593Smuzhiyun if (!operand.vpid ||
5496*4882a593Smuzhiyun is_noncanonical_address(operand.gla, vcpu))
5497*4882a593Smuzhiyun return nested_vmx_fail(vcpu,
5498*4882a593Smuzhiyun VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5499*4882a593Smuzhiyun vpid_sync_vcpu_addr(vpid02, operand.gla);
5500*4882a593Smuzhiyun break;
5501*4882a593Smuzhiyun case VMX_VPID_EXTENT_SINGLE_CONTEXT:
5502*4882a593Smuzhiyun case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
5503*4882a593Smuzhiyun if (!operand.vpid)
5504*4882a593Smuzhiyun return nested_vmx_fail(vcpu,
5505*4882a593Smuzhiyun VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5506*4882a593Smuzhiyun vpid_sync_context(vpid02);
5507*4882a593Smuzhiyun break;
5508*4882a593Smuzhiyun case VMX_VPID_EXTENT_ALL_CONTEXT:
5509*4882a593Smuzhiyun vpid_sync_context(vpid02);
5510*4882a593Smuzhiyun break;
5511*4882a593Smuzhiyun default:
5512*4882a593Smuzhiyun WARN_ON_ONCE(1);
5513*4882a593Smuzhiyun return kvm_skip_emulated_instruction(vcpu);
5514*4882a593Smuzhiyun }
5515*4882a593Smuzhiyun
5516*4882a593Smuzhiyun /*
5517*4882a593Smuzhiyun 	 * Sync the shadow page tables if EPT is disabled, as L1 is invalidating
5518*4882a593Smuzhiyun * linear mappings for L2 (tagged with L2's VPID). Free all roots as
5519*4882a593Smuzhiyun * VPIDs are not tracked in the MMU role.
5520*4882a593Smuzhiyun *
5521*4882a593Smuzhiyun * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
5522*4882a593Smuzhiyun * an MMU when EPT is disabled.
5523*4882a593Smuzhiyun *
5524*4882a593Smuzhiyun 	 * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
5525*4882a593Smuzhiyun */
5526*4882a593Smuzhiyun if (!enable_ept)
5527*4882a593Smuzhiyun kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu,
5528*4882a593Smuzhiyun KVM_MMU_ROOTS_ALL);
5529*4882a593Smuzhiyun
5530*4882a593Smuzhiyun return nested_vmx_succeed(vcpu);
5531*4882a593Smuzhiyun }
5532*4882a593Smuzhiyun
5533*4882a593Smuzhiyun static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
5534*4882a593Smuzhiyun struct vmcs12 *vmcs12)
5535*4882a593Smuzhiyun {
5536*4882a593Smuzhiyun u32 index = kvm_rcx_read(vcpu);
5537*4882a593Smuzhiyun u64 new_eptp;
5538*4882a593Smuzhiyun
5539*4882a593Smuzhiyun if (!nested_cpu_has_eptp_switching(vmcs12) ||
5540*4882a593Smuzhiyun !nested_cpu_has_ept(vmcs12))
5541*4882a593Smuzhiyun return 1;
5542*4882a593Smuzhiyun
5543*4882a593Smuzhiyun if (index >= VMFUNC_EPTP_ENTRIES)
5544*4882a593Smuzhiyun return 1;
5545*4882a593Smuzhiyun
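/* Each EPTP-list entry is 8 bytes; fetch entry 'index' from the list page. */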
5546*4882a593Smuzhiyun if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
5547*4882a593Smuzhiyun &new_eptp, index * 8, 8))
5548*4882a593Smuzhiyun return 1;
5549*4882a593Smuzhiyun
5550*4882a593Smuzhiyun /*
5551*4882a593Smuzhiyun * If the (L2) guest does a vmfunc to the currently
5552*4882a593Smuzhiyun 	 * active EPT pointer, we don't have to do anything else.
5553*4882a593Smuzhiyun */
5554*4882a593Smuzhiyun if (vmcs12->ept_pointer != new_eptp) {
5555*4882a593Smuzhiyun if (!nested_vmx_check_eptp(vcpu, new_eptp))
5556*4882a593Smuzhiyun return 1;
5557*4882a593Smuzhiyun
5558*4882a593Smuzhiyun vmcs12->ept_pointer = new_eptp;
5559*4882a593Smuzhiyun
5560*4882a593Smuzhiyun kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
5561*4882a593Smuzhiyun }
5562*4882a593Smuzhiyun
5563*4882a593Smuzhiyun return 0;
5564*4882a593Smuzhiyun }
5565*4882a593Smuzhiyun
5566*4882a593Smuzhiyun static int handle_vmfunc(struct kvm_vcpu *vcpu)
5567*4882a593Smuzhiyun {
5568*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
5569*4882a593Smuzhiyun struct vmcs12 *vmcs12;
5570*4882a593Smuzhiyun u32 function = kvm_rax_read(vcpu);
5571*4882a593Smuzhiyun
5572*4882a593Smuzhiyun /*
5573*4882a593Smuzhiyun * VMFUNC is only supported for nested guests, but we always enable the
5574*4882a593Smuzhiyun * secondary control for simplicity; for non-nested mode, fake that we
5575*4882a593Smuzhiyun 	 * didn't enable it by injecting #UD.
5576*4882a593Smuzhiyun */
5577*4882a593Smuzhiyun if (!is_guest_mode(vcpu)) {
5578*4882a593Smuzhiyun kvm_queue_exception(vcpu, UD_VECTOR);
5579*4882a593Smuzhiyun return 1;
5580*4882a593Smuzhiyun }
5581*4882a593Smuzhiyun
5582*4882a593Smuzhiyun vmcs12 = get_vmcs12(vcpu);
5583*4882a593Smuzhiyun if (!(vmcs12->vm_function_control & BIT_ULL(function)))
5584*4882a593Smuzhiyun goto fail;
5585*4882a593Smuzhiyun
5586*4882a593Smuzhiyun switch (function) {
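/* VM function 0 is EPTP switching. */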
5587*4882a593Smuzhiyun case 0:
5588*4882a593Smuzhiyun if (nested_vmx_eptp_switching(vcpu, vmcs12))
5589*4882a593Smuzhiyun goto fail;
5590*4882a593Smuzhiyun break;
5591*4882a593Smuzhiyun default:
5592*4882a593Smuzhiyun goto fail;
5593*4882a593Smuzhiyun }
5594*4882a593Smuzhiyun return kvm_skip_emulated_instruction(vcpu);
5595*4882a593Smuzhiyun
5596*4882a593Smuzhiyun fail:
5597*4882a593Smuzhiyun /*
5598*4882a593Smuzhiyun * This is effectively a reflected VM-Exit, as opposed to a synthesized
5599*4882a593Smuzhiyun * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode
5600*4882a593Smuzhiyun * EXIT_REASON_VMFUNC as the exit reason.
5601*4882a593Smuzhiyun */
5602*4882a593Smuzhiyun nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
5603*4882a593Smuzhiyun vmx_get_intr_info(vcpu),
5604*4882a593Smuzhiyun vmx_get_exit_qual(vcpu));
5605*4882a593Smuzhiyun return 1;
5606*4882a593Smuzhiyun }
5607*4882a593Smuzhiyun
5608*4882a593Smuzhiyun /*
5609*4882a593Smuzhiyun * Return true if an IO instruction with the specified port and size should cause
5610*4882a593Smuzhiyun * a VM-exit into L1.
5611*4882a593Smuzhiyun */
5612*4882a593Smuzhiyun bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
5613*4882a593Smuzhiyun int size)
5614*4882a593Smuzhiyun {
5615*4882a593Smuzhiyun struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5616*4882a593Smuzhiyun gpa_t bitmap, last_bitmap;
5617*4882a593Smuzhiyun u8 b;
5618*4882a593Smuzhiyun
5619*4882a593Smuzhiyun last_bitmap = (gpa_t)-1;
5620*4882a593Smuzhiyun b = -1;
5621*4882a593Smuzhiyun
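/*
 * One bit per port: bitmap A covers ports 0x0000-0x7fff, bitmap B covers
 * ports 0x8000-0xffff.  Cache the last byte read so consecutive ports
 * that share a bitmap byte only trigger a single guest read.
 */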
5622*4882a593Smuzhiyun while (size > 0) {
5623*4882a593Smuzhiyun if (port < 0x8000)
5624*4882a593Smuzhiyun bitmap = vmcs12->io_bitmap_a;
5625*4882a593Smuzhiyun else if (port < 0x10000)
5626*4882a593Smuzhiyun bitmap = vmcs12->io_bitmap_b;
5627*4882a593Smuzhiyun else
5628*4882a593Smuzhiyun return true;
5629*4882a593Smuzhiyun bitmap += (port & 0x7fff) / 8;
5630*4882a593Smuzhiyun
5631*4882a593Smuzhiyun if (last_bitmap != bitmap)
5632*4882a593Smuzhiyun if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
5633*4882a593Smuzhiyun return true;
5634*4882a593Smuzhiyun if (b & (1 << (port & 7)))
5635*4882a593Smuzhiyun return true;
5636*4882a593Smuzhiyun
5637*4882a593Smuzhiyun port++;
5638*4882a593Smuzhiyun size--;
5639*4882a593Smuzhiyun last_bitmap = bitmap;
5640*4882a593Smuzhiyun }
5641*4882a593Smuzhiyun
5642*4882a593Smuzhiyun return false;
5643*4882a593Smuzhiyun }
5644*4882a593Smuzhiyun
5645*4882a593Smuzhiyun static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
5646*4882a593Smuzhiyun struct vmcs12 *vmcs12)
5647*4882a593Smuzhiyun {
5648*4882a593Smuzhiyun unsigned long exit_qualification;
5649*4882a593Smuzhiyun unsigned short port;
5650*4882a593Smuzhiyun int size;
5651*4882a593Smuzhiyun
5652*4882a593Smuzhiyun if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
5653*4882a593Smuzhiyun return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
5654*4882a593Smuzhiyun
5655*4882a593Smuzhiyun exit_qualification = vmx_get_exit_qual(vcpu);
5656*4882a593Smuzhiyun
5657*4882a593Smuzhiyun port = exit_qualification >> 16;
5658*4882a593Smuzhiyun size = (exit_qualification & 7) + 1;
5659*4882a593Smuzhiyun
5660*4882a593Smuzhiyun return nested_vmx_check_io_bitmaps(vcpu, port, size);
5661*4882a593Smuzhiyun }
5662*4882a593Smuzhiyun
5663*4882a593Smuzhiyun /*
5664*4882a593Smuzhiyun  * Return true if we should exit from L2 to L1 to handle an MSR access,
5665*4882a593Smuzhiyun  * rather than handle it ourselves in L0. I.e., check whether L1 wants to
5666*4882a593Smuzhiyun  * intercept the current event (a read or write of a specific MSR) via its
5667*4882a593Smuzhiyun  * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
5668*4882a593Smuzhiyun */
5669*4882a593Smuzhiyun static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5670*4882a593Smuzhiyun struct vmcs12 *vmcs12,
5671*4882a593Smuzhiyun union vmx_exit_reason exit_reason)
5672*4882a593Smuzhiyun {
5673*4882a593Smuzhiyun u32 msr_index = kvm_rcx_read(vcpu);
5674*4882a593Smuzhiyun gpa_t bitmap;
5675*4882a593Smuzhiyun
5676*4882a593Smuzhiyun if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
5677*4882a593Smuzhiyun return true;
5678*4882a593Smuzhiyun
5679*4882a593Smuzhiyun /*
5680*4882a593Smuzhiyun * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5681*4882a593Smuzhiyun * for the four combinations of read/write and low/high MSR numbers.
5682*4882a593Smuzhiyun * First we need to figure out which of the four to use:
5683*4882a593Smuzhiyun */
5684*4882a593Smuzhiyun bitmap = vmcs12->msr_bitmap;
5685*4882a593Smuzhiyun if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
5686*4882a593Smuzhiyun bitmap += 2048;
5687*4882a593Smuzhiyun if (msr_index >= 0xc0000000) {
5688*4882a593Smuzhiyun msr_index -= 0xc0000000;
5689*4882a593Smuzhiyun bitmap += 1024;
5690*4882a593Smuzhiyun }
5691*4882a593Smuzhiyun
5692*4882a593Smuzhiyun /* Then read the msr_index'th bit from this bitmap: */
5693*4882a593Smuzhiyun if (msr_index < 1024*8) {
5694*4882a593Smuzhiyun unsigned char b;
5695*4882a593Smuzhiyun if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
5696*4882a593Smuzhiyun return true;
5697*4882a593Smuzhiyun return 1 & (b >> (msr_index & 7));
5698*4882a593Smuzhiyun } else
5699*4882a593Smuzhiyun return true; /* let L1 handle the wrong parameter */
5700*4882a593Smuzhiyun }
5701*4882a593Smuzhiyun
5702*4882a593Smuzhiyun /*
5703*4882a593Smuzhiyun  * Return true if we should exit from L2 to L1 to handle a CR access exit,
5704*4882a593Smuzhiyun * rather than handle it ourselves in L0. I.e., check if L1 wanted to
5705*4882a593Smuzhiyun * intercept (via guest_host_mask etc.) the current event.
5706*4882a593Smuzhiyun */
5707*4882a593Smuzhiyun static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
5708*4882a593Smuzhiyun struct vmcs12 *vmcs12)
5709*4882a593Smuzhiyun {
5710*4882a593Smuzhiyun unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5711*4882a593Smuzhiyun int cr = exit_qualification & 15;
5712*4882a593Smuzhiyun int reg;
5713*4882a593Smuzhiyun unsigned long val;
5714*4882a593Smuzhiyun
5715*4882a593Smuzhiyun switch ((exit_qualification >> 4) & 3) {
5716*4882a593Smuzhiyun case 0: /* mov to cr */
5717*4882a593Smuzhiyun reg = (exit_qualification >> 8) & 15;
5718*4882a593Smuzhiyun val = kvm_register_readl(vcpu, reg);
5719*4882a593Smuzhiyun switch (cr) {
5720*4882a593Smuzhiyun case 0:
5721*4882a593Smuzhiyun if (vmcs12->cr0_guest_host_mask &
5722*4882a593Smuzhiyun (val ^ vmcs12->cr0_read_shadow))
5723*4882a593Smuzhiyun return true;
5724*4882a593Smuzhiyun break;
5725*4882a593Smuzhiyun case 3:
5726*4882a593Smuzhiyun if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
5727*4882a593Smuzhiyun return true;
5728*4882a593Smuzhiyun break;
5729*4882a593Smuzhiyun case 4:
5730*4882a593Smuzhiyun if (vmcs12->cr4_guest_host_mask &
5731*4882a593Smuzhiyun (vmcs12->cr4_read_shadow ^ val))
5732*4882a593Smuzhiyun return true;
5733*4882a593Smuzhiyun break;
5734*4882a593Smuzhiyun case 8:
5735*4882a593Smuzhiyun if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
5736*4882a593Smuzhiyun return true;
5737*4882a593Smuzhiyun break;
5738*4882a593Smuzhiyun }
5739*4882a593Smuzhiyun break;
5740*4882a593Smuzhiyun case 2: /* clts */
5741*4882a593Smuzhiyun if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
5742*4882a593Smuzhiyun (vmcs12->cr0_read_shadow & X86_CR0_TS))
5743*4882a593Smuzhiyun return true;
5744*4882a593Smuzhiyun break;
5745*4882a593Smuzhiyun case 1: /* mov from cr */
5746*4882a593Smuzhiyun switch (cr) {
5747*4882a593Smuzhiyun case 3:
5748*4882a593Smuzhiyun if (vmcs12->cpu_based_vm_exec_control &
5749*4882a593Smuzhiyun CPU_BASED_CR3_STORE_EXITING)
5750*4882a593Smuzhiyun return true;
5751*4882a593Smuzhiyun break;
5752*4882a593Smuzhiyun case 8:
5753*4882a593Smuzhiyun if (vmcs12->cpu_based_vm_exec_control &
5754*4882a593Smuzhiyun CPU_BASED_CR8_STORE_EXITING)
5755*4882a593Smuzhiyun return true;
5756*4882a593Smuzhiyun break;
5757*4882a593Smuzhiyun }
5758*4882a593Smuzhiyun break;
5759*4882a593Smuzhiyun case 3: /* lmsw */
5760*4882a593Smuzhiyun /*
5761*4882a593Smuzhiyun * lmsw can change bits 1..3 of cr0, and only set bit 0 of
5762*4882a593Smuzhiyun * cr0. Other attempted changes are ignored, with no exit.
5763*4882a593Smuzhiyun */
5764*4882a593Smuzhiyun val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5765*4882a593Smuzhiyun if (vmcs12->cr0_guest_host_mask & 0xe &
5766*4882a593Smuzhiyun (val ^ vmcs12->cr0_read_shadow))
5767*4882a593Smuzhiyun return true;
5768*4882a593Smuzhiyun if ((vmcs12->cr0_guest_host_mask & 0x1) &&
5769*4882a593Smuzhiyun !(vmcs12->cr0_read_shadow & 0x1) &&
5770*4882a593Smuzhiyun (val & 0x1))
5771*4882a593Smuzhiyun return true;
5772*4882a593Smuzhiyun break;
5773*4882a593Smuzhiyun }
5774*4882a593Smuzhiyun return false;
5775*4882a593Smuzhiyun }
5776*4882a593Smuzhiyun
5777*4882a593Smuzhiyun static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
5778*4882a593Smuzhiyun struct vmcs12 *vmcs12, gpa_t bitmap)
5779*4882a593Smuzhiyun {
5780*4882a593Smuzhiyun u32 vmx_instruction_info;
5781*4882a593Smuzhiyun unsigned long field;
5782*4882a593Smuzhiyun u8 b;
5783*4882a593Smuzhiyun
5784*4882a593Smuzhiyun if (!nested_cpu_has_shadow_vmcs(vmcs12))
5785*4882a593Smuzhiyun return true;
5786*4882a593Smuzhiyun
5787*4882a593Smuzhiyun /* Decode instruction info and find the field to access */
5788*4882a593Smuzhiyun vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5789*4882a593Smuzhiyun field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5790*4882a593Smuzhiyun
5791*4882a593Smuzhiyun /* Out-of-range fields always cause a VM exit from L2 to L1 */
5792*4882a593Smuzhiyun if (field >> 15)
5793*4882a593Smuzhiyun return true;
5794*4882a593Smuzhiyun
5795*4882a593Smuzhiyun if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
5796*4882a593Smuzhiyun return true;
5797*4882a593Smuzhiyun
5798*4882a593Smuzhiyun return 1 & (b >> (field & 7));
5799*4882a593Smuzhiyun }
5800*4882a593Smuzhiyun
5801*4882a593Smuzhiyun static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
5802*4882a593Smuzhiyun {
5803*4882a593Smuzhiyun u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;
5804*4882a593Smuzhiyun
5805*4882a593Smuzhiyun if (nested_cpu_has_mtf(vmcs12))
5806*4882a593Smuzhiyun return true;
5807*4882a593Smuzhiyun
5808*4882a593Smuzhiyun /*
5809*4882a593Smuzhiyun * An MTF VM-exit may be injected into the guest by setting the
5810*4882a593Smuzhiyun * interruption-type to 7 (other event) and the vector field to 0. Such
5811*4882a593Smuzhiyun * is the case regardless of the 'monitor trap flag' VM-execution
5812*4882a593Smuzhiyun * control.
5813*4882a593Smuzhiyun */
5814*4882a593Smuzhiyun return entry_intr_info == (INTR_INFO_VALID_MASK
5815*4882a593Smuzhiyun | INTR_TYPE_OTHER_EVENT);
5816*4882a593Smuzhiyun }
5817*4882a593Smuzhiyun
5818*4882a593Smuzhiyun /*
5819*4882a593Smuzhiyun * Return true if L0 wants to handle an exit from L2 regardless of whether or not
5820*4882a593Smuzhiyun * L1 wants the exit. Only call this when in is_guest_mode (L2).
5821*4882a593Smuzhiyun */
5822*4882a593Smuzhiyun static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
5823*4882a593Smuzhiyun union vmx_exit_reason exit_reason)
5824*4882a593Smuzhiyun {
5825*4882a593Smuzhiyun u32 intr_info;
5826*4882a593Smuzhiyun
5827*4882a593Smuzhiyun switch ((u16)exit_reason.basic) {
5828*4882a593Smuzhiyun case EXIT_REASON_EXCEPTION_NMI:
5829*4882a593Smuzhiyun intr_info = vmx_get_intr_info(vcpu);
5830*4882a593Smuzhiyun if (is_nmi(intr_info))
5831*4882a593Smuzhiyun return true;
5832*4882a593Smuzhiyun else if (is_page_fault(intr_info))
5833*4882a593Smuzhiyun return vcpu->arch.apf.host_apf_flags ||
5834*4882a593Smuzhiyun vmx_need_pf_intercept(vcpu);
5835*4882a593Smuzhiyun else if (is_debug(intr_info) &&
5836*4882a593Smuzhiyun vcpu->guest_debug &
5837*4882a593Smuzhiyun (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
5838*4882a593Smuzhiyun return true;
5839*4882a593Smuzhiyun else if (is_breakpoint(intr_info) &&
5840*4882a593Smuzhiyun vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5841*4882a593Smuzhiyun return true;
5842*4882a593Smuzhiyun else if (is_alignment_check(intr_info) &&
5843*4882a593Smuzhiyun !vmx_guest_inject_ac(vcpu))
5844*4882a593Smuzhiyun return true;
5845*4882a593Smuzhiyun return false;
5846*4882a593Smuzhiyun case EXIT_REASON_EXTERNAL_INTERRUPT:
5847*4882a593Smuzhiyun return true;
5848*4882a593Smuzhiyun case EXIT_REASON_MCE_DURING_VMENTRY:
5849*4882a593Smuzhiyun return true;
5850*4882a593Smuzhiyun case EXIT_REASON_EPT_VIOLATION:
5851*4882a593Smuzhiyun /*
5852*4882a593Smuzhiyun * L0 always deals with the EPT violation. If nested EPT is
5853*4882a593Smuzhiyun * used, and the nested mmu code discovers that the address is
5854*4882a593Smuzhiyun * missing in the guest EPT table (EPT12), the EPT violation
5855*4882a593Smuzhiyun * will be injected with nested_ept_inject_page_fault()
5856*4882a593Smuzhiyun */
5857*4882a593Smuzhiyun return true;
5858*4882a593Smuzhiyun case EXIT_REASON_EPT_MISCONFIG:
5859*4882a593Smuzhiyun /*
5860*4882a593Smuzhiyun 		 * L2 never directly uses L1's EPT, but rather L0's own EPT
5861*4882a593Smuzhiyun 		 * table (shadow on EPT) or a merged EPT table that L0 built
5862*4882a593Smuzhiyun 		 * (EPT on EPT). So any problems with the structure of the
5863*4882a593Smuzhiyun 		 * table are L0's fault.
5864*4882a593Smuzhiyun */
5865*4882a593Smuzhiyun return true;
5866*4882a593Smuzhiyun case EXIT_REASON_PREEMPTION_TIMER:
5867*4882a593Smuzhiyun return true;
5868*4882a593Smuzhiyun case EXIT_REASON_PML_FULL:
5869*4882a593Smuzhiyun /* We emulate PML support to L1. */
5870*4882a593Smuzhiyun return true;
5871*4882a593Smuzhiyun case EXIT_REASON_VMFUNC:
5872*4882a593Smuzhiyun /* VM functions are emulated through L2->L0 vmexits. */
5873*4882a593Smuzhiyun return true;
5874*4882a593Smuzhiyun case EXIT_REASON_ENCLS:
5875*4882a593Smuzhiyun /* SGX is never exposed to L1 */
5876*4882a593Smuzhiyun return true;
5877*4882a593Smuzhiyun default:
5878*4882a593Smuzhiyun break;
5879*4882a593Smuzhiyun }
5880*4882a593Smuzhiyun return false;
5881*4882a593Smuzhiyun }
5882*4882a593Smuzhiyun
5883*4882a593Smuzhiyun /*
5884*4882a593Smuzhiyun  * Return true if L1 wants to intercept an exit from L2. Only call this when in
5885*4882a593Smuzhiyun * is_guest_mode (L2).
5886*4882a593Smuzhiyun */
5887*4882a593Smuzhiyun static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
5888*4882a593Smuzhiyun union vmx_exit_reason exit_reason)
5889*4882a593Smuzhiyun {
5890*4882a593Smuzhiyun struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5891*4882a593Smuzhiyun u32 intr_info;
5892*4882a593Smuzhiyun
5893*4882a593Smuzhiyun switch ((u16)exit_reason.basic) {
5894*4882a593Smuzhiyun case EXIT_REASON_EXCEPTION_NMI:
5895*4882a593Smuzhiyun intr_info = vmx_get_intr_info(vcpu);
5896*4882a593Smuzhiyun if (is_nmi(intr_info))
5897*4882a593Smuzhiyun return true;
5898*4882a593Smuzhiyun else if (is_page_fault(intr_info))
5899*4882a593Smuzhiyun return true;
5900*4882a593Smuzhiyun return vmcs12->exception_bitmap &
5901*4882a593Smuzhiyun (1u << (intr_info & INTR_INFO_VECTOR_MASK));
5902*4882a593Smuzhiyun case EXIT_REASON_EXTERNAL_INTERRUPT:
5903*4882a593Smuzhiyun return nested_exit_on_intr(vcpu);
5904*4882a593Smuzhiyun case EXIT_REASON_TRIPLE_FAULT:
5905*4882a593Smuzhiyun return true;
5906*4882a593Smuzhiyun case EXIT_REASON_INTERRUPT_WINDOW:
5907*4882a593Smuzhiyun return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
5908*4882a593Smuzhiyun case EXIT_REASON_NMI_WINDOW:
5909*4882a593Smuzhiyun return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
5910*4882a593Smuzhiyun case EXIT_REASON_TASK_SWITCH:
5911*4882a593Smuzhiyun return true;
5912*4882a593Smuzhiyun case EXIT_REASON_CPUID:
5913*4882a593Smuzhiyun return true;
5914*4882a593Smuzhiyun case EXIT_REASON_HLT:
5915*4882a593Smuzhiyun return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
5916*4882a593Smuzhiyun case EXIT_REASON_INVD:
5917*4882a593Smuzhiyun return true;
5918*4882a593Smuzhiyun case EXIT_REASON_INVLPG:
5919*4882a593Smuzhiyun return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5920*4882a593Smuzhiyun case EXIT_REASON_RDPMC:
5921*4882a593Smuzhiyun return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
5922*4882a593Smuzhiyun case EXIT_REASON_RDRAND:
5923*4882a593Smuzhiyun return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
5924*4882a593Smuzhiyun case EXIT_REASON_RDSEED:
5925*4882a593Smuzhiyun return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
5926*4882a593Smuzhiyun case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
5927*4882a593Smuzhiyun return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
5928*4882a593Smuzhiyun case EXIT_REASON_VMREAD:
5929*4882a593Smuzhiyun return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5930*4882a593Smuzhiyun vmcs12->vmread_bitmap);
5931*4882a593Smuzhiyun case EXIT_REASON_VMWRITE:
5932*4882a593Smuzhiyun return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5933*4882a593Smuzhiyun vmcs12->vmwrite_bitmap);
5934*4882a593Smuzhiyun case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
5935*4882a593Smuzhiyun case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
5936*4882a593Smuzhiyun case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
5937*4882a593Smuzhiyun case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
5938*4882a593Smuzhiyun case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
5939*4882a593Smuzhiyun /*
5940*4882a593Smuzhiyun * VMX instructions trap unconditionally. This allows L1 to
5941*4882a593Smuzhiyun * emulate them for its L2 guest, i.e., allows 3-level nesting!
5942*4882a593Smuzhiyun */
5943*4882a593Smuzhiyun return true;
5944*4882a593Smuzhiyun case EXIT_REASON_CR_ACCESS:
5945*4882a593Smuzhiyun return nested_vmx_exit_handled_cr(vcpu, vmcs12);
5946*4882a593Smuzhiyun case EXIT_REASON_DR_ACCESS:
5947*4882a593Smuzhiyun return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
5948*4882a593Smuzhiyun case EXIT_REASON_IO_INSTRUCTION:
5949*4882a593Smuzhiyun return nested_vmx_exit_handled_io(vcpu, vmcs12);
5950*4882a593Smuzhiyun case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
5951*4882a593Smuzhiyun return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
5952*4882a593Smuzhiyun case EXIT_REASON_MSR_READ:
5953*4882a593Smuzhiyun case EXIT_REASON_MSR_WRITE:
5954*4882a593Smuzhiyun return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
5955*4882a593Smuzhiyun case EXIT_REASON_INVALID_STATE:
5956*4882a593Smuzhiyun return true;
5957*4882a593Smuzhiyun case EXIT_REASON_MWAIT_INSTRUCTION:
5958*4882a593Smuzhiyun return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5959*4882a593Smuzhiyun case EXIT_REASON_MONITOR_TRAP_FLAG:
5960*4882a593Smuzhiyun return nested_vmx_exit_handled_mtf(vmcs12);
5961*4882a593Smuzhiyun case EXIT_REASON_MONITOR_INSTRUCTION:
5962*4882a593Smuzhiyun return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
5963*4882a593Smuzhiyun case EXIT_REASON_PAUSE_INSTRUCTION:
5964*4882a593Smuzhiyun return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
5965*4882a593Smuzhiyun nested_cpu_has2(vmcs12,
5966*4882a593Smuzhiyun SECONDARY_EXEC_PAUSE_LOOP_EXITING);
5967*4882a593Smuzhiyun case EXIT_REASON_MCE_DURING_VMENTRY:
5968*4882a593Smuzhiyun return true;
5969*4882a593Smuzhiyun case EXIT_REASON_TPR_BELOW_THRESHOLD:
5970*4882a593Smuzhiyun return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
5971*4882a593Smuzhiyun case EXIT_REASON_APIC_ACCESS:
5972*4882a593Smuzhiyun case EXIT_REASON_APIC_WRITE:
5973*4882a593Smuzhiyun case EXIT_REASON_EOI_INDUCED:
5974*4882a593Smuzhiyun /*
5975*4882a593Smuzhiyun * The controls for "virtualize APIC accesses," "APIC-
5976*4882a593Smuzhiyun * register virtualization," and "virtual-interrupt
5977*4882a593Smuzhiyun * delivery" only come from vmcs12.
5978*4882a593Smuzhiyun */
5979*4882a593Smuzhiyun return true;
5980*4882a593Smuzhiyun case EXIT_REASON_INVPCID:
5981*4882a593Smuzhiyun return
5982*4882a593Smuzhiyun nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
5983*4882a593Smuzhiyun nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5984*4882a593Smuzhiyun case EXIT_REASON_WBINVD:
5985*4882a593Smuzhiyun return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
5986*4882a593Smuzhiyun case EXIT_REASON_XSETBV:
5987*4882a593Smuzhiyun return true;
5988*4882a593Smuzhiyun case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
5989*4882a593Smuzhiyun /*
5990*4882a593Smuzhiyun * This should never happen, since it is not possible to
5991*4882a593Smuzhiyun * set XSS to a non-zero value---neither in L1 nor in L2.
5992*4882a593Smuzhiyun 		 * If it were, XSS would have to be checked against
5993*4882a593Smuzhiyun * the XSS exit bitmap in vmcs12.
5994*4882a593Smuzhiyun */
5995*4882a593Smuzhiyun return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
5996*4882a593Smuzhiyun case EXIT_REASON_UMWAIT:
5997*4882a593Smuzhiyun case EXIT_REASON_TPAUSE:
5998*4882a593Smuzhiyun return nested_cpu_has2(vmcs12,
5999*4882a593Smuzhiyun SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
6000*4882a593Smuzhiyun default:
6001*4882a593Smuzhiyun return true;
6002*4882a593Smuzhiyun }
6003*4882a593Smuzhiyun }
6004*4882a593Smuzhiyun
6005*4882a593Smuzhiyun /*
6006*4882a593Smuzhiyun * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was
6007*4882a593Smuzhiyun * reflected into L1.
6008*4882a593Smuzhiyun */
6009*4882a593Smuzhiyun bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
6010*4882a593Smuzhiyun {
6011*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
6012*4882a593Smuzhiyun union vmx_exit_reason exit_reason = vmx->exit_reason;
6013*4882a593Smuzhiyun unsigned long exit_qual;
6014*4882a593Smuzhiyun u32 exit_intr_info;
6015*4882a593Smuzhiyun
6016*4882a593Smuzhiyun WARN_ON_ONCE(vmx->nested.nested_run_pending);
6017*4882a593Smuzhiyun
6018*4882a593Smuzhiyun /*
6019*4882a593Smuzhiyun * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
6020*4882a593Smuzhiyun * has already loaded L2's state.
6021*4882a593Smuzhiyun */
6022*4882a593Smuzhiyun if (unlikely(vmx->fail)) {
6023*4882a593Smuzhiyun trace_kvm_nested_vmenter_failed(
6024*4882a593Smuzhiyun "hardware VM-instruction error: ",
6025*4882a593Smuzhiyun vmcs_read32(VM_INSTRUCTION_ERROR));
6026*4882a593Smuzhiyun exit_intr_info = 0;
6027*4882a593Smuzhiyun exit_qual = 0;
6028*4882a593Smuzhiyun goto reflect_vmexit;
6029*4882a593Smuzhiyun }
6030*4882a593Smuzhiyun
6031*4882a593Smuzhiyun trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX);
6032*4882a593Smuzhiyun
6033*4882a593Smuzhiyun /* If L0 (KVM) wants the exit, it trumps L1's desires. */
6034*4882a593Smuzhiyun if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
6035*4882a593Smuzhiyun return false;
6036*4882a593Smuzhiyun
6037*4882a593Smuzhiyun /* If L1 doesn't want the exit, handle it in L0. */
6038*4882a593Smuzhiyun if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
6039*4882a593Smuzhiyun return false;
6040*4882a593Smuzhiyun
6041*4882a593Smuzhiyun /*
6042*4882a593Smuzhiyun * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For
6043*4882a593Smuzhiyun * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
6044*4882a593Smuzhiyun * need to be synthesized by querying the in-kernel LAPIC, but external
6045*4882a593Smuzhiyun * interrupts are never reflected to L1 so it's a non-issue.
6046*4882a593Smuzhiyun */
6047*4882a593Smuzhiyun exit_intr_info = vmx_get_intr_info(vcpu);
6048*4882a593Smuzhiyun if (is_exception_with_error_code(exit_intr_info)) {
6049*4882a593Smuzhiyun struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6050*4882a593Smuzhiyun
6051*4882a593Smuzhiyun vmcs12->vm_exit_intr_error_code =
6052*4882a593Smuzhiyun vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6053*4882a593Smuzhiyun }
6054*4882a593Smuzhiyun exit_qual = vmx_get_exit_qual(vcpu);
6055*4882a593Smuzhiyun
6056*4882a593Smuzhiyun reflect_vmexit:
6057*4882a593Smuzhiyun nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
6058*4882a593Smuzhiyun return true;
6059*4882a593Smuzhiyun }
6060*4882a593Smuzhiyun
6061*4882a593Smuzhiyun static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
6062*4882a593Smuzhiyun struct kvm_nested_state __user *user_kvm_nested_state,
6063*4882a593Smuzhiyun u32 user_data_size)
6064*4882a593Smuzhiyun {
6065*4882a593Smuzhiyun struct vcpu_vmx *vmx;
6066*4882a593Smuzhiyun struct vmcs12 *vmcs12;
6067*4882a593Smuzhiyun struct kvm_nested_state kvm_state = {
6068*4882a593Smuzhiyun .flags = 0,
6069*4882a593Smuzhiyun .format = KVM_STATE_NESTED_FORMAT_VMX,
6070*4882a593Smuzhiyun .size = sizeof(kvm_state),
6071*4882a593Smuzhiyun .hdr.vmx.flags = 0,
6072*4882a593Smuzhiyun .hdr.vmx.vmxon_pa = -1ull,
6073*4882a593Smuzhiyun .hdr.vmx.vmcs12_pa = -1ull,
6074*4882a593Smuzhiyun .hdr.vmx.preemption_timer_deadline = 0,
6075*4882a593Smuzhiyun };
6076*4882a593Smuzhiyun struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
6077*4882a593Smuzhiyun &user_kvm_nested_state->data.vmx[0];
6078*4882a593Smuzhiyun
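/*
 * A NULL vcpu is used purely to query the maximum amount of state,
 * e.g. for the KVM_CAP_NESTED_STATE capability check.
 */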
6079*4882a593Smuzhiyun if (!vcpu)
6080*4882a593Smuzhiyun return kvm_state.size + sizeof(*user_vmx_nested_state);
6081*4882a593Smuzhiyun
6082*4882a593Smuzhiyun vmx = to_vmx(vcpu);
6083*4882a593Smuzhiyun vmcs12 = get_vmcs12(vcpu);
6084*4882a593Smuzhiyun
6085*4882a593Smuzhiyun if (nested_vmx_allowed(vcpu) &&
6086*4882a593Smuzhiyun (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
6087*4882a593Smuzhiyun kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
6088*4882a593Smuzhiyun kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
6089*4882a593Smuzhiyun
6090*4882a593Smuzhiyun if (vmx_has_valid_vmcs12(vcpu)) {
6091*4882a593Smuzhiyun kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
6092*4882a593Smuzhiyun
6093*4882a593Smuzhiyun if (vmx->nested.hv_evmcs)
6094*4882a593Smuzhiyun kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
6095*4882a593Smuzhiyun
6096*4882a593Smuzhiyun if (is_guest_mode(vcpu) &&
6097*4882a593Smuzhiyun nested_cpu_has_shadow_vmcs(vmcs12) &&
6098*4882a593Smuzhiyun vmcs12->vmcs_link_pointer != -1ull)
6099*4882a593Smuzhiyun kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
6100*4882a593Smuzhiyun }
6101*4882a593Smuzhiyun
6102*4882a593Smuzhiyun if (vmx->nested.smm.vmxon)
6103*4882a593Smuzhiyun kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
6104*4882a593Smuzhiyun
6105*4882a593Smuzhiyun if (vmx->nested.smm.guest_mode)
6106*4882a593Smuzhiyun kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
6107*4882a593Smuzhiyun
6108*4882a593Smuzhiyun if (is_guest_mode(vcpu)) {
6109*4882a593Smuzhiyun kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
6110*4882a593Smuzhiyun
6111*4882a593Smuzhiyun if (vmx->nested.nested_run_pending)
6112*4882a593Smuzhiyun kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
6113*4882a593Smuzhiyun
6114*4882a593Smuzhiyun if (vmx->nested.mtf_pending)
6115*4882a593Smuzhiyun kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
6116*4882a593Smuzhiyun
6117*4882a593Smuzhiyun if (nested_cpu_has_preemption_timer(vmcs12) &&
6118*4882a593Smuzhiyun vmx->nested.has_preemption_timer_deadline) {
6119*4882a593Smuzhiyun kvm_state.hdr.vmx.flags |=
6120*4882a593Smuzhiyun KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
6121*4882a593Smuzhiyun kvm_state.hdr.vmx.preemption_timer_deadline =
6122*4882a593Smuzhiyun vmx->nested.preemption_timer_deadline;
6123*4882a593Smuzhiyun }
6124*4882a593Smuzhiyun }
6125*4882a593Smuzhiyun }
6126*4882a593Smuzhiyun
6127*4882a593Smuzhiyun if (user_data_size < kvm_state.size)
6128*4882a593Smuzhiyun goto out;
6129*4882a593Smuzhiyun
6130*4882a593Smuzhiyun if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
6131*4882a593Smuzhiyun return -EFAULT;
6132*4882a593Smuzhiyun
6133*4882a593Smuzhiyun if (!vmx_has_valid_vmcs12(vcpu))
6134*4882a593Smuzhiyun goto out;
6135*4882a593Smuzhiyun
6136*4882a593Smuzhiyun /*
6137*4882a593Smuzhiyun * When running L2, the authoritative vmcs12 state is in the
6138*4882a593Smuzhiyun * vmcs02. When running L1, the authoritative vmcs12 state is
6139*4882a593Smuzhiyun * in the shadow or enlightened vmcs linked to vmcs01, unless
6140*4882a593Smuzhiyun * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
6141*4882a593Smuzhiyun * vmcs12 state is in the vmcs12 already.
6142*4882a593Smuzhiyun */
6143*4882a593Smuzhiyun if (is_guest_mode(vcpu)) {
6144*4882a593Smuzhiyun sync_vmcs02_to_vmcs12(vcpu, vmcs12);
6145*4882a593Smuzhiyun sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
6146*4882a593Smuzhiyun } else {
6147*4882a593Smuzhiyun copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
6148*4882a593Smuzhiyun if (!vmx->nested.need_vmcs12_to_shadow_sync) {
6149*4882a593Smuzhiyun if (vmx->nested.hv_evmcs)
6150*4882a593Smuzhiyun copy_enlightened_to_vmcs12(vmx);
6151*4882a593Smuzhiyun else if (enable_shadow_vmcs)
6152*4882a593Smuzhiyun copy_shadow_to_vmcs12(vmx);
6153*4882a593Smuzhiyun }
6154*4882a593Smuzhiyun }
6155*4882a593Smuzhiyun
6156*4882a593Smuzhiyun BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
6157*4882a593Smuzhiyun BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
6158*4882a593Smuzhiyun
6159*4882a593Smuzhiyun /*
6160*4882a593Smuzhiyun * Copy over the full allocated size of vmcs12 rather than just the size
6161*4882a593Smuzhiyun * of the struct.
6162*4882a593Smuzhiyun */
6163*4882a593Smuzhiyun if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
6164*4882a593Smuzhiyun return -EFAULT;
6165*4882a593Smuzhiyun
6166*4882a593Smuzhiyun if (nested_cpu_has_shadow_vmcs(vmcs12) &&
6167*4882a593Smuzhiyun vmcs12->vmcs_link_pointer != -1ull) {
6168*4882a593Smuzhiyun if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
6169*4882a593Smuzhiyun get_shadow_vmcs12(vcpu), VMCS12_SIZE))
6170*4882a593Smuzhiyun return -EFAULT;
6171*4882a593Smuzhiyun }
6172*4882a593Smuzhiyun out:
6173*4882a593Smuzhiyun return kvm_state.size;
6174*4882a593Smuzhiyun }
6175*4882a593Smuzhiyun
6176*4882a593Smuzhiyun /*
6177*4882a593Smuzhiyun * Forcibly leave nested mode in order to be able to reset the VCPU later on.
6178*4882a593Smuzhiyun */
6179*4882a593Smuzhiyun void vmx_leave_nested(struct kvm_vcpu *vcpu)
6180*4882a593Smuzhiyun {
6181*4882a593Smuzhiyun if (is_guest_mode(vcpu)) {
6182*4882a593Smuzhiyun to_vmx(vcpu)->nested.nested_run_pending = 0;
6183*4882a593Smuzhiyun nested_vmx_vmexit(vcpu, -1, 0, 0);
6184*4882a593Smuzhiyun }
6185*4882a593Smuzhiyun free_nested(vcpu);
6186*4882a593Smuzhiyun }
6187*4882a593Smuzhiyun
6188*4882a593Smuzhiyun static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
6189*4882a593Smuzhiyun struct kvm_nested_state __user *user_kvm_nested_state,
6190*4882a593Smuzhiyun struct kvm_nested_state *kvm_state)
6191*4882a593Smuzhiyun {
6192*4882a593Smuzhiyun struct vcpu_vmx *vmx = to_vmx(vcpu);
6193*4882a593Smuzhiyun struct vmcs12 *vmcs12;
6194*4882a593Smuzhiyun enum vm_entry_failure_code ignored;
6195*4882a593Smuzhiyun struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
6196*4882a593Smuzhiyun &user_kvm_nested_state->data.vmx[0];
6197*4882a593Smuzhiyun int ret;
6198*4882a593Smuzhiyun
6199*4882a593Smuzhiyun if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
6200*4882a593Smuzhiyun return -EINVAL;
6201*4882a593Smuzhiyun
6202*4882a593Smuzhiyun if (kvm_state->hdr.vmx.vmxon_pa == -1ull) {
6203*4882a593Smuzhiyun if (kvm_state->hdr.vmx.smm.flags)
6204*4882a593Smuzhiyun return -EINVAL;
6205*4882a593Smuzhiyun
6206*4882a593Smuzhiyun if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
6207*4882a593Smuzhiyun return -EINVAL;
6208*4882a593Smuzhiyun
6209*4882a593Smuzhiyun /*
6210*4882a593Smuzhiyun 		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
6211*4882a593Smuzhiyun 		 * enable the eVMCS capability on the vCPU. However, the code
6212*4882a593Smuzhiyun 		 * has since been changed so that the flag signals that vmcs12
6213*4882a593Smuzhiyun 		 * should be copied into the eVMCS in guest memory.
6214*4882a593Smuzhiyun 		 *
6215*4882a593Smuzhiyun 		 * To preserve backward compatibility, allow userspace
6216*4882a593Smuzhiyun 		 * to set this flag even when there is no VMXON region.
6217*4882a593Smuzhiyun */
6218*4882a593Smuzhiyun if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
6219*4882a593Smuzhiyun return -EINVAL;
6220*4882a593Smuzhiyun } else {
6221*4882a593Smuzhiyun if (!nested_vmx_allowed(vcpu))
6222*4882a593Smuzhiyun return -EINVAL;
6223*4882a593Smuzhiyun
6224*4882a593Smuzhiyun if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
6225*4882a593Smuzhiyun return -EINVAL;
6226*4882a593Smuzhiyun }
6227*4882a593Smuzhiyun
6228*4882a593Smuzhiyun if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
6229*4882a593Smuzhiyun (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
6230*4882a593Smuzhiyun return -EINVAL;
6231*4882a593Smuzhiyun
6232*4882a593Smuzhiyun if (kvm_state->hdr.vmx.smm.flags &
6233*4882a593Smuzhiyun ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
6234*4882a593Smuzhiyun return -EINVAL;
6235*4882a593Smuzhiyun
6236*4882a593Smuzhiyun if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
6237*4882a593Smuzhiyun return -EINVAL;
6238*4882a593Smuzhiyun
6239*4882a593Smuzhiyun /*
6240*4882a593Smuzhiyun * SMM temporarily disables VMX, so we cannot be in guest mode,
6241*4882a593Smuzhiyun * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
6242*4882a593Smuzhiyun * must be zero.
6243*4882a593Smuzhiyun */
6244*4882a593Smuzhiyun if (is_smm(vcpu) ?
6245*4882a593Smuzhiyun (kvm_state->flags &
6246*4882a593Smuzhiyun (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
6247*4882a593Smuzhiyun : kvm_state->hdr.vmx.smm.flags)
6248*4882a593Smuzhiyun return -EINVAL;
6249*4882a593Smuzhiyun
6250*4882a593Smuzhiyun if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
6251*4882a593Smuzhiyun !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
6252*4882a593Smuzhiyun return -EINVAL;
6253*4882a593Smuzhiyun
6254*4882a593Smuzhiyun if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
6255*4882a593Smuzhiyun (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
6256*4882a593Smuzhiyun return -EINVAL;
6257*4882a593Smuzhiyun
6258*4882a593Smuzhiyun vmx_leave_nested(vcpu);
6259*4882a593Smuzhiyun
6260*4882a593Smuzhiyun if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
6261*4882a593Smuzhiyun return 0;
6262*4882a593Smuzhiyun
6263*4882a593Smuzhiyun vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
6264*4882a593Smuzhiyun ret = enter_vmx_operation(vcpu);
6265*4882a593Smuzhiyun if (ret)
6266*4882a593Smuzhiyun return ret;
6267*4882a593Smuzhiyun
6268*4882a593Smuzhiyun 	/* Empty 'VMXON' state is permitted if no VMCS is loaded */
6269*4882a593Smuzhiyun if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
6270*4882a593Smuzhiyun /* See vmx_has_valid_vmcs12. */
6271*4882a593Smuzhiyun if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
6272*4882a593Smuzhiyun (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
6273*4882a593Smuzhiyun (kvm_state->hdr.vmx.vmcs12_pa != -1ull))
6274*4882a593Smuzhiyun return -EINVAL;
6275*4882a593Smuzhiyun else
6276*4882a593Smuzhiyun return 0;
6277*4882a593Smuzhiyun }
6278*4882a593Smuzhiyun
6279*4882a593Smuzhiyun if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
6280*4882a593Smuzhiyun if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
6281*4882a593Smuzhiyun !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
6282*4882a593Smuzhiyun return -EINVAL;
6283*4882a593Smuzhiyun
6284*4882a593Smuzhiyun set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
6285*4882a593Smuzhiyun } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
6286*4882a593Smuzhiyun /*
6287*4882a593Smuzhiyun * nested_vmx_handle_enlightened_vmptrld() cannot be called
6288*4882a593Smuzhiyun * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
6289*4882a593Smuzhiyun * restored yet. EVMCS will be mapped from
6290*4882a593Smuzhiyun * nested_get_vmcs12_pages().
6291*4882a593Smuzhiyun */
6292*4882a593Smuzhiyun kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
6293*4882a593Smuzhiyun } else {
6294*4882a593Smuzhiyun return -EINVAL;
6295*4882a593Smuzhiyun }
6296*4882a593Smuzhiyun
6297*4882a593Smuzhiyun if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
6298*4882a593Smuzhiyun vmx->nested.smm.vmxon = true;
6299*4882a593Smuzhiyun vmx->nested.vmxon = false;
6300*4882a593Smuzhiyun
6301*4882a593Smuzhiyun if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
6302*4882a593Smuzhiyun vmx->nested.smm.guest_mode = true;
6303*4882a593Smuzhiyun }
6304*4882a593Smuzhiyun
6305*4882a593Smuzhiyun vmcs12 = get_vmcs12(vcpu);
6306*4882a593Smuzhiyun if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
6307*4882a593Smuzhiyun return -EFAULT;
6308*4882a593Smuzhiyun
6309*4882a593Smuzhiyun if (vmcs12->hdr.revision_id != VMCS12_REVISION)
6310*4882a593Smuzhiyun return -EINVAL;
6311*4882a593Smuzhiyun
6312*4882a593Smuzhiyun if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
6313*4882a593Smuzhiyun return 0;
6314*4882a593Smuzhiyun
6315*4882a593Smuzhiyun vmx->nested.nested_run_pending =
6316*4882a593Smuzhiyun !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
6317*4882a593Smuzhiyun
6318*4882a593Smuzhiyun vmx->nested.mtf_pending =
6319*4882a593Smuzhiyun !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);
6320*4882a593Smuzhiyun
6321*4882a593Smuzhiyun ret = -EINVAL;
6322*4882a593Smuzhiyun if (nested_cpu_has_shadow_vmcs(vmcs12) &&
6323*4882a593Smuzhiyun vmcs12->vmcs_link_pointer != -1ull) {
6324*4882a593Smuzhiyun struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
6325*4882a593Smuzhiyun
6326*4882a593Smuzhiyun if (kvm_state->size <
6327*4882a593Smuzhiyun sizeof(*kvm_state) +
6328*4882a593Smuzhiyun sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
6329*4882a593Smuzhiyun goto error_guest_mode;
6330*4882a593Smuzhiyun
6331*4882a593Smuzhiyun if (copy_from_user(shadow_vmcs12,
6332*4882a593Smuzhiyun user_vmx_nested_state->shadow_vmcs12,
6333*4882a593Smuzhiyun sizeof(*shadow_vmcs12))) {
6334*4882a593Smuzhiyun ret = -EFAULT;
6335*4882a593Smuzhiyun goto error_guest_mode;
6336*4882a593Smuzhiyun }
6337*4882a593Smuzhiyun
6338*4882a593Smuzhiyun if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
6339*4882a593Smuzhiyun !shadow_vmcs12->hdr.shadow_vmcs)
6340*4882a593Smuzhiyun goto error_guest_mode;
6341*4882a593Smuzhiyun }
6342*4882a593Smuzhiyun
6343*4882a593Smuzhiyun vmx->nested.has_preemption_timer_deadline = false;
6344*4882a593Smuzhiyun if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
6345*4882a593Smuzhiyun vmx->nested.has_preemption_timer_deadline = true;
6346*4882a593Smuzhiyun vmx->nested.preemption_timer_deadline =
6347*4882a593Smuzhiyun kvm_state->hdr.vmx.preemption_timer_deadline;
6348*4882a593Smuzhiyun }
6349*4882a593Smuzhiyun
6350*4882a593Smuzhiyun if (nested_vmx_check_controls(vcpu, vmcs12) ||
6351*4882a593Smuzhiyun nested_vmx_check_host_state(vcpu, vmcs12) ||
6352*4882a593Smuzhiyun nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
6353*4882a593Smuzhiyun goto error_guest_mode;
6354*4882a593Smuzhiyun
6355*4882a593Smuzhiyun vmx->nested.dirty_vmcs12 = true;
6356*4882a593Smuzhiyun ret = nested_vmx_enter_non_root_mode(vcpu, false);
6357*4882a593Smuzhiyun if (ret)
6358*4882a593Smuzhiyun goto error_guest_mode;
6359*4882a593Smuzhiyun
6360*4882a593Smuzhiyun return 0;
6361*4882a593Smuzhiyun
6362*4882a593Smuzhiyun error_guest_mode:
6363*4882a593Smuzhiyun vmx->nested.nested_run_pending = 0;
6364*4882a593Smuzhiyun return ret;
6365*4882a593Smuzhiyun }
6366*4882a593Smuzhiyun
6367*4882a593Smuzhiyun void nested_vmx_set_vmcs_shadowing_bitmap(void)
6368*4882a593Smuzhiyun {
6369*4882a593Smuzhiyun if (enable_shadow_vmcs) {
6370*4882a593Smuzhiyun vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
6371*4882a593Smuzhiyun vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
6372*4882a593Smuzhiyun }
6373*4882a593Smuzhiyun }
6374*4882a593Smuzhiyun
6375*4882a593Smuzhiyun /*
6376*4882a593Smuzhiyun * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
6377*4882a593Smuzhiyun * returned for the various VMX controls MSRs when nested VMX is enabled.
6378*4882a593Smuzhiyun * The same values should also be used to verify that vmcs12 control fields are
6379*4882a593Smuzhiyun * valid during nested entry from L1 to L2.
6380*4882a593Smuzhiyun  * Each of these control MSRs has a low and a high 32-bit half: a bit in the
6381*4882a593Smuzhiyun  * low half is on if the corresponding bit in the (32-bit) control field
6382*4882a593Smuzhiyun  * *must* be on, and a bit in the high half is on if the corresponding bit
6383*4882a593Smuzhiyun  * in the control field may be on.  See also vmx_control_verify().
6384*4882a593Smuzhiyun */
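/*
 * As a rough illustration of those semantics (a sketch, not necessarily the
 * exact helper KVM uses), verifying a 32-bit control word against such an
 * MSR amounts to checking:
 *
 *	(ctl & low)  == low	// every "must be 1" bit is set
 *	(ctl & high) == ctl	// nothing outside the "may be 1" mask is set
 */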
6385*4882a593Smuzhiyun void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
6386*4882a593Smuzhiyun {
6387*4882a593Smuzhiyun 	/*
6388*4882a593Smuzhiyun 	 * Note that as a general rule, the high half of the MSRs (bits in
6389*4882a593Smuzhiyun 	 * the control fields which may be 1) should be initialized as the
6390*4882a593Smuzhiyun 	 * intersection of the underlying hardware's MSR (i.e., features which
6391*4882a593Smuzhiyun 	 * can be supported) and the list of features we want to expose,
6392*4882a593Smuzhiyun 	 * because those are the features known to be properly supported by
6393*4882a593Smuzhiyun 	 * our code.  Also, usually, the low half of the MSRs (bits which must
6394*4882a593Smuzhiyun 	 * be 1) can be set to 0, meaning that L1 may turn off any of these
6395*4882a593Smuzhiyun 	 * bits.  The reason is that if one of these bits is necessary for L0,
6396*4882a593Smuzhiyun 	 * it is already set in vmcs01, and prepare_vmcs02(), which bitwise-ORs
6397*4882a593Smuzhiyun 	 * the control fields of vmcs01 and vmcs12, will keep it set while L2
6398*4882a593Smuzhiyun 	 * runs; nested_vmx_l1_wants_exit() will then not forward the related
6399*4882a593Smuzhiyun 	 * exits to L1.  These rules have exceptions below.
6400*4882a593Smuzhiyun 	 */
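	/*
	 * Concretely (a simplified sketch with hypothetical names, not code
	 * taken from KVM):
	 *
	 *	allowed_1  = hw_allowed_1 & features_kvm_exposes;
	 *	vmcs02_ctl = vmcs01_ctl | vmcs12_ctl;
	 *
	 * so a control bit L0 relies on stays set while L2 runs even if L1
	 * left it clear in vmcs12.
	 */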
6401*4882a593Smuzhiyun
6402*4882a593Smuzhiyun /* pin-based controls */
6403*4882a593Smuzhiyun rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
6404*4882a593Smuzhiyun msrs->pinbased_ctls_low,
6405*4882a593Smuzhiyun msrs->pinbased_ctls_high);
6406*4882a593Smuzhiyun msrs->pinbased_ctls_low |=
6407*4882a593Smuzhiyun PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6408*4882a593Smuzhiyun msrs->pinbased_ctls_high &=
6409*4882a593Smuzhiyun PIN_BASED_EXT_INTR_MASK |
6410*4882a593Smuzhiyun PIN_BASED_NMI_EXITING |
6411*4882a593Smuzhiyun PIN_BASED_VIRTUAL_NMIS |
6412*4882a593Smuzhiyun (enable_apicv ? PIN_BASED_POSTED_INTR : 0);
6413*4882a593Smuzhiyun msrs->pinbased_ctls_high |=
6414*4882a593Smuzhiyun PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
6415*4882a593Smuzhiyun PIN_BASED_VMX_PREEMPTION_TIMER;
6416*4882a593Smuzhiyun
6417*4882a593Smuzhiyun /* exit controls */
6418*4882a593Smuzhiyun rdmsr(MSR_IA32_VMX_EXIT_CTLS,
6419*4882a593Smuzhiyun msrs->exit_ctls_low,
6420*4882a593Smuzhiyun msrs->exit_ctls_high);
6421*4882a593Smuzhiyun msrs->exit_ctls_low =
6422*4882a593Smuzhiyun VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
6423*4882a593Smuzhiyun
6424*4882a593Smuzhiyun msrs->exit_ctls_high &=
6425*4882a593Smuzhiyun #ifdef CONFIG_X86_64
6426*4882a593Smuzhiyun VM_EXIT_HOST_ADDR_SPACE_SIZE |
6427*4882a593Smuzhiyun #endif
6428*4882a593Smuzhiyun VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
6429*4882a593Smuzhiyun VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
6430*4882a593Smuzhiyun msrs->exit_ctls_high |=
6431*4882a593Smuzhiyun VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
6432*4882a593Smuzhiyun VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
6433*4882a593Smuzhiyun VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
6434*4882a593Smuzhiyun
6435*4882a593Smuzhiyun /* We support free control of debug control saving. */
6436*4882a593Smuzhiyun msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
6437*4882a593Smuzhiyun
6438*4882a593Smuzhiyun /* entry controls */
6439*4882a593Smuzhiyun rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
6440*4882a593Smuzhiyun msrs->entry_ctls_low,
6441*4882a593Smuzhiyun msrs->entry_ctls_high);
6442*4882a593Smuzhiyun msrs->entry_ctls_low =
6443*4882a593Smuzhiyun VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
6444*4882a593Smuzhiyun msrs->entry_ctls_high &=
6445*4882a593Smuzhiyun #ifdef CONFIG_X86_64
6446*4882a593Smuzhiyun VM_ENTRY_IA32E_MODE |
6447*4882a593Smuzhiyun #endif
6448*4882a593Smuzhiyun VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
6449*4882a593Smuzhiyun VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
6450*4882a593Smuzhiyun msrs->entry_ctls_high |=
6451*4882a593Smuzhiyun (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
6452*4882a593Smuzhiyun
6453*4882a593Smuzhiyun /* We support free control of debug control loading. */
6454*4882a593Smuzhiyun msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
6455*4882a593Smuzhiyun
6456*4882a593Smuzhiyun /* cpu-based controls */
6457*4882a593Smuzhiyun rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
6458*4882a593Smuzhiyun msrs->procbased_ctls_low,
6459*4882a593Smuzhiyun msrs->procbased_ctls_high);
6460*4882a593Smuzhiyun msrs->procbased_ctls_low =
6461*4882a593Smuzhiyun CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6462*4882a593Smuzhiyun msrs->procbased_ctls_high &=
6463*4882a593Smuzhiyun CPU_BASED_INTR_WINDOW_EXITING |
6464*4882a593Smuzhiyun CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
6465*4882a593Smuzhiyun CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
6466*4882a593Smuzhiyun CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
6467*4882a593Smuzhiyun CPU_BASED_CR3_STORE_EXITING |
6468*4882a593Smuzhiyun #ifdef CONFIG_X86_64
6469*4882a593Smuzhiyun CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
6470*4882a593Smuzhiyun #endif
6471*4882a593Smuzhiyun CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
6472*4882a593Smuzhiyun CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
6473*4882a593Smuzhiyun CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
6474*4882a593Smuzhiyun CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
6475*4882a593Smuzhiyun CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
6476*4882a593Smuzhiyun /*
6477*4882a593Smuzhiyun * We can allow some features even when not supported by the
6478*4882a593Smuzhiyun * hardware. For example, L1 can specify an MSR bitmap - and we
6479*4882a593Smuzhiyun * can use it to avoid exits to L1 - even when L0 runs L2
6480*4882a593Smuzhiyun * without MSR bitmaps.
6481*4882a593Smuzhiyun */
6482*4882a593Smuzhiyun msrs->procbased_ctls_high |=
6483*4882a593Smuzhiyun CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
6484*4882a593Smuzhiyun CPU_BASED_USE_MSR_BITMAPS;
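	/*
	 * Simplified illustration of why advertising MSR bitmaps is safe even
	 * without hardware support: when L0 runs L2 without a hardware bitmap,
	 * every MSR access in L2 exits to L0, which can consult L1's bitmap in
	 * software before deciding whether to reflect the exit to L1, so the
	 * semantics L1 was promised are preserved.
	 */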
6485*4882a593Smuzhiyun
6486*4882a593Smuzhiyun /* We support free control of CR3 access interception. */
6487*4882a593Smuzhiyun msrs->procbased_ctls_low &=
6488*4882a593Smuzhiyun ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
6489*4882a593Smuzhiyun
6490*4882a593Smuzhiyun /*
6491*4882a593Smuzhiyun * secondary cpu-based controls. Do not include those that
6492*4882a593Smuzhiyun * depend on CPUID bits, they are added later by
6493*4882a593Smuzhiyun * vmx_vcpu_after_set_cpuid.
6494*4882a593Smuzhiyun */
6495*4882a593Smuzhiyun if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
6496*4882a593Smuzhiyun rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
6497*4882a593Smuzhiyun msrs->secondary_ctls_low,
6498*4882a593Smuzhiyun msrs->secondary_ctls_high);
6499*4882a593Smuzhiyun
6500*4882a593Smuzhiyun msrs->secondary_ctls_low = 0;
6501*4882a593Smuzhiyun msrs->secondary_ctls_high &=
6502*4882a593Smuzhiyun SECONDARY_EXEC_DESC |
6503*4882a593Smuzhiyun SECONDARY_EXEC_ENABLE_RDTSCP |
6504*4882a593Smuzhiyun SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
6505*4882a593Smuzhiyun SECONDARY_EXEC_WBINVD_EXITING |
6506*4882a593Smuzhiyun SECONDARY_EXEC_APIC_REGISTER_VIRT |
6507*4882a593Smuzhiyun SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
6508*4882a593Smuzhiyun SECONDARY_EXEC_RDRAND_EXITING |
6509*4882a593Smuzhiyun SECONDARY_EXEC_ENABLE_INVPCID |
6510*4882a593Smuzhiyun SECONDARY_EXEC_RDSEED_EXITING |
6511*4882a593Smuzhiyun SECONDARY_EXEC_XSAVES;
6512*4882a593Smuzhiyun
6513*4882a593Smuzhiyun /*
6514*4882a593Smuzhiyun * We can emulate "VMCS shadowing," even if the hardware
6515*4882a593Smuzhiyun * doesn't support it.
6516*4882a593Smuzhiyun */
6517*4882a593Smuzhiyun msrs->secondary_ctls_high |=
6518*4882a593Smuzhiyun SECONDARY_EXEC_SHADOW_VMCS;
6519*4882a593Smuzhiyun
6520*4882a593Smuzhiyun if (enable_ept) {
6521*4882a593Smuzhiyun /* nested EPT: emulate EPT also to L1 */
6522*4882a593Smuzhiyun msrs->secondary_ctls_high |=
6523*4882a593Smuzhiyun SECONDARY_EXEC_ENABLE_EPT;
6524*4882a593Smuzhiyun msrs->ept_caps =
6525*4882a593Smuzhiyun VMX_EPT_PAGE_WALK_4_BIT |
6526*4882a593Smuzhiyun VMX_EPT_PAGE_WALK_5_BIT |
6527*4882a593Smuzhiyun VMX_EPTP_WB_BIT |
6528*4882a593Smuzhiyun VMX_EPT_INVEPT_BIT |
6529*4882a593Smuzhiyun VMX_EPT_EXECUTE_ONLY_BIT;
6530*4882a593Smuzhiyun
6531*4882a593Smuzhiyun msrs->ept_caps &= ept_caps;
6532*4882a593Smuzhiyun msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
6533*4882a593Smuzhiyun VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
6534*4882a593Smuzhiyun VMX_EPT_1GB_PAGE_BIT;
6535*4882a593Smuzhiyun if (enable_ept_ad_bits) {
6536*4882a593Smuzhiyun msrs->secondary_ctls_high |=
6537*4882a593Smuzhiyun SECONDARY_EXEC_ENABLE_PML;
6538*4882a593Smuzhiyun msrs->ept_caps |= VMX_EPT_AD_BIT;
6539*4882a593Smuzhiyun }
6540*4882a593Smuzhiyun }
6541*4882a593Smuzhiyun
6542*4882a593Smuzhiyun if (cpu_has_vmx_vmfunc()) {
6543*4882a593Smuzhiyun msrs->secondary_ctls_high |=
6544*4882a593Smuzhiyun SECONDARY_EXEC_ENABLE_VMFUNC;
6545*4882a593Smuzhiyun /*
6546*4882a593Smuzhiyun * Advertise EPTP switching unconditionally
6547*4882a593Smuzhiyun * since we emulate it
6548*4882a593Smuzhiyun */
6549*4882a593Smuzhiyun if (enable_ept)
6550*4882a593Smuzhiyun msrs->vmfunc_controls =
6551*4882a593Smuzhiyun VMX_VMFUNC_EPTP_SWITCHING;
6552*4882a593Smuzhiyun }
6553*4882a593Smuzhiyun
6554*4882a593Smuzhiyun 	/*
6555*4882a593Smuzhiyun 	 * Old versions of KVM use the single-context version without checking
6556*4882a593Smuzhiyun 	 * for support, so declare that it is supported even though it is
6557*4882a593Smuzhiyun 	 * treated as global context.  The alternative, leaving it unadvertised
6558*4882a593Smuzhiyun 	 * but still not failing the single-context INVVPID, would be worse.
6559*4882a593Smuzhiyun 	 */
6560*4882a593Smuzhiyun if (enable_vpid) {
6561*4882a593Smuzhiyun msrs->secondary_ctls_high |=
6562*4882a593Smuzhiyun SECONDARY_EXEC_ENABLE_VPID;
6563*4882a593Smuzhiyun msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
6564*4882a593Smuzhiyun VMX_VPID_EXTENT_SUPPORTED_MASK;
6565*4882a593Smuzhiyun }
6566*4882a593Smuzhiyun
6567*4882a593Smuzhiyun if (enable_unrestricted_guest)
6568*4882a593Smuzhiyun msrs->secondary_ctls_high |=
6569*4882a593Smuzhiyun SECONDARY_EXEC_UNRESTRICTED_GUEST;
6570*4882a593Smuzhiyun
6571*4882a593Smuzhiyun if (flexpriority_enabled)
6572*4882a593Smuzhiyun msrs->secondary_ctls_high |=
6573*4882a593Smuzhiyun SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6574*4882a593Smuzhiyun
6575*4882a593Smuzhiyun /* miscellaneous data */
6576*4882a593Smuzhiyun rdmsr(MSR_IA32_VMX_MISC,
6577*4882a593Smuzhiyun msrs->misc_low,
6578*4882a593Smuzhiyun msrs->misc_high);
6579*4882a593Smuzhiyun msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
6580*4882a593Smuzhiyun msrs->misc_low |=
6581*4882a593Smuzhiyun MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
6582*4882a593Smuzhiyun VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
6583*4882a593Smuzhiyun VMX_MISC_ACTIVITY_HLT;
6584*4882a593Smuzhiyun msrs->misc_high = 0;
6585*4882a593Smuzhiyun
6586*4882a593Smuzhiyun /*
6587*4882a593Smuzhiyun * This MSR reports some information about VMX support. We
6588*4882a593Smuzhiyun * should return information about the VMX we emulate for the
6589*4882a593Smuzhiyun * guest, and the VMCS structure we give it - not about the
6590*4882a593Smuzhiyun * VMX support of the underlying hardware.
6591*4882a593Smuzhiyun */
6592*4882a593Smuzhiyun msrs->basic =
6593*4882a593Smuzhiyun VMCS12_REVISION |
6594*4882a593Smuzhiyun VMX_BASIC_TRUE_CTLS |
6595*4882a593Smuzhiyun ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
6596*4882a593Smuzhiyun (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
6597*4882a593Smuzhiyun
6598*4882a593Smuzhiyun if (cpu_has_vmx_basic_inout())
6599*4882a593Smuzhiyun msrs->basic |= VMX_BASIC_INOUT;
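	/*
	 * Sketch of the resulting value, following the architectural
	 * IA32_VMX_BASIC layout: bits 30:0 VMCS revision, bits 44:32 VMCS
	 * region size, bits 53:50 memory type (write-back), bit 54 INS/OUTS
	 * information reporting, bit 55 "true" control MSRs available.
	 */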
6600*4882a593Smuzhiyun
6601*4882a593Smuzhiyun /*
6602*4882a593Smuzhiyun * These MSRs specify bits which the guest must keep fixed on
6603*4882a593Smuzhiyun * while L1 is in VMXON mode (in L1's root mode, or running an L2).
6604*4882a593Smuzhiyun * We picked the standard core2 setting.
6605*4882a593Smuzhiyun */
6606*4882a593Smuzhiyun #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
6607*4882a593Smuzhiyun #define VMXON_CR4_ALWAYSON X86_CR4_VMXE
6608*4882a593Smuzhiyun msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
6609*4882a593Smuzhiyun msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
6610*4882a593Smuzhiyun
6611*4882a593Smuzhiyun 	/* Bits clear in these FIXED1 MSRs must be kept off by the guest. */
6612*4882a593Smuzhiyun rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
6613*4882a593Smuzhiyun rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
6614*4882a593Smuzhiyun
6615*4882a593Smuzhiyun 	/* highest index: VMX_PREEMPTION_TIMER_VALUE (reported in bits 9:1, hence the shift) */
6616*4882a593Smuzhiyun msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
6617*4882a593Smuzhiyun }
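
/*
 * Minimal illustration (a hypothetical, unused helper shown only as a sketch,
 * not one used by KVM) of how a fixed0/fixed1 pair set up above constrains a
 * guest control-register value: every bit required to be 1 must be set, and
 * no bit outside the allowed-1 mask may be set.
 */
static inline bool nested_cr_fixed_bits_ok_example(u64 val, u64 fixed0, u64 fixed1)
{
	return ((val & fixed0) == fixed0) && ((val & ~fixed1) == 0);
}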
6618*4882a593Smuzhiyun
6619*4882a593Smuzhiyun void nested_vmx_hardware_unsetup(void)
6620*4882a593Smuzhiyun {
6621*4882a593Smuzhiyun int i;
6622*4882a593Smuzhiyun
6623*4882a593Smuzhiyun if (enable_shadow_vmcs) {
6624*4882a593Smuzhiyun for (i = 0; i < VMX_BITMAP_NR; i++)
6625*4882a593Smuzhiyun free_page((unsigned long)vmx_bitmap[i]);
6626*4882a593Smuzhiyun }
6627*4882a593Smuzhiyun }
6628*4882a593Smuzhiyun
6629*4882a593Smuzhiyun __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
6630*4882a593Smuzhiyun {
6631*4882a593Smuzhiyun int i;
6632*4882a593Smuzhiyun
6633*4882a593Smuzhiyun if (!cpu_has_vmx_shadow_vmcs())
6634*4882a593Smuzhiyun enable_shadow_vmcs = 0;
6635*4882a593Smuzhiyun if (enable_shadow_vmcs) {
6636*4882a593Smuzhiyun for (i = 0; i < VMX_BITMAP_NR; i++) {
6637*4882a593Smuzhiyun /*
6638*4882a593Smuzhiyun * The vmx_bitmap is not tied to a VM and so should
6639*4882a593Smuzhiyun * not be charged to a memcg.
6640*4882a593Smuzhiyun */
6641*4882a593Smuzhiyun vmx_bitmap[i] = (unsigned long *)
6642*4882a593Smuzhiyun __get_free_page(GFP_KERNEL);
6643*4882a593Smuzhiyun if (!vmx_bitmap[i]) {
6644*4882a593Smuzhiyun nested_vmx_hardware_unsetup();
6645*4882a593Smuzhiyun return -ENOMEM;
6646*4882a593Smuzhiyun }
6647*4882a593Smuzhiyun }
6648*4882a593Smuzhiyun
6649*4882a593Smuzhiyun init_vmcs_shadow_fields();
6650*4882a593Smuzhiyun }
6651*4882a593Smuzhiyun
6652*4882a593Smuzhiyun exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
6653*4882a593Smuzhiyun exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
6654*4882a593Smuzhiyun exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
6655*4882a593Smuzhiyun exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
6656*4882a593Smuzhiyun exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
6657*4882a593Smuzhiyun exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
6658*4882a593Smuzhiyun exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
6659*4882a593Smuzhiyun exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff;
6660*4882a593Smuzhiyun exit_handlers[EXIT_REASON_VMON] = handle_vmon;
6661*4882a593Smuzhiyun exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
6662*4882a593Smuzhiyun exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
6663*4882a593Smuzhiyun exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;
6664*4882a593Smuzhiyun
6665*4882a593Smuzhiyun return 0;
6666*4882a593Smuzhiyun }
6667*4882a593Smuzhiyun
6668*4882a593Smuzhiyun struct kvm_x86_nested_ops vmx_nested_ops = {
6669*4882a593Smuzhiyun .leave_nested = vmx_leave_nested,
6670*4882a593Smuzhiyun .check_events = vmx_check_nested_events,
6671*4882a593Smuzhiyun .hv_timer_pending = nested_vmx_preemption_timer_pending,
6672*4882a593Smuzhiyun .get_state = vmx_get_nested_state,
6673*4882a593Smuzhiyun .set_state = vmx_set_nested_state,
6674*4882a593Smuzhiyun .get_nested_state_pages = vmx_get_nested_state_pages,
6675*4882a593Smuzhiyun .write_log_dirty = nested_vmx_write_pml_buffer,
6676*4882a593Smuzhiyun .enable_evmcs = nested_enable_evmcs,
6677*4882a593Smuzhiyun .get_evmcs_version = nested_get_evmcs_version,
6678*4882a593Smuzhiyun };
6679