2 * Copyright (c) 2003-2013 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Mihai Carabas <mihai.carabas@gmail.com>
6 * by Matthew Dillon <dillon@backplane.com>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
18 * 3. Neither the name of The DragonFly Project nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific, prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 #include <sys/malloc.h>
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/types.h>
40 #include <sys/thread.h>
41 #include <sys/thread2.h>
42 #include <sys/sysctl.h>
45 #include <sys/syscall.h>
47 #include <sys/vkernel.h>
48 #include <sys/mplock2.h>
53 #include <machine/cpufunc.h>
54 #include <machine/cputypes.h>
55 #include <machine/smp.h>
56 #include <machine/globaldata.h>
57 #include <machine/trap.h>
58 #include <machine/pmap.h>
59 #include <machine/md_var.h>
61 #include <vm/vm_map.h>
62 #include <vm/vm_extern.h>
63 #include <vm/vm_param.h>
66 #include "vmm_utils.h"
69 #include "vmx_instr.h"
74 extern void trap(struct trapframe
*frame
);
76 static int vmx_check_cpu_migration(void);
77 static int execute_vmptrld(struct vmx_thread_info
*vti
);
/*
 * Byte pattern of the SYSCALL instruction; used by instr_check() in the
 * #UD vmexit path to recognize guest system calls.
 */
79 struct instr_decode syscall_asm
= {
/*
 * Per-category VMX control descriptors.  Each entry records the capability
 * MSR and its "TRUE" variant (consulted when IA32_VMX_BASIC advertises
 * TRUE controls); the effective ->ctls mask is computed at init time by
 * vmx_set_ctl_setting()/vmx_set_default_settings().
 */
85 struct vmx_ctl_info vmx_pinbased
= {
86 .msr_addr
= IA32_VMX_PINBASED_CTLS
,
87 .msr_true_addr
= IA32_VMX_TRUE_PINBASED_CTLS
,
90 struct vmx_ctl_info vmx_procbased
= {
91 .msr_addr
= IA32_VMX_PROCBASED_CTLS
,
92 .msr_true_addr
= IA32_VMX_TRUE_PROCBASED_CTLS
,
/*
 * NOTE(review): the secondary processor-based controls have no TRUE MSR,
 * so msr_true_addr intentionally repeats IA32_VMX_PROCBASED_CTLS2 —
 * confirm against the Intel SDM if this descriptor is reused elsewhere.
 */
95 struct vmx_ctl_info vmx_procbased2
= {
96 .msr_addr
= IA32_VMX_PROCBASED_CTLS2
,
97 .msr_true_addr
= IA32_VMX_PROCBASED_CTLS2
,
100 struct vmx_ctl_info vmx_exit
= {
101 .msr_addr
= IA32_VMX_EXIT_CTLS
,
102 .msr_true_addr
= IA32_VMX_TRUE_EXIT_CTLS
,
105 struct vmx_ctl_info vmx_entry
= {
106 .msr_addr
= IA32_VMX_ENTRY_CTLS
,
107 .msr_true_addr
= IA32_VMX_TRUE_ENTRY_CTLS
,
110 /* Declared in generic vmm.c - SYSCTL parent */
111 extern struct sysctl_oid
*vmm_sysctl_tree
;
113 /* SYSCTL tree and context */
114 static struct sysctl_oid
*vmx_sysctl_tree
;
115 static struct sysctl_ctx_list vmx_sysctl_ctx
;
/* Per-cpu VMXON state; array of ncpus entries, see alloc_vmxon_regions() */
118 struct vmx_pcpu_info
*pcpu_info
;
/* Values decoded from IA32_VMX_BASIC during initialization */
121 uint32_t vmx_revision
;
122 uint32_t vmx_region_size
;
123 uint8_t vmx_width_addr
;
125 /* IA32_VMX_EPT_VPID_CAP */
126 uint64_t vmx_ept_vpid_cap
;
/*
 * CR0/CR4 bits forced to 0 or 1 while in VMX operation
 * (Intel SDM appendices A.7 and A.8).
 */
129 uint64_t cr0_fixed_to_0
;
130 uint64_t cr4_fixed_to_0
;
131 uint64_t cr0_fixed_to_1
;
132 uint64_t cr4_fixed_to_1
;
/* Module state: set by init/enable paths respectively */
135 static uint8_t vmx_enabled
= 0;
136 static uint8_t vmx_initialized
= 0;
138 /* VMX set control setting
139 * Intel System Programming Guide, Part 3, Order Number 326019
140 * 31.5.1 Algorithms for Determining VMX Capabilities
141 * Implement Algorithm 3
 *
 * Sets or clears bit_no in vmx_ctl->ctls according to `value` and to what
 * the capability MSRs actually allow.  Reads the TRUE capability MSR when
 * IA32_VMX_BASIC says one exists, otherwise the plain MSR.
 */
144 vmx_set_ctl_setting(struct vmx_ctl_info
*vmx_ctl
, uint32_t bit_no
, setting_t value
) {
148 /* Check if its branch b. or c. */
149 vmx_basic
= rdmsr(IA32_VMX_BASIC
);
150 if (IS_TRUE_CTL_AVAIL(vmx_basic
))
151 ctl_val
= rdmsr(vmx_ctl
->msr_true_addr
);
153 ctl_val
= rdmsr(vmx_ctl
->msr_addr
);
155 /* Check if the value is known by VMM or set on DEFAULT */
159 * Both settings are allowed
164 if (IS_ZERO_SETTING_ALLOWED(ctl_val
, bit_no
) &&
165 IS_ONE_SETTING_ALLOWED(ctl_val
, bit_no
)) {
166 /* For c.iii) and c.iv) */
167 if (IS_TRUE_CTL_AVAIL(vmx_basic
))
168 ctl_val
= rdmsr(vmx_ctl
->msr_addr
);
170 if (IS_ZERO_SETTING_ALLOWED(ctl_val
, bit_no
))
171 vmx_ctl
->ctls
&= ~BIT(bit_no
);
172 else if (IS_ONE_SETTING_ALLOWED(ctl_val
, bit_no
))
173 vmx_ctl
->ctls
|= BIT(bit_no
);
/* Only the zero setting is allowed: the bit must be cleared */
174 } else if (IS_ZERO_SETTING_ALLOWED(ctl_val
, bit_no
)) {
176 vmx_ctl
->ctls
&= ~BIT(bit_no
);
/* Only the one setting is allowed: the bit must be set */
177 } else if (IS_ONE_SETTING_ALLOWED(ctl_val
, bit_no
)) {
179 vmx_ctl
->ctls
|= BIT(bit_no
);
185 /* For b.ii) or c.ii) */
186 if (!IS_ZERO_SETTING_ALLOWED(ctl_val
, bit_no
))
188 vmx_ctl
->ctls
&= ~BIT(bit_no
);
191 /* For b.ii) or c.ii) */
192 if (!IS_ONE_SETTING_ALLOWED(ctl_val
, bit_no
))
195 vmx_ctl
->ctls
|= BIT(bit_no
);
/*
 * Apply the DEFAULT policy to all 32 control bits of the given
 * vmx_ctl_info, populating its ->ctls mask from the capability MSRs.
 */
203 vmx_set_default_settings(struct vmx_ctl_info
*vmx_ctl
)
207 for (i
= 0; i
< 32; i
++) {
208 vmx_set_ctl_setting(vmx_ctl
, i
, DEFAULT
);
/*
 * Allocate one VMXON region per cpu.  The raw allocation is oversized by
 * VMXON_REGION_ALIGN_SIZE so the usable pointer can be rounded up to the
 * 4K alignment VMXON requires; the unaligned pointer is kept in
 * vmxon_region_na for kfree().
 */
213 alloc_vmxon_regions(void)
216 pcpu_info
= kmalloc(ncpus
* sizeof(struct vmx_pcpu_info
), M_TEMP
, M_WAITOK
| M_ZERO
);
218 for (cpu
= 0; cpu
< ncpus
; cpu
++) {
220 /* The address must be aligned to 4K - alloc extra */
221 pcpu_info
[cpu
].vmxon_region_na
= kmalloc(vmx_region_size
+ VMXON_REGION_ALIGN_SIZE
,
226 pcpu_info
[cpu
].vmxon_region
= (unsigned char*) VMXON_REGION_ALIGN(pcpu_info
[cpu
].vmxon_region_na
);
228 /* In the first 31 bits put the vmx revision*/
229 *((uint32_t *) pcpu_info
[cpu
].vmxon_region
) = vmx_revision
;
/*
 * Release the per-cpu VMXON regions and the pcpu_info array itself.
 * Frees the unaligned base pointer (vmxon_region_na); vmxon_region is
 * only an aligned alias into the same allocation.
 */
234 free_vmxon_regions(void)
238 for (i
= 0; i
< ncpus
; i
++) {
239 pcpu_info
[i
].vmxon_region
= NULL
;
241 kfree(pcpu_info
[i
].vmxon_region_na
, M_TEMP
);
244 kfree(pcpu_info
, M_TEMP
);
/*
 * Create the read-only "vmx" sysctl subtree under the generic vmm node
 * and export the detected VMX capabilities (revision, region size,
 * address width, the computed control masks, and the EPT/VPID
 * capability MSR) for inspection from userland.
 */
248 build_vmx_sysctl(void)
250 sysctl_ctx_init(&vmx_sysctl_ctx
);
251 vmx_sysctl_tree
= SYSCTL_ADD_NODE(&vmx_sysctl_ctx
,
252 SYSCTL_CHILDREN(vmm_sysctl_tree
),
254 CTLFLAG_RD
, 0, "VMX options");
256 SYSCTL_ADD_UINT(&vmx_sysctl_ctx
,
257 SYSCTL_CHILDREN(vmx_sysctl_tree
),
258 OID_AUTO
, "revision", CTLFLAG_RD
,
261 SYSCTL_ADD_UINT(&vmx_sysctl_ctx
,
262 SYSCTL_CHILDREN(vmx_sysctl_tree
),
263 OID_AUTO
, "region_size", CTLFLAG_RD
,
266 SYSCTL_ADD_INT(&vmx_sysctl_ctx
,
267 SYSCTL_CHILDREN(vmx_sysctl_tree
),
268 OID_AUTO
, "width_addr", CTLFLAG_RD
,
270 "VMX width address");
271 SYSCTL_ADD_UINT(&vmx_sysctl_ctx
,
272 SYSCTL_CHILDREN(vmx_sysctl_tree
),
273 OID_AUTO
, "pinbased_ctls", CTLFLAG_RD
,
274 &vmx_pinbased
.ctls
, 0,
275 "VMX pin-based controls");
276 SYSCTL_ADD_UINT(&vmx_sysctl_ctx
,
277 SYSCTL_CHILDREN(vmx_sysctl_tree
),
278 OID_AUTO
, "procbased_ctls", CTLFLAG_RD
,
279 &vmx_procbased
.ctls
, 0,
280 "VMX primary processor-based controls");
281 SYSCTL_ADD_UINT(&vmx_sysctl_ctx
,
282 SYSCTL_CHILDREN(vmx_sysctl_tree
),
283 OID_AUTO
, "procbased2_ctls", CTLFLAG_RD
,
284 &vmx_procbased2
.ctls
, 0,
285 "VMX secondary processor-based controls");
286 SYSCTL_ADD_UINT(&vmx_sysctl_ctx
,
287 SYSCTL_CHILDREN(vmx_sysctl_tree
),
288 OID_AUTO
, "vmexit_ctls", CTLFLAG_RD
,
290 "VMX exit controls");
291 SYSCTL_ADD_UINT(&vmx_sysctl_ctx
,
292 SYSCTL_CHILDREN(vmx_sysctl_tree
),
293 OID_AUTO
, "vmentry_ctls", CTLFLAG_RD
,
295 "VMX entry controls");
296 SYSCTL_ADD_ULONG(&vmx_sysctl_ctx
,
297 SYSCTL_CHILDREN(vmx_sysctl_tree
),
298 OID_AUTO
, "ept_vpid_cap", CTLFLAG_RD
,
308 uint64_t feature_control
;
309 uint64_t vmx_basic_value
;
310 uint64_t cr0_fixed_bits_to_1
;
311 uint64_t cr0_fixed_bits_to_0
;
312 uint64_t cr4_fixed_bits_to_0
;
313 uint64_t cr4_fixed_bits_to_1
;
319 * The ability of a processor to support VMX operation
320 * and related instructions is indicated by:
321 * CPUID.1:ECX.VMX[bit 5] = 1
323 if (!(cpu_feature2
& CPUID2_VMX
)) {
324 kprintf("VMM: VMX is not supported by this Intel CPU\n");
328 vmx_set_default_settings(&vmx_pinbased
);
330 vmx_set_default_settings(&vmx_procbased
);
331 /* Enable second level for procbased */
332 err
= vmx_set_ctl_setting(&vmx_procbased
,
333 PROCBASED_ACTIVATE_SECONDARY_CONTROLS
,
336 kprintf("VMM: PROCBASED_ACTIVATE_SECONDARY_CONTROLS not "
337 "supported by this CPU\n");
340 vmx_set_default_settings(&vmx_procbased2
);
342 vmx_set_default_settings(&vmx_exit
);
343 vmx_set_default_settings(&vmx_entry
);
345 /* Enable external interrupts exiting */
346 err
= vmx_set_ctl_setting(&vmx_pinbased
,
347 PINBASED_EXTERNAL_INTERRUPT_EXITING
,
350 kprintf("VMM: PINBASED_EXTERNAL_INTERRUPT_EXITING not "
351 "supported by this CPU\n");
355 /* Enable non-maskable interrupts exiting */
356 err
= vmx_set_ctl_setting(&vmx_pinbased
, PINBASED_NMI_EXITING
, ONE
);
358 kprintf("VMM: PINBASED_NMI_EXITING not "
359 "supported by this CPU\n");
364 /* Set 64bits mode for GUEST */
365 err
= vmx_set_ctl_setting(&vmx_entry
, VMENTRY_IA32e_MODE_GUEST
, ONE
);
367 kprintf("VMM: VMENTRY_IA32e_MODE_GUEST not "
368 "supported by this CPU\n");
372 /* Load MSR EFER on enry */
373 err
= vmx_set_ctl_setting(&vmx_entry
,
374 VMENTRY_LOAD_IA32_EFER
, ONE
);
376 kprintf("VMM: VMENTRY_LOAD_IA32_EFER not "
377 "supported by this CPU\n");
381 /* Set 64bits mode */
382 err
= vmx_set_ctl_setting(&vmx_exit
,
383 VMEXIT_HOST_ADDRESS_SPACE_SIZE
, ONE
);
385 kprintf("VMM: VMEXIT_HOST_ADDRESS_SPACE_SIZE not "
386 "supported by this CPU\n");
390 /* Save/Load Efer on exit */
391 err
= vmx_set_ctl_setting(&vmx_exit
,
392 VMEXIT_SAVE_IA32_EFER
,
395 kprintf("VMM: VMEXIT_SAVE_IA32_EFER not "
396 "supported by this CPU\n");
400 /* Load Efer on exit */
401 err
= vmx_set_ctl_setting(&vmx_exit
,
402 VMEXIT_LOAD_IA32_EFER
,
405 kprintf("VMM: VMEXIT_LOAD_IA32_EFER not "
406 "supported by this CPU\n");
410 /* Enable EPT feature */
411 err
= vmx_set_ctl_setting(&vmx_procbased2
,
412 PROCBASED2_ENABLE_EPT
,
415 kprintf("VMM: PROCBASED2_ENABLE_EPT not "
416 "supported by this CPU\n");
420 if (vmx_ept_init()) {
421 kprintf("VMM: vmx_ept_init failed\n");
425 /* XXX - to implement in the feature */
426 /* Enable VPID feature */
427 err
= vmx_set_ctl_setting(&vmx_procbased2
,
428 PROCBASED2_ENABLE_VPID
,
431 kprintf("VMM: PROCBASED2_ENABLE_VPID not "
432 "supported by this CPU\n");
437 /* Check for the feature control status */
438 feature_control
= rdmsr(IA32_FEATURE_CONTROL
);
439 if (!(feature_control
& BIT(FEATURE_CONTROL_LOCKED
))) {
440 kprintf("VMM: IA32_FEATURE_CONTROL is not locked\n");
443 if (!(feature_control
& BIT(FEATURE_CONTROL_VMX_BIOS_ENABLED
))) {
444 kprintf("VMM: VMX is disabled by the BIOS\n");
448 vmx_basic_value
= rdmsr(IA32_VMX_BASIC
);
449 vmx_width_addr
= (uint8_t) VMX_WIDTH_ADDR(vmx_basic_value
);
450 vmx_region_size
= (uint32_t) VMX_REGION_SIZE(vmx_basic_value
);
451 vmx_revision
= (uint32_t) VMX_REVISION(vmx_basic_value
);
453 /* A.7 VMX-FIXED BITS IN CR0 */
454 cr0_fixed_bits_to_1
= rdmsr(IA32_VMX_CR0_FIXED0
);
455 cr0_fixed_bits_to_0
= rdmsr(IA32_VMX_CR0_FIXED1
);
456 cr0_fixed_to_1
= cr0_fixed_bits_to_1
& cr0_fixed_bits_to_0
;
457 cr0_fixed_to_0
= ~cr0_fixed_bits_to_1
& ~cr0_fixed_bits_to_0
;
459 /* A.8 VMX-FIXED BITS IN CR4 */
460 cr4_fixed_bits_to_1
= rdmsr(IA32_VMX_CR4_FIXED0
);
461 cr4_fixed_bits_to_0
= rdmsr(IA32_VMX_CR4_FIXED1
);
462 cr4_fixed_to_1
= cr4_fixed_bits_to_1
& cr4_fixed_bits_to_0
;
463 cr4_fixed_to_0
= ~cr4_fixed_bits_to_1
& ~cr4_fixed_bits_to_0
;
/*
 * Per-cpu callback (run via lwkt_cpusync_simple) that enters VMX root
 * operation on the current cpu: applies the CR0/CR4 fixed bits, sets
 * CR4.VMXE, and executes VMXON on this cpu's prepared region.  The
 * result code is returned through *perr.
 */
472 execute_vmxon(void *perr
)
474 unsigned char *vmxon_region
;
475 int *err
= (int*) perr
;
477 /* A.7 VMX-FIXED BITS IN CR0 */
478 load_cr0((rcr0() | cr0_fixed_to_1
) & ~cr0_fixed_to_0
);
480 /* A.8 VMX-FIXED BITS IN CR4 */
481 load_cr4((rcr4() | cr4_fixed_to_1
) & ~cr4_fixed_to_0
);
/* VMXE must be set before VMXON may execute */
484 load_cr4(rcr4() | CR4_VMXE
);
486 vmxon_region
= pcpu_info
[mycpuid
].vmxon_region
;
487 *err
= vmxon(vmxon_region
);
489 kprintf("VMM: vmxon failed on cpu%d\n", mycpuid
);
/*
 * Per-cpu callback that leaves VMX root operation: invalidates all EPT
 * contexts, then clears CR4.VMXE.  `dummy` is unused (cpusync callback
 * signature).
 */
494 execute_vmxoff(void *dummy
)
496 invept_desc_t desc
= { 0 };
498 if (invept(INVEPT_TYPE_ALL_CONTEXTS
, (uint64_t*) &desc
))
499 kprintf("VMM: execute_vmxoff: invet failed on cpu%d\n", mycpu
->gd_cpuid
);
504 load_cr4(rcr4() & ~CR4_VMXE
);
/*
 * Per-cpu callback that vmclear's the given thread's VMCS if it is the
 * one currently loaded on this cpu, then clears the per-cpu loaded_vmx
 * pointer.  Ordering is critical -- see the inline comment.
 */
508 execute_vmclear(void *data
)
510 struct vmx_thread_info
*vti
= data
;
512 globaldata_t gd
= mycpu
;
514 if (pcpu_info
[gd
->gd_cpuid
].loaded_vmx
== vti
) {
516 * Must set vti->launched to zero after vmclear'ing to
517 * force a vmlaunch the next time.
519 * Must not clear the loaded_vmx field until after we call
520 * vmclear on the region. This field triggers the interlocked
521 * cpusync from another cpu trying to destroy or reuse
522 * the vti. If we clear the field first, the other cpu will
523 * not interlock and may race our vmclear() on the underlying
526 ERROR_IF(vmclear(vti
->vmcs_region
));
528 pcpu_info
[gd
->gd_cpuid
].loaded_vmx
= NULL
;
/*
 * Make vti's VMCS the current VMCS on this cpu.  Any different VMCS that
 * is still loaded here is vmclear'ed first.  Returns the vmptrld result
 * (0 on success / if the VMCS was already current).
 */
535 execute_vmptrld(struct vmx_thread_info
*vti
)
537 globaldata_t gd
= mycpu
;
540 * Must vmclear previous active vcms if it is different.
542 if (pcpu_info
[gd
->gd_cpuid
].loaded_vmx
&&
543 pcpu_info
[gd
->gd_cpuid
].loaded_vmx
!= vti
)
544 execute_vmclear(pcpu_info
[gd
->gd_cpuid
].loaded_vmx
);
547 * Make this the current VMCS. Must set loaded_vmx field
548 * before calling vmptrld() to avoid races against cpusync.
550 * Must set vti->launched to zero after the vmptrld to force
553 if (pcpu_info
[gd
->gd_cpuid
].loaded_vmx
!= vti
) {
555 pcpu_info
[gd
->gd_cpuid
].loaded_vmx
= vti
;
556 return (vmptrld(vti
->vmcs_region
));
568 if (!vmx_initialized
) {
569 kprintf("VMM: vmx_enable - not allowed; vmx not initialized\n");
574 kprintf("VMM: vmx_enable - already enabled\n");
578 alloc_vmxon_regions();
579 for (cpu
= 0; cpu
< ncpus
; cpu
++) {
583 CPUMASK_ASSBIT(mask
, cpu
);
584 lwkt_cpusync_simple(mask
, execute_vmxon
, &err
);
586 kprintf("VMM: vmx_enable error %d on cpu%d\n", err
, cpu
);
600 kprintf("VMM: vmx_disable not allowed; vmx wasn't enabled\n");
603 for (cpu
= 0; cpu
< ncpus
; cpu
++) {
606 CPUMASK_ASSBIT(mask
, cpu
);
607 lwkt_cpusync_simple(mask
, execute_vmxoff
, NULL
);
610 free_vmxon_regions();
/*
 * Write one guest segment descriptor (selector, access rights, base,
 * limit) into the current VMCS.  `type` selects which of the eight
 * guest segment register groups to encode; unusable segments are
 * accepted without the granularity/limit consistency check.
 * Returns 0 on success.
 */
617 static int vmx_set_guest_descriptor(descriptor_t type
,
631 * Intel Manual Vol 3C. - page 60
632 * If any bit in the limit field in the range 11:0 is 0, G must be 0.
633 * If any bit in the limit field in the range 31:20 is 1, G must be 1.
635 if ((~rights
& VMCS_SEG_UNUSABLE
) || (type
== CS
)) {
636 if ((limit
& 0xfff) != 0xfff)
638 else if ((limit
& 0xfff00000) != 0)
/* Map the descriptor type to its four VMCS field encodings */
644 selector_enc
= VMCS_GUEST_ES_SELECTOR
;
645 rights_enc
= VMCS_GUEST_ES_ACCESS_RIGHTS
;
646 base_enc
= VMCS_GUEST_ES_BASE
;
647 limit_enc
= VMCS_GUEST_ES_LIMIT
;
650 selector_enc
= VMCS_GUEST_CS_SELECTOR
;
651 rights_enc
= VMCS_GUEST_CS_ACCESS_RIGHTS
;
652 base_enc
= VMCS_GUEST_CS_BASE
;
653 limit_enc
= VMCS_GUEST_CS_LIMIT
;
656 selector_enc
= VMCS_GUEST_SS_SELECTOR
;
657 rights_enc
= VMCS_GUEST_SS_ACCESS_RIGHTS
;
658 base_enc
= VMCS_GUEST_SS_BASE
;
659 limit_enc
= VMCS_GUEST_SS_LIMIT
;
662 selector_enc
= VMCS_GUEST_DS_SELECTOR
;
663 rights_enc
= VMCS_GUEST_DS_ACCESS_RIGHTS
;
664 base_enc
= VMCS_GUEST_DS_BASE
;
665 limit_enc
= VMCS_GUEST_DS_LIMIT
;
668 selector_enc
= VMCS_GUEST_FS_SELECTOR
;
669 rights_enc
= VMCS_GUEST_FS_ACCESS_RIGHTS
;
670 base_enc
= VMCS_GUEST_FS_BASE
;
671 limit_enc
= VMCS_GUEST_FS_LIMIT
;
674 selector_enc
= VMCS_GUEST_GS_SELECTOR
;
675 rights_enc
= VMCS_GUEST_GS_ACCESS_RIGHTS
;
676 base_enc
= VMCS_GUEST_GS_BASE
;
677 limit_enc
= VMCS_GUEST_GS_LIMIT
;
680 selector_enc
= VMCS_GUEST_LDTR_SELECTOR
;
681 rights_enc
= VMCS_GUEST_LDTR_ACCESS_RIGHTS
;
682 base_enc
= VMCS_GUEST_LDTR_BASE
;
683 limit_enc
= VMCS_GUEST_LDTR_LIMIT
;
686 selector_enc
= VMCS_GUEST_TR_SELECTOR
;
687 rights_enc
= VMCS_GUEST_TR_ACCESS_RIGHTS
;
688 base_enc
= VMCS_GUEST_TR_BASE
;
689 limit_enc
= VMCS_GUEST_TR_LIMIT
;
692 kprintf("VMM: vmx_set_guest_descriptor: unknown descriptor\n");
/* Commit all four fields to the VMCS */
698 ERROR_IF(vmwrite(selector_enc
, selector
));
699 ERROR_IF(vmwrite(rights_enc
, rights
));
700 ERROR_IF(vmwrite(base_enc
, base
));
701 ERROR_IF(vmwrite(limit_enc
, limit
));
705 kprintf("VMM: vmx_set_guest_descriptor failed\n");
710 * Called by the first thread of the VMM process
711 * - create a new vmspace
712 * - init the vmspace with EPT PG_* bits and
713 * EPT copyin/copyout functions
714 * - replace the vmspace of the current proc
715 * - remove the old vmspace
718 vmx_vminit_master(struct vmm_guest_options
*options
)
720 struct vmspace
*oldvmspace
;
721 struct vmspace
*newvmspace
;
722 struct proc
*p
= curthread
->td_proc
;
723 struct vmm_proc
*p_vmm
;
/* Fork the current vmspace and convert the copy for EPT use */
725 oldvmspace
= curthread
->td_lwp
->lwp_vmspace
;
726 newvmspace
= vmspace_fork(oldvmspace
);
728 vmx_ept_pmap_pinit(vmspace_pmap(newvmspace
));
729 bzero(vmspace_pmap(newvmspace
)->pm_pml4
, PAGE_SIZE
);
/* Swap the process onto the new vmspace under both map tokens */
731 lwkt_gettoken(&oldvmspace
->vm_map
.token
);
732 lwkt_gettoken(&newvmspace
->vm_map
.token
);
734 pmap_pinit2(vmspace_pmap(newvmspace
));
735 pmap_replacevm(curthread
->td_proc
, newvmspace
, 0);
737 lwkt_reltoken(&newvmspace
->vm_map
.token
);
738 lwkt_reltoken(&oldvmspace
->vm_map
.token
);
740 vmspace_rel(oldvmspace
);
/* Publish the physical address of the new PML4 as the VMM CR3 */
742 options
->vmm_cr3
= vtophys(vmspace_pmap(newvmspace
)->pm_pml4
);
/* Record guest/vmm CR3 pair on the process for later threads */
744 p_vmm
= kmalloc(sizeof(struct vmm_proc
), M_TEMP
, M_WAITOK
| M_ZERO
);
745 p_vmm
->guest_cr3
= options
->guest_cr3
;
746 p_vmm
->vmm_cr3
= options
->vmm_cr3
;
747 p
->p_vmm
= (void *)p_vmm
;
750 p
->p_vkernel
->vkernel_cr3
= options
->guest_cr3
;
751 dkprintf("PROCESS CR3 %016jx\n", (intmax_t)options
->guest_cr3
);
/*
 * Per-thread VMX initialization: allocates and initializes this thread's
 * vmx_thread_info and VMCS, loads the host state, the guest segment
 * descriptors/control registers, and the EPT pointer.  The `master`
 * thread additionally builds the EPT vmspace via vmx_vminit_master().
 */
758 vmx_vminit(struct vmm_guest_options
*options
)
760 struct vmx_thread_info
* vti
;
762 struct tls_info guest_fs
= curthread
->td_tls
.info
[0];
763 struct tls_info guest_gs
= curthread
->td_tls
.info
[1];
766 vti
= kmalloc(sizeof(struct vmx_thread_info
), M_TEMP
, M_WAITOK
| M_ZERO
);
767 curthread
->td_vmm
= (void*) vti
;
769 if (options
->master
) {
770 vmx_vminit_master(options
);
/* Seed the guest register state from the caller-provided trapframe */
773 bcopy(&options
->tf
, &vti
->guest
, sizeof(struct trapframe
));
776 * Be sure we return success if the VMM hook enters
778 vti
->guest
.tf_rax
= 0;
779 vti
->guest
.tf_rflags
&= ~PSL_C
;
/* VMCS region: over-allocate and align to 4K, like the VMXON regions */
781 vti
->vmcs_region_na
= kmalloc(vmx_region_size
+ VMXON_REGION_ALIGN_SIZE
,
786 vti
->vmcs_region
= (unsigned char*) VMXON_REGION_ALIGN(vti
->vmcs_region_na
);
789 vti
->guest_cr3
= options
->guest_cr3
;
790 vti
->vmm_cr3
= options
->vmm_cr3
;
792 /* In the first 31 bits put the vmx revision*/
793 *((uint32_t *)vti
->vmcs_region
) = vmx_revision
;
796 * vmclear the vmcs to initialize it.
798 ERROR_IF(vmclear(vti
->vmcs_region
));
802 ERROR_IF(execute_vmptrld(vti
));
804 /* Load the VMX controls */
805 ERROR_IF(vmwrite(VMCS_PINBASED_CTLS
, vmx_pinbased
.ctls
));
806 ERROR_IF(vmwrite(VMCS_PROCBASED_CTLS
, vmx_procbased
.ctls
));
807 ERROR_IF(vmwrite(VMCS_PROCBASED2_CTLS
, vmx_procbased2
.ctls
));
808 ERROR_IF(vmwrite(VMCS_VMEXIT_CTLS
, vmx_exit
.ctls
));
809 ERROR_IF(vmwrite(VMCS_VMENTRY_CTLS
, vmx_entry
.ctls
));
/* Host control registers as of right now */
812 ERROR_IF(vmwrite(VMCS_HOST_CR0
, rcr0()));
813 ERROR_IF(vmwrite(VMCS_HOST_CR4
, rcr4()));
815 /* Load HOST EFER and PAT */
816 // ERROR_IF(vmwrite(VMCS_HOST_IA32_PAT, rdmsr(MSR_PAT)));
817 ERROR_IF(vmwrite(VMCS_HOST_IA32_EFER
, rdmsr(MSR_EFER
)));
819 /* Load HOST selectors */
820 ERROR_IF(vmwrite(VMCS_HOST_ES_SELECTOR
, GSEL(GDATA_SEL
, SEL_KPL
)));
821 ERROR_IF(vmwrite(VMCS_HOST_SS_SELECTOR
, GSEL(GDATA_SEL
, SEL_KPL
)));
822 ERROR_IF(vmwrite(VMCS_HOST_FS_SELECTOR
, GSEL(GDATA_SEL
, SEL_KPL
)));
823 ERROR_IF(vmwrite(VMCS_HOST_GS_SELECTOR
, GSEL(GDATA_SEL
, SEL_KPL
)));
824 ERROR_IF(vmwrite(VMCS_HOST_CS_SELECTOR
, GSEL(GCODE_SEL
, SEL_KPL
)));
825 ERROR_IF(vmwrite(VMCS_HOST_TR_SELECTOR
, GSEL(GPROC0_SEL
, SEL_KPL
)));
828 * The BASE addresses are written on each VMRUN in case
829 * the CPU changes because are per-CPU values
833 * Call vmx_vmexit on VM_EXIT condition
834 * The RSP will point to the vmx_thread_info
836 ERROR_IF(vmwrite(VMCS_HOST_RIP
, (uint64_t) vmx_vmexit
));
837 ERROR_IF(vmwrite(VMCS_HOST_RSP
, (uint64_t) vti
));
838 ERROR_IF(vmwrite(VMCS_HOST_CR3
, (uint64_t) KPML4phys
));
841 * GUEST initialization
842 * - set the descriptors according the conditions from Intel
843 * manual "26.3.1.2 Checks on Guest Segment Registers
844 * - set the privilege to SEL_UPL (the vkernel will run
845 * in userspace context)
847 ERROR_IF(vmx_set_guest_descriptor(ES
, GSEL(GUDATA_SEL
, SEL_UPL
),
848 VMCS_SEG_TYPE(3) | VMCS_S
| VMCS_DPL(SEL_UPL
) | VMCS_P
,
851 ERROR_IF(vmx_set_guest_descriptor(SS
, GSEL(GUDATA_SEL
, SEL_UPL
),
852 VMCS_SEG_TYPE(3) | VMCS_S
| VMCS_DPL(SEL_UPL
) | VMCS_P
,
855 ERROR_IF(vmx_set_guest_descriptor(DS
, GSEL(GUDATA_SEL
, SEL_UPL
),
856 VMCS_SEG_TYPE(3) | VMCS_S
| VMCS_DPL(SEL_UPL
) | VMCS_P
,
859 ERROR_IF(vmx_set_guest_descriptor(FS
, GSEL(GUDATA_SEL
, SEL_UPL
),
860 VMCS_SEG_TYPE(3) | VMCS_S
| VMCS_DPL(SEL_UPL
) | VMCS_P
,
861 (uint64_t) guest_fs
.base
, (uint32_t) guest_fs
.size
));
863 ERROR_IF(vmx_set_guest_descriptor(GS
, GSEL(GUDATA_SEL
, SEL_UPL
),
864 VMCS_SEG_TYPE(3) | VMCS_S
| VMCS_DPL(SEL_UPL
) | VMCS_P
,
865 (uint64_t) guest_gs
.base
, (uint32_t) guest_gs
.size
));
867 ERROR_IF(vmx_set_guest_descriptor(CS
, GSEL(GUCODE_SEL
, SEL_UPL
),
868 VMCS_SEG_TYPE(11) | VMCS_S
| VMCS_DPL(SEL_UPL
) | VMCS_P
| VMCS_L
,
871 ERROR_IF(vmx_set_guest_descriptor(TR
, GSEL(GPROC0_SEL
, SEL_UPL
),
872 VMCS_SEG_TYPE(11) | VMCS_P
,
875 ERROR_IF(vmx_set_guest_descriptor(LDTR
, 0, VMCS_SEG_UNUSABLE
, 0, 0));
877 /* Set the CR0/CR4 registers, removing the unsupported bits */
878 ERROR_IF(vmwrite(VMCS_GUEST_CR0
, (CR0_PE
| CR0_PG
|
879 cr0_fixed_to_1
) & ~cr0_fixed_to_0
));
880 ERROR_IF(vmwrite(VMCS_GUEST_CR4
, (CR4_PAE
| CR4_FXSR
| CR4_XMM
| CR4_XSAVE
|
881 cr4_fixed_to_1
) & ~ cr4_fixed_to_0
));
883 /* Don't set EFER_SCE for catching "syscall" instructions */
884 ERROR_IF(vmwrite(VMCS_GUEST_IA32_EFER
, (EFER_LME
| EFER_LMA
)));
/* Interrupts enabled; bit 1 of RFLAGS is architecturally always 1 */
886 vti
->guest
.tf_rflags
= PSL_I
| 0x02;
887 ERROR_IF(vmwrite(VMCS_GUEST_RFLAGS
, vti
->guest
.tf_rflags
));
889 /* The Guest CR3 indicating CR3 pagetable */
890 ERROR_IF(vmwrite(VMCS_GUEST_CR3
, (uint64_t) vti
->guest_cr3
));
892 /* Throw all possible exceptions */
893 ERROR_IF(vmwrite(VMCS_EXCEPTION_BITMAP
,(uint64_t) 0xFFFFFFFF));
895 /* Guest RIP and RSP */
896 ERROR_IF(vmwrite(VMCS_GUEST_RIP
, options
->tf
.tf_rip
));
897 ERROR_IF(vmwrite(VMCS_GUEST_RSP
, options
->tf
.tf_rsp
));
900 * This field is included for future expansion.
901 * Software should set this field to FFFFFFFF_FFFFFFFFH
902 * to avoid VM-entry failures (see Section 26.3.1.5).
904 ERROR_IF(vmwrite(VMCS_LINK_POINTER
, ~0ULL));
906 /* The pointer to the EPT pagetable */
907 ERROR_IF(vmwrite(VMCS_EPTP
, vmx_eptp(vti
->vmm_cr3
)));
909 vti
->invept_desc
.eptp
= vmx_eptp(vti
->vmm_cr3
);
/* Error path: tear down the partially initialized VMCS */
917 kprintf("VMM: vmx_vminit failed\n");
918 execute_vmclear(vti
);
920 kfree(vti
->vmcs_region_na
, M_TEMP
);
928 struct vmx_thread_info
*vti
= curthread
->td_vmm
;
929 struct proc
*p
= curproc
;
933 vmx_check_cpu_migration();
934 if (vti
->vmcs_region
&&
935 pcpu_info
[mycpu
->gd_cpuid
].loaded_vmx
== vti
)
936 execute_vmclear(vti
);
938 if (vti
->vmcs_region_na
!= NULL
) {
939 kfree(vti
->vmcs_region_na
, M_TEMP
);
943 curthread
->td_vmm
= NULL
;
944 lwkt_gettoken(&p
->p_token
);
945 if (p
->p_nthreads
== 1) {
946 kfree(p
->p_vmm
, M_TEMP
);
949 lwkt_reltoken(&p
->p_token
);
955 * Checks if we migrated to another cpu
 *
 * If the VMCS is still synchronized with (loaded on) the previous cpu,
 * issue a cpusync to that cpu to vmclear it there before we use it here.
957 * No locks are required
960 vmx_check_cpu_migration(void)
962 struct vmx_thread_info
* vti
;
963 struct globaldata
*gd
;
968 vti
= (struct vmx_thread_info
*) curthread
->td_vmm
;
969 ERROR_IF(vti
== NULL
);
971 if (vti
->last_cpu
!= -1 && vti
->last_cpu
!= gd
->gd_cpuid
&&
972 pcpu_info
[vti
->last_cpu
].loaded_vmx
== vti
) {
974 * Do not reset last_cpu to -1 here, leave it caching
975 * the cpu whose per-cpu fields the VMCS is synchronized
976 * with. The pcpu_info[] check prevents unnecessary extra
979 dkprintf("VMM: cpusync from %d to %d\n",
980 gd
->gd_cpuid
, vti
->last_cpu
);
982 /* Clear the VMCS area if ran on another CPU */
983 CPUMASK_ASSBIT(mask
, vti
->last_cpu
);
984 lwkt_cpusync_simple(mask
, execute_vmclear
, (void *)vti
);
988 kprintf("VMM: vmx_check_cpu_migration failed\n");
992 /* Handle CPU migration
 *
 * Re-synchronizes the VMCS's per-cpu host/guest fields (GS/TR bases,
 * GDT/IDT bases) with the cpu we are now running on, or simply vmptrld's
 * the VMCS if only the load state is stale.
 *
994 * We have to enter with interrupts disabled/critical section
995 * to be sure that another VMCS won't steal our CPU.
998 vmx_handle_cpu_migration(void)
1000 struct vmx_thread_info
* vti
;
1001 struct globaldata
*gd
;
1005 vti
= (struct vmx_thread_info
*) curthread
->td_vmm
;
1006 ERROR_IF(vti
== NULL
);
1008 if (vti
->last_cpu
!= gd
->gd_cpuid
) {
1010 * We need to synchronize the per-cpu fields after changing
1013 dkprintf("VMM: vmx_handle_cpu_migration init per CPU data\n");
1015 ERROR_IF(execute_vmptrld(vti
));
1017 /* Host related registers */
1018 ERROR_IF(vmwrite(VMCS_HOST_GS_BASE
, (uint64_t) gd
)); /* mycpu points to %gs:0 */
1019 ERROR_IF(vmwrite(VMCS_HOST_TR_BASE
, (uint64_t) &gd
->gd_prvspace
->mdglobaldata
.gd_common_tss
));
1021 ERROR_IF(vmwrite(VMCS_HOST_GDTR_BASE
, (uint64_t) &gdt
[gd
->gd_cpuid
* NGDT
]));
1022 ERROR_IF(vmwrite(VMCS_HOST_IDTR_BASE
, (uint64_t) r_idt_arr
[gd
->gd_cpuid
].rd_base
));
1025 /* Guest related register */
1026 ERROR_IF(vmwrite(VMCS_GUEST_GDTR_BASE
, (uint64_t) &gdt
[gd
->gd_cpuid
* NGDT
]));
1027 ERROR_IF(vmwrite(VMCS_GUEST_GDTR_LIMIT
, (uint64_t) (NGDT
* sizeof(gdt
[0]) - 1)));
1030 * Indicates which cpu the per-cpu fields are synchronized
1031 * with. Does not indicate whether the vmcs is active on
1032 * that particular cpu.
1034 vti
->last_cpu
= gd
->gd_cpuid
;
1035 } else if (pcpu_info
[gd
->gd_cpuid
].loaded_vmx
!= vti
) {
1037 * We only need to vmptrld
1039 dkprintf("VMM: vmx_handle_cpu_migration: vmcs is not loaded\n");
1041 ERROR_IF(execute_vmptrld(vti
));
1043 } /* else we don't need to do anything */
1046 kprintf("VMM: vmx_handle_cpu_migration failed\n");
1050 /* Load information about VMexit
 *
 * Snapshots the exit reason/qualification/interruption fields and the
 * guest RIP/CS/RFLAGS/RSP/SS out of the current VMCS into the thread's
 * vmx_thread_info for use by vmx_handle_vmexit().
 *
1052 * We still are with interrupts disabled/critical section
1053 * because we must operate with the VMCS on the CPU
1056 vmx_vmexit_loadinfo(void)
1058 struct vmx_thread_info
*vti
;
1061 vti
= (struct vmx_thread_info
*) curthread
->td_vmm
;
1062 ERROR_IF(vti
== NULL
);
1064 ERROR_IF(vmread(VMCS_VMEXIT_REASON
, &vti
->vmexit_reason
));
1065 ERROR_IF(vmread(VMCS_EXIT_QUALIFICATION
, &vti
->vmexit_qualification
));
1066 ERROR_IF(vmread(VMCS_VMEXIT_INTERRUPTION_INFO
, &vti
->vmexit_interruption_info
));
1067 ERROR_IF(vmread(VMCS_VMEXIT_INTERRUPTION_ERROR
, &vti
->vmexit_interruption_error
));
1068 ERROR_IF(vmread(VMCS_VMEXIT_INSTRUCTION_LENGTH
, &vti
->vmexit_instruction_length
));
1069 ERROR_IF(vmread(VMCS_GUEST_PHYSICAL_ADDRESS
, &vti
->guest_physical_address
));
1070 ERROR_IF(vmread(VMCS_GUEST_RIP
, &vti
->guest
.tf_rip
));
1071 ERROR_IF(vmread(VMCS_GUEST_CS_SELECTOR
, &vti
->guest
.tf_cs
));
1072 ERROR_IF(vmread(VMCS_GUEST_RFLAGS
, &vti
->guest
.tf_rflags
));
1073 ERROR_IF(vmread(VMCS_GUEST_RSP
, &vti
->guest
.tf_rsp
));
1074 ERROR_IF(vmread(VMCS_GUEST_SS_SELECTOR
, &vti
->guest
.tf_ss
));
1078 kprintf("VMM: vmx_vmexit_loadinfo failed\n");
/*
 * Refresh the guest FS/GS descriptors in the VMCS from the current
 * thread's TLS info.  Handles a possible cpu migration first so the
 * correct VMCS is loaded before the vmwrites.
 */
1084 vmx_set_tls_area(void)
1086 struct tls_info
*guest_fs
= &curthread
->td_tls
.info
[0];
1087 struct tls_info
*guest_gs
= &curthread
->td_tls
.info
[1];
1091 dkprintf("VMM: vmx_set_tls_area hook\n");
1095 ERROR_IF(vmx_check_cpu_migration());
1096 ERROR_IF(vmx_handle_cpu_migration());
/* Rewrite FS/GS with the thread's current TLS base/size */
1099 ERROR_IF(vmx_set_guest_descriptor(FS
, GSEL(GUDATA_SEL
, SEL_UPL
),
1100 VMCS_SEG_TYPE(3) | VMCS_S
| VMCS_DPL(SEL_UPL
) | VMCS_P
,
1101 (uint64_t) guest_fs
->base
, (uint32_t) guest_fs
->size
));
1104 ERROR_IF(vmx_set_guest_descriptor(GS
, GSEL(GUDATA_SEL
, SEL_UPL
),
1105 VMCS_SEG_TYPE(3) | VMCS_S
| VMCS_DPL(SEL_UPL
) | VMCS_P
,
1106 (uint64_t) guest_gs
->base
, (uint32_t) guest_gs
->size
));
/*
 * Dispatch a VM exit recorded by vmx_vmexit_loadinfo().  Handles
 * hardware/software exceptions (#UD used to emulate "syscall", #PF
 * forwarded to the vkernel or delivered as SIGSEGV, #BP), external
 * interrupts, CPUID emulation, and EPT faults (resolved via vm_fault()
 * on the guest-physical address).
 */
1118 vmx_handle_vmexit(void)
1120 struct vmx_thread_info
* vti
;
1123 int exception_number
;
1127 int fault_type
= VM_PROT_NONE
;
1128 int fault_flags
= 0;
1129 struct lwp
*lp
= curthread
->td_lwp
;
1131 dkprintf("VMM: handle_vmx_vmexit\n");
1132 vti
= (struct vmx_thread_info
*) curthread
->td_vmm
;
1133 ERROR_IF(vti
== NULL
);
1135 exit_reason
= VMCS_BASIC_EXIT_REASON(vti
->vmexit_reason
);
1136 switch (exit_reason
) {
1137 case EXIT_REASON_EXCEPTION
:
1138 dkprintf("VMM: handle_vmx_vmexit: "
1139 "EXIT_REASON_EXCEPTION with qualification "
1140 "%llx, interruption info %llx, "
1141 "interruption error %llx, instruction "
1143 (long long) vti
->vmexit_qualification
,
1144 (long long) vti
->vmexit_interruption_info
,
1145 (long long) vti
->vmexit_interruption_error
,
1146 (long long) vti
->vmexit_instruction_length
);
1148 dkprintf("VMM: handle_vmx_vmexit: rax: %llx, rip: %llx, "
1149 "rsp: %llx, rdi: %llx, "
1151 "vti: %p, master: %p\n",
1152 (long long)vti
->guest
.tf_rax
,
1153 (long long)vti
->guest
.tf_rip
,
1154 (long long)vti
->guest
.tf_rsp
,
1155 (long long)vti
->guest
.tf_rdi
,
1156 (long long)vti
->guest
.tf_rsi
,
1157 exit_reason
, vti
, curproc
->p_vmm
);
/* Decode the exception type/vector from the interruption info field */
1160 VMCS_EXCEPTION_TYPE(vti
->vmexit_interruption_info
);
1162 VMCS_EXCEPTION_NUMBER(vti
->vmexit_interruption_info
);
1164 if (exception_type
== VMCS_EXCEPTION_HARDWARE
) {
1165 switch (exception_number
) {
1168 * Disabled "syscall" instruction and
1169 * now we catch it for executing
1171 dkprintf("VMM: handle_vmx_vmexit: "
1172 "VMCS_EXCEPTION_HARDWARE IDT_UD\n");
1174 /* Check to see if its syscall asm instruction */
1175 uint8_t instr
[INSTRUCTION_MAX_LENGTH
];
1176 if (copyin((const void *)vti
->guest
.tf_rip
,
1178 vti
->vmexit_instruction_length
) &&
1179 instr_check(&syscall_asm
,(void *)instr
,
1180 (uint8_t)vti
->vmexit_instruction_length
)) {
1181 kprintf("VMM: handle_vmx_vmexit: "
1182 "UD different from syscall: ");
1183 db_disasm((db_addr_t
)instr
, FALSE
, NULL
);
1186 /* Called to force a VMEXIT and invalidate TLB */
1187 if (vti
->guest
.tf_rax
== -1) {
1188 vti
->guest
.tf_rip
+=
1189 vti
->vmexit_instruction_length
;
/* Fake a fast-syscall trapframe and hand it to syscall2() */
1193 vti
->guest
.tf_err
= 2;
1194 vti
->guest
.tf_trapno
= T_FAST_SYSCALL
;
1195 vti
->guest
.tf_xflags
= 0;
1197 vti
->guest
.tf_rip
+=
1198 vti
->vmexit_instruction_length
;
1200 syscall2(&vti
->guest
);
1203 dkprintf("VMM: handle_vmx_vmexit: "
1204 "VMCS_EXCEPTION_HARDWARE IDT_PF "
1206 (long long) vti
->guest
.tf_rip
);
/* A page fault at RIP 0 is unrecoverable */
1209 if (vti
->guest
.tf_rip
== 0) {
1210 kprintf("VMM: handle_vmx_vmexit: "
1211 "Terminating...\n");
/* Build a #PF trapframe from the exit fields */
1218 vti
->vmexit_interruption_error
;
1219 vti
->guest
.tf_addr
=
1220 vti
->vmexit_qualification
;
1221 vti
->guest
.tf_xflags
= 0;
1222 vti
->guest
.tf_trapno
= T_PAGEFLT
;
1225 * If we are a user process in the vkernel
1226 * pass the PF to the vkernel and will trigger
1229 * If we are the vkernel, send a SIGSEGV signal
1230 * to us that will trigger the execution of
1235 if (lp
->lwp_vkernel
&& lp
->lwp_vkernel
->ve
) {
1236 vkernel_trap(lp
, &vti
->guest
);
1238 trapsignal(lp
, SIGSEGV
, SEGV_MAPERR
);
1243 kprintf("VMM: handle_vmx_vmexit: "
1244 "VMCS_EXCEPTION_HARDWARE unknown "
1245 "number %d rip: %llx, rsp: %llx\n",
1247 (long long)vti
->guest
.tf_rip
,
1248 (long long)vti
->guest
.tf_rsp
);
1252 } else if (exception_type
== VMCS_EXCEPTION_SOFTWARE
) {
1253 switch (exception_number
) {
1255 dkprintf("VMM: handle_vmx_vmexit: "
1256 "VMCS_EXCEPTION_SOFTWARE "
1257 "number %d rip: %llx, rsp: %llx\n",
1259 (long long)vti
->guest
.tf_rip
,
1260 (long long)vti
->guest
.tf_rsp
);
/* Breakpoint: deliver T_BPTFLT and skip the instruction */
1262 vti
->guest
.tf_trapno
= T_BPTFLT
;
1263 vti
->guest
.tf_xflags
= 0;
1264 vti
->guest
.tf_err
= 0;
1265 vti
->guest
.tf_addr
= 0;
1267 vti
->guest
.tf_rip
+=
1268 vti
->vmexit_instruction_length
;
1273 kprintf("VMM: handle_vmx_vmexit: "
1274 "VMCS_EXCEPTION_SOFTWARE unknown "
1275 "number %d rip: %llx, rsp: %llx\n",
1277 (long long)vti
->guest
.tf_rip
,
1278 (long long)vti
->guest
.tf_rsp
);
1283 kprintf("VMM: handle_vmx_vmexit: "
1284 "VMCS_EXCEPTION_ %d unknown\n",
1290 case EXIT_REASON_EXT_INTR
:
1291 dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EXT_INTR\n");
1293 case EXIT_REASON_CPUID
:
1294 dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_CPUID\n");
1297 * Execute CPUID instruction and pass
1298 * the result to the vkernel
1300 func
= vti
->guest
.tf_rax
;
1301 do_cpuid(func
, regs
);
1303 vti
->guest
.tf_rax
= regs
[0];
1304 vti
->guest
.tf_rbx
= regs
[1];
1305 vti
->guest
.tf_rcx
= regs
[2];
1306 vti
->guest
.tf_rdx
= regs
[3];
1308 vti
->guest
.tf_rip
+= vti
->vmexit_instruction_length
;
1311 case EXIT_REASON_EPT_FAULT
:
1313 * EPT_FAULT are resolved like normal PFs. Nothing special
1314 * - get the fault type
1315 * - get the fault address (which is a GPA)
1316 * - execute vm_fault on the vm_map
1318 dkprintf("VMM: handle_vmx_vmexit: "
1319 "EXIT_REASON_EPT_FAULT with qualification %lld,"
1320 "GPA: %llx, fault_Type: %d\n",
1321 (long long)vti
->vmexit_qualification
,
1322 (unsigned long long)vti
->guest_physical_address
,
1325 fault_type
= vmx_ept_fault_type(vti
->vmexit_qualification
);
1327 if (fault_type
& VM_PROT_WRITE
)
1328 fault_flags
= VM_FAULT_DIRTY
;
1330 fault_flags
= VM_FAULT_NORMAL
;
1332 rv
= vm_fault(&curthread
->td_lwp
->lwp_vmspace
->vm_map
,
1333 trunc_page(vti
->guest_physical_address
),
1334 fault_type
, fault_flags
);
1336 if (rv
!= KERN_SUCCESS
) {
1337 kprintf("VMM: handle_vmx_vmexit: "
1338 "EXIT_REASON_EPT_FAULT couldn't resolve %jx\n",
1339 (intmax_t)vti
->guest_physical_address
);
1345 kprintf("VMM: handle_vmx_vmexit: "
1346 "unknown exit reason: %d with qualification %lld\n",
1348 (long long)vti
->vmexit_qualification
);
/*
 * Interior of vmx_vmrun(): the per-thread VMENTER/VMEXIT run loop for
 * the current thread's VMX context (vti == td->td_vmm).
 *
 * NOTE(review): this chunk is a lossy extraction -- the function
 * signature, braces, labels and a number of statements between the
 * numbered fragments were dropped, so the text below is not compilable
 * as-is.  The comments added here describe only what the visible
 * fragments establish; anything inferred is marked as such.
 */
1360 struct vmx_thread_info
* vti
;
1361 struct globaldata
*gd
;
1368 struct trapframe
*save_frame
;
1369 thread_t td
= curthread
;
/*
 * Pick up the per-thread VMX context and redirect lwp_md.md_regs at the
 * guest trapframe for the duration of the run loop; the original
 * md_regs pointer is restored from save_frame on the way out.
 */
1371 vti
= (struct vmx_thread_info
*) td
->td_vmm
;
1372 save_frame
= td
->td_lwp
->lwp_md
.md_regs
;
1373 td
->td_lwp
->lwp_md
.md_regs
= &vti
->guest
;
1379 * This can change the cpu we are running on.
1381 trap_handle_userexit(&vti
->guest
, sticks
);
/*
 * Bail out (ERROR2 path) on a missing context or when the cpu-migration
 * check/handling fails.
 */
1384 ERROR2_IF(vti
== NULL
);
1385 ERROR2_IF(vmx_check_cpu_migration());
1386 ERROR2_IF(vmx_handle_cpu_migration());
1389 * Make the state safe to VMENTER
1390 * - disable interrupts and check if there were any pending
1391 * - check for ASTFLTs
1392 * - loop again until there are no ASTFLTs
/*
 * NOTE(review): debug counter -- the body of the if() below was dropped
 * by the extraction.
 */
1396 static int xcounter
;
1397 if ((++xcounter
& 65535) == 0)
/*
 * Pending AST: clear the signal request bit and mark T_ASTFLT in the
 * guest frame; presumably dispatched via trap() in a dropped line
 * before looping -- confirm against the full source.
 */
1402 if (gd
->gd_reqflags
& RQF_AST_MASK
) {
1403 atomic_clear_int(&gd
->gd_reqflags
, RQF_AST_SIGNAL
);
1406 vti
->guest
.tf_trapno
= T_ASTFLT
;
1408 /* CURRENT CPU CAN CHANGE */
/* The context is expected to already be on this cpu at this point. */
1411 if (vti
->last_cpu
!= gd
->gd_cpuid
) {
1414 kprintf("VMM: vmx_vmrun: vti unexpectedly "
1415 "changed cpus %d->%d\n",
1416 gd
->gd_cpuid
, vti
->last_cpu
);
1421 * Add us to the list of cpus running vkernel operations, interlock
1422 * against anyone trying to do an invalidation.
1424 * We must set the cpumask first to ensure that we interlock another
1425 * cpu that may desire to IPI us after we have successfully
1426 * incremented the cpulock counter.
1428 ATOMIC_CPUMASK_ORBIT(td
->td_proc
->p_vmm_cpumask
, gd
->gd_cpuid
);
1431 olock
= td
->td_proc
->p_vmm_cpulock
;
/* Fast path: no exclusive holder, try to bump the shared count. */
1433 if ((olock
& CPULOCK_EXCL
) == 0) {
1434 nlock
= olock
+ CPULOCK_INCR
;
1435 if (atomic_cmpset_int(&td
->td_proc
->p_vmm_cpulock
,
/*
 * Slow path: an exclusive holder exists.  Drop our cpumask bit and
 * sleep on p_vmm_cpulock until released, then re-test from the top.
 */
1446 * More complex. After sleeping we have to re-test
1449 ATOMIC_CPUMASK_NANDBIT(td
->td_proc
->p_vmm_cpumask
,
1452 tsleep_interlock(&td
->td_proc
->p_vmm_cpulock
, 0);
1453 if (td
->td_proc
->p_vmm_cpulock
& CPULOCK_EXCL
) {
1454 tsleep(&td
->td_proc
->p_vmm_cpulock
, PINTERLOCKED
,
1462 * Load specific Guest registers
1463 * GP registers will be loaded in vmx_launch/resume
1465 ERROR_IF(vmwrite(VMCS_GUEST_RIP
, vti
->guest
.tf_rip
));
1466 ERROR_IF(vmwrite(VMCS_GUEST_CS_SELECTOR
, vti
->guest
.tf_cs
));
1467 ERROR_IF(vmwrite(VMCS_GUEST_RFLAGS
, vti
->guest
.tf_rflags
));
1468 ERROR_IF(vmwrite(VMCS_GUEST_RSP
, vti
->guest
.tf_rsp
));
1469 ERROR_IF(vmwrite(VMCS_GUEST_SS_SELECTOR
, vti
->guest
.tf_ss
));
1470 ERROR_IF(vmwrite(VMCS_GUEST_CR3
, (uint64_t) vti
->guest_cr3
));
/* Save another thread's FPU state if it currently owns the npx unit. */
1475 if (mdcpu
->gd_npxthread
!= td
) {
1476 if (mdcpu
->gd_npxthread
)
1477 npxsave(mdcpu
->gd_npxthread
->td_savefpu
);
1482 * The kernel caches the MSR_FSBASE value in mdcpu->gd_user_fs.
1483 * A vmexit loads this unconditionally from the VMCS so make
1484 * sure it loads the correct value.
1486 ERROR_IF(vmwrite(VMCS_HOST_FS_BASE
, mdcpu
->gd_user_fs
));
1489 * EPT mappings can't be invalidated with normal invlpg/invltlb
1490 * instructions. We have to execute a special instruction that
1491 * invalidates all EPT cache ("invept").
1493 * pm_invgen it's a generation number which is incremented in
1494 * pmap_inval_smp*(), before doing any invalidates. This will
1495 * cause all CPUs that are using the EPT to VMEXIT and wait for
1496 * the interlock to complete. When they VMENTER they will see that
1497 * the generation number had changed from their current and do a
1500 if (vti
->eptgen
!= td
->td_proc
->p_vmspace
->vm_pmap
.pm_invgen
) {
1501 vti
->eptgen
= td
->td_proc
->p_vmspace
->vm_pmap
.pm_invgen
;
1503 ERROR_IF(invept(INVEPT_TYPE_SINGLE_CONTEXT
,
1504 (uint64_t*)&vti
->invept_desc
));
1507 if (vti
->launched
) { /* vmresume called from vmx_trap.s */
1508 dkprintf("\n\nVMM: vmx_vmrun: vmx_resume\n");
1509 ret
= vmx_resume(vti
);
1511 } else { /* vmlaunch called from vmx_trap.s */
1512 dkprintf("\n\nVMM: vmx_vmrun: vmx_launch\n");
1514 ret
= vmx_launch(vti
);
1518 * This is our return point from the vmlaunch/vmresume
1519 * There are two situations:
1520 * - the vmlaunch/vmresume executed successfully and they
1521 * would return through "vmx_vmexit" which will restore
1522 * the state (registers) and return here with the ret
1523 * set to VM_EXIT (ret is actually %rax)
1524 * - the vmlaunch/vmresume failed to execute and will return
1525 * immediately with ret set to the error code
1527 if (ret
== VM_EXIT
) {
1528 ERROR_IF(vmx_vmexit_loadinfo());
/*
 * Leave the vkernel-cpus set and drop our cpulock count before the
 * exit reason is handled.
 */
1530 ATOMIC_CPUMASK_NANDBIT(td
->td_proc
->p_vmm_cpumask
,
1532 atomic_add_int(&td
->td_proc
->p_vmm_cpulock
,
1534 /* WARNING: don't adjust cpulock twice! */
1537 trap_handle_userenter(td
);
1538 sticks
= td
->td_sticks
;
1542 * Handle the VMEXIT reason
1543 * - if successful we VMENTER again
1546 if (vmx_handle_vmexit())
1550 * We handled the VMEXIT reason and continue with
1559 * Two types of error:
1560 * - VM_FAIL_VALID - the host state was ok,
1561 * but probably the guest state was not
1562 * - VM_FAIL_INVALID - the parameters or the host state
1565 if (ret
== VM_FAIL_VALID
) {
1566 vmread(VMCS_INSTR_ERR
, &val
);
1568 kprintf("VMM: vmx_vmrun: vmenter failed with "
1569 "VM_FAIL_VALID, error code %d\n",
1572 kprintf("VMM: vmx_vmrun: vmenter failed with "
1573 "VM_FAIL_INVALID\n");
1578 kprintf("VMM: vmx_vmrun: returning with success\n");
/*
 * Common exit path: remove us from the vkernel cpumask, drop the
 * cpulock count and restore the saved md_regs pointer.
 */
1581 ATOMIC_CPUMASK_NANDBIT(td
->td_proc
->p_vmm_cpumask
, gd
->gd_cpuid
);
1582 atomic_add_int(&td
->td_proc
->p_vmm_cpulock
, -CPULOCK_INCR
);
1585 trap_handle_userenter(td
);
1586 td
->td_lwp
->lwp_md
.md_regs
= save_frame
;
1587 KKASSERT(CPUMASK_TESTMASK(td
->td_proc
->p_vmm_cpumask
,
1588 gd
->gd_cpumask
) == 0);
1589 /*atomic_clear_cpumask(&td->td_proc->p_vmm_cpumask, gd->gd_cpumask);*/
1591 kprintf("VMM: vmx_vmrun failed\n");
1596 * Called when returning to user-space
1597 * after executing lwp_fork.
1600 vmx_lwp_return(struct lwp
*lp
, struct trapframe
*frame
)
1602 struct vmm_guest_options options
;
1604 struct vmm_proc
*p_vmm
= (struct vmm_proc
*)curproc
->p_vmm
;
1606 dkprintf("VMM: vmx_lwp_return \n");
1608 bzero(&options
, sizeof(struct vmm_guest_options
));
1610 bcopy(frame
, &options
.tf
, sizeof(struct trapframe
));
1612 options
.guest_cr3
= p_vmm
->guest_cr3
;
1613 options
.vmm_cr3
= p_vmm
->vmm_cr3
;
1615 vmx_vminit(&options
);
1616 generic_lwp_return(lp
, frame
);
1618 vmrun_err
= vmx_vmrun();
1620 exit1(W_EXITCODE(vmrun_err
, 0));
1624 vmx_set_guest_cr3(register_t guest_cr3
)
1626 struct vmx_thread_info
*vti
= (struct vmx_thread_info
*) curthread
->td_vmm
;
1627 vti
->guest_cr3
= guest_cr3
;
1631 vmx_vm_get_gpa(struct proc
*p
, register_t
*gpa
, register_t uaddr
)
1633 return guest_phys_addr(p
->p_vmspace
, gpa
, p
->p_vkernel
->vkernel_cr3
, uaddr
);
1636 static struct vmm_ctl ctl_vmx
= {
1637 .name
= "VMX from Intel",
1639 .enable
= vmx_enable
,
1640 .disable
= vmx_disable
,
1641 .vminit
= vmx_vminit
,
1642 .vmdestroy
= vmx_vmdestroy
,
1644 .vm_set_tls_area
= vmx_set_tls_area
,
1645 .vm_lwp_return
= vmx_lwp_return
,
1646 .vm_set_guest_cr3
= vmx_set_guest_cr3
,
1647 .vm_get_gpa
= vmx_vm_get_gpa
,