kernel - Fix a system lockup with vmm
[dragonfly.git] / sys / platform / pc64 / vmm / vmx.c
blob 06bf895526402b914ff84480b0ef022e1f5d5390
1 /*
2 * Copyright (c) 2003-2013 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Mihai Carabas <mihai.carabas@gmail.com>
6 * by Matthew Dillon <dillon@backplane.com>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
17 * distribution.
18 * 3. Neither the name of The DragonFly Project nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific, prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
36 #include <sys/malloc.h>
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/types.h>
40 #include <sys/thread.h>
41 #include <sys/thread2.h>
42 #include <sys/sysctl.h>
43 #include <sys/vmm.h>
44 #include <sys/proc.h>
45 #include <sys/syscall.h>
46 #include <sys/wait.h>
47 #include <sys/vkernel.h>
48 #include <sys/mplock2.h>
49 #include <ddb/ddb.h>
51 #include <cpu/cpu.h>
53 #include <machine/cpufunc.h>
54 #include <machine/cputypes.h>
55 #include <machine/smp.h>
56 #include <machine/globaldata.h>
57 #include <machine/trap.h>
58 #include <machine/pmap.h>
59 #include <machine/md_var.h>
61 #include <vm/vm_map.h>
62 #include <vm/vm_extern.h>
63 #include <vm/vm_param.h>
65 #include "vmm.h"
66 #include "vmm_utils.h"
68 #include "vmx.h"
69 #include "vmx_instr.h"
70 #include "vmx_vmcs.h"
72 #include "ept.h"
74 extern void trap(struct trapframe *frame);
76 static int vmx_check_cpu_migration(void);
77 static int execute_vmptrld(struct vmx_thread_info *vti);
79 struct instr_decode syscall_asm = {
80 .opcode_bytes = 2,
81 .opcode.byte1 = 0x0F,
82 .opcode.byte2 = 0x05,
85 struct vmx_ctl_info vmx_pinbased = {
86 .msr_addr = IA32_VMX_PINBASED_CTLS,
87 .msr_true_addr = IA32_VMX_TRUE_PINBASED_CTLS,
90 struct vmx_ctl_info vmx_procbased = {
91 .msr_addr = IA32_VMX_PROCBASED_CTLS,
92 .msr_true_addr = IA32_VMX_TRUE_PROCBASED_CTLS,
95 struct vmx_ctl_info vmx_procbased2 = {
96 .msr_addr = IA32_VMX_PROCBASED_CTLS2,
97 .msr_true_addr = IA32_VMX_PROCBASED_CTLS2,
100 struct vmx_ctl_info vmx_exit = {
101 .msr_addr = IA32_VMX_EXIT_CTLS,
102 .msr_true_addr = IA32_VMX_TRUE_EXIT_CTLS,
105 struct vmx_ctl_info vmx_entry = {
106 .msr_addr = IA32_VMX_ENTRY_CTLS,
107 .msr_true_addr = IA32_VMX_TRUE_ENTRY_CTLS,
110 /* Declared in generic vmm.c - SYSCTL parent */
111 extern struct sysctl_oid *vmm_sysctl_tree;
113 /* SYSCTL tree and context */
114 static struct sysctl_oid *vmx_sysctl_tree;
115 static struct sysctl_ctx_list vmx_sysctl_ctx;
117 /* Per cpu info */
118 struct vmx_pcpu_info *pcpu_info;
120 /* VMX BASIC INFO */
121 uint32_t vmx_revision;
122 uint32_t vmx_region_size;
123 uint8_t vmx_width_addr;
125 /* IA32_VMX_EPT_VPID_CAP */
126 uint64_t vmx_ept_vpid_cap;
128 /* VMX fixed bits */
129 uint64_t cr0_fixed_to_0;
130 uint64_t cr4_fixed_to_0;
131 uint64_t cr0_fixed_to_1;
132 uint64_t cr4_fixed_to_1;
134 /* VMX status */
135 static uint8_t vmx_enabled = 0;
136 static uint8_t vmx_initialized = 0;
138 /* VMX set control setting
139 * Intel System Programming Guide, Part 3, Order Number 326019
140 * 31.5.1 Algorithms for Determining VMX Capabilities
141 * Implement Algorithm 3
143 static int
144 vmx_set_ctl_setting(struct vmx_ctl_info *vmx_ctl, uint32_t bit_no, setting_t value) {
145 uint64_t vmx_basic;
146 uint64_t ctl_val;
148 /* Check if it's branch b. or c. */
149 vmx_basic = rdmsr(IA32_VMX_BASIC);
150 if (IS_TRUE_CTL_AVAIL(vmx_basic))
151 ctl_val = rdmsr(vmx_ctl->msr_true_addr);
152 else
153 ctl_val = rdmsr(vmx_ctl->msr_addr);
155 /* Check whether the VMM forces this value or leaves it on DEFAULT */
156 switch(value) {
157 case DEFAULT:
159 * Both settings are allowed
160 * - step b.iii)
161 * or
162 * - c.iii), c.iv)
164 if (IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no) &&
165 IS_ONE_SETTING_ALLOWED(ctl_val, bit_no)) {
166 /* For c.iii) and c.iv) */
167 if (IS_TRUE_CTL_AVAIL(vmx_basic))
168 ctl_val = rdmsr(vmx_ctl->msr_addr);
170 if (IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no))
171 vmx_ctl->ctls &= ~BIT(bit_no);
172 else if (IS_ONE_SETTING_ALLOWED(ctl_val, bit_no))
173 vmx_ctl->ctls |= BIT(bit_no);
174 } else if (IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no)) {
175 /* b.i), c.i) */
176 vmx_ctl->ctls &= ~BIT(bit_no);
177 } else if (IS_ONE_SETTING_ALLOWED(ctl_val, bit_no)) {
178 /* b.i), c.i) */
179 vmx_ctl->ctls |= BIT(bit_no);
180 } else {
181 return (EINVAL);
183 break;
184 case ZERO:
185 /* For b.ii) or c.ii) */
186 if (!IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no))
187 return (EINVAL);
188 vmx_ctl->ctls &= ~BIT(bit_no);
189 break;
190 case ONE:
191 /* For b.ii) or c.ii) */
192 if (!IS_ONE_SETTING_ALLOWED(ctl_val, bit_no))
193 return (EINVAL);
195 vmx_ctl->ctls |= BIT(bit_no);
197 break;
199 return 0;
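/*
 * Run vmx_set_ctl_setting() with the DEFAULT policy on all 32 bits of a
 * VMX control, picking whatever setting the CPU allows for each bit.
 */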
202 static void
203 vmx_set_default_settings(struct vmx_ctl_info *vmx_ctl)
205 int i;
207 for (i = 0; i < 32; i++) {
208 vmx_set_ctl_setting(vmx_ctl, i, DEFAULT);
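/*
 * Allocate one VMXON region per cpu.  Each region is over-allocated so
 * it can be aligned to 4K and is stamped with the VMX revision id.
 */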
212 static void
213 alloc_vmxon_regions(void)
215 int cpu;
216 pcpu_info = kmalloc(ncpus * sizeof(struct vmx_pcpu_info), M_TEMP, M_WAITOK | M_ZERO);
218 for (cpu = 0; cpu < ncpus; cpu++) {
220 /* The address must be aligned to 4K - alloc extra */
221 pcpu_info[cpu].vmxon_region_na = kmalloc(vmx_region_size + VMXON_REGION_ALIGN_SIZE,
222 M_TEMP,
223 M_WAITOK | M_ZERO);
225 /* Align address */
226 pcpu_info[cpu].vmxon_region = (unsigned char*) VMXON_REGION_ALIGN(pcpu_info[cpu].vmxon_region_na);
228 /* In the first 31 bits put the vmx revision */
229 *((uint32_t *) pcpu_info[cpu].vmxon_region) = vmx_revision;
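/* Release the per-cpu VMXON regions allocated by alloc_vmxon_regions(). */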
233 static void
234 free_vmxon_regions(void)
236 int i;
238 for (i = 0; i < ncpus; i++) {
239 pcpu_info[i].vmxon_region = NULL;
241 kfree(pcpu_info[i].vmxon_region_na, M_TEMP);
244 kfree(pcpu_info, M_TEMP);
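/* Export the detected VMX capabilities as read-only vmm.vmx.* sysctls. */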
247 static void
248 build_vmx_sysctl(void)
250 sysctl_ctx_init(&vmx_sysctl_ctx);
251 vmx_sysctl_tree = SYSCTL_ADD_NODE(&vmx_sysctl_ctx,
252 SYSCTL_CHILDREN(vmm_sysctl_tree),
253 OID_AUTO, "vmx",
254 CTLFLAG_RD, 0, "VMX options");
256 SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
257 SYSCTL_CHILDREN(vmx_sysctl_tree),
258 OID_AUTO, "revision", CTLFLAG_RD,
259 &vmx_revision, 0,
260 "VMX revision");
261 SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
262 SYSCTL_CHILDREN(vmx_sysctl_tree),
263 OID_AUTO, "region_size", CTLFLAG_RD,
264 &vmx_region_size, 0,
265 "VMX region size");
266 SYSCTL_ADD_INT(&vmx_sysctl_ctx,
267 SYSCTL_CHILDREN(vmx_sysctl_tree),
268 OID_AUTO, "width_addr", CTLFLAG_RD,
269 &vmx_width_addr, 0,
270 "VMX width address");
271 SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
272 SYSCTL_CHILDREN(vmx_sysctl_tree),
273 OID_AUTO, "pinbased_ctls", CTLFLAG_RD,
274 &vmx_pinbased.ctls, 0,
275 "VMX pin-based controls");
276 SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
277 SYSCTL_CHILDREN(vmx_sysctl_tree),
278 OID_AUTO, "procbased_ctls", CTLFLAG_RD,
279 &vmx_procbased.ctls, 0,
280 "VMX primary processor-based controls");
281 SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
282 SYSCTL_CHILDREN(vmx_sysctl_tree),
283 OID_AUTO, "procbased2_ctls", CTLFLAG_RD,
284 &vmx_procbased2.ctls, 0,
285 "VMX secondary processor-based controls");
286 SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
287 SYSCTL_CHILDREN(vmx_sysctl_tree),
288 OID_AUTO, "vmexit_ctls", CTLFLAG_RD,
289 &vmx_exit.ctls, 0,
290 "VMX exit controls");
291 SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
292 SYSCTL_CHILDREN(vmx_sysctl_tree),
293 OID_AUTO, "vmentry_ctls", CTLFLAG_RD,
294 &vmx_entry.ctls, 0,
295 "VMX entry controls");
296 SYSCTL_ADD_ULONG(&vmx_sysctl_ctx,
297 SYSCTL_CHILDREN(vmx_sysctl_tree),
298 OID_AUTO, "ept_vpid_cap", CTLFLAG_RD,
299 &vmx_ept_vpid_cap,
300 "VMX EPT VPID CAP");
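/*
 * One-time VMX detection and setup: verify CPUID support, negotiate the
 * pin-based, processor-based, entry and exit controls we need (EPT, EFER
 * load/save, 64-bit guest and host), verify the IA32_FEATURE_CONTROL lock,
 * cache the VMX basic and CR0/CR4 fixed-bit MSRs and publish the sysctls.
 */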
305 static int
306 vmx_init(void)
308 uint64_t feature_control;
309 uint64_t vmx_basic_value;
310 uint64_t cr0_fixed_bits_to_1;
311 uint64_t cr0_fixed_bits_to_0;
312 uint64_t cr4_fixed_bits_to_0;
313 uint64_t cr4_fixed_bits_to_1;
315 int err;
319 * The ability of a processor to support VMX operation
320 * and related instructions is indicated by:
321 * CPUID.1:ECX.VMX[bit 5] = 1
323 if (!(cpu_feature2 & CPUID2_VMX)) {
324 kprintf("VMM: VMX is not supported by this Intel CPU\n");
325 return (ENODEV);
328 vmx_set_default_settings(&vmx_pinbased);
330 vmx_set_default_settings(&vmx_procbased);
331 /* Enable second level for procbased */
332 err = vmx_set_ctl_setting(&vmx_procbased,
333 PROCBASED_ACTIVATE_SECONDARY_CONTROLS,
334 ONE);
335 if (err) {
336 kprintf("VMM: PROCBASED_ACTIVATE_SECONDARY_CONTROLS not "
337 "supported by this CPU\n");
338 return (ENODEV);
340 vmx_set_default_settings(&vmx_procbased2);
342 vmx_set_default_settings(&vmx_exit);
343 vmx_set_default_settings(&vmx_entry);
345 /* Enable external interrupts exiting */
346 err = vmx_set_ctl_setting(&vmx_pinbased,
347 PINBASED_EXTERNAL_INTERRUPT_EXITING,
348 ONE);
349 if (err) {
350 kprintf("VMM: PINBASED_EXTERNAL_INTERRUPT_EXITING not "
351 "supported by this CPU\n");
352 return (ENODEV);
355 /* Enable non-maskable interrupts exiting */
356 err = vmx_set_ctl_setting(&vmx_pinbased, PINBASED_NMI_EXITING, ONE);
357 if (err) {
358 kprintf("VMM: PINBASED_NMI_EXITING not "
359 "supported by this CPU\n");
360 return (ENODEV);
364 /* Set 64-bit mode for the GUEST */
365 err = vmx_set_ctl_setting(&vmx_entry, VMENTRY_IA32e_MODE_GUEST, ONE);
366 if (err) {
367 kprintf("VMM: VMENTRY_IA32e_MODE_GUEST not "
368 "supported by this CPU\n");
369 return (ENODEV);
372 /* Load MSR EFER on entry */
373 err = vmx_set_ctl_setting(&vmx_entry,
374 VMENTRY_LOAD_IA32_EFER, ONE);
375 if (err) {
376 kprintf("VMM: VMENTRY_LOAD_IA32_EFER not "
377 "supported by this CPU\n");
378 return (ENODEV);
381 /* Set 64-bit mode for the HOST on VM exit */
382 err = vmx_set_ctl_setting(&vmx_exit,
383 VMEXIT_HOST_ADDRESS_SPACE_SIZE, ONE);
384 if (err) {
385 kprintf("VMM: VMEXIT_HOST_ADDRESS_SPACE_SIZE not "
386 "supported by this CPU\n");
387 return (ENODEV);
390 /* Save EFER on exit */
391 err = vmx_set_ctl_setting(&vmx_exit,
392 VMEXIT_SAVE_IA32_EFER,
393 ONE);
394 if (err) {
395 kprintf("VMM: VMEXIT_SAVE_IA32_EFER not "
396 "supported by this CPU\n");
397 return (ENODEV);
400 /* Load EFER on exit */
401 err = vmx_set_ctl_setting(&vmx_exit,
402 VMEXIT_LOAD_IA32_EFER,
403 ONE);
404 if (err) {
405 kprintf("VMM: VMEXIT_LOAD_IA32_EFER not "
406 "supported by this CPU\n");
407 return (ENODEV);
410 /* Enable EPT feature */
411 err = vmx_set_ctl_setting(&vmx_procbased2,
412 PROCBASED2_ENABLE_EPT,
413 ONE);
414 if (err) {
415 kprintf("VMM: PROCBASED2_ENABLE_EPT not "
416 "supported by this CPU\n");
417 return (ENODEV);
420 if (vmx_ept_init()) {
421 kprintf("VMM: vmx_ept_init failed\n");
422 return (ENODEV);
424 #if 0
425 /* XXX - to implement in the future */
426 /* Enable VPID feature */
427 err = vmx_set_ctl_setting(&vmx_procbased2,
428 PROCBASED2_ENABLE_VPID,
429 ONE);
430 if (err) {
431 kprintf("VMM: PROCBASED2_ENABLE_VPID not "
432 "supported by this CPU\n");
433 return (ENODEV);
435 #endif
437 /* Check for the feature control status */
438 feature_control = rdmsr(IA32_FEATURE_CONTROL);
439 if (!(feature_control & BIT(FEATURE_CONTROL_LOCKED))) {
440 kprintf("VMM: IA32_FEATURE_CONTROL is not locked\n");
441 return (EINVAL);
443 if (!(feature_control & BIT(FEATURE_CONTROL_VMX_BIOS_ENABLED))) {
444 kprintf("VMM: VMX is disabled by the BIOS\n");
445 return (EINVAL);
448 vmx_basic_value = rdmsr(IA32_VMX_BASIC);
449 vmx_width_addr = (uint8_t) VMX_WIDTH_ADDR(vmx_basic_value);
450 vmx_region_size = (uint32_t) VMX_REGION_SIZE(vmx_basic_value);
451 vmx_revision = (uint32_t) VMX_REVISION(vmx_basic_value);
453 /* A.7 VMX-FIXED BITS IN CR0 */
454 cr0_fixed_bits_to_1 = rdmsr(IA32_VMX_CR0_FIXED0);
455 cr0_fixed_bits_to_0 = rdmsr(IA32_VMX_CR0_FIXED1);
456 cr0_fixed_to_1 = cr0_fixed_bits_to_1 & cr0_fixed_bits_to_0;
457 cr0_fixed_to_0 = ~cr0_fixed_bits_to_1 & ~cr0_fixed_bits_to_0;
459 /* A.8 VMX-FIXED BITS IN CR4 */
460 cr4_fixed_bits_to_1 = rdmsr(IA32_VMX_CR4_FIXED0);
461 cr4_fixed_bits_to_0 = rdmsr(IA32_VMX_CR4_FIXED1);
462 cr4_fixed_to_1 = cr4_fixed_bits_to_1 & cr4_fixed_bits_to_0;
463 cr4_fixed_to_0 = ~cr4_fixed_bits_to_1 & ~cr4_fixed_bits_to_0;
465 build_vmx_sysctl();
467 vmx_initialized = 1;
468 return 0;
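/*
 * Per-cpu cpusync callback: apply the CR0/CR4 fixed bits, set CR4.VMXE
 * and execute VMXON on this cpu's region.  *perr receives the result.
 */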
471 static void
472 execute_vmxon(void *perr)
474 unsigned char *vmxon_region;
475 int *err = (int*) perr;
477 /* A.7 VMX-FIXED BITS IN CR0 */
478 load_cr0((rcr0() | cr0_fixed_to_1) & ~cr0_fixed_to_0);
480 /* A.8 VMX-FIXED BITS IN CR4 */
481 load_cr4((rcr4() | cr4_fixed_to_1) & ~cr4_fixed_to_0);
483 /* Enable VMX */
484 load_cr4(rcr4() | CR4_VMXE);
486 vmxon_region = pcpu_info[mycpuid].vmxon_region;
487 *err = vmxon(vmxon_region);
488 if (*err) {
489 kprintf("VMM: vmxon failed on cpu%d\n", mycpuid);
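/*
 * Per-cpu cpusync callback: invalidate all EPT contexts, execute VMXOFF
 * and clear CR4.VMXE.
 */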
493 static void
494 execute_vmxoff(void *dummy)
496 invept_desc_t desc = { 0 };
498 if (invept(INVEPT_TYPE_ALL_CONTEXTS, (uint64_t*) &desc))
499 kprintf("VMM: execute_vmxoff: invept failed on cpu%d\n", mycpu->gd_cpuid);
501 vmxoff();
503 /* Disable VMX */
504 load_cr4(rcr4() & ~CR4_VMXE);
507 static void
508 execute_vmclear(void *data)
510 struct vmx_thread_info *vti = data;
511 int err;
512 globaldata_t gd = mycpu;
514 if (pcpu_info[gd->gd_cpuid].loaded_vmx == vti) {
516 * Must set vti->launched to zero after vmclear'ing to
517 * force a vmlaunch the next time.
519 * Must not clear the loaded_vmx field until after we call
520 * vmclear on the region. This field triggers the interlocked
521 * cpusync from another cpu trying to destroy or reuse
522 * the vti. If we clear the field first, the other cpu will
523 * not interlock and may race our vmclear() on the underlying
524 * memory.
526 ERROR_IF(vmclear(vti->vmcs_region));
527 error:
528 pcpu_info[gd->gd_cpuid].loaded_vmx = NULL;
529 vti->launched = 0;
531 return;
534 static int
535 execute_vmptrld(struct vmx_thread_info *vti)
537 globaldata_t gd = mycpu;
540 * Must vmclear the previously active vmcs if it is different.
542 if (pcpu_info[gd->gd_cpuid].loaded_vmx &&
543 pcpu_info[gd->gd_cpuid].loaded_vmx != vti)
544 execute_vmclear(pcpu_info[gd->gd_cpuid].loaded_vmx);
547 * Make this the current VMCS. Must set loaded_vmx field
548 * before calling vmptrld() to avoid races against cpusync.
550 * Must set vti->launched to zero after the vmptrld to force
551 * a vmlaunch.
553 if (pcpu_info[gd->gd_cpuid].loaded_vmx != vti) {
554 vti->launched = 0;
555 pcpu_info[gd->gd_cpuid].loaded_vmx = vti;
556 return (vmptrld(vti->vmcs_region));
557 } else {
558 return (0);
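/*
 * Enter VMX operation on every cpu by running execute_vmxon() through a
 * single-cpu cpusync per cpu.  Requires a successful vmx_init() first.
 */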
562 static int
563 vmx_enable(void)
565 int err;
566 int cpu;
568 if (!vmx_initialized) {
569 kprintf("VMM: vmx_enable - not allowed; vmx not initialized\n");
570 return (EINVAL);
573 if (vmx_enabled) {
574 kprintf("VMM: vmx_enable - already enabled\n");
575 return (EINVAL);
578 alloc_vmxon_regions();
579 for (cpu = 0; cpu < ncpus; cpu++) {
580 cpumask_t mask;
582 err = 0;
583 CPUMASK_ASSBIT(mask, cpu);
584 lwkt_cpusync_simple(mask, execute_vmxon, &err);
585 if(err) {
586 kprintf("VMM: vmx_enable error %d on cpu%d\n", err, cpu);
587 return err;
590 vmx_enabled = 1;
591 return 0;
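/* Leave VMX operation on every cpu and release the VMXON regions. */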
594 static int
595 vmx_disable(void)
597 int cpu;
599 if (!vmx_enabled) {
600 kprintf("VMM: vmx_disable not allowed; vmx wasn't enabled\n");
603 for (cpu = 0; cpu < ncpus; cpu++) {
604 cpumask_t mask;
606 CPUMASK_ASSBIT(mask, cpu);
607 lwkt_cpusync_simple(mask, execute_vmxoff, NULL);
610 free_vmxon_regions();
612 vmx_enabled = 0;
614 return 0;
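/*
 * Write a guest segment descriptor (selector, access rights, base and
 * limit) into the currently loaded VMCS, fixing up the granularity bit
 * according to the limit as required by the Intel manual.
 */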
617 static int vmx_set_guest_descriptor(descriptor_t type,
618 uint16_t selector,
619 uint32_t rights,
620 uint64_t base,
621 uint32_t limit)
623 int err;
624 int selector_enc;
625 int rights_enc;
626 int base_enc;
627 int limit_enc;
631 * Intel Manual Vol 3C. - page 60
632 * If any bit in the limit field in the range 11:0 is 0, G must be 0.
633 * If any bit in the limit field in the range 31:20 is 1, G must be 1.
635 if ((~rights & VMCS_SEG_UNUSABLE) || (type == CS)) {
636 if ((limit & 0xfff) != 0xfff)
637 rights &= ~VMCS_G;
638 else if ((limit & 0xfff00000) != 0)
639 rights |= VMCS_G;
642 switch(type) {
643 case ES:
644 selector_enc = VMCS_GUEST_ES_SELECTOR;
645 rights_enc = VMCS_GUEST_ES_ACCESS_RIGHTS;
646 base_enc = VMCS_GUEST_ES_BASE;
647 limit_enc = VMCS_GUEST_ES_LIMIT;
648 break;
649 case CS:
650 selector_enc = VMCS_GUEST_CS_SELECTOR;
651 rights_enc = VMCS_GUEST_CS_ACCESS_RIGHTS;
652 base_enc = VMCS_GUEST_CS_BASE;
653 limit_enc = VMCS_GUEST_CS_LIMIT;
654 break;
655 case SS:
656 selector_enc = VMCS_GUEST_SS_SELECTOR;
657 rights_enc = VMCS_GUEST_SS_ACCESS_RIGHTS;
658 base_enc = VMCS_GUEST_SS_BASE;
659 limit_enc = VMCS_GUEST_SS_LIMIT;
660 break;
661 case DS:
662 selector_enc = VMCS_GUEST_DS_SELECTOR;
663 rights_enc = VMCS_GUEST_DS_ACCESS_RIGHTS;
664 base_enc = VMCS_GUEST_DS_BASE;
665 limit_enc = VMCS_GUEST_DS_LIMIT;
666 break;
667 case FS:
668 selector_enc = VMCS_GUEST_FS_SELECTOR;
669 rights_enc = VMCS_GUEST_FS_ACCESS_RIGHTS;
670 base_enc = VMCS_GUEST_FS_BASE;
671 limit_enc = VMCS_GUEST_FS_LIMIT;
672 break;
673 case GS:
674 selector_enc = VMCS_GUEST_GS_SELECTOR;
675 rights_enc = VMCS_GUEST_GS_ACCESS_RIGHTS;
676 base_enc = VMCS_GUEST_GS_BASE;
677 limit_enc = VMCS_GUEST_GS_LIMIT;
678 break;
679 case LDTR:
680 selector_enc = VMCS_GUEST_LDTR_SELECTOR;
681 rights_enc = VMCS_GUEST_LDTR_ACCESS_RIGHTS;
682 base_enc = VMCS_GUEST_LDTR_BASE;
683 limit_enc = VMCS_GUEST_LDTR_LIMIT;
684 break;
685 case TR:
686 selector_enc = VMCS_GUEST_TR_SELECTOR;
687 rights_enc = VMCS_GUEST_TR_ACCESS_RIGHTS;
688 base_enc = VMCS_GUEST_TR_BASE;
689 limit_enc = VMCS_GUEST_TR_LIMIT;
690 break;
691 default:
692 kprintf("VMM: vmx_set_guest_descriptor: unknown descriptor\n");
693 err = -1;
694 goto error;
695 break;
698 ERROR_IF(vmwrite(selector_enc, selector));
699 ERROR_IF(vmwrite(rights_enc, rights));
700 ERROR_IF(vmwrite(base_enc, base));
701 ERROR_IF(vmwrite(limit_enc, limit));
703 return 0;
704 error:
705 kprintf("VMM: vmx_set_guest_descriptor failed\n");
706 return err;
710 * Called by the first thread of the VMM process
711 * - create a new vmspace
712 * - init the vmspace with EPT PG_* bits and
713 * EPT copyin/copyout functions
714 * - replace the vmspace of the current proc
715 * - remove the old vmspace
717 static int
718 vmx_vminit_master(struct vmm_guest_options *options)
720 struct vmspace *oldvmspace;
721 struct vmspace *newvmspace;
722 struct proc *p = curthread->td_proc;
723 struct vmm_proc *p_vmm;
725 oldvmspace = curthread->td_lwp->lwp_vmspace;
726 newvmspace = vmspace_fork(oldvmspace);
728 vmx_ept_pmap_pinit(vmspace_pmap(newvmspace));
729 bzero(vmspace_pmap(newvmspace)->pm_pml4, PAGE_SIZE);
731 lwkt_gettoken(&oldvmspace->vm_map.token);
732 lwkt_gettoken(&newvmspace->vm_map.token);
734 pmap_pinit2(vmspace_pmap(newvmspace));
735 pmap_replacevm(curthread->td_proc, newvmspace, 0);
737 lwkt_reltoken(&newvmspace->vm_map.token);
738 lwkt_reltoken(&oldvmspace->vm_map.token);
740 vmspace_rel(oldvmspace);
742 options->vmm_cr3 = vtophys(vmspace_pmap(newvmspace)->pm_pml4);
744 p_vmm = kmalloc(sizeof(struct vmm_proc), M_TEMP, M_WAITOK | M_ZERO);
745 p_vmm->guest_cr3 = options->guest_cr3;
746 p_vmm->vmm_cr3 = options->vmm_cr3;
747 p->p_vmm = (void *)p_vmm;
749 if (p->p_vkernel) {
750 p->p_vkernel->vkernel_cr3 = options->guest_cr3;
751 dkprintf("PROCESS CR3 %016jx\n", (intmax_t)options->guest_cr3);
754 return 0;
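/*
 * Per-thread VMM initialization: allocate the vmx_thread_info and the
 * VMCS region, load the VMCS and program the VMX controls, the host
 * state, the guest segment descriptors, control registers, RIP/RSP and
 * the EPT pointer.  The master thread also replaces the process vmspace.
 */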
757 static int
758 vmx_vminit(struct vmm_guest_options *options)
760 struct vmx_thread_info * vti;
761 int err;
762 struct tls_info guest_fs = curthread->td_tls.info[0];
763 struct tls_info guest_gs = curthread->td_tls.info[1];
766 vti = kmalloc(sizeof(struct vmx_thread_info), M_TEMP, M_WAITOK | M_ZERO);
767 curthread->td_vmm = (void*) vti;
769 if (options->master) {
770 vmx_vminit_master(options);
773 bcopy(&options->tf, &vti->guest, sizeof(struct trapframe));
776 * Be sure we return success if the VMM hook enters
778 vti->guest.tf_rax = 0;
779 vti->guest.tf_rflags &= ~PSL_C;
781 vti->vmcs_region_na = kmalloc(vmx_region_size + VMXON_REGION_ALIGN_SIZE,
782 M_TEMP,
783 M_WAITOK | M_ZERO);
785 /* Align address */
786 vti->vmcs_region = (unsigned char*) VMXON_REGION_ALIGN(vti->vmcs_region_na);
787 vti->last_cpu = -1;
789 vti->guest_cr3 = options->guest_cr3;
790 vti->vmm_cr3 = options->vmm_cr3;
792 /* In the first 31 bits put the vmx revision */
793 *((uint32_t *)vti->vmcs_region) = vmx_revision;
796 * vmclear the vmcs to initialize it.
798 ERROR_IF(vmclear(vti->vmcs_region));
800 crit_enter();
802 ERROR_IF(execute_vmptrld(vti));
804 /* Load the VMX controls */
805 ERROR_IF(vmwrite(VMCS_PINBASED_CTLS, vmx_pinbased.ctls));
806 ERROR_IF(vmwrite(VMCS_PROCBASED_CTLS, vmx_procbased.ctls));
807 ERROR_IF(vmwrite(VMCS_PROCBASED2_CTLS, vmx_procbased2.ctls));
808 ERROR_IF(vmwrite(VMCS_VMEXIT_CTLS, vmx_exit.ctls));
809 ERROR_IF(vmwrite(VMCS_VMENTRY_CTLS, vmx_entry.ctls));
811 /* Load HOST CRs */
812 ERROR_IF(vmwrite(VMCS_HOST_CR0, rcr0()));
813 ERROR_IF(vmwrite(VMCS_HOST_CR4, rcr4()));
815 /* Load HOST EFER and PAT */
816 // ERROR_IF(vmwrite(VMCS_HOST_IA32_PAT, rdmsr(MSR_PAT)));
817 ERROR_IF(vmwrite(VMCS_HOST_IA32_EFER, rdmsr(MSR_EFER)));
819 /* Load HOST selectors */
820 ERROR_IF(vmwrite(VMCS_HOST_ES_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)));
821 ERROR_IF(vmwrite(VMCS_HOST_SS_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)));
822 ERROR_IF(vmwrite(VMCS_HOST_FS_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)));
823 ERROR_IF(vmwrite(VMCS_HOST_GS_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)));
824 ERROR_IF(vmwrite(VMCS_HOST_CS_SELECTOR, GSEL(GCODE_SEL, SEL_KPL)));
825 ERROR_IF(vmwrite(VMCS_HOST_TR_SELECTOR, GSEL(GPROC0_SEL, SEL_KPL)));
828 * The BASE addresses are written on each VMRUN in case
829 * the CPU changes, because they are per-CPU values
833 * Call vmx_vmexit on VM_EXIT condition
834 * The RSP will point to the vmx_thread_info
836 ERROR_IF(vmwrite(VMCS_HOST_RIP, (uint64_t) vmx_vmexit));
837 ERROR_IF(vmwrite(VMCS_HOST_RSP, (uint64_t) vti));
838 ERROR_IF(vmwrite(VMCS_HOST_CR3, (uint64_t) KPML4phys));
841 * GUEST initialization
842 * - set the descriptors according the conditions from Intel
843 * manual "26.3.1.2 Checks on Guest Segment Registers"
844 * - set the privilege to SEL_UPL (the vkernel will run
845 * in userspace context)
847 ERROR_IF(vmx_set_guest_descriptor(ES, GSEL(GUDATA_SEL, SEL_UPL),
848 VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
849 0, 0));
851 ERROR_IF(vmx_set_guest_descriptor(SS, GSEL(GUDATA_SEL, SEL_UPL),
852 VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
853 0, 0));
855 ERROR_IF(vmx_set_guest_descriptor(DS, GSEL(GUDATA_SEL, SEL_UPL),
856 VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
857 0, 0));
859 ERROR_IF(vmx_set_guest_descriptor(FS, GSEL(GUDATA_SEL, SEL_UPL),
860 VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
861 (uint64_t) guest_fs.base, (uint32_t) guest_fs.size));
863 ERROR_IF(vmx_set_guest_descriptor(GS, GSEL(GUDATA_SEL, SEL_UPL),
864 VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
865 (uint64_t) guest_gs.base, (uint32_t) guest_gs.size));
867 ERROR_IF(vmx_set_guest_descriptor(CS, GSEL(GUCODE_SEL, SEL_UPL),
868 VMCS_SEG_TYPE(11) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P | VMCS_L,
869 0, 0));
871 ERROR_IF(vmx_set_guest_descriptor(TR, GSEL(GPROC0_SEL, SEL_UPL),
872 VMCS_SEG_TYPE(11) | VMCS_P,
873 0, 0));
875 ERROR_IF(vmx_set_guest_descriptor(LDTR, 0, VMCS_SEG_UNUSABLE, 0, 0));
877 /* Set the CR0/CR4 registers, removing the unsupported bits */
878 ERROR_IF(vmwrite(VMCS_GUEST_CR0, (CR0_PE | CR0_PG |
879 cr0_fixed_to_1) & ~cr0_fixed_to_0));
880 ERROR_IF(vmwrite(VMCS_GUEST_CR4, (CR4_PAE | CR4_FXSR | CR4_XMM | CR4_XSAVE |
881 cr4_fixed_to_1) & ~ cr4_fixed_to_0));
883 /* Leave EFER_SCE clear so that "syscall" instructions fault and can be caught */
884 ERROR_IF(vmwrite(VMCS_GUEST_IA32_EFER, (EFER_LME | EFER_LMA)));
886 vti->guest.tf_rflags = PSL_I | 0x02;
887 ERROR_IF(vmwrite(VMCS_GUEST_RFLAGS, vti->guest.tf_rflags));
889 /* The Guest CR3 indicating CR3 pagetable */
890 ERROR_IF(vmwrite(VMCS_GUEST_CR3, (uint64_t) vti->guest_cr3));
892 /* Intercept all possible exceptions */
893 ERROR_IF(vmwrite(VMCS_EXCEPTION_BITMAP,(uint64_t) 0xFFFFFFFF));
895 /* Guest RIP and RSP */
896 ERROR_IF(vmwrite(VMCS_GUEST_RIP, options->tf.tf_rip));
897 ERROR_IF(vmwrite(VMCS_GUEST_RSP, options->tf.tf_rsp));
900 * This field is included for future expansion.
901 * Software should set this field to FFFFFFFF_FFFFFFFFH
902 * to avoid VM-entry failures (see Section 26.3.1.5).
904 ERROR_IF(vmwrite(VMCS_LINK_POINTER, ~0ULL));
906 /* The pointer to the EPT pagetable */
907 ERROR_IF(vmwrite(VMCS_EPTP, vmx_eptp(vti->vmm_cr3)));
909 vti->invept_desc.eptp = vmx_eptp(vti->vmm_cr3);
911 crit_exit();
913 return 0;
914 error:
915 crit_exit();
917 kprintf("VMM: vmx_vminit failed\n");
918 execute_vmclear(vti);
920 kfree(vti->vmcs_region_na, M_TEMP);
921 kfree(vti, M_TEMP);
922 return err;
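/*
 * Tear down the calling thread's VMCS: vmclear it if it is loaded on
 * this cpu, free the region and, for the last thread in the process,
 * free the per-process vmm state.
 */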
925 static int
926 vmx_vmdestroy(void)
928 struct vmx_thread_info *vti = curthread->td_vmm;
929 struct proc *p = curproc;
930 int error = -1;
932 if (vti != NULL) {
933 vmx_check_cpu_migration();
934 if (vti->vmcs_region &&
935 pcpu_info[mycpu->gd_cpuid].loaded_vmx == vti)
936 execute_vmclear(vti);
938 if (vti->vmcs_region_na != NULL) {
939 kfree(vti->vmcs_region_na, M_TEMP);
940 kfree(vti, M_TEMP);
941 error = 0;
943 curthread->td_vmm = NULL;
944 lwkt_gettoken(&p->p_token);
945 if (p->p_nthreads == 1) {
946 kfree(p->p_vmm, M_TEMP);
947 p->p_vmm = NULL;
949 lwkt_reltoken(&p->p_token);
951 return error;
955 * Checks if we migrated to another cpu
957 * No locks are required
959 static int
960 vmx_check_cpu_migration(void)
962 struct vmx_thread_info * vti;
963 struct globaldata *gd;
964 cpumask_t mask;
965 int err;
967 gd = mycpu;
968 vti = (struct vmx_thread_info *) curthread->td_vmm;
969 ERROR_IF(vti == NULL);
971 if (vti->last_cpu != -1 && vti->last_cpu != gd->gd_cpuid &&
972 pcpu_info[vti->last_cpu].loaded_vmx == vti) {
974 * Do not reset last_cpu to -1 here, leave it caching
975 * the cpu whose per-cpu fields the VMCS is synchronized
976 * with. The pcpu_info[] check prevents unnecessary extra
977 * cpusyncs.
979 dkprintf("VMM: cpusync from %d to %d\n",
980 gd->gd_cpuid, vti->last_cpu);
982 /* Clear the VMCS area if it ran on another CPU */
983 CPUMASK_ASSBIT(mask, vti->last_cpu);
984 lwkt_cpusync_simple(mask, execute_vmclear, (void *)vti);
986 return 0;
987 error:
988 kprintf("VMM: vmx_check_cpu_migration failed\n");
989 return err;
992 /* Handle CPU migration
994 * We have to enter with interrupts disabled/critical section
995 * to be sure that another VMCS won't steal our CPU.
997 static inline int
998 vmx_handle_cpu_migration(void)
1000 struct vmx_thread_info * vti;
1001 struct globaldata *gd;
1002 int err;
1004 gd = mycpu;
1005 vti = (struct vmx_thread_info *) curthread->td_vmm;
1006 ERROR_IF(vti == NULL);
1008 if (vti->last_cpu != gd->gd_cpuid) {
1010 * We need to synchronize the per-cpu fields after changing
1011 * cpus.
1013 dkprintf("VMM: vmx_handle_cpu_migration init per CPU data\n");
1015 ERROR_IF(execute_vmptrld(vti));
1017 /* Host related registers */
1018 ERROR_IF(vmwrite(VMCS_HOST_GS_BASE, (uint64_t) gd)); /* mycpu points to %gs:0 */
1019 ERROR_IF(vmwrite(VMCS_HOST_TR_BASE, (uint64_t) &gd->gd_prvspace->mdglobaldata.gd_common_tss));
1021 ERROR_IF(vmwrite(VMCS_HOST_GDTR_BASE, (uint64_t) &gdt[gd->gd_cpuid * NGDT]));
1022 ERROR_IF(vmwrite(VMCS_HOST_IDTR_BASE, (uint64_t) r_idt_arr[gd->gd_cpuid].rd_base));
1025 /* Guest related register */
1026 ERROR_IF(vmwrite(VMCS_GUEST_GDTR_BASE, (uint64_t) &gdt[gd->gd_cpuid * NGDT]));
1027 ERROR_IF(vmwrite(VMCS_GUEST_GDTR_LIMIT, (uint64_t) (NGDT * sizeof(gdt[0]) - 1)));
1030 * Indicates which cpu the per-cpu fields are synchronized
1031 * with. Does not indicate whether the vmcs is active on
1032 * that particular cpu.
1034 vti->last_cpu = gd->gd_cpuid;
1035 } else if (pcpu_info[gd->gd_cpuid].loaded_vmx != vti) {
1037 * We only need to vmptrld
1039 dkprintf("VMM: vmx_handle_cpu_migration: vmcs is not loaded\n");
1041 ERROR_IF(execute_vmptrld(vti));
1043 } /* else we don't need to do anything */
1044 return 0;
1045 error:
1046 kprintf("VMM: vmx_handle_cpu_migration failed\n");
1047 return err;
1050 /* Load information about VMexit
1052 * We are still running with interrupts disabled/in a critical section
1053 * because we must operate with the VMCS on the CPU
1055 static inline int
1056 vmx_vmexit_loadinfo(void)
1058 struct vmx_thread_info *vti;
1059 int err;
1061 vti = (struct vmx_thread_info *) curthread->td_vmm;
1062 ERROR_IF(vti == NULL);
1064 ERROR_IF(vmread(VMCS_VMEXIT_REASON, &vti->vmexit_reason));
1065 ERROR_IF(vmread(VMCS_EXIT_QUALIFICATION, &vti->vmexit_qualification));
1066 ERROR_IF(vmread(VMCS_VMEXIT_INTERRUPTION_INFO, &vti->vmexit_interruption_info));
1067 ERROR_IF(vmread(VMCS_VMEXIT_INTERRUPTION_ERROR, &vti->vmexit_interruption_error));
1068 ERROR_IF(vmread(VMCS_VMEXIT_INSTRUCTION_LENGTH, &vti->vmexit_instruction_length));
1069 ERROR_IF(vmread(VMCS_GUEST_PHYSICAL_ADDRESS, &vti->guest_physical_address));
1070 ERROR_IF(vmread(VMCS_GUEST_RIP, &vti->guest.tf_rip));
1071 ERROR_IF(vmread(VMCS_GUEST_CS_SELECTOR, &vti->guest.tf_cs));
1072 ERROR_IF(vmread(VMCS_GUEST_RFLAGS, &vti->guest.tf_rflags));
1073 ERROR_IF(vmread(VMCS_GUEST_RSP, &vti->guest.tf_rsp));
1074 ERROR_IF(vmread(VMCS_GUEST_SS_SELECTOR, &vti->guest.tf_ss));
1076 return 0;
1077 error:
1078 kprintf("VMM: vmx_vmexit_loadinfo failed\n");
1079 return err;
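/*
 * Refresh the guest %fs/%gs descriptors in the VMCS from the current
 * thread's TLS info.
 */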
1083 static int
1084 vmx_set_tls_area(void)
1086 struct tls_info *guest_fs = &curthread->td_tls.info[0];
1087 struct tls_info *guest_gs = &curthread->td_tls.info[1];
1089 int err;
1091 dkprintf("VMM: vmx_set_tls_area hook\n");
1093 crit_enter();
1095 ERROR_IF(vmx_check_cpu_migration());
1096 ERROR_IF(vmx_handle_cpu_migration());
1098 /* set %fs */
1099 ERROR_IF(vmx_set_guest_descriptor(FS, GSEL(GUDATA_SEL, SEL_UPL),
1100 VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
1101 (uint64_t) guest_fs->base, (uint32_t) guest_fs->size));
1103 /* set %gs */
1104 ERROR_IF(vmx_set_guest_descriptor(GS, GSEL(GUDATA_SEL, SEL_UPL),
1105 VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
1106 (uint64_t) guest_gs->base, (uint32_t) guest_gs->size));
1108 crit_exit();
1109 return 0;
1111 error:
1112 crit_exit();
1113 return err;
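/*
 * Dispatch the last VMEXIT recorded by vmx_vmexit_loadinfo(): hardware
 * exceptions (#UD fast syscalls, #PF, #BP), external interrupts, CPUID
 * emulation and EPT faults.  Returns 0 to resume the guest.
 */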
1117 static int
1118 vmx_handle_vmexit(void)
1120 struct vmx_thread_info * vti;
1121 int exit_reason;
1122 int exception_type;
1123 int exception_number;
1124 int err;
1125 int func, regs[4];
1126 int rv;
1127 int fault_type = VM_PROT_NONE;
1128 int fault_flags = 0;
1129 struct lwp *lp = curthread->td_lwp;
1131 dkprintf("VMM: handle_vmx_vmexit\n");
1132 vti = (struct vmx_thread_info *) curthread->td_vmm;
1133 ERROR_IF(vti == NULL);
1135 exit_reason = VMCS_BASIC_EXIT_REASON(vti->vmexit_reason);
1136 switch (exit_reason) {
1137 case EXIT_REASON_EXCEPTION:
1138 dkprintf("VMM: handle_vmx_vmexit: "
1139 "EXIT_REASON_EXCEPTION with qualification "
1140 "%llx, interruption info %llx, "
1141 "interruption error %llx, instruction "
1142 "length %llx\n",
1143 (long long) vti->vmexit_qualification,
1144 (long long) vti->vmexit_interruption_info,
1145 (long long) vti->vmexit_interruption_error,
1146 (long long) vti->vmexit_instruction_length);
1148 dkprintf("VMM: handle_vmx_vmexit: rax: %llx, rip: %llx, "
1149 "rsp: %llx, rdi: %llx, "
1150 "rsi: %llx, %d, "
1151 "vti: %p, master: %p\n",
1152 (long long)vti->guest.tf_rax,
1153 (long long)vti->guest.tf_rip,
1154 (long long)vti->guest.tf_rsp,
1155 (long long)vti->guest.tf_rdi,
1156 (long long)vti->guest.tf_rsi,
1157 exit_reason, vti, curproc->p_vmm);
1159 exception_type =
1160 VMCS_EXCEPTION_TYPE(vti->vmexit_interruption_info);
1161 exception_number =
1162 VMCS_EXCEPTION_NUMBER(vti->vmexit_interruption_info);
1164 if (exception_type == VMCS_EXCEPTION_HARDWARE) {
1165 switch (exception_number) {
1166 case IDT_UD:
1168 * The "syscall" instruction is disabled and
1169 * we catch the resulting #UD here to execute the syscall
1171 dkprintf("VMM: handle_vmx_vmexit: "
1172 "VMCS_EXCEPTION_HARDWARE IDT_UD\n");
1173 #ifdef VMM_DEBUG
1174 /* Check to see if it is the syscall asm instruction */
1175 uint8_t instr[INSTRUCTION_MAX_LENGTH];
1176 if (copyin((const void *)vti->guest.tf_rip,
1177 instr,
1178 vti->vmexit_instruction_length) &&
1179 instr_check(&syscall_asm,(void *)instr,
1180 (uint8_t)vti->vmexit_instruction_length)) {
1181 kprintf("VMM: handle_vmx_vmexit: "
1182 "UD different from syscall: ");
1183 db_disasm((db_addr_t)instr, FALSE, NULL);
1185 #endif
1186 /* Called to force a VMEXIT and invalidate TLB */
1187 if (vti->guest.tf_rax == -1) {
1188 vti->guest.tf_rip +=
1189 vti->vmexit_instruction_length;
1190 break;
1193 vti->guest.tf_err = 2;
1194 vti->guest.tf_trapno = T_FAST_SYSCALL;
1195 vti->guest.tf_xflags = 0;
1197 vti->guest.tf_rip +=
1198 vti->vmexit_instruction_length;
1200 syscall2(&vti->guest);
1201 break;
1202 case IDT_PF:
1203 dkprintf("VMM: handle_vmx_vmexit: "
1204 "VMCS_EXCEPTION_HARDWARE IDT_PF "
1205 "at %llx\n",
1206 (long long) vti->guest.tf_rip);
1208 #if 0
1209 if (vti->guest.tf_rip == 0) {
1210 kprintf("VMM: handle_vmx_vmexit: "
1211 "Terminating...\n");
1212 err = -1;
1213 goto error;
1215 #endif
1217 vti->guest.tf_err =
1218 vti->vmexit_interruption_error;
1219 vti->guest.tf_addr =
1220 vti->vmexit_qualification;
1221 vti->guest.tf_xflags = 0;
1222 vti->guest.tf_trapno = T_PAGEFLT;
1225 * If we are a user process in the vkernel,
1226 * pass the PF to the vkernel; it will trigger
1227 * user_trap()
1229 * If we are the vkernel itself, send ourselves
1230 * a SIGSEGV signal that will trigger the execution of
1231 * kern_trap()
1235 if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
1236 vkernel_trap(lp, &vti->guest);
1237 } else {
1238 trapsignal(lp, SIGSEGV, SEGV_MAPERR);
1241 break;
1242 default:
1243 kprintf("VMM: handle_vmx_vmexit: "
1244 "VMCS_EXCEPTION_HARDWARE unknown "
1245 "number %d rip: %llx, rsp: %llx\n",
1246 exception_number,
1247 (long long)vti->guest.tf_rip,
1248 (long long)vti->guest.tf_rsp);
1249 err = -1;
1250 goto error;
1252 } else if (exception_type == VMCS_EXCEPTION_SOFTWARE) {
1253 switch (exception_number) {
1254 case 3:
1255 dkprintf("VMM: handle_vmx_vmexit: "
1256 "VMCS_EXCEPTION_SOFTWARE "
1257 "number %d rip: %llx, rsp: %llx\n",
1258 exception_number,
1259 (long long)vti->guest.tf_rip,
1260 (long long)vti->guest.tf_rsp);
1262 vti->guest.tf_trapno = T_BPTFLT;
1263 vti->guest.tf_xflags = 0;
1264 vti->guest.tf_err = 0;
1265 vti->guest.tf_addr = 0;
1267 vti->guest.tf_rip +=
1268 vti->vmexit_instruction_length;
1270 trap(&vti->guest);
1271 break;
1272 default:
1273 kprintf("VMM: handle_vmx_vmexit: "
1274 "VMCS_EXCEPTION_SOFTWARE unknown "
1275 "number %d rip: %llx, rsp: %llx\n",
1276 exception_number,
1277 (long long)vti->guest.tf_rip,
1278 (long long)vti->guest.tf_rsp);
1279 err = -1;
1280 goto error;
1282 } else {
1283 kprintf("VMM: handle_vmx_vmexit: "
1284 "VMCS_EXCEPTION_ %d unknown\n",
1285 exception_type);
1286 err = -1;
1287 goto error;
1289 break;
1290 case EXIT_REASON_EXT_INTR:
1291 dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EXT_INTR\n");
1292 break;
1293 case EXIT_REASON_CPUID:
1294 dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_CPUID\n");
1297 * Execute CPUID instruction and pass
1298 * the result to the vkernel
1300 func = vti->guest.tf_rax;
1301 do_cpuid(func, regs);
1303 vti->guest.tf_rax = regs[0];
1304 vti->guest.tf_rbx = regs[1];
1305 vti->guest.tf_rcx = regs[2];
1306 vti->guest.tf_rdx = regs[3];
1308 vti->guest.tf_rip += vti->vmexit_instruction_length;
1310 break;
1311 case EXIT_REASON_EPT_FAULT:
1313 * EPT faults are resolved like normal PFs. Nothing special:
1314 * - get the fault type
1315 * - get the fault address (which is a GPA)
1316 * - execute vm_fault on the vm_map
1318 dkprintf("VMM: handle_vmx_vmexit: "
1319 "EXIT_REASON_EPT_FAULT with qualification %lld,"
1320 "GPA: %llx, fault_Type: %d\n",
1321 (long long)vti->vmexit_qualification,
1322 (unsigned long long)vti->guest_physical_address,
1323 fault_type);
1325 fault_type = vmx_ept_fault_type(vti->vmexit_qualification);
1327 if (fault_type & VM_PROT_WRITE)
1328 fault_flags = VM_FAULT_DIRTY;
1329 else
1330 fault_flags = VM_FAULT_NORMAL;
1332 rv = vm_fault(&curthread->td_lwp->lwp_vmspace->vm_map,
1333 trunc_page(vti->guest_physical_address),
1334 fault_type, fault_flags);
1336 if (rv != KERN_SUCCESS) {
1337 kprintf("VMM: handle_vmx_vmexit: "
1338 "EXIT_REASON_EPT_FAULT couldn't resolve %jx\n",
1339 (intmax_t)vti->guest_physical_address);
1340 err = -1;
1341 goto error;
1343 break;
1344 default:
1345 kprintf("VMM: handle_vmx_vmexit: "
1346 "unknown exit reason: %d with qualification %lld\n",
1347 exit_reason,
1348 (long long)vti->vmexit_qualification);
1349 err = -1;
1350 goto error;
1352 return 0;
1353 error:
1354 return err;
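/*
 * Main VM entry loop: point the lwp's md_regs at the guest trapframe,
 * handle cpu migration and pending ASTs, interlock against pmap
 * invalidations via p_vmm_cpumask/p_vmm_cpulock, refresh the volatile
 * guest/host VMCS fields, then vmlaunch/vmresume and service the
 * resulting VMEXIT until an error or an unhandled exit occurs.
 */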
1357 static int
1358 vmx_vmrun(void)
1360 struct vmx_thread_info * vti;
1361 struct globaldata *gd;
1362 int err;
1363 int ret;
1364 int sticks = 0;
1365 uint64_t val;
1366 cpulock_t olock;
1367 cpulock_t nlock;
1368 struct trapframe *save_frame;
1369 thread_t td = curthread;
1371 vti = (struct vmx_thread_info *) td->td_vmm;
1372 save_frame = td->td_lwp->lwp_md.md_regs;
1373 td->td_lwp->lwp_md.md_regs = &vti->guest;
1374 restart:
1375 lwkt_user_yield();
1376 crit_enter();
1379 * This can change the cpu we are running on.
1381 trap_handle_userexit(&vti->guest, sticks);
1382 gd = mycpu;
1384 ERROR2_IF(vti == NULL);
1385 ERROR2_IF(vmx_check_cpu_migration());
1386 ERROR2_IF(vmx_handle_cpu_migration());
1389 * Make the state safe to VMENTER
1390 * - disable interrupts and check if there were any pending
1391 * - check for ASTFLTs
1392 * - loop again until there are no ASTFLTs
1394 #if 0
1396 static int xcounter;
1397 if ((++xcounter & 65535) == 0)
1398 kprintf("x");
1400 #endif
1401 cpu_disable_intr();
1402 if (gd->gd_reqflags & RQF_AST_MASK) {
1403 atomic_clear_int(&gd->gd_reqflags, RQF_AST_SIGNAL);
1404 cpu_enable_intr();
1405 crit_exit();
1406 vti->guest.tf_trapno = T_ASTFLT;
1407 trap(&vti->guest);
1408 /* CURRENT CPU CAN CHANGE */
1409 goto restart;
1411 if (vti->last_cpu != gd->gd_cpuid) {
1412 cpu_enable_intr();
1413 crit_exit();
1414 kprintf("VMM: vmx_vmrun: vti unexpectedly "
1415 "changed cpus %d->%d\n",
1416 gd->gd_cpuid, vti->last_cpu);
1417 goto restart;
1421 * Add us to the list of cpus running vkernel operations, interlock
1422 * against anyone trying to do an invalidation.
1424 * We must set the cpumask first to ensure that we interlock another
1425 * cpu that may desire to IPI us after we have successfully
1426 * incremented the cpulock counter.
1428 ATOMIC_CPUMASK_ORBIT(td->td_proc->p_vmm_cpumask, gd->gd_cpuid);
1430 for (;;) {
1431 olock = td->td_proc->p_vmm_cpulock;
1432 cpu_ccfence();
1433 if ((olock & CPULOCK_EXCL) == 0) {
1434 nlock = olock + CPULOCK_INCR;
1435 if (atomic_cmpset_int(&td->td_proc->p_vmm_cpulock,
1436 olock, nlock)) {
1437 /* fast path */
1438 break;
1440 /* cmpset race */
1441 cpu_pause();
1442 continue;
1446 * More complex. After sleeping we have to re-test
1447 * everything.
1449 ATOMIC_CPUMASK_NANDBIT(td->td_proc->p_vmm_cpumask,
1450 gd->gd_cpuid);
1451 cpu_enable_intr();
1452 tsleep_interlock(&td->td_proc->p_vmm_cpulock, 0);
1453 if (td->td_proc->p_vmm_cpulock & CPULOCK_EXCL) {
1454 tsleep(&td->td_proc->p_vmm_cpulock, PINTERLOCKED,
1455 "vmminvl", hz);
1457 crit_exit();
1458 goto restart;
1462 * Load specific Guest registers
1463 * GP registers will be loaded in vmx_launch/resume
1465 ERROR_IF(vmwrite(VMCS_GUEST_RIP, vti->guest.tf_rip));
1466 ERROR_IF(vmwrite(VMCS_GUEST_CS_SELECTOR, vti->guest.tf_cs));
1467 ERROR_IF(vmwrite(VMCS_GUEST_RFLAGS, vti->guest.tf_rflags));
1468 ERROR_IF(vmwrite(VMCS_GUEST_RSP, vti->guest.tf_rsp));
1469 ERROR_IF(vmwrite(VMCS_GUEST_SS_SELECTOR, vti->guest.tf_ss));
1470 ERROR_IF(vmwrite(VMCS_GUEST_CR3, (uint64_t) vti->guest_cr3));
1473 * FPU
1475 if (mdcpu->gd_npxthread != td) {
1476 if (mdcpu->gd_npxthread)
1477 npxsave(mdcpu->gd_npxthread->td_savefpu);
1478 npxdna();
1482 * The kernel caches the MSR_FSBASE value in mdcpu->gd_user_fs.
1483 * A vmexit loads this unconditionally from the VMCS so make
1484 * sure it loads the correct value.
1486 ERROR_IF(vmwrite(VMCS_HOST_FS_BASE, mdcpu->gd_user_fs));
1489 * EPT mappings can't be invalidated with normal invlpg/invltlb
1490 * instructions. We have to execute a special instruction that
1491 * invalidates all EPT cache ("invept").
1493 * pm_invgen is a generation number which is incremented in
1494 * pmap_inval_smp*(), before doing any invalidates. This will
1495 * cause all CPUs that are using the EPT to VMEXIT and wait for
1496 * the interlock to complete. When they VMENTER they will see that
1497 * the generation number has changed from their cached value and
1498 * do an invept.
1500 if (vti->eptgen != td->td_proc->p_vmspace->vm_pmap.pm_invgen) {
1501 vti->eptgen = td->td_proc->p_vmspace->vm_pmap.pm_invgen;
1503 ERROR_IF(invept(INVEPT_TYPE_SINGLE_CONTEXT,
1504 (uint64_t*)&vti->invept_desc));
1507 if (vti->launched) { /* vmresume called from vmx_trap.s */
1508 dkprintf("\n\nVMM: vmx_vmrun: vmx_resume\n");
1509 ret = vmx_resume(vti);
1511 } else { /* vmlaunch called from vmx_trap.s */
1512 dkprintf("\n\nVMM: vmx_vmrun: vmx_launch\n");
1513 vti->launched = 1;
1514 ret = vmx_launch(vti);
1518 * This is our return point from the vmlaunch/vmresume
1519 * There are two situations:
1520 * - the vmlaunch/vmresume executed successfully and they
1521 * would return through "vmx_vmexit" which will restore
1522 * the state (registers) and return here with the ret
1523 * set to VM_EXIT (ret is actually %rax)
1524 * - the vmlaunch/vmresume failed to execute and will return
1525 * immediately with ret set to the error code
1527 if (ret == VM_EXIT) {
1528 ERROR_IF(vmx_vmexit_loadinfo());
1530 ATOMIC_CPUMASK_NANDBIT(td->td_proc->p_vmm_cpumask,
1531 gd->gd_cpuid);
1532 atomic_add_int(&td->td_proc->p_vmm_cpulock,
1533 -CPULOCK_INCR);
1534 /* WARNING: don't adjust cpulock twice! */
1536 cpu_enable_intr();
1537 trap_handle_userenter(td);
1538 sticks = td->td_sticks;
1539 crit_exit();
1542 * Handle the VMEXIT reason
1543 * - if successful we VMENTER again
1544 * - if not, we exit
1546 if (vmx_handle_vmexit())
1547 goto done;
1550 * We handled the VMEXIT reason and continue with
1551 * VM execution
1553 goto restart;
1555 } else {
1556 vti->launched = 0;
1559 * Two types of error:
1560 * - VM_FAIL_VALID - the host state was ok,
1561 * but probably the guest state was not
1562 * - VM_FAIL_INVALID - the parameters or the host state
1563 * was not ok
1565 if (ret == VM_FAIL_VALID) {
1566 vmread(VMCS_INSTR_ERR, &val);
1567 err = (int) val;
1568 kprintf("VMM: vmx_vmrun: vmenter failed with "
1569 "VM_FAIL_VALID, error code %d\n",
1570 err);
1571 } else {
1572 kprintf("VMM: vmx_vmrun: vmenter failed with "
1573 "VM_FAIL_INVALID\n");
1575 goto error;
1577 done:
1578 kprintf("VMM: vmx_vmrun: returning with success\n");
1579 return 0;
1580 error:
1581 ATOMIC_CPUMASK_NANDBIT(td->td_proc->p_vmm_cpumask, gd->gd_cpuid);
1582 atomic_add_int(&td->td_proc->p_vmm_cpulock, -CPULOCK_INCR);
1583 cpu_enable_intr();
1584 error2:
1585 trap_handle_userenter(td);
1586 td->td_lwp->lwp_md.md_regs = save_frame;
1587 KKASSERT(CPUMASK_TESTMASK(td->td_proc->p_vmm_cpumask,
1588 gd->gd_cpumask) == 0);
1589 /*atomic_clear_cpumask(&td->td_proc->p_vmm_cpumask, gd->gd_cpumask);*/
1590 crit_exit();
1591 kprintf("VMM: vmx_vmrun failed\n");
1592 return err;
1596 * Called when returning to user-space
1597 * after executing lwp_fork.
1599 static void
1600 vmx_lwp_return(struct lwp *lp, struct trapframe *frame)
1602 struct vmm_guest_options options;
1603 int vmrun_err;
1604 struct vmm_proc *p_vmm = (struct vmm_proc *)curproc->p_vmm;
1606 dkprintf("VMM: vmx_lwp_return \n");
1608 bzero(&options, sizeof(struct vmm_guest_options));
1610 bcopy(frame, &options.tf, sizeof(struct trapframe));
1612 options.guest_cr3 = p_vmm->guest_cr3;
1613 options.vmm_cr3 = p_vmm->vmm_cr3;
1615 vmx_vminit(&options);
1616 generic_lwp_return(lp, frame);
1618 vmrun_err = vmx_vmrun();
1620 exit1(W_EXITCODE(vmrun_err, 0));
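/* Record the guest CR3 that will be written to the VMCS on the next VM entry. */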
1623 static void
1624 vmx_set_guest_cr3(register_t guest_cr3)
1626 struct vmx_thread_info *vti = (struct vmx_thread_info *) curthread->td_vmm;
1627 vti->guest_cr3 = guest_cr3;
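/*
 * Translate a vkernel user address into a guest physical address using
 * the vkernel's page tables.
 */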
1630 static int
1631 vmx_vm_get_gpa(struct proc *p, register_t *gpa, register_t uaddr)
1633 return guest_phys_addr(p->p_vmspace, gpa, p->p_vkernel->vkernel_cr3, uaddr);
1636 static struct vmm_ctl ctl_vmx = {
1637 .name = "VMX from Intel",
1638 .init = vmx_init,
1639 .enable = vmx_enable,
1640 .disable = vmx_disable,
1641 .vminit = vmx_vminit,
1642 .vmdestroy = vmx_vmdestroy,
1643 .vmrun = vmx_vmrun,
1644 .vm_set_tls_area = vmx_set_tls_area,
1645 .vm_lwp_return = vmx_lwp_return,
1646 .vm_set_guest_cr3 = vmx_set_guest_cr3,
1647 .vm_get_gpa = vmx_vm_get_gpa,
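/* Return the Intel VMX backend operations to the generic vmm layer. */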
1650 struct vmm_ctl*
1651 get_ctl_intel(void)
1653 return &ctl_vmx;