From e1ea8b243e0eeb23a3181ea4ba7c273ac2f7048a Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Fri, 7 Oct 2016 19:10:06 -0700 Subject: [PATCH] kernel - Fix a system lockup with vmm * Fix an issue where vkernel_lwp_exit() was improperly trying to kfree() the vklp->ve pointer for the guest-thread case. This field holds a user-supplied address in that case, not a kernel structure. * Yield the cpu more aggressively in the VMM_GUEST_RUN loop. We were testing for pending interrupts but we were not calling lwkt_switch() * Do not exit the vkernel on a call or jump to address 0. This debugging code should have been removed and wasn't. A user process running under the vkernel could cause the vkernel itself to exit. * Numerous syntactical cleanups. Reported-by: tuxillo --- sys/kern/sys_vmm.c | 82 +++--- sys/platform/pc64/include/pmap.h | 2 + sys/platform/pc64/vmm/ept.c | 17 +- sys/platform/pc64/vmm/vmm.c | 6 + sys/platform/pc64/vmm/vmm.h | 7 + sys/platform/pc64/vmm/vmx.c | 563 +++++++++++++++++++++------------------ sys/platform/pc64/x86_64/pmap.c | 2 + sys/sys/proc.h | 2 +- sys/vm/vm_vmspace.c | 23 +- 9 files changed, 388 insertions(+), 316 deletions(-) diff --git a/sys/kern/sys_vmm.c b/sys/kern/sys_vmm.c index 1c2f9ff6e6..cf3ad31193 100644 --- a/sys/kern/sys_vmm.c +++ b/sys/kern/sys_vmm.c @@ -63,58 +63,58 @@ sys_vmm_guest_ctl(struct vmm_guest_ctl_args *uap) clear_quickret(); switch (uap->op) { - case VMM_GUEST_RUN: - error = copyin(uap->options, &options, - sizeof(struct vmm_guest_options)); + case VMM_GUEST_RUN: + error = copyin(uap->options, &options, + sizeof(struct vmm_guest_options)); + if (error) { + kprintf("%s: error copyin vmm_guest_options\n", + __func__); + goto out; + } + + while(stack_limit > tf->tf_sp) { + stack_limit -= PAGE_SIZE; + options.new_stack -= PAGE_SIZE; + + error = copyin((const void *)stack_limit, + (void *)stack_page, PAGE_SIZE); if (error) { - kprintf("%s: error copyin vmm_guest_options\n", - __func__); + kprintf("%s: error copyin stack\n", + __func__); goto out; } - while(stack_limit > tf->tf_sp) { - stack_limit -= PAGE_SIZE; - options.new_stack -= PAGE_SIZE; - - error = copyin((const void *)stack_limit, - (void *)stack_page, PAGE_SIZE); - if (error) { - kprintf("%s: error copyin stack\n", - __func__); - goto out; - } - - error = copyout((const void *)stack_page, - (void *)options.new_stack, PAGE_SIZE); - if (error) { - kprintf("%s: error copyout stack\n", - __func__); - goto out; - } + error = copyout((const void *)stack_page, + (void *)options.new_stack, PAGE_SIZE); + if (error) { + kprintf("%s: error copyout stack\n", + __func__); + goto out; } + } - bcopy(tf, &options.tf, sizeof(struct trapframe)); + bcopy(tf, &options.tf, sizeof(struct trapframe)); - error = vmm_vminit(&options); - if (error) { - if (error == ENODEV) { - kprintf("%s: vmm_vminit failed - " - "no VMM available \n", __func__); - goto out; - } - kprintf("%s: vmm_vminit failed\n", __func__); - goto out_exit; + error = vmm_vminit(&options); + if (error) { + if (error == ENODEV) { + kprintf("%s: vmm_vminit failed - " + "no VMM available \n", __func__); + goto out; } + kprintf("%s: vmm_vminit failed\n", __func__); + goto out_exit; + } - generic_lwp_return(curthread->td_lwp, tf); + generic_lwp_return(curthread->td_lwp, tf); - error = vmm_vmrun(); + error = vmm_vmrun(); - break; - default: - kprintf("%s: INVALID op\n", __func__); - error = EINVAL; - goto out; + break; + default: + kprintf("%s: INVALID op\n", __func__); + error = EINVAL; + goto out; } out_exit: exit1(W_EXITCODE(error, 0)); diff --git a/sys/platform/pc64/include/pmap.h b/sys/platform/pc64/include/pmap.h index e3486eb10c..54ca8b38f3 100644 --- a/sys/platform/pc64/include/pmap.h +++ b/sys/platform/pc64/include/pmap.h @@ -301,6 +301,8 @@ struct pmap { #define PMAP_FLAG_SIMPLE 0x00000001 #define PMAP_EMULATE_AD_BITS 0x00000002 +#define PMAP_HVM 0x00000004 +#define PMAP_SEGSHARED 0x00000008 /* segment shared opt */ #define pmap_resident_count(pmap) (pmap)->pm_stats.resident_count diff --git a/sys/platform/pc64/vmm/ept.c b/sys/platform/pc64/vmm/ept.c index 0cae3a6323..214b67bfd7 100644 --- a/sys/platform/pc64/vmm/ept.c +++ b/sys/platform/pc64/vmm/ept.c @@ -48,13 +48,14 @@ #include "vmx.h" #include "ept.h" #include "vmm_utils.h" +#include "vmm.h" static uint64_t pmap_bits_ept[PG_BITS_SIZE]; static pt_entry_t pmap_cache_bits_ept[PAT_INDEX_SIZE]; static int ept_protection_codes[PROTECTION_CODES_SIZE]; static pt_entry_t pmap_cache_mask_ept; -static int pmap_pm_flags_ept; +static int pmap_pm_flags_ept = PMAP_HVM; static int eptp_bits; extern uint64_t vmx_ept_vpid_cap; @@ -79,7 +80,7 @@ vmx_ept_init(void) if (EPT_AD_BITS_SUPPORTED(vmx_ept_vpid_cap)) { eptp_bits |= EPTP_AD_ENABLE; } else { - pmap_pm_flags_ept = PMAP_EMULATE_AD_BITS; + pmap_pm_flags_ept |= PMAP_EMULATE_AD_BITS; } /* Initialize EPT bits @@ -165,8 +166,10 @@ ept_copyin(const void *udaddr, void *kaddr, size_t len) m = vm_fault_page(&vm->vm_map, trunc_page(gpa), VM_PROT_READ, VM_FAULT_NORMAL, &err); if (err) { - kprintf("%s: could not fault in vm map, gpa: %llx\n", - __func__, (unsigned long long) gpa); + if (vmm_debug) { + kprintf("%s: could not fault in vm map, gpa: %llx\n", + __func__, (unsigned long long) gpa); + } break; } @@ -213,8 +216,10 @@ ept_copyout(const void *kaddr, void *udaddr, size_t len) VM_PROT_READ | VM_PROT_WRITE, VM_FAULT_NORMAL, &err); if (err) { - kprintf("%s: could not fault in vm map, gpa: %llx\n", - __func__, (unsigned long long) gpa); + if (vmm_debug) { + kprintf("%s: could not fault in vm map, gpa: %llx\n", + __func__, (unsigned long long) gpa); + } break; } diff --git a/sys/platform/pc64/vmm/vmm.c b/sys/platform/pc64/vmm/vmm.c index 493923b868..4edf49b199 100644 --- a/sys/platform/pc64/vmm/vmm.c +++ b/sys/platform/pc64/vmm/vmm.c @@ -50,6 +50,7 @@ struct sysctl_ctx_list vmm_sysctl_ctx; struct sysctl_oid *vmm_sysctl_tree; int vmm_enabled; +int vmm_debug; static int sysctl_vmm_enable(SYSCTL_HANDLER_ARGS) @@ -125,6 +126,11 @@ vmm_init(void) OID_AUTO, "enable", CTLTYPE_INT | CTLFLAG_WR, NULL, sizeof vmm_enabled, sysctl_vmm_enable, "I", "Control the state of the VMM"); + SYSCTL_ADD_INT(&vmm_sysctl_ctx, + SYSCTL_CHILDREN(vmm_sysctl_tree), + OID_AUTO, "debug", CTLTYPE_INT | CTLFLAG_RW, + &vmm_debug, 0, + "vmm debugging"); if (ctl->enable()) { kprintf("VMM: vmm enable() failed\n"); diff --git a/sys/platform/pc64/vmm/vmm.h b/sys/platform/pc64/vmm/vmm.h index 4580f84db7..d4d2d4bad0 100644 --- a/sys/platform/pc64/vmm/vmm.h +++ b/sys/platform/pc64/vmm/vmm.h @@ -90,4 +90,11 @@ struct vmm_proc { struct vmm_ctl* get_ctl_intel(void); struct vmm_ctl* get_ctl_amd(void); +#ifdef _KERNEL + +extern int vmm_enabled; +extern int vmm_debug; + +#endif + #endif diff --git a/sys/platform/pc64/vmm/vmx.c b/sys/platform/pc64/vmm/vmx.c index 606b39c41b..06bf895526 100644 --- a/sys/platform/pc64/vmm/vmx.c +++ b/sys/platform/pc64/vmm/vmx.c @@ -154,52 +154,47 @@ vmx_set_ctl_setting(struct vmx_ctl_info *vmx_ctl, uint32_t bit_no, setting_t val /* Check if the value is known by VMM or set on DEFAULT */ switch(value) { - case DEFAULT: - /* Both settings are allowd - * - step b.iii) - * or - * - c.iii), c.iv) - */ - if (IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no) - && IS_ONE_SETTING_ALLOWED(ctl_val, bit_no)) { - - /* For c.iii) and c.iv) */ - if(IS_TRUE_CTL_AVAIL(vmx_basic)) - ctl_val = rdmsr(vmx_ctl->msr_addr); - - if (IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no)) - vmx_ctl->ctls &= ~BIT(bit_no); - else if (IS_ONE_SETTING_ALLOWED(ctl_val, bit_no)) - vmx_ctl->ctls |= BIT(bit_no); - - } else if (IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no)) { - /* b.i), c.i) */ - vmx_ctl->ctls &= ~BIT(bit_no); + case DEFAULT: + /* + * Both settings are allowd + * - step b.iii) + * or + * - c.iii), c.iv) + */ + if (IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no) && + IS_ONE_SETTING_ALLOWED(ctl_val, bit_no)) { + /* For c.iii) and c.iv) */ + if (IS_TRUE_CTL_AVAIL(vmx_basic)) + ctl_val = rdmsr(vmx_ctl->msr_addr); - } else if (IS_ONE_SETTING_ALLOWED(ctl_val, bit_no)) { - /* b.i), c.i) */ + if (IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no)) + vmx_ctl->ctls &= ~BIT(bit_no); + else if (IS_ONE_SETTING_ALLOWED(ctl_val, bit_no)) vmx_ctl->ctls |= BIT(bit_no); - - } else { - return (EINVAL); - } - break; - case ZERO: - /* For b.ii) or c.ii) */ - if (!IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no)) - return (EINVAL); - + } else if (IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no)) { + /* b.i), c.i) */ vmx_ctl->ctls &= ~BIT(bit_no); - - break; - case ONE: - /* For b.ii) or c.ii) */ - if (!IS_ONE_SETTING_ALLOWED(ctl_val, bit_no)) - return (EINVAL); - + } else if (IS_ONE_SETTING_ALLOWED(ctl_val, bit_no)) { + /* b.i), c.i) */ vmx_ctl->ctls |= BIT(bit_no); - - break; + } else { + return (EINVAL); + } + break; + case ZERO: + /* For b.ii) or c.ii) */ + if (!IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no)) + return (EINVAL); + vmx_ctl->ctls &= ~BIT(bit_no); + break; + case ONE: + /* For b.ii) or c.ii) */ + if (!IS_ONE_SETTING_ALLOWED(ctl_val, bit_no)) + return (EINVAL); + + vmx_ctl->ctls |= BIT(bit_no); + + break; } return 0; } @@ -209,7 +204,7 @@ vmx_set_default_settings(struct vmx_ctl_info *vmx_ctl) { int i; - for(i = 0; i < 32; i++) { + for (i = 0; i < 32; i++) { vmx_set_ctl_setting(vmx_ctl, i, DEFAULT); } } @@ -645,59 +640,59 @@ static int vmx_set_guest_descriptor(descriptor_t type, } switch(type) { - case ES: - selector_enc = VMCS_GUEST_ES_SELECTOR; - rights_enc = VMCS_GUEST_ES_ACCESS_RIGHTS; - base_enc = VMCS_GUEST_ES_BASE; - limit_enc = VMCS_GUEST_ES_LIMIT; - break; - case CS: - selector_enc = VMCS_GUEST_CS_SELECTOR; - rights_enc = VMCS_GUEST_CS_ACCESS_RIGHTS; - base_enc = VMCS_GUEST_CS_BASE; - limit_enc = VMCS_GUEST_CS_LIMIT; - break; - case SS: - selector_enc = VMCS_GUEST_SS_SELECTOR; - rights_enc = VMCS_GUEST_SS_ACCESS_RIGHTS; - base_enc = VMCS_GUEST_SS_BASE; - limit_enc = VMCS_GUEST_SS_LIMIT; - break; - case DS: - selector_enc = VMCS_GUEST_DS_SELECTOR; - rights_enc = VMCS_GUEST_DS_ACCESS_RIGHTS; - base_enc = VMCS_GUEST_DS_BASE; - limit_enc = VMCS_GUEST_DS_LIMIT; - break; - case FS: - selector_enc = VMCS_GUEST_FS_SELECTOR; - rights_enc = VMCS_GUEST_FS_ACCESS_RIGHTS; - base_enc = VMCS_GUEST_FS_BASE; - limit_enc = VMCS_GUEST_FS_LIMIT; - break; - case GS: - selector_enc = VMCS_GUEST_GS_SELECTOR; - rights_enc = VMCS_GUEST_GS_ACCESS_RIGHTS; - base_enc = VMCS_GUEST_GS_BASE; - limit_enc = VMCS_GUEST_GS_LIMIT; - break; - case LDTR: - selector_enc = VMCS_GUEST_LDTR_SELECTOR; - rights_enc = VMCS_GUEST_LDTR_ACCESS_RIGHTS; - base_enc = VMCS_GUEST_LDTR_BASE; - limit_enc = VMCS_GUEST_LDTR_LIMIT; - break; - case TR: - selector_enc = VMCS_GUEST_TR_SELECTOR; - rights_enc = VMCS_GUEST_TR_ACCESS_RIGHTS; - base_enc = VMCS_GUEST_TR_BASE; - limit_enc = VMCS_GUEST_TR_LIMIT; - break; - default: - kprintf("VMM: vmx_set_guest_descriptor: unknown descriptor\n"); - err = -1; - goto error; - break; + case ES: + selector_enc = VMCS_GUEST_ES_SELECTOR; + rights_enc = VMCS_GUEST_ES_ACCESS_RIGHTS; + base_enc = VMCS_GUEST_ES_BASE; + limit_enc = VMCS_GUEST_ES_LIMIT; + break; + case CS: + selector_enc = VMCS_GUEST_CS_SELECTOR; + rights_enc = VMCS_GUEST_CS_ACCESS_RIGHTS; + base_enc = VMCS_GUEST_CS_BASE; + limit_enc = VMCS_GUEST_CS_LIMIT; + break; + case SS: + selector_enc = VMCS_GUEST_SS_SELECTOR; + rights_enc = VMCS_GUEST_SS_ACCESS_RIGHTS; + base_enc = VMCS_GUEST_SS_BASE; + limit_enc = VMCS_GUEST_SS_LIMIT; + break; + case DS: + selector_enc = VMCS_GUEST_DS_SELECTOR; + rights_enc = VMCS_GUEST_DS_ACCESS_RIGHTS; + base_enc = VMCS_GUEST_DS_BASE; + limit_enc = VMCS_GUEST_DS_LIMIT; + break; + case FS: + selector_enc = VMCS_GUEST_FS_SELECTOR; + rights_enc = VMCS_GUEST_FS_ACCESS_RIGHTS; + base_enc = VMCS_GUEST_FS_BASE; + limit_enc = VMCS_GUEST_FS_LIMIT; + break; + case GS: + selector_enc = VMCS_GUEST_GS_SELECTOR; + rights_enc = VMCS_GUEST_GS_ACCESS_RIGHTS; + base_enc = VMCS_GUEST_GS_BASE; + limit_enc = VMCS_GUEST_GS_LIMIT; + break; + case LDTR: + selector_enc = VMCS_GUEST_LDTR_SELECTOR; + rights_enc = VMCS_GUEST_LDTR_ACCESS_RIGHTS; + base_enc = VMCS_GUEST_LDTR_BASE; + limit_enc = VMCS_GUEST_LDTR_LIMIT; + break; + case TR: + selector_enc = VMCS_GUEST_TR_SELECTOR; + rights_enc = VMCS_GUEST_TR_ACCESS_RIGHTS; + base_enc = VMCS_GUEST_TR_BASE; + limit_enc = VMCS_GUEST_TR_LIMIT; + break; + default: + kprintf("VMM: vmx_set_guest_descriptor: unknown descriptor\n"); + err = -1; + goto error; + break; } ERROR_IF(vmwrite(selector_enc, selector)); @@ -1139,183 +1134,220 @@ vmx_handle_vmexit(void) exit_reason = VMCS_BASIC_EXIT_REASON(vti->vmexit_reason); switch (exit_reason) { - case EXIT_REASON_EXCEPTION: - dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EXCEPTION with qualification " - "%llx, interruption info %llx, interruption error %llx, instruction " - "length %llx\n", - (long long) vti->vmexit_qualification, - (long long) vti->vmexit_interruption_info, - (long long) vti->vmexit_interruption_error, - (long long) vti->vmexit_instruction_length); - - dkprintf("VMM: handle_vmx_vmexit: rax: %llx, rip: %llx, " - "rsp: %llx, rdi: %llx, rsi: %llx, %d, vti: %p, master: %p\n", - (long long)vti->guest.tf_rax, - (long long)vti->guest.tf_rip, - (long long)vti->guest.tf_rsp, - (long long)vti->guest.tf_rdi, - (long long)vti->guest.tf_rsi, exit_reason, vti, curproc->p_vmm); - - exception_type = VMCS_EXCEPTION_TYPE(vti->vmexit_interruption_info); - exception_number = VMCS_EXCEPTION_NUMBER(vti->vmexit_interruption_info); - - if (exception_type == VMCS_EXCEPTION_HARDWARE) { - switch (exception_number) { - case IDT_UD: - /* - * Disabled "syscall" instruction and - * now we catch it for executing - */ - dkprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_HARDWARE IDT_UD\n"); + case EXIT_REASON_EXCEPTION: + dkprintf("VMM: handle_vmx_vmexit: " + "EXIT_REASON_EXCEPTION with qualification " + "%llx, interruption info %llx, " + "interruption error %llx, instruction " + "length %llx\n", + (long long) vti->vmexit_qualification, + (long long) vti->vmexit_interruption_info, + (long long) vti->vmexit_interruption_error, + (long long) vti->vmexit_instruction_length); + + dkprintf("VMM: handle_vmx_vmexit: rax: %llx, rip: %llx, " + "rsp: %llx, rdi: %llx, " + "rsi: %llx, %d, " + "vti: %p, master: %p\n", + (long long)vti->guest.tf_rax, + (long long)vti->guest.tf_rip, + (long long)vti->guest.tf_rsp, + (long long)vti->guest.tf_rdi, + (long long)vti->guest.tf_rsi, + exit_reason, vti, curproc->p_vmm); + + exception_type = + VMCS_EXCEPTION_TYPE(vti->vmexit_interruption_info); + exception_number = + VMCS_EXCEPTION_NUMBER(vti->vmexit_interruption_info); + + if (exception_type == VMCS_EXCEPTION_HARDWARE) { + switch (exception_number) { + case IDT_UD: + /* + * Disabled "syscall" instruction and + * now we catch it for executing + */ + dkprintf("VMM: handle_vmx_vmexit: " + "VMCS_EXCEPTION_HARDWARE IDT_UD\n"); #ifdef VMM_DEBUG - /* Check to see if its syscall asm instuction */ - uint8_t instr[INSTRUCTION_MAX_LENGTH]; - if (copyin((const void *) vti->guest.tf_rip, instr, vti->vmexit_instruction_length) && - instr_check(&syscall_asm,(void *) instr, (uint8_t) vti->vmexit_instruction_length)) { - kprintf("VMM: handle_vmx_vmexit: UD different from syscall: "); - db_disasm((db_addr_t) instr, FALSE, NULL); - } + /* Check to see if its syscall asm instuction */ + uint8_t instr[INSTRUCTION_MAX_LENGTH]; + if (copyin((const void *)vti->guest.tf_rip, + instr, + vti->vmexit_instruction_length) && + instr_check(&syscall_asm,(void *)instr, + (uint8_t)vti->vmexit_instruction_length)) { + kprintf("VMM: handle_vmx_vmexit: " + "UD different from syscall: "); + db_disasm((db_addr_t)instr, FALSE, NULL); + } #endif - /* Called to force a VMEXIT and invalidate TLB */ - if (vti->guest.tf_rax == -1) { - vti->guest.tf_rip += vti->vmexit_instruction_length; - break; - } - - vti->guest.tf_err = 2; - vti->guest.tf_trapno = T_FAST_SYSCALL; - vti->guest.tf_xflags = 0; - - vti->guest.tf_rip += vti->vmexit_instruction_length; - - syscall2(&vti->guest); - - break; - case IDT_PF: - dkprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_HARDWARE IDT_PF at %llx\n", - (long long) vti->guest.tf_rip); - - if (vti->guest.tf_rip == 0) { - kprintf("VMM: handle_vmx_vmexit: Terminating...\n"); - err = -1; - goto error; - } - - vti->guest.tf_err = vti->vmexit_interruption_error; - vti->guest.tf_addr = vti->vmexit_qualification; - vti->guest.tf_xflags = 0; - vti->guest.tf_trapno = T_PAGEFLT; - - /* - * If we are a user process in the vkernel - * pass the PF to the vkernel and will trigger - * the user_trap() - * - * If we are the vkernel, send a SIGSEGV signal - * to us that will trigger the execution of - * kern_trap() - * - */ - - if (lp->lwp_vkernel && lp->lwp_vkernel->ve) { - vkernel_trap(lp, &vti->guest); - } else { - trapsignal(lp, SIGSEGV, SEGV_MAPERR); - } - - break; - default: - kprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_HARDWARE unknown " - "number %d rip: %llx, rsp: %llx\n", exception_number, - (long long)vti->guest.tf_rip, (long long)vti->guest.tf_rsp); - err = -1; - goto error; + /* Called to force a VMEXIT and invalidate TLB */ + if (vti->guest.tf_rax == -1) { + vti->guest.tf_rip += + vti->vmexit_instruction_length; + break; } - } else if (exception_type == VMCS_EXCEPTION_SOFTWARE) { - switch (exception_number) { - case 3: - dkprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_SOFTWARE " - "number %d rip: %llx, rsp: %llx\n", exception_number, - (long long)vti->guest.tf_rip, (long long)vti->guest.tf_rsp); - - vti->guest.tf_trapno = T_BPTFLT; - vti->guest.tf_xflags = 0; - vti->guest.tf_err = 0; - vti->guest.tf_addr = 0; - - vti->guest.tf_rip += vti->vmexit_instruction_length; - - trap(&vti->guest); - - break; - default: - kprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_SOFTWARE unknown " - "number %d rip: %llx, rsp: %llx\n", exception_number, - (long long)vti->guest.tf_rip, (long long)vti->guest.tf_rsp); - err = -1; - goto error; + + vti->guest.tf_err = 2; + vti->guest.tf_trapno = T_FAST_SYSCALL; + vti->guest.tf_xflags = 0; + + vti->guest.tf_rip += + vti->vmexit_instruction_length; + + syscall2(&vti->guest); + break; + case IDT_PF: + dkprintf("VMM: handle_vmx_vmexit: " + "VMCS_EXCEPTION_HARDWARE IDT_PF " + "at %llx\n", + (long long) vti->guest.tf_rip); + +#if 0 + if (vti->guest.tf_rip == 0) { + kprintf("VMM: handle_vmx_vmexit: " + "Terminating...\n"); + err = -1; + goto error; } - } else { - kprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_ %d unknown\n", exception_type); +#endif + + vti->guest.tf_err = + vti->vmexit_interruption_error; + vti->guest.tf_addr = + vti->vmexit_qualification; + vti->guest.tf_xflags = 0; + vti->guest.tf_trapno = T_PAGEFLT; + + /* + * If we are a user process in the vkernel + * pass the PF to the vkernel and will trigger + * the user_trap() + * + * If we are the vkernel, send a SIGSEGV signal + * to us that will trigger the execution of + * kern_trap() + * + */ + + if (lp->lwp_vkernel && lp->lwp_vkernel->ve) { + vkernel_trap(lp, &vti->guest); + } else { + trapsignal(lp, SIGSEGV, SEGV_MAPERR); + } + + break; + default: + kprintf("VMM: handle_vmx_vmexit: " + "VMCS_EXCEPTION_HARDWARE unknown " + "number %d rip: %llx, rsp: %llx\n", + exception_number, + (long long)vti->guest.tf_rip, + (long long)vti->guest.tf_rsp); err = -1; goto error; } - break; - case EXIT_REASON_EXT_INTR: - dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EXT_INTR\n"); - break; - case EXIT_REASON_CPUID: - dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_CPUID\n"); - - /* - * Execute CPUID instruction and pass - * the result to the vkernel - */ - - func = vti->guest.tf_rax; - do_cpuid(func, regs); - - vti->guest.tf_rax = regs[0]; - vti->guest.tf_rbx = regs[1]; - vti->guest.tf_rcx = regs[2]; - vti->guest.tf_rdx = regs[3]; - - vti->guest.tf_rip += vti->vmexit_instruction_length; - - break; - case EXIT_REASON_EPT_FAULT: - /* - * EPT_FAULT are resolved like normal PFs. Nothing special - * - get the fault type - * - get the fault address (which is a GPA) - * - execute vm_fault on the vm_map - */ - dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EPT_FAULT with qualification %lld," - "GPA: %llx, fault_Type: %d\n",(long long) vti->vmexit_qualification, - (unsigned long long) vti->guest_physical_address, fault_type); - - fault_type = vmx_ept_fault_type(vti->vmexit_qualification); - - if (fault_type & VM_PROT_WRITE) - fault_flags = VM_FAULT_DIRTY; - else - fault_flags = VM_FAULT_NORMAL; - - rv = vm_fault(&curthread->td_lwp->lwp_vmspace->vm_map, - trunc_page(vti->guest_physical_address), fault_type, fault_flags); - - if (rv != KERN_SUCCESS) { - kprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EPT_FAULT couldn't resolve %llx\n", - (unsigned long long) vti->guest_physical_address); + } else if (exception_type == VMCS_EXCEPTION_SOFTWARE) { + switch (exception_number) { + case 3: + dkprintf("VMM: handle_vmx_vmexit: " + "VMCS_EXCEPTION_SOFTWARE " + "number %d rip: %llx, rsp: %llx\n", + exception_number, + (long long)vti->guest.tf_rip, + (long long)vti->guest.tf_rsp); + + vti->guest.tf_trapno = T_BPTFLT; + vti->guest.tf_xflags = 0; + vti->guest.tf_err = 0; + vti->guest.tf_addr = 0; + + vti->guest.tf_rip += + vti->vmexit_instruction_length; + + trap(&vti->guest); + break; + default: + kprintf("VMM: handle_vmx_vmexit: " + "VMCS_EXCEPTION_SOFTWARE unknown " + "number %d rip: %llx, rsp: %llx\n", + exception_number, + (long long)vti->guest.tf_rip, + (long long)vti->guest.tf_rsp); err = -1; goto error; } - break; - default: - kprintf("VMM: handle_vmx_vmexit: unknown exit reason: %d with qualification %lld\n", - exit_reason, (long long) vti->vmexit_qualification); + } else { + kprintf("VMM: handle_vmx_vmexit: " + "VMCS_EXCEPTION_ %d unknown\n", + exception_type); + err = -1; + goto error; + } + break; + case EXIT_REASON_EXT_INTR: + dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EXT_INTR\n"); + break; + case EXIT_REASON_CPUID: + dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_CPUID\n"); + + /* + * Execute CPUID instruction and pass + * the result to the vkernel + */ + func = vti->guest.tf_rax; + do_cpuid(func, regs); + + vti->guest.tf_rax = regs[0]; + vti->guest.tf_rbx = regs[1]; + vti->guest.tf_rcx = regs[2]; + vti->guest.tf_rdx = regs[3]; + + vti->guest.tf_rip += vti->vmexit_instruction_length; + + break; + case EXIT_REASON_EPT_FAULT: + /* + * EPT_FAULT are resolved like normal PFs. Nothing special + * - get the fault type + * - get the fault address (which is a GPA) + * - execute vm_fault on the vm_map + */ + dkprintf("VMM: handle_vmx_vmexit: " + "EXIT_REASON_EPT_FAULT with qualification %lld," + "GPA: %llx, fault_Type: %d\n", + (long long)vti->vmexit_qualification, + (unsigned long long)vti->guest_physical_address, + fault_type); + + fault_type = vmx_ept_fault_type(vti->vmexit_qualification); + + if (fault_type & VM_PROT_WRITE) + fault_flags = VM_FAULT_DIRTY; + else + fault_flags = VM_FAULT_NORMAL; + + rv = vm_fault(&curthread->td_lwp->lwp_vmspace->vm_map, + trunc_page(vti->guest_physical_address), + fault_type, fault_flags); + + if (rv != KERN_SUCCESS) { + kprintf("VMM: handle_vmx_vmexit: " + "EXIT_REASON_EPT_FAULT couldn't resolve %jx\n", + (intmax_t)vti->guest_physical_address); err = -1; goto error; + } + break; + default: + kprintf("VMM: handle_vmx_vmexit: " + "unknown exit reason: %d with qualification %lld\n", + exit_reason, + (long long)vti->vmexit_qualification); + err = -1; + goto error; } return 0; error: @@ -1340,6 +1372,7 @@ vmx_vmrun(void) save_frame = td->td_lwp->lwp_md.md_regs; td->td_lwp->lwp_md.md_regs = &vti->guest; restart: + lwkt_user_yield(); crit_enter(); /* @@ -1358,8 +1391,14 @@ restart: * - check for ASTFLTs * - loop again until there are no ASTFLTs */ +#if 0 + { + static int xcounter; + if ((++xcounter & 65535) == 0) + kprintf("x"); + } +#endif cpu_disable_intr(); - splz(); if (gd->gd_reqflags & RQF_AST_MASK) { atomic_clear_int(&gd->gd_reqflags, RQF_AST_SIGNAL); cpu_enable_intr(); diff --git a/sys/platform/pc64/x86_64/pmap.c b/sys/platform/pc64/x86_64/pmap.c index 89e23f1943..7764abc345 100644 --- a/sys/platform/pc64/x86_64/pmap.c +++ b/sys/platform/pc64/x86_64/pmap.c @@ -2124,6 +2124,7 @@ retry: */ if (entry == NULL || pmap_mmu_optimize == 0 || /* not enabled */ + (pmap->pm_flags & PMAP_HVM) || /* special pmap */ ptepindex >= pmap_pd_pindex(0) || /* not terminal or pt */ entry->inheritance != VM_INHERIT_SHARE || /* not shared */ entry->maptype != VM_MAPTYPE_NORMAL || /* weird map type */ @@ -2190,6 +2191,7 @@ retry: obpmap = *obpmapp; /* safety */ } else { obpmap->pm_active = smp_active_mask; + obpmap->pm_flags |= PMAP_SEGSHARED; *obpmapp = obpmap; spin_unlock(&pmap_spin); } diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 18860c2eef..cc9945388d 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -369,7 +369,7 @@ struct proc { #define P_EXEC 0x04000 /* Process called exec */ #define P_CONTINUED 0x08000 /* Proc has continued from a stopped state */ -#define P_UNUSED16 0x00010000 +#define P_LOWMEMKILL 0x00010000 /* trying to kill due to low memory */ #define P_UNUSED17 0x00020000 #define P_SWAPWAIT 0x00040000 /* Waiting for a swapin */ diff --git a/sys/vm/vm_vmspace.c b/sys/vm/vm_vmspace.c index 58276b55e4..a78b9536cd 100644 --- a/sys/vm/vm_vmspace.c +++ b/sys/vm/vm_vmspace.c @@ -674,13 +674,23 @@ vkernel_lwp_exit(struct lwp *lp) struct vmspace_entry *ve; if ((vklp = lp->lwp_vkernel) != NULL) { - if ((ve = vklp->ve) != NULL) { - kprintf("Warning, pid %d killed with " - "active VC!\n", lp->lwp_proc->p_pid); - pmap_setlwpvm(lp, lp->lwp_proc->p_vmspace); + if (lp->lwp_thread->td_vmm == NULL) { + /* + * vkernel thread + */ + if ((ve = vklp->ve) != NULL) { + kprintf("Warning, pid %d killed with " + "active VC!\n", lp->lwp_proc->p_pid); + pmap_setlwpvm(lp, lp->lwp_proc->p_vmspace); + vklp->ve = NULL; + KKASSERT(ve->refs > 0); + atomic_subtract_int(&ve->refs, 1); + } + } else { + /* + * guest thread + */ vklp->ve = NULL; - KKASSERT(ve->refs > 0); - atomic_subtract_int(&ve->refs, 1); } lp->lwp_vkernel = NULL; kfree(vklp, M_VKERNEL); @@ -724,6 +734,7 @@ vkernel_trap(struct lwp *lp, struct trapframe *frame) vklp->ve = NULL; vmm_vm_set_guest_cr3(p->p_vkernel->vkernel_cr3); } + /* * Copy the emulated process frame to the virtual kernel process. * The emulated process cannot change TLS descriptors so don't -- 2.11.4.GIT