/*
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2012 Sandvine, Inc.
 * Copyright (c) 2012 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2021 Oxide Computer Company
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <sys/vmm_kernel.h>
#include <sys/vmm_vm.h>

#include <sys/vmm_instruction_emul.h>
#include <x86/psl.h>
#include <x86/specialreg.h>

#include "vmm_ioport.h"
enum vie_status {
	VIES_INIT		= (1U << 0),
	VIES_MMIO		= (1U << 1),
	VIES_INOUT		= (1U << 2),
	VIES_OTHER		= (1U << 3),
	VIES_INST_FETCH		= (1U << 4),
	VIES_INST_DECODE	= (1U << 5),
	VIES_PENDING_MMIO	= (1U << 6),
	VIES_PENDING_INOUT	= (1U << 7),
	VIES_REPEAT		= (1U << 8),
	VIES_USER_FALLBACK	= (1U << 9),
	VIES_COMPLETE		= (1U << 10),
};
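/*
 * Illustrative lifecycle of these bits (an assumption drawn from how they are
 * used below, not an exhaustive state machine): VIES_INIT is set once the vie
 * is seeded with exit state; VIES_INST_FETCH and VIES_INST_DECODE follow as
 * the instruction bytes are fetched and decoded; VIES_MMIO or VIES_INOUT marks
 * the access being emulated (with the VIES_PENDING_* bits set when userspace
 * must service it); and VIES_COMPLETE indicates the emulation has finished so
 * %rip may be advanced.
 */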
/* State of request to perform emulated access (inout or MMIO) */
enum vie_req {
	VR_NONE,
	VR_PENDING,
	VR_DONE,
};

struct vie_mmio {
	uint64_t		data;
	uint64_t		gpa;
	uint8_t			bytes;
	enum vie_req		state;
};

struct vie_op {
	uint8_t		op_byte;	/* actual opcode byte */
	uint8_t		op_type;	/* type of operation (e.g. MOV) */
	uint16_t	op_flags;
};

#define	VIE_INST_SIZE	15
struct vie {
	uint8_t		inst[VIE_INST_SIZE];	/* instruction bytes */
	uint8_t		num_valid;		/* size of the instruction */
	uint8_t		num_processed;

	uint8_t		addrsize:4, opsize:4;	/* address and operand sizes */
	uint8_t		rex_w:1,		/* REX prefix */
			rex_r:1,
			rex_x:1,
			rex_b:1,
			rex_present:1,
			repz_present:1,		/* REP/REPE/REPZ prefix */
			repnz_present:1,	/* REPNE/REPNZ prefix */
			opsize_override:1,	/* Operand size override */
			addrsize_override:1,	/* Address size override */
			segment_override:1;	/* Segment override */

	uint8_t		mod:2,			/* ModRM byte */
			reg:4,
			rm:3;

	uint8_t		ss:2,			/* SIB byte */
			vex_present:1,		/* VEX prefixed */
			vex_l:1,
			index:4,		/* SIB byte */
			base:4;			/* SIB byte */

	uint8_t		disp_bytes;
	uint8_t		imm_bytes;

	uint8_t		scale;

	uint8_t		vex_reg:4,	/* vvvv: first source reg specifier */
			vex_pp:2,
			_sparebits:2;

	uint8_t		_sparebytes[2];

	int		base_register;		/* VM_REG_GUEST_xyz */
	int		index_register;		/* VM_REG_GUEST_xyz */
	int		segment_register;	/* VM_REG_GUEST_xyz */

	int64_t		displacement;		/* optional addr displacement */
	int64_t		immediate;		/* optional immediate operand */

	struct vie_op	op;			/* opcode description */

	enum vie_status	status;

	struct vm_guest_paging paging;		/* guest paging state */

	uint64_t	mmio_gpa;		/* faulting GPA */
	struct vie_mmio	mmio_req_read;
	struct vie_mmio	mmio_req_write;

	struct vm_inout	inout;			/* active in/out op */
	enum vie_req	inout_req_state;
	uint32_t	inout_req_val;		/* value from userspace */
};
/* struct vie_op.op_type */
enum {
	VIE_OP_TYPE_NONE = 0,
	VIE_OP_TYPE_MOV,
	VIE_OP_TYPE_MOVSX,
	VIE_OP_TYPE_MOVZX,
	VIE_OP_TYPE_MOV_CR,
	VIE_OP_TYPE_AND,
	VIE_OP_TYPE_OR,
	VIE_OP_TYPE_SUB,
	VIE_OP_TYPE_TWO_BYTE,
	VIE_OP_TYPE_PUSH,
	VIE_OP_TYPE_CMP,
	VIE_OP_TYPE_POP,
	VIE_OP_TYPE_MOVS,
	VIE_OP_TYPE_GROUP1,
	VIE_OP_TYPE_STOS,
	VIE_OP_TYPE_BITTEST,
	VIE_OP_TYPE_TWOB_GRP15,
	VIE_OP_TYPE_ADD,
	VIE_OP_TYPE_TEST,
	VIE_OP_TYPE_BEXTR,
	VIE_OP_TYPE_CLTS,
	VIE_OP_TYPE_MUL,
	VIE_OP_TYPE_LAST
};
189 /* struct vie_op.op_flags */
190 #define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */
191 #define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
192 #define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */
193 #define VIE_OP_F_NO_MODRM (1 << 3)
194 #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)
195 #define VIE_OP_F_REG_REG (1 << 5) /* special-case for mov-cr */
static const struct vie_op three_byte_opcodes_0f38[256] = {
	[0xF7] = {
		.op_byte = 0xF7,
		.op_type = VIE_OP_TYPE_BEXTR,
	},
};

static const struct vie_op two_byte_opcodes[256] = {
	[0x06] = {
		.op_byte = 0x06,
		.op_type = VIE_OP_TYPE_CLTS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0x20] = {
		.op_byte = 0x20,
		.op_type = VIE_OP_TYPE_MOV_CR,
		.op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0x22] = {
		.op_byte = 0x22,
		.op_type = VIE_OP_TYPE_MOV_CR,
		.op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAE] = {
		.op_byte = 0xAE,
		.op_type = VIE_OP_TYPE_TWOB_GRP15,
	},
	[0xAF] = {
		.op_byte = 0xAF,
		.op_type = VIE_OP_TYPE_MUL,
	},
	[0xB6] = {
		.op_byte = 0xB6,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xB7] = {
		.op_byte = 0xB7,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xBA] = {
		.op_byte = 0xBA,
		.op_type = VIE_OP_TYPE_BITTEST,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xBE] = {
		.op_byte = 0xBE,
		.op_type = VIE_OP_TYPE_MOVSX,
	},
};
static const struct vie_op one_byte_opcodes[256] = {
	[0x03] = {
		.op_byte = 0x03,
		.op_type = VIE_OP_TYPE_ADD,
	},
	[0x0F] = {
		.op_byte = 0x0F,
		.op_type = VIE_OP_TYPE_TWO_BYTE
	},
	[0x0B] = {
		.op_byte = 0x0B,
		.op_type = VIE_OP_TYPE_OR,
	},
	[0x2B] = {
		.op_byte = 0x2B,
		.op_type = VIE_OP_TYPE_SUB,
	},
	[0x39] = {
		.op_byte = 0x39,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x3B] = {
		.op_byte = 0x3B,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x88] = {
		.op_byte = 0x88,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x89] = {
		.op_byte = 0x89,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8A] = {
		.op_byte = 0x8A,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8B] = {
		.op_byte = 0x8B,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0xA1] = {
		.op_byte = 0xA1,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA3] = {
		.op_byte = 0xA3,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA4] = {
		.op_byte = 0xA4,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xA5] = {
		.op_byte = 0xA5,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAA] = {
		.op_byte = 0xAA,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAB] = {
		.op_byte = 0xAB,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xC6] = {
		/* XXX Group 11 extended opcode - not just MOV */
		.op_byte = 0xC6,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xC7] = {
		.op_byte = 0xC7,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x23] = {
		.op_byte = 0x23,
		.op_type = VIE_OP_TYPE_AND,
	},
	[0x80] = {
		/* Group 1 extended opcode */
		.op_byte = 0x80,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x81] = {
		/* Group 1 extended opcode */
		.op_byte = 0x81,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x83] = {
		/* Group 1 extended opcode */
		.op_byte = 0x83,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x8F] = {
		/* XXX Group 1A extended opcode - not just POP */
		.op_byte = 0x8F,
		.op_type = VIE_OP_TYPE_POP,
	},
	[0xF6] = {
		/* XXX Group 3 extended opcode - not just TEST */
		.op_byte = 0xF6,
		.op_type = VIE_OP_TYPE_TEST,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xF7] = {
		/* XXX Group 3 extended opcode - not just TEST */
		.op_byte = 0xF7,
		.op_type = VIE_OP_TYPE_TEST,
		.op_flags = VIE_OP_F_IMM,
	},
	[0xFF] = {
		/* XXX Group 5 extended opcode - not just PUSH */
		.op_byte = 0xFF,
		.op_type = VIE_OP_TYPE_PUSH,
	}
};
/* struct vie.mod */
#define	VIE_MOD_INDIRECT	0
#define	VIE_MOD_INDIRECT_DISP8	1
#define	VIE_MOD_INDIRECT_DISP32	2
#define	VIE_MOD_DIRECT		3

/* struct vie.rm */
#define	VIE_RM_SIB		4
#define	VIE_RM_DISP32		5

#define	GB			(1024 * 1024 * 1024)
/*
 * Paging defines, previously pulled in from machine/pmap.h
 */
#define	PG_V	(1 << 0) /* Present */
#define	PG_RW	(1 << 1) /* Read/Write */
#define	PG_U	(1 << 2) /* User/Supervisor */
#define	PG_A	(1 << 5) /* Accessed */
#define	PG_M	(1 << 6) /* Dirty */
#define	PG_PS	(1 << 7) /* Largepage */

/*
 * Paging exception defines, previously pulled in from machine/pmap.h
 */
#define	PGEX_P		(1 << 0) /* Non-present/Protection */
#define	PGEX_W		(1 << 1) /* Read/Write */
#define	PGEX_U		(1 << 2) /* User/Supervisor */
#define	PGEX_RSV	(1 << 3) /* (Non-)Reserved */
#define	PGEX_I		(1 << 4) /* Instruction */
static enum vm_reg_name gpr_map[16] = {
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15
};

static const char *gpr_name_map[][16] = {
	[1] = {
		"a[hl]", "c[hl]", "d[hl]", "b[hl]", "spl", "bpl", "sil", "dil",
		"r8b", "r9b", "r10b", "r11b", "r12b", "r13b", "r14b", "r15b",
	},
	[2] = {
		"ax", "cx", "dx", "bx", "sp", "bp", "si", "di",
		"r8w", "r9w", "r10w", "r11w", "r12w", "r13w", "r14w", "r15w",
	},
	[4] = {
		"eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi",
		"r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d",
	},
	[8] = {
		"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
		"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	},
};

static enum vm_reg_name cr_map[16] = {
	VM_REG_GUEST_CR0,
	VM_REG_LAST,
	VM_REG_GUEST_CR2,
	VM_REG_GUEST_CR3,
	VM_REG_GUEST_CR4,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST
};

static uint64_t size2mask[] = {
	[1] = 0xff,
	[2] = 0xffff,
	[4] = 0xffffffff,
	[8] = 0xffffffffffffffff,
};
static int vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid,
    uint64_t gpa, uint64_t *rval, int bytes);
static int vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid,
    uint64_t gpa, uint64_t wval, int bytes);
static int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla);
static int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla);
static int vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf,
    uint64_t gla);
static uint64_t vie_size2mask(int size);

struct vie *
vie_alloc()
{
	return (kmem_zalloc(sizeof (struct vie), KM_SLEEP));
}
void
vie_free(struct vie *vie)
{
	kmem_free(vie, sizeof (struct vie));
}

enum vm_reg_name
vie_regnum_map(uint8_t regnum)
{
	VERIFY3U(regnum, <, 16);
	return (gpr_map[regnum]);
}

const char *
vie_regnum_name(uint8_t regnum, uint8_t size)
{
	VERIFY3U(regnum, <, 16);
	VERIFY(size == 1 || size == 2 || size == 4 || size == 8);
	return (gpr_name_map[size][regnum]);
}
static void
vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
{
	*lhbr = 0;
	*reg = gpr_map[vie->reg];

	/*
	 * 64-bit mode imposes limitations on accessing legacy high byte
	 * registers (lhbr).
	 *
	 * The legacy high-byte registers cannot be addressed if the REX
	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
	 *
	 * If the REX prefix is not present then the values 4, 5, 6 and 7
	 * of the 'ModRM:reg' field address the legacy high-byte registers,
	 * %ah, %ch, %dh and %bh respectively.
	 */
	if (!vie->rex_present) {
		if (vie->reg & 0x4) {
			*lhbr = 1;
			*reg = gpr_map[vie->reg & 0x3];
		}
	}
}
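/*
 * Example of the mapping above: ModRM:reg = 5 with a REX prefix selects %bpl
 * (gpr_map[5], lhbr = 0), while the same encoding without REX selects %ch
 * (lhbr = 1, base register gpr_map[5 & 0x3] = %rcx).
 */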
static int
vie_read_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t *rval)
{
	uint64_t val;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &val);

	/*
	 * To obtain the value of a legacy high byte register shift the
	 * base register right by 8 bits (%ah = %rax >> 8).
	 */
	if (lhbr)
		*rval = val >> 8;
	else
		*rval = val;
	return (error);
}

static int
vie_write_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t byte)
{
	uint64_t origval, val, mask;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &origval);
	if (error == 0) {
		val = byte;
		mask = 0xff;
		if (lhbr) {
			/*
			 * Shift left by 8 to store 'byte' in a legacy high
			 * byte register.
			 */
			val <<= 8;
			mask <<= 8;
		}
		val |= origval & ~mask;
		error = vm_set_register(vm, vcpuid, reg, val);
	}
	return (error);
}
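/*
 * Worked example of the read-modify-write above: writing byte 0xAB to %ah
 * while %rax holds 0x1234 yields val = 0xAB00, mask = 0xFF00, and the value
 * stored back is (0xAB00 | (0x1234 & ~0xFF00)) = 0xAB34.
 */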
int
vie_update_register(struct vm *vm, int vcpuid, enum vm_reg_name reg,
    uint64_t val, int size)
{
	int error;
	uint64_t origval;

	switch (size) {
	case 1:
	case 2:
		error = vm_get_register(vm, vcpuid, reg, &origval);
		if (error)
			return (error);
		val &= size2mask[size];
		val |= origval & ~size2mask[size];
		break;
	case 4:
		val &= 0xffffffffUL;
		break;
	case 8:
		break;
	default:
		return (EINVAL);
	}

	error = vm_set_register(vm, vcpuid, reg, val);
	return (error);
}
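/*
 * Example of the size handling above: with a destination register holding
 * 0x1122334455667788, a 2-byte write of 0xBEEF merges to 0x112233445566BEEF,
 * while a 4-byte write of the same value zero-extends the upper half, leaving
 * 0x000000000000BEEF, matching hardware behavior for 32-bit sub-register
 * writes in 64-bit mode.
 */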
static int
vie_repeat(struct vie *vie)
{
	vie->status |= VIES_REPEAT;

	/*
	 * Clear out any cached operation values so the repeated instruction
	 * can begin without using that stale state.  Other state, such as the
	 * decoding results, is kept around as it will not vary between
	 * iterations of a rep-prefixed instruction.
	 */
	if ((vie->status & VIES_MMIO) != 0) {
		vie->mmio_req_read.state = VR_NONE;
		vie->mmio_req_write.state = VR_NONE;
	} else if ((vie->status & VIES_INOUT) != 0) {
		vie->inout_req_state = VR_NONE;
	} else {
		panic("unexpected emulation state");
	}

	return (EAGAIN);
}
#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

/*
 * Macro creation of functions getcc{8,16,32,64}.
 *
 * Return the status flags that would result from doing (x - y).
 */
#define	GETCC(sz)							\
static ulong_t								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
}

GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);

static ulong_t
getcc(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getcc: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getcc8(x, y));
	else if (opsize == 2)
		return (getcc16(x, y));
	else if (opsize == 4)
		return (getcc32(x, y));
	else
		return (getcc64(x, y));
}
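/*
 * Example: getcc(4, 1, 2) performs a 32-bit 'sub' of 2 from 1 and returns the
 * resulting %rflags, in which PSL_C (borrow) and PSL_N (negative result) are
 * set and PSL_Z is clear.
 */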
/*
 * Macro creation of functions getaddflags{8,16,32,64}
 */
#define	GETADDFLAGS(sz)							\
static ulong_t								\
getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
}

GETADDFLAGS(8);
GETADDFLAGS(16);
GETADDFLAGS(32);
GETADDFLAGS(64);

static ulong_t
getaddflags(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getaddflags: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getaddflags8(x, y));
	else if (opsize == 2)
		return (getaddflags16(x, y));
	else if (opsize == 4)
		return (getaddflags32(x, y));
	else
		return (getaddflags64(x, y));
}
/*
 * Macro creation of functions getimulflags{16,32,64}
 */
#define	GETIMULFLAGS(sz)						\
static ulong_t								\
getimulflags##sz(uint##sz##_t x, uint##sz##_t y)			\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("imul %2,%1; pushfq; popq %0" :		\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
}

GETIMULFLAGS(16);
GETIMULFLAGS(32);
GETIMULFLAGS(64);

static ulong_t
getimulflags(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 2 || opsize == 4 || opsize == 8,
	    ("getimulflags: invalid operand size %d", opsize));

	if (opsize == 2)
		return (getimulflags16(x, y));
	else if (opsize == 4)
		return (getimulflags32(x, y));
	else
		return (getimulflags64(x, y));
}
/*
 * Return the status flags that would result from doing (x & y).
 */
#define	GETANDFLAGS(sz)							\
static ulong_t								\
getandflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("and %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
}

GETANDFLAGS(8);
GETANDFLAGS(16);
GETANDFLAGS(32);
GETANDFLAGS(64);

static ulong_t
getandflags(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getandflags: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getandflags8(x, y));
	else if (opsize == 2)
		return (getandflags16(x, y));
	else if (opsize == 4)
		return (getandflags32(x, y));
	else
		return (getandflags64(x, y));
}
static int
vie_emulate_mov_cr(struct vie *vie, struct vm *vm, int vcpuid)
{
	uint64_t val;
	int err;
	enum vm_reg_name gpr = gpr_map[vie->rm];
	enum vm_reg_name cr = cr_map[vie->reg];

	uint_t size = 4;
	if (vie->paging.cpu_mode == CPU_MODE_64BIT) {
		size = 8;
	}

	switch (vie->op.op_byte) {
	case 0x20:
		/*
		 * MOV control register (ModRM:reg) to reg (ModRM:r/m)
		 * 20/r:		mov r32, CR0-CR7
		 * 20/r:		mov r64, CR0-CR7
		 * REX.R + 20/0:	mov r64, CR8
		 */
		if (vie->paging.cpl != 0) {
			vm_inject_gp(vm, vcpuid);
			vie->num_processed = 0;
			return (0);
		}
		err = vm_get_register(vm, vcpuid, cr, &val);
		if (err != 0) {
			/* #UD for access to non-existent CRs */
			vm_inject_ud(vm, vcpuid);
			vie->num_processed = 0;
			return (0);
		}
		err = vie_update_register(vm, vcpuid, gpr, val, size);
		break;
	case 0x22: {
		/*
		 * MOV reg (ModRM:r/m) to control register (ModRM:reg)
		 * 22/r:		mov CR0-CR7, r32
		 * 22/r:		mov CR0-CR7, r64
		 * REX.R + 22/0:	mov CR8, r64
		 */
		uint64_t old, diff;

		if (vie->paging.cpl != 0) {
			vm_inject_gp(vm, vcpuid);
			vie->num_processed = 0;
			return (0);
		}
		err = vm_get_register(vm, vcpuid, cr, &old);
		if (err != 0) {
			/* #UD for access to non-existent CRs */
			vm_inject_ud(vm, vcpuid);
			vie->num_processed = 0;
			return (0);
		}
		err = vm_get_register(vm, vcpuid, gpr, &val);
		if (err != 0) {
			break;
		}
		val &= size2mask[size];
		diff = old ^ val;

		switch (cr) {
		case VM_REG_GUEST_CR0:
			if ((diff & CR0_PG) != 0) {
				uint64_t efer;

				err = vm_get_register(vm, vcpuid,
				    VM_REG_GUEST_EFER, &efer);
				if (err != 0) {
					break;
				}

				/* Keep the long-mode state in EFER in sync */
				if ((val & CR0_PG) != 0 &&
				    (efer & EFER_LME) != 0) {
					efer |= EFER_LMA;
				}
				if ((val & CR0_PG) == 0 &&
				    (efer & EFER_LME) != 0) {
					efer &= ~EFER_LMA;
				}

				err = vm_set_register(vm, vcpuid,
				    VM_REG_GUEST_EFER, efer);
				if (err != 0) {
					break;
				}
			}
			/* TODO: enforce more of the #GP checks */
			err = vm_set_register(vm, vcpuid, cr, val);
			break;
		case VM_REG_GUEST_CR2:
		case VM_REG_GUEST_CR3:
		case VM_REG_GUEST_CR4:
			/* TODO: enforce more of the #GP checks */
			err = vm_set_register(vm, vcpuid, cr, val);
			break;
		default:
			/* The cr_map mapping should prevent this */
			panic("invalid cr %d", cr);
			break;
		}
		break;
	}
	default:
		return (EINVAL);
	}

	return (err);
}
static int
vie_emulate_mov(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	enum vm_reg_name reg;
	uint8_t byte;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x88:
		/*
		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 88/r:	mov r/m8, r8
		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
		 */
		size = 1;	/* override for byte operation */
		error = vie_read_bytereg(vie, vm, vcpuid, &byte);
		if (error == 0) {
			error = vie_mmio_write(vie, vm, vcpuid, gpa, byte,
			    size);
		}
		break;
	case 0x89:
		/*
		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 89/r:	mov r/m16, r16
		 * 89/r:	mov r/m32, r32
		 * REX.W + 89/r	mov r/m64, r64
		 */
		reg = gpr_map[vie->reg];
		error = vm_get_register(vm, vcpuid, reg, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = vie_mmio_write(vie, vm, vcpuid, gpa, val,
			    size);
		}
		break;
	case 0x8A:
		/*
		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8A/r:	mov r8, r/m8
		 * REX + 8A/r:	mov r8, r/m8
		 */
		size = 1;	/* override for byte operation */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
		if (error == 0)
			error = vie_write_bytereg(vie, vm, vcpuid, val);
		break;
	case 0x8B:
		/*
		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8B/r:	mov r16, r/m16
		 * 8B/r:	mov r32, r/m32
		 * REX.W 8B/r:	mov r64, r/m64
		 */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
		if (error == 0) {
			reg = gpr_map[vie->reg];
			error = vie_update_register(vm, vcpuid, reg, val,
			    size);
		}
		break;
	case 0xA1:
		/*
		 * MOV from seg:moffset to AX/EAX/RAX
		 * A1:		mov AX, moffs16
		 * A1:		mov EAX, moffs32
		 * REX.W + A1:	mov RAX, moffs64
		 */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
		if (error == 0) {
			reg = VM_REG_GUEST_RAX;
			error = vie_update_register(vm, vcpuid, reg, val,
			    size);
		}
		break;
	case 0xA3:
		/*
		 * MOV from AX/EAX/RAX to seg:moffset
		 * A3:		mov moffs16, AX
		 * A3:		mov moffs32, EAX
		 * REX.W + A3:	mov moffs64, RAX
		 */
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = vie_mmio_write(vie, vm, vcpuid, gpa, val,
			    size);
		}
		break;
	case 0xC6:
		/*
		 * MOV from imm8 to mem (ModRM:r/m)
		 * C6/0		mov r/m8, imm8
		 * REX + C6/0	mov r/m8, imm8
		 */
		size = 1;	/* override for byte operation */
		val = vie->immediate;
		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
		break;
	case 0xC7:
		/*
		 * MOV from imm16/imm32 to mem (ModRM:r/m)
		 * C7/0		mov r/m16, imm16
		 * C7/0		mov r/m32, imm32
		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
		 */
		val = vie->immediate & size2mask[size];
		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
		break;
	default:
		break;
	}

	return (error);
}
static int
vie_emulate_movx(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xB6:
		/*
		 * MOV and zero extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B6/r		movzx r16, r/m8
		 * 0F B6/r		movzx r32, r/m8
		 * REX.W + 0F B6/r	movzx r64, r/m8
		 */

		/* get the first operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* zero-extend byte */
		val = (uint8_t)val;

		/* write the result */
		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	case 0xB7:
		/*
		 * MOV and zero extend word from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B7/r		movzx r32, r/m16
		 * REX.W + 0F B7/r	movzx r64, r/m16
		 */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 2);
		if (error)
			break;

		reg = gpr_map[vie->reg];

		/* zero-extend word */
		val = (uint16_t)val;

		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	case 0xBE:
		/*
		 * MOV and sign extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F BE/r		movsx r16, r/m8
		 * 0F BE/r		movsx r32, r/m8
		 * REX.W + 0F BE/r	movsx r64, r/m8
		 */

		/* get the first operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* sign extend byte */
		val = (int8_t)val;

		/* write the result */
		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	default:
		break;
	}
	return (error);
}
/*
 * Helper function to calculate and validate a linear address.
 */
static int
vie_get_gla(struct vie *vie, struct vm *vm, int vcpuid, int opsize,
    int addrsize, int prot, enum vm_reg_name seg, enum vm_reg_name gpr,
    uint64_t *gla)
{
	struct seg_desc desc;
	uint64_t cr0, val, rflags;
	int error;
	struct vm_guest_paging *paging;

	paging = &vie->paging;

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
	    __func__, error, seg));

	error = vm_get_register(vm, vcpuid, gpr, &val);
	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
	    error, gpr));

	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
	    addrsize, prot, gla)) {
		if (seg == VM_REG_GUEST_SS)
			vm_inject_ss(vm, vcpuid, 0);
		else
			vm_inject_gp(vm, vcpuid);
		return (-1);
	}

	if (vie_canonical_check(paging->cpu_mode, *gla)) {
		if (seg == VM_REG_GUEST_SS)
			vm_inject_ss(vm, vcpuid, 0);
		else
			vm_inject_gp(vm, vcpuid);
		return (-1);
	}

	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
		vm_inject_ac(vm, vcpuid, 0);
		return (-1);
	}

	return (0);
}
static int
vie_emulate_movs(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	struct vm_copyinfo copyinfo[2];
	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
	uint64_t rcx, rdi, rsi, rflags;
	int error, fault, opsize, seg, repeat;
	struct vm_guest_paging *paging;

	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
	val = 0;
	error = 0;
	paging = &vie->paging;

	/*
	 * XXX although the MOVS instruction is only supposed to be used with
	 * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
	 *
	 * Empirically the "repnz" prefix has identical behavior to "rep"
	 * and the zero flag does not make a difference.
	 */
	repeat = vie->repz_present | vie->repnz_present;

	if (repeat) {
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));

		/*
		 * The count register is %rcx, %ecx or %cx depending on the
		 * address size of the instruction.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
			error = 0;
			goto done;
		}
	}

	/*
	 *	Source		Destination	Comments
	 *	--------------------------------------------
	 * (1)  memory		memory		n/a
	 * (2)  memory		mmio		emulated
	 * (3)  mmio		memory		emulated
	 * (4)  mmio		mmio		emulated
	 *
	 * At this point we don't have sufficient information to distinguish
	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
	 * out because it will succeed only when operating on regular memory.
	 *
	 * XXX the emulation doesn't properly handle the case where 'gpa'
	 * is straddling the boundary between the normal memory and MMIO.
	 */

	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
	if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize, PROT_READ, seg,
	    VM_REG_GUEST_RSI, &srcaddr) != 0) {
		goto done;
	}

	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
	    copyinfo, nitems(copyinfo), &fault);
	if (error == 0) {
		if (fault)
			goto done;	/* Resume guest to handle fault */

		/*
		 * case (2): read from system memory and write to mmio.
		 */
		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize);
		if (error)
			goto done;
	} else {
		/*
		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
		 * if 'srcaddr' is in the mmio space.
		 */

		if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize,
		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI,
		    &dstaddr) != 0) {
			goto done;
		}

		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
		if (error == 0) {
			if (fault)
				goto done;  /* Resume guest to handle fault */

			/*
			 * case (3): read from MMIO and write to system memory.
			 *
			 * A MMIO read can have side-effects so we
			 * commit to it only after vm_copy_setup() is
			 * successful. If a page-fault needs to be
			 * injected into the guest then it will happen
			 * before the MMIO read is attempted.
			 */
			error = vie_mmio_read(vie, vm, vcpuid, gpa, &val,
			    opsize);

			if (error == 0)
				vm_copyout(vm, vcpuid, &val, copyinfo, opsize);

			/*
			 * Regardless of whether the MMIO read was successful
			 * or not, the copy resources must be cleaned up.
			 */
			vm_copy_teardown(vm, vcpuid, copyinfo,
			    nitems(copyinfo));
			if (error != 0)
				goto done;
		} else {
			/*
			 * Case (4): read from and write to mmio.
			 *
			 * Commit to the MMIO read/write (with potential
			 * side-effects) only after we are sure that the
			 * instruction is not going to be restarted due
			 * to address translation faults.
			 */
			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
			    PROT_READ, &srcgpa, &fault);
			if (error || fault)
				goto done;

			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
			    PROT_WRITE, &dstgpa, &fault);
			if (error || fault)
				goto done;

			error = vie_mmio_read(vie, vm, vcpuid, srcgpa, &val,
			    opsize);
			if (error)
				goto done;

			error = vie_mmio_write(vie, vm, vcpuid, dstgpa, val,
			    opsize);
			if (error)
				goto done;
		}
	}

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	if (rflags & PSL_D) {
		rsi -= opsize;
		rdi -= opsize;
	} else {
		rsi += opsize;
		rdi += opsize;
	}

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));

	if (repeat) {
		rcx = rcx - 1;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
		    rcx, vie->addrsize);
		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));

		/*
		 * Repeat the instruction if the count register is not zero.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
			return (vie_repeat(vie));
	}
done:
	return (error);
}
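/*
 * Example of the index updates above: with PSL_D clear and a 4-byte operand,
 * each iteration advances %rsi and %rdi by +4; with PSL_D set they are
 * decremented by 4 instead, per the usual MOVS direction-flag semantics.
 */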
static int
vie_emulate_stos(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, opsize, repeat;
	uint64_t val;
	uint64_t rcx, rdi, rflags;

	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
	repeat = vie->repz_present | vie->repnz_present;

	if (repeat) {
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));

		/*
		 * The count register is %rcx, %ecx or %cx depending on the
		 * address size of the instruction.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
			return (0);
	}

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
	KASSERT(!error, ("%s: error %d getting rax", __func__, error));

	error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize);
	if (error)
		return (error);

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	if (rflags & PSL_D)
		rdi -= opsize;
	else
		rdi += opsize;

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));

	if (repeat) {
		rcx = rcx - 1;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
		    rcx, vie->addrsize);
		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));

		/*
		 * Repeat the instruction if the count register is not zero.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
			return (vie_repeat(vie));
	}

	return (0);
}
static int
vie_emulate_and(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t result, rflags, rflags2, val1, val2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x23:
		/*
		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
		 * result in reg.
		 *
		 * 23/r		and r16, r/m16
		 * 23/r		and r32, r/m32
		 * REX.W + 23/r	and r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vm_get_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
		if (error)
			break;

		/* perform the operation and write the result */
		result = val1 & val2;
		error = vie_update_register(vm, vcpuid, reg, result, size);
		break;
	case 0x81:
	case 0x83:
		/*
		 * AND mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 81 /4		and r/m16, imm16
		 * 81 /4		and r/m32, imm32
		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
		 *
		 * 83 /4		and r/m16, imm8 sign-extended to 16
		 * 83 /4		and r/m32, imm8 sign-extended to 32
		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
		 */

		/* get the first operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		result = val1 & vie->immediate;
		error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from
	 * 'result'.
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}
static int
vie_emulate_or(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t result, rflags, rflags2, val1, val2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x0B:
		/*
		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
		 * result in reg.
		 *
		 * 0b/r		or r16, r/m16
		 * 0b/r		or r32, r/m32
		 * REX.W + 0b/r	or r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vm_get_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
		if (error)
			break;

		/* perform the operation and write the result */
		result = val1 | val2;
		error = vie_update_register(vm, vcpuid, reg, result, size);
		break;
	case 0x81:
	case 0x83:
		/*
		 * OR mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 81 /1		or r/m16, imm16
		 * 81 /1		or r/m32, imm32
		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
		 *
		 * 83 /1		or r/m16, imm8 sign-extended to 16
		 * 83 /1		or r/m32, imm8 sign-extended to 32
		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
		 */

		/* get the first operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		result = val1 | vie->immediate;
		error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from
	 * 'result'.
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}
static int
vie_emulate_cmp(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	uint64_t regop, memop, op1, op2, rflags, rflags2;
	enum vm_reg_name reg;

	size = vie->opsize;
	switch (vie->op.op_byte) {
	case 0x39:
	case 0x3B:
		/*
		 * 39/r		CMP r/m16, r16
		 * 39/r		CMP r/m32, r32
		 * REX.W 39/r	CMP r/m64, r64
		 *
		 * 3B/r		CMP r16, r/m16
		 * 3B/r		CMP r32, r/m32
		 * REX.W + 3B/r	CMP r64, r/m64
		 *
		 * Compare the first operand with the second operand and
		 * set status flags in EFLAGS register. The comparison is
		 * performed by subtracting the second operand from the first
		 * operand and then setting the status flags.
		 */

		/* Get the register operand */
		reg = gpr_map[vie->reg];
		error = vm_get_register(vm, vcpuid, reg, &regop);
		if (error)
			return (error);

		/* Get the memory operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &memop, size);
		if (error)
			return (error);

		if (vie->op.op_byte == 0x3B) {
			op1 = regop;
			op2 = memop;
		} else {
			op1 = memop;
			op2 = regop;
		}
		rflags2 = getcc(size, op1, op2);
		break;
	case 0x80:
	case 0x81:
	case 0x83:
		/*
		 * 80 /7		cmp r/m8, imm8
		 * REX + 80 /7		cmp r/m8, imm8
		 *
		 * 81 /7		cmp r/m16, imm16
		 * 81 /7		cmp r/m32, imm32
		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
		 *
		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
		 *
		 * Compare mem (ModRM:r/m) with immediate and set
		 * status flags according to the results.  The
		 * comparison is performed by subtracting the
		 * immediate from the first operand and then setting
		 * the status flags.
		 */
		if (vie->op.op_byte == 0x80)
			size = 1;

		/* get the first operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
		if (error)
			return (error);

		rflags2 = getcc(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & RFLAGS_STATUS_BITS;

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}
static int
vie_emulate_test(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	uint64_t op1, rflags, rflags2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xF6:
		/*
		 * F6 /0		test r/m8, imm8
		 *
		 * Test mem (ModRM:r/m) with immediate and set status
		 * flags according to the results.  The comparison is
		 * performed by anding the immediate with the first
		 * operand and then setting the status flags.
		 */
		if ((vie->reg & 7) != 0)
			return (EINVAL);

		size = 1;	/* override for byte operation */

		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
		if (error)
			return (error);

		rflags2 = getandflags(size, op1, vie->immediate);
		break;
	case 0xF7:
		/*
		 * F7 /0		test r/m16, imm16
		 * F7 /0		test r/m32, imm32
		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
		 *
		 * Test mem (ModRM:r/m) with immediate and set status
		 * flags according to the results.  The comparison is
		 * performed by anding the immediate with the first
		 * operand and then setting the status flags.
		 */
		if ((vie->reg & 7) != 0)
			return (EINVAL);

		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
		if (error)
			return (error);

		rflags2 = getandflags(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 */
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}
static int
vie_emulate_bextr(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	uint64_t src1, src2, dst, rflags;
	unsigned start, len, size;
	int error;
	struct vm_guest_paging *paging;

	size = vie->opsize;
	error = EINVAL;
	paging = &vie->paging;

	/*
	 * VEX.LZ.0F38.W0 F7 /r		BEXTR r32a, r/m32, r32b
	 * VEX.LZ.0F38.W1 F7 /r		BEXTR r64a, r/m64, r64b
	 *
	 * Destination operand is ModRM:reg.  Source operands are ModRM:r/m
	 * and VEX.vvvv.
	 *
	 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored).
	 */
	if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT)
		size = 4;

	/*
	 * Extracts contiguous bits from the first /source/ operand (second
	 * operand) using an index and length specified in the second /source/
	 * operand (third operand).
	 */
	error = vie_mmio_read(vie, vm, vcpuid, gpa, &src1, size);
	if (error)
		return (error);
	error = vm_get_register(vm, vcpuid, gpr_map[vie->vex_reg], &src2);
	if (error)
		return (error);
	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	start = (src2 & 0xff);
	len = (src2 & 0xff00) >> 8;

	/* If no bits are extracted, the destination register is cleared. */
	dst = 0;

	/* If START exceeds the operand size, no bits are extracted. */
	if (start > size * 8)
		goto done;
	/* Length is bounded by both the destination size and start offset. */
	if (start + len > size * 8)
		len = (size * 8) - start;
	if (len == 0)
		goto done;

	if (start > 0)
		src1 = (src1 >> start);
	if (len < 64)
		src1 = src1 & ((1ull << len) - 1);
	dst = src1;

done:
	error = vie_update_register(vm, vcpuid, gpr_map[vie->reg], dst, size);
	if (error)
		return (error);

	/*
	 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result.
	 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared.
	 */
	rflags &= ~RFLAGS_STATUS_BITS;
	if (dst == 0)
		rflags |= PSL_Z;
	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags,
	    8);
	return (error);
}
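/*
 * Worked example of the extraction above: a control value (src2) of 0x0804
 * gives start = 4 and len = 8, so src1 = 0xDEADBEEF produces
 * (0xDEADBEEF >> 4) & 0xFF = 0xEE.
 */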
static int
vie_emulate_add(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	uint64_t nval, rflags, rflags2, val1, val2;
	enum vm_reg_name reg;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x03:
		/*
		 * ADD r/m to r and store the result in r
		 *
		 * 03/r			ADD r16, r/m16
		 * 03/r			ADD r32, r/m32
		 * REX.W + 03/r		ADD r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vm_get_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
		if (error)
			break;

		/* perform the operation and write the result */
		nval = val1 + val2;
		error = vie_update_register(vm, vcpuid, reg, nval, size);
		break;
	default:
		break;
	}

	if (!error) {
		rflags2 = getaddflags(size, val1, val2);
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    &rflags);
		if (error)
			return (error);

		rflags &= ~RFLAGS_STATUS_BITS;
		rflags |= rflags2 & RFLAGS_STATUS_BITS;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    rflags, 8);
	}

	return (error);
}
static int
vie_emulate_sub(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	uint64_t nval, rflags, rflags2, val1, val2;
	enum vm_reg_name reg;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x2B:
		/*
		 * SUB r/m from r and store the result in r
		 *
		 * 2B/r		SUB r16, r/m16
		 * 2B/r		SUB r32, r/m32
		 * REX.W + 2B/r	SUB r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vm_get_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
		if (error)
			break;

		/* perform the operation and write the result */
		nval = val1 - val2;
		error = vie_update_register(vm, vcpuid, reg, nval, size);
		break;
	default:
		break;
	}

	if (!error) {
		rflags2 = getcc(size, val1, val2);
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    &rflags);
		if (error)
			return (error);

		rflags &= ~RFLAGS_STATUS_BITS;
		rflags |= rflags2 & RFLAGS_STATUS_BITS;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    rflags, 8);
	}

	return (error);
}
static int
vie_emulate_mul(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	uint64_t rflags, rflags2, val1, val2;
	__uint128_t nval;
	enum vm_reg_name reg;
	ulong_t (*getflags)(int, uint64_t, uint64_t) = NULL;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xAF:
		/*
		 * Multiply the contents of a destination register by
		 * the contents of a register or memory operand and
		 * put the signed result in the destination register.
		 *
		 * AF/r		IMUL r16, r/m16
		 * AF/r		IMUL r32, r/m32
		 * REX.W + AF/r	IMUL r64, r/m64
		 */

		getflags = getimulflags;

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vm_get_register(vm, vcpuid, reg, &val1);
		if (error != 0)
			break;

		/* get the second operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
		if (error != 0)
			break;

		/* perform the operation and write the result */
		nval = (int64_t)val1 * (int64_t)val2;

		error = vie_update_register(vm, vcpuid, reg, nval, size);

		DTRACE_PROBE4(vie__imul,
		    const char *, vie_regnum_name(vie->reg, size),
		    uint64_t, val1, uint64_t, val2, __uint128_t, nval);

		break;
	default:
		break;
	}

	if (error == 0) {
		rflags2 = getflags(size, val1, val2);
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    &rflags);
		if (error)
			return (error);

		rflags &= ~RFLAGS_STATUS_BITS;
		rflags |= rflags2 & RFLAGS_STATUS_BITS;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    rflags, 8);

		DTRACE_PROBE2(vie__imul__rflags,
		    uint64_t, rflags, uint64_t, rflags2);
	}

	return (error);
}
static int
vie_emulate_stack_op(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	struct vm_copyinfo copyinfo[2];
	struct seg_desc ss_desc;
	uint64_t cr0, rflags, rsp, stack_gla, val;
	int error, fault, size, stackaddrsize, pushop;
	struct vm_guest_paging *paging;

	val = 0;
	size = vie->opsize;
	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
	paging = &vie->paging;

	/*
	 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1
	 */
	if (paging->cpu_mode == CPU_MODE_REAL) {
		stackaddrsize = 2;
	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
		/*
		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
		 * - Stack pointer size is always 64-bits.
		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
		 * - 16-bit PUSH/POP is supported by using the operand size
		 *   override prefix (66H).
		 */
		stackaddrsize = 8;
		size = vie->opsize_override ? 2 : 8;
	} else {
		/*
		 * In protected or compatibility mode the 'B' flag in the
		 * stack-segment descriptor determines the size of the
		 * stack pointer.
		 */
		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
		    __func__, error));
		if (SEG_DESC_DEF32(ss_desc.access))
			stackaddrsize = 4;
		else
			stackaddrsize = 2;
	}

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));

	if (pushop) {
		rsp -= size;
	}

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
	    &stack_gla)) {
		vm_inject_ss(vm, vcpuid, 0);
		return (0);
	}

	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
		vm_inject_ss(vm, vcpuid, 0);
		return (0);
	}

	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
		vm_inject_ac(vm, vcpuid, 0);
		return (0);
	}

	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
	    &fault);
	if (error || fault)
		return (error);

	if (pushop) {
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
		if (error == 0)
			vm_copyout(vm, vcpuid, &val, copyinfo, size);
	} else {
		vm_copyin(vm, vcpuid, copyinfo, &val, size);
		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
		rsp += size;
	}
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));

	if (error == 0) {
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
		    stackaddrsize);
		KASSERT(error == 0, ("error %d updating rsp", error));
	}
	return (error);
}
static int
vie_emulate_push(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * PUSH is part of the group 5 extended opcodes and is identified
	 * by ModRM:reg = b110.
	 */
	if ((vie->reg & 7) != 6)
		return (EINVAL);

	error = vie_emulate_stack_op(vie, vm, vcpuid, gpa);
	return (error);
}

static int
vie_emulate_pop(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * POP is part of the group 1A extended opcodes and is identified
	 * by ModRM:reg = b000.
	 */
	if ((vie->reg & 7) != 0)
		return (EINVAL);

	error = vie_emulate_stack_op(vie, vm, vcpuid, gpa);
	return (error);
}
static int
vie_emulate_group1(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error;

	switch (vie->reg & 7) {
	case 0x1:	/* OR */
		error = vie_emulate_or(vie, vm, vcpuid, gpa);
		break;
	case 0x4:	/* AND */
		error = vie_emulate_and(vie, vm, vcpuid, gpa);
		break;
	case 0x7:	/* CMP */
		error = vie_emulate_cmp(vie, vm, vcpuid, gpa);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}
static int
vie_emulate_bittest(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	uint64_t val, rflags;
	int error, bitmask, bitoff;

	/*
	 * 0F BA is a Group 8 extended opcode.
	 *
	 * Currently we only emulate the 'Bit Test' instruction which is
	 * identified by a ModR/M:reg encoding of 100b.
	 */
	if ((vie->reg & 7) != 4)
		return (EINVAL);

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, vie->opsize);
	if (error)
		return (error);

	/*
	 * Intel SDM, Vol 2, Table 3-2:
	 * "Range of Bit Positions Specified by Bit Offset Operands"
	 */
	bitmask = vie->opsize * 8 - 1;
	bitoff = vie->immediate & bitmask;

	/* Copy the bit into the Carry flag in %rflags */
	if (val & (1UL << bitoff))
		rflags |= PSL_C;
	else
		rflags &= ~PSL_C;

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));

	return (0);
}
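/*
 * Example: with a 4-byte operand the offset is masked to 5 bits, so an
 * immediate of 35 tests bit 3 (35 & 31) of the value read from 'gpa', and
 * only PSL_C in %rflags reflects the result.
 */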
static int
vie_emulate_twob_group15(struct vie *vie, struct vm *vm, int vcpuid,
    uint64_t gpa)
{
	int error;
	uint64_t buf;

	switch (vie->reg & 7) {
	case 0x7:	/* CLFLUSH, CLFLUSHOPT, and SFENCE */
		if (vie->mod == 0x3) {
			/*
			 * SFENCE.  Ignore it, VM exit provides enough
			 * barriers on its own.
			 */
			error = 0;
		} else {
			/*
			 * CLFLUSH, CLFLUSHOPT.  Only check for access
			 * rights.
			 */
			error = vie_mmio_read(vie, vm, vcpuid, gpa, &buf, 1);
		}
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}
static int
vie_emulate_clts(struct vie *vie, struct vm *vm, int vcpuid)
{
	uint64_t val;
	int error __maybe_unused;

	if (vie->paging.cpl != 0) {
		vm_inject_gp(vm, vcpuid);
		vie->num_processed = 0;
		return (0);
	}

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &val);
	ASSERT(error == 0);

	/* Clear %cr0.TS */
	val &= ~CR0_TS;

	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, val);
	ASSERT(error == 0);

	return (0);
}
static int
vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa,
    uint64_t *rval, int bytes)
{
	int err;

	if (vie->mmio_req_read.state == VR_DONE) {
		ASSERT(vie->mmio_req_read.bytes == bytes);
		ASSERT(vie->mmio_req_read.gpa == gpa);

		*rval = vie->mmio_req_read.data;
		return (0);
	}

	err = vm_service_mmio_read(vm, cpuid, gpa, rval, bytes);
	if (err == 0) {
		/*
		 * A successful read from an in-kernel-emulated device may come
		 * with side effects, so stash the result in case it's used for
		 * an instruction which subsequently needs to issue an MMIO
		 * write to userspace.
		 */
		ASSERT(vie->mmio_req_read.state == VR_NONE);

		vie->mmio_req_read.bytes = bytes;
		vie->mmio_req_read.gpa = gpa;
		vie->mmio_req_read.data = *rval;
		vie->mmio_req_read.state = VR_DONE;

	} else if (err == ESRCH) {
		/* Hope that userspace emulation can fulfill this read */
		vie->mmio_req_read.bytes = bytes;
		vie->mmio_req_read.gpa = gpa;
		vie->mmio_req_read.state = VR_PENDING;
		vie->status |= VIES_PENDING_MMIO;
	} else if (err < 0) {
		/*
		 * The MMIO read failed in such a way that fallback to handling
		 * in userspace is required.
		 */
		vie->status |= VIES_USER_FALLBACK;
	}
	return (err);
}
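/*
 * Example of why the read result is stashed: a MOVS from MMIO to MMIO reads
 * the source via vie_mmio_read() and may then find that the write must be
 * punted to userspace (ESRCH).  When the instruction is re-entered after
 * fulfillment, the read is satisfied from mmio_req_read (VR_DONE) instead of
 * re-triggering device side effects.
 */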
static int
vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa,
    uint64_t wval, int bytes)
{
	int err;

	if (vie->mmio_req_write.state == VR_DONE) {
		ASSERT(vie->mmio_req_write.bytes == bytes);
		ASSERT(vie->mmio_req_write.gpa == gpa);

		return (0);
	}

	err = vm_service_mmio_write(vm, cpuid, gpa, wval, bytes);
	if (err == 0) {
		/*
		 * A successful write to an in-kernel-emulated device probably
		 * results in side effects, so stash the fact that such a write
		 * succeeded in case the operation requires other work.
		 */
		vie->mmio_req_write.bytes = bytes;
		vie->mmio_req_write.gpa = gpa;
		vie->mmio_req_write.data = wval;
		vie->mmio_req_write.state = VR_DONE;
	} else if (err == ESRCH) {
		/* Hope that userspace emulation can fulfill this write */
		vie->mmio_req_write.bytes = bytes;
		vie->mmio_req_write.gpa = gpa;
		vie->mmio_req_write.data = wval;
		vie->mmio_req_write.state = VR_PENDING;
		vie->status |= VIES_PENDING_MMIO;
	} else if (err < 0) {
		/*
		 * The MMIO write failed in such a way that fallback to
		 * handling in userspace is required.
		 */
		vie->status |= VIES_USER_FALLBACK;
	}
	return (err);
}
int
vie_emulate_mmio(struct vie *vie, struct vm *vm, int vcpuid)
{
	int error;
	uint64_t gpa;

	if ((vie->status & (VIES_INST_DECODE | VIES_MMIO)) !=
	    (VIES_INST_DECODE | VIES_MMIO)) {
		return (EINVAL);
	}

	gpa = vie->mmio_gpa;

	switch (vie->op.op_type) {
	case VIE_OP_TYPE_GROUP1:
		error = vie_emulate_group1(vie, vm, vcpuid, gpa);
		break;
	case VIE_OP_TYPE_POP:
		error = vie_emulate_pop(vie, vm, vcpuid, gpa);
		break;
	case VIE_OP_TYPE_PUSH:
		error = vie_emulate_push(vie, vm, vcpuid, gpa);
		break;
	case VIE_OP_TYPE_CMP:
		error = vie_emulate_cmp(vie, vm, vcpuid, gpa);
		break;
	case VIE_OP_TYPE_MOV:
		error = vie_emulate_mov(vie, vm, vcpuid, gpa);
		break;
	case VIE_OP_TYPE_MOVSX:
	case VIE_OP_TYPE_MOVZX:
		error = vie_emulate_movx(vie, vm, vcpuid, gpa);
		break;
	case VIE_OP_TYPE_MOVS:
		error = vie_emulate_movs(vie, vm, vcpuid, gpa);
		break;
	case VIE_OP_TYPE_STOS:
		error = vie_emulate_stos(vie, vm, vcpuid, gpa);
		break;
	case VIE_OP_TYPE_AND:
		error = vie_emulate_and(vie, vm, vcpuid, gpa);
		break;
	case VIE_OP_TYPE_OR:
		error = vie_emulate_or(vie, vm, vcpuid, gpa);
		break;
	case VIE_OP_TYPE_SUB:
		error = vie_emulate_sub(vie, vm, vcpuid, gpa);
		break;
	case VIE_OP_TYPE_BITTEST:
		error = vie_emulate_bittest(vie, vm, vcpuid, gpa);
		break;
	case VIE_OP_TYPE_TWOB_GRP15:
		error = vie_emulate_twob_group15(vie, vm, vcpuid, gpa);
		break;
	case VIE_OP_TYPE_ADD:
		error = vie_emulate_add(vie, vm, vcpuid, gpa);
		break;
	case VIE_OP_TYPE_TEST:
		error = vie_emulate_test(vie, vm, vcpuid, gpa);
		break;
	case VIE_OP_TYPE_BEXTR:
		error = vie_emulate_bextr(vie, vm, vcpuid, gpa);
		break;
	case VIE_OP_TYPE_MUL:
		error = vie_emulate_mul(vie, vm, vcpuid, gpa);
		break;
	default:
		error = EINVAL;
		break;
	}

	if (error == ESRCH) {
		/* Return to userspace with the mmio request */
		return (ESRCH);
	}

	return (error);
}
static int
vie_emulate_inout_port(struct vie *vie, struct vm *vm, int vcpuid,
    uint32_t *eax)
{
	uint32_t mask, val;
	bool in;
	int err;

	mask = vie_size2mask(vie->inout.bytes);
	in = (vie->inout.flags & INOUT_IN) != 0;
	val = *eax & mask;

	if (vie->inout_req_state != VR_DONE) {
		err = vm_ioport_access(vm, vcpuid, in, vie->inout.port,
		    vie->inout.bytes, &val);
	} else {
		/*
		 * This port access was handled in userspace and the result was
		 * injected in to be handled now.
		 */
		val = vie->inout_req_val & mask;
		vie->inout_req_state = VR_NONE;
		err = 0;
	}

	if (err == ESRCH) {
		vie->status |= VIES_PENDING_INOUT;
		vie->inout_req_state = VR_PENDING;
		return (err);
	} else if (err != 0) {
		return (err);
	}

	if (in) {
		*eax = (*eax & ~mask) | val;
	}
	return (0);
}
static enum vm_reg_name
vie_inout_segname(const struct vie *vie)
{
	uint8_t segidx = vie->inout.segment;
	const enum vm_reg_name segmap[] = {
		VM_REG_GUEST_ES,
		VM_REG_GUEST_CS,
		VM_REG_GUEST_SS,
		VM_REG_GUEST_DS,
		VM_REG_GUEST_FS,
		VM_REG_GUEST_GS,
	};
	const uint8_t maxidx = (sizeof (segmap) / sizeof (segmap[0]));

	if (segidx >= maxidx) {
		panic("unexpected segment index %u", segidx);
	}
	return (segmap[segidx]);
}
static int
vie_emulate_inout_str(struct vie *vie, struct vm *vm, int vcpuid)
{
	uint8_t bytes, addrsize;
	uint64_t index, count = 0, gla, rflags;
	int prot, err, fault;
	bool in, repeat;
	enum vm_reg_name seg_reg, idx_reg;
	struct vm_copyinfo copyinfo[2];

	in = (vie->inout.flags & INOUT_IN) != 0;
	bytes = vie->inout.bytes;
	addrsize = vie->inout.addrsize;
	prot = in ? PROT_WRITE : PROT_READ;

	ASSERT(bytes == 1 || bytes == 2 || bytes == 4);
	ASSERT(addrsize == 2 || addrsize == 4 || addrsize == 8);

	idx_reg = (in) ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
	seg_reg = vie_inout_segname(vie);
	err = vm_get_register(vm, vcpuid, idx_reg, &index);
	ASSERT(err == 0);
	index = index & vie_size2mask(addrsize);

	repeat = (vie->inout.flags & INOUT_REP) != 0;

	/* Count register */
	if (repeat) {
		err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &count);
		count &= vie_size2mask(addrsize);

		if (count == 0) {
			/*
			 * If we were asked to emulate a REP INS/OUTS when the
			 * count register is zero, no further work is required.
			 */
			return (0);
		}
	}

	if (vie_get_gla(vie, vm, vcpuid, bytes, addrsize, prot, seg_reg,
	    idx_reg, &gla) != 0) {
		/* vie_get_gla() already injected the appropriate fault */
		return (0);
	}

	/*
	 * The INS/OUTS emulate currently assumes that the memory target
	 * resides within the guest system memory, rather than a device MMIO
	 * region.  If such a case becomes a necessity, that additional
	 * handling could be put in place.
	 */
	err = vm_copy_setup(vm, vcpuid, &vie->paging, gla, bytes, prot,
	    copyinfo, nitems(copyinfo), &fault);

	if (err) {
		/* Unrecoverable error */
		return (err);
	} else if (fault) {
		/* Resume guest to handle fault */
		return (0);
	}

	if (!in) {
		vm_copyin(vm, vcpuid, copyinfo, &vie->inout.eax, bytes);
	}

	err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax);

	if (err == 0 && in) {
		vm_copyout(vm, vcpuid, &vie->inout.eax, copyinfo, bytes);
	}

	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));

	if (err == 0) {
		err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    &rflags);
		ASSERT(err == 0);

		/* Update index */
		if (rflags & PSL_D) {
			index -= bytes;
		} else {
			index += bytes;
		}

		/* Update index register */
		err = vie_update_register(vm, vcpuid, idx_reg, index,
		    addrsize);
		ASSERT(err == 0);

		/*
		 * Update count register only if the instruction had a repeat
		 * prefix.
		 */
		if ((vie->inout.flags & INOUT_REP) != 0) {
			count--;
			err = vie_update_register(vm, vcpuid,
			    VM_REG_GUEST_RCX, count, addrsize);
			ASSERT(err == 0);

			if (count != 0) {
				return (vie_repeat(vie));
			}
		}
	}

	return (err);
}
int
vie_emulate_inout(struct vie *vie, struct vm *vm, int vcpuid)
{
	int err = 0;

	if ((vie->status & VIES_INOUT) == 0) {
		return (EINVAL);
	}

	if ((vie->inout.flags & INOUT_STR) == 0) {
		/*
		 * For now, using the 'rep' prefixes with plain (non-string)
		 * in/out is not supported.
		 */
		if ((vie->inout.flags & INOUT_REP) != 0) {
			return (EINVAL);
		}

		err = vie_emulate_inout_port(vie, vm, vcpuid,
		    &vie->inout.eax);
		if (err == 0 && (vie->inout.flags & INOUT_IN) != 0) {
			/*
			 * With the inX access now a success, the result needs
			 * to be stored in the guest %rax.
			 */
			err = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
			    vie->inout.eax);
		}
	} else {
		vie->status &= ~VIES_REPEAT;
		err = vie_emulate_inout_str(vie, vm, vcpuid);
	}
	if (err < 0) {
		/*
		 * Access to an I/O port failed in such a way that fallback to
		 * handling in userspace is required.
		 */
		vie->status |= VIES_USER_FALLBACK;
	} else if (err == ESRCH) {
		ASSERT(vie->status & VIES_PENDING_INOUT);
		/* Return to userspace with the in/out request */
	}

	return (err);
}
int
vie_emulate_other(struct vie *vie, struct vm *vm, int vcpuid)
{
	int error;

	if ((vie->status & (VIES_INST_DECODE | VIES_OTHER)) !=
	    (VIES_INST_DECODE | VIES_OTHER)) {
		return (EINVAL);
	}

	switch (vie->op.op_type) {
	case VIE_OP_TYPE_CLTS:
		error = vie_emulate_clts(vie, vm, vcpuid);
		break;
	case VIE_OP_TYPE_MOV_CR:
		error = vie_emulate_mov_cr(vie, vm, vcpuid);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}
void
vie_reset(struct vie *vie)
{
	vie->status = 0;
	vie->num_processed = vie->num_valid = 0;
}

void
vie_advance_pc(struct vie *vie, uint64_t *nextrip)
{
	VERIFY((vie->status & VIES_REPEAT) == 0);

	*nextrip += vie->num_processed;
	vie_reset(vie);
}
void
vie_exitinfo(const struct vie *vie, struct vm_exit *vme)
{
	if (vie->status & VIES_USER_FALLBACK) {
		/*
		 * Despite the fact that the instruction was successfully
		 * decoded, some aspect of the emulation failed in such a way
		 * that it is left up to userspace to complete the operation.
		 */
		vie_fallback_exitinfo(vie, vme);
	} else if (vie->status & VIES_MMIO) {
		vme->exitcode = VM_EXITCODE_MMIO;
		if (vie->mmio_req_read.state == VR_PENDING) {
			vme->u.mmio.gpa = vie->mmio_req_read.gpa;
			vme->u.mmio.data = 0;
			vme->u.mmio.bytes = vie->mmio_req_read.bytes;
			vme->u.mmio.read = 1;
		} else if (vie->mmio_req_write.state == VR_PENDING) {
			vme->u.mmio.gpa = vie->mmio_req_write.gpa;
			vme->u.mmio.data = vie->mmio_req_write.data &
			    vie_size2mask(vie->mmio_req_write.bytes);
			vme->u.mmio.bytes = vie->mmio_req_write.bytes;
			vme->u.mmio.read = 0;
		} else {
			panic("bad pending MMIO state");
		}
	} else if (vie->status & VIES_INOUT) {
		vme->exitcode = VM_EXITCODE_INOUT;
		vme->u.inout.port = vie->inout.port;
		vme->u.inout.bytes = vie->inout.bytes;
		if ((vie->inout.flags & INOUT_IN) != 0) {
			vme->u.inout.flags = INOUT_IN;
			vme->u.inout.eax = 0;
		} else {
			vme->u.inout.flags = 0;
			vme->u.inout.eax = vie->inout.eax &
			    vie_size2mask(vie->inout.bytes);
		}
	} else {
		panic("no pending operation");
	}
}
/*
 * In the case of a decoding or verification failure, bailing out to userspace
 * to do the instruction emulation is our only option for now.
 */
void
vie_fallback_exitinfo(const struct vie *vie, struct vm_exit *vme)
{
	if ((vie->status & VIES_INST_FETCH) == 0) {
		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
	} else {
		ASSERT(sizeof (vie->inst) == sizeof (vme->u.inst_emul.inst));

		bcopy(vie->inst, vme->u.inst_emul.inst, sizeof (vie->inst));
		vme->u.inst_emul.num_valid = vie->num_valid;
	}
	vme->exitcode = VM_EXITCODE_INST_EMUL;
}
void
vie_cs_info(const struct vie *vie, struct vm *vm, int vcpuid, uint64_t *cs_base,
    int *cs_d)
{
	struct seg_desc cs_desc;
	int error __maybe_unused;

	error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &cs_desc);
	ASSERT(error == 0);

	/* Initialization required for the paging info to be populated */
	VERIFY(vie->status & VIES_INIT);
	switch (vie->paging.cpu_mode) {
	case CPU_MODE_REAL:
		*cs_base = cs_desc.base;
		*cs_d = 0;
		break;
	case CPU_MODE_PROTECTED:
	case CPU_MODE_COMPATIBILITY:
		*cs_base = cs_desc.base;
		*cs_d = SEG_DESC_DEF32(cs_desc.access) ? 1 : 0;
		break;
	default:
		*cs_base = 0;
		*cs_d = 0;
		break;
	}
}

bool
vie_pending(const struct vie *vie)
{
	/*
	 * These VIE status bits indicate conditions which must be addressed
	 * through either device IO fulfillment (with corresponding
	 * vie_fulfill_*()) or complete userspace emulation (followed by a
	 * vie_reset()).
	 */
	const enum vie_status of_interest =
	    VIES_PENDING_MMIO | VIES_PENDING_INOUT | VIES_USER_FALLBACK;

	return ((vie->status & of_interest) != 0);
}

bool
vie_needs_fetch(const struct vie *vie)
{
	if (vie->status & VIES_INST_FETCH) {
		ASSERT(vie->num_valid != 0);
		return (false);
	}
	return (true);
}

static int
vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("%s: invalid size %d", __func__, size));
	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));

	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
		return (0);

	return ((gla & (size - 1)) ? 1 : 0);
}

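/*
 * For illustration: with cpl = 3, CR0.AM and EFLAGS.AC both set, a 4-byte
 * access to gla 0x1002 is flagged as misaligned (0x1002 & 0x3 != 0), while
 * the same access to gla 0x1000 passes the check.
 */
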
static int
vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
{
	uint64_t mask;

	if (cpu_mode != CPU_MODE_64BIT)
		return (0);

	/*
	 * The value of bit 47 in the 'gla' should be replicated in the
	 * most significant 16 bits.
	 */
	mask = ~((1UL << 48) - 1);
	if (gla & (1UL << 47))
		return ((gla & mask) != mask);
	else
		return ((gla & mask) != 0);
}

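/*
 * For illustration: mask is 0xffff000000000000, so gla 0xffff800000000000
 * (bit 47 set, all upper bits set) is canonical, while 0x0000800000000000
 * (bit 47 set, upper bits clear) fails the check.
 */
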
uint64_t
vie_size2mask(int size)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("vie_size2mask: invalid size %d", size));
	return (size2mask[size]);
}

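/*
 * Assuming the size2mask table defined earlier in this file holds the
 * usual values, this yields 0xff, 0xffff, 0xffffffff and ~0UL for sizes
 * 1, 2, 4 and 8 respectively.
 */
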
int
vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
	    ("%s: invalid segment %d", __func__, seg));
	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
	    ("%s: invalid operand size %d", __func__, length));
	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
	    ("%s: invalid prot %x", __func__, prot));

	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
		glasize = 8;
	} else {
		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
		glasize = 4;

		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		KASSERT(SEG_DESC_PRESENT(desc->access),
		    ("segment %d not present: %x", seg, desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
		    "descriptor type %x", seg, type));

		if (prot & PROT_READ) {
			/* #GP on a read access to an exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

		while (length > 0) {
			offset &= vie_size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= vie_size2mask(addrsize);
	*gla = (segbase + firstoff) & vie_size2mask(glasize);
	return (0);
}

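/*
 * Worked example (ignoring the limit checks above): with addrsize = 2 and
 * glasize = 4, an offset of 0x1ffff is first truncated to 0xffff, so a
 * segment base of 0x20000 yields *gla = 0x2ffff rather than 0x3ffff.
 */
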
void
vie_init_mmio(struct vie *vie, const char *inst_bytes, uint8_t inst_length,
    const struct vm_guest_paging *paging, uint64_t gpa)
{
	KASSERT(inst_length <= VIE_INST_SIZE,
	    ("%s: invalid instruction length (%d)", __func__, inst_length));

	bzero(vie, sizeof (struct vie));

	vie->base_register = VM_REG_LAST;
	vie->index_register = VM_REG_LAST;
	vie->segment_register = VM_REG_LAST;
	vie->status = VIES_INIT | VIES_MMIO;

	if (inst_length != 0) {
		bcopy(inst_bytes, vie->inst, inst_length);
		vie->num_valid = inst_length;
		vie->status |= VIES_INST_FETCH;
	}

	vie->paging = *paging;
	vie->mmio_gpa = gpa;
}

void
vie_init_inout(struct vie *vie, const struct vm_inout *inout, uint8_t inst_len,
    const struct vm_guest_paging *paging)
{
	bzero(vie, sizeof (struct vie));

	vie->status = VIES_INIT | VIES_INOUT;

	vie->inout = *inout;
	vie->paging = *paging;

	/*
	 * Since VMX/SVM assists already decoded the nature of the in/out
	 * instruction, let the status reflect that.
	 */
	vie->status |= VIES_INST_FETCH | VIES_INST_DECODE;
	vie->num_processed = inst_len;
}

void
vie_init_other(struct vie *vie, const struct vm_guest_paging *paging)
{
	bzero(vie, sizeof (struct vie));

	vie->base_register = VM_REG_LAST;
	vie->index_register = VM_REG_LAST;
	vie->segment_register = VM_REG_LAST;
	vie->status = VIES_INIT | VIES_OTHER;

	vie->paging = *paging;
}

int
vie_fulfill_mmio(struct vie *vie, const struct vm_mmio *result)
{
	struct vie_mmio *pending;

	if ((vie->status & VIES_MMIO) == 0 ||
	    (vie->status & VIES_PENDING_MMIO) == 0) {
		return (EINVAL);
	}

	if (result->read) {
		pending = &vie->mmio_req_read;
	} else {
		pending = &vie->mmio_req_write;
	}

	if (pending->state != VR_PENDING ||
	    pending->bytes != result->bytes || pending->gpa != result->gpa) {
		return (EINVAL);
	}

	if (result->read) {
		pending->data = result->data & vie_size2mask(pending->bytes);
	}
	pending->state = VR_DONE;
	vie->status &= ~VIES_PENDING_MMIO;

	return (0);
}

int
vie_fulfill_inout(struct vie *vie, const struct vm_inout *result)
{
	if ((vie->status & VIES_INOUT) == 0 ||
	    (vie->status & VIES_PENDING_INOUT) == 0) {
		return (EINVAL);
	}
	if ((vie->inout.flags & INOUT_IN) != (result->flags & INOUT_IN) ||
	    vie->inout.bytes != result->bytes ||
	    vie->inout.port != result->port) {
		return (EINVAL);
	}

	if (result->flags & INOUT_IN) {
		vie->inout_req_val = result->eax &
		    vie_size2mask(vie->inout.bytes);
	}
	vie->inout_req_state = VR_DONE;
	vie->status &= ~(VIES_PENDING_INOUT);

	return (0);
}

uint64_t
vie_mmio_gpa(const struct vie *vie)
{
	return (vie->mmio_gpa);
}

static int
pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
{
	int error_code = 0;

	if (pte & PG_V)
		error_code |= PGEX_P;
	if (prot & PROT_WRITE)
		error_code |= PGEX_W;
	if (usermode)
		error_code |= PGEX_U;
	if (rsvd)
		error_code |= PGEX_RSV;
	if (prot & PROT_EXEC)
		error_code |= PGEX_I;

	return (error_code);
}

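/*
 * For illustration: a user-mode write that faults on a present, read-only
 * page (PG_V set in the PTE) produces PGEX_P | PGEX_W | PGEX_U == 0x7,
 * matching the error code hardware would push for the same fault.
 */
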
static void
ptp_release(vm_page_t **vmp)
{
	if (*vmp != NULL) {
		(void) vmp_release(*vmp);
		*vmp = NULL;
	}
}

static void *
ptp_hold(struct vm *vm, int vcpu, uintptr_t gpa, size_t len, vm_page_t **vmp)
{
	vm_client_t *vmc = vm_get_vmclient(vm, vcpu);
	const uintptr_t hold_gpa = gpa & PAGEMASK;

	/* Hold must not cross a page boundary */
	VERIFY3U(gpa + len, <=, hold_gpa + PAGESIZE);

	if (*vmp != NULL) {
		(void) vmp_release(*vmp);
	}

	*vmp = vmc_hold(vmc, hold_gpa, PROT_READ | PROT_WRITE);
	if (*vmp == NULL) {
		return (NULL);
	}

	return ((caddr_t)vmp_get_writable(*vmp) + (gpa - hold_gpa));
}

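/*
 * For illustration: a hold on gpa 0x1234 with len <= (PAGESIZE - 0x234)
 * pins the page at hold_gpa 0x1000 and returns the writable mapping plus
 * the 0x234 byte offset into it.
 */
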
static int
_vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
{
	int nlevels, pfcode;
	int ptpshift = 0, ptpindex = 0;
	uint64_t ptpphys;
	uint64_t *ptpbase = NULL, pte = 0, pgsize = 0;
	vm_page_t *cookie = NULL;
	const bool usermode = paging->cpl == 3;
	const bool writable = (prot & PROT_WRITE) != 0;

	*guest_fault = 0;
restart:
	ptpphys = paging->cr3;		/* root of the page tables */
	ptp_release(&cookie);

	if (vie_canonical_check(paging->cpu_mode, gla)) {
		/*
		 * XXX assuming a non-stack reference otherwise a stack fault
		 * should be generated.
		 */
		if (!check_only)
			vm_inject_gp(vm, vcpuid);
		*guest_fault = 1;
		return (0);
	}

	if (paging->paging_mode == PAGING_MODE_FLAT) {
		*gpa = gla;
		return (0);
	}

	if (paging->paging_mode == PAGING_MODE_32) {
		uint32_t *ptpbase32, pte32;

		nlevels = 2;
		while (--nlevels >= 0) {
			/* Zero out the lower 12 bits. */
			ptpphys &= ~0xfff;

			ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
			    &cookie);

			if (ptpbase32 == NULL) {
				return (EFAULT);
			}

			ptpshift = PAGE_SHIFT + nlevels * 10;
			ptpindex = (gla >> ptpshift) & 0x3FF;
			pgsize = 1UL << ptpshift;

			pte32 = ptpbase32[ptpindex];

			if ((pte32 & PG_V) == 0 ||
			    (usermode && (pte32 & PG_U) == 0) ||
			    (writable && (pte32 & PG_RW) == 0)) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot,
					    0, pte32);
					vm_inject_pf(vm, vcpuid, pfcode, gla);
				}

				ptp_release(&cookie);
				*guest_fault = 1;
				return (0);
			}

			/*
			 * Emulate the x86 MMU's management of the accessed
			 * and dirty flags. While the accessed flag is set
			 * at every level of the page table, the dirty flag
			 * is only set at the last level providing the guest
			 * physical address.
			 */
			if (!check_only && (pte32 & PG_A) == 0) {
				if (atomic_cmpset_32(&ptpbase32[ptpindex],
				    pte32, pte32 | PG_A) == 0) {
					goto restart;
				}
			}

			/* XXX must be ignored if CR4.PSE=0 */
			if (nlevels > 0 && (pte32 & PG_PS) != 0)
				break;

			ptpphys = pte32;
		}

		/* Set the dirty bit in the page table entry if necessary */
		if (!check_only && writable && (pte32 & PG_M) == 0) {
			if (atomic_cmpset_32(&ptpbase32[ptpindex],
			    pte32, pte32 | PG_M) == 0) {
				goto restart;
			}
		}

		/* Zero out the lower 'ptpshift' bits */
		pte32 >>= ptpshift; pte32 <<= ptpshift;
		*gpa = pte32 | (gla & (pgsize - 1));
		ptp_release(&cookie);
		return (0);
	}

	if (paging->paging_mode == PAGING_MODE_PAE) {
		/* Zero out the lower 5 bits and the upper 32 bits */
		ptpphys &= 0xffffffe0UL;

		ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof (*ptpbase) * 4,
		    &cookie);
		if (ptpbase == NULL) {
			return (EFAULT);
		}

		ptpindex = (gla >> 30) & 0x3;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
			}

			ptp_release(&cookie);
			*guest_fault = 1;
			return (0);
		}

		ptpphys = pte;

		nlevels = 2;
	} else {
		nlevels = 4;
	}

	while (--nlevels >= 0) {
		/* Zero out the lower 12 bits and the upper 12 bits */
		ptpphys &= 0x000ffffffffff000UL;

		ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
		if (ptpbase == NULL) {
			return (EFAULT);
		}

		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gla >> ptpshift) & 0x1FF;
		pgsize = 1UL << ptpshift;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0 ||
		    (usermode && (pte & PG_U) == 0) ||
		    (writable && (pte & PG_RW) == 0)) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
			}

			ptp_release(&cookie);
			*guest_fault = 1;
			return (0);
		}

		/* Set the accessed bit in the page table entry */
		if (!check_only && (pte & PG_A) == 0) {
			if (atomic_cmpset_64(&ptpbase[ptpindex],
			    pte, pte | PG_A) == 0) {
				goto restart;
			}
		}

		if (nlevels > 0 && (pte & PG_PS) != 0) {
			if (pgsize > 1 * GB) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot,
					    1, pte);
					vm_inject_pf(vm, vcpuid, pfcode, gla);
				}

				ptp_release(&cookie);
				*guest_fault = 1;
				return (0);
			}
			break;
		}

		ptpphys = pte;
	}

	/* Set the dirty bit in the page table entry if necessary */
	if (!check_only && writable && (pte & PG_M) == 0) {
		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
			goto restart;
	}
	ptp_release(&cookie);

	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
	*gpa = pte | (gla & (pgsize - 1));
	return (0);
}

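/*
 * In the 4-level walk above, ptpshift takes the values 39, 30, 21 and 12
 * as nlevels counts down, so the walk consumes gla bits [47:39], [38:30],
 * [29:21] and [20:12] as the PML4, PDPT, PD and PT indices respectively.
 */
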
int
vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{
	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
	    false));
}

int
vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{
	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
	    true));
}

int
vie_fetch_instruction(struct vie *vie, struct vm *vm, int vcpuid, uint64_t rip,
    int *faultptr)
{
	struct vm_copyinfo copyinfo[2];
	int error, prot;

	if ((vie->status & VIES_INIT) == 0) {
		return (EINVAL);
	}

	prot = PROT_READ | PROT_EXEC;
	error = vm_copy_setup(vm, vcpuid, &vie->paging, rip, VIE_INST_SIZE,
	    prot, copyinfo, nitems(copyinfo), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyin(vm, vcpuid, copyinfo, vie->inst, VIE_INST_SIZE);
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
	vie->num_valid = VIE_INST_SIZE;
	vie->status |= VIES_INST_FETCH;

	return (0);
}

static int
vie_peek(struct vie *vie, uint8_t *x)
{
	if (vie->num_processed < vie->num_valid) {
		*x = vie->inst[vie->num_processed];
		return (0);
	} else {
		return (-1);
	}
}

static void
vie_advance(struct vie *vie)
{
	vie->num_processed++;
}

static bool
segment_override(uint8_t x, int *seg)
{
	switch (x) {
	case 0x2E:
		*seg = VM_REG_GUEST_CS;
		break;
	case 0x36:
		*seg = VM_REG_GUEST_SS;
		break;
	case 0x3E:
		*seg = VM_REG_GUEST_DS;
		break;
	case 0x26:
		*seg = VM_REG_GUEST_ES;
		break;
	case 0x64:
		*seg = VM_REG_GUEST_FS;
		break;
	case 0x65:
		*seg = VM_REG_GUEST_GS;
		break;
	default:
		return (false);
	}
	return (true);
}

static int
decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
{
	uint8_t x;

	while (1) {
		if (vie_peek(vie, &x))
			return (-1);

		if (x == 0x66)
			vie->opsize_override = 1;
		else if (x == 0x67)
			vie->addrsize_override = 1;
		else if (x == 0xF3)
			vie->repz_present = 1;
		else if (x == 0xF2)
			vie->repnz_present = 1;
		else if (segment_override(x, &vie->segment_register))
			vie->segment_override = 1;
		else
			break;

		vie_advance(vie);
	}

	/*
	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
	 * - Only one REX prefix is allowed per instruction.
	 * - The REX prefix must immediately precede the opcode byte or the
	 *   escape opcode byte.
	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
	 *   the mandatory prefix must come before the REX prefix.
	 */
	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
		vie->rex_present = 1;
		vie->rex_w = x & 0x8 ? 1 : 0;
		vie->rex_r = x & 0x4 ? 1 : 0;
		vie->rex_x = x & 0x2 ? 1 : 0;
		vie->rex_b = x & 0x1 ? 1 : 0;
		vie_advance(vie);
	}

	/*
	 * § 2.3.5, "The VEX Prefix", SDM Vol 2.
	 */
	if ((cpu_mode == CPU_MODE_64BIT ||
	    cpu_mode == CPU_MODE_COMPATIBILITY) && x == 0xC4) {
		const struct vie_op *optab;

		/* 3-byte VEX prefix. */
		vie->vex_present = 1;

		vie_advance(vie);
		if (vie_peek(vie, &x))
			return (-1);

		/*
		 * 2nd byte: [R', X', B', mmmmm[4:0]].  Bits are inverted
		 * relative to REX encoding.
		 */
		vie->rex_r = x & 0x80 ? 0 : 1;
		vie->rex_x = x & 0x40 ? 0 : 1;
		vie->rex_b = x & 0x20 ? 0 : 1;

		switch (x & 0x1F) {
		case 0x2:
			/* 0F 38 class. */
			optab = three_byte_opcodes_0f38;
			break;
		case 0x1:
			/* 0F class - nothing handled here yet. */
			return (-1);
		case 0x3:
			/* 0F 3A class - nothing handled here yet. */
			return (-1);
		default:
			/* Reserved (#UD). */
			return (-1);
		}

		vie_advance(vie);
		if (vie_peek(vie, &x))
			return (-1);

		/* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */
		vie->rex_w = x & 0x80 ? 1 : 0;

		vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3);
		vie->vex_l = !!(x & 0x4);
		vie->vex_pp = (x & 0x3);

		/* PP: 1=66 2=F3 3=F2 prefixes. */
		switch (vie->vex_pp) {
		case 0x1:
			vie->opsize_override = 1;
			break;
		case 0x2:
			vie->repz_present = 1;
			break;
		case 0x3:
			vie->repnz_present = 1;
			break;
		}

		vie_advance(vie);

		/* Opcode, sans the literal prefix. */
		if (vie_peek(vie, &x))
			return (-1);

		vie->op = optab[x];
		if (vie->op.op_type == VIE_OP_TYPE_NONE)
			return (-1);

		vie_advance(vie);
	}

	/*
	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
	 */
	if (cpu_mode == CPU_MODE_64BIT) {
		/*
		 * Default address size is 64-bits and default operand size
		 * is 32-bits.
		 */
		vie->addrsize = vie->addrsize_override ? 4 : 8;
		if (vie->rex_w)
			vie->opsize = 8;
		else if (vie->opsize_override)
			vie->opsize = 2;
		else
			vie->opsize = 4;
	} else if (cs_d) {
		/* Default address and operand sizes are 32-bits */
		vie->addrsize = vie->addrsize_override ? 2 : 4;
		vie->opsize = vie->opsize_override ? 2 : 4;
	} else {
		/* Default address and operand sizes are 16-bits */
		vie->addrsize = vie->addrsize_override ? 4 : 2;
		vie->opsize = vie->opsize_override ? 4 : 2;
	}
	return (0);
}

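/*
 * For illustration, consider the 64-bit instruction bytes 48 89 08
 * (mov %rcx, (%rax)): no legacy prefixes are consumed, 0x48 sets
 * rex_present and rex_w, and the sizing logic above then selects
 * addrsize = 8 and opsize = 8.
 */
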
static int
decode_two_byte_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	vie->op = two_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);
	return (0);
}

static int
decode_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	/* Already did this via VEX prefix. */
	if (vie->op.op_type != VIE_OP_TYPE_NONE)
		return (0);

	vie->op = one_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);

	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
		return (decode_two_byte_opcode(vie));

	return (0);
}

static int
decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
{
	uint8_t x;
	/*
	 * Handling mov-to/from-cr is special since it is not issuing
	 * mmio/pio requests and can be done in real mode.  We must bypass some
	 * of the other existing decoding restrictions for it.
	 */
	const bool is_movcr = ((vie->op.op_flags & VIE_OP_F_REG_REG) != 0);

	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
		return (0);

	if (cpu_mode == CPU_MODE_REAL && !is_movcr)
		return (-1);

	if (vie_peek(vie, &x))
		return (-1);

	vie->mod = (x >> 6) & 0x3;
	vie->rm = (x >> 0) & 0x7;
	vie->reg = (x >> 3) & 0x7;

	/*
	 * A direct addressing mode makes no sense in the context of an EPT
	 * fault. There has to be a memory access involved to cause the
	 * EPT fault.
	 */
	if (vie->mod == VIE_MOD_DIRECT && !is_movcr)
		return (-1);

	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
		/*
		 * Table 2-5: Special Cases of REX Encodings
		 *
		 * mod=0, r/m=5 is used in the compatibility mode to
		 * indicate a disp32 without a base register.
		 *
		 * mod!=3, r/m=4 is used in the compatibility mode to
		 * indicate that the SIB byte is present.
		 *
		 * The 'b' bit in the REX prefix is don't care in
		 * these two cases.
		 */
	} else {
		vie->rm |= (vie->rex_b << 3);
	}

	vie->reg |= (vie->rex_r << 3);

	/* SIB */
	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
		goto done;

	vie->base_register = gpr_map[vie->rm];

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	case VIE_MOD_INDIRECT:
		if (vie->rm == VIE_RM_DISP32) {
			vie->disp_bytes = 4;
			/*
			 * Table 2-7. RIP-Relative Addressing
			 *
			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
			 * whereas in compatibility mode it just implies disp32.
			 */
			if (cpu_mode == CPU_MODE_64BIT)
				vie->base_register = VM_REG_GUEST_RIP;
			else
				vie->base_register = VM_REG_LAST;
		}
		break;
	}

done:
	vie_advance(vie);

	return (0);
}

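/*
 * For illustration: a ModRM byte of 0x88 decodes as mod = 2, reg = 1 and
 * rm = 0, i.e. a disp32 access through %rax (absent REX.B) with %rcx as
 * the register operand, so disp_bytes is set to 4.
 */
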
static int
decode_sib(struct vie *vie)
{
	uint8_t x;

	/* Proceed only if SIB byte is present */
	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
		return (0);

	if (vie_peek(vie, &x))
		return (-1);

	/* De-construct the SIB byte */
	vie->ss = (x >> 6) & 0x3;
	vie->index = (x >> 3) & 0x7;
	vie->base = (x >> 0) & 0x7;

	/* Apply the REX prefix modifiers */
	vie->index |= vie->rex_x << 3;
	vie->base |= vie->rex_b << 3;

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	}

	if (vie->mod == VIE_MOD_INDIRECT &&
	    (vie->base == 5 || vie->base == 13)) {
		/*
		 * Special case when base register is unused if mod = 0
		 * and base = %rbp or %r13.
		 *
		 * Documented in:
		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
		 * Table 2-5: Special Cases of REX Encodings
		 */
		vie->disp_bytes = 4;
	} else {
		vie->base_register = gpr_map[vie->base];
	}

	/*
	 * All encodings of 'index' are valid except for %rsp (4).
	 *
	 * Documented in:
	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
	 * Table 2-5: Special Cases of REX Encodings
	 */
	if (vie->index != 4)
		vie->index_register = gpr_map[vie->index];

	/* 'scale' makes sense only in the context of an index register */
	if (vie->index_register < VM_REG_LAST)
		vie->scale = 1 << vie->ss;

	vie_advance(vie);

	return (0);
}

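/*
 * For illustration: a SIB byte of 0x4c decodes as ss = 1, index = 1 and
 * base = 4, so (absent REX bits) the index register is %rcx with
 * scale = 2, while base = 4 selects %rsp via gpr_map.
 */
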
static int
decode_displacement(struct vie *vie)
{
	int n, i;
	uint8_t x;

	union {
		char	buf[4];
		int8_t	signed8;
		int32_t	signed32;
	} u;

	if ((n = vie->disp_bytes) == 0)
		return (0);

	if (n != 1 && n != 4)
		panic("decode_displacement: invalid disp_bytes %d", n);

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	if (n == 1)
		vie->displacement = u.signed8;		/* sign-extended */
	else
		vie->displacement = u.signed32;		/* sign-extended */

	return (0);
}

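/*
 * For illustration: a single displacement byte of 0xf0 is sign-extended
 * through u.signed8, leaving vie->displacement = -16.
 */
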
static int
decode_immediate(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[4];
		int8_t	signed8;
		int16_t	signed16;
		int32_t	signed32;
	} u;

	/* Figure out immediate operand size (if any) */
	if (vie->op.op_flags & VIE_OP_F_IMM) {
		/*
		 * Section 2.2.1.5 "Immediates", Intel SDM:
		 * In 64-bit mode the typical size of immediate operands
		 * remains 32-bits. When the operand size is 64-bits, the
		 * processor sign-extends all immediates to 64-bits prior
		 * to their use.
		 */
		if (vie->opsize == 4 || vie->opsize == 8)
			vie->imm_bytes = 4;
		else
			vie->imm_bytes = 2;
	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
		vie->imm_bytes = 1;
	}

	if ((n = vie->imm_bytes) == 0)
		return (0);

	KASSERT(n == 1 || n == 2 || n == 4,
	    ("%s: invalid number of immediate bytes: %d", __func__, n));

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	/* sign-extend the immediate value before use */
	if (n == 1)
		vie->immediate = u.signed8;
	else if (n == 2)
		vie->immediate = u.signed16;
	else
		vie->immediate = u.signed32;

	return (0);
}

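/*
 * For illustration: with opsize = 8, imm_bytes is 4 and the bytes
 * 00 00 00 80 decode through u.signed32 to -2147483648, which the
 * assignment above sign-extends to 0xffffffff80000000 in vie->immediate.
 */
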
static int
decode_moffset(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[8];
		uint64_t u64;
	} u;

	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
		return (0);

	/*
	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
	 * The memory offset size follows the address-size of the instruction.
	 */
	n = vie->addrsize;
	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));

	u.u64 = 0;
	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}
	vie->displacement = u.u64;
	return (0);
}

/*
 * Verify that the 'guest linear address' provided as collateral of the nested
 * page table fault matches with our instruction decoding.
 */
int
vie_verify_gla(struct vie *vie, struct vm *vm, int cpuid, uint64_t gla)
{
	int error;
	uint64_t base, segbase, idx, gla2;
	enum vm_reg_name seg;
	struct seg_desc desc;

	ASSERT((vie->status & VIES_INST_DECODE) != 0);

	/*
	 * If there was no valid GLA context with the exit, or the decoded
	 * instruction acts on more than one address, verification is done.
	 */
	if (gla == VIE_INVALID_GLA ||
	    (vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) != 0) {
		return (0);
	}

	base = 0;
	if (vie->base_register != VM_REG_LAST) {
		error = vm_get_register(vm, cpuid, vie->base_register, &base);
		if (error) {
			printf("verify_gla: error %d getting base reg %d\n",
			    error, vie->base_register);
			return (-1);
		}

		/*
		 * RIP-relative addressing starts from the following
		 * instruction
		 */
		if (vie->base_register == VM_REG_GUEST_RIP)
			base += vie->num_processed;
	}

	idx = 0;
	if (vie->index_register != VM_REG_LAST) {
		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
		if (error) {
			printf("verify_gla: error %d getting index reg %d\n",
			    error, vie->index_register);
			return (-1);
		}
	}

	/*
	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
	 *
	 * In 64-bit mode, segmentation is generally (but not
	 * completely) disabled.  The exceptions are the FS and GS
	 * segments.
	 *
	 * In legacy IA-32 mode, when the ESP or EBP register is used
	 * as the base, the SS segment is the default segment.  For
	 * other data references, except when relative to stack or
	 * string destination the DS segment is the default.  These
	 * can be overridden to allow other segments to be accessed.
	 */
	if (vie->segment_override) {
		seg = vie->segment_register;
	} else if (vie->base_register == VM_REG_GUEST_RSP ||
	    vie->base_register == VM_REG_GUEST_RBP) {
		seg = VM_REG_GUEST_SS;
	} else {
		seg = VM_REG_GUEST_DS;
	}
	if (vie->paging.cpu_mode == CPU_MODE_64BIT &&
	    seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		error = vm_get_seg_desc(vm, cpuid, seg, &desc);
		if (error) {
			printf("verify_gla: error %d getting segment"
			    " descriptor %d", error, vie->segment_register);
			return (-1);
		}
		segbase = desc.base;
	}

	gla2 = segbase + base + vie->scale * idx + vie->displacement;
	gla2 &= size2mask[vie->addrsize];
	if (gla != gla2) {
		printf("verify_gla mismatch: segbase(0x%0lx), "
		    "base(0x%0lx), scale(%d), index(0x%0lx), "
		    "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
		    segbase, base, vie->scale, idx, vie->displacement,
		    gla, gla2);
		return (-1);
	}

	return (0);
}

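/*
 * For illustration: with base = 0x1000, idx = 0x10, scale = 4 and
 * displacement = 8 in 64-bit mode with a zero segment base, the decoder
 * must have produced gla2 = 0x1048 for verification against the
 * hardware-supplied gla to succeed.
 */
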
int
vie_decode_instruction(struct vie *vie, struct vm *vm, int cpuid, int cs_d)
{
	enum vm_cpu_mode cpu_mode;

	if ((vie->status & VIES_INST_FETCH) == 0) {
		return (EINVAL);
	}

	cpu_mode = vie->paging.cpu_mode;

	if (decode_prefixes(vie, cpu_mode, cs_d))
		return (-1);

	if (decode_opcode(vie))
		return (-1);

	if (decode_modrm(vie, cpu_mode))
		return (-1);

	if (decode_sib(vie))
		return (-1);

	if (decode_displacement(vie))
		return (-1);

	if (decode_immediate(vie))
		return (-1);

	if (decode_moffset(vie))
		return (-1);

	vie->status |= VIES_INST_DECODE;

	return (0);
}