usr/src/uts/intel/io/vmm/vmm_instruction_emul.c (illumos-gate)
1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2012 Sandvine, Inc.
5 * Copyright (c) 2012 NetApp, Inc.
6 * All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
29 * $FreeBSD$
32 * This file and its contents are supplied under the terms of the
33 * Common Development and Distribution License ("CDDL"), version 1.0.
34 * You may only use this file in accordance with the terms of version
35 * 1.0 of the CDDL.
37 * A full copy of the text of the CDDL should have accompanied this
38 * source. A copy of the CDDL is also available via the Internet at
39 * http://www.illumos.org/license/CDDL.
41 * Copyright 2015 Pluribus Networks Inc.
42 * Copyright 2018 Joyent, Inc.
43 * Copyright 2021 Oxide Computer Company
44 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
47 #include <sys/cdefs.h>
48 __FBSDID("$FreeBSD$");
50 #include <sys/param.h>
51 #include <sys/pcpu.h>
52 #include <sys/systm.h>
53 #include <sys/proc.h>
55 #include <machine/vmparam.h>
56 #include <machine/vmm.h>
57 #include <sys/vmm_kernel.h>
58 #include <sys/vmm_vm.h>
60 #include <sys/vmm_instruction_emul.h>
61 #include <x86/psl.h>
62 #include <x86/specialreg.h>
64 #include "vmm_ioport.h"
66 enum vie_status {
67 VIES_INIT = (1U << 0),
68 VIES_MMIO = (1U << 1),
69 VIES_INOUT = (1U << 2),
70 VIES_OTHER = (1U << 3),
71 VIES_INST_FETCH = (1U << 4),
72 VIES_INST_DECODE = (1U << 5),
73 VIES_PENDING_MMIO = (1U << 6),
74 VIES_PENDING_INOUT = (1U << 7),
75 VIES_REPEAT = (1U << 8),
76 VIES_USER_FALLBACK = (1U << 9),
77 VIES_COMPLETE = (1U << 10),
80 /* State of request to perform emulated access (inout or MMIO) */
81 enum vie_req {
82 VR_NONE,
83 VR_PENDING,
84 VR_DONE,
87 struct vie_mmio {
88 uint64_t data;
89 uint64_t gpa;
90 uint8_t bytes;
91 enum vie_req state;
94 struct vie_op {
95 uint8_t op_byte; /* actual opcode byte */
96 uint8_t op_type; /* type of operation (e.g. MOV) */
97 uint16_t op_flags;
100 #define VIE_INST_SIZE 15
101 struct vie {
102 uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */
103 uint8_t num_valid; /* size of the instruction */
104 uint8_t num_processed;
106 uint8_t addrsize:4, opsize:4; /* address and operand sizes */
107 uint8_t rex_w:1, /* REX prefix */
108 rex_r:1,
109 rex_x:1,
110 rex_b:1,
111 rex_present:1,
112 repz_present:1, /* REP/REPE/REPZ prefix */
113 repnz_present:1, /* REPNE/REPNZ prefix */
114 opsize_override:1, /* Operand size override */
115 addrsize_override:1, /* Address size override */
116 segment_override:1; /* Segment override */
118 uint8_t mod:2, /* ModRM byte */
119 reg:4,
120 rm:4;
122 uint8_t ss:2, /* SIB byte */
123 vex_present:1, /* VEX prefixed */
124 vex_l:1, /* L bit */
125 index:4, /* SIB byte */
126 base:4; /* SIB byte */
128 uint8_t disp_bytes;
129 uint8_t imm_bytes;
131 uint8_t scale;
133 uint8_t vex_reg:4, /* vvvv: first source reg specifier */
134 vex_pp:2, /* pp */
135 _sparebits:2;
137 uint8_t _sparebytes[2];
139 int base_register; /* VM_REG_GUEST_xyz */
140 int index_register; /* VM_REG_GUEST_xyz */
141 int segment_register; /* VM_REG_GUEST_xyz */
143 int64_t displacement; /* optional addr displacement */
144 int64_t immediate; /* optional immediate operand */
146 struct vie_op op; /* opcode description */
148 enum vie_status status;
150 struct vm_guest_paging paging; /* guest paging state */
152 uint64_t mmio_gpa; /* faulting GPA */
153 struct vie_mmio mmio_req_read;
154 struct vie_mmio mmio_req_write;
156 struct vm_inout inout; /* active in/out op */
157 enum vie_req inout_req_state;
158 uint32_t inout_req_val; /* value from userspace */
162 /* struct vie_op.op_type */
163 enum {
164 VIE_OP_TYPE_NONE = 0,
165 VIE_OP_TYPE_MOV,
166 VIE_OP_TYPE_MOVSX,
167 VIE_OP_TYPE_MOVZX,
168 VIE_OP_TYPE_MOV_CR,
169 VIE_OP_TYPE_AND,
170 VIE_OP_TYPE_OR,
171 VIE_OP_TYPE_SUB,
172 VIE_OP_TYPE_TWO_BYTE,
173 VIE_OP_TYPE_PUSH,
174 VIE_OP_TYPE_CMP,
175 VIE_OP_TYPE_POP,
176 VIE_OP_TYPE_MOVS,
177 VIE_OP_TYPE_GROUP1,
178 VIE_OP_TYPE_STOS,
179 VIE_OP_TYPE_BITTEST,
180 VIE_OP_TYPE_TWOB_GRP15,
181 VIE_OP_TYPE_ADD,
182 VIE_OP_TYPE_TEST,
183 VIE_OP_TYPE_BEXTR,
184 VIE_OP_TYPE_CLTS,
185 VIE_OP_TYPE_MUL,
186 VIE_OP_TYPE_LAST
189 /* struct vie_op.op_flags */
190 #define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */
191 #define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
192 #define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */
193 #define VIE_OP_F_NO_MODRM (1 << 3)
194 #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)
195 #define VIE_OP_F_REG_REG (1 << 5) /* special-case for mov-cr */
197 static const struct vie_op three_byte_opcodes_0f38[256] = {
198 [0xF7] = {
199 .op_byte = 0xF7,
200 .op_type = VIE_OP_TYPE_BEXTR,
204 static const struct vie_op two_byte_opcodes[256] = {
205 [0x06] = {
206 .op_byte = 0x06,
207 .op_type = VIE_OP_TYPE_CLTS,
208 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
210 [0x20] = {
211 .op_byte = 0x20,
212 .op_type = VIE_OP_TYPE_MOV_CR,
213 .op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION
215 [0x22] = {
216 .op_byte = 0x22,
217 .op_type = VIE_OP_TYPE_MOV_CR,
218 .op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION
220 [0xAE] = {
221 .op_byte = 0xAE,
222 .op_type = VIE_OP_TYPE_TWOB_GRP15,
224 [0xAF] = {
225 .op_byte = 0xAF,
226 .op_type = VIE_OP_TYPE_MUL,
228 [0xB6] = {
229 .op_byte = 0xB6,
230 .op_type = VIE_OP_TYPE_MOVZX,
232 [0xB7] = {
233 .op_byte = 0xB7,
234 .op_type = VIE_OP_TYPE_MOVZX,
236 [0xBA] = {
237 .op_byte = 0xBA,
238 .op_type = VIE_OP_TYPE_BITTEST,
239 .op_flags = VIE_OP_F_IMM8,
241 [0xBE] = {
242 .op_byte = 0xBE,
243 .op_type = VIE_OP_TYPE_MOVSX,
247 static const struct vie_op one_byte_opcodes[256] = {
248 [0x03] = {
249 .op_byte = 0x03,
250 .op_type = VIE_OP_TYPE_ADD,
252 [0x0F] = {
253 .op_byte = 0x0F,
254 .op_type = VIE_OP_TYPE_TWO_BYTE
256 [0x0B] = {
257 .op_byte = 0x0B,
258 .op_type = VIE_OP_TYPE_OR,
260 [0x2B] = {
261 .op_byte = 0x2B,
262 .op_type = VIE_OP_TYPE_SUB,
264 [0x39] = {
265 .op_byte = 0x39,
266 .op_type = VIE_OP_TYPE_CMP,
268 [0x3B] = {
269 .op_byte = 0x3B,
270 .op_type = VIE_OP_TYPE_CMP,
272 [0x88] = {
273 .op_byte = 0x88,
274 .op_type = VIE_OP_TYPE_MOV,
276 [0x89] = {
277 .op_byte = 0x89,
278 .op_type = VIE_OP_TYPE_MOV,
280 [0x8A] = {
281 .op_byte = 0x8A,
282 .op_type = VIE_OP_TYPE_MOV,
284 [0x8B] = {
285 .op_byte = 0x8B,
286 .op_type = VIE_OP_TYPE_MOV,
288 [0xA1] = {
289 .op_byte = 0xA1,
290 .op_type = VIE_OP_TYPE_MOV,
291 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
293 [0xA3] = {
294 .op_byte = 0xA3,
295 .op_type = VIE_OP_TYPE_MOV,
296 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
298 [0xA4] = {
299 .op_byte = 0xA4,
300 .op_type = VIE_OP_TYPE_MOVS,
301 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
303 [0xA5] = {
304 .op_byte = 0xA5,
305 .op_type = VIE_OP_TYPE_MOVS,
306 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
308 [0xAA] = {
309 .op_byte = 0xAA,
310 .op_type = VIE_OP_TYPE_STOS,
311 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
313 [0xAB] = {
314 .op_byte = 0xAB,
315 .op_type = VIE_OP_TYPE_STOS,
316 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
318 [0xC6] = {
319 /* XXX Group 11 extended opcode - not just MOV */
320 .op_byte = 0xC6,
321 .op_type = VIE_OP_TYPE_MOV,
322 .op_flags = VIE_OP_F_IMM8,
324 [0xC7] = {
325 .op_byte = 0xC7,
326 .op_type = VIE_OP_TYPE_MOV,
327 .op_flags = VIE_OP_F_IMM,
329 [0x23] = {
330 .op_byte = 0x23,
331 .op_type = VIE_OP_TYPE_AND,
333 [0x80] = {
334 /* Group 1 extended opcode */
335 .op_byte = 0x80,
336 .op_type = VIE_OP_TYPE_GROUP1,
337 .op_flags = VIE_OP_F_IMM8,
339 [0x81] = {
340 /* Group 1 extended opcode */
341 .op_byte = 0x81,
342 .op_type = VIE_OP_TYPE_GROUP1,
343 .op_flags = VIE_OP_F_IMM,
345 [0x83] = {
346 /* Group 1 extended opcode */
347 .op_byte = 0x83,
348 .op_type = VIE_OP_TYPE_GROUP1,
349 .op_flags = VIE_OP_F_IMM8,
351 [0x8F] = {
352 /* XXX Group 1A extended opcode - not just POP */
353 .op_byte = 0x8F,
354 .op_type = VIE_OP_TYPE_POP,
356 [0xF6] = {
357 /* XXX Group 3 extended opcode - not just TEST */
358 .op_byte = 0xF6,
359 .op_type = VIE_OP_TYPE_TEST,
360 .op_flags = VIE_OP_F_IMM8,
362 [0xF7] = {
363 /* XXX Group 3 extended opcode - not just TEST */
364 .op_byte = 0xF7,
365 .op_type = VIE_OP_TYPE_TEST,
366 .op_flags = VIE_OP_F_IMM,
368 [0xFF] = {
369 /* XXX Group 5 extended opcode - not just PUSH */
370 .op_byte = 0xFF,
371 .op_type = VIE_OP_TYPE_PUSH,
375 /* struct vie.mod */
376 #define VIE_MOD_INDIRECT 0
377 #define VIE_MOD_INDIRECT_DISP8 1
378 #define VIE_MOD_INDIRECT_DISP32 2
379 #define VIE_MOD_DIRECT 3
381 /* struct vie.rm */
382 #define VIE_RM_SIB 4
383 #define VIE_RM_DISP32 5
385 #define GB (1024 * 1024 * 1024)
389 * Paging defines, previously pulled in from machine/pmap.h
391 #define PG_V (1 << 0) /* Present */
392 #define PG_RW (1 << 1) /* Read/Write */
393 #define PG_U (1 << 2) /* User/Supervisor */
394 #define PG_A (1 << 5) /* Accessed */
395 #define PG_M (1 << 6) /* Dirty */
396 #define PG_PS (1 << 7) /* Largepage */
399  * Paging exception defines, previously pulled in from machine/pmap.h
401 #define PGEX_P (1 << 0) /* Non-present/Protection */
402 #define PGEX_W (1 << 1) /* Read/Write */
403 #define PGEX_U (1 << 2) /* User/Supervisor */
404 #define PGEX_RSV (1 << 3) /* (Non-)Reserved */
405 #define PGEX_I (1 << 4) /* Instruction */
408 static enum vm_reg_name gpr_map[16] = {
409 VM_REG_GUEST_RAX,
410 VM_REG_GUEST_RCX,
411 VM_REG_GUEST_RDX,
412 VM_REG_GUEST_RBX,
413 VM_REG_GUEST_RSP,
414 VM_REG_GUEST_RBP,
415 VM_REG_GUEST_RSI,
416 VM_REG_GUEST_RDI,
417 VM_REG_GUEST_R8,
418 VM_REG_GUEST_R9,
419 VM_REG_GUEST_R10,
420 VM_REG_GUEST_R11,
421 VM_REG_GUEST_R12,
422 VM_REG_GUEST_R13,
423 VM_REG_GUEST_R14,
424 VM_REG_GUEST_R15
427 static const char *gpr_name_map[][16] = {
428 [1] = {
429 "a[hl]", "c[hl]", "d[hl]", "b[hl]", "spl", "bpl", "sil", "dil",
430 "r8b", "r9b", "r10b", "r11b", "r12b", "r13b", "r14b", "r15b",
432 [2] = {
433 "ax", "cx", "dx", "bx", "sp", "bp", "si", "di",
434 "r8w", "r9w", "r10w", "r11w", "r12w", "r13w", "r14w", "r15w",
436 [4] = {
437 "eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi",
438 "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d",
440 [8] = {
441 "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
442 "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
446 static enum vm_reg_name cr_map[16] = {
447 VM_REG_GUEST_CR0,
448 VM_REG_LAST,
449 VM_REG_GUEST_CR2,
450 VM_REG_GUEST_CR3,
451 VM_REG_GUEST_CR4,
452 VM_REG_LAST,
453 VM_REG_LAST,
454 VM_REG_LAST,
455 VM_REG_LAST,
456 VM_REG_LAST,
457 VM_REG_LAST,
458 VM_REG_LAST,
459 VM_REG_LAST,
460 VM_REG_LAST,
461 VM_REG_LAST,
462 VM_REG_LAST
465 static uint64_t size2mask[] = {
466 [1] = 0xff,
467 [2] = 0xffff,
468 [4] = 0xffffffff,
469 [8] = 0xffffffffffffffff,
473 static int vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid,
474 uint64_t gpa, uint64_t *rval, int bytes);
475 static int vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid,
476 uint64_t gpa, uint64_t wval, int bytes);
477 static int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
478 struct seg_desc *desc, uint64_t offset, int length, int addrsize,
479 int prot, uint64_t *gla);
480 static int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla);
481 static int vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf,
482 uint64_t gla);
483 static uint64_t vie_size2mask(int size);
485 struct vie *
486 vie_alloc()
488 return (kmem_zalloc(sizeof (struct vie), KM_SLEEP));
491 void
492 vie_free(struct vie *vie)
494 kmem_free(vie, sizeof (struct vie));
497 enum vm_reg_name
498 vie_regnum_map(uint8_t regnum)
500 VERIFY3U(regnum, <, 16);
501 return (gpr_map[regnum]);
504 const char *
505 vie_regnum_name(uint8_t regnum, uint8_t size)
507 VERIFY3U(regnum, <, 16);
508 VERIFY(size == 1 || size == 2 || size == 4 || size == 8);
509 return (gpr_name_map[size][regnum]);
512 static void
513 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
515 *lhbr = 0;
516 *reg = gpr_map[vie->reg];
519 * 64-bit mode imposes limitations on accessing legacy high byte
520 * registers (lhbr).
522 * The legacy high-byte registers cannot be addressed if the REX
523 * prefix is present. In this case the values 4, 5, 6 and 7 of the
524 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
526 * If the REX prefix is not present then the values 4, 5, 6 and 7
527 * of the 'ModRM:reg' field address the legacy high-byte registers,
528 * %ah, %ch, %dh and %bh respectively.
530 if (!vie->rex_present) {
531 if (vie->reg & 0x4) {
532 *lhbr = 1;
533 *reg = gpr_map[vie->reg & 0x3];
538 static int
539 vie_read_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t *rval)
541 uint64_t val;
542 int error, lhbr;
543 enum vm_reg_name reg;
545 vie_calc_bytereg(vie, &reg, &lhbr);
546 error = vm_get_register(vm, vcpuid, reg, &val);
549 * To obtain the value of a legacy high byte register shift the
550 * base register right by 8 bits (%ah = %rax >> 8).
552 if (lhbr)
553 *rval = val >> 8;
554 else
555 *rval = val;
556 return (error);
559 static int
560 vie_write_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t byte)
562 uint64_t origval, val, mask;
563 int error, lhbr;
564 enum vm_reg_name reg;
566 vie_calc_bytereg(vie, &reg, &lhbr);
567 error = vm_get_register(vm, vcpuid, reg, &origval);
568 if (error == 0) {
569 val = byte;
570 mask = 0xff;
571 if (lhbr) {
573 * Shift left by 8 to store 'byte' in a legacy high
574 * byte register.
576 val <<= 8;
577 mask <<= 8;
579 val |= origval & ~mask;
580 error = vm_set_register(vm, vcpuid, reg, val);
582 return (error);
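/*
 * A minimal sketch of the byte-register arithmetic used above (illustrative
 * only; the guard macro is hypothetical and never defined): the legacy high
 * byte registers %ah/%ch/%dh/%bh occupy bits 15:8 of their parent GPR, so a
 * read shifts right by 8 and a write merges into that byte lane.
 */
#ifdef	VIE_SKETCH_BYTEREG
static uint8_t
vie_sketch_read_high_byte(uint64_t gpr_val)
{
	/* e.g. %ch is bits 15:8 of %rcx */
	return ((gpr_val >> 8) & 0xff);
}

static uint64_t
vie_sketch_write_high_byte(uint64_t gpr_val, uint8_t byte)
{
	/* replace bits 15:8, leaving the rest of the register untouched */
	return ((gpr_val & ~(uint64_t)0xff00) | ((uint64_t)byte << 8));
}
#endif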
585 static int
586 vie_update_register(struct vm *vm, int vcpuid, enum vm_reg_name reg,
587 uint64_t val, int size)
589 int error;
590 uint64_t origval;
592 switch (size) {
593 case 1:
594 case 2:
595 error = vm_get_register(vm, vcpuid, reg, &origval);
596 if (error)
597 return (error);
598 val &= size2mask[size];
599 val |= origval & ~size2mask[size];
600 break;
601 case 4:
602 val &= 0xffffffffUL;
603 break;
604 case 8:
605 break;
606 default:
607 return (EINVAL);
610 error = vm_set_register(vm, vcpuid, reg, val);
611 return (error);
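/*
 * Worked example of the size handling above, which mirrors x86-64
 * partial-register semantics: starting from origval = 0x1122334455667788
 * with val = 0xAAAA,
 *	size 2:	the low word is merged, giving 0x112233445566AAAA;
 *	size 4:	the value is written zero-extended, giving 0x000000000000AAAA;
 *	size 8:	the full 64-bit value is written unchanged.
 */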
614 static int
615 vie_repeat(struct vie *vie)
617 vie->status |= VIES_REPEAT;
620 * Clear out any cached operation values so the repeated instruction can
621 * begin without using that stale state. Other state, such as the
622  * decoding results, is kept around as it will not vary between
623 * iterations of a rep-prefixed instruction.
625 if ((vie->status & VIES_MMIO) != 0) {
626 vie->mmio_req_read.state = VR_NONE;
627 vie->mmio_req_write.state = VR_NONE;
628 } else if ((vie->status & VIES_INOUT) != 0) {
629 vie->inout_req_state = VR_NONE;
630 } else {
631 panic("unexpected emulation state");
634 return (EAGAIN);
637 #define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
640 * Return the status flags that would result from doing (x - y).
642 /* BEGIN CSTYLED */
643 #define GETCC(sz) \
644 static ulong_t \
645 getcc##sz(uint##sz##_t x, uint##sz##_t y) \
647 ulong_t rflags; \
649 __asm __volatile("sub %2,%1; pushfq; popq %0" : \
650 "=r" (rflags), "+r" (x) : "m" (y)); \
651 return (rflags); \
652 } struct __hack
653 /* END CSTYLED */
655 GETCC(8);
656 GETCC(16);
657 GETCC(32);
658 GETCC(64);
660 static ulong_t
661 getcc(int opsize, uint64_t x, uint64_t y)
663 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
664 ("getcc: invalid operand size %d", opsize));
666 if (opsize == 1)
667 return (getcc8(x, y));
668 else if (opsize == 2)
669 return (getcc16(x, y));
670 else if (opsize == 4)
671 return (getcc32(x, y));
672 else
673 return (getcc64(x, y));
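/*
 * For example, getcc(1, 5, 5) performs the subtraction natively and returns
 * flags with PSL_Z set, while getcc(1, 4, 5) borrows and returns flags with
 * PSL_C and PSL_N set.  Callers merge only the relevant status bits (e.g.
 * RFLAGS_STATUS_BITS) into the guest %rflags.
 */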
677 * Macro creation of functions getaddflags{8,16,32,64}
679 /* BEGIN CSTYLED */
680 #define GETADDFLAGS(sz) \
681 static ulong_t \
682 getaddflags##sz(uint##sz##_t x, uint##sz##_t y) \
684 ulong_t rflags; \
686 __asm __volatile("add %2,%1; pushfq; popq %0" : \
687 "=r" (rflags), "+r" (x) : "m" (y)); \
688 return (rflags); \
689 } struct __hack
690 /* END CSTYLED */
692 GETADDFLAGS(8);
693 GETADDFLAGS(16);
694 GETADDFLAGS(32);
695 GETADDFLAGS(64);
697 static ulong_t
698 getaddflags(int opsize, uint64_t x, uint64_t y)
700 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
701 ("getaddflags: invalid operand size %d", opsize));
703 if (opsize == 1)
704 return (getaddflags8(x, y));
705 else if (opsize == 2)
706 return (getaddflags16(x, y));
707 else if (opsize == 4)
708 return (getaddflags32(x, y));
709 else
710 return (getaddflags64(x, y));
714 * Macro creation of functions getimulflags{16,32,64}
716 /* BEGIN CSTYLED */
717 #define GETIMULFLAGS(sz) \
718 static ulong_t \
719 getimulflags##sz(uint##sz##_t x, uint##sz##_t y) \
721 ulong_t rflags; \
723 __asm __volatile("imul %2,%1; pushfq; popq %0" : \
724 "=r" (rflags), "+r" (x) : "m" (y)); \
725 return (rflags); \
726 } struct __hack
727 /* END CSTYLED */
729 GETIMULFLAGS(16);
730 GETIMULFLAGS(32);
731 GETIMULFLAGS(64);
733 static ulong_t
734 getimulflags(int opsize, uint64_t x, uint64_t y)
736 KASSERT(opsize == 2 || opsize == 4 || opsize == 8,
737 ("getimulflags: invalid operand size %d", opsize));
739 if (opsize == 2)
740 return (getimulflags16(x, y));
741 else if (opsize == 4)
742 return (getimulflags32(x, y));
743 else
744 return (getimulflags64(x, y));
748 * Return the status flags that would result from doing (x & y).
750 /* BEGIN CSTYLED */
751 #define GETANDFLAGS(sz) \
752 static ulong_t \
753 getandflags##sz(uint##sz##_t x, uint##sz##_t y) \
755 ulong_t rflags; \
757 __asm __volatile("and %2,%1; pushfq; popq %0" : \
758 "=r" (rflags), "+r" (x) : "m" (y)); \
759 return (rflags); \
760 } struct __hack
761 /* END CSTYLED */
763 GETANDFLAGS(8);
764 GETANDFLAGS(16);
765 GETANDFLAGS(32);
766 GETANDFLAGS(64);
768 static ulong_t
769 getandflags(int opsize, uint64_t x, uint64_t y)
771 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
772 ("getandflags: invalid operand size %d", opsize));
774 if (opsize == 1)
775 return (getandflags8(x, y));
776 else if (opsize == 2)
777 return (getandflags16(x, y));
778 else if (opsize == 4)
779 return (getandflags32(x, y));
780 else
781 return (getandflags64(x, y));
784 static int
785 vie_emulate_mov_cr(struct vie *vie, struct vm *vm, int vcpuid)
787 uint64_t val;
788 int err;
789 enum vm_reg_name gpr = gpr_map[vie->rm];
790 enum vm_reg_name cr = cr_map[vie->reg];
792 uint_t size = 4;
793 if (vie->paging.cpu_mode == CPU_MODE_64BIT) {
794 size = 8;
797 switch (vie->op.op_byte) {
798 case 0x20:
800 * MOV control register (ModRM:reg) to reg (ModRM:r/m)
801 * 20/r: mov r32, CR0-CR7
802 * 20/r: mov r64, CR0-CR7
803 * REX.R + 20/0: mov r64, CR8
805 if (vie->paging.cpl != 0) {
806 vm_inject_gp(vm, vcpuid);
807 vie->num_processed = 0;
808 return (0);
810 err = vm_get_register(vm, vcpuid, cr, &val);
811 if (err != 0) {
812 /* #UD for access to non-existent CRs */
813 vm_inject_ud(vm, vcpuid);
814 vie->num_processed = 0;
815 return (0);
817 err = vie_update_register(vm, vcpuid, gpr, val, size);
818 break;
819 case 0x22: {
821 * MOV reg (ModRM:r/m) to control register (ModRM:reg)
822 * 22/r: mov CR0-CR7, r32
823 * 22/r: mov CR0-CR7, r64
824 * REX.R + 22/0: mov CR8, r64
826 uint64_t old, diff;
828 if (vie->paging.cpl != 0) {
829 vm_inject_gp(vm, vcpuid);
830 vie->num_processed = 0;
831 return (0);
833 err = vm_get_register(vm, vcpuid, cr, &old);
834 if (err != 0) {
835 /* #UD for access to non-existent CRs */
836 vm_inject_ud(vm, vcpuid);
837 vie->num_processed = 0;
838 return (0);
840 err = vm_get_register(vm, vcpuid, gpr, &val);
841 VERIFY0(err);
842 val &= size2mask[size];
843 diff = old ^ val;
845 switch (cr) {
846 case VM_REG_GUEST_CR0:
847 if ((diff & CR0_PG) != 0) {
848 uint64_t efer;
850 err = vm_get_register(vm, vcpuid,
851 VM_REG_GUEST_EFER, &efer);
852 VERIFY0(err);
854 /* Keep the long-mode state in EFER in sync */
855 if ((val & CR0_PG) != 0 &&
856 (efer & EFER_LME) != 0) {
857 efer |= EFER_LMA;
859 if ((val & CR0_PG) == 0 &&
860 (efer & EFER_LME) != 0) {
861 efer &= ~EFER_LMA;
864 err = vm_set_register(vm, vcpuid,
865 VM_REG_GUEST_EFER, efer);
866 VERIFY0(err);
868 /* TODO: enforce more of the #GP checks */
869 err = vm_set_register(vm, vcpuid, cr, val);
870 VERIFY0(err);
871 break;
872 case VM_REG_GUEST_CR2:
873 case VM_REG_GUEST_CR3:
874 case VM_REG_GUEST_CR4:
875 /* TODO: enforce more of the #GP checks */
876 err = vm_set_register(vm, vcpuid, cr, val);
877 break;
878 default:
879 /* The cr_map mapping should prevent this */
880 panic("invalid cr %d", cr);
882 break;
884 default:
885 return (EINVAL);
887 return (err);
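/*
 * A minimal sketch of the CR0.PG/EFER.LMA rule enforced above (illustrative
 * only; the guard macro is hypothetical): when the guest has EFER.LME set,
 * LMA must track PG so the long-mode state stays consistent across a trapped
 * "mov %cr0" that toggles paging.
 */
#ifdef	VIE_SKETCH_CR0
static uint64_t
vie_sketch_sync_lma(uint64_t cr0, uint64_t efer)
{
	if ((efer & EFER_LME) != 0) {
		if ((cr0 & CR0_PG) != 0)
			efer |= EFER_LMA;
		else
			efer &= ~EFER_LMA;
	}
	return (efer);
}
#endif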
890 static int
891 vie_emulate_mov(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
893 int error, size;
894 enum vm_reg_name reg;
895 uint8_t byte;
896 uint64_t val;
898 size = vie->opsize;
899 error = EINVAL;
901 switch (vie->op.op_byte) {
902 case 0x88:
904 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
905 * 88/r: mov r/m8, r8
906 * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
908 size = 1; /* override for byte operation */
909 error = vie_read_bytereg(vie, vm, vcpuid, &byte);
910 if (error == 0) {
911 error = vie_mmio_write(vie, vm, vcpuid, gpa, byte,
912 size);
914 break;
915 case 0x89:
917 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
918 * 89/r: mov r/m16, r16
919 * 89/r: mov r/m32, r32
920 * REX.W + 89/r mov r/m64, r64
922 reg = gpr_map[vie->reg];
923 error = vm_get_register(vm, vcpuid, reg, &val);
924 if (error == 0) {
925 val &= size2mask[size];
926 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
928 break;
929 case 0x8A:
931 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
932 * 8A/r: mov r8, r/m8
933 * REX + 8A/r: mov r8, r/m8
935 size = 1; /* override for byte operation */
936 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
937 if (error == 0)
938 error = vie_write_bytereg(vie, vm, vcpuid, val);
939 break;
940 case 0x8B:
942 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
943 * 8B/r: mov r16, r/m16
944 * 8B/r: mov r32, r/m32
945 * REX.W 8B/r: mov r64, r/m64
947 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
948 if (error == 0) {
949 reg = gpr_map[vie->reg];
950 error = vie_update_register(vm, vcpuid, reg, val, size);
952 break;
953 case 0xA1:
955 * MOV from seg:moffset to AX/EAX/RAX
956 * A1: mov AX, moffs16
957 * A1: mov EAX, moffs32
958 * REX.W + A1: mov RAX, moffs64
960 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
961 if (error == 0) {
962 reg = VM_REG_GUEST_RAX;
963 error = vie_update_register(vm, vcpuid, reg, val, size);
965 break;
966 case 0xA3:
968 * MOV from AX/EAX/RAX to seg:moffset
969 * A3: mov moffs16, AX
970 * A3: mov moffs32, EAX
971 * REX.W + A3: mov moffs64, RAX
973 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
974 if (error == 0) {
975 val &= size2mask[size];
976 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
978 break;
979 case 0xC6:
981 * MOV from imm8 to mem (ModRM:r/m)
982 * C6/0 mov r/m8, imm8
983 * REX + C6/0 mov r/m8, imm8
985 size = 1; /* override for byte operation */
986 val = vie->immediate;
987 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
988 break;
989 case 0xC7:
991 * MOV from imm16/imm32 to mem (ModRM:r/m)
992 * C7/0 mov r/m16, imm16
993 * C7/0 mov r/m32, imm32
994 * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
996 val = vie->immediate & size2mask[size];
997 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
998 break;
999 default:
1000 break;
1003 return (error);
1006 static int
1007 vie_emulate_movx(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1009 int error, size;
1010 enum vm_reg_name reg;
1011 uint64_t val;
1013 size = vie->opsize;
1014 error = EINVAL;
1016 switch (vie->op.op_byte) {
1017 case 0xB6:
1019 * MOV and zero extend byte from mem (ModRM:r/m) to
1020 * reg (ModRM:reg).
1022 * 0F B6/r movzx r16, r/m8
1023 * 0F B6/r movzx r32, r/m8
1024 * REX.W + 0F B6/r movzx r64, r/m8
1027 /* get the first operand */
1028 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1);
1029 if (error)
1030 break;
1032 /* get the second operand */
1033 reg = gpr_map[vie->reg];
1035 /* zero-extend byte */
1036 val = (uint8_t)val;
1038 /* write the result */
1039 error = vie_update_register(vm, vcpuid, reg, val, size);
1040 break;
1041 case 0xB7:
1043 * MOV and zero extend word from mem (ModRM:r/m) to
1044 * reg (ModRM:reg).
1046 * 0F B7/r movzx r32, r/m16
1047 * REX.W + 0F B7/r movzx r64, r/m16
1049 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 2);
1050 if (error)
1051 return (error);
1053 reg = gpr_map[vie->reg];
1055 /* zero-extend word */
1056 val = (uint16_t)val;
1058 error = vie_update_register(vm, vcpuid, reg, val, size);
1059 break;
1060 case 0xBE:
1062 * MOV and sign extend byte from mem (ModRM:r/m) to
1063 * reg (ModRM:reg).
1065 * 0F BE/r movsx r16, r/m8
1066 * 0F BE/r movsx r32, r/m8
1067 * REX.W + 0F BE/r movsx r64, r/m8
1070 /* get the first operand */
1071 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1);
1072 if (error)
1073 break;
1075 /* get the second operand */
1076 reg = gpr_map[vie->reg];
1078 /* sign extend byte */
1079 val = (int8_t)val;
1081 /* write the result */
1082 error = vie_update_register(vm, vcpuid, reg, val, size);
1083 break;
1084 default:
1085 break;
1087 return (error);
1091 * Helper function to calculate and validate a linear address.
1093 static int
1094 vie_get_gla(struct vie *vie, struct vm *vm, int vcpuid, int opsize,
1095 int addrsize, int prot, enum vm_reg_name seg, enum vm_reg_name gpr,
1096 uint64_t *gla)
1098 struct seg_desc desc;
1099 uint64_t cr0, val, rflags;
1100 int error;
1101 struct vm_guest_paging *paging;
1103 paging = &vie->paging;
1105 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
1106 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1108 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1109 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1111 error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
1112 KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
1113 __func__, error, seg));
1115 error = vm_get_register(vm, vcpuid, gpr, &val);
1116 KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
1117 error, gpr));
1119 if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
1120 addrsize, prot, gla)) {
1121 if (seg == VM_REG_GUEST_SS)
1122 vm_inject_ss(vm, vcpuid, 0);
1123 else
1124 vm_inject_gp(vm, vcpuid);
1125 return (-1);
1128 if (vie_canonical_check(paging->cpu_mode, *gla)) {
1129 if (seg == VM_REG_GUEST_SS)
1130 vm_inject_ss(vm, vcpuid, 0);
1131 else
1132 vm_inject_gp(vm, vcpuid);
1133 return (-1);
1136 if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
1137 vm_inject_ac(vm, vcpuid, 0);
1138 return (-1);
1141 return (0);
1144 static int
1145 vie_emulate_movs(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1147 struct vm_copyinfo copyinfo[2];
1148 uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
1149 uint64_t rcx, rdi, rsi, rflags;
1150 int error, fault, opsize, seg, repeat;
1151 struct vm_guest_paging *paging;
1153 opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
1154 val = 0;
1155 error = 0;
1156 paging = &vie->paging;
1159 * XXX although the MOVS instruction is only supposed to be used with
1160 * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
1162 * Empirically the "repnz" prefix has identical behavior to "rep"
1163 * and the zero flag does not make a difference.
1165 repeat = vie->repz_present | vie->repnz_present;
1167 if (repeat) {
1168 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
1169 KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
1172 * The count register is %rcx, %ecx or %cx depending on the
1173 * address size of the instruction.
1175 if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
1176 error = 0;
1177 goto done;
1182 * Source Destination Comments
1183 * --------------------------------------------
1184 * (1) memory memory n/a
1185 * (2) memory mmio emulated
1186 * (3) mmio memory emulated
1187 * (4) mmio mmio emulated
1189 * At this point we don't have sufficient information to distinguish
1190 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
1191 * out because it will succeed only when operating on regular memory.
1193 * XXX the emulation doesn't properly handle the case where 'gpa'
1194 * is straddling the boundary between the normal memory and MMIO.
1197 seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
1198 if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize, PROT_READ, seg,
1199 VM_REG_GUEST_RSI, &srcaddr) != 0) {
1200 goto done;
1203 error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
1204 copyinfo, nitems(copyinfo), &fault);
1205 if (error == 0) {
1206 if (fault)
1207 goto done; /* Resume guest to handle fault */
1210 * case (2): read from system memory and write to mmio.
1212 vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
1213 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1214 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize);
1215 if (error)
1216 goto done;
1217 } else {
1219 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
1220 * if 'srcaddr' is in the mmio space.
1223 if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize,
1224 PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI,
1225 &dstaddr) != 0) {
1226 goto done;
1229 error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
1230 PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
1231 if (error == 0) {
1232 if (fault)
1233 goto done; /* Resume guest to handle fault */
1236 * case (3): read from MMIO and write to system memory.
1238 * A MMIO read can have side-effects so we
1239 * commit to it only after vm_copy_setup() is
1240 * successful. If a page-fault needs to be
1241 * injected into the guest then it will happen
1242 * before the MMIO read is attempted.
1244 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val,
1245 opsize);
1247 if (error == 0) {
1248 vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
1251 * Regardless of whether the MMIO read was successful or
1252 * not, the copy resources must be cleaned up.
1254 vm_copy_teardown(vm, vcpuid, copyinfo,
1255 nitems(copyinfo));
1256 if (error != 0) {
1257 goto done;
1259 } else {
1261 * Case (4): read from and write to mmio.
1263 * Commit to the MMIO read/write (with potential
1264 * side-effects) only after we are sure that the
1265 * instruction is not going to be restarted due
1266 * to address translation faults.
1268 error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
1269 PROT_READ, &srcgpa, &fault);
1270 if (error || fault)
1271 goto done;
1273 error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
1274 PROT_WRITE, &dstgpa, &fault);
1275 if (error || fault)
1276 goto done;
1278 error = vie_mmio_read(vie, vm, vcpuid, srcgpa, &val,
1279 opsize);
1280 if (error)
1281 goto done;
1283 error = vie_mmio_write(vie, vm, vcpuid, dstgpa, val,
1284 opsize);
1285 if (error)
1286 goto done;
1290 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
1291 KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
1293 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
1294 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
1296 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1297 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1299 if (rflags & PSL_D) {
1300 rsi -= opsize;
1301 rdi -= opsize;
1302 } else {
1303 rsi += opsize;
1304 rdi += opsize;
1307 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
1308 vie->addrsize);
1309 KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
1311 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
1312 vie->addrsize);
1313 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
1315 if (repeat) {
1316 rcx = rcx - 1;
1317 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
1318 rcx, vie->addrsize);
1319 KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
1322 * Repeat the instruction if the count register is not zero.
1324 if ((rcx & vie_size2mask(vie->addrsize)) != 0)
1325 return (vie_repeat(vie));
1327 done:
1328 return (error);
1331 static int
1332 vie_emulate_stos(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1334 int error, opsize, repeat;
1335 uint64_t val;
1336 uint64_t rcx, rdi, rflags;
1338 opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
1339 repeat = vie->repz_present | vie->repnz_present;
1341 if (repeat) {
1342 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
1343 KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
1346 * The count register is %rcx, %ecx or %cx depending on the
1347 * address size of the instruction.
1349 if ((rcx & vie_size2mask(vie->addrsize)) == 0)
1350 return (0);
1353 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
1354 KASSERT(!error, ("%s: error %d getting rax", __func__, error));
1356 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize);
1357 if (error)
1358 return (error);
1360 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
1361 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
1363 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1364 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1366 if (rflags & PSL_D)
1367 rdi -= opsize;
1368 else
1369 rdi += opsize;
1371 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
1372 vie->addrsize);
1373 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
1375 if (repeat) {
1376 rcx = rcx - 1;
1377 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
1378 rcx, vie->addrsize);
1379 KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
1382 * Repeat the instruction if the count register is not zero.
1384 if ((rcx & vie_size2mask(vie->addrsize)) != 0)
1385 return (vie_repeat(vie));
1388 return (0);
1391 static int
1392 vie_emulate_and(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1394 int error, size;
1395 enum vm_reg_name reg;
1396 uint64_t result, rflags, rflags2, val1, val2;
1398 size = vie->opsize;
1399 error = EINVAL;
1401 switch (vie->op.op_byte) {
1402 case 0x23:
1404 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
1405 * result in reg.
1407 * 23/r and r16, r/m16
1408 * 23/r and r32, r/m32
1409 * REX.W + 23/r and r64, r/m64
1412 /* get the first operand */
1413 reg = gpr_map[vie->reg];
1414 error = vm_get_register(vm, vcpuid, reg, &val1);
1415 if (error)
1416 break;
1418 /* get the second operand */
1419 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1420 if (error)
1421 break;
1423 /* perform the operation and write the result */
1424 result = val1 & val2;
1425 error = vie_update_register(vm, vcpuid, reg, result, size);
1426 break;
1427 case 0x81:
1428 case 0x83:
1430 * AND mem (ModRM:r/m) with immediate and store the
1431 * result in mem.
1433 * 81 /4 and r/m16, imm16
1434 * 81 /4 and r/m32, imm32
1435 * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64
1437 * 83 /4 and r/m16, imm8 sign-extended to 16
1438 * 83 /4 and r/m32, imm8 sign-extended to 32
1439 * REX.W + 83/4 and r/m64, imm8 sign-extended to 64
1442 /* get the first operand */
1443 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size);
1444 if (error)
1445 break;
1448 * perform the operation with the pre-fetched immediate
1449 * operand and write the result
1451 result = val1 & vie->immediate;
1452 error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size);
1453 break;
1454 default:
1455 break;
1457 if (error)
1458 return (error);
1460 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1461 if (error)
1462 return (error);
1465 * OF and CF are cleared; the SF, ZF and PF flags are set according
1466 * to the result; AF is undefined.
1468 * The updated status flags are obtained by subtracting 0 from 'result'.
1470 rflags2 = getcc(size, result, 0);
1471 rflags &= ~RFLAGS_STATUS_BITS;
1472 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1474 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1475 return (error);
1478 static int
1479 vie_emulate_or(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1481 int error, size;
1482 enum vm_reg_name reg;
1483 uint64_t result, rflags, rflags2, val1, val2;
1485 size = vie->opsize;
1486 error = EINVAL;
1488 switch (vie->op.op_byte) {
1489 case 0x0B:
1491 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
1492 * result in reg.
1494 * 0b/r or r16, r/m16
1495 * 0b/r or r32, r/m32
1496 * REX.W + 0b/r or r64, r/m64
1499 /* get the first operand */
1500 reg = gpr_map[vie->reg];
1501 error = vm_get_register(vm, vcpuid, reg, &val1);
1502 if (error)
1503 break;
1505 /* get the second operand */
1506 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1507 if (error)
1508 break;
1510 /* perform the operation and write the result */
1511 result = val1 | val2;
1512 error = vie_update_register(vm, vcpuid, reg, result, size);
1513 break;
1514 case 0x81:
1515 case 0x83:
1517 * OR mem (ModRM:r/m) with immediate and store the
1518 * result in mem.
1520 * 81 /1 or r/m16, imm16
1521 * 81 /1 or r/m32, imm32
1522 * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64
1524 * 83 /1 or r/m16, imm8 sign-extended to 16
1525 * 83 /1 or r/m32, imm8 sign-extended to 32
1526 * REX.W + 83/1 or r/m64, imm8 sign-extended to 64
1529 /* get the first operand */
1530 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size);
1531 if (error)
1532 break;
1535 * perform the operation with the pre-fetched immediate
1536 * operand and write the result
1538 result = val1 | vie->immediate;
1539 error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size);
1540 break;
1541 default:
1542 break;
1544 if (error)
1545 return (error);
1547 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1548 if (error)
1549 return (error);
1552 * OF and CF are cleared; the SF, ZF and PF flags are set according
1553 * to the result; AF is undefined.
1555 * The updated status flags are obtained by subtracting 0 from 'result'.
1557 rflags2 = getcc(size, result, 0);
1558 rflags &= ~RFLAGS_STATUS_BITS;
1559 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1561 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1562 return (error);
1565 static int
1566 vie_emulate_cmp(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1568 int error, size;
1569 uint64_t regop, memop, op1, op2, rflags, rflags2;
1570 enum vm_reg_name reg;
1572 size = vie->opsize;
1573 switch (vie->op.op_byte) {
1574 case 0x39:
1575 case 0x3B:
1577 * 39/r CMP r/m16, r16
1578 * 39/r CMP r/m32, r32
1579 * REX.W 39/r CMP r/m64, r64
1581 * 3B/r CMP r16, r/m16
1582 * 3B/r CMP r32, r/m32
1583 * REX.W + 3B/r CMP r64, r/m64
1585 * Compare the first operand with the second operand and
1586 * set status flags in EFLAGS register. The comparison is
1587 * performed by subtracting the second operand from the first
1588 * operand and then setting the status flags.
1591 /* Get the register operand */
1592 reg = gpr_map[vie->reg];
1593 error = vm_get_register(vm, vcpuid, reg, &regop);
1594 if (error)
1595 return (error);
1597 /* Get the memory operand */
1598 error = vie_mmio_read(vie, vm, vcpuid, gpa, &memop, size);
1599 if (error)
1600 return (error);
1602 if (vie->op.op_byte == 0x3B) {
1603 op1 = regop;
1604 op2 = memop;
1605 } else {
1606 op1 = memop;
1607 op2 = regop;
1609 rflags2 = getcc(size, op1, op2);
1610 break;
1611 case 0x80:
1612 case 0x81:
1613 case 0x83:
1615 * 80 /7 cmp r/m8, imm8
1616 * REX + 80 /7 cmp r/m8, imm8
1618 * 81 /7 cmp r/m16, imm16
1619 * 81 /7 cmp r/m32, imm32
1620 * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64
1622 * 83 /7 cmp r/m16, imm8 sign-extended to 16
1623 * 83 /7 cmp r/m32, imm8 sign-extended to 32
1624 * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64
1626 * Compare mem (ModRM:r/m) with immediate and set
1627 * status flags according to the results. The
1628 * comparison is performed by subtracting the
1629 * immediate from the first operand and then setting
1630 * the status flags.
1633 if (vie->op.op_byte == 0x80)
1634 size = 1;
1636 /* get the first operand */
1637 error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
1638 if (error)
1639 return (error);
1641 rflags2 = getcc(size, op1, vie->immediate);
1642 break;
1643 default:
1644 return (EINVAL);
1646 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1647 if (error)
1648 return (error);
1649 rflags &= ~RFLAGS_STATUS_BITS;
1650 rflags |= rflags2 & RFLAGS_STATUS_BITS;
1652 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1653 return (error);
1656 static int
1657 vie_emulate_test(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1659 int error, size;
1660 uint64_t op1, rflags, rflags2;
1662 size = vie->opsize;
1663 error = EINVAL;
1665 switch (vie->op.op_byte) {
1666 case 0xF6:
1668 * F6 /0 test r/m8, imm8
1670 * Test mem (ModRM:r/m) with immediate and set status
1671 * flags according to the results. The comparison is
1672  * performed by anding the immediate with the first
1673 * operand and then setting the status flags.
1675 if ((vie->reg & 7) != 0)
1676 return (EINVAL);
1678 size = 1; /* override for byte operation */
1680 error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
1681 if (error)
1682 return (error);
1684 rflags2 = getandflags(size, op1, vie->immediate);
1685 break;
1686 case 0xF7:
1688 * F7 /0 test r/m16, imm16
1689 * F7 /0 test r/m32, imm32
1690 * REX.W + F7 /0 test r/m64, imm32 sign-extended to 64
1692 * Test mem (ModRM:r/m) with immediate and set status
1693 * flags according to the results. The comparison is
1694  * performed by anding the immediate with the first
1695 * operand and then setting the status flags.
1697 if ((vie->reg & 7) != 0)
1698 return (EINVAL);
1700 error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
1701 if (error)
1702 return (error);
1704 rflags2 = getandflags(size, op1, vie->immediate);
1705 break;
1706 default:
1707 return (EINVAL);
1709 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1710 if (error)
1711 return (error);
1714 * OF and CF are cleared; the SF, ZF and PF flags are set according
1715 * to the result; AF is undefined.
1717 rflags &= ~RFLAGS_STATUS_BITS;
1718 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1720 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1721 return (error);
1724 static int
1725 vie_emulate_bextr(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1727 uint64_t src1, src2, dst, rflags;
1728 unsigned start, len, size;
1729 int error;
1730 struct vm_guest_paging *paging;
1732 size = vie->opsize;
1733 error = EINVAL;
1734 paging = &vie->paging;
1737 * VEX.LZ.0F38.W0 F7 /r BEXTR r32a, r/m32, r32b
1738 * VEX.LZ.0F38.W1 F7 /r BEXTR r64a, r/m64, r64b
1740 * Destination operand is ModRM:reg. Source operands are ModRM:r/m and
1741 * Vex.vvvv.
1743 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored).
1745 if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT)
1746 size = 4;
1749 * Extracts contiguous bits from the first /source/ operand (second
1750 * operand) using an index and length specified in the second /source/
1751 * operand (third operand).
1753 error = vie_mmio_read(vie, vm, vcpuid, gpa, &src1, size);
1754 if (error)
1755 return (error);
1756 error = vm_get_register(vm, vcpuid, gpr_map[vie->vex_reg], &src2);
1757 if (error)
1758 return (error);
1759 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1760 if (error)
1761 return (error);
1763 start = (src2 & 0xff);
1764 len = (src2 & 0xff00) >> 8;
1766 /* If no bits are extracted, the destination register is cleared. */
1767 dst = 0;
1769 /* If START exceeds the operand size, no bits are extracted. */
1770 if (start > size * 8)
1771 goto done;
1772 /* Length is bounded by both the destination size and start offset. */
1773 if (start + len > size * 8)
1774 len = (size * 8) - start;
1775 if (len == 0)
1776 goto done;
1778 if (start > 0)
1779 src1 = (src1 >> start);
1780 if (len < 64)
1781 src1 = src1 & ((1ull << len) - 1);
1782 dst = src1;
1784 done:
1785 error = vie_update_register(vm, vcpuid, gpr_map[vie->reg], dst, size);
1786 if (error)
1787 return (error);
1790 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result.
1791 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared.
1793 rflags &= ~RFLAGS_STATUS_BITS;
1794 if (dst == 0)
1795 rflags |= PSL_Z;
1796 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags,
1798 return (error);
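/*
 * Worked example of the extraction above: a control value (src2) of 0x0804
 * gives start = 4 and len = 8, so for src1 = 0x12345678 the destination
 * becomes (0x12345678 >> 4) & 0xff = 0x67, and PSL_Z remains clear because
 * the result is non-zero.
 */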
1801 static int
1802 vie_emulate_add(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1804 int error, size;
1805 uint64_t nval, rflags, rflags2, val1, val2;
1806 enum vm_reg_name reg;
1808 size = vie->opsize;
1809 error = EINVAL;
1811 switch (vie->op.op_byte) {
1812 case 0x03:
1814 * ADD r/m to r and store the result in r
1816 * 03/r ADD r16, r/m16
1817 * 03/r ADD r32, r/m32
1818 * REX.W + 03/r ADD r64, r/m64
1821 /* get the first operand */
1822 reg = gpr_map[vie->reg];
1823 error = vm_get_register(vm, vcpuid, reg, &val1);
1824 if (error)
1825 break;
1827 /* get the second operand */
1828 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1829 if (error)
1830 break;
1832 /* perform the operation and write the result */
1833 nval = val1 + val2;
1834 error = vie_update_register(vm, vcpuid, reg, nval, size);
1835 break;
1836 default:
1837 break;
1840 if (!error) {
1841 rflags2 = getaddflags(size, val1, val2);
1842 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1843 &rflags);
1844 if (error)
1845 return (error);
1847 rflags &= ~RFLAGS_STATUS_BITS;
1848 rflags |= rflags2 & RFLAGS_STATUS_BITS;
1849 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1850 rflags, 8);
1853 return (error);
1856 static int
1857 vie_emulate_sub(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1859 int error, size;
1860 uint64_t nval, rflags, rflags2, val1, val2;
1861 enum vm_reg_name reg;
1863 size = vie->opsize;
1864 error = EINVAL;
1866 switch (vie->op.op_byte) {
1867 case 0x2B:
1869 * SUB r/m from r and store the result in r
1871 * 2B/r SUB r16, r/m16
1872 * 2B/r SUB r32, r/m32
1873 * REX.W + 2B/r SUB r64, r/m64
1876 /* get the first operand */
1877 reg = gpr_map[vie->reg];
1878 error = vm_get_register(vm, vcpuid, reg, &val1);
1879 if (error)
1880 break;
1882 /* get the second operand */
1883 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1884 if (error)
1885 break;
1887 /* perform the operation and write the result */
1888 nval = val1 - val2;
1889 error = vie_update_register(vm, vcpuid, reg, nval, size);
1890 break;
1891 default:
1892 break;
1895 if (!error) {
1896 rflags2 = getcc(size, val1, val2);
1897 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1898 &rflags);
1899 if (error)
1900 return (error);
1902 rflags &= ~RFLAGS_STATUS_BITS;
1903 rflags |= rflags2 & RFLAGS_STATUS_BITS;
1904 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1905 rflags, 8);
1908 return (error);
1911 static int
1912 vie_emulate_mul(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1914 int error, size;
1915 uint64_t rflags, rflags2, val1, val2;
1916 __int128_t nval;
1917 enum vm_reg_name reg;
1918 ulong_t (*getflags)(int, uint64_t, uint64_t) = NULL;
1920 size = vie->opsize;
1921 error = EINVAL;
1923 switch (vie->op.op_byte) {
1924 case 0xAF:
1926 * Multiply the contents of a destination register by
1927 * the contents of a register or memory operand and
1928 * put the signed result in the destination register.
1930 * AF/r IMUL r16, r/m16
1931 * AF/r IMUL r32, r/m32
1932 * REX.W + AF/r IMUL r64, r/m64
1935 getflags = getimulflags;
1937 /* get the first operand */
1938 reg = gpr_map[vie->reg];
1939 error = vm_get_register(vm, vcpuid, reg, &val1);
1940 if (error != 0)
1941 break;
1943 /* get the second operand */
1944 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1945 if (error != 0)
1946 break;
1948 /* perform the operation and write the result */
1949 nval = (int64_t)val1 * (int64_t)val2;
1951 error = vie_update_register(vm, vcpuid, reg, nval, size);
1953 DTRACE_PROBE4(vie__imul,
1954 const char *, vie_regnum_name(vie->reg, size),
1955 uint64_t, val1, uint64_t, val2, __uint128_t, nval);
1957 break;
1958 default:
1959 break;
1962 if (error == 0) {
1963 rflags2 = getflags(size, val1, val2);
1964 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1965 &rflags);
1966 if (error)
1967 return (error);
1969 rflags &= ~RFLAGS_STATUS_BITS;
1970 rflags |= rflags2 & RFLAGS_STATUS_BITS;
1971 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1972 rflags, 8);
1974 DTRACE_PROBE2(vie__imul__rflags,
1975 uint64_t, rflags, uint64_t, rflags2);
1978 return (error);
1981 static int
1982 vie_emulate_stack_op(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1984 struct vm_copyinfo copyinfo[2];
1985 struct seg_desc ss_desc;
1986 uint64_t cr0, rflags, rsp, stack_gla, val;
1987 int error, fault, size, stackaddrsize, pushop;
1988 struct vm_guest_paging *paging;
1990 val = 0;
1991 size = vie->opsize;
1992 pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
1993 paging = &vie->paging;
1996  * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
1998 if (paging->cpu_mode == CPU_MODE_REAL) {
1999 stackaddrsize = 2;
2000 } else if (paging->cpu_mode == CPU_MODE_64BIT) {
2002 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
2003 * - Stack pointer size is always 64-bits.
2004 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
2005 * - 16-bit PUSH/POP is supported by using the operand size
2006 * override prefix (66H).
2008 stackaddrsize = 8;
2009 size = vie->opsize_override ? 2 : 8;
2010 } else {
2012 * In protected or compatibility mode the 'B' flag in the
2013 * stack-segment descriptor determines the size of the
2014 * stack pointer.
2016 error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
2017 KASSERT(error == 0, ("%s: error %d getting SS descriptor",
2018 __func__, error));
2019 if (SEG_DESC_DEF32(ss_desc.access))
2020 stackaddrsize = 4;
2021 else
2022 stackaddrsize = 2;
2025 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
2026 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
2028 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
2029 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
2031 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
2032 KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
2033 if (pushop) {
2034 rsp -= size;
2037 if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
2038 rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
2039 &stack_gla)) {
2040 vm_inject_ss(vm, vcpuid, 0);
2041 return (0);
2044 if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
2045 vm_inject_ss(vm, vcpuid, 0);
2046 return (0);
2049 if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
2050 vm_inject_ac(vm, vcpuid, 0);
2051 return (0);
2054 error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
2055 pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
2056 &fault);
2057 if (error || fault)
2058 return (error);
2060 if (pushop) {
2061 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
2062 if (error == 0)
2063 vm_copyout(vm, vcpuid, &val, copyinfo, size);
2064 } else {
2065 vm_copyin(vm, vcpuid, copyinfo, &val, size);
2066 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
2067 rsp += size;
2069 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
2071 if (error == 0) {
2072 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
2073 stackaddrsize);
2074 KASSERT(error == 0, ("error %d updating rsp", error));
2076 return (error);
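/*
 * For example, a 64-bit guest executing a "push" whose memory source lies in
 * MMIO space (FF /6) reaches this point with size = 8 and stackaddrsize = 8:
 * %rsp is decremented by 8, the value read from 'gpa' is copied out to the
 * new top of stack, and the adjusted %rsp is committed only if every step
 * succeeded.  With a 66H operand-size prefix the same sequence moves 2 bytes.
 */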
2079 static int
2080 vie_emulate_push(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
2082 int error;
2085 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
2087 * PUSH is part of the group 5 extended opcodes and is identified
2088 * by ModRM:reg = b110.
2090 if ((vie->reg & 7) != 6)
2091 return (EINVAL);
2093 error = vie_emulate_stack_op(vie, vm, vcpuid, gpa);
2094 return (error);
2097 static int
2098 vie_emulate_pop(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
2100 int error;
2103 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
2105 * POP is part of the group 1A extended opcodes and is identified
2106 * by ModRM:reg = b000.
2108 if ((vie->reg & 7) != 0)
2109 return (EINVAL);
2111 error = vie_emulate_stack_op(vie, vm, vcpuid, gpa);
2112 return (error);
2115 static int
2116 vie_emulate_group1(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
2118 int error;
2120 switch (vie->reg & 7) {
2121 case 0x1: /* OR */
2122 error = vie_emulate_or(vie, vm, vcpuid, gpa);
2123 break;
2124 case 0x4: /* AND */
2125 error = vie_emulate_and(vie, vm, vcpuid, gpa);
2126 break;
2127 case 0x7: /* CMP */
2128 error = vie_emulate_cmp(vie, vm, vcpuid, gpa);
2129 break;
2130 default:
2131 error = EINVAL;
2132 break;
2135 return (error);
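/*
 * Example: "andl $0x10, (mem)" encodes as 83 /4, so it decodes to
 * VIE_OP_TYPE_GROUP1 and (vie->reg & 7) == 4 dispatches it to
 * vie_emulate_and(); /1 and /7 select OR and CMP in the same way.
 */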
2138 static int
2139 vie_emulate_bittest(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
2141 uint64_t val, rflags;
2142 int error, bitmask, bitoff;
2145 * 0F BA is a Group 8 extended opcode.
2147 * Currently we only emulate the 'Bit Test' instruction which is
2148 * identified by a ModR/M:reg encoding of 100b.
2150 if ((vie->reg & 7) != 4)
2151 return (EINVAL);
2153 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
2154 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
2156 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, vie->opsize);
2157 if (error)
2158 return (error);
2161 * Intel SDM, Vol 2, Table 3-2:
2162 * "Range of Bit Positions Specified by Bit Offset Operands"
2164 bitmask = vie->opsize * 8 - 1;
2165 bitoff = vie->immediate & bitmask;
2167 /* Copy the bit into the Carry flag in %rflags */
2168 if (val & (1UL << bitoff))
2169 rflags |= PSL_C;
2170 else
2171 rflags &= ~PSL_C;
2173 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
2174 KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
2176 return (0);
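/*
 * Example of the offset masking above: with a 32-bit operand, bitmask is 31,
 * so "bt $35, (mem)" is treated as bit offset 35 & 31 = 3 and only bit 3 of
 * the value read from 'gpa' is copied into PSL_C.
 */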
2179 static int
2180 vie_emulate_twob_group15(struct vie *vie, struct vm *vm, int vcpuid,
2181 uint64_t gpa)
2183 int error;
2184 uint64_t buf;
2186 switch (vie->reg & 7) {
2187 case 0x7: /* CLFLUSH, CLFLUSHOPT, and SFENCE */
2188 if (vie->mod == 0x3) {
2190 * SFENCE. Ignore it, VM exit provides enough
2191 * barriers on its own.
2193 error = 0;
2194 } else {
2196 * CLFLUSH, CLFLUSHOPT. Only check for access
2197 * rights.
2199 error = vie_mmio_read(vie, vm, vcpuid, gpa, &buf, 1);
2201 break;
2202 default:
2203 error = EINVAL;
2204 break;
2207 return (error);
2210 static int
2211 vie_emulate_clts(struct vie *vie, struct vm *vm, int vcpuid)
2213 uint64_t val;
2214 int error __maybe_unused;
2216 if (vie->paging.cpl != 0) {
2217 vm_inject_gp(vm, vcpuid);
2218 vie->num_processed = 0;
2219 return (0);
2222 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &val);
2223 ASSERT(error == 0);
2225 /* Clear %cr0.TS */
2226 val &= ~CR0_TS;
2228 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, val);
2229 ASSERT(error == 0);
2231 return (0);
2234 static int
2235 vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa,
2236 uint64_t *rval, int bytes)
2238 int err;
2240 if (vie->mmio_req_read.state == VR_DONE) {
2241 ASSERT(vie->mmio_req_read.bytes == bytes);
2242 ASSERT(vie->mmio_req_read.gpa == gpa);
2244 *rval = vie->mmio_req_read.data;
2245 return (0);
2248 err = vm_service_mmio_read(vm, cpuid, gpa, rval, bytes);
2249 if (err == 0) {
2251 * A successful read from an in-kernel-emulated device may come
2252 * with side effects, so stash the result in case it's used for
2253 * an instruction which subsequently needs to issue an MMIO
2254 * write to userspace.
2256 ASSERT(vie->mmio_req_read.state == VR_NONE);
2258 vie->mmio_req_read.bytes = bytes;
2259 vie->mmio_req_read.gpa = gpa;
2260 vie->mmio_req_read.data = *rval;
2261 vie->mmio_req_read.state = VR_DONE;
2263 } else if (err == ESRCH) {
2264 /* Hope that userspace emulation can fulfill this read */
2265 vie->mmio_req_read.bytes = bytes;
2266 vie->mmio_req_read.gpa = gpa;
2267 vie->mmio_req_read.state = VR_PENDING;
2268 vie->status |= VIES_PENDING_MMIO;
2269 } else if (err < 0) {
2271 * The MMIO read failed in such a way that fallback to handling
2272 * in userspace is required.
2274 vie->status |= VIES_USER_FALLBACK;
2276 return (err);
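/*
 * Perform an MMIO write on behalf of the emulated instruction, tracking
 * completed and pending state just as vie_mmio_read() does for reads.
 */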
2279 static int
2280 vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa,
2281 uint64_t wval, int bytes)
2283 int err;
2285 if (vie->mmio_req_write.state == VR_DONE) {
2286 ASSERT(vie->mmio_req_write.bytes == bytes);
2287 ASSERT(vie->mmio_req_write.gpa == gpa);
2289 return (0);
2292 err = vm_service_mmio_write(vm, cpuid, gpa, wval, bytes);
2293 if (err == 0) {
2295 * A successful write to an in-kernel-emulated device probably
2296 * results in side effects, so stash the fact that such a write
2297 * succeeded in case the operation requires other work.
2299 vie->mmio_req_write.bytes = bytes;
2300 vie->mmio_req_write.gpa = gpa;
2301 vie->mmio_req_write.data = wval;
2302 vie->mmio_req_write.state = VR_DONE;
2303 } else if (err == ESRCH) {
2304 /* Hope that userspace emulation can fulfill this write */
2305 vie->mmio_req_write.bytes = bytes;
2306 vie->mmio_req_write.gpa = gpa;
2307 vie->mmio_req_write.data = wval;
2308 vie->mmio_req_write.state = VR_PENDING;
2309 vie->status |= VIES_PENDING_MMIO;
2310 } else if (err < 0) {
2312 * The MMIO write failed in such a way that fallback to handling
2313 * in userspace is required.
2315 vie->status |= VIES_USER_FALLBACK;
2317 return (err);
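/*
 * Emulate a decoded instruction which faulted on MMIO, dispatching on its
 * operation type.  Returns -1 when the request must be completed in userspace.
 */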
2321 vie_emulate_mmio(struct vie *vie, struct vm *vm, int vcpuid)
2323 int error;
2324 uint64_t gpa;
2326 if ((vie->status & (VIES_INST_DECODE | VIES_MMIO)) !=
2327 (VIES_INST_DECODE | VIES_MMIO)) {
2328 return (EINVAL);
2331 gpa = vie->mmio_gpa;
2333 switch (vie->op.op_type) {
2334 case VIE_OP_TYPE_GROUP1:
2335 error = vie_emulate_group1(vie, vm, vcpuid, gpa);
2336 break;
2337 case VIE_OP_TYPE_POP:
2338 error = vie_emulate_pop(vie, vm, vcpuid, gpa);
2339 break;
2340 case VIE_OP_TYPE_PUSH:
2341 error = vie_emulate_push(vie, vm, vcpuid, gpa);
2342 break;
2343 case VIE_OP_TYPE_CMP:
2344 error = vie_emulate_cmp(vie, vm, vcpuid, gpa);
2345 break;
2346 case VIE_OP_TYPE_MOV:
2347 error = vie_emulate_mov(vie, vm, vcpuid, gpa);
2348 break;
2349 case VIE_OP_TYPE_MOVSX:
2350 case VIE_OP_TYPE_MOVZX:
2351 error = vie_emulate_movx(vie, vm, vcpuid, gpa);
2352 break;
2353 case VIE_OP_TYPE_MOVS:
2354 error = vie_emulate_movs(vie, vm, vcpuid, gpa);
2355 break;
2356 case VIE_OP_TYPE_STOS:
2357 error = vie_emulate_stos(vie, vm, vcpuid, gpa);
2358 break;
2359 case VIE_OP_TYPE_AND:
2360 error = vie_emulate_and(vie, vm, vcpuid, gpa);
2361 break;
2362 case VIE_OP_TYPE_OR:
2363 error = vie_emulate_or(vie, vm, vcpuid, gpa);
2364 break;
2365 case VIE_OP_TYPE_SUB:
2366 error = vie_emulate_sub(vie, vm, vcpuid, gpa);
2367 break;
2368 case VIE_OP_TYPE_BITTEST:
2369 error = vie_emulate_bittest(vie, vm, vcpuid, gpa);
2370 break;
2371 case VIE_OP_TYPE_TWOB_GRP15:
2372 error = vie_emulate_twob_group15(vie, vm, vcpuid, gpa);
2373 break;
2374 case VIE_OP_TYPE_ADD:
2375 error = vie_emulate_add(vie, vm, vcpuid, gpa);
2376 break;
2377 case VIE_OP_TYPE_TEST:
2378 error = vie_emulate_test(vie, vm, vcpuid, gpa);
2379 break;
2380 case VIE_OP_TYPE_BEXTR:
2381 error = vie_emulate_bextr(vie, vm, vcpuid, gpa);
2382 break;
2383 case VIE_OP_TYPE_MUL:
2384 error = vie_emulate_mul(vie, vm, vcpuid, gpa);
2385 break;
2386 default:
2387 error = EINVAL;
2388 break;
2391 if (error == ESRCH) {
2392 /* Return to userspace with the mmio request */
2393 return (-1);
2396 return (error);
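/*
 * Emulate a single port access: either perform it against the in-kernel
 * handlers or consume a result already fulfilled by userspace, marking the
 * access as pending if neither is possible.
 */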
2399 static int
2400 vie_emulate_inout_port(struct vie *vie, struct vm *vm, int vcpuid,
2401 uint32_t *eax)
2403 uint32_t mask, val;
2404 bool in;
2405 int err;
2407 mask = vie_size2mask(vie->inout.bytes);
2408 in = (vie->inout.flags & INOUT_IN) != 0;
2410 if (!in) {
2411 val = *eax & mask;
2414 if (vie->inout_req_state != VR_DONE) {
2415 err = vm_ioport_access(vm, vcpuid, in, vie->inout.port,
2416 vie->inout.bytes, &val);
2417 val &= mask;
2418 } else {
2420 * This port access was handled in userspace and the result was
2421 * injected back in to be processed now.
2423 val = vie->inout_req_val & mask;
2424 vie->inout_req_state = VR_NONE;
2425 err = 0;
2428 if (err == ESRCH) {
2429 vie->status |= VIES_PENDING_INOUT;
2430 vie->inout_req_state = VR_PENDING;
2431 return (err);
2432 } else if (err != 0) {
2433 return (err);
2436 if (in) {
2437 *eax = (*eax & ~mask) | val;
2439 return (0);
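/* Map the decoded segment index of an in/out instruction to its register. */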
2442 static enum vm_reg_name
2443 vie_inout_segname(const struct vie *vie)
2445 uint8_t segidx = vie->inout.segment;
2446 const enum vm_reg_name segmap[] = {
2447 VM_REG_GUEST_ES,
2448 VM_REG_GUEST_CS,
2449 VM_REG_GUEST_SS,
2450 VM_REG_GUEST_DS,
2451 VM_REG_GUEST_FS,
2452 VM_REG_GUEST_GS,
2454 const uint8_t maxidx = (sizeof (segmap) / sizeof (segmap[0]));
2456 if (segidx >= maxidx) {
2457 panic("unexpected segment index %u", segidx);
2459 return (segmap[segidx]);
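/*
 * Emulate string in/out (INS/OUTS), including REP forms, moving one element
 * per pass between the guest memory operand and the emulated port.
 */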
2462 static int
2463 vie_emulate_inout_str(struct vie *vie, struct vm *vm, int vcpuid)
2465 uint8_t bytes, addrsize;
2466 uint64_t index, count = 0, gla, rflags;
2467 int prot, err, fault;
2468 bool in, repeat;
2469 enum vm_reg_name seg_reg, idx_reg;
2470 struct vm_copyinfo copyinfo[2];
2472 in = (vie->inout.flags & INOUT_IN) != 0;
2473 bytes = vie->inout.bytes;
2474 addrsize = vie->inout.addrsize;
2475 prot = in ? PROT_WRITE : PROT_READ;
2477 ASSERT(bytes == 1 || bytes == 2 || bytes == 4);
2478 ASSERT(addrsize == 2 || addrsize == 4 || addrsize == 8);
2480 idx_reg = (in) ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
2481 seg_reg = vie_inout_segname(vie);
2482 err = vm_get_register(vm, vcpuid, idx_reg, &index);
2483 ASSERT(err == 0);
2484 index = index & vie_size2mask(addrsize);
2486 repeat = (vie->inout.flags & INOUT_REP) != 0;
2488 /* Count register */
2489 if (repeat) {
2490 err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &count);
2491 count &= vie_size2mask(addrsize);
2493 if (count == 0) {
2495 * If we were asked to emulate a REP INS/OUTS when the
2496 * count register is zero, no further work is required.
2498 return (0);
2500 } else {
2501 count = 1;
2504 gla = 0;
2505 if (vie_get_gla(vie, vm, vcpuid, bytes, addrsize, prot, seg_reg,
2506 idx_reg, &gla) != 0) {
2507 /* vie_get_gla() already injected the appropriate fault */
2508 return (0);
2512 * The INS/OUTS emulation currently assumes that the memory target resides
2513 * within the guest system memory, rather than a device MMIO region. If
2514 * such a case becomes a necessity, that additional handling could be
2515 * put in place.
2517 err = vm_copy_setup(vm, vcpuid, &vie->paging, gla, bytes, prot,
2518 copyinfo, nitems(copyinfo), &fault);
2520 if (err) {
2521 /* Unrecoverable error */
2522 return (err);
2523 } else if (fault) {
2524 /* Resume guest to handle fault */
2525 return (0);
2528 if (!in) {
2529 vm_copyin(vm, vcpuid, copyinfo, &vie->inout.eax, bytes);
2532 err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax);
2534 if (err == 0 && in) {
2535 vm_copyout(vm, vcpuid, &vie->inout.eax, copyinfo, bytes);
2538 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
2540 if (err == 0) {
2541 err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
2542 &rflags);
2543 ASSERT(err == 0);
2545 /* Update index */
2546 if (rflags & PSL_D) {
2547 index -= bytes;
2548 } else {
2549 index += bytes;
2552 /* Update index register */
2553 err = vie_update_register(vm, vcpuid, idx_reg, index, addrsize);
2554 ASSERT(err == 0);
2557 * Update count register only if the instruction had a repeat
2558 * prefix.
2560 if ((vie->inout.flags & INOUT_REP) != 0) {
2561 count--;
2562 err = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
2563 count, addrsize);
2564 ASSERT(err == 0);
2566 if (count != 0) {
2567 return (vie_repeat(vie));
2572 return (err);
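/*
 * Top-level in/out emulation: plain port accesses are handled directly while
 * string variants are passed to vie_emulate_inout_str().
 */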
2576 vie_emulate_inout(struct vie *vie, struct vm *vm, int vcpuid)
2578 int err = 0;
2580 if ((vie->status & VIES_INOUT) == 0) {
2581 return (EINVAL);
2584 if ((vie->inout.flags & INOUT_STR) == 0) {
2586 * For now, using the 'rep' prefixes with plain (non-string)
2587 * in/out is not supported.
2589 if ((vie->inout.flags & INOUT_REP) != 0) {
2590 return (EINVAL);
2593 err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax);
2594 if (err == 0 && (vie->inout.flags & INOUT_IN) != 0) {
2596 * Now that the inX access has succeeded, the result needs
2597 * to be stored in the guest %rax.
2599 err = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
2600 vie->inout.eax);
2601 VERIFY0(err);
2603 } else {
2604 vie->status &= ~VIES_REPEAT;
2605 err = vie_emulate_inout_str(vie, vm, vcpuid);
2608 if (err < 0) {
2610 * Access to an I/O port failed in such a way that fallback to
2611 * handling in userspace is required.
2613 vie->status |= VIES_USER_FALLBACK;
2614 } else if (err == ESRCH) {
2615 ASSERT(vie->status & VIES_PENDING_INOUT);
2616 /* Return to userspace with the in/out request */
2617 err = -1;
2620 return (err);
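/*
 * Emulate the "other" instruction class (CLTS, MOV to/from %cr) which
 * involves neither MMIO nor port I/O.
 */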
2624 vie_emulate_other(struct vie *vie, struct vm *vm, int vcpuid)
2626 int error;
2628 if ((vie->status & (VIES_INST_DECODE | VIES_OTHER)) !=
2629 (VIES_INST_DECODE | VIES_OTHER)) {
2630 return (EINVAL);
2633 switch (vie->op.op_type) {
2634 case VIE_OP_TYPE_CLTS:
2635 error = vie_emulate_clts(vie, vm, vcpuid);
2636 break;
2637 case VIE_OP_TYPE_MOV_CR:
2638 error = vie_emulate_mov_cr(vie, vm, vcpuid);
2639 break;
2640 default:
2641 error = EINVAL;
2642 break;
2645 return (error);
2648 void
2649 vie_reset(struct vie *vie)
2651 vie->status = 0;
2652 vie->num_processed = vie->num_valid = 0;
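/* Advance %rip past the emulated instruction and reset the decoder state. */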
2655 void
2656 vie_advance_pc(struct vie *vie, uint64_t *nextrip)
2658 VERIFY((vie->status & VIES_REPEAT) == 0);
2660 *nextrip += vie->num_processed;
2661 vie_reset(vie);
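/*
 * Populate a vm_exit with the details of the pending MMIO or in/out request
 * (or the decoded-instruction fallback) so userspace can complete it.
 */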
2664 void
2665 vie_exitinfo(const struct vie *vie, struct vm_exit *vme)
2667 if (vie->status & VIES_USER_FALLBACK) {
2669 * Despite the fact that the instruction was successfully
2670 * decoded, some aspect of the emulation failed in such a way
2671 * that it is left up to userspace to complete the operation.
2673 vie_fallback_exitinfo(vie, vme);
2674 } else if (vie->status & VIES_MMIO) {
2675 vme->exitcode = VM_EXITCODE_MMIO;
2676 if (vie->mmio_req_read.state == VR_PENDING) {
2677 vme->u.mmio.gpa = vie->mmio_req_read.gpa;
2678 vme->u.mmio.data = 0;
2679 vme->u.mmio.bytes = vie->mmio_req_read.bytes;
2680 vme->u.mmio.read = 1;
2681 } else if (vie->mmio_req_write.state == VR_PENDING) {
2682 vme->u.mmio.gpa = vie->mmio_req_write.gpa;
2683 vme->u.mmio.data = vie->mmio_req_write.data &
2684 vie_size2mask(vie->mmio_req_write.bytes);
2685 vme->u.mmio.bytes = vie->mmio_req_write.bytes;
2686 vme->u.mmio.read = 0;
2687 } else {
2688 panic("bad pending MMIO state");
2690 } else if (vie->status & VIES_INOUT) {
2691 vme->exitcode = VM_EXITCODE_INOUT;
2692 vme->u.inout.port = vie->inout.port;
2693 vme->u.inout.bytes = vie->inout.bytes;
2694 if ((vie->inout.flags & INOUT_IN) != 0) {
2695 vme->u.inout.flags = INOUT_IN;
2696 vme->u.inout.eax = 0;
2697 } else {
2698 vme->u.inout.flags = 0;
2699 vme->u.inout.eax = vie->inout.eax &
2700 vie_size2mask(vie->inout.bytes);
2702 } else {
2703 panic("no pending operation");
2708 * In the case of a decoding or verification failure, bailing out to userspace
2709 * to do the instruction emulation is our only option for now.
2711 void
2712 vie_fallback_exitinfo(const struct vie *vie, struct vm_exit *vme)
2714 if ((vie->status & VIES_INST_FETCH) == 0) {
2715 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
2716 } else {
2717 ASSERT(sizeof (vie->inst) == sizeof (vme->u.inst_emul.inst));
2719 bcopy(vie->inst, vme->u.inst_emul.inst, sizeof (vie->inst));
2720 vme->u.inst_emul.num_valid = vie->num_valid;
2722 vme->exitcode = VM_EXITCODE_INST_EMUL;
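/*
 * Report the guest %cs base and default operand-size attribute ('D' bit)
 * needed for instruction decode.
 */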
2725 void
2726 vie_cs_info(const struct vie *vie, struct vm *vm, int vcpuid, uint64_t *cs_base,
2727 int *cs_d)
2729 struct seg_desc cs_desc;
2730 int error __maybe_unused;
2732 error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &cs_desc);
2733 ASSERT(error == 0);
2735 /* Initialization required for the paging info to be populated */
2736 VERIFY(vie->status & VIES_INIT);
2737 switch (vie->paging.cpu_mode) {
2738 case CPU_MODE_REAL:
2739 *cs_base = cs_desc.base;
2740 *cs_d = 0;
2741 break;
2742 case CPU_MODE_PROTECTED:
2743 case CPU_MODE_COMPATIBILITY:
2744 *cs_base = cs_desc.base;
2745 *cs_d = SEG_DESC_DEF32(cs_desc.access) ? 1 : 0;
2746 break;
2747 default:
2748 *cs_base = 0;
2749 *cs_d = 0;
2750 break;
2754 bool
2755 vie_pending(const struct vie *vie)
2758 * These VIE status bits indicate conditions which must be addressed
2759 * through either device IO fulfillment (with corresponding
2760 * vie_fulfill_*()) or complete userspace emulation (followed by a
2761 * vie_reset()).
2763 const enum vie_status of_interest =
2764 VIES_PENDING_MMIO | VIES_PENDING_INOUT | VIES_USER_FALLBACK;
2766 return ((vie->status & of_interest) != 0);
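/* Do the instruction bytes still need to be fetched from the guest? */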
2769 bool
2770 vie_needs_fetch(const struct vie *vie)
2772 if (vie->status & VIES_INST_FETCH) {
2773 ASSERT(vie->num_valid != 0);
2774 return (false);
2776 return (true);
2779 static int
2780 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
2782 KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
2783 ("%s: invalid size %d", __func__, size));
2784 KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
2786 if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
2787 return (0);
2789 return ((gla & (size - 1)) ? 1 : 0);
2792 static int
2793 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
2795 uint64_t mask;
2797 if (cpu_mode != CPU_MODE_64BIT)
2798 return (0);
2801 * The value of bit 47 in the 'gla' should be replicated in the
2802 * most significant 16 bits.
2804 mask = ~((1UL << 48) - 1);
2805 if (gla & (1UL << 47))
2806 return ((gla & mask) != mask);
2807 else
2808 return ((gla & mask) != 0);
2811 static uint64_t
2812 vie_size2mask(int size)
2814 KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
2815 ("vie_size2mask: invalid size %d", size));
2816 return (size2mask[size]);
2819 static int
2820 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
2821 struct seg_desc *desc, uint64_t offset, int length, int addrsize,
2822 int prot, uint64_t *gla)
2824 uint64_t firstoff, low_limit, high_limit, segbase;
2825 int glasize, type;
2827 KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
2828 ("%s: invalid segment %d", __func__, seg));
2829 KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
2830 ("%s: invalid operand size %d", __func__, length));
2831 KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
2832 ("%s: invalid prot %x", __func__, prot));
2834 firstoff = offset;
2835 if (cpu_mode == CPU_MODE_64BIT) {
2836 KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
2837 "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
2838 glasize = 8;
2839 } else {
2840 KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
2841 "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
2842 glasize = 4;
2844 * If the segment selector is loaded with a NULL selector
2845 * then the descriptor is unusable and attempting to use
2846 * it results in a #GP(0).
2848 if (SEG_DESC_UNUSABLE(desc->access))
2849 return (-1);
2852 * The processor generates a #NP exception when a segment
2853 * register is loaded with a selector that points to a
2854 * descriptor that is not present. If this was the case then
2855 * it would have been checked before the VM-exit.
2857 KASSERT(SEG_DESC_PRESENT(desc->access),
2858 ("segment %d not present: %x", seg, desc->access));
2861 * The descriptor type must indicate a code/data segment.
2863 type = SEG_DESC_TYPE(desc->access);
2864 KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
2865 "descriptor type %x", seg, type));
2867 if (prot & PROT_READ) {
2868 /* #GP on a read access to an exec-only code segment */
2869 if ((type & 0xA) == 0x8)
2870 return (-1);
2873 if (prot & PROT_WRITE) {
2875 * #GP on a write access to a code segment or a
2876 * read-only data segment.
2878 if (type & 0x8) /* code segment */
2879 return (-1);
2881 if ((type & 0xA) == 0) /* read-only data seg */
2882 return (-1);
2886 * 'desc->limit' is fully expanded taking granularity into
2887 * account.
2889 if ((type & 0xC) == 0x4) {
2890 /* expand-down data segment */
2891 low_limit = desc->limit + 1;
2892 high_limit = SEG_DESC_DEF32(desc->access) ?
2893 0xffffffff : 0xffff;
2894 } else {
2895 /* code segment or expand-up data segment */
2896 low_limit = 0;
2897 high_limit = desc->limit;
2900 while (length > 0) {
2901 offset &= vie_size2mask(addrsize);
2902 if (offset < low_limit || offset > high_limit)
2903 return (-1);
2904 offset++;
2905 length--;
2910 * In 64-bit mode all segments except %fs and %gs have a segment
2911 * base address of 0.
2913 if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
2914 seg != VM_REG_GUEST_GS) {
2915 segbase = 0;
2916 } else {
2917 segbase = desc->base;
2921 * Truncate 'firstoff' to the effective address size before adding
2922 * it to the segment base.
2924 firstoff &= vie_size2mask(addrsize);
2925 *gla = (segbase + firstoff) & vie_size2mask(glasize);
2926 return (0);
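/*
 * Initialize emulation state for an instruction which faulted on MMIO access
 * to 'gpa', optionally seeded with instruction bytes already fetched.
 */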
2929 void
2930 vie_init_mmio(struct vie *vie, const char *inst_bytes, uint8_t inst_length,
2931 const struct vm_guest_paging *paging, uint64_t gpa)
2933 KASSERT(inst_length <= VIE_INST_SIZE,
2934 ("%s: invalid instruction length (%d)", __func__, inst_length));
2936 bzero(vie, sizeof (struct vie));
2938 vie->base_register = VM_REG_LAST;
2939 vie->index_register = VM_REG_LAST;
2940 vie->segment_register = VM_REG_LAST;
2941 vie->status = VIES_INIT | VIES_MMIO;
2943 if (inst_length != 0) {
2944 bcopy(inst_bytes, vie->inst, inst_length);
2945 vie->num_valid = inst_length;
2946 vie->status |= VIES_INST_FETCH;
2949 vie->paging = *paging;
2950 vie->mmio_gpa = gpa;
2953 void
2954 vie_init_inout(struct vie *vie, const struct vm_inout *inout, uint8_t inst_len,
2955 const struct vm_guest_paging *paging)
2957 bzero(vie, sizeof (struct vie));
2959 vie->status = VIES_INIT | VIES_INOUT;
2961 vie->inout = *inout;
2962 vie->paging = *paging;
2965 * Since VMX/SVM assists already decoded the nature of the in/out
2966 * instruction, let the status reflect that.
2968 vie->status |= VIES_INST_FETCH | VIES_INST_DECODE;
2969 vie->num_processed = inst_len;
2972 void
2973 vie_init_other(struct vie *vie, const struct vm_guest_paging *paging)
2975 bzero(vie, sizeof (struct vie));
2977 vie->base_register = VM_REG_LAST;
2978 vie->index_register = VM_REG_LAST;
2979 vie->segment_register = VM_REG_LAST;
2980 vie->status = VIES_INIT | VIES_OTHER;
2982 vie->paging = *paging;
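/* Accept the result of an MMIO request which was handed off to userspace. */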
2986 vie_fulfill_mmio(struct vie *vie, const struct vm_mmio *result)
2988 struct vie_mmio *pending;
2990 if ((vie->status & VIES_MMIO) == 0 ||
2991 (vie->status & VIES_PENDING_MMIO) == 0) {
2992 return (EINVAL);
2995 if (result->read) {
2996 pending = &vie->mmio_req_read;
2997 } else {
2998 pending = &vie->mmio_req_write;
3001 if (pending->state != VR_PENDING ||
3002 pending->bytes != result->bytes || pending->gpa != result->gpa) {
3003 return (EINVAL);
3006 if (result->read) {
3007 pending->data = result->data & vie_size2mask(pending->bytes);
3009 pending->state = VR_DONE;
3010 vie->status &= ~VIES_PENDING_MMIO;
3012 return (0);
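/* Accept the result of an in/out request which was handed off to userspace. */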
3016 vie_fulfill_inout(struct vie *vie, const struct vm_inout *result)
3018 if ((vie->status & VIES_INOUT) == 0 ||
3019 (vie->status & VIES_PENDING_INOUT) == 0) {
3020 return (EINVAL);
3022 if ((vie->inout.flags & INOUT_IN) != (result->flags & INOUT_IN) ||
3023 vie->inout.bytes != result->bytes ||
3024 vie->inout.port != result->port) {
3025 return (EINVAL);
3028 if (result->flags & INOUT_IN) {
3029 vie->inout_req_val = result->eax &
3030 vie_size2mask(vie->inout.bytes);
3032 vie->inout_req_state = VR_DONE;
3033 vie->status &= ~(VIES_PENDING_INOUT);
3035 return (0);
3038 uint64_t
3039 vie_mmio_gpa(const struct vie *vie)
3041 return (vie->mmio_gpa);
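/* Build a #PF error code from the access attributes and the offending PTE. */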
3044 static int
3045 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
3047 int error_code = 0;
3049 if (pte & PG_V)
3050 error_code |= PGEX_P;
3051 if (prot & PROT_WRITE)
3052 error_code |= PGEX_W;
3053 if (usermode)
3054 error_code |= PGEX_U;
3055 if (rsvd)
3056 error_code |= PGEX_RSV;
3057 if (prot & PROT_EXEC)
3058 error_code |= PGEX_I;
3060 return (error_code);
3063 static void
3064 ptp_release(vm_page_t **vmp)
3066 if (*vmp != NULL) {
3067 (void) vmp_release(*vmp);
3068 *vmp = NULL;
3072 static void *
3073 ptp_hold(struct vm *vm, int vcpu, uintptr_t gpa, size_t len, vm_page_t **vmp)
3075 vm_client_t *vmc = vm_get_vmclient(vm, vcpu);
3076 const uintptr_t hold_gpa = gpa & PAGEMASK;
3078 /* Hold must not cross a page boundary */
3079 VERIFY3U(gpa + len, <=, hold_gpa + PAGESIZE);
3081 if (*vmp != NULL) {
3082 (void) vmp_release(*vmp);
3085 *vmp = vmc_hold(vmc, hold_gpa, PROT_READ | PROT_WRITE);
3086 if (*vmp == NULL) {
3087 return (NULL);
3090 return ((caddr_t)vmp_get_writable(*vmp) + (gpa - hold_gpa));
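/*
 * Walk the guest page tables to translate a guest linear address into a
 * guest physical address.  Unless 'check_only' is set, accessed/dirty bits
 * are updated and any faults are injected into the guest.
 */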
3093 static int
3094 _vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3095 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
3097 int nlevels, pfcode;
3098 int ptpshift = 0, ptpindex = 0;
3099 uint64_t ptpphys;
3100 uint64_t *ptpbase = NULL, pte = 0, pgsize = 0;
3101 vm_page_t *cookie = NULL;
3102 const bool usermode = paging->cpl == 3;
3103 const bool writable = (prot & PROT_WRITE) != 0;
3105 *guest_fault = 0;
3106 restart:
3107 ptpphys = paging->cr3; /* root of the page tables */
3108 ptp_release(&cookie);
3110 if (vie_canonical_check(paging->cpu_mode, gla)) {
3112 * XXX assuming a non-stack reference; otherwise a stack fault
3113 * should be generated.
3115 if (!check_only)
3116 vm_inject_gp(vm, vcpuid);
3117 *guest_fault = 1;
3118 return (0);
3121 if (paging->paging_mode == PAGING_MODE_FLAT) {
3122 *gpa = gla;
3123 return (0);
3126 if (paging->paging_mode == PAGING_MODE_32) {
3127 uint32_t *ptpbase32, pte32;
3129 nlevels = 2;
3130 while (--nlevels >= 0) {
3131 /* Zero out the lower 12 bits. */
3132 ptpphys &= ~0xfff;
3134 ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
3135 &cookie);
3137 if (ptpbase32 == NULL) {
3138 return (EFAULT);
3141 ptpshift = PAGE_SHIFT + nlevels * 10;
3142 ptpindex = (gla >> ptpshift) & 0x3FF;
3143 pgsize = 1UL << ptpshift;
3145 pte32 = ptpbase32[ptpindex];
3147 if ((pte32 & PG_V) == 0 ||
3148 (usermode && (pte32 & PG_U) == 0) ||
3149 (writable && (pte32 & PG_RW) == 0)) {
3150 if (!check_only) {
3151 pfcode = pf_error_code(usermode, prot,
3152 0, pte32);
3153 vm_inject_pf(vm, vcpuid, pfcode, gla);
3156 ptp_release(&cookie);
3157 *guest_fault = 1;
3158 return (0);
3162 * Emulate the x86 MMU's management of the accessed
3163 * and dirty flags. While the accessed flag is set
3164 * at every level of the page table, the dirty flag
3165 * is only set at the last level providing the guest
3166 * physical address.
3168 if (!check_only && (pte32 & PG_A) == 0) {
3169 if (atomic_cmpset_32(&ptpbase32[ptpindex],
3170 pte32, pte32 | PG_A) == 0) {
3171 goto restart;
3175 /* XXX must be ignored if CR4.PSE=0 */
3176 if (nlevels > 0 && (pte32 & PG_PS) != 0)
3177 break;
3179 ptpphys = pte32;
3182 /* Set the dirty bit in the page table entry if necessary */
3183 if (!check_only && writable && (pte32 & PG_M) == 0) {
3184 if (atomic_cmpset_32(&ptpbase32[ptpindex],
3185 pte32, pte32 | PG_M) == 0) {
3186 goto restart;
3190 /* Zero out the lower 'ptpshift' bits */
3191 pte32 >>= ptpshift; pte32 <<= ptpshift;
3192 *gpa = pte32 | (gla & (pgsize - 1));
3193 ptp_release(&cookie);
3194 return (0);
3197 if (paging->paging_mode == PAGING_MODE_PAE) {
3198 /* Zero out the lower 5 bits and the upper 32 bits */
3199 ptpphys &= 0xffffffe0UL;
3201 ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof (*ptpbase) * 4,
3202 &cookie);
3203 if (ptpbase == NULL) {
3204 return (EFAULT);
3207 ptpindex = (gla >> 30) & 0x3;
3209 pte = ptpbase[ptpindex];
3211 if ((pte & PG_V) == 0) {
3212 if (!check_only) {
3213 pfcode = pf_error_code(usermode, prot, 0, pte);
3214 vm_inject_pf(vm, vcpuid, pfcode, gla);
3217 ptp_release(&cookie);
3218 *guest_fault = 1;
3219 return (0);
3222 ptpphys = pte;
3224 nlevels = 2;
3225 } else {
3226 nlevels = 4;
3229 while (--nlevels >= 0) {
3230 /* Zero out the lower 12 bits and the upper 12 bits */
3231 ptpphys &= 0x000ffffffffff000UL;
3233 ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
3234 if (ptpbase == NULL) {
3235 return (EFAULT);
3238 ptpshift = PAGE_SHIFT + nlevels * 9;
3239 ptpindex = (gla >> ptpshift) & 0x1FF;
3240 pgsize = 1UL << ptpshift;
3242 pte = ptpbase[ptpindex];
3244 if ((pte & PG_V) == 0 ||
3245 (usermode && (pte & PG_U) == 0) ||
3246 (writable && (pte & PG_RW) == 0)) {
3247 if (!check_only) {
3248 pfcode = pf_error_code(usermode, prot, 0, pte);
3249 vm_inject_pf(vm, vcpuid, pfcode, gla);
3252 ptp_release(&cookie);
3253 *guest_fault = 1;
3254 return (0);
3257 /* Set the accessed bit in the page table entry */
3258 if (!check_only && (pte & PG_A) == 0) {
3259 if (atomic_cmpset_64(&ptpbase[ptpindex],
3260 pte, pte | PG_A) == 0) {
3261 goto restart;
3265 if (nlevels > 0 && (pte & PG_PS) != 0) {
3266 if (pgsize > 1 * GB) {
3267 if (!check_only) {
3268 pfcode = pf_error_code(usermode, prot,
3269 1, pte);
3270 vm_inject_pf(vm, vcpuid, pfcode, gla);
3273 ptp_release(&cookie);
3274 *guest_fault = 1;
3275 return (0);
3277 break;
3280 ptpphys = pte;
3283 /* Set the dirty bit in the page table entry if necessary */
3284 if (!check_only && writable && (pte & PG_M) == 0) {
3285 if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
3286 goto restart;
3288 ptp_release(&cookie);
3290 /* Zero out the lower 'ptpshift' bits and the upper 12 bits */
3291 pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
3292 *gpa = pte | (gla & (pgsize - 1));
3293 return (0);
3297 vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3298 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
3301 return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
3302 false));
3306 vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3307 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
3310 return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
3311 true));
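/* Fetch up to VIE_INST_SIZE bytes of the instruction at 'rip' from the guest. */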
3315 vie_fetch_instruction(struct vie *vie, struct vm *vm, int vcpuid, uint64_t rip,
3316 int *faultptr)
3318 struct vm_copyinfo copyinfo[2];
3319 int error, prot;
3321 if ((vie->status & VIES_INIT) == 0) {
3322 return (EINVAL);
3325 prot = PROT_READ | PROT_EXEC;
3326 error = vm_copy_setup(vm, vcpuid, &vie->paging, rip, VIE_INST_SIZE,
3327 prot, copyinfo, nitems(copyinfo), faultptr);
3328 if (error || *faultptr)
3329 return (error);
3331 vm_copyin(vm, vcpuid, copyinfo, vie->inst, VIE_INST_SIZE);
3332 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
3333 vie->num_valid = VIE_INST_SIZE;
3334 vie->status |= VIES_INST_FETCH;
3335 return (0);
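/*
 * vie_peek() returns the next unprocessed instruction byte without consuming
 * it; vie_advance() consumes it.
 */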
3338 static int
3339 vie_peek(struct vie *vie, uint8_t *x)
3342 if (vie->num_processed < vie->num_valid) {
3343 *x = vie->inst[vie->num_processed];
3344 return (0);
3345 } else
3346 return (-1);
3349 static void
3350 vie_advance(struct vie *vie)
3353 vie->num_processed++;
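/* Recognize a segment-override prefix byte and report the segment it selects. */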
3356 static bool
3357 segment_override(uint8_t x, int *seg)
3360 switch (x) {
3361 case 0x2E:
3362 *seg = VM_REG_GUEST_CS;
3363 break;
3364 case 0x36:
3365 *seg = VM_REG_GUEST_SS;
3366 break;
3367 case 0x3E:
3368 *seg = VM_REG_GUEST_DS;
3369 break;
3370 case 0x26:
3371 *seg = VM_REG_GUEST_ES;
3372 break;
3373 case 0x64:
3374 *seg = VM_REG_GUEST_FS;
3375 break;
3376 case 0x65:
3377 *seg = VM_REG_GUEST_GS;
3378 break;
3379 default:
3380 return (false);
3382 return (true);
3385 static int
3386 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
3388 uint8_t x;
3390 while (1) {
3391 if (vie_peek(vie, &x))
3392 return (-1);
3394 if (x == 0x66)
3395 vie->opsize_override = 1;
3396 else if (x == 0x67)
3397 vie->addrsize_override = 1;
3398 else if (x == 0xF3)
3399 vie->repz_present = 1;
3400 else if (x == 0xF2)
3401 vie->repnz_present = 1;
3402 else if (segment_override(x, &vie->segment_register))
3403 vie->segment_override = 1;
3404 else
3405 break;
3407 vie_advance(vie);
3411 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
3412 * - Only one REX prefix is allowed per instruction.
3413 * - The REX prefix must immediately precede the opcode byte or the
3414 * escape opcode byte.
3415 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
3416 * the mandatory prefix must come before the REX prefix.
3418 if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
3419 vie->rex_present = 1;
3420 vie->rex_w = x & 0x8 ? 1 : 0;
3421 vie->rex_r = x & 0x4 ? 1 : 0;
3422 vie->rex_x = x & 0x2 ? 1 : 0;
3423 vie->rex_b = x & 0x1 ? 1 : 0;
3424 vie_advance(vie);
3428 * § 2.3.5, "The VEX Prefix", SDM Vol 2.
3430 if ((cpu_mode == CPU_MODE_64BIT ||
3431 cpu_mode == CPU_MODE_COMPATIBILITY) && x == 0xC4) {
3432 const struct vie_op *optab;
3434 /* 3-byte VEX prefix. */
3435 vie->vex_present = 1;
3437 vie_advance(vie);
3438 if (vie_peek(vie, &x))
3439 return (-1);
3442 * 2nd byte: [R', X', B', mmmmm[4:0]]. Bits are inverted
3443 * relative to REX encoding.
3445 vie->rex_r = x & 0x80 ? 0 : 1;
3446 vie->rex_x = x & 0x40 ? 0 : 1;
3447 vie->rex_b = x & 0x20 ? 0 : 1;
3449 switch (x & 0x1F) {
3450 case 0x2:
3451 /* 0F 38. */
3452 optab = three_byte_opcodes_0f38;
3453 break;
3454 case 0x1:
3455 /* 0F class - nothing handled here yet. */
3456 /* FALLTHROUGH */
3457 case 0x3:
3458 /* 0F 3A class - nothing handled here yet. */
3459 /* FALLTHROUGH */
3460 default:
3461 /* Reserved (#UD). */
3462 return (-1);
3465 vie_advance(vie);
3466 if (vie_peek(vie, &x))
3467 return (-1);
3469 /* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */
3470 vie->rex_w = x & 0x80 ? 1 : 0;
3472 vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3);
3473 vie->vex_l = !!(x & 0x4);
3474 vie->vex_pp = (x & 0x3);
3476 /* PP: 1=66 2=F3 3=F2 prefixes. */
3477 switch (vie->vex_pp) {
3478 case 0x1:
3479 vie->opsize_override = 1;
3480 break;
3481 case 0x2:
3482 vie->repz_present = 1;
3483 break;
3484 case 0x3:
3485 vie->repnz_present = 1;
3486 break;
3489 vie_advance(vie);
3491 /* Opcode, sans the literal prefix. */
3492 if (vie_peek(vie, &x))
3493 return (-1);
3495 vie->op = optab[x];
3496 if (vie->op.op_type == VIE_OP_TYPE_NONE)
3497 return (-1);
3499 vie_advance(vie);
3503 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
3505 if (cpu_mode == CPU_MODE_64BIT) {
3507 * Default address size is 64-bits and default operand size
3508 * is 32-bits.
3510 vie->addrsize = vie->addrsize_override ? 4 : 8;
3511 if (vie->rex_w)
3512 vie->opsize = 8;
3513 else if (vie->opsize_override)
3514 vie->opsize = 2;
3515 else
3516 vie->opsize = 4;
3517 } else if (cs_d) {
3518 /* Default address and operand sizes are 32-bits */
3519 vie->addrsize = vie->addrsize_override ? 2 : 4;
3520 vie->opsize = vie->opsize_override ? 2 : 4;
3521 } else {
3522 /* Default address and operand sizes are 16-bits */
3523 vie->addrsize = vie->addrsize_override ? 4 : 2;
3524 vie->opsize = vie->opsize_override ? 4 : 2;
3526 return (0);
3529 static int
3530 decode_two_byte_opcode(struct vie *vie)
3532 uint8_t x;
3534 if (vie_peek(vie, &x))
3535 return (-1);
3537 vie->op = two_byte_opcodes[x];
3539 if (vie->op.op_type == VIE_OP_TYPE_NONE)
3540 return (-1);
3542 vie_advance(vie);
3543 return (0);
3546 static int
3547 decode_opcode(struct vie *vie)
3549 uint8_t x;
3551 if (vie_peek(vie, &x))
3552 return (-1);
3554 /* Already did this via VEX prefix. */
3555 if (vie->op.op_type != VIE_OP_TYPE_NONE)
3556 return (0);
3558 vie->op = one_byte_opcodes[x];
3560 if (vie->op.op_type == VIE_OP_TYPE_NONE)
3561 return (-1);
3563 vie_advance(vie);
3565 if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
3566 return (decode_two_byte_opcode(vie));
3568 return (0);
3571 static int
3572 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
3574 uint8_t x;
3576 * Handling mov-to/from-cr is special since it does not issue
3577 * mmio/pio requests and can be done in real mode. We must bypass some
3578 * of the other existing decoding restrictions for it.
3580 const bool is_movcr = ((vie->op.op_flags & VIE_OP_F_REG_REG) != 0);
3582 if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
3583 return (0);
3585 if (cpu_mode == CPU_MODE_REAL && !is_movcr)
3586 return (-1);
3588 if (vie_peek(vie, &x))
3589 return (-1);
3591 vie->mod = (x >> 6) & 0x3;
3592 vie->rm = (x >> 0) & 0x7;
3593 vie->reg = (x >> 3) & 0x7;
3596 * A direct addressing mode makes no sense in the context of an EPT
3597 * fault. There has to be a memory access involved to cause the
3598 * EPT fault.
3600 if (vie->mod == VIE_MOD_DIRECT && !is_movcr)
3601 return (-1);
3603 if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
3604 (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
3606 * Table 2-5: Special Cases of REX Encodings
3608 * mod=0, r/m=5 is used in the compatibility mode to
3609 * indicate a disp32 without a base register.
3611 * mod!=3, r/m=4 is used in the compatibility mode to
3612 * indicate that the SIB byte is present.
3614 * The 'b' bit in the REX prefix is don't care in
3615 * this case.
3617 } else {
3618 vie->rm |= (vie->rex_b << 3);
3621 vie->reg |= (vie->rex_r << 3);
3623 /* SIB */
3624 if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
3625 goto done;
3627 vie->base_register = gpr_map[vie->rm];
3629 switch (vie->mod) {
3630 case VIE_MOD_INDIRECT_DISP8:
3631 vie->disp_bytes = 1;
3632 break;
3633 case VIE_MOD_INDIRECT_DISP32:
3634 vie->disp_bytes = 4;
3635 break;
3636 case VIE_MOD_INDIRECT:
3637 if (vie->rm == VIE_RM_DISP32) {
3638 vie->disp_bytes = 4;
3640 * Table 2-7. RIP-Relative Addressing
3642 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
3643 * whereas in compatibility mode it just implies disp32.
3646 if (cpu_mode == CPU_MODE_64BIT)
3647 vie->base_register = VM_REG_GUEST_RIP;
3648 else
3649 vie->base_register = VM_REG_LAST;
3651 break;
3654 done:
3655 vie_advance(vie);
3657 return (0);
3660 static int
3661 decode_sib(struct vie *vie)
3663 uint8_t x;
3665 /* Proceed only if SIB byte is present */
3666 if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
3667 return (0);
3669 if (vie_peek(vie, &x))
3670 return (-1);
3672 /* De-construct the SIB byte */
3673 vie->ss = (x >> 6) & 0x3;
3674 vie->index = (x >> 3) & 0x7;
3675 vie->base = (x >> 0) & 0x7;
3677 /* Apply the REX prefix modifiers */
3678 vie->index |= vie->rex_x << 3;
3679 vie->base |= vie->rex_b << 3;
3681 switch (vie->mod) {
3682 case VIE_MOD_INDIRECT_DISP8:
3683 vie->disp_bytes = 1;
3684 break;
3685 case VIE_MOD_INDIRECT_DISP32:
3686 vie->disp_bytes = 4;
3687 break;
3690 if (vie->mod == VIE_MOD_INDIRECT &&
3691 (vie->base == 5 || vie->base == 13)) {
3693 * Special case: the base register is unused if mod = 0
3694 * and base = %rbp or %r13.
3696 * Documented in:
3697 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
3698 * Table 2-5: Special Cases of REX Encodings
3700 vie->disp_bytes = 4;
3701 } else {
3702 vie->base_register = gpr_map[vie->base];
3706 * All encodings of 'index' are valid except for %rsp (4).
3708 * Documented in:
3709 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
3710 * Table 2-5: Special Cases of REX Encodings
3712 if (vie->index != 4)
3713 vie->index_register = gpr_map[vie->index];
3715 /* 'scale' makes sense only in the context of an index register */
3716 if (vie->index_register < VM_REG_LAST)
3717 vie->scale = 1 << vie->ss;
3719 vie_advance(vie);
3721 return (0);
3724 static int
3725 decode_displacement(struct vie *vie)
3727 int n, i;
3728 uint8_t x;
3730 union {
3731 char buf[4];
3732 int8_t signed8;
3733 int32_t signed32;
3734 } u;
3736 if ((n = vie->disp_bytes) == 0)
3737 return (0);
3739 if (n != 1 && n != 4)
3740 panic("decode_displacement: invalid disp_bytes %d", n);
3742 for (i = 0; i < n; i++) {
3743 if (vie_peek(vie, &x))
3744 return (-1);
3746 u.buf[i] = x;
3747 vie_advance(vie);
3750 if (n == 1)
3751 vie->displacement = u.signed8; /* sign-extended */
3752 else
3753 vie->displacement = u.signed32; /* sign-extended */
3755 return (0);
3758 static int
3759 decode_immediate(struct vie *vie)
3761 int i, n;
3762 uint8_t x;
3763 union {
3764 char buf[4];
3765 int8_t signed8;
3766 int16_t signed16;
3767 int32_t signed32;
3768 } u;
3770 /* Figure out immediate operand size (if any) */
3771 if (vie->op.op_flags & VIE_OP_F_IMM) {
3773 * Section 2.2.1.5 "Immediates", Intel SDM:
3774 * In 64-bit mode the typical size of immediate operands
3775 * remains 32-bits. When the operand size is 64-bits, the
3776 * processor sign-extends all immediates to 64-bits prior
3777 * to their use.
3779 if (vie->opsize == 4 || vie->opsize == 8)
3780 vie->imm_bytes = 4;
3781 else
3782 vie->imm_bytes = 2;
3783 } else if (vie->op.op_flags & VIE_OP_F_IMM8) {
3784 vie->imm_bytes = 1;
3787 if ((n = vie->imm_bytes) == 0)
3788 return (0);
3790 KASSERT(n == 1 || n == 2 || n == 4,
3791 ("%s: invalid number of immediate bytes: %d", __func__, n));
3793 for (i = 0; i < n; i++) {
3794 if (vie_peek(vie, &x))
3795 return (-1);
3797 u.buf[i] = x;
3798 vie_advance(vie);
3801 /* sign-extend the immediate value before use */
3802 if (n == 1)
3803 vie->immediate = u.signed8;
3804 else if (n == 2)
3805 vie->immediate = u.signed16;
3806 else
3807 vie->immediate = u.signed32;
3809 return (0);
3812 static int
3813 decode_moffset(struct vie *vie)
3815 int i, n;
3816 uint8_t x;
3817 union {
3818 char buf[8];
3819 uint64_t u64;
3820 } u;
3822 if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
3823 return (0);
3826 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
3827 * The memory offset size follows the address-size of the instruction.
3829 n = vie->addrsize;
3830 KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
3832 u.u64 = 0;
3833 for (i = 0; i < n; i++) {
3834 if (vie_peek(vie, &x))
3835 return (-1);
3837 u.buf[i] = x;
3838 vie_advance(vie);
3840 vie->displacement = u.u64;
3841 return (0);
3845 * Verify that the 'guest linear address' provided as collateral of the nested
3846 * page table fault matches with our instruction decoding.
3849 vie_verify_gla(struct vie *vie, struct vm *vm, int cpuid, uint64_t gla)
3851 int error;
3852 uint64_t base, segbase, idx, gla2;
3853 enum vm_reg_name seg;
3854 struct seg_desc desc;
3856 ASSERT((vie->status & VIES_INST_DECODE) != 0);
3859 * If there was no valid GLA context with the exit, or the decoded
3860 * instruction acts on more than one address, verification is skipped.
3862 if (gla == VIE_INVALID_GLA ||
3863 (vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) != 0) {
3864 return (0);
3867 base = 0;
3868 if (vie->base_register != VM_REG_LAST) {
3869 error = vm_get_register(vm, cpuid, vie->base_register, &base);
3870 if (error) {
3871 printf("verify_gla: error %d getting base reg %d\n",
3872 error, vie->base_register);
3873 return (-1);
3877 * RIP-relative addressing starts from the following
3878 * instruction
3880 if (vie->base_register == VM_REG_GUEST_RIP)
3881 base += vie->num_processed;
3884 idx = 0;
3885 if (vie->index_register != VM_REG_LAST) {
3886 error = vm_get_register(vm, cpuid, vie->index_register, &idx);
3887 if (error) {
3888 printf("verify_gla: error %d getting index reg %d\n",
3889 error, vie->index_register);
3890 return (-1);
3895 * From "Specifying a Segment Selector", Intel SDM, Vol 1
3897 * In 64-bit mode, segmentation is generally (but not
3898 * completely) disabled. The exceptions are the FS and GS
3899 * segments.
3901 * In legacy IA-32 mode, when the ESP or EBP register is used
3902 * as the base, the SS segment is the default segment. For
3903 * other data references, except when relative to the stack or a
3904 * string destination, the DS segment is the default. These
3905 * can be overridden to allow other segments to be accessed.
3907 if (vie->segment_override) {
3908 seg = vie->segment_register;
3909 } else if (vie->base_register == VM_REG_GUEST_RSP ||
3910 vie->base_register == VM_REG_GUEST_RBP) {
3911 seg = VM_REG_GUEST_SS;
3912 } else {
3913 seg = VM_REG_GUEST_DS;
3915 if (vie->paging.cpu_mode == CPU_MODE_64BIT &&
3916 seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) {
3917 segbase = 0;
3918 } else {
3919 error = vm_get_seg_desc(vm, cpuid, seg, &desc);
3920 if (error) {
3921 printf("verify_gla: error %d getting segment"
3922 " descriptor %d", error, vie->segment_register);
3923 return (-1);
3925 segbase = desc.base;
3928 gla2 = segbase + base + vie->scale * idx + vie->displacement;
3929 gla2 &= size2mask[vie->addrsize];
3930 if (gla != gla2) {
3931 printf("verify_gla mismatch: segbase(0x%0lx), "
3932 "base(0x%0lx), scale(%d), index(0x%0lx), "
3933 "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
3934 segbase, base, vie->scale, idx, vie->displacement,
3935 gla, gla2);
3936 return (-1);
3939 return (0);
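/*
 * Decode the fetched instruction bytes: prefixes, opcode, ModR/M, SIB,
 * displacement, immediate, and memory offset.
 */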
3943 vie_decode_instruction(struct vie *vie, struct vm *vm, int cpuid, int cs_d)
3945 enum vm_cpu_mode cpu_mode;
3947 if ((vie->status & VIES_INST_FETCH) == 0) {
3948 return (EINVAL);
3951 cpu_mode = vie->paging.cpu_mode;
3953 if (decode_prefixes(vie, cpu_mode, cs_d))
3954 return (-1);
3956 if (decode_opcode(vie))
3957 return (-1);
3959 if (decode_modrm(vie, cpu_mode))
3960 return (-1);
3962 if (decode_sib(vie))
3963 return (-1);
3965 if (decode_displacement(vie))
3966 return (-1);
3968 if (decode_immediate(vie))
3969 return (-1);
3971 if (decode_moffset(vie))
3972 return (-1);
3974 vie->status |= VIES_INST_DECODE;
3976 return (0);