15325 bhyve upstream sync 2023 January
[illumos-gate.git] usr/src/cmd/bhyve/task_switch.c
blob 4b2a464a7b203f5f33a38e70608da5c385e8709a
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2020 Oxide Computer Company
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <x86/psl.h>
#include <x86/segments.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include <vmmapi.h>

#include "bhyverun.h"
#include "debug.h"
/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
struct tss32 {
	uint16_t	tss_link;
	uint16_t	rsvd1;
	uint32_t	tss_esp0;
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;
	uint16_t	rsvd11;
	uint16_t	tss_trap;
	uint16_t	tss_iomap;
};
static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed");
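/*
 * A segment selector's low three bits hold the RPL and table indicator, so
 * masking them off gives the byte offset of the descriptor within the
 * GDT/LDT and OR-ing them in gives the offset of its last byte.  TSS_BUSY
 * tests the busy bit that distinguishes a busy TSS type from an available
 * one (e.g. SDT_SYS386BSY vs. SDT_SYS386TSS).
 */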
#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)
static uint64_t
GETREG(struct vmctx *ctx, int vcpu, int reg)
{
	uint64_t val;
	int error;

	error = vm_get_register(ctx, vcpu, reg, &val);
	assert(error == 0);
	return (val);
}
static void
SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
	int error;

	error = vm_set_register(ctx, vcpu, reg, val);
	assert(error == 0);
}
static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
	struct seg_desc seg_desc;

	seg_desc.base = (u_int)USD_GETBASE(usd);
	if (usd->sd_gran)
		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
	else
		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
	seg_desc.access |= usd->sd_xx << 12;
	seg_desc.access |= usd->sd_def32 << 14;
	seg_desc.access |= usd->sd_gran << 15;

	return (seg_desc);
}
/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
static void
sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
{
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	sel &= ~0x3;
	if (ext)
		sel |= 0x1;
	vm_inject_fault(ctx, vcpu, vector, 1, sel);
}
/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
 * and non-zero otherwise.
 */
static int
desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
{
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);

	if (reg == VM_REG_GUEST_LDTR) {
		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
			return (-1);
	}

	if (limit < SEL_LIMIT(sel))
		return (-1);
	else
		return (0);
}
/*
 * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced
 * by the selector 'sel'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread,
    int *faultptr)
{
	struct iovec iov[2];
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);
	assert(limit >= SEL_LIMIT(sel));

	error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
	    faultptr);
	if (error || *faultptr)
		return (error);

	if (doread)
		vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
	else
		vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
	return (0);
}
static int
desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr));
}

static int
desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr));
}
/*
 * Read the TSS descriptor referenced by 'sel' into 'desc'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	int error;

	assert(!ISLDT(sel));
	assert(IDXSEL(sel) != 0);

	/* Fetch the new TSS descriptor */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		if (ts->reason == TSR_IRET)
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		else
			sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext);
		return (1);
	}

	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr);
	return (error);
}
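/*
 * The sd_type values tested below are the 5-bit type field of a user
 * segment descriptor with the S (descriptor type) bit included as bit 4:
 * 0x10-0x17 are data segments, 0x18-0x1f are code segments, bit 1 is the
 * writable (data) / readable (code) bit and bit 2 is expand-down (data) /
 * conforming (code).
 */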
static bool
code_desc(int sd_type)
{
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);
}

static bool
stack_desc(int sd_type)
{
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);
}

static bool
data_desc(int sd_type)
{
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}

static bool
ldt_desc(int sd_type)
{
	return (sd_type == SDT_SYSLDT);
}
/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 */
static int
validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	int error, idtvec;
	int cpl, dpl, rpl;
	uint16_t sel, cs;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	ldtseg = codeseg = stackseg = dataseg = false;
	switch (segment) {
	case VM_REG_GUEST_LDTR:
		ldtseg = true;
		break;
	case VM_REG_GUEST_CS:
		codeseg = true;
		break;
	case VM_REG_GUEST_SS:
		stackseg = true;
		break;
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
		dataseg = true;
		break;
	default:
		assert(0);
	}

	/* Get the segment selector */
	sel = GETREG(ctx, vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Descriptor table limit check */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* NULL selector */
	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
		seg_desc->base = 0;
		seg_desc->limit = 0;
		seg_desc->access = 0x10000;	/* unusable */
		return (0);
	}

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr);
	if (error || *faultptr)
		return (error);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Segment must be marked present */
	if (!usd.sd_p) {
		if (ldtseg)
			idtvec = IDT_TS;
		else if (stackseg)
			idtvec = IDT_SS;
		else
			idtvec = IDT_NP;
		sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
		return (1);
	}

	cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	dpl = usd.sd_dpl;

	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	if (codeseg) {
		conforming = (usd.sd_type & 0x4) ? true : false;
		if ((conforming && (cpl < dpl)) ||
		    (!conforming && (cpl != dpl))) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}

	if (dataseg) {
		/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
		 */
		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
			conforming = true;
		else
			conforming = false;

		if (!conforming && (rpl > dpl || cpl > dpl)) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}
	*seg_desc = usd_to_seg_desc(&usd);
	return (0);
}
static void
tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
{
	/* General purpose registers */
	tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
	tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
	tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
	tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
	tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
	tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
	tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);

	/* Segment selectors */
	tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
	tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
	tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
	tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
	tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);

	/* eflags and eip */
	tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	if (task_switch->reason == TSR_IRET)
		tss->tss_eflags &= ~PSL_NT;
	tss->tss_eip = eip;

	/* Copy updated old TSS into guest memory */
	vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
}
static void
update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
{
	int error;

	error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
	assert(error == 0);
}
/*
 * Update the vcpu registers to reflect the state of the new task.
 */
static int
tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;
	uint32_t eflags;
	int error, i;
	bool nested;

	nested = false;
	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;
		nested = true;
	}

	eflags = tss->tss_eflags;
	if (nested)
		eflags |= PSL_NT;

	/* LDTR */
	SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);
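	/*
	 * Under PAE paging the processor caches the four page-directory-
	 * pointer-table entries when CR3 is loaded.  Since the task switch
	 * loads CR3 on the guest's behalf, the PDPTEs are read and validated
	 * here (reserved-bit violations raise #GP) before being handed to
	 * the kernel along with the new CR3 value.
	 */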
	/* PDBR */
	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
			maxphyaddr = (1UL << 36) - 1;
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
					continue;
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved.
				 */
				reserved = ~maxphyaddr | 0x1E6;
				if (pdpte[i] & reserved) {
					vm_inject_gp(ctx, vcpu);
					return (1);
				}
			}
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
		}
		SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;
	}
	/* eflags and eip */
	SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	if (nested)
		vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));

	/* Validate segment descriptors */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them. This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);

	return (0);
}
/*
 * Copy of vie_alignment_check() from vmm_instruction_emul.c
 */
static int
alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
	assert(size == 1 || size == 2 || size == 4 || size == 8);
	assert(cpl >= 0 && cpl <= 3);

	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
		return (0);

	return ((gla & (size - 1)) ? 1 : 0);
}
/*
 * Copy of vie_size2mask() from vmm_instruction_emul.c
 */
static uint64_t
size2mask(int size)
{
	switch (size) {
	case 1:
		return (0xff);
	case 2:
		return (0xffff);
	case 4:
		return (0xffffffff);
	case 8:
		return (0xffffffffffffffff);
	default:
		assert(0);
		/* not reached */
		return (0);
	}
}
/*
 * Copy of vie_calculate_gla() from vmm_instruction_emul.c
 */
static int
calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	assert(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS);
	assert((length == 1 || length == 2 || length == 4 || length == 8));
	assert((prot & ~(PROT_READ | PROT_WRITE)) == 0);

	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		assert(addrsize == 4 || addrsize == 8);
		glasize = 8;
	} else {
		assert(addrsize == 2 || addrsize == 4);
		glasize = 4;
		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		assert(SEG_DESC_PRESENT(desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		assert(type >= 16 && type <= 31);

		if (prot & PROT_READ) {
			/* #GP on a read access to an exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

		while (length > 0) {
			offset &= size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= size2mask(addrsize);
	*gla = (segbase + firstoff) & size2mask(glasize);
	return (0);
}
/*
 * Push an error code on the stack of the new task. This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 */
static int
push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode, int *faultptr)
{
	struct iovec iov[2];
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;
	uint32_t esp;
	uint16_t stacksel;

	*faultptr = 0;

	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);
	assert(error == 0);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
		bytes = 4;
	else
		bytes = 2;

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))
		stacksize = 4;
	else
		stacksize = 2;

	esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	esp -= bytes;

	if (calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
		*faultptr = 1;
		return (0);
	}

	if (alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		vm_inject_ac(ctx, vcpu, 1);
		*faultptr = 1;
		return (0);
	}

	error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
	    iov, nitems(iov), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyout(ctx, vcpu, &errcode, iov, bytes);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
	return (0);
}
/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 */
#define	CHKERR(error,fault)						\
	do {								\
		assert((error == 0) || (error == EFAULT));		\
		if (error)						\
			return (VMEXIT_ABORT);				\
		else if (fault)						\
			return (VMEXIT_CONTINUE);			\
	} while (0)
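/*
 * Emulate a hardware task switch on behalf of the guest.  In outline this
 * follows the sequence in the "Task Switching" section of the Intel SDM,
 * Vol 3: fetch and validate the new TSS descriptor, save the outgoing
 * context into the old TSS, adjust the busy bits of both TSS descriptors,
 * load TR and the new context from the new TSS, and finally push an error
 * code onto the new task's stack if the switch was caused by an exception
 * that supplies one.
 */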
int
vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, fault, minlimit, nt_type, ot_type, vcpu;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;
	vcpu = *pvcpu;
	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Calculate the instruction pointer to store in the old TSS.
	 */
	eip = vmexit->rip + vmexit->inst_length;

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */
	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc,
	    &fault);
	CHKERR(error, fault);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;

	assert(minlimit > 0);
	if (nt.limit < (unsigned int)minlimit) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}

	/* Fetch the new TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);
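	/*
	 * Note that only 'minlimit + 1' bytes are mapped and copied: this is
	 * the architecturally defined portion of a 32-bit TSS (104 bytes,
	 * i.e. sizeof (struct tss32)), regardless of how large a limit the
	 * guest gave the TSS segment.
	 */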
	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc,
	    &fault);
	CHKERR(error, fault);

	/* Get the old TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);
	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * is due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
		    &ot_desc, &fault);
		CHKERR(error, fault);
	}

	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		EPRINTLN("Task switch to 16-bit TSS not supported");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
		    &nt_desc, &fault);
		CHKERR(error, fault);
	}

	/* Update task register to point at the new TSS */
	SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);
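	/*
	 * A hardware task switch always sets CR0.TS, so that the first
	 * FPU/SSE instruction executed by the new task raises #NM and the
	 * OS can lazily save and restore the floating point context.
	 */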
	/* Set CR0.TS */
	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
	assert(error == 0);

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
	    &fault);
	CHKERR(error, fault);
1045 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
1046 * caused an error code to be generated, this error code is copied
1047 * to the stack of the new task.
1049 if (task_switch->errcode_valid) {
1050 assert(task_switch->ext);
1051 assert(task_switch->reason == TSR_IDT_GATE);
1052 error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
1053 task_switch->errcode, &fault);
1054 CHKERR(error, fault);
1058 * Treatment of virtual-NMI blocking if NMI is delivered through
1059 * a task gate.
1061 * Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
1062 * If the virtual NMIs VM-execution control is 1, VM entry injects
1063 * an NMI, and delivery of the NMI causes a task switch that causes
1064 * a VM exit, virtual-NMI blocking is in effect before the VM exit
1065 * commences.
1067 * Thus, virtual-NMI blocking is in effect at the time of the task
1068 * switch VM exit.
1072 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
1074 * Section "Changes to Instruction Behavior in VMX Non-Root Operation"
1075 * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking.
1076 * This unblocking of virtual-NMI occurs even if IRET causes a fault.
1078 * Thus, virtual-NMI blocking is cleared at the time of the task switch
1079 * VM exit.
1083 * If the task switch was triggered by an event delivered through
1084 * the IDT then extinguish the pending event from the vcpu's
1085 * exitintinfo.
1087 if (task_switch->reason == TSR_IDT_GATE) {
1088 error = vm_set_intinfo(ctx, vcpu, 0);
1089 assert(error == 0);
1093 * XXX should inject debug exception if 'T' bit is 1
1095 done:
1096 return (VMEXIT_CONTINUE);