/*
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/*
 * Copyright 2020 Oxide Computer Company
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <machine/psl.h>
#include <x86/segments.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>

#include <assert.h>
#include <errno.h>
#include <stdbool.h>

#include <vmmapi.h>

#include "bhyverun.h"
#include "debug.h"
/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
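/*
 * Layout of the hardware-defined 32-bit TSS (Intel SDM Vol 3, "Task
 * Management").  This is a reconstructed sketch: the field names follow
 * the tss_* convention used by the rest of this file, but the names of
 * the reserved padding fields (rsvd*) are assumptions.
 */
struct tss32 {
	uint16_t	tss_link;	/* previous task link */
	uint16_t	rsvd1;
	uint32_t	tss_esp0;	/* privilege level 0 stack */
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;	/* privilege level 1 stack */
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;	/* privilege level 2 stack */
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;	/* page table base */
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;	/* LDT segment selector */
	uint16_t	rsvd11;
	uint16_t	tss_trap;	/* T bit: debug trap on task switch */
	uint16_t	tss_iomap;	/* I/O map base address */
};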
static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed");
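/*
 * A segment selector is 16 bits wide: bits 15:3 index the descriptor
 * table, bit 2 is the table indicator (TI, 1 = LDT) and bits 1:0 hold
 * the RPL.  SEL_START/SEL_LIMIT therefore give the byte offsets of the
 * first and last byte of the 8-byte descriptor slot named by 'sel'.
 * TSS_BUSY tests the busy bit (type bit 0x2) of a TSS descriptor.
 */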
#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)
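/*
 * GETREG/SETREG are convenience wrappers around the vmmapi register
 * accessors.  A failure here indicates a bug in the emulation rather
 * than guest misbehavior, so errors are asserted away instead of being
 * reflected back into the guest.
 */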
static uint64_t
GETREG(struct vmctx *ctx, int vcpu, int reg)
{
	uint64_t val;
	int error;

	error = vm_get_register(ctx, vcpu, reg, &val);
	assert(error == 0);
	return (val);
}

static void
SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
	int error;

	error = vm_set_register(ctx, vcpu, reg, val);
	assert(error == 0);
}
static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
	struct seg_desc seg_desc;

	seg_desc.base = (u_int)USD_GETBASE(usd);
	if (usd->sd_gran)
		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
	else
		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
	seg_desc.access |= usd->sd_xx << 12;
	seg_desc.access |= usd->sd_def32 << 14;
	seg_desc.access |= usd->sd_gran << 15;

	return (seg_desc);
}
/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
static void
sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
{
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	sel &= ~0x3;
	if (ext)
		sel |= 0x1;
	vm_inject_fault(ctx, vcpu, vector, 1, sel);
}
/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
 * and non-zero otherwise.
 */
static int
desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
{
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);

	if (reg == VM_REG_GUEST_LDTR) {
		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
			return (-1);
	}

	if (limit < SEL_LIMIT(sel))
		return (-1);
	else
		return (0);
}
/*
 * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced
 * by the selector 'sel'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread,
    int *faultptr)
{
	struct iovec iov[2];
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);
	assert(limit >= SEL_LIMIT(sel));

	error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
	    faultptr);
	if (error || *faultptr)
		return (error);

	if (doread)
		vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
	else
		vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
	return (0);
}
static int
desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr));
}

static int
desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr));
}
/*
 * Read the TSS descriptor referenced by 'sel' into 'desc'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	int error;

	assert(!ISLDT(sel));
	assert(IDXSEL(sel) != 0);

	/* Fetch the new TSS descriptor */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		if (ts->reason == TSR_IRET)
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		else
			sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext);
		return (1);
	}

	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr);
	return (error);
}
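/*
 * The predicates below decode the 5-bit type field of a segment
 * descriptor (the S bit at 0x10 plus the 4-bit type).  For code/data
 * descriptors (S = 1): bit 3 (0x8) distinguishes code from data,
 * bit 1 (0x2) means writable for data and readable for code, and
 * bit 2 (0x4) means conforming for code and expand-down for data.
 */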
static bool
code_desc(int sd_type)
{
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);
}

static bool
stack_desc(int sd_type)
{
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);
}

static bool
data_desc(int sd_type)
{
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}

static bool
ldt_desc(int sd_type)
{
	return (sd_type == SDT_SYSLDT);
}
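/*
 * Fault vectors used during descriptor validation (Intel SDM Vol 3,
 * "Task Switching"): a bad selector or descriptor type discovered
 * during a task switch raises #TS (IDT_TS) with the selector as the
 * error code; a not-present segment raises #NP (#SS for the stack
 * segment); a busy/available TSS mismatch raises #GP.
 */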
/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 */
static int
validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	int error, idtvec;
	int cpl, dpl, rpl;
	uint16_t sel, cs;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	ldtseg = codeseg = stackseg = dataseg = false;
	switch (segment) {
	case VM_REG_GUEST_LDTR:
		ldtseg = true;
		break;
	case VM_REG_GUEST_CS:
		codeseg = true;
		break;
	case VM_REG_GUEST_SS:
		stackseg = true;
		break;
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
		dataseg = true;
		break;
	default:
		assert(0);
	}

	/* Get the segment selector */
	sel = GETREG(ctx, vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Descriptor table limit check */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* NULL selector */
	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
		seg_desc->base = 0;
		seg_desc->limit = 0;
		seg_desc->access = 0x10000;	/* unusable */
		return (0);
	}

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr);
	if (error || *faultptr)
		return (error);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Segment must be marked present */
	if (!usd.sd_p) {
		if (ldtseg)
			idtvec = IDT_TS;
		else if (stackseg)
			idtvec = IDT_SS;
		else
			idtvec = IDT_NP;
		sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
		return (1);
	}

	cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	dpl = usd.sd_dpl;

	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	if (codeseg) {
		conforming = (usd.sd_type & 0x4) ? true : false;
		if ((conforming && (cpl < dpl)) ||
		    (!conforming && (cpl != dpl))) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}

	if (dataseg) {
		/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
		 */
		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
			conforming = true;
		else
			conforming = false;

		if (!conforming && (rpl > dpl || cpl > dpl)) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}
	*seg_desc = usd_to_seg_desc(&usd);
	return (0);
}
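/*
 * Save the dynamic state of the outgoing task (general purpose
 * registers, segment selectors, eflags and eip) into its TSS image
 * and write that image back into guest memory.
 */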
static void
tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
{

	/* General purpose registers */
	tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
	tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
	tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
	tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
	tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
	tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
	tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);

	/* Segment selectors */
	tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
	tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
	tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
	tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
	tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);

	/* eflags and eip */
	tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	if (task_switch->reason == TSR_IRET)
		tss->tss_eflags &= ~PSL_NT;
	tss->tss_eip = eip;

	/* Copy updated old TSS into guest memory */
	vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
}
static void
update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
{
	int error;

	error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
	assert(error == 0);
}
/*
 * Update the vcpu registers to reflect the state of the new task.
 */
static int
tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;
	uint32_t eflags;
	int error, i;
	bool nested;

	*faultptr = 0;

	nested = false;
	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;
		nested = true;
	}

	eflags = tss->tss_eflags;
	if (nested)
		eflags |= PSL_NT;

	/* LDTR */
	SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);
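	/*
	 * On a move to %cr3 in PAE mode the processor loads the four
	 * page-directory-pointer-table entries from guest memory into
	 * internal registers; the reserved-bit checks below mirror that
	 * hardware behavior (Intel SDM Vol 3, "PAE Paging").
	 */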
	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
			maxphyaddr = (1UL << 36) - 1;
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
					continue;
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved.
				 */
				reserved = ~maxphyaddr | 0x1E6;
				if (pdpte[i] & reserved) {
					vm_inject_gp(ctx, vcpu);
					*faultptr = 1;
					return (0);
				}
			}
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
		}
		SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;
	}
	/* eflags and eip */
	SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	if (nested)
		vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));

	/* Validate segment descriptors */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them. This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);

	return (0);
}
/*
 * Copy of vie_alignment_check() from vmm_instruction_emul.c
 */
static int
alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
	assert(size == 1 || size == 2 || size == 4 || size == 8);
	assert(cpl >= 0 && cpl <= 3);

	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
		return (0);

	return ((gla & (size - 1)) ? 1 : 0);
}
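/*
 * For example, a 4-byte access at gla 0x1002 from cpl 3 with CR0.AM
 * and EFLAGS.AC both set fails the check (0x1002 & 3 != 0) and results
 * in #AC; the same access at cpl 0 never does.
 */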
/*
 * Copy of vie_size2mask() from vmm_instruction_emul.c
 */
static uint64_t
size2mask(int size)
{
	switch (size) {
	case 1:	return (0xff);
	case 2:	return (0xffff);
	case 4:	return (0xffffffff);
	case 8:	return (0xffffffffffffffff);
	default: assert(0); return (0);
	}
}
/*
 * Copy of vie_calculate_gla() from vmm_instruction_emul.c
 */
static int
calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	assert(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS);
	assert((length == 1 || length == 2 || length == 4 || length == 8));
	assert((prot & ~(PROT_READ | PROT_WRITE)) == 0);

	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		assert(addrsize == 4 || addrsize == 8);
		glasize = 8;
	} else {
		assert(addrsize == 2 || addrsize == 4);
		glasize = 4;

		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		assert(SEG_DESC_PRESENT(desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		assert(type >= 16 && type <= 31);

		if (prot & PROT_READ) {
			/* #GP on a read access to an exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

		/* test 'offset' against the limits of the segment */
		while (length > 0) {
			offset &= size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= size2mask(addrsize);
	*gla = (segbase + firstoff) & size2mask(glasize);
	return (0);
}
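/*
 * For example, a 2-byte write at offset 0xfffe of an expand-up data
 * segment with base 0x10000 and a fully expanded limit of 0xffff is
 * within bounds and yields gla = 0x1fffe, while the same write at
 * offset 0xffff would run past the limit and fail.
 */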
/*
 * Push an error code on the stack of the new task. This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 */
static int
push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode, int *faultptr)
{
	struct iovec iov[2];
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;
	uint32_t esp;
	uint16_t stacksel;

	*faultptr = 0;

	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);
	assert(error == 0);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
		bytes = 4;
	else
		bytes = 2;

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))
		stacksize = 4;
	else
		stacksize = 2;

	esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	esp -= bytes;

	if (calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
		*faultptr = 1;
		return (0);
	}

	if (alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		vm_inject_ac(ctx, vcpu, 1);
		*faultptr = 1;
		return (0);
	}

	error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
	    iov, nitems(iov), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyout(ctx, vcpu, &errcode, iov, bytes);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
	return (0);
}
/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 */
#define	CHKERR(error,fault)						\
	do {								\
		assert((error == 0) || (error == EFAULT));		\
		if (error)						\
			return (VMEXIT_ABORT);				\
		else if (fault)						\
			return (VMEXIT_CONTINUE);			\
	} while (0)
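/*
 * EFAULT from the vmm driver means a guest memory access failed and the
 * emulation must abort; a 'fault' means an exception was injected into
 * the guest, which simply resumes and handles it.
 */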
int
vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, fault, minlimit, nt_type, ot_type, vcpu;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;
	vcpu = *pvcpu;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Calculate the instruction pointer to store in the old TSS.
	 */
	eip = vmexit->rip + vmexit->inst_length;

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc,
	    &fault);
	CHKERR(error, fault);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;

	assert(minlimit > 0);
	if (nt.limit < (unsigned int)minlimit) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}
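	/*
	 * The busy bit (0x2 in the TSS descriptor type) is what prevents
	 * recursive task switches: a CALL, JMP or interrupt that targets
	 * an already-busy TSS would otherwise allow re-entering a task
	 * whose state is still in use.
	 */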
	/* Fetch the new TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc,
	    &fault);
	CHKERR(error, fault);

	/* Get the old TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);

	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * is due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
		    &ot_desc, &fault);
		CHKERR(error, fault);
	}

	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		EPRINTLN("Task switch to 16-bit TSS not supported");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
		    &nt_desc, &fault);
		CHKERR(error, fault);
	}

	/* Update task register to point at the new TSS */
	SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);

	/* Set CR0.TS */
	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
	assert(error == 0);

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
	    &fault);
	CHKERR(error, fault);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode, &fault);
		CHKERR(error, fault);
	}

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
	 * If the virtual NMIs VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 * switch VM exit.
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation"
	 * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking.
	 * This unblocking of virtual-NMI occurs even if IRET causes a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 * VM exit caused by IRET.
	 */

	/*
	 * If the task switch was triggered by an event delivered through
	 * the IDT then extinguish the pending event from the vcpu's
	 * exitintinfo.
	 */
	if (task_switch->reason == TSR_IDT_GATE) {
		error = vm_set_intinfo(ctx, vcpu, 0);
		assert(error == 0);
	}

	/*
	 * XXX should inject debug exception if 'T' bit is 1
	 */
done:
	return (VMEXIT_CONTINUE);
}