 * Copyright (c) 1989, 1990 William F. Jolitz.
 * Copyright (c) 1990 The Regents of the University of California.
 * Copyright (c) 2007 The FreeBSD Foundation
 * Copyright (c) 2008 The DragonFly Project.
 * Copyright (c) 2008 Jordan Gordeev.
 * Portions of this software were developed by A. Joseph Koshy under
 * sponsorship from the FreeBSD Foundation and Google, Inc.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
#include "opt_atpic.h"

#include <machine/asmacros.h>
#include <machine/psl.h>
#include <machine/trap.h>
#include <machine/segments.h>

	.globl	lwkt_switch_return

/*****************************************************************************/
/*****************************************************************************/
 * Trap and fault vector routines.
 *
 * All traps are 'interrupt gates', SDT_SYSIGT. An interrupt gate pushes
 * state on the stack but also disables interrupts. This is important for
 * us for the use of the swapgs instruction. We cannot be interrupted
 * until the GS.base value is correct. For most traps, we automatically
 * then enable interrupts if the interrupted context had them enabled.
 *
 * The cpu will push a certain amount of state onto the kernel stack for
 * the current process; see x86_64/include/frame.h. The current RFLAGS
 * (the status register, which holds the interrupt disable state prior
 * to the trap), the code segment register, and the return instruction
 * pointer are pushed by the cpu. The cpu will also push an 'error' code
 * for certain traps. We push a dummy error code for those traps where
 * the cpu doesn't, in order to maintain a consistent frame. We also
 * push a contrived 'trap number'.
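 *
 * For reference, the hardware-supplied portion of the frame, from
 * higher to lower addresses (in long mode the cpu always pushes
 * ss:rsp, even for same-privilege traps), is:
 *
 *	ss, rsp, rflags, cs, rip, [err]
 *
 * where [err] is present only for vectors that supply an error code.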
 *
 * The cpu does not push the general registers; we must do that, and we
 * must restore them prior to calling 'iret'. The cpu adjusts the %cs and
 * %ss segment registers, but does not mess with %ds, %es, or %fs. Thus we
 * must load them with appropriate values for supervisor mode operation.
 *
 * Interrupts must be disabled for all traps, otherwise horrible %gs
/* Regular traps; the cpu does not supply tf_err for these. */
	movq	$0,TF_XFLAGS(%rsp) ; \
	movq	$(a),TF_TRAPNO(%rsp) ; \
	movq	$0,TF_ADDR(%rsp) ; \
	movq	$0,TF_ERR(%rsp) ; \
/* This group of traps has tf_err already pushed by the cpu */
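/*
 * For reference (architectural behavior, not DragonFly-specific): the
 * cpu supplies an error code for #DF, #TS, #NP, #SS, #GP, #PF and #AC;
 * the remaining vectors (#DE, #DB, NMI, #BP, #OF, #BR, #UD, #NM, #MF,
 * #MC, #XM) take the dummy-error-code path above.
 */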
	movq	$(a),TF_TRAPNO(%rsp) ; \
	movq	$0,TF_ADDR(%rsp) ; \
	movq	$0,TF_XFLAGS(%rsp) ; \
 * Due to a historical artifact, it is possible for a #DB exception
 * to occur in certain bad places that would normally be protected by
 * the interrupt gate's interrupt disablement.
 *
 * Because this can occur in the system call entry code, we also
 * run #DB on an ist2 stack to force the cpu to load a new %rsp, otherwise
 * it might push the cpu exception frame onto the user stack. To make things
 * easier we just point ist2 at our trampoline area.
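 *
 * Illustrative sketch of the problem sequence (the mov mem,%ss +
 * syscall case handled below):
 *
 *	mov	mem,%ss		<- inhibits #DB delivery for one
 *				   instruction
 *	syscall			<- the pending #DB is delivered after
 *				   the cpu reaches CPL 0 but before the
 *				   kernel has a kernel %rsp or %gs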
#ifdef DIRECT_DISALLOW_SS_CPUBUG
 * Directly disallow #DB faults which can occur at critical points
 * in the code due to a historical artifact of how the cpu operates.
 * %gs state might not match RPL. Test the %rip and iretq immediately
 * (valid %gs and %cr3 state not needed). If we don't need kernel
 * reporting we can enable this and it's a bit safer from unintended
 *
 * If this is not enabled the kernel still catches the problem. It
 * will report the problem and continue properly.
	cmpq	$Xfast_syscall,0(%rsp)
 * Ok, regardless of the RPL mask in the trap frame, we took
 * the trap on a separate stack via ist2. This means we
 * must copy it appropriately.
 *
 * If coming from userland we can skip directly to the normal
 * TRAP code because it will handle the fact that we are on an
 * alternative stack (dbgstack, set by ist2), even though it isn't
 * the trampoline stack. The frame will be moved to the correct
	testb	$SEL_RPL_MASK,TF_CS-TF_RIP(%rsp)
	jnz	210f				/* jnz from userland */
 * From kernel - %gs and %cr3 may be inconsistent. Save original
 * values and load consistent values, restore after return.
 *
 * The trap handler is NOT allowed to block for this case.
	movq	%rax, TR_RAX(%rsp)
	movq	%rcx, TR_RCX(%rsp)
	movq	%rdx, TR_RDX(%rsp)
	movq	%cr3,%rax			/* save CR3 */
	movq	%rax, TR_PCB_CR3_SAVED(%rsp)
	movl	$MSR_GSBASE,%ecx		/* save %gs */
	movq	%rax, TR_PCB_GS_SAVED(%rsp)
	movq	TR_PCB_GS_KERNEL(%rsp),%rdx	/* retrieve kernel %gs */
	movq	PCPU(trampoline)+TR_PCB_CR3,%rax
	movq	TR_RDX(%rsp), %rdx
	movq	TR_RCX(%rsp), %rcx
	movq	TR_RAX(%rsp), %rax
 * We are coming from the kernel.
 *
 * We are on the IST2 stack and, in fact, we have to *STAY* on this
 * stack, so we no longer try to shift our frame to the kernel %rsp
 * in the trap frame, since this %rsp might actually be a user %rsp
 * in the mov mem,%ss + syscall DBG trap case.
 *
 * Run the normal trap. Because TF_CS is at a kernel RPL, the
 * normal code will skip the usual swapgs and KMMU (trampoline)
 * code. We've handled the rest.
 *
 * NOTE: at this point the trampframe is above the normal stack
 *	 frame. The trap code will be ignorant of the special
 *	 TR_* registers above the cpu hardware frame portion,
 *	 and the TR_* registers below it will be overwritten.
	movq	$0,TF_XFLAGS(%rsp)
	movq	$T_TRCTRAP,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
 * Pop the frame (since we're coming from kernel mode, this will
 * not mess with %cr3 or %gs), then restore %cr3 and %gs for our
 * iretq. Not optimal but more readable and this is not a
	movq	%rax, TR_RAX(%rsp)
	movq	%rcx, TR_RCX(%rsp)
	movq	%rdx, TR_RDX(%rsp)
	movl	$MSR_GSBASE,%ecx		/* restore %gs */
	movq	TR_PCB_GS_SAVED(%rsp),%rdx
	movq	TR_PCB_CR3_SAVED(%rsp),%rax	/* restore %cr3 */
	movq	TR_RAX(%rsp),%rax
	movq	TR_RCX(%rsp),%rcx
	movq	TR_RDX(%rsp),%rdx
 * Direct iretq. No point jumping to doreti because the
 * exception code that deals with iretq faults can't handle
 * non-deterministic %gs/%cr3 state.
#ifdef DIRECT_DISALLOW_SS_CPUBUG
 * From userland (normal trap path)

 * alltraps entry point. Use swapgs if this is the first time in the
 * kernel from userland. Reenable interrupts if they were enabled
 *
 * WARNING! %gs not available until after our swapgs code
	.type	alltraps,@function
	movq	%rdi,TF_RDI(%rsp)
alltraps_pushregs_no_rdi:
	movq	%rsi,TF_RSI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	FAKE_MCOUNT(TF_RIP(%rsp))
	.type	calltrap,@function
	jmp	doreti			/* Handle any pending ASTs */
	movq	$T_DOUBLEFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_XFLAGS(%rsp)
	call	dblfault_handler

 * We need to save the contents of %cr2 before PUSH_FRAME* messes
	PUSH_FRAME_TFERR_SAVECR2
	movq	$T_PAGEFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_XFLAGS(%rsp)
 * We have to special-case this one. If we get a trap in doreti() at
 * the iretq stage, we'll reenter as a kernel exception with the
 * wrong gs and isolation state. We have to act as though we came
	leaq	doreti_iret(%rip),%r10
	cmpq	%r10,TF_RIP-TF_ERR+8(%rsp)	/* +8 due to pushq */
	testb	$SEL_RPL_MASK,TF_CS-TF_ERR+8(%rsp) /* +8 due to pushq */
 * Special fault during iretq
	movq	$T_PROTFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_XFLAGS(%rsp)
	movq	$T_PROTFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_XFLAGS(%rsp)
 * Fast syscall entry point. We enter here with just our new %cs/%ss set,
 * and the new privilege level. We are still running on the old user stack
 * pointer. We have to juggle a few things around to find our stack etc.
 * swapgs gives us access to our PCPU space only.
 *
 * We use GD_TRAMPOLINE+TR_CR2 to save the user stack pointer temporarily.
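 *
 * For reference, the register state the syscall instruction itself
 * hands us here: %rcx holds the user return %rip, %r11 holds the user
 * %rflags, and %cs/%ss were loaded from the STAR MSR. The cpu does not
 * switch stacks, which is why the user %rsp must be parked first.
 * Userland passes the 4th syscall argument in %r10 because syscall
 * clobbers %rcx.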
	swapgs					/* get kernel %gs */
	movq	%rsp,PCPU(trampoline)+TR_CR2	/* save user %rsp */
	movq	PCPU(common_tss)+TSS_RSP0,%rsp

 * NOTE: KMMUENTER_SYSCALL does not actually use the stack but
 *	 adjusts the stack pointer for correctness in case we
	subq	$TR_PCB_RSP,%rsp
	movq	PCPU(trampoline)+TR_PCB_RSP,%rsp

	/* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
	/* defer TF_RSP till we have a spare register */
	movq	%r11,TF_RFLAGS(%rsp)
	movq	%rcx,TF_RIP(%rsp)	/* %rcx original value is in %r10 */
	movq	PCPU(trampoline)+TR_CR2,%r11	/* %r11 already saved */
	movq	%r11,TF_RSP(%rsp)	/* user stack pointer */
	orl	$RQF_QUICKRET,PCPU(reqflags)
	movq	$KUDSEL,TF_SS(%rsp)
	movq	$KUCSEL,TF_CS(%rsp)
	movq	$T_FAST_SYSCALL,TF_TRAPNO(%rsp)	/* for the vkernel */
	movq	$0,TF_XFLAGS(%rsp)	/* note: used in signal frame */
	movq	%rdi,TF_RDI(%rsp)	/* arg 1 */
	movq	%rsi,TF_RSI(%rsp)	/* arg 2 */
	movq	%rdx,TF_RDX(%rsp)	/* arg 3 */
	movq	%r10,TF_RCX(%rsp)	/* arg 4 */
	movq	%r8,TF_R8(%rsp)		/* arg 5 */
	movq	%r9,TF_R9(%rsp)		/* arg 6 */
	movq	%rax,TF_RAX(%rsp)	/* syscall number */
	movq	%rbx,TF_RBX(%rsp)	/* C preserved */
	movq	%rbp,TF_RBP(%rsp)	/* C preserved */
	movq	%r12,TF_R12(%rsp)	/* C preserved */
	movq	%r13,TF_R13(%rsp)	/* C preserved */
	movq	%r14,TF_R14(%rsp)	/* C preserved */
	movq	%r15,TF_R15(%rsp)	/* C preserved */
	xorq	%rax,%rax		/* SECURITY CLEAR REGS */
	FAKE_MCOUNT(TF_RIP(%rsp))
 * Fast return from system call
	testl	$RQF_IPIQ|RQF_TIMER|RQF_INTPEND|RQF_AST_MASK,PCPU(reqflags)
	testl	$RQF_QUICKRET,PCPU(reqflags)
	movq	TF_RBX(%rsp),%rbx	/* SECURITY RESTORE */
	movq	TF_RCX(%rsp),%rcx
	movq	TF_RBP(%rsp),%rbp
	xorq	%r10,%r10		/* (security - clear scratch) */
	movq	TF_R12(%rsp),%r12
	movq	TF_R13(%rsp),%r13
	movq	TF_R14(%rsp),%r14
	movq	TF_R15(%rsp),%r15
	movq	TF_RDI(%rsp),%rdi	/* NORMAL RESTORE */
	movq	TF_RSI(%rsp),%rsi
	movq	TF_RDX(%rsp),%rdx
	movq	TF_RAX(%rsp),%rax
	movq	TF_RFLAGS(%rsp),%r11
	movq	TF_RIP(%rsp),%rcx
	movq	TF_RSP(%rsp),%rsp

 * Normal slow / full iret
 * Here for CYA insurance, in case a "syscall" instruction gets
 * issued from 32-bit compatibility mode. MSR_CSTAR has to point
 * to *something* if EFER_SCE is enabled.
IDTVEC(fast_syscall32)
 * NMI handling is special.
 *
 * First, an NMI is taken on its own pcpu stack. RFLAGS.IF, %gs, and %cr3
 * will be inconsistent when interrupting supervisor mode.
 *
 * Second, the processor treats NMIs specially, blocking further NMIs
 * until an 'iretq' instruction is executed. We therefore need to
 * execute the NMI handler with interrupts disabled to prevent a
 * nested interrupt from executing an 'iretq' instruction and
 * inadvertently taking the processor out of NMI mode.
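 *
 * Hazard sketch (illustrative): an NMI arrives and the cpu blocks
 * further NMIs; a nested interrupt's iretq would unblock them, and a
 * second NMI could then re-enter this handler and clobber the state
 * saved below. Hence interrupts stay disabled throughout.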
 * We don't need to special-case entry from userland, %gs will
 * be consistent with expectations.
	testb	$SEL_RPL_MASK,TF_CS-TF_RIP(%rsp) ; /* from userland? */ \

 * From kernel - %gs and %cr3 may be inconsistent. Save original
 * values and load consistent values, restore on return.
 *
 * The trap handler is NOT allowed to block for this case.
	movq	%rax, TR_RAX(%rsp)
	movq	%rcx, TR_RCX(%rsp)
	movq	%rdx, TR_RDX(%rsp)
	movq	%cr3,%rax			/* save CR3 */
	movq	%rax, TR_PCB_CR3_SAVED(%rsp)
	movl	$MSR_GSBASE,%ecx		/* save %gs */
	movq	%rax, TR_PCB_GS_SAVED(%rsp)
	movq	TR_PCB_GS_KERNEL(%rsp),%rdx	/* retrieve kernel %gs */
	movq	TR_PCB_CR3(%rsp),%rax		/* retrieve kernel %cr3 */
	movq	PCPU(trampoline)+TR_PCB_CR3,%rax
	movq	TR_RDX(%rsp), %rdx
	movq	TR_RCX(%rsp), %rcx
	movq	TR_RAX(%rsp), %rax
 * Ok, run the normal trap. Because TF_CS is at a kernel RPL,
 * the normal code will skip the usual swapgs and KMMU (trampoline)
 * code. We've handled the rest.
 *
 * NOTE: at this point the trampframe is above the normal stack
 *	 frame. The trap code will be ignorant of the special
 *	 TR_* registers above the cpu hardware frame portion,
 *	 and the TR_* registers below it will be overwritten.
	movq	$0,TF_XFLAGS(%rsp)
	movq	$T_NMI,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	FAKE_MCOUNT(TF_RIP(%rsp))

 * Pop the frame (since we're coming from kernel mode, this will
 * not mess with %cr3 or %gs), then restore %cr3 and %gs for our
 * iretq. Not optimal but more readable and this is not a
	movq	%rax, TR_RAX(%rsp)
	movq	%rcx, TR_RCX(%rsp)
	movq	%rdx, TR_RDX(%rsp)
	movl	$MSR_GSBASE,%ecx		/* restore %gs */
	movq	TR_PCB_GS_SAVED(%rsp),%rdx
	movq	TR_PCB_CR3_SAVED(%rsp),%rax	/* restore %cr3 */
	movq	TR_RAX(%rsp),%rax
	movq	TR_RCX(%rsp),%rcx
	movq	TR_RDX(%rsp),%rdx

 * Direct iretq. No point jumping to doreti because the
 * exception code that deals with iretq faults can't handle
 * non-deterministic %gs/%cr3 state.

 * From userland (normal trap path)
	movq	$0,TF_XFLAGS(%rsp)
	movq	$T_NMI,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	FAKE_MCOUNT(TF_RIP(%rsp))
	POP_FRAME(jmp doreti_iret)
 * Reserved (unconfigured) traps rsvd00 - rsvdff
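 *
 * Usage sketch: each invocation of the macro below supplies two hex
 * digits, e.g. 'reservetrap 0 3' emits the entry for trap number
 * T_RESERVED + 0x03.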
.macro reservetrap a b
	TRAP(T_RESERVED + 0x\a\b)
 * This function is what cpu_heavy_restore jumps to after a new process
 * is created. The LWKT subsystem switches while holding a critical
 * section, and we maintain that abstraction here (e.g. because
 * cpu_heavy_restore needs it due to PCB_*() manipulation), then get out of
 * it before calling the initial function (typically fork_return()) and/or
 * returning to user mode.
 *
 * The MP lock is not held at any point, but the critcount is bumped
 * on entry to prevent interruption of the trampoline at a bad point.
 *
 * This is effectively what td->td_switch() returns to. It 'returns' the
 * old thread in %rax and, since this is not returning to a td->td_switch()
 * call from lwkt_switch(), we must handle the cleanup for the old thread
 * by calling lwkt_switch_return().
 *
 * fork_trampoline(%rax:otd, %rbx:func, %r12:arg)
ENTRY(fork_trampoline)
	call	lwkt_switch_return
	movq	PCPU(curthread),%rax
	decl	TD_CRITCOUNT(%rax)
 * cpu_set_fork_handler intercepts this function call to
 * have it call a non-returning function instead, keeping us in
 * kernel mode.
 *
 * initproc has its own fork handler, start_init(), which DOES
 *
 * %rbx - chaining function (typically fork_return)
 * %r12 -> %rdi (argument)
 * frame-> %rsi (trap frame)
 *
 * void (func:rbx)(arg:rdi, trapframe:rsi)
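 *
 * In C terms the chained function is therefore roughly (sketch; the
 * argument is whatever pointer was placed in %r12):
 *
 *	void func(void *arg, struct trapframe *frame);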
	movq	%rsp, %rsi		/* pass trapframe by reference */
	movq	%r12, %rdi		/* arg1 */
	call	*%rbx			/* function */

	/* cut from syscall */

 * Return via doreti to handle ASTs.
 *
 * trapframe is at the top of the stack.
 * To efficiently implement classification of trap and interrupt handlers
 * for profiling, there must be only trap handlers between the labels btrap
 * and bintr, and only interrupt handlers between the labels bintr and
 * eintr. This is implemented (partly) by including files that contain
 * some of the handlers. Before including the files, set up a normal asm
 * environment so that the included files don't need to know that they are
#include <x86_64/x86_64/apic_vector.S>
#include <x86_64/isa/atpic_vector.S>