2 * Copyright (C) 1994, David Greenman
3 * Copyright (c) 1990, 1993
4 * The Regents of the University of California. All rights reserved.
6 * This code is derived from software contributed to Berkeley by
7 * the University of Utah, and William Jolitz.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the University of
20 * California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * from: @(#)trap.c 7.4 (Berkeley) 5/13/91
38 * $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $
42 * x86_64 Trap and System call handling
48 #include "opt_ktrace.h"
50 #include <sys/param.h>
51 #include <sys/systm.h>
53 #include <sys/pioctl.h>
54 #include <sys/kernel.h>
55 #include <sys/resourcevar.h>
56 #include <sys/signalvar.h>
57 #include <sys/signal2.h>
58 #include <sys/syscall.h>
59 #include <sys/sysctl.h>
60 #include <sys/sysent.h>
61 #include <sys/vmmeter.h>
62 #include <sys/malloc.h>
64 #include <sys/ktrace.h>
67 #include <sys/vkernel.h>
68 #include <sys/sysmsg.h>
69 #include <sys/vmspace.h>
72 #include <vm/vm_param.h>
75 #include <vm/vm_kern.h>
76 #include <vm/vm_map.h>
77 #include <vm/vm_page.h>
78 #include <vm/vm_extern.h>
80 #include <machine/cpu.h>
81 #include <machine/md_var.h>
82 #include <machine/pcb.h>
83 #include <machine/smp.h>
84 #include <machine/tss.h>
85 #include <machine/globaldata.h>
89 #include <sys/msgport2.h>
90 #include <sys/thread2.h>
91 #include <sys/mplock2.h>
/*
 * File-local declarations.
 *
 * NOTE(review): extraction of this file dropped interlinear source lines
 * and split others mid-token; all surviving fragments below are kept
 * byte-identical. The leading numbers are original-file line numbers
 * embedded by the extraction.
 */
/* Optional FP math-emulator hook; NULL-checked before use (T_DNA path). */
93 int (*pmath_emulate
) (struct trapframe
*);
/* Resolve a page fault: (frame, usermode flag, fault address). */
95 static int trap_pfault (struct trapframe
*, int, vm_offset_t
);
/* Report an unrecoverable trap; panics or enters the debugger. */
96 static void trap_fatal (struct trapframe
*, int, vm_offset_t
);
97 void dblfault_handler (void);
/* Rate limiter for seg-fault diagnostics (initialized to 1 — presumably 1 msg/sec; confirm krate semantics). */
99 static struct krate segfltrate
= { 1 };
/* Assembly-level syscall IDT entry point (defined elsewhere). */
102 extern inthand_t
IDTVEC(syscall
);
/*
 * Human-readable trap names indexed by trap number; used by trap_fatal()
 * and the fatal-process-exception uprintf()s (indexed only when
 * type <= MAX_TRAP_MSG).
 *
 * NOTE(review): extraction dropped several initializer entries (indices
 * 0, 2, 4-5, 8, 11, 13, 15-17 are missing below); the trailing comment
 * on each surviving entry records its intended index.
 */
105 #define MAX_TRAP_MSG 30
106 static char *trap_msg
[] = {
108 "privileged instruction fault", /* 1 T_PRIVINFLT */
110 "breakpoint instruction fault", /* 3 T_BPTFLT */
113 "arithmetic trap", /* 6 T_ARITHTRAP */
114 "system forced exception", /* 7 T_ASTFLT */
116 "general protection fault", /* 9 T_PROTFLT */
117 "trace trap", /* 10 T_TRCTRAP */
119 "page fault", /* 12 T_PAGEFLT */
121 "alignment fault", /* 14 T_ALIGNFLT */
125 "integer divide fault", /* 18 T_DIVIDE */
126 "non-maskable interrupt trap", /* 19 T_NMI */
127 "overflow trap", /* 20 T_OFLOW */
128 "FPU bounds check fault", /* 21 T_BOUND */
129 "FPU device not available", /* 22 T_DNA */
130 "double fault", /* 23 T_DOUBLEFLT */
131 "FPU operand fetch fault", /* 24 T_FPOPFLT */
132 "invalid TSS fault", /* 25 T_TSSFLT */
133 "segment not present fault", /* 26 T_SEGNPFLT */
134 "stack fault", /* 27 T_STKFLT */
135 "machine check trap", /* 28 T_MCHK */
136 "SIMD floating-point exception", /* 29 T_XMMFLT */
137 "reserved (unknown) fault", /* 30 T_RESERVED */
/*
 * NMI-handling tunables, exported as machdep.ddb_on_nmi and
 * machdep.panic_on_nmi (both read-write, default 1). The NMI branch of
 * user_trap() consults ddb_on_nmi to enter the debugger and
 * panic_on_nmi to panic on hardware-failure NMIs.
 */
141 static int ddb_on_nmi
= 1;
142 SYSCTL_INT(_machdep
, OID_AUTO
, ddb_on_nmi
, CTLFLAG_RW
,
143 &ddb_on_nmi
, 0, "Go to DDB on NMI");
145 static int panic_on_nmi
= 1;
146 SYSCTL_INT(_machdep
, OID_AUTO
, panic_on_nmi
, CTLFLAG_RW
,
147 &panic_on_nmi
, 0, "Panic on NMI");
/*
150 * Passively intercepts the thread switch function to increase
151 * the thread priority from a user priority to a kernel priority, reducing
152 * syscall and trap overhead for the case where no switch occurs.
154 * Synchronizes td_ucred with p_ucred. This is used by system calls,
155 * signal handling, faults, AST traps, and anything else that enters the
156 * kernel from userland and provides the kernel with a stable read-only
157 * copy of the process ucred.
 */
/*
 * NOTE(review): extraction dropped this function's return type, braces,
 * local declarations (ocred/ncred) and the cred-release tail; surviving
 * fragments are byte-identical below.
 */
160 userenter(struct thread
*curtd
, struct proc
)
/* Install the passive-release hook so lwkt can drop our priority lazily. */
165 curtd
->td_release
= lwkt_passive_release
;
/* Refresh the thread's cached ucred when the process cred has changed. */
167 if (curtd
->td_ucred
!= curp
->p_ucred
) {
168 ncred
= crhold(curp
->p_ucred
);
169 ocred
= curtd
->td_ucred
;
170 curtd
->td_ucred
= ncred
;
/*
177 * Handle signals, profiling, and other AST's and/or tasks that
178 * must be completed before we can return to or try to return to userland.
180 * Note that td_sticks is a 64 bit quantity, but there's no point doing 64
181 * arithmatic on the delta calculation so the absolute tick values are
182 * truncated to an integer.
 */
/*
 * NOTE(review): extraction dropped braces, labels, and several
 * statements from this function; fragments are kept byte-identical.
 */
185 userret(struct lwp
*lp
, struct trapframe
*frame
, int sticks
)
187 struct proc
*p
= lp
->lwp_proc
;
/*
192 * Charge system time if profiling. Note: times are in microseconds.
193 * This may do a copyout and block, so do it first even though it
194 * means some system time will be charged as user time.
 */
196 if (p
->p_flags
& P_PROFIL
) {
197 addupc_task(p
, frame
->tf_rip
,
198 (u_int
)((int)lp
->lwp_thread
->td_sticks
- sticks
));
/*
203 * Specific on-return-to-usermode checks (LWP_MP_WEXIT,
204 * LWP_MP_VNLRU, etc).
 */
206 if (lp
->lwp_mpflags
& LWP_MP_URETMASK
)
/*
210 * Block here if we are in a stopped state.
 */
/* Stop handling is done under the per-process token. */
212 if (STOPLWP(p
, lp
)) {
213 lwkt_gettoken(&p
->p_token
);
215 lwkt_reltoken(&p
->p_token
);
/*
220 * Post any pending upcalls. If running a virtual kernel be sure
221 * to restore the virtual kernel's vmspace before posting the upcall.
 */
/* Convert deferred P_SIGVTALRM/P_SIGPROF flags into real signals. */
223 if (p
->p_flags
& (P_SIGVTALRM
| P_SIGPROF
)) {
224 lwkt_gettoken(&p
->p_token
);
225 if (p
->p_flags
& P_SIGVTALRM
) {
226 p
->p_flags
&= ~P_SIGVTALRM
;
227 ksignal(p
, SIGVTALRM
);
229 if (p
->p_flags
& P_SIGPROF
) {
230 p
->p_flags
&= ~P_SIGPROF
;
233 lwkt_reltoken(&p
->p_token
);
/*
238 * Post any pending signals
240 * WARNING! postsig() can exit and not return.
 */
242 if ((sig
= CURSIG_LCK_TRACE(lp
, &ptok
)) != 0) {
/*
248 * In a multi-threaded program it is possible for a thread to change
249 * signal state during a system call which temporarily changes the
250 * signal mask. In this case postsig() might not be run and we
251 * have to restore the mask ourselves.
 */
253 if (lp
->lwp_flags
& LWP_OLDMASK
) {
254 lp
->lwp_flags
&= ~LWP_OLDMASK
;
255 lp
->lwp_sigmask
= lp
->lwp_oldsigmask
;
/*
261 * Cleanup from userenter and any passive release that might have occured.
262 * We must reclaim the current-process designation before we can return
263 * to usermode. We also handle both LWKT and USER reschedule requests.
 */
/* NOTE(review): braces/return type lost in extraction; fragments verbatim. */
266 userexit(struct lwp
*lp
)
268 struct thread
*td
= lp
->lwp_thread
;
269 /* globaldata_t gd = td->td_gd; */
/*
272 * Handle stop requests at kernel priority. Any requests queued
273 * after this loop will generate another AST.
 */
275 while (STOPLWP(lp
->lwp_proc
, lp
)) {
276 lwkt_gettoken(&lp
->lwp_proc
->p_token
);
278 lwkt_reltoken(&lp
->lwp_proc
->p_token
);
/*
282 * Reduce our priority in preparation for a return to userland. If
283 * our passive release function was still in place, our priority was
284 * never raised and does not need to be reduced.
 */
286 lwkt_passive_recover(td
);
/*
289 * Become the current user scheduled process if we aren't already,
290 * and deal with reschedule requests and other factors.
 */
292 lp
->lwp_proc
->p_usched
->acquire_curproc(lp
);
293 /* WARNING: we may have migrated cpu's */
294 /* gd = td->td_gd; */
/*
 * KTR (kernel trace) probe declarations for the kernentry master:
 * trap entry/return, syscall entry/return, and fork return. The
 * KTR_LOG(kernentry_*) calls in the handlers below fire these probes.
 */
297 #if !defined(KTR_KERNENTRY)
298 #define KTR_KERNENTRY KTR_ALL
300 KTR_INFO_MASTER(kernentry
);
301 KTR_INFO(KTR_KERNENTRY
, kernentry
, trap
, 0,
302 "TRAP(pid %hd, tid %hd, trapno %ld, eva %lu)",
303 pid_t pid
, lwpid_t tid
, register_t trapno
, vm_offset_t eva
);
304 KTR_INFO(KTR_KERNENTRY
, kernentry
, trap_ret
, 0, "TRAP_RET(pid %hd, tid %hd)",
305 pid_t pid
, lwpid_t tid
);
306 KTR_INFO(KTR_KERNENTRY
, kernentry
, syscall
, 0, "SYSC(pid %hd, tid %hd, nr %ld)",
307 pid_t pid
, lwpid_t tid
, register_t trapno
);
308 KTR_INFO(KTR_KERNENTRY
, kernentry
, syscall_ret
, 0, "SYSRET(pid %hd, tid %hd, err %d)",
309 pid_t pid
, lwpid_t tid
, int err
);
310 KTR_INFO(KTR_KERNENTRY
, kernentry
, fork_ret
, 0, "FORKRET(pid %hd, tid %hd)",
311 pid_t pid
, lwpid_t tid
);
/*
314 * Exception, fault, and trap interface to the kernel.
315 * This common code is called from assembly language IDT gate entry
316 * routines that prepare a suitable stack frame, and restore this
317 * frame after the exception has been processed.
319 * This function is also called from doreti in an interlock to handle ASTs.
320 * For example: hardwareint->INTROUTINE->(set ast)->doreti->trap
322 * NOTE! We have to retrieve the fault address prior to obtaining the
323 * MP lock because get_mplock() may switch out. YYY cr2 really ought
324 * to be retrieved by the assembly code, not here.
326 * XXX gd_trap_nesting_level currently prevents lwkt_switch() from panicing
327 * if an attempt is made to switch from a fast interrupt or IPI. This is
328 * necessary to properly take fatal kernel traps on SMP machines if
329 * get_mplock() has to block.
 */
/*
 * user_trap - handle all traps taken while running an emulated user
 * context (everything from user mode, including system calls, arrives
 * here). Dispatches on frame->tf_trapno, delivers signals via
 * trapsignal(), and finishes through userret().
 *
 * NOTE(review): the switch statement skeleton, many case bodies,
 * break/goto statements and braces were lost in extraction; the
 * surviving fragments are byte-identical below.
 */
333 user_trap(struct trapframe
*frame
)
335 struct globaldata
*gd
= mycpu
;
336 struct thread
*td
= gd
->gd_curthread
;
337 struct lwp
*lp
= td
->td_lwp
;
340 int i
= 0, ucode
= 0, type
, code
;
342 int crit_count
= td
->td_critcount
;
343 lwkt_tokref_t curstop
= td
->td_toks_stop
;
/* tf_addr carries the fault address for page faults on x86_64. */
349 if (frame
->tf_trapno
== T_PAGEFLT
)
350 eva
= frame
->tf_addr
;
354 kprintf("USER_TRAP AT %08lx xflags %ld trapno %ld eva %08lx\n",
355 frame
->tf_rip
, frame
->tf_xflags
, frame
->tf_trapno
, eva
);
/*
359 * Everything coming from user mode runs through user_trap,
360 * including system calls.
 */
362 if (frame
->tf_trapno
== T_FAST_SYSCALL
) {
367 KTR_LOG(kernentry_trap
, lp
->lwp_proc
->p_pid
, lp
->lwp_tid
,
368 frame
->tf_trapno
, eva
);
372 eva
= (frame
->tf_trapno
== T_PAGEFLT
? rcr2() : 0);
/* Nesting level guards lwkt_switch() panics while reporting. */
373 ++gd
->gd_trap_nesting_level
;
374 trap_fatal(frame
, TRUE
, eva
);
375 --gd
->gd_trap_nesting_level
;
380 type
= frame
->tf_trapno
;
381 code
= frame
->tf_err
;
385 sticks
= (int)td
->td_sticks
;
386 lp
->lwp_md
.md_regs
= frame
;
/* --- trap-type dispatch (enclosing switch lost in extraction) --- */
389 case T_PRIVINFLT
: /* privileged instruction fault */
394 case T_BPTFLT
: /* bpt instruction fault */
395 case T_TRCTRAP
: /* trace trap */
396 frame
->tf_rflags
&= ~PSL_T
;
398 ucode
= (type
== T_TRCTRAP
? TRAP_TRACE
: TRAP_BRKPT
);
401 case T_ARITHTRAP
: /* arithmetic trap */
406 case T_ASTFLT
: /* Allow process switch */
407 mycpu
->gd_cnt
.v_soft
++;
408 if (mycpu
->gd_reqflags
& RQF_AST_OWEUPC
) {
409 atomic_clear_int(&mycpu
->gd_reqflags
, RQF_AST_OWEUPC
);
410 addupc_task(p
, p
->p_prof
.pr_addr
, p
->p_prof
.pr_ticks
);
/*
415 * The following two traps can happen in
416 * vm86 mode, and, if so, we want to handle
 */
419 case T_PROTFLT
: /* general protection fault */
420 case T_STKFLT
: /* stack fault */
422 if (frame
->tf_eflags
& PSL_VM
) {
423 i
= vm86_emulate((struct vm86frame
*)frame
);
431 case T_SEGNPFLT
: /* segment not present fault */
432 case T_TSSFLT
: /* invalid TSS fault */
433 case T_DOUBLEFLT
: /* double fault */
436 ucode
= code
+ BUS_SEGM_FAULT
;
439 case T_PAGEFLT
: /* page fault */
440 i
= trap_pfault(frame
, TRUE
, eva
);
441 if (i
== -1 || i
== 0)
453 case T_DIVIDE
: /* integer divide fault */
460 /* machine/parity/power fail/"kitchen sink" faults */
461 if (isa_nmi(code
) == 0) {
/*
464 * NMI can be hooked up to a pushbutton
 */
468 kprintf ("NMI ... going to debugger\n");
469 kdb_trap(type
, 0, frame
);
473 } else if (panic_on_nmi
)
474 panic("NMI indicates hardware failure");
476 #endif /* NISA > 0 */
478 case T_OFLOW
: /* integer overflow fault */
483 case T_BOUND
: /* bounds check fault */
/*
490 * Virtual kernel intercept - pass the DNA exception
491 * to the (emulated) virtual kernel if it asked to handle
492 * it. This occurs when the virtual kernel is holding
493 * onto the FP context for a different emulated
494 * process then the one currently running.
496 * We must still call npxdna() since we may have
497 * saved FP state that the (emulated) virtual kernel
498 * needs to hand over to a different emulated process.
 */
500 if (lp
->lwp_vkernel
&& lp
->lwp_vkernel
->ve
&&
501 (td
->td_pcb
->pcb_flags
& FP_VIRTFP
)
/*
508 * The kernel may have switched out the FP unit's
509 * state, causing the user process to take a fault
510 * when it tries to use the FP unit. Restore the
 */
/* No emulator installed: signal with FPE_FPU_NP_TRAP. */
517 if (!pmath_emulate
) {
519 ucode
= FPE_FPU_NP_TRAP
;
522 i
= (*pmath_emulate
)(frame
);
524 if (!(frame
->tf_rflags
& PSL_T
))
526 frame
->tf_rflags
&= ~PSL_T
;
529 /* else ucode = emulator_only_knows() XXX */
532 case T_FPOPFLT
: /* FPU operand fetch fault */
537 case T_XMMFLT
: /* SIMD floating-point exception */
/*
544 * Virtual kernel intercept - if the fault is directly related to a
545 * VM context managed by a virtual kernel then let the virtual kernel
 */
548 if (lp
->lwp_vkernel
&& lp
->lwp_vkernel
->ve
) {
549 vkernel_trap(lp
, frame
);
/*
554 * Translate fault for emulators (e.g. Linux)
 */
556 if (*p
->p_sysent
->sv_transtrap
)
557 i
= (*p
->p_sysent
->sv_transtrap
)(i
, type
);
559 trapsignal(lp
, i
, ucode
);
562 if (type
<= MAX_TRAP_MSG
) {
563 uprintf("fatal process exception: %s",
565 if ((type
== T_PAGEFLT
) || (type
== T_PROTFLT
))
566 uprintf(", fault VA = 0x%lx", (u_long
)eva
);
/* Common exit: signals/profiling/reschedule before returning to user. */
572 userret(lp
, frame
, sticks
);
575 KTR_LOG(kernentry_trap_ret
, lp
->lwp_proc
->p_pid
, lp
->lwp_tid
);
/* Sanity: no critical sections or tokens may leak past a trap. */
577 KASSERT(crit_count
== td
->td_critcount
,
578 ("trap: critical section count mismatch! %d/%d",
579 crit_count
, td
->td_pri
));
580 KASSERT(curstop
== td
->td_toks_stop
,
581 ("trap: extra tokens held after trap! %ld/%ld",
582 curstop
- &td
->td_toks_base
,
583 td
->td_toks_stop
- &td
->td_toks_base
));
/*
 * kern_trap - handle traps taken while already in kernel mode.
 * Recoverable faults (copyin/copyout page faults, bad selectors with a
 * pcb_onfault recovery address, stray PSL_NT, debug-register traps on
 * user addresses) are fixed up; everything else goes to trap_fatal().
 *
 * NOTE(review): the switch skeleton, case bodies and braces were lost
 * in extraction; fragments are byte-identical below.
 */
588 kern_trap(struct trapframe
*frame
)
590 struct globaldata
*gd
= mycpu
;
591 struct thread
*td
= gd
->gd_curthread
;
594 int i
= 0, ucode
= 0, type
, code
;
596 int crit_count
= td
->td_critcount
;
597 lwkt_tokref_t curstop
= td
->td_toks_stop
;
604 if (frame
->tf_trapno
== T_PAGEFLT
)
605 eva
= frame
->tf_addr
;
611 ++gd
->gd_trap_nesting_level
;
612 trap_fatal(frame
, FALSE
, eva
);
613 --gd
->gd_trap_nesting_level
;
618 type
= frame
->tf_trapno
;
619 code
= frame
->tf_err
;
627 case T_PAGEFLT
: /* page fault */
628 trap_pfault(frame
, FALSE
, eva
);
/*
633 * The kernel may be using npx for copying or other
 */
636 panic("kernel NPX should not happen");
641 case T_PROTFLT
: /* general protection fault */
642 case T_SEGNPFLT
: /* segment not present fault */
/*
644 * Invalid segment selectors and out of bounds
645 * %eip's and %esp's can be set up in user mode.
646 * This causes a fault in kernel mode when the
647 * kernel tries to return to user mode. We want
648 * to get this fault so that we can fix the
649 * problem here and not have to check all the
650 * selectors and pointers when the user changes
 */
653 if (mycpu
->gd_intr_nesting_level
== 0) {
654 if (td
->td_pcb
->pcb_onfault
) {
656 (register_t
)td
->td_pcb
->pcb_onfault
;
/*
664 * PSL_NT can be set in user mode and isn't cleared
665 * automatically when the kernel is entered. This
666 * causes a TSS fault when the kernel attempts to
667 * `iret' because the TSS link is uninitialized. We
668 * want to get this fault so that we can fix the
669 * problem here and not every time the kernel is
 */
672 if (frame
->tf_rflags
& PSL_NT
) {
673 frame
->tf_rflags
&= ~PSL_NT
;
678 case T_TRCTRAP
: /* trace trap */
680 if (frame
->tf_eip
== (int)IDTVEC(syscall
)) {
/*
682 * We've just entered system mode via the
683 * syscall lcall. Continue single stepping
684 * silently until the syscall handler has
 */
689 if (frame
->tf_eip
== (int)IDTVEC(syscall
) + 1) {
/*
691 * The syscall handler has now saved the
692 * flags. Stop single stepping it.
 */
694 frame
->tf_eflags
&= ~PSL_T
;
/*
700 * Ignore debug register trace traps due to
701 * accesses in the user's address space, which
702 * can happen under several conditions such as
703 * if a user sets a watchpoint on a buffer and
704 * then passes that buffer to a system call.
705 * We still want to get TRCTRAPS for addresses
706 * in kernel space because that is useful when
707 * debugging the kernel.
 */
709 if (user_dbreg_trap()) {
/*
711 * Reset breakpoint bits because the
 */
714 load_dr6(rdr6() & 0xfffffff0);
/*
719 * Fall through (TRCTRAP kernel mode, kernel address)
723 * If DDB is enabled, let it handle the debugger trap.
724 * Otherwise, debugger traps "can't happen".
 */
727 if (kdb_trap (type
, 0, frame
))
732 trap_fatal(frame
, FALSE
, eva
);
735 trap_fatal(frame
, FALSE
, eva
);
/*
740 * Ignore this trap generated from a spurious SIGTRAP.
742 * single stepping in / syscalls leads to spurious / SIGTRAP
745 * Haiku (c) 2007 Simon 'corecode' Schubert
751 * Translate fault for emulators (e.g. Linux)
 */
753 if (*p
->p_sysent
->sv_transtrap
)
754 i
= (*p
->p_sysent
->sv_transtrap
)(i
, type
);
757 trapsignal(lp
, i
, ucode
);
760 if (type
<= MAX_TRAP_MSG
) {
761 uprintf("fatal process exception: %s",
763 if ((type
== T_PAGEFLT
) || (type
== T_PROTFLT
))
764 uprintf(", fault VA = 0x%lx", (u_long
)eva
);
/* Sanity: no critical sections or tokens may leak past a trap. */
772 KASSERT(crit_count
== td
->td_critcount
,
773 ("trap: critical section count mismatch! %d/%d",
774 crit_count
, td
->td_pri
));
775 KASSERT(curstop
== td
->td_toks_stop
,
776 ("trap: extra tokens held after trap! %ld/%ld",
777 curstop
- &td
->td_toks_base
,
778 td
->td_toks_stop
- &td
->td_toks_base
));
/*
 * trap_pfault - resolve a page fault through vm_fault().
 *
 * Derives the fault type from tf_err (PGEX_W => read|write,
 * PGEX_I => execute, else read), calls vm_fault(), and on failure
 * either recovers via pcb_onfault (kernel mode), reports via
 * trap_fatal(), or returns SIGBUS/SIGSEGV for delivery to the process.
 *
 * NOTE(review): braces, several locals (map/ftype/fault_flags/rv) and
 * return statements were lost in extraction; fragments verbatim.
 */
783 trap_pfault(struct trapframe
*frame
, int usermode
, vm_offset_t eva
)
786 struct vmspace
*vm
= NULL
;
790 thread_t td
= curthread
;
791 struct lwp
*lp
= td
->td_lwp
;
/* Fault address rounded down to its page. */
794 va
= trunc_page(eva
);
795 if (usermode
== FALSE
) {
/*
797 * This is a fault on kernel virtual memory.
802 * This is a fault on non-kernel virtual memory.
803 * vm is initialized above to NULL. If curproc is NULL
804 * or curproc->p_vmspace is NULL the fault is fatal.
 */
807 vm
= lp
->lwp_vmspace
;
/* Classify the access from the hardware error code. */
815 if (frame
->tf_err
& PGEX_W
)
816 ftype
= VM_PROT_READ
| VM_PROT_WRITE
;
817 else if (frame
->tf_err
& PGEX_I
)
818 ftype
= VM_PROT_EXECUTE
;
820 ftype
= VM_PROT_READ
;
822 if (map
!= kernel_map
) {
/*
824 * Keep swapout from messing with us during this
831 * Grow the stack if necessary
 */
833 /* grow_stack returns false only if va falls into
834 * a growable stack region and the stack growth
835 * fails. It returns true if va was not within
836 * a growable stack region, or if the stack
 */
839 if (!grow_stack (map
, va
)) {
848 fault_flags
|= VM_FAULT_BURST
| VM_FAULT_USERMODE
;
849 if (ftype
& VM_PROT_WRITE
)
850 fault_flags
|= VM_FAULT_DIRTY
;
852 fault_flags
|= VM_FAULT_NORMAL
;
853 rv
= vm_fault(map
, va
, ftype
, fault_flags
);
/*
858 * Don't have to worry about process locking or stacks in the kernel.
 */
860 rv
= vm_fault(map
, va
, ftype
, VM_FAULT_NORMAL
);
863 if (rv
== KERN_SUCCESS
)
/* Kernel-mode fault with a registered recovery address: resume there. */
867 if (td
->td_gd
->gd_intr_nesting_level
== 0 &&
868 td
->td_pcb
->pcb_onfault
) {
869 frame
->tf_rip
= (register_t
)td
->td_pcb
->pcb_onfault
;
872 trap_fatal(frame
, usermode
, eva
);
/*
877 * NOTE: on x86_64 we have a tf_addr field in the trapframe, no
878 * kludge is needed to pass the fault address to signal handlers.
 */
/* Rate-limited diagnostic for user seg-faults. */
880 struct proc
*p
= td
->td_proc
;
881 krateprintf(&segfltrate
,
882 "seg-fault accessing address %p "
883 "rip=%p pid=%d p_comm=%s\n",
885 (void *)frame
->tf_rip
, p
->p_pid
, p
->p_comm
);
886 /* Debugger("seg-fault"); */
888 return((rv
== KERN_PROTECTION_FAILURE
) ? SIGBUS
: SIGSEGV
);
/*
 * trap_fatal - print a full machine-state report for an unrecoverable
 * trap (trap name, fault address/code, %cs:%rip, %ss:%rsp, rflags bits,
 * current process/thread) then enter the debugger if configured,
 * otherwise panic.
 *
 * NOTE(review): braces, local declarations and several print fragments
 * were lost in extraction; surviving lines verbatim.
 */
892 trap_fatal(struct trapframe
*frame
, int usermode
, vm_offset_t eva
)
897 code
= frame
->tf_xflags
;
898 type
= frame
->tf_trapno
;
900 if (type
<= MAX_TRAP_MSG
) {
901 kprintf("\n\nFatal trap %d: %s while in %s mode\n",
902 type
, trap_msg
[type
],
903 (usermode
? "user" : "kernel"));
905 /* two separate prints in case of a trap on an unmapped page */
906 kprintf("cpuid = %d\n", mycpu
->gd_cpuid
);
907 if (type
== T_PAGEFLT
) {
908 kprintf("fault virtual address = %p\n", (void *)eva
);
909 kprintf("fault code = %s %s, %s\n",
910 usermode
? "user" : "supervisor",
911 code
& PGEX_W
? "write" : "read",
912 code
& PGEX_P
? "protection violation" : "page not present");
914 kprintf("instruction pointer = 0x%lx:0x%lx\n",
915 frame
->tf_cs
& 0xffff, frame
->tf_rip
);
917 ss
= frame
->tf_ss
& 0xffff;
/* Kernel-mode frames have no pushed ss/rsp; synthesize them. */
920 ss
= GSEL(GDATA_SEL
, SEL_KPL
);
921 rsp
= (long)&frame
->tf_rsp
;
923 kprintf("stack pointer = 0x%x:0x%lx\n", ss
, rsp
);
924 kprintf("frame pointer = 0x%x:0x%lx\n", ss
, frame
->tf_rbp
);
925 kprintf("processor eflags = ");
926 if (frame
->tf_rflags
& PSL_T
)
927 kprintf("trace trap, ");
928 if (frame
->tf_rflags
& PSL_I
)
929 kprintf("interrupt enabled, ");
930 if (frame
->tf_rflags
& PSL_NT
)
931 kprintf("nested task, ");
932 if (frame
->tf_rflags
& PSL_RF
)
935 if (frame
->tf_eflags
& PSL_VM
)
938 kprintf("IOPL = %jd\n", (intmax_t)((frame
->tf_rflags
& PSL_IOPL
) >> 12));
939 kprintf("current process = ");
941 kprintf("%lu (%s)\n",
942 (u_long
)curproc
->p_pid
, curproc
->p_comm
?
943 curproc
->p_comm
: "");
947 kprintf("current thread = pri %d ", curthread
->td_pri
);
948 if (curthread
->td_critcount
)
/*
953 * we probably SHOULD have stopped the other CPUs before now!
954 * another CPU COULD have been touching cpl at this moment...
 */
956 kprintf(" <- SMP: XXX");
964 if ((debugger_on_panic
|| db_active
) && kdb_trap(type
, code
, frame
))
967 kprintf("trap number = %d\n", type
);
968 if (type
<= MAX_TRAP_MSG
)
969 panic("%s", trap_msg
[type
]);
971 panic("unknown/reserved trap");
/*
975 * Double fault handler. Called when a fault occurs while writing
976 * a frame for a trap/exception onto the stack. This usually occurs
977 * when the stack overflows (such is the case with infinite recursion,
980 * XXX Note that the current PTD gets replaced by IdlePTD when the
981 * task switch occurs. This means that the stack that was active at
982 * the time of the double fault is not available at <kstack> unless
983 * the machine was idle when the double fault occurred. The downside
984 * of this is that "trace <ebp>" in ddb won't work.
 */
/* Prints the TSS-saved rip/rsp/rbp and panics; never returns. */
987 dblfault_handler(void)
990 struct mdglobaldata
*gd
= mdcpu
;
993 kprintf("\nFatal double fault:\n");
995 kprintf("rip = 0x%lx\n", gd
->gd_common_tss
.tss_rip
);
996 kprintf("rsp = 0x%lx\n", gd
->gd_common_tss
.tss_rsp
);
997 kprintf("rbp = 0x%lx\n", gd
->gd_common_tss
.tss_rbp
);
999 /* two separate prints in case of a trap on an unmapped page */
1000 kprintf("cpuid = %d\n", mycpu
->gd_cpuid
);
1001 panic("double fault");
/*
1005 * syscall2 - MP aware system call request C handler
1007 * A system call is essentially treated as a trap except that the
1008 * MP lock is not held on entry or return. We are responsible for
1009 * obtaining the MP lock if necessary and for handling ASTs
1010 * (e.g. a task switch) prior to return.
 */
/*
 * Flow: userenter() -> decode syscall number from %rax and arguments
 * from the first six trapframe registers (extras copyin()'d from the
 * user stack) -> dispatch via sysent -> store result in %rax/%rdx and
 * carry flag -> userret()/KASSERT cleanup.
 *
 * NOTE(review): braces, labels (error dispatch), and several statements
 * were lost in extraction; fragments byte-identical below.
 */
1013 syscall2(struct trapframe
*frame
)
1015 struct thread
*td
= curthread
;
1016 struct proc
*p
= td
->td_proc
;
1017 struct lwp
*lp
= td
->td_lwp
;
1018 struct sysent
*callp
;
1019 register_t orig_tf_rflags
;
1024 int crit_count
= td
->td_critcount
;
1025 lwkt_tokref_t curstop
= td
->td_toks_stop
;
1027 struct sysmsg sysmsg
;
1028 union sysunion
*argp
;
/* x86_64 ABI: up to six syscall arguments arrive in registers. */
1030 const int regcnt
= 6;
1032 mycpu
->gd_cnt
.v_syscall
++;
1034 KTR_LOG(kernentry_syscall
, lp
->lwp_proc
->p_pid
, lp
->lwp_tid
,
1037 userenter(td
, p
); /* lazy raise our priority */
1042 sticks
= (int)td
->td_sticks
;
/* Saved so a traced (PSL_T) syscall can be detected after dispatch. */
1043 orig_tf_rflags
= frame
->tf_rflags
;
/*
1046 * Virtual kernel intercept - if a VM context managed by a virtual
1047 * kernel issues a system call the virtual kernel handles it, not us.
1048 * Restore the virtual kernel context and return from its system
1049 * call. The current frame is copied out to the virtual kernel.
 */
1051 if (__predict_false(lp
->lwp_vkernel
&& lp
->lwp_vkernel
->ve
)) {
1052 vkernel_trap(lp
, frame
);
1053 error
= EJUSTRETURN
;
/*
1060 * Get the system call parameters and account for time
 */
1062 lp
->lwp_md
.md_regs
= frame
;
1063 code
= frame
->tf_rax
;
1065 if (code
>= p
->p_sysent
->sv_size
)
1067 argp
= (union sysunion
*)&frame
->tf_rdi
;
1068 callp
= &p
->p_sysent
->sv_table
[code
];
/*
1071 * On x86_64 we get up to six arguments in registers. The rest are
1072 * on the stack. The first six members of 'struct trapframe' happen
1073 * to be the registers used to pass arguments, in exactly the right
1076 * Any arguments beyond available argument-passing registers must
1077 * be copyin()'d from the user stack.
 */
1079 narg
= callp
->sy_narg
;
1080 if (__predict_false(narg
> regcnt
)) {
1081 register_t
*argsdst
;
1084 argsdst
= (register_t
*)&sysmsg
.extargs
;
1085 bcopy(argp
, argsdst
, sizeof(register_t
) * regcnt
);
/* Stack args start just above the return address pushed by syscall. */
1086 params
= (caddr_t
)frame
->tf_rsp
+ sizeof(register_t
);
1088 KASSERT(params
!= NULL
, ("copyin args with no params!"));
1089 error
= copyin(params
, &argsdst
[regcnt
],
1090 (narg
- regcnt
) * sizeof(register_t
));
1091 argp
= (void *)argsdst
;
1094 if (KTRPOINT(td
, KTR_SYSCALL
)) {
1095 ktrsyscall(lp
, code
, narg
, argp
);
1103 if (KTRPOINT(td
, KTR_SYSCALL
)) {
1104 ktrsyscall(lp
, code
, narg
, argp
);
/*
1109 * Default return value is 0 (will be copied to %rax). Double-value
1110 * returns use %rax and %rdx. %rdx is left unchanged for system
1111 * calls which return only one result.
 */
1113 sysmsg
.sysmsg_fds
[0] = 0;
1114 sysmsg
.sysmsg_fds
[1] = frame
->tf_rdx
;
/*
1117 * The syscall might manipulate the trap frame. If it does it
1118 * will probably return EJUSTRETURN.
 */
1120 sysmsg
.sysmsg_frame
= frame
;
1122 STOPEVENT(p
, S_SCE
, narg
); /* MP aware */
/*
1125 * NOTE: All system calls run MPSAFE now. The system call itself
1126 * is responsible for getting the MP lock.
 */
1128 error
= (*callp
->sy_call
)(&sysmsg
, argp
);
1131 kprintf("system call %d returned %d\n", code
, error
);
/*
1136 * MP SAFE (we may or may not have the MP lock at this point)
1141 * Reinitialize proc pointer `p' as it may be different
1142 * if this is a child returning from fork syscall.
 */
1145 lp
= curthread
->td_lwp
;
/* Success path: results to %rax/%rdx, clear carry. */
1146 frame
->tf_rax
= sysmsg
.sysmsg_fds
[0];
1147 frame
->tf_rdx
= sysmsg
.sysmsg_fds
[1];
1148 frame
->tf_rflags
&= ~PSL_C
;
/*
1152 * Reconstruct pc, we know that 'syscall' is 2 bytes.
1153 * We have to do a full context restore so that %r10
1154 * (which was holding the value of %rcx) is restored for
1155 * the next iteration.
 */
1157 frame
->tf_rip
-= frame
->tf_err
;
1158 frame
->tf_r10
= frame
->tf_rcx
;
1163 panic("Unexpected EASYNC return value (for now)");
/* Error path: optional errno translation table, then set carry. */
1166 if (p
->p_sysent
->sv_errsize
) {
1167 if (error
>= p
->p_sysent
->sv_errsize
)
1168 error
= -1; /* XXX */
1170 error
= p
->p_sysent
->sv_errtbl
[error
];
1172 frame
->tf_rax
= error
;
1173 frame
->tf_rflags
|= PSL_C
;
/*
1178 * Traced syscall. trapsignal() is not MP aware.
 */
1180 if (orig_tf_rflags
& PSL_T
) {
1181 frame
->tf_rflags
&= ~PSL_T
;
1182 trapsignal(lp
, SIGTRAP
, 0);
/*
1186 * Handle reschedule and other end-of-syscall issues
 */
1188 userret(lp
, frame
, sticks
);
1191 if (KTRPOINT(td
, KTR_SYSRET
)) {
1192 ktrsysret(lp
, code
, error
, sysmsg
.sysmsg_result
);
/*
1197 * This works because errno is findable through the
1198 * register set. If we ever support an emulation where this
1199 * is not the case, this code will need to be revisited.
 */
1201 STOPEVENT(p
, S_SCX
, code
);
1204 KTR_LOG(kernentry_syscall_ret
, lp
->lwp_proc
->p_pid
, lp
->lwp_tid
, error
);
/* Sanity: no tokens may be held across the syscall boundary. */
1206 KASSERT(&td
->td_toks_base
== td
->td_toks_stop
,
1207 ("syscall: critical section count mismatch! %d/%d",
1208 crit_count
, td
->td_pri
));
1209 KASSERT(curstop
== td
->td_toks_stop
,
1210 ("syscall: extra tokens held after trap! %ld",
1211 td
->td_toks_stop
- &td
->td_toks_base
));
/*
1216 * Handles the syscall() and __syscall() API
 */
1218 void xsyscall(struct sysmsg
*sysmsg
, struct nosys_args
*uap
);
/*
 * sys_xsyscall - indirect syscall: the real syscall number arrives in
 * %rdi and the remaining arguments are shifted by one register, so only
 * five register arguments remain (regcnt = 5); extras are copyin()'d
 * from the user stack. Re-dispatches through the sysent table.
 *
 * NOTE(review): braces, locals (td/code/narg/params/error) and return
 * statements were lost in extraction; fragments verbatim.
 */
1221 sys_xsyscall(struct sysmsg
*sysmsg
, const struct nosys_args
*uap
)
1223 struct trapframe
*frame
;
1224 struct sysent
*callp
;
1225 union sysunion
*argp
;
1227 const int regcnt
= 5; /* number of args passed in registers */
1233 frame
= sysmsg
->sysmsg_frame
;
1234 code
= (u_int
)frame
->tf_rdi
;
1235 if (code
>= td
->td_proc
->p_sysent
->sv_size
)
/* Arguments begin one register past the syscall number in %rdi. */
1237 argp
= (union sysunion
*)(&frame
->tf_rdi
+ 1);
1238 callp
= &td
->td_proc
->p_sysent
->sv_table
[code
];
1239 narg
= callp
->sy_narg
;
/*
1242 * On x86_64 we get up to six arguments in registers. The rest are
1243 * on the stack. However, for syscall() and __syscall() the syscall
1244 * number is inserted as the first argument, so the limit is reduced
 */
1247 if (__predict_false(narg
> regcnt
)) {
1248 register_t
*argsdst
;
1251 argsdst
= (register_t
*)&sysmsg
->extargs
;
1252 bcopy(argp
, argsdst
, sizeof(register_t
) * regcnt
);
1253 params
= (caddr_t
)frame
->tf_rsp
+ sizeof(register_t
);
1254 error
= copyin(params
, &argsdst
[regcnt
],
1255 (narg
- regcnt
) * sizeof(register_t
));
1256 argp
= (void *)argsdst
;
1262 if (KTRPOINTP(td
->td_proc
, td
, KTR_SYSCALL
)) {
1263 ktrsyscall(td
->td_lwp
, code
, narg
, argp
);
1267 error
= (*callp
->sy_call
)(sysmsg
, argp
);
1270 if (KTRPOINTP(td
->td_proc
, td
, KTR_SYSRET
)) {
1271 ktrsysret(td
->td_lwp
, code
, error
, sysmsg
->sysmsg_result
);
/*
1279 * NOTE: mplock not held at any point
 */
/*
 * fork_return - child side of fork(): fake a successful zero-returning
 * syscall in the child's frame, then finish through
 * generic_lwp_return() and log the fork-return KTR probe.
 */
1282 fork_return(struct lwp
*lp
, struct trapframe
*frame
)
1284 frame
->tf_rax
= 0; /* Child returns zero */
1285 frame
->tf_rflags
&= ~PSL_C
; /* success */
1288 generic_lwp_return(lp
, frame
);
1289 KTR_LOG(kernentry_fork_ret
, lp
->lwp_proc
->p_pid
, lp
->lwp_tid
);
/*
1293 * Simplified back end of syscall(), used when returning from fork()
1294 * directly into user mode.
1296 * This code will return back into the fork trampoline code which then
1299 * NOTE: The mplock is not held at any point.
 */
/* NOTE(review): braces and some statements lost in extraction; fragments verbatim. */
1302 generic_lwp_return(struct lwp
*lp
, struct trapframe
*frame
)
1304 struct proc
*p
= lp
->lwp_proc
;
/*
1307 * Check for exit-race. If one lwp exits the process concurrent with
1308 * another lwp creating a new thread, the two operations may cross
1309 * each other resulting in the newly-created lwp not receiving a
 */
1312 if (p
->p_flags
& P_WEXIT
) {
1313 lwpsignal(p
, lp
, SIGKILL
);
/*
1317 * Newly forked processes are given a kernel priority. We have to
1318 * adjust the priority to a normal user priority and fake entry
1319 * into the kernel (call userenter()) to install a passive release
1320 * function just in case userret() decides to stop the process. This
1321 * can occur when ^Z races a fork. If we do not install the passive
1322 * release function the current process designation will not be
1323 * released when the thread goes to sleep.
 */
1325 lwkt_setpri_self(TDPRI_USER_NORM
);
1326 userenter(lp
->lwp_thread
, p
);
1327 userret(lp
, frame
, 0);
1329 if (KTRPOINT(lp
->lwp_thread
, KTR_SYSRET
))
1330 ktrsysret(lp
, SYS_fork
, 0, 0);
/* Bracket userexit() with PASSIVE_ACQ — TODO confirm against full source. */
1332 lp
->lwp_flags
|= LWP_PASSIVE_ACQ
;
1334 lp
->lwp_flags
&= ~LWP_PASSIVE_ACQ
;
/*
1338 * doreti has turned into this. The frame is directly on the stack. We
1339 * pull everything else we need (fpu and tls context) from the current
1342 * Note on fpu interactions: In a virtual kernel, the fpu context for
1343 * an emulated user mode process is not shared with the virtual kernel's
1344 * fpu context, so we only have to 'stack' fpu contexts within the virtual
1345 * kernel itself, and not even then since the signal() contexts that we care
1346 * about save and restore the FPU state (I think anyhow).
1348 * vmspace_ctl() returns an error only if it had problems instaling the
1349 * context we supplied or problems copying data to/from our VM space.
 */
/*
 * go_user - run the emulated user context via vmspace_ctl(RUN) in a
 * loop, managing FPU handoff (PGEX_FPFAULT) around each run and
 * converting pending AST requests into a synthetic T_ASTFLT.
 *
 * NOTE(review): the enclosing loop, braces and several statements were
 * lost in extraction; fragments verbatim.
 */
1352 go_user(struct intrframe
*frame
)
1354 struct trapframe
*tf
= (void *)&frame
->if_rdi
;
/*
1360 * Interrupts may be disabled on entry, make sure all signals
1361 * can be received before beginning our loop.
1366 * Switch to the current simulated user process, then call
1367 * user_trap() when we break out of it (usually due to a signal).
1372 * Always make the FPU state correct. This should generally
1373 * be faster because the cost of taking a #NM fault through
1374 * the vkernel to the real kernel is astronomical.
 */
1377 tf
->tf_xflags
&= ~PGEX_FPFAULT
;
1378 if (mdcpu
->gd_npxthread
!= curthread
) {
1379 if (mdcpu
->gd_npxthread
)
1380 npxsave(mdcpu
->gd_npxthread
->td_savefpu
);
/*
1385 * Tell the real kernel whether it is ok to use the FP
1386 * unit or not, allowing us to take a T_DNA exception
1387 * if the context tries to use the FP.
 */
1389 if (mdcpu
->gd_npxthread
== curthread
) {
1390 tf
->tf_xflags
&= ~PGEX_FPFAULT
;
1392 tf
->tf_xflags
|= PGEX_FPFAULT
;
/*
1397 * Run emulated user process context. This call interlocks
1398 * with new mailbox signals.
1400 * Set PGEX_U unconditionally, indicating a user frame (the
1401 * bit is normally set only by T_PAGEFLT).
 */
1403 id
= &curproc
->p_vmspace
->vm_pmap
;
/*
1406 * The GDF_VIRTUSER hack helps statclock() figure out who
1407 * the tick belongs to.
 */
1410 gd
->gd_flags
|= GDF_VIRTUSER
;
1411 r
= vmspace_ctl(id
, VMSPACE_CTL_RUN
, tf
,
1412 &curthread
->td_savevext
);
1414 frame
->if_xflags
|= PGEX_U
;
/*
1417 * Immediately save the user FPU state. The vkernel is a
1418 * user program and libraries like libc will use the FP
 */
1421 if (mdcpu
->gd_npxthread
== curthread
) {
1422 npxsave(mdcpu
->gd_npxthread
->td_savefpu
);
1425 gd
->gd_flags
&= ~GDF_VIRTUSER
;
1427 kprintf("GO USER %d trap %ld EVA %08lx RIP %08lx RSP %08lx XFLAGS %02lx/%02lx\n",
1428 r
, tf
->tf_trapno
, tf
->tf_addr
, tf
->tf_rip
, tf
->tf_rsp
,
1429 tf
->tf_xflags
, frame
->if_xflags
);
1433 panic("vmspace_ctl failed error %d", errno
);
1435 if (tf
->tf_trapno
) {
/* Pending AST work is surfaced to user_trap() as T_ASTFLT. */
1439 if (mycpu
->gd_reqflags
& RQF_AST_MASK
) {
1440 tf
->tf_trapno
= T_ASTFLT
;
/*
1448 * If PGEX_FPFAULT is set then set FP_VIRTFP in the PCB to force a T_DNA
1449 * fault (which is then passed back to the virtual kernel) if an attempt is
1450 * made to use the FP unit.
1452 * XXX this is a fairly big hack.
 */
/* NOTE(review): braces and the npx-release statement lost in extraction. */
1455 set_vkernel_fp(struct trapframe
*frame
)
1457 struct thread
*td
= curthread
;
1459 if (frame
->tf_xflags
& PGEX_FPFAULT
) {
1460 td
->td_pcb
->pcb_flags
|= FP_VIRTFP
;
1461 if (mdcpu
->gd_npxthread
== td
)
/* else-branch: clear the forced-fault flag. */
1464 td
->td_pcb
->pcb_flags
&= ~FP_VIRTFP
;
/*
1469 * Called from vkernel_trap() to fixup the vkernel's syscall
1470 * frame for vmspace_ctl() return.
 */
/*
 * Stores the error code in %rax and mirrors it in the carry flag
 * (set on error, clear on success), matching the syscall return ABI.
 * NOTE(review): braces and the if/else keywords around the two
 * complementary PSL_C updates were lost in extraction.
 */
1473 cpu_vkernel_trap(struct trapframe
*frame
, int error
)
1475 frame
->tf_rax
= error
;
1477 frame
->tf_rflags
|= PSL_C
;
1479 frame
->tf_rflags
&= ~PSL_C
;