sys/platform/vkernel64/x86_64/trap.c

   1 /*-
   2  * Copyright (C) 1994, David Greenman
   3  * Copyright (c) 1990, 1993
   4  *      The Regents of the University of California.  All rights reserved.
   5  *
   6  * This code is derived from software contributed to Berkeley by
   7  * the University of Utah, and William Jolitz.
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  * 1. Redistributions of source code must retain the above copyright
  13  *    notice, this list of conditions and the following disclaimer.
  14  * 2. Redistributions in binary form must reproduce the above copyright
  15  *    notice, this list of conditions and the following disclaimer in the
  16  *    documentation and/or other materials provided with the distribution.
  17  * 3. All advertising materials mentioning features or use of this software
  18  *    must display the following acknowledgement:
  19  *      This product includes software developed by the University of
  20  *      California, Berkeley and its contributors.
  21  * 4. Neither the name of the University nor the names of its contributors
  22  *    may be used to endorse or promote products derived from this software
  23  *    without specific prior written permission.
  24  *
  25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  35  * SUCH DAMAGE.
  36  *
  37  *      from: @(#)trap.c        7.4 (Berkeley) 5/13/91
  38  * $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $
  39  */
  40
  41 /*
  42  * x86_64 Trap and System call handling
  43  */
  44
  45 #include "use_isa.h"
  46
  47 #include "opt_ddb.h"
  48 #include "opt_ktrace.h"
  49
  50 #include <sys/param.h>
  51 #include <sys/systm.h>
  52 #include <sys/proc.h>
  53 #include <sys/pioctl.h>
  54 #include <sys/kernel.h>
  55 #include <sys/resourcevar.h>
  56 #include <sys/signalvar.h>
  57 #include <sys/signal2.h>
  58 #include <sys/syscall.h>
  59 #include <sys/sysctl.h>
  60 #include <sys/sysent.h>
  61 #include <sys/vmmeter.h>
  62 #include <sys/malloc.h>
  63 #ifdef KTRACE
  64 #include <sys/ktrace.h>
  65 #endif
  66 #include <sys/ktr.h>
  67 #include <sys/vkernel.h>
  68 #include <sys/sysmsg.h>
  69 #include <sys/vmspace.h>
  70
  71 #include <vm/vm.h>
  72 #include <vm/vm_param.h>
  73 #include <sys/lock.h>
  74 #include <vm/pmap.h>
  75 #include <vm/vm_kern.h>
  76 #include <vm/vm_map.h>
  77 #include <vm/vm_page.h>
  78 #include <vm/vm_extern.h>
  79
  80 #include <machine/cpu.h>
  81 #include <machine/md_var.h>
  82 #include <machine/pcb.h>
  83 #include <machine/smp.h>
  84 #include <machine/tss.h>
  85 #include <machine/globaldata.h>
  86
  87 #include <ddb/ddb.h>
  88
  89 #include <sys/msgport2.h>
  90 #include <sys/thread2.h>
  91 #include <sys/mplock2.h>
  92
  93 int (*pmath_emulate) (struct trapframe *);  /* FP emulator hook; user_trap() tests it for NULL before calling it on T_DNA */
  94
  95 static int trap_pfault (struct trapframe *, int, vm_offset_t);  /* page fault handler: (frame, usermode, fault address) */
  96 static void trap_fatal (struct trapframe *, int, vm_offset_t);  /* called in this file as (frame, usermode flag, fault address) */
  97 void dblfault_handler (void);   /* declared here; the definition is not in this part of the file */
  98
  99 static struct krate segfltrate = { 1 };  /* NOTE(review): presumably rate-limits segfault reports; no use is visible here */
 100 /* Disabled: IDTVEC(syscall) is referenced only by the #if 0 code in kern_trap(). */
 101 #if 0
 102 extern inthand_t IDTVEC(syscall);
 103 #endif
 104
 105 #define MAX_TRAP_MSG            30      /* highest valid index into trap_msg[] */
 106 static char *trap_msg[] = {             /* trap descriptions indexed by trap number; "" marks an unused slot */
 107         "",                                     /*  0 unused */
 108         "privileged instruction fault",         /*  1 T_PRIVINFLT */
 109         "",                                     /*  2 unused */
 110         "breakpoint instruction fault",         /*  3 T_BPTFLT */
 111         "",                                     /*  4 unused */
 112         "",                                     /*  5 unused */
 113         "arithmetic trap",                      /*  6 T_ARITHTRAP */
 114         "system forced exception",              /*  7 T_ASTFLT */
 115         "",                                     /*  8 unused */
 116         "general protection fault",             /*  9 T_PROTFLT */
 117         "trace trap",                           /* 10 T_TRCTRAP */
 118         "",                                     /* 11 unused */
 119         "page fault",                           /* 12 T_PAGEFLT */
 120         "",                                     /* 13 unused */
 121         "alignment fault",                      /* 14 T_ALIGNFLT */
 122         "",                                     /* 15 unused */
 123         "",                                     /* 16 unused */
 124         "",                                     /* 17 unused */
 125         "integer divide fault",                 /* 18 T_DIVIDE */
 126         "non-maskable interrupt trap",          /* 19 T_NMI */
 127         "overflow trap",                        /* 20 T_OFLOW */
 128         "FPU bounds check fault",               /* 21 T_BOUND */
 129         "FPU device not available",             /* 22 T_DNA */
 130         "double fault",                         /* 23 T_DOUBLEFLT */
 131         "FPU operand fetch fault",              /* 24 T_FPOPFLT */
 132         "invalid TSS fault",                    /* 25 T_TSSFLT */
 133         "segment not present fault",            /* 26 T_SEGNPFLT */
 134         "stack fault",                          /* 27 T_STKFLT */
 135         "machine check trap",                   /* 28 T_MCHK */
 136         "SIMD floating-point exception",        /* 29 T_XMMFLT */
 137         "reserved (unknown) fault",             /* 30 T_RESERVED */
 138 };
 139
 140 #ifdef DDB
 141 static int ddb_on_nmi = 1;      /* non-zero: user_trap() calls kdb_trap() on an NMI for which isa_nmi() returned 0 */
 142 SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW,
 143         &ddb_on_nmi, 0, "Go to DDB on NMI");
 144 #endif
 145 static int panic_on_nmi = 1;    /* non-zero: user_trap() panics on an NMI for which isa_nmi() returned non-zero */
 146 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
 147         &panic_on_nmi, 0, "Panic on NMI");
 148
 149 /*
 150  * Kernel-entry bookkeeping for a thread arriving from userland.
 151  *
 152  * Installs lwkt_passive_release as the thread's td_release hook, so the
 153  * switch from user to kernel priority is only paid for if the thread
 154  * actually switches; syscalls and traps that never switch stay cheap.
 155  *
 156  * Also brings td_ucred in line with p_ucred, giving everything that enters
 157  * the kernel from userland a stable read-only copy of the process ucred.
 158  */
 159 static __inline void
 160 userenter(struct thread *curtd, struct proc *curp)
 161 {
 162         struct ucred *stale;
 163         struct ucred *fresh;
 164
 165         curtd->td_release = lwkt_passive_release;
 166
 167         if (curtd->td_ucred == curp->p_ucred)
 168                 return;         /* cached credential is already current */
 169         fresh = crhold(curp->p_ucred);
 170         stale = curtd->td_ucred;
 171         curtd->td_ucred = fresh;
 172         if (stale != NULL)
 173                 crfree(stale);  /* drop the reference held on the old one */
 174 }
 175
 176 /*
 177  * Handle signals, profiling, and other AST's and/or tasks that
 178  * must be completed before we can return to or try to return to userland.
 179  * lp is the returning lwp, frame its trapframe, sticks the td_sticks sample taken at kernel entry.
 180  * Note that td_sticks is a 64 bit quantity, but there's no point doing 64
 181  * bit arithmetic on the delta calculation so the absolute tick values are
 182  * truncated to an integer.
 183  */
 184 static void
 185 userret(struct lwp *lp, struct trapframe *frame, int sticks)
 186 {
 187         struct proc *p = lp->lwp_proc;
 188         int sig;        /* pending signal number, 0 when none */
 189         int ptok;       /* filled in by CURSIG_LCK_TRACE() and handed to postsig() */
 190
 191         /*
 192          * Charge system time if profiling.  Note: times are in microseconds.
 193          * This may do a copyout and block, so do it first even though it
 194          * means some system time will be charged as user time.
 195          */
 196         if (p->p_flags & P_PROFIL) {
 197                 addupc_task(p, frame->tf_rip,
 198                         (u_int)((int)lp->lwp_thread->td_sticks - sticks));
 199         }
 200
 201 recheck:        /* every handler below jumps back here; we fall out only when no condition applies */
 202         /*
 203          * Specific on-return-to-usermode checks (LWP_MP_WEXIT,
 204          * LWP_MP_VNLRU, etc).
 205          */
 206         if (lp->lwp_mpflags & LWP_MP_URETMASK)
 207                 lwpuserret(lp);
 208
 209         /*
 210          * Block here if we are in a stopped state.
 211          */
 212         if (STOPLWP(p, lp)) {
 213                 lwkt_gettoken(&p->p_token);
 214                 tstop();
 215                 lwkt_reltoken(&p->p_token);
 216                 goto recheck;
 217         }
 218
 219         /*
 220          * Deliver the SIGVTALRM / SIGPROF signals requested through the
 221          * P_SIGVTALRM / P_SIGPROF process flags, clearing each flag first.
 222          */
 223         if (p->p_flags & (P_SIGVTALRM | P_SIGPROF)) {
 224                 lwkt_gettoken(&p->p_token);     /* p_flags is modified under the process token */
 225                 if (p->p_flags & P_SIGVTALRM) {
 226                         p->p_flags &= ~P_SIGVTALRM;
 227                         ksignal(p, SIGVTALRM);
 228                 }
 229                 if (p->p_flags & P_SIGPROF) {
 230                         p->p_flags &= ~P_SIGPROF;
 231                         ksignal(p, SIGPROF);
 232                 }
 233                 lwkt_reltoken(&p->p_token);
 234                 goto recheck;
 235         }
 236
 237         /*
 238          * Post any pending signals
 239          *
 240          * WARNING!  postsig() can exit and not return.
 241          */
 242         if ((sig = CURSIG_LCK_TRACE(lp, &ptok)) != 0) {
 243                 postsig(sig, ptok);
 244                 goto recheck;
 245         }
 246
 247         /*
 248          * In a multi-threaded program it is possible for a thread to change
 249          * signal state during a system call which temporarily changes the
 250          * signal mask.  In this case postsig() might not be run and we
 251          * have to restore the mask ourselves.
 252          */
 253         if (lp->lwp_flags & LWP_OLDMASK) {
 254                 lp->lwp_flags &= ~LWP_OLDMASK;
 255                 lp->lwp_sigmask = lp->lwp_oldsigmask;
 256                 goto recheck;   /* the restored mask may expose a pending signal */
 257         }
 258 }
 259
 260 /*
 261  * Undo userenter() and any passive release that occurred while in the
 262  * kernel.  The current-process designation is reclaimed before the return
 263  * to usermode; both LWKT and USER reschedule requests are handled here.
 264  */
 265 static __inline void
 266 userexit(struct lwp *lp)
 267 {
 268         struct thread *td = lp->lwp_thread;
 269         struct proc *p = lp->lwp_proc;
 270
 271         /*
 272          * Service stop requests while still at kernel priority.  A request
 273          * queued once this loop has finished generates another AST.
 274          */
 275         while (STOPLWP(p, lp)) {
 276                 lwkt_gettoken(&p->p_token);
 277                 tstop();
 278                 lwkt_reltoken(&p->p_token);
 279         }
 280
 281         /*
 282          * Drop back to user priority ahead of the return to userland.  When
 283          * the passive release hook is still installed the priority was
 284          * never raised, so there is nothing to lower.
 285          */
 286         lwkt_passive_recover(td);
 287
 288         /*
 289          * Make this lwp the current user scheduled process if it is not
 290          * already, and deal with reschedule requests and other factors.
 291          */
 292         p->p_usched->acquire_curproc(lp);
 293         /* WARNING: we may have migrated cpu's, so a per-cpu pointer */
 294         /* (such as td->td_gd) loaded earlier must not be reused here. */
 295 }
 296
 297 #if !defined(KTR_KERNENTRY)     /* default the kernentry mask to KTR_ALL when the build did not set one */
 298 #define KTR_KERNENTRY   KTR_ALL
 299 #endif
 300 KTR_INFO_MASTER(kernentry);     /* master record for the kernentry_* trace events declared below */
 301 KTR_INFO(KTR_KERNENTRY, kernentry, trap, 0,
 302          "TRAP(pid %hd, tid %hd, trapno %ld, eva %lu)",
 303          pid_t pid, lwpid_t tid,  register_t trapno, vm_offset_t eva);   /* logged by user_trap() on entry */
 304 KTR_INFO(KTR_KERNENTRY, kernentry, trap_ret, 0, "TRAP_RET(pid %hd, tid %hd)",
 305          pid_t pid, lwpid_t tid);        /* logged by user_trap() at its out2 label */
 306 KTR_INFO(KTR_KERNENTRY, kernentry, syscall, 0, "SYSC(pid %hd, tid %hd, nr %ld)",
 307          pid_t pid, lwpid_t tid,  register_t trapno);    /* not logged in the part of the file shown here */
 308 KTR_INFO(KTR_KERNENTRY, kernentry, syscall_ret, 0, "SYSRET(pid %hd, tid %hd, err %d)",
 309          pid_t pid, lwpid_t tid,  int err);      /* not logged in the part of the file shown here */
 310 KTR_INFO(KTR_KERNENTRY, kernentry, fork_ret, 0, "FORKRET(pid %hd, tid %hd)",
 311          pid_t pid, lwpid_t tid);        /* not logged in the part of the file shown here */
 312
 313 /*
 314  * Exception, fault, and trap interface to the kernel.
 315  * This common code is called from assembly language IDT gate entry
 316  * routines that prepare a suitable stack frame, and restore this
 317  * frame after the exception has been processed.
 318  * user_trap() takes every trap from user mode: fast system calls go to syscall2(), the rest become a signal or a vkernel intercept.
 319  * This function is also called from doreti in an interlock to handle ASTs.
 320  * For example:  hardwareint->INTROUTINE->(set ast)->doreti->trap
 321  * frame: trapframe of the interrupted context; tf_rflags may be modified (PSL_T is cleared for trace/breakpoint traps).
 322  * NOTE!  We have to retrieve the fault address prior to obtaining the
 323  * MP lock because get_mplock() may switch out.  YYY cr2 really ought
 324  * to be retrieved by the assembly code, not here.
 325  *
 326  * XXX gd_trap_nesting_level currently prevents lwkt_switch() from panicking
 327  * if an attempt is made to switch from a fast interrupt or IPI.  This is
 328  * necessary to properly take fatal kernel traps on SMP machines if
 329  * get_mplock() has to block.
 330  */
 331
 332 void
 333 user_trap(struct trapframe *frame)
 334 {
 335         struct globaldata *gd = mycpu;
 336         struct thread *td = gd->gd_curthread;
 337         struct lwp *lp = td->td_lwp;
 338         struct proc *p;
 339         int sticks = 0;         /* td_sticks sample taken at entry, handed to userret() */
 340         int i = 0, ucode = 0, type, code;       /* i: signal to post, ucode: its code, type/code: tf_trapno/tf_err */
 341 #ifdef INVARIANTS
 342         int crit_count = td->td_critcount;      /* entry values, compared again at out2 */
 343         lwkt_tokref_t curstop = td->td_toks_stop;
 344 #endif
 345         vm_offset_t eva;        /* faulting address, meaningful for page faults only */
 346
 347         p = td->td_proc;
 348
 349         if (frame->tf_trapno == T_PAGEFLT)
 350                 eva = frame->tf_addr;
 351         else
 352                 eva = 0;
 353 #if 0
 354         kprintf("USER_TRAP AT %08lx xflags %ld trapno %ld eva %08lx\n",
 355                 frame->tf_rip, frame->tf_xflags, frame->tf_trapno, eva);
 356 #endif
 357
 358         /*
 359          * Everything coming from user mode runs through user_trap,
 360          * including system calls.
 361          */
 362         if (frame->tf_trapno == T_FAST_SYSCALL) {
 363                 syscall2(frame);
 364                 return;         /* none of the trap bookkeeping below runs for a system call */
 365         }
 366
 367         KTR_LOG(kernentry_trap, lp->lwp_proc->p_pid, lp->lwp_tid,
 368                 frame->tf_trapno, eva);
 369
 370 #ifdef DDB
 371         if (db_active) {
 372                 eva = (frame->tf_trapno == T_PAGEFLT ? rcr2() : 0);     /* NOTE(review): replaces the tf_addr value chosen above; kern_trap() keeps tf_addr -- confirm rcr2() is intended */
 373                 ++gd->gd_trap_nesting_level;
 374                 trap_fatal(frame, TRUE, eva);
 375                 --gd->gd_trap_nesting_level;
 376                 goto out2;
 377         }
 378 #endif
 379
 380         type = frame->tf_trapno;
 381         code = frame->tf_err;
 382
 383         userenter(td, p);
 384
 385         sticks = (int)td->td_sticks;
 386         lp->lwp_md.md_regs = frame;
 387         /* Pick the signal (i) and signal code (ucode) for the trap, or finish early through out / out2. */
 388         switch (type) {
 389         case T_PRIVINFLT:       /* privileged instruction fault */
 390                 i = SIGILL;
 391                 ucode = ILL_PRVOPC;
 392                 break;
 393
 394         case T_BPTFLT:          /* bpt instruction fault */
 395         case T_TRCTRAP:         /* trace trap */
 396                 frame->tf_rflags &= ~PSL_T;
 397                 i = SIGTRAP;
 398                 ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
 399                 break;
 400
 401         case T_ARITHTRAP:       /* arithmetic trap */
 402                 ucode = code;
 403                 i = SIGFPE;
 404                 break;
 405
 406         case T_ASTFLT:          /* Allow process switch */
 407                 mycpu->gd_cnt.v_soft++;
 408                 if (mycpu->gd_reqflags & RQF_AST_OWEUPC) {
 409                         atomic_clear_int(&mycpu->gd_reqflags, RQF_AST_OWEUPC);
 410                         addupc_task(p, p->p_prof.pr_addr, p->p_prof.pr_ticks);
 411                 }
 412                 goto out;
 413
 414                 /*
 415                  * The following two traps can happen in
 416                  * vm86 mode, and, if so, we want to handle
 417                  * them specially.
 418                  */
 419         case T_PROTFLT:         /* general protection fault */
 420         case T_STKFLT:          /* stack fault */
 421 #if 0
 422                 if (frame->tf_eflags & PSL_VM) {
 423                         i = vm86_emulate((struct vm86frame *)frame);
 424                         if (i == 0)
 425                                 goto out;
 426                         break;
 427                 }
 428 #endif
 429                 /* FALL THROUGH */
 430
 431         case T_SEGNPFLT:        /* segment not present fault */
 432         case T_TSSFLT:          /* invalid TSS fault */
 433         case T_DOUBLEFLT:       /* double fault */
 434         default:
 435                 i = SIGBUS;
 436                 ucode = code + BUS_SEGM_FAULT ;
 437                 break;
 438
 439         case T_PAGEFLT:         /* page fault */
 440                 i = trap_pfault(frame, TRUE, eva);
 441                 if (i == -1 || i == 0)
 442                         goto out;
 443                 /* Any other result is reported as SIGSEGV: SIGSEGV itself as a mapping error, everything else as an access error. */
 444
 445                 if (i == SIGSEGV)
 446                         ucode = SEGV_MAPERR;
 447                 else {
 448                         i = SIGSEGV;
 449                         ucode = SEGV_ACCERR;
 450                 }
 451                 break;
 452
 453         case T_DIVIDE:          /* integer divide fault */
 454                 ucode = FPE_INTDIV;
 455                 i = SIGFPE;
 456                 break;
 457
 458 #if NISA > 0
 459         case T_NMI:
 460                 /* machine/parity/power fail/"kitchen sink" faults */
 461                 if (isa_nmi(code) == 0) {
 462 #ifdef DDB
 463                         /*
 464                          * NMI can be hooked up to a pushbutton
 465                          * for debugging.
 466                          */
 467                         if (ddb_on_nmi) {
 468                                 kprintf ("NMI ... going to debugger\n");
 469                                 kdb_trap(type, 0, frame);
 470                         }
 471 #endif /* DDB */
 472                         goto out2;
 473                 } else if (panic_on_nmi)
 474                         panic("NMI indicates hardware failure");
 475                 break;
 476 #endif /* NISA > 0 */
 477
 478         case T_OFLOW:           /* integer overflow fault */
 479                 ucode = FPE_INTOVF;
 480                 i = SIGFPE;
 481                 break;
 482
 483         case T_BOUND:           /* bounds check fault */
 484                 ucode = FPE_FLTSUB;
 485                 i = SIGFPE;
 486                 break;
 487
 488         case T_DNA:
 489                 /*
 490                  * Virtual kernel intercept - pass the DNA exception
 491                  * to the (emulated) virtual kernel if it asked to handle
 492                  * it.  This occurs when the virtual kernel is holding
 493                  * onto the FP context for a different emulated
 494                  * process then the one currently running.
 495                  *
 496                  * We must still call npxdna() since we may have
 497                  * saved FP state that the (emulated) virtual kernel
 498                  * needs to hand over to a different emulated process.
 499                  */
 500                 if (lp->lwp_vkernel && lp->lwp_vkernel->ve &&
 501                     (td->td_pcb->pcb_flags & FP_VIRTFP)
 502                 ) {
 503                         npxdna(frame);
 504                         break;  /* i stays 0; the vkernel intercept after the switch takes over */
 505                 }
 506
 507                 /*
 508                  * The kernel may have switched out the FP unit's
 509                  * state, causing the user process to take a fault
 510                  * when it tries to use the FP unit.  Restore the
 511                  * state here
 512                  */
 513                 if (npxdna(frame)) {
 514                         gd->gd_cnt.v_trap++;
 515                         goto out;
 516                 }
 517                 if (!pmath_emulate) {
 518                         i = SIGFPE;
 519                         ucode = FPE_FPU_NP_TRAP;
 520                         break;
 521                 }
 522                 i = (*pmath_emulate)(frame);
 523                 if (i == 0) {
 524                         if (!(frame->tf_rflags & PSL_T))
 525                                 goto out2;
 526                         frame->tf_rflags &= ~PSL_T;
 527                         i = SIGTRAP;
 528                 }
 529                 /* else ucode = emulator_only_knows() XXX */
 530                 break;
 531
 532         case T_FPOPFLT:         /* FPU operand fetch fault */
 533                 ucode = T_FPOPFLT;
 534                 i = SIGILL;
 535                 break;
 536
 537         case T_XMMFLT:          /* SIMD floating-point exception */
 538                 ucode = 0; /* XXX */
 539                 i = SIGFPE;
 540                 break;
 541         }
 542
 543         /*
 544          * Virtual kernel intercept - if the fault is directly related to a
 545          * VM context managed by a virtual kernel then let the virtual kernel
 546          * handle it.
 547          */
 548         if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
 549                 vkernel_trap(lp, frame);
 550                 goto out;
 551         }
 552
 553         /*
 554          * Translate fault for emulators (e.g. Linux)
 555          */
 556         if (*p->p_sysent->sv_transtrap)
 557                 i = (*p->p_sysent->sv_transtrap)(i, type);
 558
 559         trapsignal(lp, i, ucode);
 560
 561 #ifdef DEBUG
 562         if (type <= MAX_TRAP_MSG) {
 563                 uprintf("fatal process exception: %s",
 564                         trap_msg[type]);
 565                 if ((type == T_PAGEFLT) || (type == T_PROTFLT))
 566                         uprintf(", fault VA = 0x%lx", (u_long)eva);
 567                 uprintf("\n");
 568         }
 569 #endif
 570
 571 out:
 572         userret(lp, frame, sticks);
 573         userexit(lp);
 574 out2:   ;
 575         KTR_LOG(kernentry_trap_ret, lp->lwp_proc->p_pid, lp->lwp_tid);
 576 #ifdef INVARIANTS
 577         KASSERT(crit_count == td->td_critcount,
 578                 ("trap: critical section count mismatch! %d/%d",
 579                 crit_count, td->td_critcount)); /* print the count that was compared, not td_pri */
 580         KASSERT(curstop == td->td_toks_stop,
 581                 ("trap: extra tokens held after trap! %ld/%ld",
 582                 curstop - &td->td_toks_base,
 583                 td->td_toks_stop - &td->td_toks_base));
 584 #endif
 585 }
 586 /* kern_trap: handle a trap taken in kernel mode; each case either recovers and leaves through out2 or breaks to the common tail. */
 587 void
 588 kern_trap(struct trapframe *frame)
 589 {
 590         struct globaldata *gd = mycpu;
 591         struct thread *td = gd->gd_curthread;
 592         struct lwp *lp;
 593         struct proc *p;
 594         int i = 0, ucode = 0, type, code;       /* no case below assigns i or ucode */
 595 #ifdef INVARIANTS
 596         int crit_count = td->td_critcount;      /* entry values, compared again at out2 */
 597         lwkt_tokref_t curstop = td->td_toks_stop;
 598 #endif
 599         vm_offset_t eva;        /* faulting address, meaningful for page faults only */
 600
 601         lp = td->td_lwp;
 602         p = td->td_proc;
 603
 604         if (frame->tf_trapno == T_PAGEFLT)
 605                 eva = frame->tf_addr;
 606         else
 607                 eva = 0;
 608
 609 #ifdef DDB
 610         if (db_active) {
 611                 ++gd->gd_trap_nesting_level;
 612                 trap_fatal(frame, FALSE, eva);
 613                 --gd->gd_trap_nesting_level;
 614                 goto out2;
 615         }
 616 #endif
 617
 618         type = frame->tf_trapno;
 619         code = frame->tf_err;
 620
 621 #if 0
 622 kernel_trap:
 623 #endif
 624         /* kernel trap */
 625
 626         switch (type) {
 627         case T_PAGEFLT:                 /* page fault */
 628                 trap_pfault(frame, FALSE, eva); /* return value is not examined here */
 629                 goto out2;
 630
 631         case T_DNA:
 632                 /*
 633                  * A DNA trap taken in kernel mode is treated as fatal.
 634                  * NOTE(review): panic() is not expected to return, which makes the npxdna() call below dead code -- confirm.
 635                  */
 636                 panic("kernel NPX should not happen");
 637                 if (npxdna(frame))
 638                         goto out2;
 639                 break;
 640
 641         case T_PROTFLT:         /* general protection fault */
 642         case T_SEGNPFLT:        /* segment not present fault */
 643                 /*
 644                  * Invalid segment selectors and out of bounds
 645                  * %eip's and %esp's can be set up in user mode.
 646                  * This causes a fault in kernel mode when the
 647                  * kernel tries to return to user mode.  We want
 648                  * to get this fault so that we can fix the
 649                  * problem here and not have to check all the
 650                  * selectors and pointers when the user changes
 651                  * them.
 652                  */
 653                 if (mycpu->gd_intr_nesting_level == 0) {
 654                         if (td->td_pcb->pcb_onfault) {
 655                                 frame->tf_rip =
 656                                     (register_t)td->td_pcb->pcb_onfault;
 657                                 goto out2;      /* execution resumes at the registered pcb_onfault address */
 658                         }
 659                 }
 660                 break;
 661
 662         case T_TSSFLT:
 663                 /*
 664                  * PSL_NT can be set in user mode and isn't cleared
 665                  * automatically when the kernel is entered.  This
 666                  * causes a TSS fault when the kernel attempts to
 667                  * `iret' because the TSS link is uninitialized.  We
 668                  * want to get this fault so that we can fix the
 669                  * problem here and not every time the kernel is
 670                  * entered.
 671                  */
 672                 if (frame->tf_rflags & PSL_NT) {
 673                         frame->tf_rflags &= ~PSL_NT;
 674                         goto out2;
 675                 }
 676                 break;
 677
 678         case T_TRCTRAP:  /* trace trap */
 679 #if 0
 680                 if (frame->tf_eip == (int)IDTVEC(syscall)) {
 681                         /*
 682                          * We've just entered system mode via the
 683                          * syscall lcall.  Continue single stepping
 684                          * silently until the syscall handler has
 685                          * saved the flags.
 686                          */
 687                         goto out2;
 688                 }
 689                 if (frame->tf_eip == (int)IDTVEC(syscall) + 1) {
 690                         /*
 691                          * The syscall handler has now saved the
 692                          * flags.  Stop single stepping it.
 693                          */
 694                         frame->tf_eflags &= ~PSL_T;
 695                         goto out2;
 696                 }
 697 #endif
 698 #if 0
 699                 /*
 700                  * Ignore debug register trace traps due to
 701                  * accesses in the user's address space, which
 702                  * can happen under several conditions such as
 703                  * if a user sets a watchpoint on a buffer and
 704                  * then passes that buffer to a system call.
 705                  * We still want to get TRCTRAPS for addresses
 706                  * in kernel space because that is useful when
 707                  * debugging the kernel.
 708                  */
 709                 if (user_dbreg_trap()) {
 710                         /*
 711                          * Reset breakpoint bits because the
 712                          * processor doesn't
 713                          */
 714                         load_dr6(rdr6() & 0xfffffff0);
 715                         goto out2;
 716                 }
 717 #endif
 718                 /*
 719                  * Fall through (TRCTRAP kernel mode, kernel address)
 720                  */
 721         case T_BPTFLT:
 722                 /*
 723                  * If DDB is enabled, let it handle the debugger trap.
 724                  * Otherwise, debugger traps "can't happen".
 725                  */
 726 #ifdef DDB
 727                 if (kdb_trap (type, 0, frame))
 728                         goto out2;
 729 #endif
 730                 break;
 731         case T_DIVIDE:
 732                 trap_fatal(frame, FALSE, eva);
 733                 goto out2;
 734         case T_NMI:
 735                 trap_fatal(frame, FALSE, eva);
 736                 goto out2;
 737         case T_SYSCALL80:
 738         case T_FAST_SYSCALL:
 739                 /*
 740                  * Ignore this trap generated from a spurious SIGTRAP.
 741                  *
 742                  * single stepping in / syscalls leads to spurious / SIGTRAP
 743                  * so ignore
 744                  *
 745                  * Haiku (c) 2007 Simon 'corecode' Schubert
 746                  */
 747                 goto out2;
 748         }
 749         /* NOTE(review): traps that break out of the switch reach here with i == 0 and ucode == 0, and p/lp are dereferenced unchecked -- confirm this is intended for kernel-mode traps. */
 750         /*
 751          * Translate fault for emulators (e.g. Linux)
 752          */
 753         if (*p->p_sysent->sv_transtrap)
 754                 i = (*p->p_sysent->sv_transtrap)(i, type);
 755
 756         gd->gd_cnt.v_trap++;
 757         trapsignal(lp, i, ucode);
 758
 759 #ifdef DEBUG
 760         if (type <= MAX_TRAP_MSG) {
 761                 uprintf("fatal process exception: %s",
 762                         trap_msg[type]);
 763                 if ((type == T_PAGEFLT) || (type == T_PROTFLT))
 764                         uprintf(", fault VA = 0x%lx", (u_long)eva);
 765                 uprintf("\n");
 766         }
 767 #endif
 768
 769 out2:
 770         ;
 771 #ifdef INVARIANTS
 772         KASSERT(crit_count == td->td_critcount,
 773                 ("trap: critical section count mismatch! %d/%d",
 774                 crit_count, td->td_critcount)); /* print the count that was compared, not td_pri */
 775         KASSERT(curstop == td->td_toks_stop,
 776                 ("trap: extra tokens held after trap! %ld/%ld",
 777                 curstop - &td->td_toks_base,
 778                 td->td_toks_stop - &td->td_toks_base));
 779 #endif
 780 }
 781
 782 int
 783 trap_pfault(struct trapframe *frame, int usermode, vm_offset_t eva)
 784 {
 785         vm_offset_t va;
 786         struct vmspace *vm = NULL;
 787         vm_map_t map = 0;
 788         int rv = 0;
 789         vm_prot_t ftype;
 790         thread_t td = curthread;
 791         struct lwp *lp = td->td_lwp;
 792         int fault_flags;
 793
 794         va = trunc_page(eva);
 795         if (usermode == FALSE) {
 796                 /*
 797                  * This is a fault on kernel virtual memory.
 798                  */
 799                 map = kernel_map;
 800         } else {
 801                 /*
 802                  * This is a fault on non-kernel virtual memory.
 803                  * vm is initialized above to NULL. If curproc is NULL
 804                  * or curproc->p_vmspace is NULL the fault is fatal.
 805                  */
 806                 if (lp != NULL)
 807                         vm = lp->lwp_vmspace;
 808
 809                 if (vm == NULL)
 810                         goto nogo;
 811
 812                 map = &vm->vm_map;
 813         }
 814
 815         if (frame->tf_err & PGEX_W)
 816                 ftype = VM_PROT_READ | VM_PROT_WRITE;
 817         else if (frame->tf_err & PGEX_I)
 818                 ftype = VM_PROT_EXECUTE;
 819         else
 820                 ftype = VM_PROT_READ;
 821
 822         if (map != kernel_map) {
 823                 /*
 824                  * Keep swapout from messing with us during this
 825                  *      critical time.
 826                  */
 827                 PHOLD(lp->lwp_proc);
 828
 829 #if 0
 830                 /*
 831                  * Grow the stack if necessary
 832                  */
 833                 /* grow_stack returns false only if va falls into
 834                  * a growable stack region and the stack growth
 835                  * fails.  It returns true if va was not within
 836                  * a growable stack region, or if the stack
 837                  * growth succeeded.
 838                  */
 839                 if (!grow_stack (map, va)) {
 840                         rv = KERN_FAILURE;
 841                         PRELE(lp->lwp_proc);
 842                         goto nogo;
 843                 }
 844 #endif
 845
 846                 fault_flags = 0;
 847                 if (usermode)
 848                         fault_flags |= VM_FAULT_BURST | VM_FAULT_USERMODE;
 849                 if (ftype & VM_PROT_WRITE)
 850                         fault_flags |= VM_FAULT_DIRTY;
 851                 else
 852                         fault_flags |= VM_FAULT_NORMAL;
 853                 rv = vm_fault(map, va, ftype, fault_flags);
 854
 855                 PRELE(lp->lwp_proc);
 856         } else {
 857                 /*
 858                  * Don't have to worry about process locking or stacks in the kernel.
 859                  */
 860                 rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
 861         }
 862
 863         if (rv == KERN_SUCCESS)
 864                 return (0);
 865 nogo:
 866         if (!usermode) {
 867                 if (td->td_gd->gd_intr_nesting_level == 0 &&
 868                     td->td_pcb->pcb_onfault) {
 869                         frame->tf_rip = (register_t)td->td_pcb->pcb_onfault;
 870                         return (0);
 871                 }
 872                 trap_fatal(frame, usermode, eva);
 873                 return (-1);
 874         }
 875
 876         /*
 877          * NOTE: on x86_64 we have a tf_addr field in the trapframe, no
 878          * kludge is needed to pass the fault address to signal handlers.
 879          */
 880         struct proc *p = td->td_proc;
 881         krateprintf(&segfltrate,
 882                     "seg-fault accessing address %p "
 883                     "rip=%p pid=%d p_comm=%s\n",
 884                     (void *)va,
 885                     (void *)frame->tf_rip, p->p_pid, p->p_comm);
 886         /* Debugger("seg-fault"); */
 887
 888         return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
 889 }
 890
 891 static void
 892 trap_fatal(struct trapframe *frame, int usermode, vm_offset_t eva)
 893 {
 894         int code, type, ss;
 895         long rsp;
 896
 897         code = frame->tf_xflags;
 898         type = frame->tf_trapno;
 899
 900         if (type <= MAX_TRAP_MSG) {
 901                 kprintf("\n\nFatal trap %d: %s while in %s mode\n",
 902                         type, trap_msg[type],
 903                         (usermode ? "user" : "kernel"));
 904         }
 905         /* two separate prints in case of a trap on an unmapped page */
 906         kprintf("cpuid = %d\n", mycpu->gd_cpuid);
 907         if (type == T_PAGEFLT) {
 908                 kprintf("fault virtual address  = %p\n", (void *)eva);
 909                 kprintf("fault code             = %s %s, %s\n",
 910                         usermode ? "user" : "supervisor",
 911                         code & PGEX_W ? "write" : "read",
 912                         code & PGEX_P ? "protection violation" : "page not present");
 913         }
 914         kprintf("instruction pointer    = 0x%lx:0x%lx\n",
 915                frame->tf_cs & 0xffff, frame->tf_rip);
 916         if (usermode) {
 917                 ss = frame->tf_ss & 0xffff;
 918                 rsp = frame->tf_rsp;
 919         } else {
 920                 ss = GSEL(GDATA_SEL, SEL_KPL);
 921                 rsp = (long)&frame->tf_rsp;
 922         }
 923         kprintf("stack pointer          = 0x%x:0x%lx\n", ss, rsp);
 924         kprintf("frame pointer          = 0x%x:0x%lx\n", ss, frame->tf_rbp);
 925         kprintf("processor eflags       = ");
 926         if (frame->tf_rflags & PSL_T)
 927                 kprintf("trace trap, ");
 928         if (frame->tf_rflags & PSL_I)
 929                 kprintf("interrupt enabled, ");
 930         if (frame->tf_rflags & PSL_NT)
 931                 kprintf("nested task, ");
 932         if (frame->tf_rflags & PSL_RF)
 933                 kprintf("resume, ");
 934 #if 0
 935         if (frame->tf_eflags & PSL_VM)
 936                 kprintf("vm86, ");
 937 #endif
 938         kprintf("IOPL = %jd\n", (intmax_t)((frame->tf_rflags & PSL_IOPL) >> 12));
 939         kprintf("current process                = ");
 940         if (curproc) {
 941                 kprintf("%lu (%s)\n",
 942                     (u_long)curproc->p_pid, curproc->p_comm ?
 943                     curproc->p_comm : "");
 944         } else {
 945                 kprintf("Idle\n");
 946         }
 947         kprintf("current thread          = pri %d ", curthread->td_pri);
 948         if (curthread->td_critcount)
 949                 kprintf("(CRIT)");
 950         kprintf("\n");
 951 /**
 952  *  XXX FIXME:
 953  *      we probably SHOULD have stopped the other CPUs before now!
 954  *      another CPU COULD have been touching cpl at this moment...
 955  */
 956         kprintf(" <- SMP: XXX");
 957         kprintf("\n");
 958
 959 #ifdef KDB
 960         if (kdb_trap(&psl))
 961                 return;
 962 #endif
 963 #ifdef DDB
 964         if ((debugger_on_panic || db_active) && kdb_trap(type, code, frame))
 965                 return;
 966 #endif
 967         kprintf("trap number            = %d\n", type);
 968         if (type <= MAX_TRAP_MSG)
 969                 panic("%s", trap_msg[type]);
 970         else
 971                 panic("unknown/reserved trap");
 972 }
 973
 974 /*
 975  * Double fault handler. Called when a fault occurs while writing
 976  * a frame for a trap/exception onto the stack. This usually occurs
 977  * when the stack overflows (such is the case with infinite recursion,
 978  * for example).
 979  *
 980  * XXX Note that the current PTD gets replaced by IdlePTD when the
 981  * task switch occurs. This means that the stack that was active at
 982  * the time of the double fault is not available at <kstack> unless
 983  * the machine was idle when the double fault occurred. The downside
 984  * of this is that "trace <ebp>" in ddb won't work.
 985  */
 986 void
 987 dblfault_handler(void)
 988 {
 989 #if 0 /* JG */
 990         struct mdglobaldata *gd = mdcpu;
 991 #endif
 992
 993         kprintf("\nFatal double fault:\n");
 994 #if 0 /* JG */
 995         kprintf("rip = 0x%lx\n", gd->gd_common_tss.tss_rip);
 996         kprintf("rsp = 0x%lx\n", gd->gd_common_tss.tss_rsp);
 997         kprintf("rbp = 0x%lx\n", gd->gd_common_tss.tss_rbp);
 998 #endif
 999         /* two separate prints in case of a trap on an unmapped page */
1000         kprintf("cpuid = %d\n", mycpu->gd_cpuid);
1001         panic("double fault");
1002 }
1003
1004 /*
1005  * syscall2 -   MP aware system call request C handler
1006  *
1007  * A system call is essentially treated as a trap except that the
1008  * MP lock is not held on entry or return.  We are responsible for
1009  * obtaining the MP lock if necessary and for handling ASTs
1010  * (e.g. a task switch) prior to return.
1011  */
1012 void
1013 syscall2(struct trapframe *frame)
1014 {
1015         struct thread *td = curthread;
1016         struct proc *p = td->td_proc;
1017         struct lwp *lp = td->td_lwp;
1018         struct sysent *callp;
1019         register_t orig_tf_rflags;
1020         int sticks;
1021         int error;
1022         int narg;
1023 #ifdef INVARIANTS
1024         int crit_count = td->td_critcount;
1025         lwkt_tokref_t curstop = td->td_toks_stop;
1026 #endif
1027         struct sysmsg sysmsg;
1028         union sysunion *argp;
1029         u_int code;
1030         const int regcnt = 6;
1031
1032         mycpu->gd_cnt.v_syscall++;
1033
1034         KTR_LOG(kernentry_syscall, lp->lwp_proc->p_pid, lp->lwp_tid,
1035                 frame->tf_rax);
1036
1037         userenter(td, p);       /* lazy raise our priority */
1038
1039         /*
1040          * Misc
1041          */
1042         sticks = (int)td->td_sticks;
1043         orig_tf_rflags = frame->tf_rflags;
1044
1045         /*
1046          * Virtual kernel intercept - if a VM context managed by a virtual
1047          * kernel issues a system call the virtual kernel handles it, not us.
1048          * Restore the virtual kernel context and return from its system
1049          * call.  The current frame is copied out to the virtual kernel.
1050          */
1051         if (__predict_false(lp->lwp_vkernel && lp->lwp_vkernel->ve)) {
1052                 vkernel_trap(lp, frame);
1053                 error = EJUSTRETURN;
1054                 callp = NULL;
1055                 code = 0;
1056                 goto out;
1057         }
1058
1059         /*
1060          * Get the system call parameters and account for time
1061          */
1062         lp->lwp_md.md_regs = frame;
1063         code = frame->tf_rax;
1064
1065         if (code >= p->p_sysent->sv_size)
1066                 code = SYS___nosys;
1067         argp = (union sysunion *)&frame->tf_rdi;
1068         callp = &p->p_sysent->sv_table[code];
1069
1070         /*
1071          * On x86_64 we get up to six arguments in registers. The rest are
1072          * on the stack. The first six members of 'struct trapframe' happen
1073          * to be the registers used to pass arguments, in exactly the right
1074          * order.
1075          *
1076          * Any arguments beyond available argument-passing registers must
1077          * be copyin()'d from the user stack.
1078          */
1079         narg = callp->sy_narg;
1080         if (__predict_false(narg > regcnt)) {
1081                 register_t *argsdst;
1082                 caddr_t params;
1083
1084                 argsdst = (register_t *)&sysmsg.extargs;
1085                 bcopy(argp, argsdst, sizeof(register_t) * regcnt);
1086                 params = (caddr_t)frame->tf_rsp + sizeof(register_t);
1087
1088                 KASSERT(params != NULL, ("copyin args with no params!"));
1089                 error = copyin(params, &argsdst[regcnt],
1090                                (narg - regcnt) * sizeof(register_t));
1091                 argp = (void *)argsdst;
1092                 if (error) {
1093 #ifdef KTRACE
1094                         if (KTRPOINT(td, KTR_SYSCALL)) {
1095                                 ktrsyscall(lp, code, narg, argp);
1096                         }
1097 #endif
1098                         goto bad;
1099                 }
1100         }
1101
1102 #ifdef KTRACE
1103         if (KTRPOINT(td, KTR_SYSCALL)) {
1104                 ktrsyscall(lp, code, narg, argp);
1105         }
1106 #endif
1107
1108         /*
1109          * Default return value is 0 (will be copied to %rax).  Double-value
1110          * returns use %rax and %rdx.  %rdx is left unchanged for system
1111          * calls which return only one result.
1112          */
1113         sysmsg.sysmsg_fds[0] = 0;
1114         sysmsg.sysmsg_fds[1] = frame->tf_rdx;
1115
1116         /*
1117          * The syscall might manipulate the trap frame. If it does it
1118          * will probably return EJUSTRETURN.
1119          */
1120         sysmsg.sysmsg_frame = frame;
1121
1122         STOPEVENT(p, S_SCE, narg);      /* MP aware */
1123
1124         /*
1125          * NOTE: All system calls run MPSAFE now.  The system call itself
1126          *       is responsible for getting the MP lock.
1127          */
1128         error = (*callp->sy_call)(&sysmsg, argp);
1129
1130 #if 0
1131         kprintf("system call %d returned %d\n", code, error);
1132 #endif
1133
1134 out:
1135         /*
1136          * MP SAFE (we may or may not have the MP lock at this point)
1137          */
1138         switch (error) {
1139         case 0:
1140                 /*
1141                  * Reinitialize proc pointer `p' as it may be different
1142                  * if this is a child returning from fork syscall.
1143                  */
1144                 p = curproc;
1145                 lp = curthread->td_lwp;
1146                 frame->tf_rax = sysmsg.sysmsg_fds[0];
1147                 frame->tf_rdx = sysmsg.sysmsg_fds[1];
1148                 frame->tf_rflags &= ~PSL_C;
1149                 break;
1150         case ERESTART:
1151                 /*
1152                  * Reconstruct pc, we know that 'syscall' is 2 bytes.
1153                  * We have to do a full context restore so that %r10
1154                  * (which was holding the value of %rcx) is restored for
1155                  * the next iteration.
1156                  */
1157                 frame->tf_rip -= frame->tf_err;
1158                 frame->tf_r10 = frame->tf_rcx;
1159                 break;
1160         case EJUSTRETURN:
1161                 break;
1162         case EASYNC:
1163                 panic("Unexpected EASYNC return value (for now)");
1164         default:
1165 bad:
1166                 if (p->p_sysent->sv_errsize) {
1167                         if (error >= p->p_sysent->sv_errsize)
1168                                 error = -1;     /* XXX */
1169                         else
1170                                 error = p->p_sysent->sv_errtbl[error];
1171                 }
1172                 frame->tf_rax = error;
1173                 frame->tf_rflags |= PSL_C;
1174                 break;
1175         }
1176
1177         /*
1178          * Traced syscall.  trapsignal() is not MP aware.
1179          */
1180         if (orig_tf_rflags & PSL_T) {
1181                 frame->tf_rflags &= ~PSL_T;
1182                 trapsignal(lp, SIGTRAP, 0);
1183         }
1184
1185         /*
1186          * Handle reschedule and other end-of-syscall issues
1187          */
1188         userret(lp, frame, sticks);
1189
1190 #ifdef KTRACE
1191         if (KTRPOINT(td, KTR_SYSRET)) {
1192                 ktrsysret(lp, code, error, sysmsg.sysmsg_result);
1193         }
1194 #endif
1195
1196         /*
1197          * This works because errno is findable through the
1198          * register set.  If we ever support an emulation where this
1199          * is not the case, this code will need to be revisited.
1200          */
1201         STOPEVENT(p, S_SCX, code);
1202
1203         userexit(lp);
1204         KTR_LOG(kernentry_syscall_ret, lp->lwp_proc->p_pid, lp->lwp_tid, error);
1205 #ifdef INVARIANTS
1206         KASSERT(&td->td_toks_base == td->td_toks_stop,
1207                 ("syscall: critical section count mismatch! %d/%d",
1208                 crit_count, td->td_pri));
1209         KASSERT(curstop == td->td_toks_stop,
1210                 ("syscall: extra tokens held after trap! %ld",
1211                 td->td_toks_stop - &td->td_toks_base));
1212 #endif
1213 }
1214
1215 /*
1216  * Handles the syscall() and __syscall() API
1217  */
1218 void xsyscall(struct sysmsg *sysmsg, struct nosys_args *uap);
1219
1220 int
1221 sys_xsyscall(struct sysmsg *sysmsg, const struct nosys_args *uap)
1222 {
1223         struct trapframe *frame;
1224         struct sysent *callp;
1225         union sysunion *argp;
1226         struct thread *td;
1227         const int regcnt = 5;   /* number of args passed in registers */
1228         u_int code;
1229         int error;
1230         int narg;
1231
1232         td = curthread;
1233         frame = sysmsg->sysmsg_frame;
1234         code = (u_int)frame->tf_rdi;
1235         if (code >= td->td_proc->p_sysent->sv_size)
1236                 code = SYS___nosys;
1237         argp = (union sysunion *)(&frame->tf_rdi + 1);
1238         callp = &td->td_proc->p_sysent->sv_table[code];
1239         narg = callp->sy_narg;
1240
1241         /*
1242          * On x86_64 we get up to six arguments in registers.  The rest are
1243          * on the stack.  However, for syscall() and __syscall() the syscall
1244          * number is inserted as the first argument, so the limit is reduced
1245          * by one to five.
1246          */
1247         if (__predict_false(narg > regcnt)) {
1248                 register_t *argsdst;
1249                 caddr_t params;
1250
1251                 argsdst = (register_t *)&sysmsg->extargs;
1252                 bcopy(argp, argsdst, sizeof(register_t) * regcnt);
1253                 params = (caddr_t)frame->tf_rsp + sizeof(register_t);
1254                 error = copyin(params, &argsdst[regcnt],
1255                                (narg - regcnt) * sizeof(register_t));
1256                 argp = (void *)argsdst;
1257                 if (error)
1258                         return error;
1259         }
1260
1261 #ifdef KTRACE
1262         if (KTRPOINTP(td->td_proc, td, KTR_SYSCALL)) {
1263                 ktrsyscall(td->td_lwp, code, narg, argp);
1264         }
1265 #endif
1266
1267         error = (*callp->sy_call)(sysmsg, argp);
1268
1269 #ifdef KTRACE
1270         if (KTRPOINTP(td->td_proc, td, KTR_SYSRET)) {
1271                 ktrsysret(td->td_lwp, code, error, sysmsg->sysmsg_result);
1272         }
1273 #endif
1274
1275         return error;
1276 }
1277
1278 /*
1279  * NOTE: mplock not held at any point
1280  */
1281 void
1282 fork_return(struct lwp *lp, struct trapframe *frame)
1283 {
1284         frame->tf_rax = 0;              /* Child returns zero */
1285         frame->tf_rflags &= ~PSL_C;     /* success */
1286         frame->tf_rdx = 1;
1287
1288         generic_lwp_return(lp, frame);
1289         KTR_LOG(kernentry_fork_ret, lp->lwp_proc->p_pid, lp->lwp_tid);
1290 }
1291
1292 /*
1293  * Simplified back end of syscall(), used when returning from fork()
1294  * directly into user mode.
1295  *
1296  * This code will return back into the fork trampoline code which then
1297  * runs doreti.
1298  *
1299  * NOTE: The mplock is not held at any point.
1300  */
1301 void
1302 generic_lwp_return(struct lwp *lp, struct trapframe *frame)
1303 {
1304         struct proc *p = lp->lwp_proc;
1305
1306         /*
1307          * Check for exit-race.  If one lwp exits the process concurrent with
1308          * another lwp creating a new thread, the two operations may cross
1309          * each other resulting in the newly-created lwp not receiving a
1310          * KILL signal.
1311          */
1312         if (p->p_flags & P_WEXIT) {
1313                 lwpsignal(p, lp, SIGKILL);
1314         }
1315
1316         /*
1317          * Newly forked processes are given a kernel priority.  We have to
1318          * adjust the priority to a normal user priority and fake entry
1319          * into the kernel (call userenter()) to install a passive release
1320          * function just in case userret() decides to stop the process.  This
1321          * can occur when ^Z races a fork.  If we do not install the passive
1322          * release function the current process designation will not be
1323          * released when the thread goes to sleep.
1324          */
1325         lwkt_setpri_self(TDPRI_USER_NORM);
1326         userenter(lp->lwp_thread, p);
1327         userret(lp, frame, 0);
1328 #ifdef KTRACE
1329         if (KTRPOINT(lp->lwp_thread, KTR_SYSRET))
1330                 ktrsysret(lp, SYS_fork, 0, 0);
1331 #endif
1332         lp->lwp_flags |= LWP_PASSIVE_ACQ;
1333         userexit(lp);
1334         lp->lwp_flags &= ~LWP_PASSIVE_ACQ;
1335 }
1336
1337 /*
1338  * doreti has turned into this.  The frame is directly on the stack.  We
1339  * pull everything else we need (fpu and tls context) from the current
1340  * thread.
1341  *
1342  * Note on fpu interactions: In a virtual kernel, the fpu context for
1343  * an emulated user mode process is not shared with the virtual kernel's
1344  * fpu context, so we only have to 'stack' fpu contexts within the virtual
1345  * kernel itself, and not even then since the signal() contexts that we care
1346  * about save and restore the FPU state (I think anyhow).
1347  *
1348  * vmspace_ctl() returns an error only if it had problems instaling the
1349  * context we supplied or problems copying data to/from our VM space.
1350  */
1351 void
1352 go_user(struct intrframe *frame)
1353 {
1354         struct trapframe *tf = (void *)&frame->if_rdi;
1355         globaldata_t gd;
1356         int r;
1357         void *id;
1358
1359         /*
1360          * Interrupts may be disabled on entry, make sure all signals
1361          * can be received before beginning our loop.
1362          */
1363         sigsetmask(0);
1364
1365         /*
1366          * Switch to the current simulated user process, then call
1367          * user_trap() when we break out of it (usually due to a signal).
1368          */
1369         for (;;) {
1370 #if 1
1371                 /*
1372                  * Always make the FPU state correct.  This should generally
1373                  * be faster because the cost of taking a #NM fault through
1374                  * the vkernel to the real kernel is astronomical.
1375                  */
1376                 crit_enter();
1377                 tf->tf_xflags &= ~PGEX_FPFAULT;
1378                 if (mdcpu->gd_npxthread != curthread) {
1379                         if (mdcpu->gd_npxthread)
1380                                 npxsave(mdcpu->gd_npxthread->td_savefpu);
1381                         npxdna(tf);
1382                 }
1383 #else
1384                 /*
1385                  * Tell the real kernel whether it is ok to use the FP
1386                  * unit or not, allowing us to take a T_DNA exception
1387                  * if the context tries to use the FP.
1388                  */
1389                 if (mdcpu->gd_npxthread == curthread) {
1390                         tf->tf_xflags &= ~PGEX_FPFAULT;
1391                 } else {
1392                         tf->tf_xflags |= PGEX_FPFAULT;
1393                 }
1394 #endif
1395
1396                 /*
1397                  * Run emulated user process context.  This call interlocks
1398                  * with new mailbox signals.
1399                  *
1400                  * Set PGEX_U unconditionally, indicating a user frame (the
1401                  * bit is normally set only by T_PAGEFLT).
1402                  */
1403                 id = &curproc->p_vmspace->vm_pmap;
1404
1405                 /*
1406                  * The GDF_VIRTUSER hack helps statclock() figure out who
1407                  * the tick belongs to.
1408                  */
1409                 gd = mycpu;
1410                 gd->gd_flags |= GDF_VIRTUSER;
1411                 r = vmspace_ctl(id, VMSPACE_CTL_RUN, tf,
1412                                 &curthread->td_savevext);
1413
1414                 frame->if_xflags |= PGEX_U;
1415
1416                 /*
1417                  * Immediately save the user FPU state.  The vkernel is a
1418                  * user program and libraries like libc will use the FP
1419                  * unit.
1420                  */
1421                 if (mdcpu->gd_npxthread == curthread) {
1422                         npxsave(mdcpu->gd_npxthread->td_savefpu);
1423                 }
1424                 crit_exit();
1425                 gd->gd_flags &= ~GDF_VIRTUSER;
1426 #if 0
1427                 kprintf("GO USER %d trap %ld EVA %08lx RIP %08lx RSP %08lx XFLAGS %02lx/%02lx\n",
1428                         r, tf->tf_trapno, tf->tf_addr, tf->tf_rip, tf->tf_rsp,
1429                         tf->tf_xflags, frame->if_xflags);
1430 #endif
1431                 if (r < 0) {
1432                         if (errno != EINTR)
1433                                 panic("vmspace_ctl failed error %d", errno);
1434                 } else {
1435                         if (tf->tf_trapno) {
1436                                 user_trap(tf);
1437                         }
1438                 }
1439                 if (mycpu->gd_reqflags & RQF_AST_MASK) {
1440                         tf->tf_trapno = T_ASTFLT;
1441                         user_trap(tf);
1442                 }
1443                 tf->tf_trapno = 0;
1444         }
1445 }
1446
1447 /*
1448  * If PGEX_FPFAULT is set then set FP_VIRTFP in the PCB to force a T_DNA
1449  * fault (which is then passed back to the virtual kernel) if an attempt is
1450  * made to use the FP unit.
1451  *
1452  * XXX this is a fairly big hack.
1453  */
1454 void
1455 set_vkernel_fp(struct trapframe *frame)
1456 {
1457         struct thread *td = curthread;
1458
1459         if (frame->tf_xflags & PGEX_FPFAULT) {
1460                 td->td_pcb->pcb_flags |= FP_VIRTFP;
1461                 if (mdcpu->gd_npxthread == td)
1462                         npxexit();
1463         } else {
1464                 td->td_pcb->pcb_flags &= ~FP_VIRTFP;
1465         }
1466 }
1467
1468 /*
1469  * Called from vkernel_trap() to fixup the vkernel's syscall
1470  * frame for vmspace_ctl() return.
1471  */
1472 void
1473 cpu_vkernel_trap(struct trapframe *frame, int error)
1474 {
1475         frame->tf_rax = error;
1476         if (error)
1477                 frame->tf_rflags |= PSL_C;
1478         else
1479                 frame->tf_rflags &= ~PSL_C;
1480 }