sys/platform/vkernel64/x86_64/trap.c

   1 /*-
   2  * Copyright (C) 1994, David Greenman
   3  * Copyright (c) 1990, 1993
   4  *      The Regents of the University of California.  All rights reserved.
   5  *
   6  * This code is derived from software contributed to Berkeley by
   7  * the University of Utah, and William Jolitz.
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  * 1. Redistributions of source code must retain the above copyright
  13  *    notice, this list of conditions and the following disclaimer.
  14  * 2. Redistributions in binary form must reproduce the above copyright
  15  *    notice, this list of conditions and the following disclaimer in the
  16  *    documentation and/or other materials provided with the distribution.
  17  * 3. All advertising materials mentioning features or use of this software
  18  *    must display the following acknowledgement:
  19  *      This product includes software developed by the University of
  20  *      California, Berkeley and its contributors.
  21  * 4. Neither the name of the University nor the names of its contributors
  22  *    may be used to endorse or promote products derived from this software
  23  *    without specific prior written permission.
  24  *
  25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  35  * SUCH DAMAGE.
  36  *
  37  *      from: @(#)trap.c        7.4 (Berkeley) 5/13/91
  38  * $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $
  39  */
  40
  41 /*
  42  * x86_64 Trap and System call handling
  43  */
  44
  45 #include "use_isa.h"
  46
  47 #include "opt_ddb.h"
  48 #include "opt_ktrace.h"
  49
  50 #include <sys/param.h>
  51 #include <sys/systm.h>
  52 #include <sys/proc.h>
  53 #include <sys/pioctl.h>
  54 #include <sys/kernel.h>
  55 #include <sys/resourcevar.h>
  56 #include <sys/signalvar.h>
  57 #include <sys/signal2.h>
  58 #include <sys/syscall.h>
  59 #include <sys/sysctl.h>
  60 #include <sys/sysent.h>
  61 #include <sys/uio.h>
  62 #include <sys/vmmeter.h>
  63 #include <sys/malloc.h>
  64 #ifdef KTRACE
  65 #include <sys/ktrace.h>
  66 #endif
  67 #include <sys/ktr.h>
  68 #include <sys/vkernel.h>
  69 #include <sys/sysproto.h>
  70 #include <sys/sysunion.h>
  71 #include <sys/vmspace.h>
  72
  73 #include <vm/vm.h>
  74 #include <vm/vm_param.h>
  75 #include <sys/lock.h>
  76 #include <vm/pmap.h>
  77 #include <vm/vm_kern.h>
  78 #include <vm/vm_map.h>
  79 #include <vm/vm_page.h>
  80 #include <vm/vm_extern.h>
  81
  82 #include <machine/cpu.h>
  83 #include <machine/md_var.h>
  84 #include <machine/pcb.h>
  85 #include <machine/smp.h>
  86 #include <machine/tss.h>
  87 #include <machine/globaldata.h>
  88
  89 #include <ddb/ddb.h>
  90
  91 #include <sys/msgport2.h>
  92 #include <sys/thread2.h>
  93 #include <sys/mplock2.h>
   94
      /*
       * Optional math-emulation hook.  user_trap() tests it for NULL in its
       * T_DNA case and calls it only when npxdna() did not resolve the
       * fault.  No assignment to it is visible in this part of the file.
       */
   95 int (*pmath_emulate) (struct trapframe *);
   96
      /*
       * Forward declarations.  trap_pfault()'s int argument is its usermode
       * flag and its last argument is the faulting address.  trap_fatal() is
       * called with TRUE from user_trap() and FALSE from kern_trap() in the
       * same argument position.
       */
   97 static int trap_pfault (struct trapframe *, int, vm_offset_t);
   98 static void trap_fatal (struct trapframe *, int, vm_offset_t);
   99 void dblfault_handler (void);
      /* Not referenced in the visible part of this file. */
  100 extern int vmm_enabled;
  101
      /* Disabled: only referenced from #if 0 code in kern_trap(). */
  102 #if 0
  103 extern inthand_t IDTVEC(syscall);
  104 #endif
  105
      /*
       * Human-readable trap names, indexed directly by trap type (T_*).
       * MAX_TRAP_MSG is the highest valid index; the DEBUG-only uprintf()
       * paths in user_trap() and kern_trap() check type <= MAX_TRAP_MSG
       * before indexing.  Slots for unused trap numbers hold "".
       */
  106 #define MAX_TRAP_MSG            30
  107 static char *trap_msg[] = {
  108         "",                                     /*  0 unused */
  109         "privileged instruction fault",         /*  1 T_PRIVINFLT */
  110         "",                                     /*  2 unused */
  111         "breakpoint instruction fault",         /*  3 T_BPTFLT */
  112         "",                                     /*  4 unused */
  113         "",                                     /*  5 unused */
  114         "arithmetic trap",                      /*  6 T_ARITHTRAP */
  115         "system forced exception",              /*  7 T_ASTFLT */
  116         "",                                     /*  8 unused */
  117         "general protection fault",             /*  9 T_PROTFLT */
  118         "trace trap",                           /* 10 T_TRCTRAP */
  119         "",                                     /* 11 unused */
  120         "page fault",                           /* 12 T_PAGEFLT */
  121         "",                                     /* 13 unused */
  122         "alignment fault",                      /* 14 T_ALIGNFLT */
  123         "",                                     /* 15 unused */
  124         "",                                     /* 16 unused */
  125         "",                                     /* 17 unused */
  126         "integer divide fault",                 /* 18 T_DIVIDE */
  127         "non-maskable interrupt trap",          /* 19 T_NMI */
  128         "overflow trap",                        /* 20 T_OFLOW */
  129         "FPU bounds check fault",               /* 21 T_BOUND */
  130         "FPU device not available",             /* 22 T_DNA */
  131         "double fault",                         /* 23 T_DOUBLEFLT */
  132         "FPU operand fetch fault",              /* 24 T_FPOPFLT */
  133         "invalid TSS fault",                    /* 25 T_TSSFLT */
  134         "segment not present fault",            /* 26 T_SEGNPFLT */
  135         "stack fault",                          /* 27 T_STKFLT */
  136         "machine check trap",                   /* 28 T_MCHK */
  137         "SIMD floating-point exception",        /* 29 T_XMMFLT */
  138         "reserved (unknown) fault",             /* 30 T_RESERVED */
  139 };
  140
      /*
       * Tunables and counters exported under the _machdep sysctl node.
       *
       * ddb_on_nmi   - (DDB kernels only) user_trap() enters the debugger
       *                via kdb_trap() on an NMI for which isa_nmi() returns 0.
       * panic_on_nmi - user_trap() panics when isa_nmi() returns non-zero.
       * fast_release / slow_release - passive-release statistics; nothing
       *                in the visible part of this file updates them.
       */
  141 #ifdef DDB
  142 static int ddb_on_nmi = 1;
  143 SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW,
  144         &ddb_on_nmi, 0, "Go to DDB on NMI");
  145 #endif
  146 static int panic_on_nmi = 1;
  147 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
  148         &panic_on_nmi, 0, "Panic on NMI");
  149 static int fast_release;
  150 SYSCTL_INT(_machdep, OID_AUTO, fast_release, CTLFLAG_RW,
  151         &fast_release, 0, "Passive Release was optimal");
  152 static int slow_release;
  153 SYSCTL_INT(_machdep, OID_AUTO, slow_release, CTLFLAG_RW,
  154         &slow_release, 0, "Passive Release was nonoptimal");
 155
  156 /*
  157  * Passively intercepts the thread switch function to increase
  158  * the thread priority from a user priority to a kernel priority, reducing
  159  * syscall and trap overhead for the case where no switch occurs.
  160  *
  161  * Synchronizes td_ucred with p_ucred.  This is used by system calls,
  162  * signal handling, faults, AST traps, and anything else that enters the
  163  * kernel from userland and provides the kernel with a stable read-only
  164  * copy of the process ucred.
       *
       * curtd - thread entering the kernel; its td_release hook and td_ucred
       *         are updated.
       * curp  - process whose p_ucred is the reference credential.
  165  */
  166 static __inline void
  167 userenter(struct thread *curtd, struct proc *curp)
  168 {
  169         struct ucred *ocred;
  170         struct ucred *ncred;
  171
              /* Arm the passive-release hook; userexit() recovers from it. */
  172         curtd->td_release = lwkt_passive_release;
  173
              /*
               * Only touch reference counts when the cached credential is
               * stale.  The new reference is taken and installed before the
               * old one (if any) is released.
               */
  174         if (curtd->td_ucred != curp->p_ucred) {
  175                 ncred = crhold(curp->p_ucred);
  176                 ocred = curtd->td_ucred;
  177                 curtd->td_ucred = ncred;
  178                 if (ocred)
  179                         crfree(ocred);
  180         }
  181 }
 182
  183 /*
  184  * Handle signals, profiling, and other AST's and/or tasks that
  185  * must be completed before we can return to or try to return to userland.
  186  *
  187  * Note that td_sticks is a 64 bit quantity, but there's no point doing 64
  188  * arithmetic on the delta calculation so the absolute tick values are
  189  * truncated to an integer.
       *
       * lp     - lwp that is about to return to user mode.
       * frame  - trap frame; only tf_rip is read here (profiling address).
       * sticks - truncated td_sticks value sampled by the caller on kernel
       *          entry, used to compute the system-time delta.
       *
       * Every check below that does work jumps back to "recheck", so the
       * function only returns once a full pass finds nothing left to do.
  190  */
  191 static void
  192 userret(struct lwp *lp, struct trapframe *frame, int sticks)
  193 {
  194         struct proc *p = lp->lwp_proc;
  195         int sig;
  196         int ptok;
  197
  198         /*
  199          * Charge system time if profiling.  Note: times are in microseconds.
  200          * This may do a copyout and block, so do it first even though it
  201          * means some system time will be charged as user time.
  202          */
  203         if (p->p_flags & P_PROFIL) {
  204                 addupc_task(p, frame->tf_rip,
  205                         (u_int)((int)lp->lwp_thread->td_sticks - sticks));
  206         }
  207
  208 recheck:
  209         /*
  210          * Specific on-return-to-usermode checks (LWP_MP_WEXIT,
  211          * LWP_MP_VNLRU, etc).
  212          */
  213         if (lp->lwp_mpflags & LWP_MP_URETMASK)
  214                 lwpuserret(lp);
  215
  216         /*
  217          * Block here if we are in a stopped state.
  218          */
  219         if (STOPLWP(p, lp)) {
  220                 lwkt_gettoken(&p->p_token);
  221                 tstop();
  222                 lwkt_reltoken(&p->p_token);
  223                 goto recheck;
  224         }
  225
  226         /*
  227          * Convert deferred timer events into signals: each of P_SIGVTALRM
  228          * and P_SIGPROF that is set is cleared and posted via ksignal()
               * while holding the process token, then the checks restart.
  229          */
  230         if (p->p_flags & (P_SIGVTALRM | P_SIGPROF)) {
  231                 lwkt_gettoken(&p->p_token);
  232                 if (p->p_flags & P_SIGVTALRM) {
  233                         p->p_flags &= ~P_SIGVTALRM;
  234                         ksignal(p, SIGVTALRM);
  235                 }
  236                 if (p->p_flags & P_SIGPROF) {
  237                         p->p_flags &= ~P_SIGPROF;
  238                         ksignal(p, SIGPROF);
  239                 }
  240                 lwkt_reltoken(&p->p_token);
  241                 goto recheck;
  242         }
  243
  244         /*
  245          * Post any pending signals
  246          *
  247          * WARNING!  postsig() can exit and not return.
  248          */
  249         if ((sig = CURSIG_LCK_TRACE(lp, &ptok)) != 0) {
  250                 postsig(sig, ptok);
  251                 goto recheck;
  252         }
  253
  254         /*
  255          * block here if we are swapped out, but still process signals
  256          * (such as SIGKILL).  proc0 (the swapin scheduler) is already
  257          * aware of our situation, we do not have to wake it up.
               *
               * P_SWAPWAIT is set before swapin_request() and re-tested
               * afterwards; we only sleep if it is still set at that point.
  258          */
  259         if (p->p_flags & P_SWAPPEDOUT) {
  260                 lwkt_gettoken(&p->p_token);
  261                 get_mplock();
  262                 p->p_flags |= P_SWAPWAIT;
  263                 swapin_request();
  264                 if (p->p_flags & P_SWAPWAIT)
  265                         tsleep(p, PCATCH, "SWOUT", 0);
  266                 p->p_flags &= ~P_SWAPWAIT;
  267                 rel_mplock();
  268                 lwkt_reltoken(&p->p_token);
  269                 goto recheck;
  270         }
  271
  272         /*
  273          * In a multi-threaded program it is possible for a thread to change
  274          * signal state during a system call which temporarily changes the
  275          * signal mask.  In this case postsig() might not be run and we
  276          * have to restore the mask ourselves.
  277          */
  278         if (lp->lwp_flags & LWP_OLDMASK) {
  279                 lp->lwp_flags &= ~LWP_OLDMASK;
  280                 lp->lwp_sigmask = lp->lwp_oldsigmask;
  281                 goto recheck;
  282         }
  283 }
 284
  285 /*
  286  * Cleanup from userenter and any passive release that might have occurred.
  287  * We must reclaim the current-process designation before we can return
  288  * to usermode.  We also handle both LWKT and USER reschedule requests.
       *
       * lp - lwp returning to user mode; its thread is lp->lwp_thread and
       *      the scheduler hooks come from lp->lwp_proc->p_usched.
  289  */
  290 static __inline void
  291 userexit(struct lwp *lp)
  292 {
  293         struct thread *td = lp->lwp_thread;
  294         /* globaldata_t gd = td->td_gd; */
  295
  296         /*
  297          * Handle stop requests at kernel priority.  Any requests queued
  298          * after this loop will generate another AST.
  299          */
  300         while (STOPLWP(lp->lwp_proc, lp)) {
  301                 lwkt_gettoken(&lp->lwp_proc->p_token);
  302                 tstop();
  303                 lwkt_reltoken(&lp->lwp_proc->p_token);
  304         }
  305
  306         /*
  307          * Reduce our priority in preparation for a return to userland.  If
  308          * our passive release function was still in place, our priority was
  309          * never raised and does not need to be reduced.
  310          */
  311         lwkt_passive_recover(td);
  312
  313         /*
  314          * Become the current user scheduled process if we aren't already,
  315          * and deal with reschedule requests and other factors.
  316          */
  317         lp->lwp_proc->p_usched->acquire_curproc(lp);
  318         /* WARNING: we may have migrated cpu's */
  319         /* gd = td->td_gd; */
  320 }
  321
      /*
       * KTR trace points for kernel entry and exit.  KTR_KERNENTRY falls
       * back to KTR_ALL when it has not been defined elsewhere.  In the
       * visible part of this file only kernentry_trap and
       * kernentry_trap_ret are logged (both by user_trap()); the syscall,
       * syscall_ret and fork_ret points are declared here for code that is
       * not shown.
       */
  322 #if !defined(KTR_KERNENTRY)
  323 #define KTR_KERNENTRY   KTR_ALL
  324 #endif
  325 KTR_INFO_MASTER(kernentry);
  326 KTR_INFO(KTR_KERNENTRY, kernentry, trap, 0,
  327          "TRAP(pid %hd, tid %hd, trapno %ld, eva %lu)",
  328          pid_t pid, lwpid_t tid,  register_t trapno, vm_offset_t eva);
  329 KTR_INFO(KTR_KERNENTRY, kernentry, trap_ret, 0, "TRAP_RET(pid %hd, tid %hd)",
  330          pid_t pid, lwpid_t tid);
  331 KTR_INFO(KTR_KERNENTRY, kernentry, syscall, 0, "SYSC(pid %hd, tid %hd, nr %ld)",
  332          pid_t pid, lwpid_t tid,  register_t trapno);
  333 KTR_INFO(KTR_KERNENTRY, kernentry, syscall_ret, 0, "SYSRET(pid %hd, tid %hd, err %d)",
  334          pid_t pid, lwpid_t tid,  int err);
  335 KTR_INFO(KTR_KERNENTRY, kernentry, fork_ret, 0, "FORKRET(pid %hd, tid %hd)",
  336          pid_t pid, lwpid_t tid);
 337
 338 /*
 339  * Exception, fault, and trap interface to the kernel.
 340  * This common code is called from assembly language IDT gate entry
 341  * routines that prepare a suitable stack frame, and restore this
 342  * frame after the exception has been processed.
 343  *
 344  * This function is also called from doreti in an interlock to handle ASTs.
 345  * For example:  hardwareint->INTROUTINE->(set ast)->doreti->trap
 346  *
 347  * NOTE!  We have to retrieve the fault address prior to obtaining the
 348  * MP lock because get_mplock() may switch out.  YYY cr2 really ought
 349  * to be retrieved by the assembly code, not here.
 350  *
 351  * XXX gd_trap_nesting_level currently prevents lwkt_switch() from panicing
 352  * if an attempt is made to switch from a fast interrupt or IPI.  This is
 353  * necessary to properly take fatal kernel traps on SMP machines if
 354  * get_mplock() has to block.
 355  */
 356
 357 void
 358 user_trap(struct trapframe *frame)
 359 {
 360         struct globaldata *gd = mycpu;
 361         struct thread *td = gd->gd_curthread;
 362         struct lwp *lp = td->td_lwp;
 363         struct proc *p;
 364         int sticks = 0;
 365         int i = 0, ucode = 0, type, code;
 366 #ifdef INVARIANTS
 367         int crit_count = td->td_critcount;
 368         lwkt_tokref_t curstop = td->td_toks_stop;
 369 #endif
 370         vm_offset_t eva;
 371
 372         p = td->td_proc;
 373
 374         if (frame->tf_trapno == T_PAGEFLT)
 375                 eva = frame->tf_addr;
 376         else
 377                 eva = 0;
 378 #if 0
 379         kprintf("USER_TRAP AT %08lx xflags %ld trapno %ld eva %08lx\n",
 380                 frame->tf_rip, frame->tf_xflags, frame->tf_trapno, eva);
 381 #endif
 382
 383         /*
 384          * Everything coming from user mode runs through user_trap,
 385          * including system calls.
 386          */
 387         if (frame->tf_trapno == T_FAST_SYSCALL) {
 388                 syscall2(frame);
 389                 return;
 390         }
 391
 392         KTR_LOG(kernentry_trap, lp->lwp_proc->p_pid, lp->lwp_tid,
 393                 frame->tf_trapno, eva);
 394
 395 #ifdef DDB
 396         if (db_active) {
 397                 eva = (frame->tf_trapno == T_PAGEFLT ? rcr2() : 0);
 398                 ++gd->gd_trap_nesting_level;
 399                 trap_fatal(frame, TRUE, eva);
 400                 --gd->gd_trap_nesting_level;
 401                 goto out2;
 402         }
 403 #endif
 404
 405         type = frame->tf_trapno;
 406         code = frame->tf_err;
 407
 408         userenter(td, p);
 409
 410         sticks = (int)td->td_sticks;
 411         lp->lwp_md.md_regs = frame;
 412
 413         switch (type) {
 414         case T_PRIVINFLT:       /* privileged instruction fault */
 415                 i = SIGILL;
 416                 ucode = ILL_PRVOPC;
 417                 break;
 418
 419         case T_BPTFLT:          /* bpt instruction fault */
 420         case T_TRCTRAP:         /* trace trap */
 421                 frame->tf_rflags &= ~PSL_T;
 422                 i = SIGTRAP;
 423                 ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
 424                 break;
 425
 426         case T_ARITHTRAP:       /* arithmetic trap */
 427                 ucode = code;
 428                 i = SIGFPE;
 429                 break;
 430
 431         case T_ASTFLT:          /* Allow process switch */
 432                 mycpu->gd_cnt.v_soft++;
 433                 if (mycpu->gd_reqflags & RQF_AST_OWEUPC) {
 434                         atomic_clear_int(&mycpu->gd_reqflags, RQF_AST_OWEUPC);
 435                         addupc_task(p, p->p_prof.pr_addr, p->p_prof.pr_ticks);
 436                 }
 437                 goto out;
 438
 439                 /*
 440                  * The following two traps can happen in
 441                  * vm86 mode, and, if so, we want to handle
 442                  * them specially.
 443                  */
 444         case T_PROTFLT:         /* general protection fault */
 445         case T_STKFLT:          /* stack fault */
 446 #if 0
 447                 if (frame->tf_eflags & PSL_VM) {
 448                         i = vm86_emulate((struct vm86frame *)frame);
 449                         if (i == 0)
 450                                 goto out;
 451                         break;
 452                 }
 453 #endif
 454                 /* FALL THROUGH */
 455
 456         case T_SEGNPFLT:        /* segment not present fault */
 457         case T_TSSFLT:          /* invalid TSS fault */
 458         case T_DOUBLEFLT:       /* double fault */
 459         default:
 460                 i = SIGBUS;
 461                 ucode = code + BUS_SEGM_FAULT ;
 462                 break;
 463
 464         case T_PAGEFLT:         /* page fault */
 465                 i = trap_pfault(frame, TRUE, eva);
 466                 if (i == -1 || i == 0)
 467                         goto out;
 468
 469
 470                 if (i == SIGSEGV)
 471                         ucode = SEGV_MAPERR;
 472                 else {
 473                         i = SIGSEGV;
 474                         ucode = SEGV_ACCERR;
 475                 }
 476                 break;
 477
 478         case T_DIVIDE:          /* integer divide fault */
 479                 ucode = FPE_INTDIV;
 480                 i = SIGFPE;
 481                 break;
 482
 483 #if NISA > 0
 484         case T_NMI:
 485                 /* machine/parity/power fail/"kitchen sink" faults */
 486                 if (isa_nmi(code) == 0) {
 487 #ifdef DDB
 488                         /*
 489                          * NMI can be hooked up to a pushbutton
 490                          * for debugging.
 491                          */
 492                         if (ddb_on_nmi) {
 493                                 kprintf ("NMI ... going to debugger\n");
 494                                 kdb_trap(type, 0, frame);
 495                         }
 496 #endif /* DDB */
 497                         goto out2;
 498                 } else if (panic_on_nmi)
 499                         panic("NMI indicates hardware failure");
 500                 break;
 501 #endif /* NISA > 0 */
 502
 503         case T_OFLOW:           /* integer overflow fault */
 504                 ucode = FPE_INTOVF;
 505                 i = SIGFPE;
 506                 break;
 507
 508         case T_BOUND:           /* bounds check fault */
 509                 ucode = FPE_FLTSUB;
 510                 i = SIGFPE;
 511                 break;
 512
 513         case T_DNA:
 514                 /*
 515                  * Virtual kernel intercept - pass the DNA exception
 516                  * to the (emulated) virtual kernel if it asked to handle
 517                  * it.  This occurs when the virtual kernel is holding
 518                  * onto the FP context for a different emulated
 519                  * process then the one currently running.
 520                  *
 521                  * We must still call npxdna() since we may have
 522                  * saved FP state that the (emulated) virtual kernel
 523                  * needs to hand over to a different emulated process.
 524                  */
 525                 if (lp->lwp_vkernel && lp->lwp_vkernel->ve &&
 526                     (td->td_pcb->pcb_flags & FP_VIRTFP)
 527                 ) {
 528                         npxdna(frame);
 529                         break;
 530                 }
 531
 532                 /*
 533                  * The kernel may have switched out the FP unit's
 534                  * state, causing the user process to take a fault
 535                  * when it tries to use the FP unit.  Restore the
 536                  * state here
 537                  */
 538                 if (npxdna(frame))
 539                         goto out;
 540                 if (!pmath_emulate) {
 541                         i = SIGFPE;
 542                         ucode = FPE_FPU_NP_TRAP;
 543                         break;
 544                 }
 545                 i = (*pmath_emulate)(frame);
 546                 if (i == 0) {
 547                         if (!(frame->tf_rflags & PSL_T))
 548                                 goto out2;
 549                         frame->tf_rflags &= ~PSL_T;
 550                         i = SIGTRAP;
 551                 }
 552                 /* else ucode = emulator_only_knows() XXX */
 553                 break;
 554
 555         case T_FPOPFLT:         /* FPU operand fetch fault */
 556                 ucode = T_FPOPFLT;
 557                 i = SIGILL;
 558                 break;
 559
 560         case T_XMMFLT:          /* SIMD floating-point exception */
 561                 ucode = 0; /* XXX */
 562                 i = SIGFPE;
 563                 break;
 564         }
 565
 566         /*
 567          * Virtual kernel intercept - if the fault is directly related to a
 568          * VM context managed by a virtual kernel then let the virtual kernel
 569          * handle it.
 570          */
 571         if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
 572                 vkernel_trap(lp, frame);
 573                 goto out;
 574         }
 575
 576         /*
 577          * Translate fault for emulators (e.g. Linux)
 578          */
 579         if (*p->p_sysent->sv_transtrap)
 580                 i = (*p->p_sysent->sv_transtrap)(i, type);
 581
 582         trapsignal(lp, i, ucode);
 583
 584 #ifdef DEBUG
 585         if (type <= MAX_TRAP_MSG) {
 586                 uprintf("fatal process exception: %s",
 587                         trap_msg[type]);
 588                 if ((type == T_PAGEFLT) || (type == T_PROTFLT))
 589                         uprintf(", fault VA = 0x%lx", (u_long)eva);
 590                 uprintf("\n");
 591         }
 592 #endif
 593
 594 out:
 595         userret(lp, frame, sticks);
 596         userexit(lp);
 597 out2:   ;
 598         KTR_LOG(kernentry_trap_ret, lp->lwp_proc->p_pid, lp->lwp_tid);
 599 #ifdef INVARIANTS
 600         KASSERT(crit_count == td->td_critcount,
 601                 ("trap: critical section count mismatch! %d/%d",
 602                 crit_count, td->td_pri));
 603         KASSERT(curstop == td->td_toks_stop,
 604                 ("trap: extra tokens held after trap! %ld/%ld",
 605                 curstop - &td->td_toks_base,
 606                 td->td_toks_stop - &td->td_toks_base));
 607 #endif
 608 }
 609
 610 void
 611 kern_trap(struct trapframe *frame)
 612 {
 613         struct globaldata *gd = mycpu;
 614         struct thread *td = gd->gd_curthread;
 615         struct lwp *lp;
 616         struct proc *p;
 617         int i = 0, ucode = 0, type, code;
 618 #ifdef INVARIANTS
 619         int crit_count = td->td_critcount;
 620         lwkt_tokref_t curstop = td->td_toks_stop;
 621 #endif
 622         vm_offset_t eva;
 623
 624         lp = td->td_lwp;
 625         p = td->td_proc;
 626
 627         if (frame->tf_trapno == T_PAGEFLT)
 628                 eva = frame->tf_addr;
 629         else
 630                 eva = 0;
 631
 632 #ifdef DDB
 633         if (db_active) {
 634                 ++gd->gd_trap_nesting_level;
 635                 trap_fatal(frame, FALSE, eva);
 636                 --gd->gd_trap_nesting_level;
 637                 goto out2;
 638         }
 639 #endif
 640
 641         type = frame->tf_trapno;
 642         code = frame->tf_err;
 643
 644 #if 0
 645 kernel_trap:
 646 #endif
 647         /* kernel trap */
 648
 649         switch (type) {
 650         case T_PAGEFLT:                 /* page fault */
 651                 trap_pfault(frame, FALSE, eva);
 652                 goto out2;
 653
 654         case T_DNA:
 655                 /*
 656                  * The kernel may be using npx for copying or other
 657                  * purposes.
 658                  */
 659                 panic("kernel NPX should not happen");
 660                 if (npxdna(frame))
 661                         goto out2;
 662                 break;
 663
 664         case T_PROTFLT:         /* general protection fault */
 665         case T_SEGNPFLT:        /* segment not present fault */
 666                 /*
 667                  * Invalid segment selectors and out of bounds
 668                  * %eip's and %esp's can be set up in user mode.
 669                  * This causes a fault in kernel mode when the
 670                  * kernel tries to return to user mode.  We want
 671                  * to get this fault so that we can fix the
 672                  * problem here and not have to check all the
 673                  * selectors and pointers when the user changes
 674                  * them.
 675                  */
 676                 if (mycpu->gd_intr_nesting_level == 0) {
 677                         if (td->td_pcb->pcb_onfault) {
 678                                 frame->tf_rip =
 679                                     (register_t)td->td_pcb->pcb_onfault;
 680                                 goto out2;
 681                         }
 682                 }
 683                 break;
 684
 685         case T_TSSFLT:
 686                 /*
 687                  * PSL_NT can be set in user mode and isn't cleared
 688                  * automatically when the kernel is entered.  This
 689                  * causes a TSS fault when the kernel attempts to
 690                  * `iret' because the TSS link is uninitialized.  We
 691                  * want to get this fault so that we can fix the
 692                  * problem here and not every time the kernel is
 693                  * entered.
 694                  */
 695                 if (frame->tf_rflags & PSL_NT) {
 696                         frame->tf_rflags &= ~PSL_NT;
 697                         goto out2;
 698                 }
 699                 break;
 700
 701         case T_TRCTRAP:  /* trace trap */
 702 #if 0
 703                 if (frame->tf_eip == (int)IDTVEC(syscall)) {
 704                         /*
 705                          * We've just entered system mode via the
 706                          * syscall lcall.  Continue single stepping
 707                          * silently until the syscall handler has
 708                          * saved the flags.
 709                          */
 710                         goto out2;
 711                 }
 712                 if (frame->tf_eip == (int)IDTVEC(syscall) + 1) {
 713                         /*
 714                          * The syscall handler has now saved the
 715                          * flags.  Stop single stepping it.
 716                          */
 717                         frame->tf_eflags &= ~PSL_T;
 718                         goto out2;
 719                 }
 720 #endif
 721 #if 0
 722                 /*
 723                  * Ignore debug register trace traps due to
 724                  * accesses in the user's address space, which
 725                  * can happen under several conditions such as
 726                  * if a user sets a watchpoint on a buffer and
 727                  * then passes that buffer to a system call.
 728                  * We still want to get TRCTRAPS for addresses
 729                  * in kernel space because that is useful when
 730                  * debugging the kernel.
 731                  */
 732                 if (user_dbreg_trap()) {
 733                         /*
 734                          * Reset breakpoint bits because the
 735                          * processor doesn't
 736                          */
 737                         load_dr6(rdr6() & 0xfffffff0);
 738                         goto out2;
 739                 }
 740 #endif
 741                 /*
 742                  * Fall through (TRCTRAP kernel mode, kernel address)
 743                  */
 744         case T_BPTFLT:
 745                 /*
 746                  * If DDB is enabled, let it handle the debugger trap.
 747                  * Otherwise, debugger traps "can't happen".
 748                  */
 749 #ifdef DDB
 750                 if (kdb_trap (type, 0, frame))
 751                         goto out2;
 752 #endif
 753                 break;
 754         case T_DIVIDE:
 755                 trap_fatal(frame, FALSE, eva);
 756                 goto out2;
 757         case T_NMI:
 758                 trap_fatal(frame, FALSE, eva);
 759                 goto out2;
 760         case T_SYSCALL80:
 761         case T_FAST_SYSCALL:
 762                 /*
 763                  * Ignore this trap generated from a spurious SIGTRAP.
 764                  *
 765                  * single stepping in / syscalls leads to spurious / SIGTRAP
 766                  * so ignore
 767                  *
 768                  * Haiku (c) 2007 Simon 'corecode' Schubert
 769                  */
 770                 goto out2;
 771         }
 772
 773         /*
 774          * Translate fault for emulators (e.g. Linux)
 775          */
 776         if (*p->p_sysent->sv_transtrap)
 777                 i = (*p->p_sysent->sv_transtrap)(i, type);
 778
 779         trapsignal(lp, i, ucode);
 780
 781 #ifdef DEBUG
 782         if (type <= MAX_TRAP_MSG) {
 783                 uprintf("fatal process exception: %s",
 784                         trap_msg[type]);
 785                 if ((type == T_PAGEFLT) || (type == T_PROTFLT))
 786                         uprintf(", fault VA = 0x%lx", (u_long)eva);
 787                 uprintf("\n");
 788         }
 789 #endif
 790
 791 out2:
 792         ;
 793 #ifdef INVARIANTS
 794         KASSERT(crit_count == td->td_critcount,
 795                 ("trap: critical section count mismatch! %d/%d",
 796                 crit_count, td->td_pri));
 797         KASSERT(curstop == td->td_toks_stop,
 798                 ("trap: extra tokens held after trap! %ld/%ld",
 799                 curstop - &td->td_toks_base,
 800                 td->td_toks_stop - &td->td_toks_base));
 801 #endif
 802 }
 803
 804 int
 805 trap_pfault(struct trapframe *frame, int usermode, vm_offset_t eva)
 806 {
 807         vm_offset_t va;
 808         struct vmspace *vm = NULL;
 809         vm_map_t map = 0;
 810         int rv = 0;
 811         vm_prot_t ftype;
 812         thread_t td = curthread;
 813         struct lwp *lp = td->td_lwp;
 814         int fault_flags;
 815
 816         va = trunc_page(eva);
 817         if (usermode == FALSE) {
 818                 /*
 819                  * This is a fault on kernel virtual memory.
 820                  */
 821                 map = &kernel_map;
 822         } else {
 823                 /*
 824                  * This is a fault on non-kernel virtual memory.
 825                  * vm is initialized above to NULL. If curproc is NULL
 826                  * or curproc->p_vmspace is NULL the fault is fatal.
 827                  */
 828                 if (lp != NULL)
 829                         vm = lp->lwp_vmspace;
 830
 831                 if (vm == NULL)
 832                         goto nogo;
 833
 834                 map = &vm->vm_map;
 835         }
 836
 837         if (frame->tf_err & PGEX_W)
 838                 ftype = VM_PROT_READ | VM_PROT_WRITE;
 839         else
 840                 ftype = VM_PROT_READ;
 841
 842         if (map != &kernel_map) {
 843                 /*
 844                  * Keep swapout from messing with us during this
 845                  *      critical time.
 846                  */
 847                 PHOLD(lp->lwp_proc);
 848
 849 #if 0
 850                 /*
 851                  * Grow the stack if necessary
 852                  */
 853                 /* grow_stack returns false only if va falls into
 854                  * a growable stack region and the stack growth
 855                  * fails.  It returns true if va was not within
 856                  * a growable stack region, or if the stack
 857                  * growth succeeded.
 858                  */
 859                 if (!grow_stack (map, va)) {
 860                         rv = KERN_FAILURE;
 861                         PRELE(lp->lwp_proc);
 862                         goto nogo;
 863                 }
 864 #endif
 865
 866                 fault_flags = 0;
 867                 if (usermode)
 868                         fault_flags |= VM_FAULT_BURST | VM_FAULT_USERMODE;
 869                 if (ftype & VM_PROT_WRITE)
 870                         fault_flags |= VM_FAULT_DIRTY;
 871                 else
 872                         fault_flags |= VM_FAULT_NORMAL;
 873                 rv = vm_fault(map, va, ftype, fault_flags);
 874
 875                 PRELE(lp->lwp_proc);
 876         } else {
 877                 /*
 878                  * Don't have to worry about process locking or stacks in the kernel.
 879                  */
 880                 rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
 881         }
 882
 883         if (rv == KERN_SUCCESS)
 884                 return (0);
 885 nogo:
 886         if (!usermode) {
 887                 if (td->td_gd->gd_intr_nesting_level == 0 &&
 888                     td->td_pcb->pcb_onfault) {
 889                         frame->tf_rip = (register_t)td->td_pcb->pcb_onfault;
 890                         return (0);
 891                 }
 892                 trap_fatal(frame, usermode, eva);
 893                 return (-1);
 894         }
 895
 896         /*
 897          * NOTE: on x86_64 we have a tf_addr field in the trapframe, no
 898          * kludge is needed to pass the fault address to signal handlers.
 899          */
 900         struct proc *p = td->td_proc;
 901         kprintf("seg-fault accessing address %p rip=%p pid=%d p_comm=%s\n",
 902                 (void *)va, (void *)frame->tf_rip, p->p_pid, p->p_comm);
 903         /* Debugger("seg-fault"); */
 904
 905         return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
 906 }
 907
 908 static void
 909 trap_fatal(struct trapframe *frame, int usermode, vm_offset_t eva)
 910 {
 911         int code, type, ss;
 912         long rsp;
 913
 914         code = frame->tf_xflags;
 915         type = frame->tf_trapno;
 916
 917         if (type <= MAX_TRAP_MSG) {
 918                 kprintf("\n\nFatal trap %d: %s while in %s mode\n",
 919                         type, trap_msg[type],
 920                         (usermode ? "user" : "kernel"));
 921         }
 922         /* two separate prints in case of a trap on an unmapped page */
 923         kprintf("cpuid = %d\n", mycpu->gd_cpuid);
 924         if (type == T_PAGEFLT) {
 925                 kprintf("fault virtual address  = %p\n", (void *)eva);
 926                 kprintf("fault code             = %s %s, %s\n",
 927                         usermode ? "user" : "supervisor",
 928                         code & PGEX_W ? "write" : "read",
 929                         code & PGEX_P ? "protection violation" : "page not present");
 930         }
 931         kprintf("instruction pointer    = 0x%lx:0x%lx\n",
 932                frame->tf_cs & 0xffff, frame->tf_rip);
 933         if (usermode) {
 934                 ss = frame->tf_ss & 0xffff;
 935                 rsp = frame->tf_rsp;
 936         } else {
 937                 ss = GSEL(GDATA_SEL, SEL_KPL);
 938                 rsp = (long)&frame->tf_rsp;
 939         }
 940         kprintf("stack pointer          = 0x%x:0x%lx\n", ss, rsp);
 941         kprintf("frame pointer          = 0x%x:0x%lx\n", ss, frame->tf_rbp);
 942         kprintf("processor eflags       = ");
 943         if (frame->tf_rflags & PSL_T)
 944                 kprintf("trace trap, ");
 945         if (frame->tf_rflags & PSL_I)
 946                 kprintf("interrupt enabled, ");
 947         if (frame->tf_rflags & PSL_NT)
 948                 kprintf("nested task, ");
 949         if (frame->tf_rflags & PSL_RF)
 950                 kprintf("resume, ");
 951 #if 0
 952         if (frame->tf_eflags & PSL_VM)
 953                 kprintf("vm86, ");
 954 #endif
 955         kprintf("IOPL = %jd\n", (intmax_t)((frame->tf_rflags & PSL_IOPL) >> 12));
 956         kprintf("current process                = ");
 957         if (curproc) {
 958                 kprintf("%lu (%s)\n",
 959                     (u_long)curproc->p_pid, curproc->p_comm ?
 960                     curproc->p_comm : "");
 961         } else {
 962                 kprintf("Idle\n");
 963         }
 964         kprintf("current thread          = pri %d ", curthread->td_pri);
 965         if (curthread->td_critcount)
 966                 kprintf("(CRIT)");
 967         kprintf("\n");
 968 /**
 969  *  XXX FIXME:
 970  *      we probably SHOULD have stopped the other CPUs before now!
 971  *      another CPU COULD have been touching cpl at this moment...
 972  */
 973         kprintf(" <- SMP: XXX");
 974         kprintf("\n");
 975
 976 #ifdef KDB
 977         if (kdb_trap(&psl))
 978                 return;
 979 #endif
 980 #ifdef DDB
 981         if ((debugger_on_panic || db_active) && kdb_trap(type, code, frame))
 982                 return;
 983 #endif
 984         kprintf("trap number            = %d\n", type);
 985         if (type <= MAX_TRAP_MSG)
 986                 panic("%s", trap_msg[type]);
 987         else
 988                 panic("unknown/reserved trap");
 989 }
 990
 991 /*
 992  * Double fault handler. Called when a fault occurs while writing
 993  * a frame for a trap/exception onto the stack. This usually occurs
 994  * when the stack overflows (such is the case with infinite recursion,
 995  * for example).
 996  *
 997  * XXX Note that the current PTD gets replaced by IdlePTD when the
 998  * task switch occurs. This means that the stack that was active at
 999  * the time of the double fault is not available at <kstack> unless
1000  * the machine was idle when the double fault occurred. The downside
1001  * of this is that "trace <ebp>" in ddb won't work.
1002  */
1003 void
1004 dblfault_handler(void)
1005 {
1006 #if JG
1007         struct mdglobaldata *gd = mdcpu;
1008 #endif
1009
1010         kprintf("\nFatal double fault:\n");
1011 #if JG
1012         kprintf("rip = 0x%lx\n", gd->gd_common_tss.tss_rip);
1013         kprintf("rsp = 0x%lx\n", gd->gd_common_tss.tss_rsp);
1014         kprintf("rbp = 0x%lx\n", gd->gd_common_tss.tss_rbp);
1015 #endif
1016         /* two separate prints in case of a trap on an unmapped page */
1017         kprintf("cpuid = %d\n", mycpu->gd_cpuid);
1018         panic("double fault");
1019 }
1020
1021 /*
1022  *      syscall2 -      MP aware system call request C handler
1023  *
1024  *      A system call is essentially treated as a trap except that the
1025  *      MP lock is not held on entry or return.  We are responsible for
1026  *      obtaining the MP lock if necessary and for handling ASTs
1027  *      (e.g. a task switch) prior to return.
1028  *
1029  *      In general, only simple access and manipulation of curproc and
1030  *      the current stack is allowed without having to hold MP lock.
1031  *
1032  *      MPSAFE - note that large sections of this routine are run without
1033  *               the MP lock.
1034  */
1035 void
1036 syscall2(struct trapframe *frame)
1037 {
1038         struct thread *td = curthread;
1039         struct proc *p = td->td_proc;
1040         struct lwp *lp = td->td_lwp;
1041         caddr_t params;
1042         struct sysent *callp;
1043         register_t orig_tf_rflags;
1044         int sticks;
1045         int error;
1046         int narg;
1047 #ifdef INVARIANTS
1048         int crit_count = td->td_critcount;
1049         lwkt_tokref_t curstop = td->td_toks_stop;
1050 #endif
1051         register_t *argp;
1052         u_int code;
1053         int reg, regcnt;
1054         union sysunion args;
1055         register_t *argsdst;
1056
1057         mycpu->gd_cnt.v_syscall++;
1058
1059         KTR_LOG(kernentry_syscall, lp->lwp_proc->p_pid, lp->lwp_tid,
1060                 frame->tf_rax);
1061
1062         userenter(td, p);       /* lazy raise our priority */
1063
1064         reg = 0;
1065         regcnt = 6;
1066         /*
1067          * Misc
1068          */
1069         sticks = (int)td->td_sticks;
1070         orig_tf_rflags = frame->tf_rflags;
1071
1072         /*
1073          * Virtual kernel intercept - if a VM context managed by a virtual
1074          * kernel issues a system call the virtual kernel handles it, not us.
1075          * Restore the virtual kernel context and return from its system
1076          * call.  The current frame is copied out to the virtual kernel.
1077          */
1078         if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
1079                 vkernel_trap(lp, frame);
1080                 error = EJUSTRETURN;
1081                 goto out;
1082         }
1083
1084         /*
1085          * Get the system call parameters and account for time
1086          */
1087         lp->lwp_md.md_regs = frame;
1088         params = (caddr_t)frame->tf_rsp + sizeof(register_t);
1089         code = frame->tf_rax;
1090
1091         if (p->p_sysent->sv_prepsyscall) {
1092                 (*p->p_sysent->sv_prepsyscall)(
1093                         frame, (int *)(&args.nosys.sysmsg + 1),
1094                         &code, &params);
1095         } else {
1096                 if (code == SYS_syscall || code == SYS___syscall) {
1097                         code = frame->tf_rdi;
1098                         reg++;
1099                         regcnt--;
1100                 }
1101         }
1102
1103         if (p->p_sysent->sv_mask)
1104                 code &= p->p_sysent->sv_mask;
1105
1106         if (code >= p->p_sysent->sv_size)
1107                 callp = &p->p_sysent->sv_table[0];
1108         else
1109                 callp = &p->p_sysent->sv_table[code];
1110
1111         narg = callp->sy_narg & SYF_ARGMASK;
1112
1113         /*
1114          * On x86_64 we get up to six arguments in registers. The rest are
1115          * on the stack. The first six members of 'struct trapframe' happen
1116          * to be the registers used to pass arguments, in exactly the right
1117          * order.
1118          */
1119         argp = &frame->tf_rdi;
1120         argp += reg;
1121         argsdst = (register_t *)(&args.nosys.sysmsg + 1);
1122         /*
1123          * JG can we overflow the space pointed to by 'argsdst'
1124          * either with 'bcopy' or with 'copyin'?
1125          */
1126         bcopy(argp, argsdst, sizeof(register_t) * regcnt);
1127         /*
1128          * copyin is MP aware, but the tracing code is not
1129          */
1130         if (narg > regcnt) {
1131                 KASSERT(params != NULL, ("copyin args with no params!"));
1132                 error = copyin(params, &argsdst[regcnt],
1133                         (narg - regcnt) * sizeof(register_t));
1134                 if (error) {
1135 #ifdef KTRACE
1136                         if (KTRPOINT(td, KTR_SYSCALL)) {
1137                                 ktrsyscall(lp, code, narg,
1138                                         (void *)(&args.nosys.sysmsg + 1));
1139                         }
1140 #endif
1141                         goto bad;
1142                 }
1143         }
1144
1145 #ifdef KTRACE
1146         if (KTRPOINT(td, KTR_SYSCALL)) {
1147                 ktrsyscall(lp, code, narg, (void *)(&args.nosys.sysmsg + 1));
1148         }
1149 #endif
1150
1151         /*
1152          * Default return value is 0 (will be copied to %rax).  Double-value
1153          * returns use %rax and %rdx.  %rdx is left unchanged for system
1154          * calls which return only one result.
1155          */
1156         args.sysmsg_fds[0] = 0;
1157         args.sysmsg_fds[1] = frame->tf_rdx;
1158
1159         /*
1160          * The syscall might manipulate the trap frame. If it does it
1161          * will probably return EJUSTRETURN.
1162          */
1163         args.sysmsg_frame = frame;
1164
1165         STOPEVENT(p, S_SCE, narg);      /* MP aware */
1166
1167         /*
1168          * NOTE: All system calls run MPSAFE now.  The system call itself
1169          *       is responsible for getting the MP lock.
1170          */
1171         error = (*callp->sy_call)(&args);
1172
1173 #if 0
1174         kprintf("system call %d returned %d\n", code, error);
1175 #endif
1176
1177 out:
1178         /*
1179          * MP SAFE (we may or may not have the MP lock at this point)
1180          */
1181         switch (error) {
1182         case 0:
1183                 /*
1184                  * Reinitialize proc pointer `p' as it may be different
1185                  * if this is a child returning from fork syscall.
1186                  */
1187                 p = curproc;
1188                 lp = curthread->td_lwp;
1189                 frame->tf_rax = args.sysmsg_fds[0];
1190                 frame->tf_rdx = args.sysmsg_fds[1];
1191                 frame->tf_rflags &= ~PSL_C;
1192                 break;
1193         case ERESTART:
1194                 /*
1195                  * Reconstruct pc, we know that 'syscall' is 2 bytes.
1196                  * We have to do a full context restore so that %r10
1197                  * (which was holding the value of %rcx) is restored for
1198                  * the next iteration.
1199                  */
1200                 frame->tf_rip -= frame->tf_err;
1201                 frame->tf_r10 = frame->tf_rcx;
1202                 break;
1203         case EJUSTRETURN:
1204                 break;
1205         case EASYNC:
1206                 panic("Unexpected EASYNC return value (for now)");
1207         default:
1208 bad:
1209                 if (p->p_sysent->sv_errsize) {
1210                         if (error >= p->p_sysent->sv_errsize)
1211                                 error = -1;     /* XXX */
1212                         else
1213                                 error = p->p_sysent->sv_errtbl[error];
1214                 }
1215                 frame->tf_rax = error;
1216                 frame->tf_rflags |= PSL_C;
1217                 break;
1218         }
1219
1220         /*
1221          * Traced syscall.  trapsignal() is not MP aware.
1222          */
1223         if (orig_tf_rflags & PSL_T) {
1224                 frame->tf_rflags &= ~PSL_T;
1225                 trapsignal(lp, SIGTRAP, 0);
1226         }
1227
1228         /*
1229          * Handle reschedule and other end-of-syscall issues
1230          */
1231         userret(lp, frame, sticks);
1232
1233 #ifdef KTRACE
1234         if (KTRPOINT(td, KTR_SYSRET)) {
1235                 ktrsysret(lp, code, error, args.sysmsg_result);
1236         }
1237 #endif
1238
1239         /*
1240          * This works because errno is findable through the
1241          * register set.  If we ever support an emulation where this
1242          * is not the case, this code will need to be revisited.
1243          */
1244         STOPEVENT(p, S_SCX, code);
1245
1246         userexit(lp);
1247         KTR_LOG(kernentry_syscall_ret, lp->lwp_proc->p_pid, lp->lwp_tid, error);
1248 #ifdef INVARIANTS
1249         KASSERT(&td->td_toks_base == td->td_toks_stop,
1250                 ("syscall: critical section count mismatch! %d/%d",
1251                 crit_count, td->td_pri));
1252         KASSERT(curstop == td->td_toks_stop,
1253                 ("syscall: extra tokens held after trap! %ld",
1254                 td->td_toks_stop - &td->td_toks_base));
1255 #endif
1256 }
1257
1258 /*
1259  * NOTE: mplock not held at any point
1260  */
1261 void
1262 fork_return(struct lwp *lp, struct trapframe *frame)
1263 {
1264         frame->tf_rax = 0;              /* Child returns zero */
1265         frame->tf_rflags &= ~PSL_C;     /* success */
1266         frame->tf_rdx = 1;
1267
1268         generic_lwp_return(lp, frame);
1269         KTR_LOG(kernentry_fork_ret, lp->lwp_proc->p_pid, lp->lwp_tid);
1270 }
1271
1272 /*
1273  * Simplified back end of syscall(), used when returning from fork()
1274  * directly into user mode.
1275  *
1276  * This code will return back into the fork trampoline code which then
1277  * runs doreti.
1278  *
1279  * NOTE: The mplock is not held at any point.
1280  */
1281 void
1282 generic_lwp_return(struct lwp *lp, struct trapframe *frame)
1283 {
1284         struct proc *p = lp->lwp_proc;
1285
1286         /*
1287          * Check for exit-race.  If one lwp exits the process concurrent with
1288          * another lwp creating a new thread, the two operations may cross
1289          * each other resulting in the newly-created lwp not receiving a
1290          * KILL signal.
1291          */
1292         if (p->p_flags & P_WEXIT) {
1293                 lwpsignal(p, lp, SIGKILL);
1294         }
1295
1296         /*
1297          * Newly forked processes are given a kernel priority.  We have to
1298          * adjust the priority to a normal user priority and fake entry
1299          * into the kernel (call userenter()) to install a passive release
1300          * function just in case userret() decides to stop the process.  This
1301          * can occur when ^Z races a fork.  If we do not install the passive
1302          * release function the current process designation will not be
1303          * released when the thread goes to sleep.
1304          */
1305         lwkt_setpri_self(TDPRI_USER_NORM);
1306         userenter(lp->lwp_thread, p);
1307         userret(lp, frame, 0);
1308 #ifdef KTRACE
1309         if (KTRPOINT(lp->lwp_thread, KTR_SYSRET))
1310                 ktrsysret(lp, SYS_fork, 0, 0);
1311 #endif
1312         lp->lwp_flags |= LWP_PASSIVE_ACQ;
1313         userexit(lp);
1314         lp->lwp_flags &= ~LWP_PASSIVE_ACQ;
1315 }
1316
1317 /*
1318  * doreti has turned into this.  The frame is directly on the stack.  We
1319  * pull everything else we need (fpu and tls context) from the current
1320  * thread.
1321  *
1322  * Note on fpu interactions: In a virtual kernel, the fpu context for
1323  * an emulated user mode process is not shared with the virtual kernel's
1324  * fpu context, so we only have to 'stack' fpu contexts within the virtual
1325  * kernel itself, and not even then since the signal() contexts that we care
1326  * about save and restore the FPU state (I think anyhow).
1327  *
1328  * vmspace_ctl() returns an error only if it had problems installing the
1329  * context we supplied or problems copying data to/from our VM space.
1330  */
1331 void
1332 go_user(struct intrframe *frame)
1333 {
1334         struct trapframe *tf = (void *)&frame->if_rdi;
1335         globaldata_t gd;
1336         int r;
1337         void *id;
1338
1339         /*
1340          * Interrupts may be disabled on entry, make sure all signals
1341          * can be received before beginning our loop.
1342          */
1343         sigsetmask(0);
1344
1345         /*
1346          * Switch to the current simulated user process, then call
1347          * user_trap() when we break out of it (usually due to a signal).
1348          */
1349         for (;;) {
1350 #if 1
1351                 /*
1352                  * Always make the FPU state correct.  This should generally
1353                  * be faster because the cost of taking a #NM fault through
1354                  * the vkernel to the real kernel is astronomical.
1355                  */
1356                 crit_enter();
1357                 tf->tf_xflags &= ~PGEX_FPFAULT;
1358                 if (mdcpu->gd_npxthread != curthread) {
1359                         if (mdcpu->gd_npxthread)
1360                                 npxsave(mdcpu->gd_npxthread->td_savefpu);
1361                         npxdna(tf);
1362                 }
1363 #else
1364                 /*
1365                  * Tell the real kernel whether it is ok to use the FP
1366                  * unit or not, allowing us to take a T_DNA exception
1367                  * if the context tries to use the FP.
1368                  */
1369                 if (mdcpu->gd_npxthread == curthread) {
1370                         tf->tf_xflags &= ~PGEX_FPFAULT;
1371                 } else {
1372                         tf->tf_xflags |= PGEX_FPFAULT;
1373                 }
1374 #endif
1375
1376                 /*
1377                  * Run emulated user process context.  This call interlocks
1378                  * with new mailbox signals.
1379                  *
1380                  * Set PGEX_U unconditionally, indicating a user frame (the
1381                  * bit is normally set only by T_PAGEFLT).
1382                  */
1383                 if (vmm_enabled)
1384                         id = (void *)vtophys(curproc->p_vmspace->vm_pmap.pm_pml4);
1385                 else
1386                         id = &curproc->p_vmspace->vm_pmap;
1387
1388                 /*
1389                  * The GDF_VIRTUSER hack helps statclock() figure out who
1390                  * the tick belongs to.
1391                  */
1392                 gd = mycpu;
1393                 gd->gd_flags |= GDF_VIRTUSER;
1394                 r = vmspace_ctl(id, VMSPACE_CTL_RUN, tf,
1395                                 &curthread->td_savevext);
1396                 gd->gd_flags &= ~GDF_VIRTUSER;
1397
1398                 frame->if_xflags |= PGEX_U;
1399
1400                 /*
1401                  * Immediately save the user FPU state.  The vkernel is a
1402                  * user program and libraries like libc will use the FP
1403                  * unit.
1404                  */
1405                 if (mdcpu->gd_npxthread == curthread) {
1406                         npxsave(mdcpu->gd_npxthread->td_savefpu);
1407                 }
1408                 crit_exit();
1409 #if 0
1410                 kprintf("GO USER %d trap %ld EVA %08lx RIP %08lx RSP %08lx XFLAGS %02lx/%02lx\n",
1411                         r, tf->tf_trapno, tf->tf_addr, tf->tf_rip, tf->tf_rsp,
1412                         tf->tf_xflags, frame->if_xflags);
1413 #endif
1414                 if (r < 0) {
1415                         if (errno != EINTR)
1416                                 panic("vmspace_ctl failed error %d", errno);
1417                 } else {
1418                         if (tf->tf_trapno) {
1419                                 user_trap(tf);
1420                         }
1421                 }
1422                 if (mycpu->gd_reqflags & RQF_AST_MASK) {
1423                         tf->tf_trapno = T_ASTFLT;
1424                         user_trap(tf);
1425                 }
1426                 tf->tf_trapno = 0;
1427         }
1428 }
1429
1430 /*
1431  * If PGEX_FPFAULT is set then set FP_VIRTFP in the PCB to force a T_DNA
1432  * fault (which is then passed back to the virtual kernel) if an attempt is
1433  * made to use the FP unit.
1434  *
1435  * XXX this is a fairly big hack.
1436  */
1437 void
1438 set_vkernel_fp(struct trapframe *frame)
1439 {
1440         struct thread *td = curthread;
1441
1442         if (frame->tf_xflags & PGEX_FPFAULT) {
1443                 td->td_pcb->pcb_flags |= FP_VIRTFP;
1444                 if (mdcpu->gd_npxthread == td)
1445                         npxexit();
1446         } else {
1447                 td->td_pcb->pcb_flags &= ~FP_VIRTFP;
1448         }
1449 }
1450
1451 /*
1452  * Called from vkernel_trap() to fixup the vkernel's syscall
1453  * frame for vmspace_ctl() return.
1454  */
1455 void
1456 cpu_vkernel_trap(struct trapframe *frame, int error)
1457 {
1458         frame->tf_rax = error;
1459         if (error)
1460                 frame->tf_rflags |= PSL_C;
1461         else
1462                 frame->tf_rflags &= ~PSL_C;
1463 }