kernel/os/exit.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2011, Joyent, Inc. All rights reserved.
  25  */
  26
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  28
  29 #include <sys/types.h>
  30 #include <sys/param.h>
  31 #include <sys/sysmacros.h>
  32 #include <sys/systm.h>
  33 #include <sys/cred.h>
  34 #include <sys/user.h>
  35 #include <sys/errno.h>
  36 #include <sys/proc.h>
  37 #include <sys/ucontext.h>
  38 #include <sys/procfs.h>
  39 #include <sys/vnode.h>
  40 #include <sys/acct.h>
  41 #include <sys/var.h>
  42 #include <sys/cmn_err.h>
  43 #include <sys/debug.h>
  44 #include <sys/wait.h>
  45 #include <sys/siginfo.h>
  46 #include <sys/procset.h>
  47 #include <sys/class.h>
  48 #include <sys/file.h>
  49 #include <sys/session.h>
  50 #include <sys/kmem.h>
  51 #include <sys/vtrace.h>
  52 #include <sys/prsystm.h>
  53 #include <sys/ipc.h>
  54 #include <sys/sem_impl.h>
  55 #include <c2/audit.h>
  56 #include <sys/aio_impl.h>
  57 #include <vm/as.h>
  58 #include <sys/poll.h>
  59 #include <sys/door.h>
  60 #include <sys/lwpchan_impl.h>
  61 #include <sys/utrap.h>
  62 #include <sys/task.h>
  63 #include <sys/exacct.h>
  64 #include <sys/cyclic.h>
  65 #include <sys/schedctl.h>
  66 #include <sys/rctl.h>
  67 #include <sys/contract_impl.h>
  68 #include <sys/contract/process_impl.h>
  69 #include <sys/list.h>
  70 #include <sys/dtrace.h>
  71 #include <sys/pool.h>
  72 #include <sys/sdt.h>
  73 #include <sys/corectl.h>
  74 #include <sys/brand.h>
  75 #include <sys/libc_kernel.h>
  76
  77 /*
  78  * convert code/data pair into old style wait status
  79  */
  80 int
  81 wstat(int code, int data)
  82 {
  83         int stat = (data & 0377);
  84
  85         switch (code) {
  86         case CLD_EXITED:
  87                 stat <<= 8;
  88                 break;
  89         case CLD_DUMPED:
  90                 stat |= WCOREFLG;
  91                 break;
  92         case CLD_KILLED:
  93                 break;
  94         case CLD_TRAPPED:
  95         case CLD_STOPPED:
  96                 stat <<= 8;
  97                 stat |= WSTOPFLG;
  98                 break;
  99         case CLD_CONTINUED:
 100                 stat = WCONTFLG;
 101                 break;
 102         default:
 103                 cmn_err(CE_PANIC, "wstat: bad code");
 104                 /* NOTREACHED */
 105         }
 106         return (stat);
 107 }
 108
 109 static char *
 110 exit_reason(char *buf, size_t bufsz, int what, int why)
 111 {
 112         switch (why) {
 113         case CLD_EXITED:
 114                 (void) snprintf(buf, bufsz, "exited with status %d", what);
 115                 break;
 116         case CLD_KILLED:
 117                 (void) snprintf(buf, bufsz, "exited on fatal signal %d", what);
 118                 break;
 119         case CLD_DUMPED:
 120                 (void) snprintf(buf, bufsz, "core dumped on signal %d", what);
 121                 break;
 122         default:
 123                 (void) snprintf(buf, bufsz, "encountered unknown error "
 124                     "(%d, %d)", why, what);
 125                 break;
 126         }
 127
 128         return (buf);
 129 }
 130
 131 /*
 132  * exit system call: pass back caller's arg.
 133  */
 134 void
 135 rexit(int rval)
 136 {
 137         exit(CLD_EXITED, rval);
 138 }
 139
 140 /*
 141  * Called by proc_exit() when a zone's init exits, presumably because
 142  * it failed.  As long as the given zone is still in the "running"
 143  * state, we will re-exec() init, but first we need to reset things
 144  * which are usually inherited across exec() but will break init's
 145  * assumption that it is being exec()'d from a virgin process.  Most
 146  * importantly this includes closing all file descriptors (exec only
 147  * closes those marked close-on-exec) and resetting signals (exec only
 148  * resets handled signals, and we need to clear any signals which
 149  * killed init).  Anything else that exec(2) says would be inherited,
 150  * but would affect the execution of init, needs to be reset.
 151  */
 152 static int
 153 restart_init(int what, int why)
 154 {
 155         kthread_t *t = curthread;
 156         klwp_t *lwp = ttolwp(t);
 157         proc_t *p = ttoproc(t);
 158         user_t *up = PTOU(p);
 159
 160         vnode_t *oldcd, *oldrd;
 161         int i, err;
 162         char reason_buf[64];
 163
 164         /*
 165          * Let zone admin (and global zone admin if this is for a non-global
 166          * zone) know that init has failed and will be restarted.
 167          */
 168         zcmn_err(p->p_zone->zone_id, CE_WARN,
 169             "init(8) %s: restarting automatically",
 170             exit_reason(reason_buf, sizeof (reason_buf), what, why));
 171
 172         if (!INGLOBALZONE(p)) {
 173                 cmn_err(CE_WARN, "init(8) for zone %s (pid %d) %s: "
 174                     "restarting automatically",
 175                     p->p_zone->zone_name, p->p_pid, reason_buf);
 176         }
 177
 178         /*
 179          * Remove any fpollinfo_t's for this (last) thread from our file
 180          * descriptors so closeall() can ASSERT() that they're all gone.
 181          * Then close all open file descriptors in the process.
 182          */
 183         pollcleanup();
 184         closeall(P_FINFO(p));
 185
 186         /*
 187          * Grab p_lock and begin clearing miscellaneous global process
 188          * state that needs to be reset before we exec the new init(8).
 189          */
 190
 191         mutex_enter(&p->p_lock);
 192         prbarrier(p);
 193
 194         p->p_flag &= ~(SKILLED | SEXTKILLED | SEXITING | SDOCORE);
 195         up->u_cmask = CMASK;
 196
 197         sigemptyset(&t->t_hold);
 198         sigemptyset(&t->t_sig);
 199         sigemptyset(&t->t_extsig);
 200
 201         sigemptyset(&p->p_sig);
 202         sigemptyset(&p->p_extsig);
 203
 204         sigdelq(p, t, 0);
 205         sigdelq(p, NULL, 0);
 206
 207         if (p->p_killsqp) {
 208                 siginfofree(p->p_killsqp);
 209                 p->p_killsqp = NULL;
 210         }
 211
 212         /*
 213          * Reset any signals that are ignored back to the default disposition.
 214          * Other u_signal members will be cleared when exec calls sigdefault().
 215          */
 216         for (i = 1; i < NSIG; i++) {
 217                 if (up->u_signal[i - 1] == SIG_IGN) {
 218                         up->u_signal[i - 1] = SIG_DFL;
 219                         sigemptyset(&up->u_sigmask[i - 1]);
 220                 }
 221         }
 222
 223         /*
 224          * Clear the current signal, any signal info associated with it, and
 225          * any signal information from contracts and/or contract templates.
 226          */
 227         lwp->lwp_cursig = 0;
 228         lwp->lwp_extsig = 0;
 229         if (lwp->lwp_curinfo != NULL) {
 230                 siginfofree(lwp->lwp_curinfo);
 231                 lwp->lwp_curinfo = NULL;
 232         }
 233         lwp_ctmpl_clear(lwp);
 234
 235         /*
 236          * Reset both the process root directory and the current working
 237          * directory to the root of the zone just as we do during boot.
 238          */
 239         VN_HOLD(p->p_zone->zone_rootvp);
 240         oldrd = up->u_rdir;
 241         up->u_rdir = p->p_zone->zone_rootvp;
 242
 243         VN_HOLD(p->p_zone->zone_rootvp);
 244         oldcd = up->u_cdir;
 245         up->u_cdir = p->p_zone->zone_rootvp;
 246
 247         if (up->u_cwd != NULL) {
 248                 refstr_rele(up->u_cwd);
 249                 up->u_cwd = NULL;
 250         }
 251
 252         mutex_exit(&p->p_lock);
 253
 254         if (oldrd != NULL)
 255                 VN_RELE(oldrd);
 256         if (oldcd != NULL)
 257                 VN_RELE(oldcd);
 258
 259         /* Free the controlling tty.  (freectty() always assumes curproc.) */
 260         ASSERT(p == curproc);
 261         (void) freectty(B_TRUE);
 262
 263         /*
 264          * Now exec() the new init(8) on top of the current process.  If we
 265          * succeed, the caller will treat this like a successful system call.
 266          * If we fail, we issue messages and the caller will proceed with exit.
 267          */
 268         err = exec_init(p->p_zone->zone_initname, NULL);
 269
 270         if (err == 0)
 271                 return (0);
 272
 273         zcmn_err(p->p_zone->zone_id, CE_WARN,
 274             "failed to restart init(8) (err=%d): system reboot required", err);
 275
 276         if (!INGLOBALZONE(p)) {
 277                 cmn_err(CE_WARN, "failed to restart init(8) for zone %s "
 278                     "(pid %d, err=%d): zoneadm(8) boot required",
 279                     p->p_zone->zone_name, p->p_pid, err);
 280         }
 281
 282         return (-1);
 283 }
 284
 285 /*
 286  * Release resources.
 287  * Enter zombie state.
 288  * Wake up parent and init processes,
 289  * and dispose of children.
 290  */
 291 void
 292 exit(int why, int what)
 293 {
 294         /*
 295          * If proc_exit() fails, then some other lwp in the process
 296          * got there first.  We just have to call lwp_exit() to allow
 297          * the other lwp to finish exiting the process.  Otherwise we're
 298          * restarting init, and should return.
 299          */
 300         if (proc_exit(why, what) != 0) {
 301                 mutex_enter(&curproc->p_lock);
 302                 ASSERT(curproc->p_flag & SEXITLWPS);
 303                 lwp_exit();
 304                 /* NOTREACHED */
 305         }
 306 }
 307
 308 /*
 309  * Set the SEXITING flag on the process, after making sure /proc does
 310  * not have it locked.  This is done in more places than proc_exit(),
 311  * so it is a separate function.
 312  */
 313 void
 314 proc_is_exiting(proc_t *p)
 315 {
 316         mutex_enter(&p->p_lock);
 317         prbarrier(p);
 318         p->p_flag |= SEXITING;
 319         mutex_exit(&p->p_lock);
 320 }
 321
 322 /*
 323  * Return value:
 324  *   1 - exitlwps() failed, call (or continue) lwp_exit()
 325  *   0 - restarting init.  Return through system call path
 326  */
 327 int
 328 proc_exit(int why, int what)
 329 {
 330         kthread_t *t = curthread;
 331         klwp_t *lwp = ttolwp(t);
 332         proc_t *p = ttoproc(t);
 333         zone_t *z = p->p_zone;
 334         timeout_id_t tmp_id;
 335         int rv;
 336         proc_t *q;
 337         task_t *tk;
 338         vnode_t *exec_vp, *execdir_vp, *cdir, *rdir;
 339         sigqueue_t *sqp;
 340         lwpdir_t *lwpdir;
 341         uint_t lwpdir_sz;
 342         tidhash_t *tidhash;
 343         uint_t tidhash_sz;
 344         ret_tidhash_t *ret_tidhash;
 345         refstr_t *cwd;
 346         hrtime_t hrutime, hrstime;
 347         int evaporate;
 348
 349         /*
 350          * Stop and discard the process's lwps except for the current one,
 351          * unless some other lwp beat us to it.  If exitlwps() fails then
 352          * return and the calling lwp will call (or continue in) lwp_exit().
 353          */
 354         proc_is_exiting(p);
 355         if (exitlwps(0) != 0)
 356                 return (1);
 357
 358         mutex_enter(&p->p_lock);
 359         if (p->p_ttime > 0) {
 360                 /*
 361                  * Account any remaining ticks charged to this process
 362                  * on its way out.
 363                  */
 364                 (void) task_cpu_time_incr(p->p_task, p->p_ttime);
 365                 p->p_ttime = 0;
 366         }
 367         mutex_exit(&p->p_lock);
 368
 369         DTRACE_PROC(lwp__exit);
 370         DTRACE_PROC1(exit, int, why);
 371
 372         /*
 373          * Will perform any brand specific proc exit processing, since this
 374          * is always the last lwp, will also perform lwp_exit and free brand
 375          * data
 376          */
 377         if (PROC_IS_BRANDED(p)) {
 378                 lwp_detach_brand_hdlrs(lwp);
 379                 brand_clearbrand(p, B_FALSE);
 380         }
 381
 382         /*
 383          * Don't let init exit unless zone_start_init() failed its exec, or
 384          * we are shutting down the zone or the machine.
 385          *
 386          * Since we are single threaded, we don't need to lock the
 387          * following accesses to zone_proc_initpid.
 388          */
 389         if (p->p_pid == z->zone_proc_initpid) {
 390                 if (z->zone_boot_err == 0 &&
 391                     zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&
 392                     zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) {
 393                         if (z->zone_restart_init == B_TRUE) {
 394                                 if (restart_init(what, why) == 0)
 395                                         return (0);
 396                         } else {
 397                                 (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL,
 398                                     CRED());
 399                         }
 400                 }
 401
 402                 /*
 403                  * Since we didn't or couldn't restart init, we clear
 404                  * the zone's init state and proceed with exit
 405                  * processing.
 406                  */
 407                 z->zone_proc_initpid = -1;
 408         }
 409
 410         lwp_pcb_exit();
 411
 412         /*
 413          * Allocate a sigqueue now, before we grab locks.
 414          * It will be given to sigcld(), below.
 415          * Special case:  If we will be making the process disappear
 416          * without a trace because it is either:
 417          *      * an exiting SSYS process, or
 418          *      * a posix_spawn() vfork child who requests it,
 419          * we don't bother to allocate a useless sigqueue.
 420          */
 421         evaporate = (p->p_flag & SSYS) || ((p->p_flag & SVFORK) &&
 422             why == CLD_EXITED && what == _EVAPORATE);
 423         if (!evaporate)
 424                 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
 425
 426         /*
 427          * revoke any doors created by the process.
 428          */
 429         if (p->p_door_list)
 430                 door_exit();
 431
 432         /*
 433          * Release schedctl data structures.
 434          */
 435         if (p->p_pagep)
 436                 schedctl_proc_cleanup();
 437
 438         /*
 439          * make sure all pending kaio has completed.
 440          */
 441         if (p->p_aio)
 442                 aio_cleanup_exit();
 443
 444         /*
 445          * discard the lwpchan cache.
 446          */
 447         if (p->p_lcp != NULL)
 448                 lwpchan_destroy_cache(0);
 449
 450         /*
 451          * Clean up any DTrace helper actions or probes for the process.
 452          */
 453         if (p->p_dtrace_helpers != NULL) {
 454                 ASSERT(dtrace_helpers_cleanup != NULL);
 455                 (*dtrace_helpers_cleanup)(p);
 456         }
 457
 458         /*
 459          * Clean up any signalfd state for the process.
 460          */
 461         if (p->p_sigfd != NULL) {
 462                 VERIFY(sigfd_exit_helper != NULL);
 463                 (*sigfd_exit_helper)();
 464         }
 465
 466         /* untimeout the realtime timers */
 467         if (p->p_itimer != NULL)
 468                 timer_exit();
 469
 470         if ((tmp_id = p->p_alarmid) != 0) {
 471                 p->p_alarmid = 0;
 472                 (void) untimeout(tmp_id);
 473         }
 474
 475         /*
 476          * Remove any fpollinfo_t's for this (last) thread from our file
 477          * descriptors so closeall() can ASSERT() that they're all gone.
 478          */
 479         pollcleanup();
 480
 481         if (p->p_rprof_cyclic != CYCLIC_NONE) {
 482                 mutex_enter(&cpu_lock);
 483                 cyclic_remove(p->p_rprof_cyclic);
 484                 mutex_exit(&cpu_lock);
 485         }
 486
 487         mutex_enter(&p->p_lock);
 488
 489         /*
 490          * Clean up any DTrace probes associated with this process.
 491          */
 492         if (p->p_dtrace_probes) {
 493                 ASSERT(dtrace_fasttrap_exit_ptr != NULL);
 494                 dtrace_fasttrap_exit_ptr(p);
 495         }
 496
 497         while ((tmp_id = p->p_itimerid) != 0) {
 498                 p->p_itimerid = 0;
 499                 mutex_exit(&p->p_lock);
 500                 (void) untimeout(tmp_id);
 501                 mutex_enter(&p->p_lock);
 502         }
 503
 504         lwp_cleanup();
 505
 506         /*
 507          * We are about to exit; prevent our resource associations from
 508          * being changed.
 509          */
 510         pool_barrier_enter();
 511
 512         /*
 513          * Block the process against /proc now that we have really
 514          * acquired p->p_lock (to manipulate p_tlist at least).
 515          */
 516         prbarrier(p);
 517
 518         sigfillset(&p->p_ignore);
 519         sigemptyset(&p->p_siginfo);
 520         sigemptyset(&p->p_sig);
 521         sigemptyset(&p->p_extsig);
 522         sigemptyset(&t->t_sig);
 523         sigemptyset(&t->t_extsig);
 524         sigemptyset(&p->p_sigmask);
 525         sigdelq(p, t, 0);
 526         lwp->lwp_cursig = 0;
 527         lwp->lwp_extsig = 0;
 528         p->p_flag &= ~(SKILLED | SEXTKILLED);
 529         if (lwp->lwp_curinfo) {
 530                 siginfofree(lwp->lwp_curinfo);
 531                 lwp->lwp_curinfo = NULL;
 532         }
 533
 534         t->t_proc_flag |= TP_LWPEXIT;
 535         ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
 536         prlwpexit(t);           /* notify /proc */
 537         lwp_hash_out(p, t->t_tid);
 538         prexit(p);
 539
 540         p->p_lwpcnt = 0;
 541         p->p_tlist = NULL;
 542         sigqfree(p);
 543         term_mstate(t);
 544         p->p_mterm = gethrtime();
 545
 546         exec_vp = p->p_exec;
 547         execdir_vp = p->p_execdir;
 548         p->p_exec = NULLVP;
 549         p->p_execdir = NULLVP;
 550         mutex_exit(&p->p_lock);
 551
 552         pr_free_watched_pages(p);
 553
 554         closeall(P_FINFO(p));
 555
 556         /* Free the controlling tty.  (freectty() always assumes curproc.) */
 557         ASSERT(p == curproc);
 558         (void) freectty(B_TRUE);
 559
 560         if (p->p_semacct)                       /* IPC semaphore exit */
 561                 semexit(p);
 562         rv = wstat(why, what);
 563
 564         acct(rv & 0xff);
 565         exacct_commit_proc(p, rv);
 566
 567         /*
 568          * Release any resources associated with C2 auditing
 569          */
 570         if (AU_AUDITING()) {
 571                 /*
 572                  * audit exit system call
 573                  */
 574                 audit_exit(why, what);
 575         }
 576
 577         /*
 578          * Free address space.
 579          */
 580         relvm();
 581
 582         if (exec_vp) {
 583                 /*
 584                  * Close this executable which has been opened when the process
 585                  * was created by getproc().
 586                  */
 587                 (void) fop_close(exec_vp, FREAD, 1, 0, CRED(), NULL);
 588                 VN_RELE(exec_vp);
 589         }
 590         if (execdir_vp)
 591                 VN_RELE(execdir_vp);
 592
 593         /*
 594          * Release held contracts.
 595          */
 596         contract_exit(p);
 597
 598         /*
 599          * Depart our encapsulating process contract.
 600          */
 601         if ((p->p_flag & SSYS) == 0) {
 602                 ASSERT(p->p_ct_process);
 603                 contract_process_exit(p->p_ct_process, p, rv);
 604         }
 605
 606         /*
 607          * Remove pool association, and block if requested by pool_do_bind.
 608          */
 609         mutex_enter(&p->p_lock);
 610         ASSERT(p->p_pool->pool_ref > 0);
 611         atomic_dec_32(&p->p_pool->pool_ref);
 612         p->p_pool = pool_default;
 613         /*
 614          * Now that our address space has been freed and all other threads
 615          * in this process have exited, set the PEXITED pool flag.  This
 616          * tells the pools subsystems to ignore this process if it was
 617          * requested to rebind this process to a new pool.
 618          */
 619         p->p_poolflag |= PEXITED;
 620         pool_barrier_exit();
 621         mutex_exit(&p->p_lock);
 622
 623         mutex_enter(&pidlock);
 624
 625         /*
 626          * Delete this process from the newstate list of its parent. We
 627          * will put it in the right place in the sigcld in the end.
 628          */
 629         delete_ns(p->p_parent, p);
 630
 631         /*
 632          * Reassign the orphans to the next of kin.
 633          * Don't rearrange init's orphanage.
 634          */
 635         if ((q = p->p_orphan) != NULL && p != proc_init) {
 636
 637                 proc_t *nokp = p->p_nextofkin;
 638
 639                 for (;;) {
 640                         q->p_nextofkin = nokp;
 641                         if (q->p_nextorph == NULL)
 642                                 break;
 643                         q = q->p_nextorph;
 644                 }
 645                 q->p_nextorph = nokp->p_orphan;
 646                 nokp->p_orphan = p->p_orphan;
 647                 p->p_orphan = NULL;
 648         }
 649
 650         /*
 651          * Reassign the children to init.
 652          * Don't try to assign init's children to init.
 653          */
 654         if ((q = p->p_child) != NULL && p != proc_init) {
 655                 struct proc     *np;
 656                 struct proc     *initp = proc_init;
 657                 boolean_t       setzonetop = B_FALSE;
 658
 659                 if (!INGLOBALZONE(curproc))
 660                         setzonetop = B_TRUE;
 661
 662                 pgdetach(p);
 663
 664                 do {
 665                         np = q->p_sibling;
 666                         /*
 667                          * Delete it from its current parent new state
 668                          * list and add it to init new state list
 669                          */
 670                         delete_ns(q->p_parent, q);
 671
 672                         q->p_ppid = 1;
 673                         q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID);
 674                         if (setzonetop) {
 675                                 mutex_enter(&q->p_lock);
 676                                 q->p_flag |= SZONETOP;
 677                                 mutex_exit(&q->p_lock);
 678                         }
 679                         q->p_parent = initp;
 680
 681                         /*
 682                          * Since q will be the first child,
 683                          * it will not have a previous sibling.
 684                          */
 685                         q->p_psibling = NULL;
 686                         if (initp->p_child) {
 687                                 initp->p_child->p_psibling = q;
 688                         }
 689                         q->p_sibling = initp->p_child;
 690                         initp->p_child = q;
 691                         if (q->p_proc_flag & P_PR_PTRACE) {
 692                                 mutex_enter(&q->p_lock);
 693                                 sigtoproc(q, NULL, SIGKILL);
 694                                 mutex_exit(&q->p_lock);
 695                         }
 696                         /*
 697                          * sigcld() will add the child to parents
 698                          * newstate list.
 699                          */
 700                         if (q->p_stat == SZOMB)
 701                                 sigcld(q, NULL);
 702                 } while ((q = np) != NULL);
 703
 704                 p->p_child = NULL;
 705                 ASSERT(p->p_child_ns == NULL);
 706         }
 707
 708         TRACE_1(TR_FAC_PROC, TR_PROC_EXIT, "proc_exit: %p", p);
 709
 710         mutex_enter(&p->p_lock);
 711         CL_EXIT(curthread); /* tell the scheduler that curthread is exiting */
 712
 713         /*
 714          * Have our task accummulate our resource usage data before they
 715          * become contaminated by p_cacct etc., and before we renounce
 716          * membership of the task.
 717          *
 718          * We do this regardless of whether or not task accounting is active.
 719          * This is to avoid having nonsense data reported for this task if
 720          * task accounting is subsequently enabled. The overhead is minimal;
 721          * by this point, this process has accounted for the usage of all its
 722          * LWPs. We nonetheless do the work here, and under the protection of
 723          * pidlock, so that the movement of the process's usage to the task
 724          * happens at the same time as the removal of the process from the
 725          * task, from the point of view of exacct_snapshot_task_usage().
 726          */
 727         exacct_update_task_mstate(p);
 728
 729         hrutime = mstate_aggr_state(p, LMS_USER);
 730         hrstime = mstate_aggr_state(p, LMS_SYSTEM);
 731         p->p_utime = (clock_t)NSEC_TO_TICK(hrutime) + p->p_cutime;
 732         p->p_stime = (clock_t)NSEC_TO_TICK(hrstime) + p->p_cstime;
 733
 734         p->p_acct[LMS_USER]     += p->p_cacct[LMS_USER];
 735         p->p_acct[LMS_SYSTEM]   += p->p_cacct[LMS_SYSTEM];
 736         p->p_acct[LMS_TRAP]     += p->p_cacct[LMS_TRAP];
 737         p->p_acct[LMS_TFAULT]   += p->p_cacct[LMS_TFAULT];
 738         p->p_acct[LMS_DFAULT]   += p->p_cacct[LMS_DFAULT];
 739         p->p_acct[LMS_KFAULT]   += p->p_cacct[LMS_KFAULT];
 740         p->p_acct[LMS_USER_LOCK] += p->p_cacct[LMS_USER_LOCK];
 741         p->p_acct[LMS_SLEEP]    += p->p_cacct[LMS_SLEEP];
 742         p->p_acct[LMS_WAIT_CPU] += p->p_cacct[LMS_WAIT_CPU];
 743         p->p_acct[LMS_STOPPED]  += p->p_cacct[LMS_STOPPED];
 744
 745         p->p_ru.minflt  += p->p_cru.minflt;
 746         p->p_ru.majflt  += p->p_cru.majflt;
 747         p->p_ru.nswap   += p->p_cru.nswap;
 748         p->p_ru.inblock += p->p_cru.inblock;
 749         p->p_ru.oublock += p->p_cru.oublock;
 750         p->p_ru.msgsnd  += p->p_cru.msgsnd;
 751         p->p_ru.msgrcv  += p->p_cru.msgrcv;
 752         p->p_ru.nsignals += p->p_cru.nsignals;
 753         p->p_ru.nvcsw   += p->p_cru.nvcsw;
 754         p->p_ru.nivcsw  += p->p_cru.nivcsw;
 755         p->p_ru.sysc    += p->p_cru.sysc;
 756         p->p_ru.ioch    += p->p_cru.ioch;
 757
 758         p->p_stat = SZOMB;
 759         p->p_proc_flag &= ~P_PR_PTRACE;
 760         p->p_wdata = what;
 761         p->p_wcode = (char)why;
 762
 763         cdir = PTOU(p)->u_cdir;
 764         rdir = PTOU(p)->u_rdir;
 765         cwd = PTOU(p)->u_cwd;
 766
 767         ASSERT(cdir != NULL || p->p_parent == &p0);
 768
 769         /*
 770          * Release resource controls, as they are no longer enforceable.
 771          */
 772         rctl_set_free(p->p_rctls);
 773
 774         /*
 775          * Decrement tk_nlwps counter for our task.max-lwps resource control.
 776          * An extended accounting record, if that facility is active, is
 777          * scheduled to be written.  We cannot give up task and project
 778          * membership at this point because that would allow zombies to escape
 779          * from the max-processes resource controls.  Zombies stay in their
 780          * current task and project until the process table slot is released
 781          * in freeproc().
 782          */
 783         tk = p->p_task;
 784
 785         mutex_enter(&p->p_zone->zone_nlwps_lock);
 786         tk->tk_nlwps--;
 787         tk->tk_proj->kpj_nlwps--;
 788         p->p_zone->zone_nlwps--;
 789         mutex_exit(&p->p_zone->zone_nlwps_lock);
 790
 791         /*
 792          * Clear the lwp directory and the lwpid hash table
 793          * now that /proc can't bother us any more.
 794          * We free the memory below, after dropping p->p_lock.
 795          */
 796         lwpdir = p->p_lwpdir;
 797         lwpdir_sz = p->p_lwpdir_sz;
 798         tidhash = p->p_tidhash;
 799         tidhash_sz = p->p_tidhash_sz;
 800         ret_tidhash = p->p_ret_tidhash;
 801         p->p_lwpdir = NULL;
 802         p->p_lwpfree = NULL;
 803         p->p_lwpdir_sz = 0;
 804         p->p_tidhash = NULL;
 805         p->p_tidhash_sz = 0;
 806         p->p_ret_tidhash = NULL;
 807
 808         /*
 809          * If the process has context ops installed, call the exit routine
 810          * on behalf of this last remaining thread. Normally exitpctx() is
 811          * called during thread_exit() or lwp_exit(), but because this is the
 812          * last thread in the process, we must call it here. By the time
 813          * thread_exit() is called (below), the association with the relevant
 814          * process has been lost.
 815          *
 816          * We also free the context here.
 817          */
 818         if (p->p_pctx) {
 819                 kpreempt_disable();
 820                 exitpctx(p);
 821                 kpreempt_enable();
 822
 823                 freepctx(p, 0);
 824         }
 825
 826         /*
 827          * curthread's proc pointer is changed to point to the 'sched'
 828          * process for the corresponding zone, except in the case when
 829          * the exiting process is in fact a zsched instance, in which
 830          * case the proc pointer is set to p0.  We do so, so that the
 831          * process still points at the right zone when we call the VN_RELE()
 832          * below.
 833          *
 834          * This is because curthread's original proc pointer can be freed as
 835          * soon as the child sends a SIGCLD to its parent.  We use zsched so
 836          * that for user processes, even in the final moments of death, the
 837          * process is still associated with its zone.
 838          */
 839         if (p != t->t_procp->p_zone->zone_zsched)
 840                 t->t_procp = t->t_procp->p_zone->zone_zsched;
 841         else
 842                 t->t_procp = &p0;
 843
 844         mutex_exit(&p->p_lock);
 845         if (!evaporate) {
 846                 p->p_pidflag &= ~CLDPEND;
 847                 sigcld(p, sqp);
 848         } else {
 849                 /*
 850                  * Do what sigcld() would do if the disposition
 851                  * of the SIGCHLD signal were set to be ignored.
 852                  */
 853                 cv_broadcast(&p->p_srwchan_cv);
 854                 freeproc(p);
 855         }
 856         mutex_exit(&pidlock);
 857
 858         /*
 859          * We don't release u_cdir and u_rdir until SZOMB is set.
 860          * This protects us against dofusers().
 861          */
 862         if (cdir)
 863                 VN_RELE(cdir);
 864         if (rdir)
 865                 VN_RELE(rdir);
 866         if (cwd)
 867                 refstr_rele(cwd);
 868
 869         /*
 870          * task_rele() may ultimately cause the zone to go away (or
 871          * may cause the last user process in a zone to go away, which
 872          * signals zsched to go away).  So prior to this call, we must
 873          * no longer point at zsched.
 874          */
 875         t->t_procp = &p0;
 876
 877         kmem_free(lwpdir, lwpdir_sz * sizeof (lwpdir_t));
 878         kmem_free(tidhash, tidhash_sz * sizeof (tidhash_t));
 879         while (ret_tidhash != NULL) {
 880                 ret_tidhash_t *next = ret_tidhash->rth_next;
 881                 kmem_free(ret_tidhash->rth_tidhash,
 882                     ret_tidhash->rth_tidhash_sz * sizeof (tidhash_t));
 883                 kmem_free(ret_tidhash, sizeof (*ret_tidhash));
 884                 ret_tidhash = next;
 885         }
 886
 887         thread_exit();
 888         /* NOTREACHED */
 889 }
 890
 891 /*
 892  * Format siginfo structure for wait system calls.
 893  */
 894 void
 895 winfo(proc_t *pp, k_siginfo_t *ip, int waitflag)
 896 {
 897         ASSERT(MUTEX_HELD(&pidlock));
 898
 899         bzero(ip, sizeof (k_siginfo_t));
 900         ip->si_signo = SIGCLD;
 901         ip->si_code = pp->p_wcode;
 902         ip->si_pid = pp->p_pid;
 903         ip->si_ctid = PRCTID(pp);
 904         ip->si_zoneid = pp->p_zone->zone_id;
 905         ip->si_status = pp->p_wdata;
 906         ip->si_stime = pp->p_stime;
 907         ip->si_utime = pp->p_utime;
 908
 909         if (waitflag) {
 910                 pp->p_wcode = 0;
 911                 pp->p_wdata = 0;
 912                 pp->p_pidflag &= ~CLDPEND;
 913         }
 914 }
 915
 916 /*
 917  * Wait system call.
 918  * Search for a terminated (zombie) child,
 919  * finally lay it to rest, and collect its status.
 920  * Look also for stopped children,
 921  * and pass back status from them.
 922  */
 923 int
 924 waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
 925 {
 926         int found;
 927         proc_t *cp, *pp;
 928         int proc_gone;
 929         int waitflag = !(options & WNOWAIT);
 930
 931         /*
 932          * Obsolete flag, defined here only for binary compatibility
 933          * with old statically linked executables.  Delete this when
 934          * we no longer care about these old and broken applications.
 935          */
 936 #define _WNOCHLD        0400
 937         options &= ~_WNOCHLD;
 938
 939         if (options == 0 || (options & ~WOPTMASK))
 940                 return (EINVAL);
 941
 942         switch (idtype) {
 943         case P_PID:
 944         case P_PGID:
 945                 if (id < 0 || id >= maxpid)
 946                         return (EINVAL);
 947                 /* FALLTHROUGH */
 948         case P_ALL:
 949                 break;
 950         default:
 951                 return (EINVAL);
 952         }
 953
 954         pp = ttoproc(curthread);
 955
 956         /*
 957          * lock parent mutex so that sibling chain can be searched.
 958          */
 959         mutex_enter(&pidlock);
 960
 961         /*
 962          * if we are only looking for exited processes and child_ns list
 963          * is empty no reason to look at all children.
 964          */
 965         if (idtype == P_ALL &&
 966             (options & ~WNOWAIT) == (WNOHANG | WEXITED) &&
 967             pp->p_child_ns == NULL) {
 968                 if (pp->p_child) {
 969                         mutex_exit(&pidlock);
 970                         bzero(ip, sizeof (k_siginfo_t));
 971                         return (0);
 972                 }
 973                 mutex_exit(&pidlock);
 974                 return (ECHILD);
 975         }
 976
 977         while (pp->p_child != NULL) {
 978
 979                 proc_gone = 0;
 980
 981                 for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) {
 982                         if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID))
 983                                 continue;
 984                         if (idtype == P_PID && id != cp->p_pid)
 985                                 continue;
 986                         if (idtype == P_PGID && id != cp->p_pgrp)
 987                                 continue;
 988
 989                         switch (cp->p_wcode) {
 990
 991                         case CLD_TRAPPED:
 992                         case CLD_STOPPED:
 993                         case CLD_CONTINUED:
 994                                 cmn_err(CE_PANIC,
 995                                     "waitid: wrong state %d on the p_newstate"
 996                                     " list", cp->p_wcode);
 997                                 break;
 998
 999                         case CLD_EXITED:
1000                         case CLD_DUMPED:
1001                         case CLD_KILLED:
1002                                 if (!(options & WEXITED)) {
1003                                         /*
1004                                          * Count how many are already gone
1005                                          * for good.
1006                                          */
1007                                         proc_gone++;
1008                                         break;
1009                                 }
1010                                 if (!waitflag) {
1011                                         winfo(cp, ip, 0);
1012                                 } else {
1013                                         winfo(cp, ip, 1);
1014                                         freeproc(cp);
1015                                 }
1016                                 mutex_exit(&pidlock);
1017                                 if (waitflag) {         /* accept SIGCLD */
1018                                         sigcld_delete(ip);
1019                                         sigcld_repost();
1020                                 }
1021                                 return (0);
1022                         }
1023
1024                         if (idtype == P_PID)
1025                                 break;
1026                 }
1027
1028                 /*
1029                  * Wow! None of the threads on the p_sibling_ns list were
1030                  * interesting threads. Check all the kids!
1031                  */
1032                 found = 0;
1033                 for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) {
1034                         if (idtype == P_PID && id != cp->p_pid)
1035                                 continue;
1036                         if (idtype == P_PGID && id != cp->p_pgrp)
1037                                 continue;
1038
1039                         switch (cp->p_wcode) {
1040                         case CLD_TRAPPED:
1041                                 if (!(options & WTRAPPED))
1042                                         break;
1043                                 winfo(cp, ip, waitflag);
1044                                 mutex_exit(&pidlock);
1045                                 if (waitflag) {         /* accept SIGCLD */
1046                                         sigcld_delete(ip);
1047                                         sigcld_repost();
1048                                 }
1049                                 return (0);
1050
1051                         case CLD_STOPPED:
1052                                 if (!(options & WSTOPPED))
1053                                         break;
1054                                 /* Is it still stopped? */
1055                                 mutex_enter(&cp->p_lock);
1056                                 if (!jobstopped(cp)) {
1057                                         mutex_exit(&cp->p_lock);
1058                                         break;
1059                                 }
1060                                 mutex_exit(&cp->p_lock);
1061                                 winfo(cp, ip, waitflag);
1062                                 mutex_exit(&pidlock);
1063                                 if (waitflag) {         /* accept SIGCLD */
1064                                         sigcld_delete(ip);
1065                                         sigcld_repost();
1066                                 }
1067                                 return (0);
1068
1069                         case CLD_CONTINUED:
1070                                 if (!(options & WCONTINUED))
1071                                         break;
1072                                 winfo(cp, ip, waitflag);
1073                                 mutex_exit(&pidlock);
1074                                 if (waitflag) {         /* accept SIGCLD */
1075                                         sigcld_delete(ip);
1076                                         sigcld_repost();
1077                                 }
1078                                 return (0);
1079
1080                         case CLD_EXITED:
1081                         case CLD_DUMPED:
1082                         case CLD_KILLED:
1083                                 if (idtype != P_PID &&
1084                                     (cp->p_pidflag & CLDWAITPID))
1085                                         continue;
1086                                 /*
1087                                  * Don't complain if a process was found in
1088                                  * the first loop but we broke out of the loop
1089                                  * because of the arguments passed to us.
1090                                  */
1091                                 if (proc_gone == 0) {
1092                                         cmn_err(CE_PANIC,
1093                                             "waitid: wrong state on the"
1094                                             " p_child list");
1095                                 } else {
1096                                         break;
1097                                 }
1098                         }
1099
1100                         found++;
1101
1102                         if (idtype == P_PID)
1103                                 break;
1104                 }
1105
1106                 /*
1107                  * If we found no interesting processes at all,
1108                  * break out and return ECHILD.
1109                  */
1110                 if (found + proc_gone == 0)
1111                         break;
1112
1113                 if (options & WNOHANG) {
1114                         mutex_exit(&pidlock);
1115                         bzero(ip, sizeof (k_siginfo_t));
1116                         /* XXX: should set ip->si_signo = SIGCLD? */
1117                         return (0);
1118                 }
1119
1120                 /*
1121                  * If we found no processes of interest that could
1122                  * change state while we wait, we don't wait at all.
1123                  * Get out with ECHILD according to SVID.
1124                  */
1125                 if (found == proc_gone)
1126                         break;
1127
1128                 if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) {
1129                         mutex_exit(&pidlock);
1130                         return (EINTR);
1131                 }
1132         }
1133         mutex_exit(&pidlock);
1134         return (ECHILD);
1135 }
1136
1137 int
1138 waitsys(idtype_t idtype, id_t id, siginfo_t *infop, int options)
1139 {
1140         int error;
1141         k_siginfo_t info;
1142
1143         if (error = waitid(idtype, id, &info, options))
1144                 return (set_errno(error));
1145         if (copyout(&info, infop, sizeof (k_siginfo_t)))
1146                 return (set_errno(EFAULT));
1147         return (0);
1148 }
1149
1150 #ifdef _SYSCALL32_IMPL
1151
1152 int
1153 waitsys32(idtype_t idtype, id_t id, siginfo_t *infop, int options)
1154 {
1155         int error;
1156         k_siginfo_t info;
1157         siginfo32_t info32;
1158
1159         if (error = waitid(idtype, id, &info, options))
1160                 return (set_errno(error));
1161         siginfo_kto32(&info, &info32);
1162         if (copyout(&info32, infop, sizeof (info32)))
1163                 return (set_errno(EFAULT));
1164         return (0);
1165 }
1166
1167 #endif  /* _SYSCALL32_IMPL */
1168
1169 void
1170 proc_detach(proc_t *p)
1171 {
1172         proc_t *q;
1173
1174         ASSERT(MUTEX_HELD(&pidlock));
1175
1176         q = p->p_parent;
1177         ASSERT(q != NULL);
1178
1179         /*
1180          * Take it off the newstate list of its parent
1181          */
1182         delete_ns(q, p);
1183
1184         if (q->p_child == p) {
1185                 q->p_child = p->p_sibling;
1186                 /*
1187                  * If the parent has no children, it better not
1188                  * have any with new states either!
1189                  */
1190                 ASSERT(q->p_child ? 1 : q->p_child_ns == NULL);
1191         }
1192
1193         if (p->p_sibling) {
1194                 p->p_sibling->p_psibling = p->p_psibling;
1195         }
1196
1197         if (p->p_psibling) {
1198                 p->p_psibling->p_sibling = p->p_sibling;
1199         }
1200 }
1201
1202 /*
1203  * Remove zombie children from the process table.
1204  */
1205 void
1206 freeproc(proc_t *p)
1207 {
1208         proc_t *q;
1209         task_t *tk;
1210
1211         ASSERT(p->p_stat == SZOMB);
1212         ASSERT(p->p_tlist == NULL);
1213         ASSERT(MUTEX_HELD(&pidlock));
1214
1215         sigdelq(p, NULL, 0);
1216         if (p->p_killsqp) {
1217                 siginfofree(p->p_killsqp);
1218                 p->p_killsqp = NULL;
1219         }
1220
1221         prfree(p);      /* inform /proc */
1222
1223         /*
1224          * Don't free the init processes.
1225          * Other dying processes will access it.
1226          */
1227         if (p == proc_init)
1228                 return;
1229
1230
1231         /*
1232          * We wait until now to free the cred structure because a
1233          * zombie process's credentials may be examined by /proc.
1234          * No cred locking needed because there are no threads at this point.
1235          */
1236         upcount_dec(crgetruid(p->p_cred), crgetzoneid(p->p_cred));
1237         crfree(p->p_cred);
1238         if (p->p_corefile != NULL) {
1239                 corectl_path_rele(p->p_corefile);
1240                 p->p_corefile = NULL;
1241         }
1242         if (p->p_content != NULL) {
1243                 corectl_content_rele(p->p_content);
1244                 p->p_content = NULL;
1245         }
1246
1247         if (p->p_nextofkin && !((p->p_nextofkin->p_flag & SNOWAIT) ||
1248             (PTOU(p->p_nextofkin)->u_signal[SIGCLD - 1] == SIG_IGN))) {
1249                 /*
1250                  * This should still do the right thing since p_utime/stime
1251                  * get set to the correct value on process exit, so it
1252                  * should get properly updated
1253                  */
1254                 p->p_nextofkin->p_cutime += p->p_utime;
1255                 p->p_nextofkin->p_cstime += p->p_stime;
1256
1257                 p->p_nextofkin->p_cacct[LMS_USER] += p->p_acct[LMS_USER];
1258                 p->p_nextofkin->p_cacct[LMS_SYSTEM] += p->p_acct[LMS_SYSTEM];
1259                 p->p_nextofkin->p_cacct[LMS_TRAP] += p->p_acct[LMS_TRAP];
1260                 p->p_nextofkin->p_cacct[LMS_TFAULT] += p->p_acct[LMS_TFAULT];
1261                 p->p_nextofkin->p_cacct[LMS_DFAULT] += p->p_acct[LMS_DFAULT];
1262                 p->p_nextofkin->p_cacct[LMS_KFAULT] += p->p_acct[LMS_KFAULT];
1263                 p->p_nextofkin->p_cacct[LMS_USER_LOCK]
1264                     += p->p_acct[LMS_USER_LOCK];
1265                 p->p_nextofkin->p_cacct[LMS_SLEEP] += p->p_acct[LMS_SLEEP];
1266                 p->p_nextofkin->p_cacct[LMS_WAIT_CPU]
1267                     += p->p_acct[LMS_WAIT_CPU];
1268                 p->p_nextofkin->p_cacct[LMS_STOPPED] += p->p_acct[LMS_STOPPED];
1269
1270                 p->p_nextofkin->p_cru.minflt    += p->p_ru.minflt;
1271                 p->p_nextofkin->p_cru.majflt    += p->p_ru.majflt;
1272                 p->p_nextofkin->p_cru.nswap     += p->p_ru.nswap;
1273                 p->p_nextofkin->p_cru.inblock   += p->p_ru.inblock;
1274                 p->p_nextofkin->p_cru.oublock   += p->p_ru.oublock;
1275                 p->p_nextofkin->p_cru.msgsnd    += p->p_ru.msgsnd;
1276                 p->p_nextofkin->p_cru.msgrcv    += p->p_ru.msgrcv;
1277                 p->p_nextofkin->p_cru.nsignals  += p->p_ru.nsignals;
1278                 p->p_nextofkin->p_cru.nvcsw     += p->p_ru.nvcsw;
1279                 p->p_nextofkin->p_cru.nivcsw    += p->p_ru.nivcsw;
1280                 p->p_nextofkin->p_cru.sysc      += p->p_ru.sysc;
1281                 p->p_nextofkin->p_cru.ioch      += p->p_ru.ioch;
1282
1283         }
1284
1285         q = p->p_nextofkin;
1286         if (q && q->p_orphan == p)
1287                 q->p_orphan = p->p_nextorph;
1288         else if (q) {
1289                 for (q = q->p_orphan; q; q = q->p_nextorph)
1290                         if (q->p_nextorph == p)
1291                                 break;
1292                 ASSERT(q && q->p_nextorph == p);
1293                 q->p_nextorph = p->p_nextorph;
1294         }
1295
1296         /*
1297          * The process table slot is being freed, so it is now safe to give up
1298          * task and project membership.
1299          */
1300         mutex_enter(&p->p_lock);
1301         tk = p->p_task;
1302         task_detach(p);
1303         mutex_exit(&p->p_lock);
1304
1305         proc_detach(p);
1306         pid_exit(p, tk);        /* frees pid and proc structure */
1307
1308         task_rele(tk);
1309 }
1310
1311 /*
1312  * Delete process "child" from the newstate list of process "parent"
1313  */
1314 void
1315 delete_ns(proc_t *parent, proc_t *child)
1316 {
1317         proc_t **ns;
1318
1319         ASSERT(MUTEX_HELD(&pidlock));
1320         ASSERT(child->p_parent == parent);
1321         for (ns = &parent->p_child_ns; *ns != NULL; ns = &(*ns)->p_sibling_ns) {
1322                 if (*ns == child) {
1323
1324                         ASSERT((*ns)->p_parent == parent);
1325
1326                         *ns = child->p_sibling_ns;
1327                         child->p_sibling_ns = NULL;
1328                         return;
1329                 }
1330         }
1331 }
1332
1333 /*
1334  * Add process "child" to the new state list of process "parent"
1335  */
1336 void
1337 add_ns(proc_t *parent, proc_t *child)
1338 {
1339         ASSERT(child->p_sibling_ns == NULL);
1340         child->p_sibling_ns = parent->p_child_ns;
1341         parent->p_child_ns = child;
1342 }