kernel/disp/thread.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/sysmacros.h>
30 #include <sys/signal.h>
31 #include <sys/stack.h>
32 #include <sys/pcb.h>
33 #include <sys/user.h>
34 #include <sys/systm.h>
35 #include <sys/sysinfo.h>
36 #include <sys/errno.h>
37 #include <sys/cmn_err.h>
38 #include <sys/cred.h>
39 #include <sys/resource.h>
40 #include <sys/task.h>
41 #include <sys/project.h>
42 #include <sys/proc.h>
43 #include <sys/debug.h>
44 #include <sys/disp.h>
45 #include <sys/class.h>
46 #include <vm/seg_kmem.h>
47 #include <vm/seg_kp.h>
48 #include <sys/machlock.h>
49 #include <sys/kmem.h>
50 #include <sys/varargs.h>
51 #include <sys/turnstile.h>
52 #include <sys/poll.h>
53 #include <sys/vtrace.h>
54 #include <sys/callb.h>
55 #include <sys/tnf.h>
56 #include <sys/sobject.h>
57 #include <sys/cpupart.h>
58 #include <sys/pset.h>
59 #include <sys/door.h>
60 #include <sys/spl.h>
61 #include <sys/copyops.h>
62 #include <sys/rctl.h>
63 #include <sys/brand.h>
64 #include <sys/pool.h>
65 #include <sys/zone.h>
66 #include <sys/cpc_impl.h>
67 #include <sys/sdt.h>
68 #include <sys/reboot.h>
69 #include <sys/kdi.h>
70 #include <sys/schedctl.h>
71 #include <sys/waitq.h>
72 #include <sys/cpucaps.h>
73 #include <sys/kiconv.h>
75 struct kmem_cache *thread_cache; /* cache of free threads */
76 struct kmem_cache *lwp_cache; /* cache of free lwps */
77 struct kmem_cache *turnstile_cache; /* cache of free turnstiles */
80 * allthreads is only for use by kmem_readers. All kernel loops can use
81 * the current thread as a start/end point.
83 kthread_t *allthreads = &t0; /* circular list of all threads */
85 static kcondvar_t reaper_cv; /* synchronization var */
86 kthread_t *thread_deathrow; /* circular list of reapable threads */
87 kthread_t *lwp_deathrow; /* circular list of reapable threads */
88 kmutex_t reaplock; /* protects lwp and thread deathrows */
89 int thread_reapcnt = 0; /* number of threads on deathrow */
90 int lwp_reapcnt = 0; /* number of lwps on deathrow */
91 int reaplimit = 16; /* delay reaping until reaplimit */
93 thread_free_lock_t *thread_free_lock;
94 /* protects tick thread from reaper */
96 extern int nthread;
98 /* System Scheduling classes. */
99 id_t syscid; /* system scheduling class ID */
100 id_t sysdccid = CLASS_UNUSED; /* reset when SDC loads */
102 void *segkp_thread; /* cookie for segkp pool */
104 int lwp_cache_sz = 32;
105 int t_cache_sz = 8;
106 static kt_did_t next_t_id = 1;
108 /* Default mode for thread binding to CPUs and processor sets */
109 int default_binding_mode = TB_ALLHARD;
112 * Min/Max stack sizes for stack size parameters
114 #define MAX_STKSIZE (32 * DEFAULTSTKSZ)
115 #define MIN_STKSIZE DEFAULTSTKSZ
118 * default_stksize overrides lwp_default_stksize if it is set.
120 int default_stksize;
121 int lwp_default_stksize;
123 static zone_key_t zone_thread_key;
125 unsigned int kmem_stackinfo; /* stackinfo feature on-off */
126 kmem_stkinfo_t *kmem_stkinfo_log; /* stackinfo circular log */
127 static kmutex_t kmem_stkinfo_lock; /* protects kmem_stkinfo_log */
130 * forward declarations for internal thread specific data (tsd)
132 static void *tsd_realloc(void *, size_t, size_t);
134 void thread_reaper(void);
136 /* forward declarations for stackinfo feature */
137 static void stkinfo_begin(kthread_t *);
138 static void stkinfo_end(kthread_t *);
139 static size_t stkinfo_percent(caddr_t, caddr_t, caddr_t);
141 /*ARGSUSED*/
142 static int
143 turnstile_constructor(void *buf, void *cdrarg, int kmflags)
145 bzero(buf, sizeof (turnstile_t));
146 return (0);
149 /*ARGSUSED*/
150 static void
151 turnstile_destructor(void *buf, void *cdrarg)
153 turnstile_t *ts = buf;
155 ASSERT(ts->ts_free == NULL);
156 ASSERT(ts->ts_waiters == 0);
157 ASSERT(ts->ts_inheritor == NULL);
158 ASSERT(ts->ts_sleepq[0].sq_first == NULL);
159 ASSERT(ts->ts_sleepq[1].sq_first == NULL);
162 void
163 thread_init(void)
165 kthread_t *tp;
166 extern char sys_name[];
167 extern void idle();
168 struct cpu *cpu = CPU;
169 int i;
170 kmutex_t *lp;
172 mutex_init(&reaplock, NULL, MUTEX_SPIN, (void *)ipltospl(DISP_LEVEL));
173 thread_free_lock =
174 kmem_alloc(sizeof (thread_free_lock_t) * THREAD_FREE_NUM, KM_SLEEP);
175 for (i = 0; i < THREAD_FREE_NUM; i++) {
176 lp = &thread_free_lock[i].tf_lock;
177 mutex_init(lp, NULL, MUTEX_DEFAULT, NULL);
180 #if defined(__i386) || defined(__amd64)
181 thread_cache = kmem_cache_create("thread_cache", sizeof (kthread_t),
182 PTR24_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
185 * "struct _klwp" includes a "struct pcb", which includes a
186 * "struct fpu", which needs to be 64-byte aligned on amd64
187 * (and even on i386) for xsave/xrstor.
189 lwp_cache = kmem_cache_create("lwp_cache", sizeof (klwp_t),
190 64, NULL, NULL, NULL, NULL, NULL, 0);
191 #else
193 * Allocate thread structures from static_arena. This prevents
194 * issues where a thread tries to relocate its own thread
195 * structure and touches it after the mapping has been suspended.
197 thread_cache = kmem_cache_create("thread_cache", sizeof (kthread_t),
198 PTR24_ALIGN, NULL, NULL, NULL, NULL, static_arena, 0);
200 lwp_stk_cache_init();
202 lwp_cache = kmem_cache_create("lwp_cache", sizeof (klwp_t),
203 0, NULL, NULL, NULL, NULL, NULL, 0);
204 #endif
206 turnstile_cache = kmem_cache_create("turnstile_cache",
207 sizeof (turnstile_t), 0,
208 turnstile_constructor, turnstile_destructor, NULL, NULL, NULL, 0);
210 cred_init();
213 * Initialize various resource management facilities.
215 rctl_init();
216 cpucaps_init();
218 * Zone_init() should be called before project_init() so that project ID
219 * for the first project is initialized correctly.
221 zone_init();
222 project_init();
223 brand_init();
224 kiconv_init();
225 task_init();
226 pool_init();
228 curthread->t_ts = kmem_cache_alloc(turnstile_cache, KM_SLEEP);
231 * Originally, we had two parameters to set default stack
232 * size: one for lwp's (lwp_default_stksize), and one for
233 * kernel-only threads (DEFAULTSTKSZ, a.k.a. _defaultstksz).
234 * Now we have a third parameter that overrides both if it is
235 * set to a legal stack size, called default_stksize.
238 if (default_stksize == 0) {
239 default_stksize = DEFAULTSTKSZ;
240 } else if (default_stksize % PAGESIZE != 0 ||
241 default_stksize > MAX_STKSIZE ||
242 default_stksize < MIN_STKSIZE) {
243 cmn_err(CE_WARN, "Illegal stack size. Using %d",
244 (int)DEFAULTSTKSZ);
245 default_stksize = DEFAULTSTKSZ;
246 } else {
247 lwp_default_stksize = default_stksize;
250 if (lwp_default_stksize == 0) {
251 lwp_default_stksize = default_stksize;
252 } else if (lwp_default_stksize % PAGESIZE != 0 ||
253 lwp_default_stksize > MAX_STKSIZE ||
254 lwp_default_stksize < MIN_STKSIZE) {
255 cmn_err(CE_WARN, "Illegal stack size. Using %d",
256 default_stksize);
257 lwp_default_stksize = default_stksize;
260 segkp_lwp = segkp_cache_init(segkp, lwp_cache_sz,
261 lwp_default_stksize,
262 (KPD_NOWAIT | KPD_HASREDZONE | KPD_LOCKED));
264 segkp_thread = segkp_cache_init(segkp, t_cache_sz,
265 default_stksize, KPD_HASREDZONE | KPD_LOCKED | KPD_NO_ANON);
267 (void) getcid(sys_name, &syscid);
268 curthread->t_cid = syscid; /* current thread is t0 */
271 * Set up the first CPU's idle thread.
272 * It runs whenever the CPU has nothing worthwhile to do.
274 tp = thread_create(NULL, 0, idle, NULL, 0, &p0, TS_STOPPED, -1);
275 cpu->cpu_idle_thread = tp;
276 tp->t_preempt = 1;
277 tp->t_disp_queue = cpu->cpu_disp;
278 ASSERT(tp->t_disp_queue != NULL);
279 tp->t_bound_cpu = cpu;
280 tp->t_affinitycnt = 1;
283 * Registering a thread in the callback table is usually
284 * done in the initialization code of the thread. In this
285 * case, we do it right after thread creation to avoid
286 * blocking the idle thread while it registers itself. It also
287 * avoids the possibility of reregistration in case a CPU
288 * restarts its idle thread.
290 CALLB_CPR_INIT_SAFE(tp, "idle");
293 * Create the thread_reaper daemon. From this point on, exited
294 * threads will get reaped.
296 (void) thread_create(NULL, 0, (void (*)())thread_reaper,
297 NULL, 0, &p0, TS_RUN, minclsyspri);
300 * Finish initializing the kernel memory allocator now that
301 * thread_create() is available.
303 kmem_thread_init();
305 if (boothowto & RB_DEBUG)
306 kdi_dvec_thravail();
310 * Create a thread.
312 * thread_create() blocks for memory if necessary. It never fails.
314 * If stk is NULL, the thread is created at the base of the stack
315 * and cannot be swapped.
317 kthread_t *
318 thread_create(
319 caddr_t stk,
320 size_t stksize,
321 void (*proc)(),
322 void *arg,
323 size_t len,
324 proc_t *pp,
325 int state,
326 pri_t pri)
328 kthread_t *t;
329 extern struct classfuncs sys_classfuncs;
330 turnstile_t *ts;
333 * Every thread keeps a turnstile around in case it needs to block.
334 * The only reason the turnstile is not simply part of the thread
335 * structure is that we may have to break the association whenever
336 * more than one thread blocks on a given synchronization object.
337 * From a memory-management standpoint, turnstiles are like the
338 * "attached mblks" that hang off dblks in the streams allocator.
340 ts = kmem_cache_alloc(turnstile_cache, KM_SLEEP);
342 if (stk == NULL) {
344 * alloc both thread and stack in segkp chunk
347 if (stksize < default_stksize)
348 stksize = default_stksize;
350 if (stksize == default_stksize) {
351 stk = (caddr_t)segkp_cache_get(segkp_thread);
352 } else {
353 stksize = roundup(stksize, PAGESIZE);
354 stk = (caddr_t)segkp_get(segkp, stksize,
355 (KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED));
358 ASSERT(stk != NULL);
361 * The machine-dependent mutex code may require that
362 * thread pointers (since they may be used for mutex owner
363 * fields) have certain alignment requirements.
364 * PTR24_ALIGN is the size of the alignment quanta.
365 * XXX - assumes stack grows toward low addresses.
367 if (stksize <= sizeof (kthread_t) + PTR24_ALIGN)
368 cmn_err(CE_PANIC, "thread_create: proposed stack size"
369 " too small to hold thread.");
370 #ifdef STACK_GROWTH_DOWN
371 stksize -= SA(sizeof (kthread_t) + PTR24_ALIGN - 1);
372 stksize &= -PTR24_ALIGN; /* make thread aligned */
373 t = (kthread_t *)(stk + stksize);
374 bzero(t, sizeof (kthread_t));
375 t->t_stk = stk + stksize;
376 t->t_stkbase = stk;
377 #else /* stack grows to larger addresses */
378 stksize -= SA(sizeof (kthread_t));
379 t = (kthread_t *)(stk);
380 bzero(t, sizeof (kthread_t));
381 t->t_stk = stk + sizeof (kthread_t);
382 t->t_stkbase = stk + stksize + sizeof (kthread_t);
383 #endif /* STACK_GROWTH_DOWN */
384 t->t_flag |= T_TALLOCSTK;
385 t->t_swap = stk;
386 } else {
387 t = kmem_cache_alloc(thread_cache, KM_SLEEP);
388 bzero(t, sizeof (kthread_t));
389 ASSERT(((uintptr_t)t & (PTR24_ALIGN - 1)) == 0);
391 * Initialize t_stk to the kernel stack pointer to use
392 * upon entry to the kernel
394 #ifdef STACK_GROWTH_DOWN
395 t->t_stk = stk + stksize;
396 t->t_stkbase = stk;
397 #else
398 t->t_stk = stk; /* 3b2-like */
399 t->t_stkbase = stk + stksize;
400 #endif /* STACK_GROWTH_DOWN */
403 if (kmem_stackinfo != 0) {
404 stkinfo_begin(t);
407 t->t_ts = ts;
410 * p_cred could be NULL if thread_create() is called before cred_init()
411 * is called in main().
413 mutex_enter(&pp->p_crlock);
414 if (pp->p_cred)
415 crhold(t->t_cred = pp->p_cred);
416 mutex_exit(&pp->p_crlock);
417 t->t_start = gethrestime_sec();
418 t->t_startpc = proc;
419 t->t_procp = pp;
420 t->t_clfuncs = &sys_classfuncs.thread;
421 t->t_cid = syscid;
422 t->t_pri = pri;
423 t->t_schedflag = 0;
424 t->t_bind_cpu = PBIND_NONE;
425 t->t_bindflag = (uchar_t)default_binding_mode;
426 t->t_bind_pset = PS_NONE;
427 t->t_plockp = &pp->p_lock;
428 t->t_copyops = NULL;
429 t->t_taskq = NULL;
430 t->t_anttime = 0;
431 t->t_hatdepth = 0;
433 t->t_dtrace_vtime = 1; /* assure vtimestamp is always non-zero */
435 CPU_STATS_ADDQ(CPU, sys, nthreads, 1);
436 #ifndef NPROBE
437 /* Kernel probe */
438 tnf_thread_create(t);
439 #endif /* NPROBE */
440 LOCK_INIT_CLEAR(&t->t_lock);
443 * Callers who give us a NULL proc must do their own
444 * stack initialization. e.g. lwp_create()
446 if (proc != NULL) {
447 t->t_stk = thread_stk_init(t->t_stk);
448 thread_load(t, proc, arg, len);
452 * Put a hold on project0. If this thread is actually in a
453 * different project, then t_proj will be changed later in
454 * lwp_create(). All kernel-only threads must be in project 0.
456 t->t_proj = project_hold(proj0p);
458 lgrp_affinity_init(&t->t_lgrp_affinity);
460 mutex_enter(&pidlock);
461 nthread++;
462 t->t_did = next_t_id++;
463 t->t_prev = curthread->t_prev;
464 t->t_next = curthread;
467 * Add the thread to the list of all threads, and initialize
468 * its t_cpu pointer. We need to block preemption since
469 * cpu_offline walks the thread list looking for threads
470 * with t_cpu pointing to the CPU being offlined. We want
471 * to make sure that the list is consistent and that if t_cpu
472 * is set, the thread is on the list.
474 kpreempt_disable();
475 curthread->t_prev->t_next = t;
476 curthread->t_prev = t;
479 * Threads should never have a NULL t_cpu pointer so assign it
480 * here. If the thread is being created with state TS_RUN a
481 * better CPU may be chosen when it is placed on the run queue.
483 * We need to keep kernel preemption disabled when setting all
484 * three fields to keep them in sync. Also, always create in
485 * the default partition since that's where kernel threads go
486 * (if this isn't a kernel thread, t_cpupart will be changed
487 * in lwp_create before setting the thread runnable).
489 t->t_cpupart = &cp_default;
492 * For now, affiliate this thread with the root lgroup.
493 * Since the kernel does not (presently) allocate its memory
494 * in a locality aware fashion, the root is an appropriate home.
495 * If this thread is later associated with an lwp, it will have
496 * its lgroup re-assigned at that time.
498 lgrp_move_thread(t, &cp_default.cp_lgrploads[LGRP_ROOTID], 1);
501 * Inherit the current cpu. If this cpu isn't part of the chosen
502 * lgroup, a new cpu will be chosen by cpu_choose when the thread
503 * is ready to run.
505 if (CPU->cpu_part == &cp_default)
506 t->t_cpu = CPU;
507 else
508 t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t->t_lpl,
509 t->t_pri, NULL);
511 t->t_disp_queue = t->t_cpu->cpu_disp;
512 kpreempt_enable();
515 * Initialize thread state and the dispatcher lock pointer.
516 * Need to hold onto pidlock to block allthreads walkers until
517 * the state is set.
519 switch (state) {
520 case TS_RUN:
521 curthread->t_oldspl = splhigh(); /* get dispatcher spl */
522 THREAD_SET_STATE(t, TS_STOPPED, &transition_lock);
523 CL_SETRUN(t);
524 thread_unlock(t);
525 break;
527 case TS_ONPROC:
528 THREAD_ONPROC(t, t->t_cpu);
529 break;
531 case TS_FREE:
533 * Free state will be used for intr threads.
534 * The interrupt routine must set the thread dispatcher
535 * lock pointer (t_lockp) if starting on a CPU
536 * other than the current one.
538 THREAD_FREEINTR(t, CPU);
539 break;
541 case TS_STOPPED:
542 THREAD_SET_STATE(t, TS_STOPPED, &stop_lock);
543 break;
545 default: /* TS_SLEEP, TS_ZOMB or TS_TRANS */
546 cmn_err(CE_PANIC, "thread_create: invalid state %d", state);
548 mutex_exit(&pidlock);
549 return (t);
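/*
 * Usage sketch (hypothetical caller, not part of the original code): a
 * subsystem typically creates a kernel-only daemon thread the same way
 * thread_init() creates the thread_reaper above: no caller-supplied stack,
 * owned by p0, and started in the TS_RUN state.  my_daemon() is an assumed
 * worker function.
 */
#if 0
static void
my_daemon(void)
{
	for (;;) {
		/* periodic housekeeping for the hypothetical subsystem */
		ddi_sleep(1);
	}
}

static void
my_daemon_start(void)
{
	(void) thread_create(NULL, 0, (void (*)())my_daemon,
	    NULL, 0, &p0, TS_RUN, minclsyspri);
}
#endif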
553 * Move thread to project0 and take care of project reference counters.
555 void
556 thread_rele(kthread_t *t)
558 kproject_t *kpj;
560 thread_lock(t);
562 ASSERT(t == curthread || t->t_state == TS_FREE || t->t_procp == &p0);
563 kpj = ttoproj(t);
564 t->t_proj = proj0p;
566 thread_unlock(t);
568 if (kpj != proj0p) {
569 project_rele(kpj);
570 (void) project_hold(proj0p);
574 void
575 thread_exit(void)
577 kthread_t *t = curthread;
579 if ((t->t_proc_flag & TP_ZTHREAD) != 0)
580 cmn_err(CE_PANIC, "thread_exit: zthread_exit() not called");
582 tsd_exit(); /* Clean up this thread's TSD */
584 kcpc_passivate(); /* clean up performance counter state */
587 * No kernel thread should have called poll() without arranging
588 * to call pollcleanup() here.
590 ASSERT(t->t_pollstate == NULL);
591 ASSERT(t->t_schedctl == NULL);
592 if (t->t_door)
593 door_slam(); /* in case thread did an upcall */
595 #ifndef NPROBE
596 /* Kernel probe */
597 if (t->t_tnf_tpdp)
598 tnf_thread_exit();
599 #endif /* NPROBE */
601 thread_rele(t);
602 t->t_preempt++;
605 * remove thread from the all threads list so that
606 * death-row can use the same pointers.
608 mutex_enter(&pidlock);
609 t->t_next->t_prev = t->t_prev;
610 t->t_prev->t_next = t->t_next;
611 ASSERT(allthreads != t); /* t0 never exits */
612 cv_broadcast(&t->t_joincv); /* wake up anyone in thread_join */
613 mutex_exit(&pidlock);
615 if (t->t_ctx != NULL)
616 exitctx(t);
617 if (t->t_procp->p_pctx != NULL)
618 exitpctx(t->t_procp);
620 if (kmem_stackinfo != 0) {
621 stkinfo_end(t);
624 t->t_state = TS_ZOMB; /* set zombie thread */
626 swtch_from_zombie(); /* give up the CPU */
627 /* NOTREACHED */
631 * Check to see if the specified thread is active (defined as being on
632 * the thread list). This is certainly a slow way to do this; if there's
633 * ever a reason to speed it up, we could maintain a hash table of active
634 * threads indexed by their t_did.
636 static kthread_t *
637 did_to_thread(kt_did_t tid)
639 kthread_t *t;
641 ASSERT(MUTEX_HELD(&pidlock));
642 for (t = curthread->t_next; t != curthread; t = t->t_next) {
643 if (t->t_did == tid)
644 break;
646 if (t->t_did == tid)
647 return (t);
648 else
649 return (NULL);
653 * Wait for specified thread to exit. Returns immediately if the thread
654 * could not be found, meaning that it has either already exited or never
655 * existed.
657 void
658 thread_join(kt_did_t tid)
660 kthread_t *t;
662 ASSERT(tid != curthread->t_did);
663 ASSERT(tid != t0.t_did);
665 mutex_enter(&pidlock);
667 * Make sure we check that the thread is on the thread list
668 * before blocking on it; otherwise we could end up blocking on
669 * a cv that's already been freed. In other words, don't cache
670 * the thread pointer across calls to cv_wait.
672 * The choice of loop invariant means that whenever a thread
673 * is taken off the allthreads list, a cv_broadcast must be
674 * performed on that thread's t_joincv to wake up any waiters.
675 * The broadcast doesn't have to happen right away, but it
676 * shouldn't be postponed indefinitely (e.g., by doing it in
677 * thread_free which may only be executed when the deathrow
678 * queue is processed).
680 while (t = did_to_thread(tid))
681 cv_wait(&t->t_joincv, &pidlock);
682 mutex_exit(&pidlock);
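/*
 * Usage sketch (assumed caller): to wait for a helper thread to finish,
 * record its t_did right after creation and hand that ID to thread_join().
 * The helper calls thread_exit() when done; my_worker() is hypothetical.
 */
#if 0
static void
my_worker(void)
{
	/* ... perform the one-shot work ... */
	thread_exit();
}

static void
my_run_and_wait(void)
{
	kt_did_t tid;

	tid = thread_create(NULL, 0, (void (*)())my_worker,
	    NULL, 0, &p0, TS_RUN, minclsyspri)->t_did;
	thread_join(tid);	/* returns once my_worker() has exited */
}
#endif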
685 void
686 thread_free_prevent(kthread_t *t)
688 kmutex_t *lp;
690 lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
691 mutex_enter(lp);
694 void
695 thread_free_allow(kthread_t *t)
697 kmutex_t *lp;
699 lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
700 mutex_exit(lp);
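/*
 * Usage sketch (assumed consumer): code that inspects a thread it does not
 * otherwise hold, such as the tick accounting mentioned at thread_free()
 * below, brackets the access with thread_free_prevent()/thread_free_allow()
 * so the thread cannot be freed out from under it.  my_examine_thread() is
 * hypothetical.
 */
#if 0
static void
my_examine_thread(kthread_t *t)
{
	thread_free_prevent(t);
	/* t cannot be thread_free()d until thread_free_allow() below */
	/* ... read the fields of interest from t ... */
	thread_free_allow(t);
}
#endif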
703 static void
704 thread_free_barrier(kthread_t *t)
706 kmutex_t *lp;
708 lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
709 mutex_enter(lp);
710 mutex_exit(lp);
713 void
714 thread_free(kthread_t *t)
716 boolean_t allocstk = (t->t_flag & T_TALLOCSTK);
717 klwp_t *lwp = t->t_lwp;
718 caddr_t swap = t->t_swap;
720 ASSERT(t != &t0 && t->t_state == TS_FREE);
721 ASSERT(t->t_door == NULL);
722 ASSERT(t->t_schedctl == NULL);
723 ASSERT(t->t_pollstate == NULL);
725 t->t_pri = 0;
726 t->t_pc = 0;
727 t->t_sp = 0;
728 t->t_wchan0 = NULL;
729 t->t_wchan = NULL;
730 if (t->t_cred != NULL) {
731 crfree(t->t_cred);
732 t->t_cred = 0;
734 if (t->t_pdmsg) {
735 kmem_free(t->t_pdmsg, strlen(t->t_pdmsg) + 1);
736 t->t_pdmsg = NULL;
738 #ifndef NPROBE
739 if (t->t_tnf_tpdp)
740 tnf_thread_free(t);
741 #endif /* NPROBE */
742 if (t->t_cldata) {
743 CL_EXITCLASS(t->t_cid, (caddr_t *)t->t_cldata);
745 if (t->t_rprof != NULL) {
746 kmem_free(t->t_rprof, sizeof (*t->t_rprof));
747 t->t_rprof = NULL;
749 t->t_lockp = NULL; /* nothing should try to lock this thread now */
750 if (lwp)
751 lwp_freeregs(lwp, 0);
752 if (t->t_ctx)
753 freectx(t, 0);
754 t->t_stk = NULL;
755 if (lwp)
756 lwp_stk_fini(lwp);
757 lock_clear(&t->t_lock);
759 if (t->t_ts->ts_waiters > 0)
760 panic("thread_free: turnstile still active");
762 kmem_cache_free(turnstile_cache, t->t_ts);
764 free_afd(&t->t_activefd);
767 * Barrier for the tick accounting code. The tick accounting code
768 * holds this lock to keep the thread from going away while it's
769 * looking at it.
771 thread_free_barrier(t);
773 ASSERT(ttoproj(t) == proj0p);
774 project_rele(ttoproj(t));
776 lgrp_affinity_free(&t->t_lgrp_affinity);
778 mutex_enter(&pidlock);
779 nthread--;
780 mutex_exit(&pidlock);
783 * Free thread, lwp and stack. This needs to be done carefully, since
784 * if T_TALLOCSTK is set, the thread is part of the stack.
786 t->t_lwp = NULL;
787 t->t_swap = NULL;
789 if (swap) {
790 segkp_release(segkp, swap);
792 if (lwp) {
793 kmem_cache_free(lwp_cache, lwp);
795 if (!allocstk) {
796 kmem_cache_free(thread_cache, t);
801 * Removes threads associated with the given zone from a deathrow queue.
802 * tp is a pointer to the head of the deathrow queue, and countp is a
803 * pointer to the current deathrow count. Returns a linked list of
804 * threads removed from the list.
806 static kthread_t *
807 thread_zone_cleanup(kthread_t **tp, int *countp, zoneid_t zoneid)
809 kthread_t *tmp, *list = NULL;
810 cred_t *cr;
812 ASSERT(MUTEX_HELD(&reaplock));
813 while (*tp != NULL) {
814 if ((cr = (*tp)->t_cred) != NULL && crgetzoneid(cr) == zoneid) {
815 tmp = *tp;
816 *tp = tmp->t_forw;
817 tmp->t_forw = list;
818 list = tmp;
819 (*countp)--;
820 } else {
821 tp = &(*tp)->t_forw;
824 return (list);
827 static void
828 thread_reap_list(kthread_t *t)
830 kthread_t *next;
832 while (t != NULL) {
833 next = t->t_forw;
834 thread_free(t);
835 t = next;
839 /* ARGSUSED */
840 static void
841 thread_zone_destroy(zoneid_t zoneid, void *unused)
843 kthread_t *t, *l;
845 mutex_enter(&reaplock);
847 * Pull threads and lwps associated with zone off deathrow lists.
849 t = thread_zone_cleanup(&thread_deathrow, &thread_reapcnt, zoneid);
850 l = thread_zone_cleanup(&lwp_deathrow, &lwp_reapcnt, zoneid);
851 mutex_exit(&reaplock);
854 * Guard against race condition in mutex_owner_running:
855 * thread=owner(mutex)
856 * <interrupt>
857 * thread exits mutex
858 * thread exits
859 * thread reaped
860 * thread struct freed
861 * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
862 * A cross call to all cpus will cause the interrupt handler
863 * to reset the PC if it is in mutex_owner_running, refreshing
864 * stale thread pointers.
866 mutex_sync(); /* sync with mutex code */
869 * Reap threads
871 thread_reap_list(t);
874 * Reap lwps
876 thread_reap_list(l);
880 * cleanup zombie threads that are on deathrow.
882 void
883 thread_reaper()
885 kthread_t *t, *l;
886 callb_cpr_t cprinfo;
889 * Register callback to clean up threads when zone is destroyed.
891 zone_key_create(&zone_thread_key, NULL, NULL, thread_zone_destroy);
893 CALLB_CPR_INIT(&cprinfo, &reaplock, callb_generic_cpr, "t_reaper");
894 for (;;) {
895 mutex_enter(&reaplock);
896 while (thread_deathrow == NULL && lwp_deathrow == NULL) {
897 CALLB_CPR_SAFE_BEGIN(&cprinfo);
898 cv_wait(&reaper_cv, &reaplock);
899 CALLB_CPR_SAFE_END(&cprinfo, &reaplock);
902 * mutex_sync() needs to be called when reaping, but
903 * not too often. We limit reaping rate to once
904 * per second. Reaplimit is max rate at which threads can
905 * be freed. Does not impact thread destruction/creation.
907 t = thread_deathrow;
908 l = lwp_deathrow;
909 thread_deathrow = NULL;
910 lwp_deathrow = NULL;
911 thread_reapcnt = 0;
912 lwp_reapcnt = 0;
913 mutex_exit(&reaplock);
916 * Guard against race condition in mutex_owner_running:
917 * thread=owner(mutex)
918 * <interrupt>
919 * thread exits mutex
920 * thread exits
921 * thread reaped
922 * thread struct freed
923 * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
924 * A cross call to all cpus will cause the interrupt handler
925 * to reset the PC if it is in mutex_owner_running, refreshing
926 * stale thread pointers.
928 mutex_sync(); /* sync with mutex code */
930 * Reap threads
932 thread_reap_list(t);
935 * Reap lwps
937 thread_reap_list(l);
938 ddi_sleep(1);
943 * This is called by lwp_create(), etc., to put an lwp_deathrow thread onto
944 * thread_deathrow. The thread's state has already been changed to TS_FREE
945 * to indicate that it is reapable. The thread already holds the reaplock,
946 * and was already freed.
948 void
949 reapq_move_lq_to_tq(kthread_t *t)
951 ASSERT(t->t_state == TS_FREE);
952 ASSERT(MUTEX_HELD(&reaplock));
953 t->t_forw = thread_deathrow;
954 thread_deathrow = t;
955 thread_reapcnt++;
956 if (lwp_reapcnt + thread_reapcnt > reaplimit)
957 cv_signal(&reaper_cv); /* wake the reaper */
961 * This is called by resume() to put a zombie thread onto deathrow.
962 * The thread's state is changed to TS_FREE to indicate that it is reapable.
963 * This is called from the idle thread so it must not block - just spin.
965 void
966 reapq_add(kthread_t *t)
968 mutex_enter(&reaplock);
971 * lwp_deathrow contains threads with lwp linkage and
972 * swappable thread stacks which have the default stacksize.
973 * These threads' lwps and stacks may be reused by lwp_create().
975 * Anything else goes on thread_deathrow(), where it will eventually
976 * be thread_free()d.
978 if (t->t_flag & T_LWPREUSE) {
979 ASSERT(ttolwp(t) != NULL);
980 t->t_forw = lwp_deathrow;
981 lwp_deathrow = t;
982 lwp_reapcnt++;
983 } else {
984 t->t_forw = thread_deathrow;
985 thread_deathrow = t;
986 thread_reapcnt++;
988 if (lwp_reapcnt + thread_reapcnt > reaplimit)
989 cv_signal(&reaper_cv); /* wake the reaper */
990 t->t_state = TS_FREE;
991 lock_clear(&t->t_lock);
994 * Before we return, we need to grab and drop the thread lock for
995 * the dead thread. At this point, the current thread is the idle
996 * thread, and the dead thread's CPU lock points to the current
997 * CPU -- and we must grab and drop the lock to synchronize with
998 * a racing thread walking a blocking chain that the zombie thread
999 * was recently in. By this point, that blocking chain is (by
1000 * definition) stale: the dead thread is not holding any locks, and
1001 * is therefore not in any blocking chains -- but if we do not regrab
1002 * our lock before freeing the dead thread's data structures, the
1003 * thread walking the (stale) blocking chain will die on memory
1004 * corruption when it attempts to drop the dead thread's lock. We
1005 * only need do this once because there is no way for the dead thread
1006 * to ever again be on a blocking chain: once we have grabbed and
1007 * dropped the thread lock, we are guaranteed that anyone that could
1008 * have seen this thread in a blocking chain can no longer see it.
1010 thread_lock(t);
1011 thread_unlock(t);
1013 mutex_exit(&reaplock);
1017 * Install thread context ops for the current thread.
1019 void
1020 installctx(
1021 kthread_t *t,
1022 void *arg,
1023 void (*save)(void *),
1024 void (*restore)(void *),
1025 void (*fork)(void *, void *),
1026 void (*lwp_create)(void *, void *),
1027 void (*exit)(void *),
1028 void (*free)(void *, int))
1030 struct ctxop *ctx;
1032 ctx = kmem_alloc(sizeof (struct ctxop), KM_SLEEP);
1033 ctx->save_op = save;
1034 ctx->restore_op = restore;
1035 ctx->fork_op = fork;
1036 ctx->lwp_create_op = lwp_create;
1037 ctx->exit_op = exit;
1038 ctx->free_op = free;
1039 ctx->arg = arg;
1040 ctx->next = t->t_ctx;
1041 t->t_ctx = ctx;
1045 * Remove the thread context ops from a thread.
1048 removectx(
1049 kthread_t *t,
1050 void *arg,
1051 void (*save)(void *),
1052 void (*restore)(void *),
1053 void (*fork)(void *, void *),
1054 void (*lwp_create)(void *, void *),
1055 void (*exit)(void *),
1056 void (*free)(void *, int))
1058 struct ctxop *ctx, *prev_ctx;
1061 * The incoming kthread_t (which is the thread for which the
1062 * context ops will be removed) should be one of the following:
1064 * a) the current thread,
1066 * b) a thread of a process that's being forked (SIDL),
1068 * c) a thread that belongs to the same process as the current
1069 * thread and for which the current thread is the agent thread,
1071 * d) a thread that is TS_STOPPED which is indicative of it
1072 * being (if curthread is not an agent) a thread being created
1073 * as part of an lwp creation.
1075 ASSERT(t == curthread || ttoproc(t)->p_stat == SIDL ||
1076 ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
1079 * Serialize modifications to t->t_ctx to prevent the agent thread
1080 * and the target thread from racing with each other during lwp exit.
1082 mutex_enter(&t->t_ctx_lock);
1083 prev_ctx = NULL;
1084 kpreempt_disable();
1085 for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) {
1086 if (ctx->save_op == save && ctx->restore_op == restore &&
1087 ctx->fork_op == fork && ctx->lwp_create_op == lwp_create &&
1088 ctx->exit_op == exit && ctx->free_op == free &&
1089 ctx->arg == arg) {
1090 if (prev_ctx)
1091 prev_ctx->next = ctx->next;
1092 else
1093 t->t_ctx = ctx->next;
1094 mutex_exit(&t->t_ctx_lock);
1095 if (ctx->free_op != NULL)
1096 (ctx->free_op)(ctx->arg, 0);
1097 kmem_free(ctx, sizeof (struct ctxop));
1098 kpreempt_enable();
1099 return (1);
1101 prev_ctx = ctx;
1103 mutex_exit(&t->t_ctx_lock);
1104 kpreempt_enable();
1106 return (0);
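/*
 * Usage sketch (assumed consumer): a facility that must save and restore
 * per-thread state around context switches registers context operations on
 * the current thread with installctx() and removes them with a matching
 * removectx() call (same thread, arg and function pointers).  my_save(),
 * my_restore(), my_free() and my_state are hypothetical.
 */
#if 0
static void my_save(void *);
static void my_restore(void *);
static void my_free(void *, int);

static void
my_attach_state(void *my_state)
{
	installctx(curthread, my_state, my_save, my_restore,
	    NULL, NULL, NULL, my_free);
}

static void
my_detach_state(void *my_state)
{
	(void) removectx(curthread, my_state, my_save, my_restore,
	    NULL, NULL, NULL, my_free);
}
#endif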
1109 void
1110 savectx(kthread_t *t)
1112 struct ctxop *ctx;
1114 ASSERT(t == curthread);
1115 for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
1116 if (ctx->save_op != NULL)
1117 (ctx->save_op)(ctx->arg);
1120 void
1121 restorectx(kthread_t *t)
1123 struct ctxop *ctx;
1125 ASSERT(t == curthread);
1126 for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
1127 if (ctx->restore_op != NULL)
1128 (ctx->restore_op)(ctx->arg);
1131 void
1132 forkctx(kthread_t *t, kthread_t *ct)
1134 struct ctxop *ctx;
1136 for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
1137 if (ctx->fork_op != NULL)
1138 (ctx->fork_op)(t, ct);
1142 * Note that this operator is only invoked via the _lwp_create
1143 * system call. The system may have other reasons to create lwps,
1144 * e.g., the agent lwp or the doors unreferenced lwp.
1146 void
1147 lwp_createctx(kthread_t *t, kthread_t *ct)
1149 struct ctxop *ctx;
1151 for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
1152 if (ctx->lwp_create_op != NULL)
1153 (ctx->lwp_create_op)(t, ct);
1157 * exitctx is called from thread_exit() and lwp_exit() to perform any actions
1158 * needed when the thread/LWP leaves the processor for the last time. This
1159 * routine is not intended to deal with freeing memory; freectx() is used for
1160 * that purpose during thread_free(). This routine is provided to allow for
1161 * clean-up that can't wait until thread_free().
1163 void
1164 exitctx(kthread_t *t)
1166 struct ctxop *ctx;
1168 for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
1169 if (ctx->exit_op != NULL)
1170 (ctx->exit_op)(t);
1174 * freectx is called from thread_free() and exec() to get
1175 * rid of old thread context ops.
1177 void
1178 freectx(kthread_t *t, int isexec)
1180 struct ctxop *ctx;
1182 kpreempt_disable();
1183 while ((ctx = t->t_ctx) != NULL) {
1184 t->t_ctx = ctx->next;
1185 if (ctx->free_op != NULL)
1186 (ctx->free_op)(ctx->arg, isexec);
1187 kmem_free(ctx, sizeof (struct ctxop));
1189 kpreempt_enable();
1193 * freectx_ctx is called from lwp_create() when lwp is reused from
1194 * lwp_deathrow and its thread structure is added to thread_deathrow.
1195 * The thread structure to which this ctx was attached may be already
1196 * freed by the thread reaper so free_op implementations shouldn't rely
1197 * on thread structure to which this ctx was attached still being around.
1199 void
1200 freectx_ctx(struct ctxop *ctx)
1202 struct ctxop *nctx;
1204 ASSERT(ctx != NULL);
1206 kpreempt_disable();
1207 do {
1208 nctx = ctx->next;
1209 if (ctx->free_op != NULL)
1210 (ctx->free_op)(ctx->arg, 0);
1211 kmem_free(ctx, sizeof (struct ctxop));
1212 } while ((ctx = nctx) != NULL);
1213 kpreempt_enable();
1217 * Set the thread running; arrange for it to be swapped in if necessary.
1219 void
1220 setrun_locked(kthread_t *t)
1222 ASSERT(THREAD_LOCK_HELD(t));
1223 if (t->t_state == TS_SLEEP) {
1225 * Take off sleep queue.
1227 SOBJ_UNSLEEP(t->t_sobj_ops, t);
1228 } else if (t->t_state & (TS_RUN | TS_ONPROC)) {
1230 * Already on dispatcher queue.
1232 return;
1233 } else if (t->t_state == TS_WAIT) {
1234 waitq_setrun(t);
1235 } else if (t->t_state == TS_STOPPED) {
1237 * All of the sending of SIGCONT (TC_XSTART) and /proc
1238 * (TC_PSTART) and lwp_continue() (TC_CSTART) must have
1239 * requested that the thread be run.
1240 * Just calling setrun() is not sufficient to set a stopped
1241 * thread running. TP_TXSTART is always set if the thread
1242 * is not stopped by a jobcontrol stop signal.
1243 * TP_TPSTART is always set if /proc is not controlling it.
1244 * TP_TCSTART is always set if lwp_suspend() didn't stop it.
1245 * The thread won't be stopped unless one of these
1246 * three mechanisms did it.
1248 * These flags must be set before calling setrun_locked(t).
1249 * They can't be passed as arguments because the streams
1250 * code calls setrun() indirectly and the mechanism for
1251 * doing so admits only one argument. Note that the
1252 * thread must be locked in order to change t_schedflags.
1254 if ((t->t_schedflag & TS_ALLSTART) != TS_ALLSTART)
1255 return;
1257 * Process is no longer stopped (a thread is running).
1259 t->t_whystop = 0;
1260 t->t_whatstop = 0;
1262 * Strictly speaking, we do not have to clear these
1263 * flags here; they are cleared on entry to stop().
1264 * However, they are confusing when doing kernel
1265 * debugging or when they are revealed by ps(1).
1267 t->t_schedflag &= ~TS_ALLSTART;
1268 THREAD_TRANSITION(t); /* drop stopped-thread lock */
1269 ASSERT(t->t_lockp == &transition_lock);
1270 ASSERT(t->t_wchan0 == NULL && t->t_wchan == NULL);
1272 * Let the class put the process on the dispatcher queue.
1274 CL_SETRUN(t);
1278 void
1279 setrun(kthread_t *t)
1281 thread_lock(t);
1282 setrun_locked(t);
1283 thread_unlock(t);
1287 * Unpin an interrupted thread.
1288 * When an interrupt occurs, the interrupt is handled on the stack
1289 * of an interrupt thread, taken from a pool linked to the CPU structure.
1291 * When swtch() is switching away from an interrupt thread because it
1292 * blocked or was preempted, this routine is called to complete the
1293 * saving of the interrupted thread state, and returns the interrupted
1294 * thread pointer so it may be resumed.
1296 * Called by swtch() only at high spl.
1298 kthread_t *
1299 thread_unpin()
1301 kthread_t *t = curthread; /* current thread */
1302 kthread_t *itp; /* interrupted thread */
1303 int i; /* interrupt level */
1304 extern int intr_passivate();
1306 ASSERT(t->t_intr != NULL);
1308 itp = t->t_intr; /* interrupted thread */
1309 t->t_intr = NULL; /* clear interrupt ptr */
1312 * Get state from interrupt thread for the one
1313 * it interrupted.
1316 i = intr_passivate(t, itp);
1318 TRACE_5(TR_FAC_INTR, TR_INTR_PASSIVATE,
1319 "intr_passivate:level %d curthread %p (%T) ithread %p (%T)",
1320 i, t, t, itp, itp);
1323 * Dissociate the current thread from the interrupted thread's LWP.
1325 t->t_lwp = NULL;
1328 * Interrupt handlers above the level that spinlocks block must
1329 * not block.
1331 #if DEBUG
1332 if (i < 0 || i > LOCK_LEVEL)
1333 cmn_err(CE_PANIC, "thread_unpin: ipl out of range %x", i);
1334 #endif
1337 * Compute the CPU's base interrupt level based on the active
1338 * interrupts.
1340 ASSERT(CPU->cpu_intr_actv & (1 << i));
1341 set_base_spl();
1343 return (itp);
1347 * Create and initialize an interrupt thread.
1349 * Called at spl7() or better.
1351 void
1352 thread_create_intr(struct cpu *cp)
1354 kthread_t *tp;
1356 tp = thread_create(NULL, 0,
1357 (void (*)())thread_create_intr, NULL, 0, &p0, TS_ONPROC, 0);
1360 * Set the thread in the TS_FREE state. The state will change
1361 * to TS_ONPROC only while the interrupt is active. Think of these
1362 * as being on a private free list for the CPU. Being TS_FREE keeps
1363 * inactive interrupt threads out of debugger thread lists.
1365 * We cannot call thread_create with TS_FREE because of the current
1366 * checks there for ONPROC. Fix this when thread_create takes flags.
1368 THREAD_FREEINTR(tp, cp);
1371 * Nobody should ever reference the credentials of an interrupt
1372 * thread so make it NULL to catch any such references.
1374 tp->t_cred = NULL;
1375 tp->t_flag |= T_INTR_THREAD;
1376 tp->t_cpu = cp;
1377 tp->t_bound_cpu = cp;
1378 tp->t_disp_queue = cp->cpu_disp;
1379 tp->t_affinitycnt = 1;
1380 tp->t_preempt = 1;
1383 * Don't make a user-requested binding on this thread so that
1384 * the processor can be offlined.
1386 tp->t_bind_cpu = PBIND_NONE; /* no USER-requested binding */
1387 tp->t_bind_pset = PS_NONE;
1389 #if defined(__i386) || defined(__amd64)
1390 tp->t_stk -= STACK_ALIGN;
1391 *(tp->t_stk) = 0; /* terminate intr thread stack */
1392 #endif
1395 * Link onto CPU's interrupt pool.
1397 tp->t_link = cp->cpu_intr_thread;
1398 cp->cpu_intr_thread = tp;
1402 * TSD -- THREAD SPECIFIC DATA
1404 static kmutex_t tsd_mutex; /* linked list spin lock */
1405 static uint_t tsd_nkeys; /* size of destructor array */
1406 /* per-key destructor funcs */
1407 static void (**tsd_destructor)(void *);
1408 /* list of tsd_thread's */
1409 static struct tsd_thread *tsd_list;
1412 * Default destructor
1413 * Needed because NULL destructor means that the key is unused
1415 /* ARGSUSED */
1416 void
1417 tsd_defaultdestructor(void *value)
1421 * Create a key (index into per thread array)
1422 * Locks out tsd_create, tsd_destroy, and tsd_exit
1423 * May allocate memory with lock held
1425 void
1426 tsd_create(uint_t *keyp, void (*destructor)(void *))
1428 int i;
1429 uint_t nkeys;
1432 * if key is allocated, do nothing
1434 mutex_enter(&tsd_mutex);
1435 if (*keyp) {
1436 mutex_exit(&tsd_mutex);
1437 return;
1440 * find an unused key
1442 if (destructor == NULL)
1443 destructor = tsd_defaultdestructor;
1445 for (i = 0; i < tsd_nkeys; ++i)
1446 if (tsd_destructor[i] == NULL)
1447 break;
1450 * if no unused keys, increase the size of the destructor array
1452 if (i == tsd_nkeys) {
1453 if ((nkeys = (tsd_nkeys << 1)) == 0)
1454 nkeys = 1;
1455 tsd_destructor =
1456 (void (**)(void *))tsd_realloc((void *)tsd_destructor,
1457 (size_t)(tsd_nkeys * sizeof (void (*)(void *))),
1458 (size_t)(nkeys * sizeof (void (*)(void *))));
1459 tsd_nkeys = nkeys;
1463 * allocate the next available unused key
1465 tsd_destructor[i] = destructor;
1466 *keyp = i + 1;
1467 mutex_exit(&tsd_mutex);
1471 * Destroy a key -- this is for unloadable modules
1473 * Assumes that the caller is preventing tsd_set and tsd_get
1474 * Locks out tsd_create, tsd_destroy, and tsd_exit
1475 * May free memory with lock held
1477 void
1478 tsd_destroy(uint_t *keyp)
1480 uint_t key;
1481 struct tsd_thread *tsd;
1484 * protect the key namespace and our destructor lists
1486 mutex_enter(&tsd_mutex);
1487 key = *keyp;
1488 *keyp = 0;
1490 ASSERT(key <= tsd_nkeys);
1493 * if the key is valid
1495 if (key != 0) {
1496 uint_t k = key - 1;
1498 * for every thread with TSD, call key's destructor
1500 for (tsd = tsd_list; tsd; tsd = tsd->ts_next) {
1502 * no TSD for key in this thread
1504 if (key > tsd->ts_nkeys)
1505 continue;
1507 * call destructor for key
1509 if (tsd->ts_value[k] && tsd_destructor[k])
1510 (*tsd_destructor[k])(tsd->ts_value[k]);
1512 * reset value for key
1514 tsd->ts_value[k] = NULL;
1517 * actually free the key (NULL destructor == unused)
1519 tsd_destructor[k] = NULL;
1522 mutex_exit(&tsd_mutex);
1526 * Quickly return the per thread value that was stored with the specified key
1527 * Assumes the caller is protecting key from tsd_create and tsd_destroy
1529 void *
1530 tsd_get(uint_t key)
1532 return (tsd_agent_get(curthread, key));
1536 * Set a per thread value indexed with the specified key
1539 tsd_set(uint_t key, void *value)
1541 return (tsd_agent_set(curthread, key, value));
1545 * Like tsd_get(), except that the agent lwp can get the tsd of
1546 * another thread in the same process (the agent thread only runs when the
1547 * process is completely stopped by /proc), or syslwp is creating a new lwp.
1549 void *
1550 tsd_agent_get(kthread_t *t, uint_t key)
1552 struct tsd_thread *tsd = t->t_tsd;
1554 ASSERT(t == curthread ||
1555 ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
1557 if (key && tsd != NULL && key <= tsd->ts_nkeys)
1558 return (tsd->ts_value[key - 1]);
1559 return (NULL);
1563 * Like tsd_set(), except that the agent lwp can set the tsd of
1564 * another thread in the same process, or syslwp can set the tsd
1565 * of a thread it's in the middle of creating.
1567 * Assumes the caller is protecting key from tsd_create and tsd_destroy
1568 * May lock out tsd_destroy (and tsd_create), may allocate memory with
1569 * lock held
1572 tsd_agent_set(kthread_t *t, uint_t key, void *value)
1574 struct tsd_thread *tsd = t->t_tsd;
1576 ASSERT(t == curthread ||
1577 ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
1579 if (key == 0)
1580 return (EINVAL);
1581 if (tsd == NULL)
1582 tsd = t->t_tsd = kmem_zalloc(sizeof (*tsd), KM_SLEEP);
1583 if (key <= tsd->ts_nkeys) {
1584 tsd->ts_value[key - 1] = value;
1585 return (0);
1588 ASSERT(key <= tsd_nkeys);
1591 * lock out tsd_destroy()
1593 mutex_enter(&tsd_mutex);
1594 if (tsd->ts_nkeys == 0) {
1596 * Link onto list of threads with TSD
1598 if ((tsd->ts_next = tsd_list) != NULL)
1599 tsd_list->ts_prev = tsd;
1600 tsd_list = tsd;
1604 * Allocate thread local storage and set the value for key
1606 tsd->ts_value = tsd_realloc(tsd->ts_value,
1607 tsd->ts_nkeys * sizeof (void *),
1608 key * sizeof (void *));
1609 tsd->ts_nkeys = key;
1610 tsd->ts_value[key - 1] = value;
1611 mutex_exit(&tsd_mutex);
1613 return (0);
1618 * Return the per thread value that was stored with the specified key
1619 * If necessary, create the key and the value
1620 * Assumes the caller is protecting *keyp from tsd_destroy
1622 void *
1623 tsd_getcreate(uint_t *keyp, void (*destroy)(void *), void *(*allocate)(void))
1625 void *value;
1626 uint_t key = *keyp;
1627 struct tsd_thread *tsd = curthread->t_tsd;
1629 if (tsd == NULL)
1630 tsd = curthread->t_tsd = kmem_zalloc(sizeof (*tsd), KM_SLEEP);
1631 if (key && key <= tsd->ts_nkeys && (value = tsd->ts_value[key - 1]))
1632 return (value);
1633 if (key == 0)
1634 tsd_create(keyp, destroy);
1635 (void) tsd_set(*keyp, value = (*allocate)());
1637 return (value);
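/*
 * Usage sketch (assumed consumer): a module allocates a key once with
 * tsd_create(), stores and fetches per-thread data with tsd_set()/tsd_get(),
 * and releases the key with tsd_destroy() when it unloads.  my_tsd_key,
 * my_data_t and the my_*() functions are hypothetical.
 */
#if 0
static uint_t my_tsd_key;

static void
my_destructor(void *value)
{
	kmem_free(value, sizeof (my_data_t));
}

static my_data_t *
my_get_data(void)
{
	my_data_t *d;

	if ((d = tsd_get(my_tsd_key)) == NULL) {
		d = kmem_zalloc(sizeof (*d), KM_SLEEP);
		(void) tsd_set(my_tsd_key, d);
	}
	return (d);
}

static void
my_module_init(void)
{
	tsd_create(&my_tsd_key, my_destructor);
}

static void
my_module_fini(void)
{
	tsd_destroy(&my_tsd_key);
}
#endif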
1641 * Called from thread_exit() to run the destructor function for each tsd
1642 * Locks out tsd_create and tsd_destroy
1643 * Assumes that the destructor *DOES NOT* use tsd
1645 void
1646 tsd_exit(void)
1648 int i;
1649 struct tsd_thread *tsd = curthread->t_tsd;
1651 if (tsd == NULL)
1652 return;
1654 if (tsd->ts_nkeys == 0) {
1655 kmem_free(tsd, sizeof (*tsd));
1656 curthread->t_tsd = NULL;
1657 return;
1661 * lock out tsd_create and tsd_destroy, call
1662 * the destructor, and mark the value as destroyed.
1664 mutex_enter(&tsd_mutex);
1666 for (i = 0; i < tsd->ts_nkeys; i++) {
1667 if (tsd->ts_value[i] && tsd_destructor[i])
1668 (*tsd_destructor[i])(tsd->ts_value[i]);
1669 tsd->ts_value[i] = NULL;
1673 * remove from linked list of threads with TSD
1675 if (tsd->ts_next)
1676 tsd->ts_next->ts_prev = tsd->ts_prev;
1677 if (tsd->ts_prev)
1678 tsd->ts_prev->ts_next = tsd->ts_next;
1679 if (tsd_list == tsd)
1680 tsd_list = tsd->ts_next;
1682 mutex_exit(&tsd_mutex);
1685 * free up the TSD
1687 kmem_free(tsd->ts_value, tsd->ts_nkeys * sizeof (void *));
1688 kmem_free(tsd, sizeof (struct tsd_thread));
1689 curthread->t_tsd = NULL;
1693 * realloc
1695 static void *
1696 tsd_realloc(void *old, size_t osize, size_t nsize)
1698 void *new;
1700 new = kmem_zalloc(nsize, KM_SLEEP);
1701 if (old) {
1702 bcopy(old, new, osize);
1703 kmem_free(old, osize);
1705 return (new);
1709 * Return non-zero if an interrupt is being serviced.
1712 servicing_interrupt()
1714 int onintr = 0;
1716 /* Are we an interrupt thread */
1717 if (curthread->t_flag & T_INTR_THREAD)
1718 return (1);
1719 /* Are we servicing a high level interrupt? */
1720 if (CPU_ON_INTR(CPU)) {
1721 kpreempt_disable();
1722 onintr = CPU_ON_INTR(CPU);
1723 kpreempt_enable();
1725 return (onintr);
1730 * Change the dispatch priority of a thread in the system.
1731 * Used when raising or lowering a thread's priority.
1732 * (E.g., priority inheritance)
1734 * Since threads are queued according to their priority, we
1735 * must check the thread's state to determine whether it
1736 * is on a queue somewhere. If it is, we've got to:
1738 * o Dequeue the thread.
1739 * o Change its effective priority.
1740 * o Enqueue the thread.
1742 * Assumptions: The thread whose priority we wish to change
1743 * must be locked before we call thread_change_(e)pri().
1744 * The thread_change(e)pri() function doesn't drop the thread
1745 * lock--that must be done by its caller.
1747 void
1748 thread_change_epri(kthread_t *t, pri_t disp_pri)
1750 uint_t state;
1752 ASSERT(THREAD_LOCK_HELD(t));
1755 * If the inherited priority hasn't actually changed,
1756 * just return.
1758 if (t->t_epri == disp_pri)
1759 return;
1761 state = t->t_state;
1764 * If it's not on a queue, change the priority with impunity.
1766 if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) {
1767 t->t_epri = disp_pri;
1768 if (state == TS_ONPROC) {
1769 cpu_t *cp = t->t_disp_queue->disp_cpu;
1771 if (t == cp->cpu_dispthread)
1772 cp->cpu_dispatch_pri = DISP_PRIO(t);
1774 } else if (state == TS_SLEEP) {
1776 * Take the thread out of its sleep queue.
1777 * Change the inherited priority.
1778 * Re-enqueue the thread.
1779 * Each synchronization object exports a function
1780 * to do this in an appropriate manner.
1782 SOBJ_CHANGE_EPRI(t->t_sobj_ops, t, disp_pri);
1783 } else if (state == TS_WAIT) {
1785 * Re-enqueue a thread on the wait queue if its
1786 * effective priority needs to change.
1788 if (disp_pri != t->t_epri)
1789 waitq_change_pri(t, disp_pri);
1790 } else {
1792 * The thread is on a run queue.
1793 * Note: setbackdq() may not put the thread
1794 * back on the same run queue where it originally
1795 * resided.
1797 (void) dispdeq(t);
1798 t->t_epri = disp_pri;
1799 setbackdq(t);
1801 schedctl_set_cidpri(t);
1805 * Function: Change the t_pri field of a thread.
1806 * Side Effects: Adjust the thread ordering on a run queue
1807 * or sleep queue, if necessary.
1808 * Returns: 1 if the thread was on a run queue, else 0.
1811 thread_change_pri(kthread_t *t, pri_t disp_pri, int front)
1813 uint_t state;
1814 int on_rq = 0;
1816 ASSERT(THREAD_LOCK_HELD(t));
1818 state = t->t_state;
1819 THREAD_WILLCHANGE_PRI(t, disp_pri);
1822 * If it's not on a queue, change the priority with impunity.
1824 if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) {
1825 t->t_pri = disp_pri;
1827 if (state == TS_ONPROC) {
1828 cpu_t *cp = t->t_disp_queue->disp_cpu;
1830 if (t == cp->cpu_dispthread)
1831 cp->cpu_dispatch_pri = DISP_PRIO(t);
1833 } else if (state == TS_SLEEP) {
1835 * If the priority has changed, take the thread out of
1836 * its sleep queue and change the priority.
1837 * Re-enqueue the thread.
1838 * Each synchronization object exports a function
1839 * to do this in an appropriate manner.
1841 if (disp_pri != t->t_pri)
1842 SOBJ_CHANGE_PRI(t->t_sobj_ops, t, disp_pri);
1843 } else if (state == TS_WAIT) {
1845 * Re-enqueue a thread on the wait queue if its
1846 * priority needs to change.
1848 if (disp_pri != t->t_pri)
1849 waitq_change_pri(t, disp_pri);
1850 } else {
1852 * The thread is on a run queue.
1853 * Note: setbackdq() may not put the thread
1854 * back on the same run queue where it originally
1855 * resided.
1857 * We still requeue the thread even if the priority
1858 * is unchanged to preserve round-robin (and other)
1859 * effects between threads of the same priority.
1861 on_rq = dispdeq(t);
1862 ASSERT(on_rq);
1863 t->t_pri = disp_pri;
1864 if (front) {
1865 setfrontdq(t);
1866 } else {
1867 setbackdq(t);
1870 schedctl_set_cidpri(t);
1871 return (on_rq);
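/*
 * Usage sketch (assumed caller): per the assumptions above, the thread must
 * be locked across the priority change and the caller drops the lock itself.
 * new_pri is a hypothetical dispatch priority.
 */
#if 0
static void
my_set_pri(kthread_t *t, pri_t new_pri)
{
	thread_lock(t);
	(void) thread_change_pri(t, new_pri, 0);	/* 0: enqueue at back */
	thread_unlock(t);
}
#endif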
1875 * When the tunable kmem_stackinfo is set, fill the kernel thread stack
1876 * with a specific pattern.
1878 static void
1879 stkinfo_begin(kthread_t *t)
1881 caddr_t start; /* stack start */
1882 caddr_t end; /* stack end */
1883 uint64_t *ptr; /* pattern pointer */
1886 * Stack grows up or down, see thread_create(),
1887 * compute stack memory area start and end (start < end).
1889 if (t->t_stk > t->t_stkbase) {
1890 /* stack grows down */
1891 start = t->t_stkbase;
1892 end = t->t_stk;
1893 } else {
1894 /* stack grows up */
1895 start = t->t_stk;
1896 end = t->t_stkbase;
1900 * Stackinfo pattern size is 8 bytes. Ensure proper 8-byte
1901 * alignment for start and end in stack area boundaries
1902 * (protection against corrupt t_stkbase/t_stk data).
1904 if ((((uintptr_t)start) & 0x7) != 0) {
1905 start = (caddr_t)((((uintptr_t)start) & (~0x7)) + 8);
1907 end = (caddr_t)(((uintptr_t)end) & (~0x7));
1909 if ((end <= start) || (end - start) > (1024 * 1024)) {
1910 /* negative or stack size > 1 meg, assume bogus */
1911 return;
1914 /* fill stack area with a pattern (instead of zeros) */
1915 ptr = (uint64_t *)((void *)start);
1916 while (ptr < (uint64_t *)((void *)end)) {
1917 *ptr++ = KMEM_STKINFO_PATTERN;
1923 * When the tunable kmem_stackinfo is set, create the stackinfo log if it
1924 * doesn't already exist, compute the percentage of kernel stack actually
1925 * used, and record it in the log if it is among the highest seen so far.
1927 static void
1928 stkinfo_end(kthread_t *t)
1930 caddr_t start; /* stack start */
1931 caddr_t end; /* stack end */
1932 uint64_t *ptr; /* pattern pointer */
1933 size_t stksz; /* stack size */
1934 size_t smallest = 0;
1935 size_t percent = 0;
1936 uint_t index = 0;
1937 uint_t i;
1938 static size_t smallest_percent = (size_t)-1;
1939 static uint_t full = 0;
1941 /* create the stackinfo log, if doesn't already exist */
1942 mutex_enter(&kmem_stkinfo_lock);
1943 if (kmem_stkinfo_log == NULL) {
1944 kmem_stkinfo_log = (kmem_stkinfo_t *)
1945 kmem_zalloc(KMEM_STKINFO_LOG_SIZE *
1946 (sizeof (kmem_stkinfo_t)), KM_NOSLEEP);
1947 if (kmem_stkinfo_log == NULL) {
1948 mutex_exit(&kmem_stkinfo_lock);
1949 return;
1952 mutex_exit(&kmem_stkinfo_lock);
1955 * Stack grows up or down, see thread_create(),
1956 * compute stack memory area start and end (start < end).
1958 if (t->t_stk > t->t_stkbase) {
1959 /* stack grows down */
1960 start = t->t_stkbase;
1961 end = t->t_stk;
1962 } else {
1963 /* stack grows up */
1964 start = t->t_stk;
1965 end = t->t_stkbase;
1968 /* stack size as found in kthread_t */
1969 stksz = end - start;
1972 * Stackinfo pattern size is 8 bytes. Ensure proper 8-byte
1973 * alignment for start and end in stack area boundaries
1974 * (protection against corrupt t_stkbase/t_stk data).
1976 if ((((uintptr_t)start) & 0x7) != 0) {
1977 start = (caddr_t)((((uintptr_t)start) & (~0x7)) + 8);
1979 end = (caddr_t)(((uintptr_t)end) & (~0x7));
1981 if ((end <= start) || (end - start) > (1024 * 1024)) {
1982 /* negative or stack size > 1 meg, assume bogus */
1983 return;
1986 /* search until no pattern in the stack */
1987 if (t->t_stk > t->t_stkbase) {
1988 /* stack grows down */
1989 #if defined(__i386) || defined(__amd64)
1991 * 6 longs are pushed on stack, see thread_load(). Skip
1992 * them, so if kthread has never run, percent is zero.
1993 * 8-byte alignment is preserved for a 32-bit kernel,
1994 * 6 x 4 = 24, 24 is a multiple of 8.
1997 end -= (6 * sizeof (long));
1998 #endif
1999 ptr = (uint64_t *)((void *)start);
2000 while (ptr < (uint64_t *)((void *)end)) {
2001 if (*ptr != KMEM_STKINFO_PATTERN) {
2002 percent = stkinfo_percent(end,
2003 start, (caddr_t)ptr);
2004 break;
2006 ptr++;
2008 } else {
2009 /* stack grows up */
2010 ptr = (uint64_t *)((void *)end);
2011 ptr--;
2012 while (ptr >= (uint64_t *)((void *)start)) {
2013 if (*ptr != KMEM_STKINFO_PATTERN) {
2014 percent = stkinfo_percent(start,
2015 end, (caddr_t)ptr);
2016 break;
2018 ptr--;
2022 DTRACE_PROBE3(stack__usage, kthread_t *, t,
2023 size_t, stksz, size_t, percent);
2025 if (percent == 0) {
2026 return;
2029 mutex_enter(&kmem_stkinfo_lock);
2030 if (full == KMEM_STKINFO_LOG_SIZE && percent < smallest_percent) {
2032 * The log is full and already contains the highest values
2034 mutex_exit(&kmem_stkinfo_lock);
2035 return;
2038 /* keep a log of the highest used stack */
2039 for (i = 0; i < KMEM_STKINFO_LOG_SIZE; i++) {
2040 if (kmem_stkinfo_log[i].percent == 0) {
2041 index = i;
2042 full++;
2043 break;
2045 if (smallest == 0) {
2046 smallest = kmem_stkinfo_log[i].percent;
2047 index = i;
2048 continue;
2050 if (kmem_stkinfo_log[i].percent < smallest) {
2051 smallest = kmem_stkinfo_log[i].percent;
2052 index = i;
2056 if (percent >= kmem_stkinfo_log[index].percent) {
2057 kmem_stkinfo_log[index].kthread = (caddr_t)t;
2058 kmem_stkinfo_log[index].t_startpc = (caddr_t)t->t_startpc;
2059 kmem_stkinfo_log[index].start = start;
2060 kmem_stkinfo_log[index].stksz = stksz;
2061 kmem_stkinfo_log[index].percent = percent;
2062 kmem_stkinfo_log[index].t_tid = t->t_tid;
2063 kmem_stkinfo_log[index].cmd[0] = '\0';
2064 if (t->t_tid != 0) {
2065 stksz = strlen((t->t_procp)->p_user.u_comm);
2066 if (stksz >= KMEM_STKINFO_STR_SIZE) {
2067 stksz = KMEM_STKINFO_STR_SIZE - 1;
2068 kmem_stkinfo_log[index].cmd[stksz] = '\0';
2069 } else {
2070 stksz += 1;
2072 (void) memcpy(kmem_stkinfo_log[index].cmd,
2073 (t->t_procp)->p_user.u_comm, stksz);
2075 if (percent < smallest_percent) {
2076 smallest_percent = percent;
2079 mutex_exit(&kmem_stkinfo_lock);
2083 * When the tunable kmem_stackinfo is set, compute stack utilization percentage.
2085 static size_t
2086 stkinfo_percent(caddr_t t_stk, caddr_t t_stkbase, caddr_t sp)
2088 size_t percent;
2089 size_t s;
2091 if (t_stk > t_stkbase) {
2092 /* stack grows down */
2093 if (sp > t_stk) {
2094 return (0);
2096 if (sp < t_stkbase) {
2097 return (100);
2099 percent = t_stk - sp + 1;
2100 s = t_stk - t_stkbase + 1;
2101 } else {
2102 /* stack grows up */
2103 if (sp < t_stk) {
2104 return (0);
2106 if (sp > t_stkbase) {
2107 return (100);
2109 percent = sp - t_stk + 1;
2110 s = t_stkbase - t_stk + 1;
2112 percent = ((100 * percent) / s) + 1;
2113 if (percent > 100) {
2114 percent = 100;
2116 return (percent);
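/*
 * Worked example of the computation above, assuming a downward-growing
 * stack: with s = 16384 (a 16K stack) and percent = 4096 (the deepest
 * non-pattern word sits roughly 4K below t_stk), the result is
 * ((100 * 4096) / 16384) + 1 = 26, i.e. about a quarter of the stack
 * was ever written.
 */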