kernel/disp/disp.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  27 /*        All Rights Reserved   */
  28
  29
  30 #include <sys/types.h>
  31 #include <sys/param.h>
  32 #include <sys/sysmacros.h>
  33 #include <sys/signal.h>
  34 #include <sys/user.h>
  35 #include <sys/systm.h>
  36 #include <sys/sysinfo.h>
  37 #include <sys/var.h>
  38 #include <sys/errno.h>
  39 #include <sys/cmn_err.h>
  40 #include <sys/debug.h>
  41 #include <sys/inline.h>
  42 #include <sys/disp.h>
  43 #include <sys/class.h>
  44 #include <sys/bitmap.h>
  45 #include <sys/kmem.h>
  46 #include <sys/cpuvar.h>
  47 #include <sys/vtrace.h>
  48 #include <sys/tnf.h>
  49 #include <sys/cpupart.h>
  50 #include <sys/lgrp.h>
  51 #include <sys/pg.h>
  52 #include <sys/cmt.h>
  53 #include <sys/bitset.h>
  54 #include <sys/schedctl.h>
  55 #include <sys/atomic.h>
  56 #include <sys/dtrace.h>
  57 #include <sys/sdt.h>
  58 #include <sys/archsystm.h>
  59 #include <sys/stdbool.h>
  60
  61 #include <vm/as.h>
  62
  63 #define BOUND_CPU       0x1
  64 #define BOUND_PARTITION 0x2
  65 #define BOUND_INTR      0x4
  66
  67 /* Dispatch queue allocation structure and functions */
  68 struct disp_queue_info {
  69         disp_t  *dp;
  70         dispq_t *olddispq;
  71         dispq_t *newdispq;
  72         ulong_t *olddqactmap;
  73         ulong_t *newdqactmap;
  74         int     oldnglobpris;
  75 };
  76 static void     disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
  77     disp_t *dp);
  78 static void     disp_dq_assign(struct disp_queue_info *dptr, int numpris);
  79 static void     disp_dq_free(struct disp_queue_info *dptr);
  80
  81 /* platform-specific routine to call when processor is idle */
  82 static void     generic_idle_cpu();
  83 void            (*idle_cpu)() = generic_idle_cpu;
  84
  85 /* routines invoked when a CPU enters/exits the idle loop */
  86 static void     idle_enter();
  87 static void     idle_exit();
  88
  89 /* platform-specific routine to call when thread is enqueued */
  90 static void     generic_enq_thread(cpu_t *, int);
  91 void            (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
  92
  93 pri_t   kpreemptpri;            /* priority where kernel preemption applies */
  94 pri_t   upreemptpri = 0;        /* priority where normal preemption applies */
  95 pri_t   intr_pri;               /* interrupt thread priority base level */
  96
  97 #define KPQPRI  -1              /* pri where cpu affinity is dropped for kpq */
  98 pri_t   kpqpri = KPQPRI;        /* can be set in /etc/system */
  99 disp_t  cpu0_disp;              /* boot CPU's dispatch queue */
 100 int     nswapped;               /* total number of swapped threads */
 101 static void     disp_swapped_setrun(kthread_t *tp);
 102 static void     cpu_resched(cpu_t *cp, pri_t tpri);
 103
 104 /*
 105  * If this is set, only interrupt threads will cause kernel preemptions.
 106  * This is done by changing the value of kpreemptpri.  kpreemptpri
 107  * will either be the max sysclass pri + 1 or the min interrupt pri.
 108  */
 109 int     only_intr_kpreempt;
 110
 111 extern void set_idle_cpu(int cpun);
 112 extern void unset_idle_cpu(int cpun);
 113 static void setkpdq(kthread_t *tp, int borf);
 114 #define SETKP_BACK      0
 115 #define SETKP_FRONT     1
 116 /*
 117  * Parameter that determines how recently a thread must have run
 118  * on the CPU to be considered loosely-bound to that CPU to reduce
 119  * cold cache effects.  The interval is in hertz.
 120  */
 121 #define RECHOOSE_INTERVAL 3
 122 int     rechoose_interval = RECHOOSE_INTERVAL;
 123
 124 /*
 125  * Parameter that determines how long (in nanoseconds) a thread must
 126  * be sitting on a run queue before it can be stolen by another CPU
 127  * to reduce migrations.  The interval is in nanoseconds.
 128  *
 129  * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval()
 130  * to an appropriate value.  nosteal_nsec is set to NOSTEAL_UNINITIALIZED
  131  * here indicating it is uninitialized.
 132  * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
 133  *
 134  */
 135 #define NOSTEAL_UNINITIALIZED   (-1)
 136 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
 137 extern void cmp_set_nosteal_interval(void);
 138
 139 id_t    defaultcid;     /* system "default" class; see dispadmin(1M) */
 140
 141 disp_lock_t     transition_lock;        /* lock on transitioning threads */
 142 disp_lock_t     stop_lock;              /* lock on stopped threads */
 143
 144 static void     cpu_dispqalloc(int numpris);
 145
 146 /*
 147  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
 148  * a thread because it was sitting on its run queue for a very short
 149  * period of time.
 150  */
 151 #define T_DONTSTEAL     (kthread_t *)(-1) /* returned by disp_getwork/getbest */
 152
 153 static kthread_t        *disp_getwork(cpu_t *to);
 154 static kthread_t        *disp_getbest(disp_t *from);
 155 static kthread_t        *disp_ratify(kthread_t *tp, disp_t *kpq);
 156
 157 void    swtch_to(kthread_t *);
 158
 159 /*
 160  * dispatcher and scheduler initialization
 161  */
 162
 163 /*
 164  * disp_setup - Common code to calculate and allocate dispatcher
 165  *              variables and structures based on the maximum priority.
 166  */
 167 static void
 168 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
 169 {
 170         pri_t   newnglobpris;
 171
 172         ASSERT(MUTEX_HELD(&cpu_lock));
 173
 174         newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
 175
 176         if (newnglobpris > oldnglobpris) {
 177                 /*
 178                  * Allocate new kp queues for each CPU partition.
 179                  */
 180                 cpupart_kpqalloc(newnglobpris);
 181
 182                 /*
 183                  * Allocate new dispatch queues for each CPU.
 184                  */
 185                 cpu_dispqalloc(newnglobpris);
 186
 187                 /*
 188                  * compute new interrupt thread base priority
 189                  */
 190                 intr_pri = maxglobpri;
 191                 if (only_intr_kpreempt) {
 192                         kpreemptpri = intr_pri + 1;
 193                         if (kpqpri == KPQPRI)
 194                                 kpqpri = kpreemptpri;
 195                 }
 196                 v.v_nglobpris = newnglobpris;
 197         }
 198 }
 199
 200 /*
 201  * dispinit - Called to initialize all loaded classes and the
 202  *            dispatcher framework.
 203  */
 204 void
 205 dispinit(void)
 206 {
 207         id_t    cid;
 208         pri_t   maxglobpri;
 209         pri_t   cl_maxglobpri;
 210
 211         maxglobpri = -1;
 212
 213         /*
 214          * Initialize transition lock, which will always be set.
 215          */
 216         DISP_LOCK_INIT(&transition_lock);
 217         disp_lock_enter_high(&transition_lock);
 218         DISP_LOCK_INIT(&stop_lock);
 219
 220         mutex_enter(&cpu_lock);
 221         CPU->cpu_disp->disp_maxrunpri = -1;
 222         CPU->cpu_disp->disp_max_unbound_pri = -1;
 223
 224         /*
 225          * Initialize the default CPU partition.
 226          */
 227         cpupart_initialize_default();
 228         /*
 229          * Call the class specific initialization functions for
 230          * all pre-installed schedulers.
 231          *
 232          * We pass the size of a class specific parameter
 233          * buffer to each of the initialization functions
 234          * to try to catch problems with backward compatibility
 235          * of class modules.
 236          *
 237          * For example a new class module running on an old system
 238          * which didn't provide sufficiently large parameter buffers
 239          * would be bad news. Class initialization modules can check for
 240          * this and take action if they detect a problem.
 241          */
 242
 243         for (cid = 0; cid < nclass; cid++) {
 244                 sclass_t        *sc;
 245
 246                 sc = &sclass[cid];
 247                 if (SCHED_INSTALLED(sc)) {
 248                         cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
 249                             &sc->cl_funcs);
 250                         if (cl_maxglobpri > maxglobpri)
 251                                 maxglobpri = cl_maxglobpri;
 252                 }
 253         }
 254         kpreemptpri = (pri_t)v.v_maxsyspri + 1;
 255         if (kpqpri == KPQPRI)
 256                 kpqpri = kpreemptpri;
 257
 258         ASSERT(maxglobpri >= 0);
 259         disp_setup(maxglobpri, 0);
 260
 261         mutex_exit(&cpu_lock);
 262
 263         /*
 264          * Platform specific sticky scheduler setup.
 265          */
 266         if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
 267                 cmp_set_nosteal_interval();
 268
 269         /*
 270          * Get the default class ID; this may be later modified via
 271          * dispadmin(1M).  This will load the class (normally TS) and that will
 272          * call disp_add(), which is why we had to drop cpu_lock first.
 273          */
 274         if (getcid(defaultclass, &defaultcid) != 0) {
 275                 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
 276                     defaultclass);
 277         }
 278 }
 279
 280 /*
 281  * disp_add - Called with class pointer to initialize the dispatcher
 282  *            for a newly loaded class.
 283  */
 284 void
 285 disp_add(sclass_t *clp)
 286 {
 287         pri_t   maxglobpri;
 288         pri_t   cl_maxglobpri;
 289
 290         mutex_enter(&cpu_lock);
 291         /*
 292          * Initialize the scheduler class.
 293          */
 294         maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
 295         cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
 296         if (cl_maxglobpri > maxglobpri)
 297                 maxglobpri = cl_maxglobpri;
 298
 299         /*
 300          * Save old queue information.  Since we're initializing a
 301          * new scheduling class which has just been loaded, then
 302          * the size of the dispq may have changed.  We need to handle
 303          * that here.
 304          */
 305         disp_setup(maxglobpri, v.v_nglobpris);
 306
 307         mutex_exit(&cpu_lock);
 308 }
 309
 310
 311 /*
 312  * For each CPU, allocate new dispatch queues
 313  * with the stated number of priorities.
 314  */
 315 static void
 316 cpu_dispqalloc(int numpris)
 317 {
 318         cpu_t   *cpup;
 319         struct disp_queue_info  *disp_mem;
 320         int i, num;
 321
 322         ASSERT(MUTEX_HELD(&cpu_lock));
 323
 324         disp_mem = kmem_zalloc(NCPU *
 325             sizeof (struct disp_queue_info), KM_SLEEP);
 326
 327         /*
 328          * This routine must allocate all of the memory before stopping
 329          * the cpus because it must not sleep in kmem_alloc while the
 330          * CPUs are stopped.  Locks they hold will not be freed until they
 331          * are restarted.
 332          */
 333         i = 0;
 334         cpup = cpu_list;
 335         do {
 336                 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
 337                 i++;
 338                 cpup = cpup->cpu_next;
 339         } while (cpup != cpu_list);
 340         num = i;
 341
 342         pause_cpus(NULL, NULL);
 343         for (i = 0; i < num; i++)
 344                 disp_dq_assign(&disp_mem[i], numpris);
 345         start_cpus();
 346
 347         /*
 348          * I must free all of the memory after starting the cpus because
 349          * I can not risk sleeping in kmem_free while the cpus are stopped.
 350          */
 351         for (i = 0; i < num; i++)
 352                 disp_dq_free(&disp_mem[i]);
 353
 354         kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
 355 }
 356
 357 static void
 358 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
 359 {
 360         dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
 361         dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
 362             sizeof (long), KM_SLEEP);
 363         dptr->dp = dp;
 364 }
 365
 366 static void
 367 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
 368 {
 369         disp_t  *dp;
 370
 371         dp = dptr->dp;
 372         dptr->olddispq = dp->disp_q;
 373         dptr->olddqactmap = dp->disp_qactmap;
 374         dptr->oldnglobpris = dp->disp_npri;
 375
 376         ASSERT(dptr->oldnglobpris < numpris);
 377
 378         if (dptr->olddispq != NULL) {
 379                 /*
 380                  * Use kcopy because bcopy is platform-specific
 381                  * and could block while we might have paused the cpus.
 382                  */
 383                 (void) kcopy(dptr->olddispq, dptr->newdispq,
 384                     dptr->oldnglobpris * sizeof (dispq_t));
 385                 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
 386                     ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
 387                     sizeof (long));
 388         }
 389         dp->disp_q = dptr->newdispq;
 390         dp->disp_qactmap = dptr->newdqactmap;
 391         dp->disp_q_limit = &dptr->newdispq[numpris];
 392         dp->disp_npri = numpris;
 393 }
 394
 395 static void
 396 disp_dq_free(struct disp_queue_info *dptr)
 397 {
 398         if (dptr->olddispq != NULL)
 399                 kmem_free(dptr->olddispq,
 400                     dptr->oldnglobpris * sizeof (dispq_t));
 401         if (dptr->olddqactmap != NULL)
 402                 kmem_free(dptr->olddqactmap,
 403                     ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
 404 }
 405
 406 /*
 407  * For a newly created CPU, initialize the dispatch queue.
 408  * This is called before the CPU is known through cpu[] or on any lists.
 409  */
 410 void
 411 disp_cpu_init(cpu_t *cp)
 412 {
 413         disp_t  *dp;
 414         dispq_t *newdispq;
 415         ulong_t *newdqactmap;
 416
 417         ASSERT(MUTEX_HELD(&cpu_lock));  /* protect dispatcher queue sizes */
 418
 419         if (cp == cpu0_disp.disp_cpu)
 420                 dp = &cpu0_disp;
 421         else
 422                 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
 423         bzero(dp, sizeof (disp_t));
 424         cp->cpu_disp = dp;
 425         dp->disp_cpu = cp;
 426         dp->disp_maxrunpri = -1;
 427         dp->disp_max_unbound_pri = -1;
 428         DISP_LOCK_INIT(&cp->cpu_thread_lock);
 429         /*
 430          * Allocate memory for the dispatcher queue headers
 431          * and the active queue bitmap.
 432          */
 433         newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
 434         newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
 435             sizeof (long), KM_SLEEP);
 436         dp->disp_q = newdispq;
 437         dp->disp_qactmap = newdqactmap;
 438         dp->disp_q_limit = &newdispq[v.v_nglobpris];
 439         dp->disp_npri = v.v_nglobpris;
 440 }
 441
 442 void
 443 disp_cpu_fini(cpu_t *cp)
 444 {
 445         ASSERT(MUTEX_HELD(&cpu_lock));
 446
 447         disp_kp_free(cp->cpu_disp);
 448         if (cp->cpu_disp != &cpu0_disp)
 449                 kmem_free(cp->cpu_disp, sizeof (disp_t));
 450 }
 451
 452 /*
 453  * Allocate new, larger kpreempt dispatch queue to replace the old one.
 454  */
 455 void
 456 disp_kp_alloc(disp_t *dq, pri_t npri)
 457 {
 458         struct disp_queue_info  mem_info;
 459
 460         if (npri > dq->disp_npri) {
 461                 /*
 462                  * Allocate memory for the new array.
 463                  */
 464                 disp_dq_alloc(&mem_info, npri, dq);
 465
 466                 /*
 467                  * We need to copy the old structures to the new
 468                  * and free the old.
 469                  */
 470                 disp_dq_assign(&mem_info, npri);
 471                 disp_dq_free(&mem_info);
 472         }
 473 }
 474
 475 /*
 476  * Free dispatch queue.
 477  * Used for the kpreempt queues for a removed CPU partition and
 478  * for the per-CPU queues of deleted CPUs.
 479  */
 480 void
 481 disp_kp_free(disp_t *dq)
 482 {
 483         struct disp_queue_info  mem_info;
 484
 485         mem_info.olddispq = dq->disp_q;
 486         mem_info.olddqactmap = dq->disp_qactmap;
 487         mem_info.oldnglobpris = dq->disp_npri;
 488         disp_dq_free(&mem_info);
 489 }
 490
 491 /*
 492  * End dispatcher and scheduler initialization.
 493  */
 494
 495 /*
 496  * See if there's anything to do other than remain idle.
 497  * Return non-zero if there is.
 498  *
 499  * This function must be called with high spl, or with
 500  * kernel preemption disabled to prevent the partition's
 501  * active cpu list from changing while being traversed.
 502  *
 503  * This is essentially a simpler version of disp_getwork()
 504  * to be called by CPUs preparing to "halt".
 505  */
 506 int
 507 disp_anywork(void)
 508 {
 509         cpu_t           *cp = CPU;
 510         cpu_t           *ocp;
 511         volatile int    *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
 512
 513         if (!(cp->cpu_flags & CPU_OFFLINE)) {
 514                 if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
 515                         return (1);
 516
 517                 for (ocp = cp->cpu_next_part; ocp != cp;
 518                     ocp = ocp->cpu_next_part) {
 519                         ASSERT(CPU_ACTIVE(ocp));
 520
 521                         /*
 522                          * Something has appeared on the local run queue.
 523                          */
 524                         if (*local_nrunnable > 0)
 525                                 return (1);
 526                         /*
 527                          * If we encounter another idle CPU that will
 528                          * soon be trolling around through disp_anywork()
 529                          * terminate our walk here and let this other CPU
 530                          * patrol the next part of the list.
 531                          */
 532                         if (ocp->cpu_dispatch_pri == -1 &&
 533                             (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
 534                                 return (0);
 535                         /*
 536                          * Work can be taken from another CPU if:
 537                          *      - There is unbound work on the run queue
 538                          *      - That work isn't a thread undergoing a
 539                          *      - context switch on an otherwise empty queue.
 540                          *      - The CPU isn't running the idle loop.
 541                          */
 542                         if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
 543                             !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
 544                             ocp->cpu_disp->disp_nrunnable == 1) &&
 545                             ocp->cpu_dispatch_pri != -1)
 546                                 return (1);
 547                 }
 548         }
 549         return (0);
 550 }
 551
 552 /*
 553  * Called when CPU enters the idle loop
 554  */
 555 static void
 556 idle_enter()
 557 {
 558         cpu_t           *cp = CPU;
 559
 560         new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
 561         CPU_STATS_ADDQ(cp, sys, idlethread, 1);
 562         set_idle_cpu(cp->cpu_id);       /* arch-dependent hook */
 563 }
 564
 565 /*
 566  * Called when CPU exits the idle loop
 567  */
 568 static void
 569 idle_exit()
 570 {
 571         cpu_t           *cp = CPU;
 572
 573         new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
 574         unset_idle_cpu(cp->cpu_id);     /* arch-dependent hook */
 575 }
 576
 577 /*
 578  * Idle loop.
 579  */
 580 void
 581 idle()
 582 {
 583         struct cpu      *cp = CPU;              /* pointer to this CPU */
 584         kthread_t       *t;                     /* taken thread */
 585
 586         idle_enter();
 587
 588         /*
 589          * Uniprocessor version of idle loop.
 590          * Do this until notified that we're on an actual multiprocessor.
 591          */
 592         while (ncpus == 1) {
 593                 if (cp->cpu_disp->disp_nrunnable == 0) {
 594                         (*idle_cpu)();
 595                         continue;
 596                 }
 597                 idle_exit();
 598                 swtch();
 599
 600                 idle_enter(); /* returned from swtch */
 601         }
 602
 603         /*
 604          * Multiprocessor idle loop.
 605          */
 606         for (;;) {
 607                 /*
 608                  * If CPU is completely quiesced by p_online(2), just wait
 609                  * here with minimal bus traffic until put online.
 610                  */
 611                 while (cp->cpu_flags & CPU_QUIESCED)
 612                         (*idle_cpu)();
 613
 614                 if (cp->cpu_disp->disp_nrunnable != 0) {
 615                         idle_exit();
 616                         swtch();
 617                 } else {
 618                         if (cp->cpu_flags & CPU_OFFLINE)
 619                                 continue;
 620                         if ((t = disp_getwork(cp)) == NULL) {
 621                                 if (cp->cpu_chosen_level != -1) {
 622                                         disp_t *dp = cp->cpu_disp;
 623                                         disp_t *kpq;
 624
 625                                         disp_lock_enter(&dp->disp_lock);
 626                                         /*
 627                                          * Set kpq under lock to prevent
 628                                          * migration between partitions.
 629                                          */
 630                                         kpq = &cp->cpu_part->cp_kp_queue;
 631                                         if (kpq->disp_maxrunpri == -1)
 632                                                 cp->cpu_chosen_level = -1;
 633                                         disp_lock_exit(&dp->disp_lock);
 634                                 }
 635                                 (*idle_cpu)();
 636                                 continue;
 637                         }
 638                         /*
 639                          * If there was a thread but we couldn't steal
 640                          * it, then keep trying.
 641                          */
 642                         if (t == T_DONTSTEAL)
 643                                 continue;
 644                         idle_exit();
 645                         swtch_to(t);
 646                 }
 647                 idle_enter(); /* returned from swtch/swtch_to */
 648         }
 649 }
 650
 651
 652 /*
 653  * Preempt the currently running thread in favor of the highest
 654  * priority thread.  The class of the current thread controls
 655  * where it goes on the dispatcher queues. If panicking, turn
 656  * preemption off.
 657  */
 658 void
 659 preempt()
 660 {
 661         kthread_t       *t = curthread;
 662         klwp_t          *lwp = ttolwp(curthread);
 663
 664         if (panicstr)
 665                 return;
 666
 667         TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
 668
 669         thread_lock(t);
 670
 671         if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
 672                 /*
 673                  * this thread has already been chosen to be run on
 674                  * another CPU. Clear kprunrun on this CPU since we're
 675                  * already headed for swtch().
 676                  */
 677                 CPU->cpu_kprunrun = 0;
 678                 thread_unlock_nopreempt(t);
 679                 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
 680         } else {
 681                 if (lwp != NULL)
 682                         lwp->lwp_ru.nivcsw++;
 683                 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
 684                 THREAD_TRANSITION(t);
 685                 CL_PREEMPT(t);
 686                 DTRACE_SCHED(preempt);
 687                 thread_unlock_nopreempt(t);
 688
 689                 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
 690
 691                 swtch();                /* clears CPU->cpu_runrun via disp() */
 692         }
 693 }
 694
 695 extern kthread_t *thread_unpin();
 696
 697 /*
 698  * disp() - find the highest priority thread for this processor to run, and
 699  * set it in TS_ONPROC state so that resume() can be called to run it.
 700  */
 701 static kthread_t *
 702 disp()
 703 {
 704         cpu_t           *cpup;
 705         disp_t          *dp;
 706         kthread_t       *tp;
 707         dispq_t         *dq;
 708         int             maxrunword;
 709         pri_t           pri;
 710         disp_t          *kpq;
 711
 712         TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
 713
 714         cpup = CPU;
 715         /*
 716          * Find the highest priority loaded, runnable thread.
 717          */
 718         dp = cpup->cpu_disp;
 719
 720 reschedule:
 721         /*
 722          * If there is more important work on the global queue with a better
 723          * priority than the maximum on this CPU, take it now.
 724          */
 725         kpq = &cpup->cpu_part->cp_kp_queue;
 726         while ((pri = kpq->disp_maxrunpri) >= 0 &&
 727             pri >= dp->disp_maxrunpri &&
 728             (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
 729             (tp = disp_getbest(kpq)) != NULL) {
 730                 if (disp_ratify(tp, kpq) != NULL) {
 731                         TRACE_1(TR_FAC_DISP, TR_DISP_END,
 732                             "disp_end:tid %p", tp);
 733                         return (tp);
 734                 }
 735         }
 736
 737         disp_lock_enter(&dp->disp_lock);
 738         pri = dp->disp_maxrunpri;
 739
 740         /*
 741          * If there is nothing to run, look at what's runnable on other queues.
 742          * Choose the idle thread if the CPU is quiesced.
 743          * Note that CPUs that have the CPU_OFFLINE flag set can still run
 744          * interrupt threads, which will be the only threads on the CPU's own
 745          * queue, but cannot run threads from other queues.
 746          */
 747         if (pri == -1) {
 748                 if (!(cpup->cpu_flags & CPU_OFFLINE)) {
 749                         disp_lock_exit(&dp->disp_lock);
 750                         if ((tp = disp_getwork(cpup)) == NULL ||
 751                             tp == T_DONTSTEAL) {
 752                                 tp = cpup->cpu_idle_thread;
 753                                 (void) splhigh();
 754                                 THREAD_ONPROC(tp, cpup);
 755                                 cpup->cpu_dispthread = tp;
 756                                 cpup->cpu_dispatch_pri = -1;
 757                                 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
 758                                 cpup->cpu_chosen_level = -1;
 759                         }
 760                 } else {
 761                         disp_lock_exit_high(&dp->disp_lock);
 762                         tp = cpup->cpu_idle_thread;
 763                         THREAD_ONPROC(tp, cpup);
 764                         cpup->cpu_dispthread = tp;
 765                         cpup->cpu_dispatch_pri = -1;
 766                         cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
 767                         cpup->cpu_chosen_level = -1;
 768                 }
 769                 TRACE_1(TR_FAC_DISP, TR_DISP_END,
 770                     "disp_end:tid %p", tp);
 771                 return (tp);
 772         }
 773
 774         dq = &dp->disp_q[pri];
 775         tp = dq->dq_first;
 776
 777         ASSERT(tp != NULL);
 778
 779         DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
 780
 781         /*
 782          * Found it so remove it from queue.
 783          */
 784         dp->disp_nrunnable--;
 785         dq->dq_sruncnt--;
 786         if ((dq->dq_first = tp->t_link) == NULL) {
 787                 ulong_t *dqactmap = dp->disp_qactmap;
 788
 789                 ASSERT(dq->dq_sruncnt == 0);
 790                 dq->dq_last = NULL;
 791
 792                 /*
 793                  * The queue is empty, so the corresponding bit needs to be
 794                  * turned off in dqactmap.   If nrunnable != 0 just took the
 795                  * last runnable thread off the
 796                  * highest queue, so recompute disp_maxrunpri.
 797                  */
 798                 maxrunword = pri >> BT_ULSHIFT;
 799                 dqactmap[maxrunword] &= ~BT_BIW(pri);
 800
 801                 if (dp->disp_nrunnable == 0) {
 802                         dp->disp_max_unbound_pri = -1;
 803                         dp->disp_maxrunpri = -1;
 804                 } else {
 805                         int ipri;
 806
 807                         ipri = bt_gethighbit(dqactmap, maxrunword);
 808                         dp->disp_maxrunpri = ipri;
 809                         if (ipri < dp->disp_max_unbound_pri)
 810                                 dp->disp_max_unbound_pri = ipri;
 811                 }
 812         } else {
 813                 tp->t_link = NULL;
 814         }
 815
 816         cpup->cpu_dispthread = tp;              /* protected by spl only */
 817         cpup->cpu_dispatch_pri = pri;
 818         ASSERT(pri == DISP_PRIO(tp));
 819         thread_onproc(tp, cpup);                /* set t_state to TS_ONPROC */
 820         disp_lock_exit_high(&dp->disp_lock);    /* drop run queue lock */
 821
 822         ASSERT(tp != NULL);
 823         TRACE_1(TR_FAC_DISP, TR_DISP_END,
 824             "disp_end:tid %p", tp);
 825
 826         if (disp_ratify(tp, kpq) == NULL)
 827                 goto reschedule;
 828
 829         return (tp);
 830 }
 831
 832 /*
 833  * swtch()
 834  *      Find best runnable thread and run it.
 835  *      Called with the current thread already switched to a new state,
 836  *      on a sleep queue, run queue, stopped, and not zombied.
 837  *      May be called at any spl level less than or equal to LOCK_LEVEL.
 838  *      Always drops spl to the base level (spl0()).
 839  */
 840 void
 841 swtch()
 842 {
 843         kthread_t       *t = curthread;
 844         kthread_t       *next;
 845         cpu_t           *cp;
 846
 847         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
 848
 849         if (t->t_flag & T_INTR_THREAD)
 850                 cpu_intr_swtch_enter(t);
 851
 852         if (t->t_intr != NULL) {
 853                 /*
 854                  * We are an interrupt thread.  Setup and return
 855                  * the interrupted thread to be resumed.
 856                  */
 857                 (void) splhigh();       /* block other scheduler action */
 858                 cp = CPU;               /* now protected against migration */
 859                 ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
 860                 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
 861                 CPU_STATS_ADDQ(cp, sys, intrblk, 1);
 862                 next = thread_unpin();
 863                 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 864                 resume_from_intr(next);
 865         } else {
 866 #ifdef  DEBUG
 867                 if (t->t_state == TS_ONPROC &&
 868                     t->t_disp_queue->disp_cpu == CPU &&
 869                     t->t_preempt == 0) {
 870                         thread_lock(t);
 871                         ASSERT(t->t_state != TS_ONPROC ||
 872                             t->t_disp_queue->disp_cpu != CPU ||
 873                             t->t_preempt != 0); /* cannot migrate */
 874                         thread_unlock_nopreempt(t);
 875                 }
 876 #endif  /* DEBUG */
 877                 cp = CPU;
 878                 next = disp();          /* returns with spl high */
 879                 ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
 880
 881                 /* OK to steal anything left on run queue */
 882                 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
 883
 884                 if (next != t) {
 885                         hrtime_t now;
 886
 887                         now = gethrtime_unscaled();
 888                         pg_ev_thread_swtch(cp, now, t, next);
 889
 890                         /*
 891                          * If t was previously in the TS_ONPROC state,
 892                          * setfrontdq and setbackdq won't have set its t_waitrq.
 893                          * Since we now finally know that we're switching away
 894                          * from this thread, set its t_waitrq if it is on a run
 895                          * queue.
 896                          */
 897                         if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
 898                                 t->t_waitrq = now;
 899                         }
 900
 901                         /*
 902                          * restore mstate of thread that we are switching to
 903                          */
 904                         restore_mstate(next);
 905
 906                         CPU_STATS_ADDQ(cp, sys, pswitch, 1);
 907                         cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
 908                         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 909
 910                         if (dtrace_vtime_active)
 911                                 dtrace_vtime_switch(next);
 912
 913                         resume(next);
 914                         /*
 915                          * The TR_RESUME_END and TR_SWTCH_END trace points
 916                          * appear at the end of resume(), because we may not
 917                          * return here
 918                          */
 919                 } else {
 920                         if (t->t_flag & T_INTR_THREAD)
 921                                 cpu_intr_swtch_exit(t);
 922                         /*
 923                          * Threads that enqueue themselves on a run queue defer
 924                          * setting t_waitrq. It is then either set in swtch()
 925                          * when the CPU is actually yielded, or not at all if it
 926                          * is remaining on the CPU.
 927                          * There is however a window between where the thread
 928                          * placed itself on a run queue, and where it selects
 929                          * itself in disp(), where a third party (eg. clock()
 930                          * doing tick processing) may have re-enqueued this
 931                          * thread, setting t_waitrq in the process. We detect
 932                          * this race by noticing that despite switching to
 933                          * ourself, our t_waitrq has been set, and should be
 934                          * cleared.
 935                          */
 936                         if (t->t_waitrq != 0)
 937                                 t->t_waitrq = 0;
 938
 939                         pg_ev_thread_remain(cp, t);
 940
 941                         DTRACE_SCHED(remain__cpu);
 942                         TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
 943                         (void) spl0();
 944                 }
 945         }
 946 }
 947
 948 /*
 949  * swtch_from_zombie()
 950  *      Special case of swtch(), which allows checks for TS_ZOMB to be
 951  *      eliminated from normal resume.
 952  *      Find best runnable thread and run it.
 953  *      Called with the current thread zombied.
 954  *      Zombies cannot migrate, so CPU references are safe.
 955  */
 956 void
 957 swtch_from_zombie()
 958 {
 959         kthread_t       *next;
 960         cpu_t           *cpu = CPU;
 961
 962         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
 963
 964         ASSERT(curthread->t_state == TS_ZOMB);
 965
 966         next = disp();                  /* returns with spl high */
 967         ASSERT(CPU_ON_INTR(CPU) == 0);  /* not called with PIL > 10 */
 968         CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
 969         ASSERT(next != curthread);
 970         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 971
 972         pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
 973
 974         restore_mstate(next);
 975
 976         if (dtrace_vtime_active)
 977                 dtrace_vtime_switch(next);
 978
 979         resume_from_zombie(next);
 980         /*
 981          * The TR_RESUME_END and TR_SWTCH_END trace points
 982          * appear at the end of resume(), because we certainly will not
 983          * return here
 984          */
 985 }
 986
 987 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
 988
 989 /*
 990  * search_disp_queues()
 991  *      Search the given dispatch queues for thread tp.
 992  *      Return 1 if tp is found, otherwise return 0.
 993  */
 994 static int
 995 search_disp_queues(disp_t *dp, kthread_t *tp)
 996 {
 997         dispq_t         *dq;
 998         dispq_t         *eq;
 999
1000         disp_lock_enter_high(&dp->disp_lock);
1001
1002         for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1003                 kthread_t       *rp;
1004
1005                 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1006
1007                 for (rp = dq->dq_first; rp; rp = rp->t_link)
1008                         if (tp == rp) {
1009                                 disp_lock_exit_high(&dp->disp_lock);
1010                                 return (1);
1011                         }
1012         }
1013         disp_lock_exit_high(&dp->disp_lock);
1014
1015         return (0);
1016 }
1017
1018 /*
1019  * thread_on_queue()
1020  *      Search all per-CPU dispatch queues and all partition-wide kpreempt
1021  *      queues for thread tp. Return 1 if tp is found, otherwise return 0.
1022  */
1023 static int
1024 thread_on_queue(kthread_t *tp)
1025 {
1026         cpu_t           *cp;
1027         struct cpupart  *part;
1028
1029         ASSERT(getpil() >= DISP_LEVEL);
1030
1031         /*
1032          * Search the per-CPU dispatch queues for tp.
1033          */
1034         cp = CPU;
1035         do {
1036                 if (search_disp_queues(cp->cpu_disp, tp))
1037                         return (1);
1038         } while ((cp = cp->cpu_next_onln) != CPU);
1039
1040         /*
1041          * Search the partition-wide kpreempt queues for tp.
1042          */
1043         part = CPU->cpu_part;
1044         do {
1045                 if (search_disp_queues(&part->cp_kp_queue, tp))
1046                         return (1);
1047         } while ((part = part->cp_next) != CPU->cpu_part);
1048
1049         return (0);
1050 }
1051
1052 #else
1053
1054 #define thread_on_queue(tp)     0       /* ASSERT must be !thread_on_queue */
1055
1056 #endif  /* DEBUG */
1057
1058 /*
1059  * like swtch(), but switch to a specified thread taken from another CPU.
1060  *      called with spl high..
1061  */
1062 void
1063 swtch_to(kthread_t *next)
1064 {
1065         cpu_t                   *cp = CPU;
1066         hrtime_t                now;
1067
1068         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1069
1070         /*
1071          * Update context switch statistics.
1072          */
1073         CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1074
1075         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1076
1077         now = gethrtime_unscaled();
1078         pg_ev_thread_swtch(cp, now, curthread, next);
1079
1080         /* OK to steal anything left on run queue */
1081         cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1082
1083         /* record last execution time */
1084         cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1085
1086         /*
1087          * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1088          * won't have set its t_waitrq.  Since we now finally know that we're
1089          * switching away from this thread, set its t_waitrq if it is on a run
1090          * queue.
1091          */
1092         if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1093                 curthread->t_waitrq = now;
1094         }
1095
1096         /* restore next thread to previously running microstate */
1097         restore_mstate(next);
1098
1099         if (dtrace_vtime_active)
1100                 dtrace_vtime_switch(next);
1101
1102         resume(next);
1103         /*
1104          * The TR_RESUME_END and TR_SWTCH_END trace points
1105          * appear at the end of resume(), because we may not
1106          * return here
1107          */
1108 }
1109
1110 #define CPU_IDLING(pri) ((pri) == -1)
1111
1112 static void
1113 cpu_resched(cpu_t *cp, pri_t tpri)
1114 {
1115         int     call_poke_cpu = 0;
1116         pri_t   cpupri = cp->cpu_dispatch_pri;
1117
1118         if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1119                 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1120                     "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1121                 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1122                         cp->cpu_runrun = 1;
1123                         aston(cp->cpu_dispthread);
1124                         if (tpri < kpreemptpri && cp != CPU)
1125                                 call_poke_cpu = 1;
1126                 }
1127                 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1128                         cp->cpu_kprunrun = 1;
1129                         if (cp != CPU)
1130                                 call_poke_cpu = 1;
1131                 }
1132         }
1133
1134         /*
1135          * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1136          */
1137         membar_enter();
1138
1139         if (call_poke_cpu)
1140                 poke_cpu(cp->cpu_id);
1141 }
1142
/*
 * setbackdq() keeps runqs balanced such that the difference in length
 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
 * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
 * try to keep runqs perfectly balanced regardless of the thread priority.
 *
 * RUNQ_LEN(cp, pri) is the number of threads queued at priority pri on
 * the dispatch queue of CPU cp.
 */
#define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
#define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
#define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)

/*
 * Macro that evaluates to true if it is likely that the thread has cache
 * warmth. This is based on the amount of time that has elapsed since the
 * thread last ran. If that amount of time is less than "rechoose_interval"
 * ticks, then we decide that the thread has enough cache warmth to warrant
 * some affinity for t->t_cpu.  The current thread is always considered warm.
 *
 * The argument is fully parenthesized so that any pointer-valued expression
 * may be passed; note that it may be evaluated twice, so it must not have
 * side effects.
 */
#define	THREAD_HAS_CACHE_WARMTH(thread)	\
	(((thread) == curthread) ||	\
	((ddi_get_lbolt() - (thread)->t_disp_time) <= rechoose_interval))
1164
1165 /*
1166  * Put the specified thread on the front/back of the dispatcher queue
1167  * corresponding to its current priority.
1168  *
1169  * Called with the thread in transition, onproc or stopped state and locked
1170  * (transition implies locked) and at high spl.  Returns with the thread in
1171  * TS_RUN state and still locked.
1172  */
1173 static void
1174 setfrontbackdq(kthread_t *tp, bool front)
1175 {
1176         dispq_t         *dq;
1177         disp_t          *dp;
1178         cpu_t           *cp;
1179         pri_t           tpri;
1180         bool            bound;
1181         boolean_t       self;
1182
1183         ASSERT(THREAD_LOCK_HELD(tp));
1184         ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1185         ASSERT(!thread_on_queue(tp));   /* make sure tp isn't on a runq */
1186
1187         self  = (tp == curthread);
1188         bound = (tp->t_bound_cpu || tp->t_weakbound_cpu);
1189
1190         tpri = DISP_PRIO(tp);
1191         if (ncpus == 1)
1192                 cp = tp->t_cpu;
1193         else if (!bound) {
1194                 if (tpri >= kpqpri) {
1195                         setkpdq(tp, front ? SETKP_FRONT : SETKP_BACK);
1196                         return;
1197                 }
1198
1199                 cp = tp->t_cpu;
1200
1201                 if (!front) {
1202                         /*
1203                          * We'll generally let this thread continue to run where
1204                          * it last ran...but will consider migration if:
1205                          * - We thread probably doesn't have much cache warmth.
1206                          * - The CPU where it last ran is the target of an offline
1207                          *   request.
1208                          * - The thread last ran outside it's home lgroup.
1209                          */
1210                         if ((!THREAD_HAS_CACHE_WARMTH(tp)) || (cp == cpu_inmotion)) {
1211                                 cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri, NULL);
1212                         } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) {
1213                                 cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri,
1214                                     self ? cp : NULL);
1215                         }
1216
1217                 }
1218
1219                 if (tp->t_cpupart == cp->cpu_part) {
1220                         if (front) {
1221                                 /*
1222                                  * We'll generally let this thread continue to run
1223                                  * where it last ran, but will consider migration if:
1224                                  * - The thread last ran outside it's home lgroup.
1225                                  * - The CPU where it last ran is the target of an
1226                                  *   offline request (a thread_nomigrate() on the in
1227                                  *   motion CPU relies on this when forcing a preempt).
1228                                  * - The thread isn't the highest priority thread where
1229                                  *   it last ran, and it is considered not likely to
1230                                  *   have significant cache warmth.
1231                                  */
1232                                 if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1233                                     (cp == cpu_inmotion)) {
1234                                         cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri,
1235                                             self ? cp : NULL);
1236                                 } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1237                                     (!THREAD_HAS_CACHE_WARMTH(tp))) {
1238                                         cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri,
1239                                             NULL);
1240                                 }
1241                         } else {
1242                                 int     qlen;
1243
1244                                 /*
1245                                  * Perform any CMT load balancing
1246                                  */
1247                                 cp = cmt_balance(tp, cp);
1248
1249                                 /*
1250                                  * Balance across the run queues
1251                                  */
1252                                 qlen = RUNQ_LEN(cp, tpri);
1253                                 if (tpri >= RUNQ_MATCH_PRI &&
1254                                     !(tp->t_schedflag & TS_RUNQMATCH))
1255                                         qlen -= RUNQ_MAX_DIFF;
1256                                 if (qlen > 0) {
1257                                         cpu_t *newcp;
1258
1259                                         if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1260                                                 newcp = cp->cpu_next_part;
1261                                         } else if ((newcp = cp->cpu_next_lpl) == cp) {
1262                                                 newcp = cp->cpu_next_part;
1263                                         }
1264
1265                                         if (RUNQ_LEN(newcp, tpri) < qlen) {
1266                                                 DTRACE_PROBE3(runq__balance,
1267                                                     kthread_t *, tp,
1268                                                     cpu_t *, cp, cpu_t *, newcp);
1269                                                 cp = newcp;
1270                                         }
1271                                 }
1272                         }
1273                 } else {
1274                         /*
1275                          * Migrate to a cpu in the new partition.
1276                          */
1277                         cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1278                             tp->t_lpl, tp->t_pri, NULL);
1279                 }
1280
1281                 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1282         } else {
1283                 /*
1284                  * It is possible that t_weakbound_cpu != t_bound_cpu (for
1285                  * a short time until weak binding that existed when the
1286                  * strong binding was established has dropped) so we must
1287                  * favour weak binding over strong.
1288                  */
1289                 cp = tp->t_weakbound_cpu ?
1290                     tp->t_weakbound_cpu : tp->t_bound_cpu;
1291         }
1292
1293         /*
1294          * A thread that is ONPROC may be temporarily placed on the run queue
1295          * but then chosen to run again by disp.  If the thread we're placing on
1296          * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1297          * replacement process is actually scheduled in swtch().  In this
1298          * situation, curthread is the only thread that could be in the ONPROC
1299          * state.
1300          */
1301         if ((!self) && (tp->t_waitrq == 0)) {
1302                 hrtime_t curtime;
1303
1304                 curtime = gethrtime_unscaled();
1305                 (void) cpu_update_pct(tp, curtime);
1306                 tp->t_waitrq = curtime;
1307         } else {
1308                 (void) cpu_update_pct(tp, gethrtime_unscaled());
1309         }
1310
1311         dp = cp->cpu_disp;
1312         disp_lock_enter_high(&dp->disp_lock);
1313
1314         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, front);
1315
1316 #ifndef NPROBE
1317         /* Kernel probe */
1318         if (tnf_tracing_active)
1319                 tnf_thread_queue(tp, cp, tpri);
1320 #endif /* NPROBE */
1321
1322         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1323
1324         THREAD_RUN(tp, &dp->disp_lock);         /* set t_state to TS_RUN */
1325         tp->t_disp_queue = dp;
1326         tp->t_link = NULL;
1327
1328         dq = &dp->disp_q[tpri];
1329         dp->disp_nrunnable++;
1330         if (!bound)
1331                 dp->disp_steal = 0;
1332         membar_enter();
1333
1334         if (dq->dq_sruncnt++ != 0) {
1335                 if (front) {
1336                         ASSERT(dq->dq_last != NULL);
1337                         tp->t_link = dq->dq_first;
1338                         dq->dq_first = tp;
1339                 } else {
1340                         ASSERT(dq->dq_first != NULL);
1341                         dq->dq_last->t_link = tp;
1342                         dq->dq_last = tp;
1343                 }
1344         } else {
1345                 ASSERT(dq->dq_first == NULL);
1346                 ASSERT(dq->dq_last == NULL);
1347                 dq->dq_first = dq->dq_last = tp;
1348                 BT_SET(dp->disp_qactmap, tpri);
1349                 if (tpri > dp->disp_maxrunpri) {
1350                         dp->disp_maxrunpri = tpri;
1351                         membar_enter();
1352                         cpu_resched(cp, tpri);
1353                 }
1354         }
1355
1356         if (!bound && tpri > dp->disp_max_unbound_pri) {
1357                 if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1358                         /*
1359                          * If there are no other unbound threads on the
1360                          * run queue, don't allow other CPUs to steal
1361                          * this thread while we are in the middle of a
1362                          * context switch. We may just switch to it
1363                          * again right away. CPU_DISP_DONTSTEAL is cleared
1364                          * in swtch and swtch_to.
1365                          */
1366                         cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1367                 }
1368                 dp->disp_max_unbound_pri = tpri;
1369         }
1370
1371         (*disp_enq_thread)(cp, bound);
1372 }
1373
1374 /*
1375  * Put the specified thread on the back of the dispatcher
1376  * queue corresponding to its current priority.
1377  *
1378  * Called with the thread in transition, onproc or stopped state
1379  * and locked (transition implies locked) and at high spl.
1380  * Returns with the thread in TS_RUN state and still locked.
1381  */
1382 void
1383 setbackdq(kthread_t *tp)
1384 {
1385         setfrontbackdq(tp, false);
1386 }
1387
1388 /*
1389  * Put the specified thread on the front of the dispatcher
1390  * queue corresponding to its current priority.
1391  *
1392  * Called with the thread in transition, onproc or stopped state
1393  * and locked (transition implies locked) and at high spl.
1394  * Returns with the thread in TS_RUN state and still locked.
1395  */
1396 void
1397 setfrontdq(kthread_t *tp)
1398 {
1399         setfrontbackdq(tp, true);
1400 }
1401
1402 /*
1403  * Put a high-priority unbound thread on the kp queue
1404  */
1405 static void
1406 setkpdq(kthread_t *tp, int borf)
1407 {
1408         dispq_t *dq;
1409         disp_t  *dp;
1410         cpu_t   *cp;
1411         pri_t   tpri;
1412
1413         tpri = DISP_PRIO(tp);
1414
1415         dp = &tp->t_cpupart->cp_kp_queue;
1416         disp_lock_enter_high(&dp->disp_lock);
1417
1418         TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1419
1420         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1421         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1422         THREAD_RUN(tp, &dp->disp_lock);         /* set t_state to TS_RUN */
1423         tp->t_disp_queue = dp;
1424         dp->disp_nrunnable++;
1425         dq = &dp->disp_q[tpri];
1426
1427         if (dq->dq_sruncnt++ != 0) {
1428                 if (borf == SETKP_BACK) {
1429                         ASSERT(dq->dq_first != NULL);
1430                         tp->t_link = NULL;
1431                         dq->dq_last->t_link = tp;
1432                         dq->dq_last = tp;
1433                 } else {
1434                         ASSERT(dq->dq_last != NULL);
1435                         tp->t_link = dq->dq_first;
1436                         dq->dq_first = tp;
1437                 }
1438         } else {
1439                 if (borf == SETKP_BACK) {
1440                         ASSERT(dq->dq_first == NULL);
1441                         ASSERT(dq->dq_last == NULL);
1442                         dq->dq_first = dq->dq_last = tp;
1443                 } else {
1444                         ASSERT(dq->dq_last == NULL);
1445                         ASSERT(dq->dq_first == NULL);
1446                         tp->t_link = NULL;
1447                         dq->dq_first = dq->dq_last = tp;
1448                 }
1449                 BT_SET(dp->disp_qactmap, tpri);
1450                 if (tpri > dp->disp_max_unbound_pri)
1451                         dp->disp_max_unbound_pri = tpri;
1452                 if (tpri > dp->disp_maxrunpri) {
1453                         dp->disp_maxrunpri = tpri;
1454                         membar_enter();
1455                 }
1456         }
1457
1458         cp = tp->t_cpu;
1459         if (tp->t_cpupart != cp->cpu_part) {
1460                 /* migrate to a cpu in the new partition */
1461                 cp = tp->t_cpupart->cp_cpulist;
1462         }
1463         cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1464         disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1465         ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1466
1467 #ifndef NPROBE
1468         /* Kernel probe */
1469         if (tnf_tracing_active)
1470                 tnf_thread_queue(tp, cp, tpri);
1471 #endif /* NPROBE */
1472
1473         if (cp->cpu_chosen_level < tpri)
1474                 cp->cpu_chosen_level = tpri;
1475         cpu_resched(cp, tpri);
1476         disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1477         (*disp_enq_thread)(cp, 0);
1478 }
1479
1480 /*
1481  * Remove a thread from the dispatcher queue if it is on it.
1482  * It is not an error if it is not found but we return whether
1483  * or not it was found in case the caller wants to check.
1484  */
1485 int
1486 dispdeq(kthread_t *tp)
1487 {
1488         disp_t          *dp;
1489         dispq_t         *dq;
1490         kthread_t       *rp;
1491         kthread_t       *trp;
1492         kthread_t       **ptp;
1493         int             tpri;
1494
1495         ASSERT(THREAD_LOCK_HELD(tp));
1496
1497         if (tp->t_state != TS_RUN)
1498                 return (0);
1499
1500         tpri = DISP_PRIO(tp);
1501         dp = tp->t_disp_queue;
1502         ASSERT(tpri < dp->disp_npri);
1503         dq = &dp->disp_q[tpri];
1504         ptp = &dq->dq_first;
1505         rp = *ptp;
1506         trp = NULL;
1507
1508         ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1509
1510         /*
1511          * Search for thread in queue.
1512          * Double links would simplify this at the expense of disp/setrun.
1513          */
1514         while (rp != tp && rp != NULL) {
1515                 trp = rp;
1516                 ptp = &trp->t_link;
1517                 rp = trp->t_link;
1518         }
1519
1520         if (rp == NULL) {
1521                 panic("dispdeq: thread not on queue");
1522         }
1523
1524         DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1525
1526         /*
1527          * Found it so remove it from queue.
1528          */
1529         if ((*ptp = rp->t_link) == NULL)
1530                 dq->dq_last = trp;
1531
1532         dp->disp_nrunnable--;
1533         if (--dq->dq_sruncnt == 0) {
1534                 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1535                 if (dp->disp_nrunnable == 0) {
1536                         dp->disp_max_unbound_pri = -1;
1537                         dp->disp_maxrunpri = -1;
1538                 } else if (tpri == dp->disp_maxrunpri) {
1539                         int ipri;
1540
1541                         ipri = bt_gethighbit(dp->disp_qactmap,
1542                             dp->disp_maxrunpri >> BT_ULSHIFT);
1543                         if (ipri < dp->disp_max_unbound_pri)
1544                                 dp->disp_max_unbound_pri = ipri;
1545                         dp->disp_maxrunpri = ipri;
1546                 }
1547         }
1548         tp->t_link = NULL;
1549         THREAD_TRANSITION(tp);          /* put in intermediate state */
1550         return (1);
1551 }
1552
1553 /*
1554  *      Make a thread give up its processor.  Find the processor on
1555  *      which this thread is executing, and have that processor
1556  *      preempt.
1557  *
1558  *      We allow System Duty Cycle (SDC) threads to be preempted even if
1559  *      they are running at kernel priorities.  To implement this, we always
1560  *      set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
1561  *      calls cpu_surrender() very often, we only preempt if there is anyone
1562  *      competing with us.
1563  */
1564 void
1565 cpu_surrender(kthread_t *tp)
1566 {
1567         cpu_t   *cpup;
1568         int     max_pri;
1569         int     max_run_pri;
1570         klwp_t  *lwp;
1571
1572         ASSERT(THREAD_LOCK_HELD(tp));
1573
1574         if (tp->t_state != TS_ONPROC)
1575                 return;
1576         cpup = tp->t_disp_queue->disp_cpu;      /* CPU thread dispatched to */
1577         max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1578         max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1579         if (max_pri < max_run_pri)
1580                 max_pri = max_run_pri;
1581
1582         if (tp->t_cid == sysdccid) {
1583                 uint_t t_pri = DISP_PRIO(tp);
1584                 if (t_pri > max_pri)
1585                         return;         /* we are not competing w/ anyone */
1586                 cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1587         } else {
1588                 cpup->cpu_runrun = 1;
1589                 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1590                         cpup->cpu_kprunrun = 1;
1591                 }
1592         }
1593
1594         /*
1595          * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1596          */
1597         membar_enter();
1598
1599         DTRACE_SCHED1(surrender, kthread_t *, tp);
1600
1601         /*
1602          * Make the target thread take an excursion through trap()
1603          * to do preempt() (unless we're already in trap or post_syscall,
1604          * calling cpu_surrender via CL_TRAPRET).
1605          */
1606         if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1607             lwp->lwp_state != LWP_USER) {
1608                 aston(tp);
1609                 if (cpup != CPU)
1610                         poke_cpu(cpup->cpu_id);
1611         }
1612         TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1613             "cpu_surrender:tid %p cpu %p", tp, cpup);
1614 }
1615
1616 /*
1617  * Commit to and ratify a scheduling decision
1618  */
1619 /*ARGSUSED*/
1620 static kthread_t *
1621 disp_ratify(kthread_t *tp, disp_t *kpq)
1622 {
1623         pri_t   tpri, maxpri;
1624         pri_t   maxkpri;
1625         cpu_t   *cpup;
1626
1627         ASSERT(tp != NULL);
1628         /*
1629          * Commit to, then ratify scheduling decision
1630          */
1631         cpup = CPU;
1632         if (cpup->cpu_runrun != 0)
1633                 cpup->cpu_runrun = 0;
1634         if (cpup->cpu_kprunrun != 0)
1635                 cpup->cpu_kprunrun = 0;
1636         if (cpup->cpu_chosen_level != -1)
1637                 cpup->cpu_chosen_level = -1;
1638         membar_enter();
1639         tpri = DISP_PRIO(tp);
1640         maxpri = cpup->cpu_disp->disp_maxrunpri;
1641         maxkpri = kpq->disp_maxrunpri;
1642         if (maxpri < maxkpri)
1643                 maxpri = maxkpri;
1644         if (tpri < maxpri) {
1645                 /*
1646                  * should have done better
1647                  * put this one back and indicate to try again
1648                  */
1649                 cpup->cpu_dispthread = curthread;       /* fixup dispthread */
1650                 cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1651                 thread_lock_high(tp);
1652                 THREAD_TRANSITION(tp);
1653                 setfrontdq(tp);
1654                 thread_unlock_nopreempt(tp);
1655
1656                 tp = NULL;
1657         }
1658         return (tp);
1659 }
1660
1661 /*
1662  * See if there is any work on the dispatcher queue for other CPUs.
1663  * If there is, dequeue the best thread and return.
1664  */
1665 static kthread_t *
1666 disp_getwork(cpu_t *cp)
1667 {
1668         cpu_t           *ocp;           /* other CPU */
1669         cpu_t           *ocp_start;
1670         cpu_t           *tcp;           /* target local CPU */
1671         kthread_t       *tp;
1672         kthread_t       *retval = NULL;
1673         pri_t           maxpri;
1674         disp_t          *kpq;           /* kp queue for this partition */
1675         lpl_t           *lpl, *lpl_leaf;
1676         int             leafidx, startidx;
1677         hrtime_t        stealtime;
1678         lgrp_id_t       local_id;
1679
1680         maxpri = -1;
1681         tcp = NULL;
1682
1683         kpq = &cp->cpu_part->cp_kp_queue;
1684         while (kpq->disp_maxrunpri >= 0) {
1685                 /*
1686                  * Try to take a thread from the kp_queue.
1687                  */
1688                 tp = (disp_getbest(kpq));
1689                 if (tp)
1690                         return (disp_ratify(tp, kpq));
1691         }
1692
1693         kpreempt_disable();             /* protect the cpu_active list */
1694
1695         /*
1696          * Try to find something to do on another CPU's run queue.
1697          * Loop through all other CPUs looking for the one with the highest
1698          * priority unbound thread.
1699          *
1700          * On NUMA machines, the partition's CPUs are consulted in order of
1701          * distance from the current CPU. This way, the first available
1702          * work found is also the closest, and will suffer the least
1703          * from being migrated.
1704          */
1705         lpl = lpl_leaf = cp->cpu_lpl;
1706         local_id = lpl_leaf->lpl_lgrpid;
1707         leafidx = startidx = 0;
1708
1709         /*
1710          * This loop traverses the lpl hierarchy. Higher level lpls represent
1711          * broader levels of locality
1712          */
1713         do {
1714                 /* This loop iterates over the lpl's leaves */
1715                 do {
1716                         if (lpl_leaf != cp->cpu_lpl)
1717                                 ocp = lpl_leaf->lpl_cpus;
1718                         else
1719                                 ocp = cp->cpu_next_lpl;
1720
1721                         /* This loop iterates over the CPUs in the leaf */
1722                         ocp_start = ocp;
1723                         do {
1724                                 pri_t pri;
1725
1726                                 ASSERT(CPU_ACTIVE(ocp));
1727
1728                                 /*
1729                                  * End our stroll around this lpl if:
1730                                  *
1731                                  * - Something became runnable on the local
1732                                  *   queue...which also ends our stroll around
1733                                  *   the partition.
1734                                  *
1735                                  * - We happen across another idle CPU.
1736                                  *   Since it is patrolling the next portion
1737                                  *   of the lpl's list (assuming it's not
1738                                  *   halted, or busy servicing an interrupt),
1739                                  *   move to the next higher level of locality.
1740                                  */
1741                                 if (cp->cpu_disp->disp_nrunnable != 0) {
1742                                         kpreempt_enable();
1743                                         return (NULL);
1744                                 }
1745                                 if (ocp->cpu_dispatch_pri == -1) {
1746                                         if (ocp->cpu_disp_flags &
1747                                             CPU_DISP_HALTED ||
1748                                             ocp->cpu_intr_actv != 0)
1749                                                 continue;
1750                                         else
1751                                                 goto next_level;
1752                                 }
1753
1754                                 /*
1755                                  * If there's only one thread and the CPU
1756                                  * is in the middle of a context switch,
1757                                  * or it's currently running the idle thread,
1758                                  * don't steal it.
1759                                  */
1760                                 if ((ocp->cpu_disp_flags &
1761                                     CPU_DISP_DONTSTEAL) &&
1762                                     ocp->cpu_disp->disp_nrunnable == 1)
1763                                         continue;
1764
1765                                 pri = ocp->cpu_disp->disp_max_unbound_pri;
1766                                 if (pri > maxpri) {
1767                                         /*
1768                                          * Don't steal threads that we attempted
1769                                          * to steal recently until they're ready
1770                                          * to be stolen again.
1771                                          */
1772                                         stealtime = ocp->cpu_disp->disp_steal;
1773                                         if (stealtime == 0 ||
1774                                             stealtime - gethrtime() <= 0) {
1775                                                 maxpri = pri;
1776                                                 tcp = ocp;
1777                                         } else {
1778                                                 /*
1779                                                  * Don't update tcp, just set
1780                                                  * the retval to T_DONTSTEAL, so
1781                                                  * that if no acceptable CPUs
1782                                                  * are found the return value
1783                                                  * will be T_DONTSTEAL rather
1784                                                  * then NULL.
1785                                                  */
1786                                                 retval = T_DONTSTEAL;
1787                                         }
1788                                 }
1789                         } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
1790
1791                         /*
1792                          * Iterate to the next leaf lpl in the resource set
1793                          * at this level of locality. If we hit the end of
1794                          * the set, wrap back around to the beginning.
1795                          *
1796                          * Note: This iteration is NULL terminated for a reason
1797                          * see lpl_topo_bootstrap() in lgrp.c for details.
1798                          */
1799                         if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
1800                                 leafidx = 0;
1801                                 lpl_leaf = lpl->lpl_rset[leafidx];
1802                         }
1803                 } while (leafidx != startidx);
1804
1805 next_level:
1806                 /*
1807                  * Expand the search to include farther away CPUs (next
1808                  * locality level). The closer CPUs that have already been
1809                  * checked will be checked again. In doing so, idle CPUs
1810                  * will tend to be more aggresive about stealing from CPUs
1811                  * that are closer (since the closer CPUs will be considered
1812                  * more often).
1813                  * Begin at this level with the CPUs local leaf lpl.
1814                  */
1815                 if ((lpl = lpl->lpl_parent) != NULL) {
1816                         leafidx = startidx = lpl->lpl_id2rset[local_id];
1817                         lpl_leaf = lpl->lpl_rset[leafidx];
1818                 }
1819         } while (!tcp && lpl);
1820
1821         kpreempt_enable();
1822
1823         /*
1824          * If another queue looks good, and there is still nothing on
1825          * the local queue, try to transfer one or more threads
1826          * from it to our queue.
1827          */
1828         if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
1829                 tp = disp_getbest(tcp->cpu_disp);
1830                 if (tp == NULL || tp == T_DONTSTEAL)
1831                         return (tp);
1832                 return (disp_ratify(tp, kpq));
1833         }
1834         return (retval);
1835 }
1836
1837
1838 /*
1839  * disp_fix_unbound_pri()
1840  *      Determines the maximum priority of unbound threads on the queue.
1841  *      The priority is kept for the queue, but is only increased, never
1842  *      reduced unless some CPU is looking for something on that queue.
1843  *
1844  *      The priority argument is the known upper limit.
1845  *
1846  *      Perhaps this should be kept accurately, but that probably means
1847  *      separate bitmaps for bound and unbound threads.  Since only idled
1848  *      CPUs will have to do this recalculation, it seems better this way.
1849  */
1850 static void
1851 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
1852 {
1853         kthread_t       *tp;
1854         dispq_t         *dq;
1855         ulong_t         *dqactmap = dp->disp_qactmap;
1856         ulong_t         mapword;
1857         int             wx;
1858
1859         ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
1860
1861         ASSERT(pri >= 0);                       /* checked by caller */
1862
1863         /*
1864          * Start the search at the next lowest priority below the supplied
1865          * priority.  This depends on the bitmap implementation.
1866          */
1867         do {
1868                 wx = pri >> BT_ULSHIFT;         /* index of word in map */
1869
1870                 /*
1871                  * Form mask for all lower priorities in the word.
1872                  */
1873                 mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
1874
1875                 /*
1876                  * Get next lower active priority.
1877                  */
1878                 if (mapword != 0) {
1879                         pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
1880                 } else if (wx > 0) {
1881                         pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
1882                         if (pri < 0)
1883                                 break;
1884                 } else {
1885                         pri = -1;
1886                         break;
1887                 }
1888
1889                 /*
1890                  * Search the queue for unbound, runnable threads.
1891                  */
1892                 dq = &dp->disp_q[pri];
1893                 tp = dq->dq_first;
1894
1895                 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
1896                         tp = tp->t_link;
1897                 }
1898
1899                 /*
1900                  * If a thread was found, set the priority and return.
1901                  */
1902         } while (tp == NULL);
1903
1904         /*
1905          * pri holds the maximum unbound thread priority or -1.
1906          */
1907         if (dp->disp_max_unbound_pri != pri)
1908                 dp->disp_max_unbound_pri = pri;
1909 }
1910
1911 /*
1912  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
1913  *      check if the CPU to which is was previously bound should have
1914  *      its disp_max_unbound_pri increased.
1915  */
1916 void
1917 disp_adjust_unbound_pri(kthread_t *tp)
1918 {
1919         disp_t *dp;
1920         pri_t tpri;
1921
1922         ASSERT(THREAD_LOCK_HELD(tp));
1923
1924         /*
1925          * Don't do anything if the thread is not bound, or
1926          * currently not runnable.
1927          */
1928         if (tp->t_bound_cpu == NULL ||
1929             tp->t_state != TS_RUN)
1930                 return;
1931
1932         tpri = DISP_PRIO(tp);
1933         dp = tp->t_bound_cpu->cpu_disp;
1934         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1935         if (tpri > dp->disp_max_unbound_pri)
1936                 dp->disp_max_unbound_pri = tpri;
1937 }
1938
1939 /*
1940  * disp_getbest()
1941  *   De-queue the highest priority unbound runnable thread.
1942  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
1943  *   Returns NULL if nothing found.
1944  *   Returns T_DONTSTEAL if the thread was not stealable.
1945  *   so that the caller will try again later.
1946  *
1947  *   Passed a pointer to a dispatch queue not associated with this CPU, and
1948  *   its type.
1949  */
1950 static kthread_t *
1951 disp_getbest(disp_t *dp)
1952 {
1953         kthread_t       *tp;
1954         dispq_t         *dq;
1955         pri_t           pri;
1956         cpu_t           *cp, *tcp;
1957         boolean_t       allbound;
1958
1959         disp_lock_enter(&dp->disp_lock);
1960
1961         /*
1962          * If there is nothing to run, or the CPU is in the middle of a
1963          * context switch of the only thread, return NULL.
1964          */
1965         tcp = dp->disp_cpu;
1966         cp = CPU;
1967         pri = dp->disp_max_unbound_pri;
1968         if (pri == -1 ||
1969             (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
1970             tcp->cpu_disp->disp_nrunnable == 1)) {
1971                 disp_lock_exit_nopreempt(&dp->disp_lock);
1972                 return (NULL);
1973         }
1974
1975         dq = &dp->disp_q[pri];
1976
1977
1978         /*
1979          * Assume that all threads are bound on this queue, and change it
1980          * later when we find out that it is not the case.
1981          */
1982         allbound = B_TRUE;
1983         for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
1984                 hrtime_t now, nosteal, rqtime;
1985
1986                 /*
1987                  * Skip over bound threads which could be here even
1988                  * though disp_max_unbound_pri indicated this level.
1989                  */
1990                 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1991                         continue;
1992
1993                 /*
1994                  * We've got some unbound threads on this queue, so turn
1995                  * the allbound flag off now.
1996                  */
1997                 allbound = B_FALSE;
1998
1999                 /*
2000                  * The thread is a candidate for stealing from its run queue. We
2001                  * don't want to steal threads that became runnable just a
2002                  * moment ago. This improves CPU affinity for threads that get
2003                  * preempted for short periods of time and go back on the run
2004                  * queue.
2005                  *
2006                  * We want to let it stay on its run queue if it was only placed
2007                  * there recently and it was running on the same CPU before that
2008                  * to preserve its cache investment. For the thread to remain on
2009                  * its run queue, ALL of the following conditions must be
2010                  * satisfied:
2011                  *
2012                  * - the disp queue should not be the kernel preemption queue
2013                  * - delayed idle stealing should not be disabled
2014                  * - nosteal_nsec should be non-zero
2015                  * - it should run with user priority
2016                  * - it should be on the run queue of the CPU where it was
2017                  *   running before being placed on the run queue
2018                  * - it should be the only thread on the run queue (to prevent
2019                  *   extra scheduling latency for other threads)
2020                  * - it should sit on the run queue for less than per-chip
2021                  *   nosteal interval or global nosteal interval
2022                  * - in case of CPUs with shared cache it should sit in a run
2023                  *   queue of a CPU from a different chip
2024                  *
2025                  * The checks are arranged so that the ones that are faster are
2026                  * placed earlier.
2027                  */
2028                 if (tcp == NULL ||
2029                     pri >= minclsyspri ||
2030                     tp->t_cpu != tcp)
2031                         break;
2032
2033                 /*
2034                  * Steal immediately if, due to CMT processor architecture
2035                  * migraiton between cp and tcp would incur no performance
2036                  * penalty.
2037                  */
2038                 if (pg_cmt_can_migrate(cp, tcp))
2039                         break;
2040
2041                 nosteal = nosteal_nsec;
2042                 if (nosteal == 0)
2043                         break;
2044
2045                 /*
2046                  * Calculate time spent sitting on run queue
2047                  */
2048                 now = gethrtime_unscaled();
2049                 rqtime = now - tp->t_waitrq;
2050                 scalehrtime(&rqtime);
2051
2052                 /*
2053                  * Steal immediately if the time spent on this run queue is more
2054                  * than allowed nosteal delay.
2055                  *
2056                  * Negative rqtime check is needed here to avoid infinite
2057                  * stealing delays caused by unlikely but not impossible
2058                  * drifts between CPU times on different CPUs.
2059                  */
2060                 if (rqtime > nosteal || rqtime < 0)
2061                         break;
2062
2063                 DTRACE_PROBE4(nosteal, kthread_t *, tp,
2064                     cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2065                 scalehrtime(&now);
2066                 /*
2067                  * Calculate when this thread becomes stealable
2068                  */
2069                 now += (nosteal - rqtime);
2070
2071                 /*
2072                  * Calculate time when some thread becomes stealable
2073                  */
2074                 if (now < dp->disp_steal)
2075                         dp->disp_steal = now;
2076         }
2077
2078         /*
2079          * If there were no unbound threads on this queue, find the queue
2080          * where they are and then return later. The value of
2081          * disp_max_unbound_pri is not always accurate because it isn't
2082          * reduced until another idle CPU looks for work.
2083          */
2084         if (allbound)
2085                 disp_fix_unbound_pri(dp, pri);
2086
2087         /*
2088          * If we reached the end of the queue and found no unbound threads
2089          * then return NULL so that other CPUs will be considered.  If there
2090          * are unbound threads but they cannot yet be stolen, then
2091          * return T_DONTSTEAL and try again later.
2092          */
2093         if (tp == NULL) {
2094                 disp_lock_exit_nopreempt(&dp->disp_lock);
2095                 return (allbound ? NULL : T_DONTSTEAL);
2096         }
2097
2098         /*
2099          * Found a runnable, unbound thread, so remove it from queue.
2100          * dispdeq() requires that we have the thread locked, and we do,
2101          * by virtue of holding the dispatch queue lock.  dispdeq() will
2102          * put the thread in transition state, thereby dropping the dispq
2103          * lock.
2104          */
2105
2106 #ifdef DEBUG
2107         {
2108                 int     thread_was_on_queue;
2109
2110                 thread_was_on_queue = dispdeq(tp);      /* drops disp_lock */
2111                 ASSERT(thread_was_on_queue);
2112         }
2113
2114 #else /* DEBUG */
2115         (void) dispdeq(tp);                     /* drops disp_lock */
2116 #endif /* DEBUG */
2117
2118         /*
2119          * Reset the disp_queue steal time - we do not know what is the smallest
2120          * value across the queue is.
2121          */
2122         dp->disp_steal = 0;
2123
2124         /*
2125          * Setup thread to run on the current CPU.
2126          */
2127         tp->t_disp_queue = cp->cpu_disp;
2128
2129         cp->cpu_dispthread = tp;                /* protected by spl only */
2130         cp->cpu_dispatch_pri = pri;
2131
2132         /*
2133          * There can be a memory synchronization race between disp_getbest()
2134          * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2135          * to preempt the current thread to run the enqueued thread while
2136          * disp_getbest() and disp_ratify() are changing the current thread
2137          * to the stolen thread. This may lead to a situation where
2138          * cpu_resched() tries to preempt the wrong thread and the
2139          * stolen thread continues to run on the CPU which has been tagged
2140          * for preemption.
2141          * Later the clock thread gets enqueued but doesn't get to run on the
2142          * CPU causing the system to hang.
2143          *
2144          * To avoid this, grabbing and dropping the disp_lock (which does
2145          * a memory barrier) is needed to synchronize the execution of
2146          * cpu_resched() with disp_getbest() and disp_ratify() and
2147          * synchronize the memory read and written by cpu_resched(),
2148          * disp_getbest(), and disp_ratify() with each other.
2149          *  (see CR#6482861 for more details).
2150          */
2151         disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2152         disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2153
2154         ASSERT(pri == DISP_PRIO(tp));
2155
2156         DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2157
2158         thread_onproc(tp, cp);                  /* set t_state to TS_ONPROC */
2159
2160         /*
2161          * Return with spl high so that swtch() won't need to raise it.
2162          * The disp_lock was dropped by dispdeq().
2163          */
2164
2165         return (tp);
2166 }
2167
2168 /*
2169  * disp_bound_common() - common routine for higher level functions
2170  *      that check for bound threads under certain conditions.
2171  *      If 'threadlistsafe' is set then there is no need to acquire
2172  *      pidlock to stop the thread list from changing (eg, if
2173  *      disp_bound_* is called with cpus paused).
2174  */
2175 static int
2176 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2177 {
2178         int             found = 0;
2179         kthread_t       *tp;
2180
2181         ASSERT(flag);
2182
2183         if (!threadlistsafe)
2184                 mutex_enter(&pidlock);
2185         tp = curthread;         /* faster than allthreads */
2186         do {
2187                 if (tp->t_state != TS_FREE) {
2188                         /*
2189                          * If an interrupt thread is busy, but the
2190                          * caller doesn't care (i.e. BOUND_INTR is off),
2191                          * then just ignore it and continue through.
2192                          */
2193                         if ((tp->t_flag & T_INTR_THREAD) &&
2194                             !(flag & BOUND_INTR))
2195                                 continue;
2196
2197                         /*
2198                          * Skip the idle thread for the CPU
2199                          * we're about to set offline.
2200                          */
2201                         if (tp == cp->cpu_idle_thread)
2202                                 continue;
2203
2204                         /*
2205                          * Skip the pause thread for the CPU
2206                          * we're about to set offline.
2207                          */
2208                         if (tp == cp->cpu_pause_thread)
2209                                 continue;
2210
2211                         if ((flag & BOUND_CPU) &&
2212                             (tp->t_bound_cpu == cp ||
2213                             tp->t_bind_cpu == cp->cpu_id ||
2214                             tp->t_weakbound_cpu == cp)) {
2215                                 found = 1;
2216                                 break;
2217                         }
2218
2219                         if ((flag & BOUND_PARTITION) &&
2220                             (tp->t_cpupart == cp->cpu_part)) {
2221                                 found = 1;
2222                                 break;
2223                         }
2224                 }
2225         } while ((tp = tp->t_next) != curthread && found == 0);
2226         if (!threadlistsafe)
2227                 mutex_exit(&pidlock);
2228         return (found);
2229 }
2230
2231 /*
2232  * disp_bound_threads - return nonzero if threads are bound to the processor.
2233  *      Called infrequently.  Keep this simple.
2234  *      Includes threads that are asleep or stopped but not onproc.
2235  */
2236 int
2237 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2238 {
2239         return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2240 }
2241
2242 /*
2243  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2244  * to the given processor, including interrupt threads.
2245  */
2246 int
2247 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2248 {
2249         return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2250 }
2251
2252 /*
2253  * disp_bound_partition - return nonzero if threads are bound to the same
2254  * partition as the processor.
2255  *      Called infrequently.  Keep this simple.
2256  *      Includes threads that are asleep or stopped but not onproc.
2257  */
2258 int
2259 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2260 {
2261         return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2262 }
2263
2264 /*
2265  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2266  * threads to other CPUs.
2267  */
2268 void
2269 disp_cpu_inactive(cpu_t *cp)
2270 {
2271         kthread_t       *tp;
2272         disp_t          *dp = cp->cpu_disp;
2273         dispq_t         *dq;
2274         pri_t           pri;
2275         int             wasonq;
2276
2277         disp_lock_enter(&dp->disp_lock);
2278         while ((pri = dp->disp_max_unbound_pri) != -1) {
2279                 dq = &dp->disp_q[pri];
2280                 tp = dq->dq_first;
2281
2282                 /*
2283                  * Skip over bound threads.
2284                  */
2285                 while (tp != NULL && tp->t_bound_cpu != NULL) {
2286                         tp = tp->t_link;
2287                 }
2288
2289                 if (tp == NULL) {
2290                         /* disp_max_unbound_pri must be inaccurate, so fix it */
2291                         disp_fix_unbound_pri(dp, pri);
2292                         continue;
2293                 }
2294
2295                 wasonq = dispdeq(tp);           /* drops disp_lock */
2296                 ASSERT(wasonq);
2297                 ASSERT(tp->t_weakbound_cpu == NULL);
2298
2299                 setbackdq(tp);
2300                 /*
2301                  * Called from cpu_offline:
2302                  *
2303                  * cp has already been removed from the list of active cpus
2304                  * and tp->t_cpu has been changed so there is no risk of
2305                  * tp ending up back on cp.
2306                  *
2307                  * Called from cpupart_move_cpu:
2308                  *
2309                  * The cpu has moved to a new cpupart.  Any threads that
2310                  * were on it's dispatch queues before the move remain
2311                  * in the old partition and can't run in the new partition.
2312                  */
2313                 ASSERT(tp->t_cpu != cp);
2314                 thread_unlock(tp);
2315
2316                 disp_lock_enter(&dp->disp_lock);
2317         }
2318         disp_lock_exit(&dp->disp_lock);
2319 }
2320
2321 /*
2322  * disp_lowpri_cpu - find CPU running the lowest priority thread.
2323  *      The hint passed in is used as a starting point so we don't favor
2324  *      CPU 0 or any other CPU.  The caller should pass in the most recently
2325  *      used CPU for the thread.
2326  *
2327  *      The lgroup and priority are used to determine the best CPU to run on
2328  *      in a NUMA machine.  The lgroup specifies which CPUs are closest while
2329  *      the thread priority will indicate whether the thread will actually run
2330  *      there.  To pick the best CPU, the CPUs inside and outside of the given
2331  *      lgroup which are running the lowest priority threads are found.  The
2332  *      remote CPU is chosen only if the thread will not run locally on a CPU
2333  *      within the lgroup, but will run on the remote CPU. If the thread
2334  *      cannot immediately run on any CPU, the best local CPU will be chosen.
2335  *
2336  *      The lpl specified also identifies the cpu partition from which
2337  *      disp_lowpri_cpu should select a CPU.
2338  *
2339  *      curcpu is used to indicate that disp_lowpri_cpu is being called on
2340  *      behalf of the current thread. (curthread is looking for a new cpu)
2341  *      In this case, cpu_dispatch_pri for this thread's cpu should be
2342  *      ignored.
2343  *
2344  *      If a cpu is the target of an offline request then try to avoid it.
2345  *
2346  *      This function must be called at either high SPL, or with preemption
2347  *      disabled, so that the "hint" CPU cannot be removed from the online
2348  *      CPU list while we are traversing it.
2349  */
2350 cpu_t *
2351 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2352 {
2353         cpu_t   *bestcpu;
2354         cpu_t   *besthomecpu;
2355         cpu_t   *cp, *cpstart;
2356
2357         pri_t   bestpri;
2358         pri_t   cpupri;
2359
2360         klgrpset_t      done;
2361         klgrpset_t      cur_set;
2362
2363         lpl_t           *lpl_iter, *lpl_leaf;
2364         int             i;
2365
2366         /*
2367          * Scan for a CPU currently running the lowest priority thread.
2368          * Cannot get cpu_lock here because it is adaptive.
2369          * We do not require lock on CPU list.
2370          */
2371         ASSERT(hint != NULL);
2372         ASSERT(lpl != NULL);
2373         ASSERT(lpl->lpl_ncpu > 0);
2374
2375         /*
2376          * First examine local CPUs. Note that it's possible the hint CPU
2377          * passed in is remote to the specified home lgroup. If our priority
2378          * isn't sufficient enough such that we can run immediately at home,
2379          * then examine CPUs remote to our home lgroup.
2380          * We would like to give preference to CPUs closest to "home".
2381          * If we can't find a CPU where we'll run at a given level
2382          * of locality, we expand our search to include the next level.
2383          */
2384         bestcpu = besthomecpu = NULL;
2385         klgrpset_clear(done);
2386         /* start with lpl we were passed */
2387
2388         lpl_iter = lpl;
2389
2390         do {
2391
2392                 bestpri = SHRT_MAX;
2393                 klgrpset_clear(cur_set);
2394
2395                 for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2396                         lpl_leaf = lpl_iter->lpl_rset[i];
2397                         if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2398                                 continue;
2399
2400                         klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2401
2402                         if (hint->cpu_lpl == lpl_leaf)
2403                                 cp = cpstart = hint;
2404                         else
2405                                 cp = cpstart = lpl_leaf->lpl_cpus;
2406
2407                         do {
2408                                 if (cp == curcpu)
2409                                         cpupri = -1;
2410                                 else if (cp == cpu_inmotion)
2411                                         cpupri = SHRT_MAX;
2412                                 else
2413                                         cpupri = cp->cpu_dispatch_pri;
2414                                 if (cp->cpu_disp->disp_maxrunpri > cpupri)
2415                                         cpupri = cp->cpu_disp->disp_maxrunpri;
2416                                 if (cp->cpu_chosen_level > cpupri)
2417                                         cpupri = cp->cpu_chosen_level;
2418                                 if (cpupri < bestpri) {
2419                                         if (CPU_IDLING(cpupri)) {
2420                                                 ASSERT((cp->cpu_flags &
2421                                                     CPU_QUIESCED) == 0);
2422                                                 return (cp);
2423                                         }
2424                                         bestcpu = cp;
2425                                         bestpri = cpupri;
2426                                 }
2427                         } while ((cp = cp->cpu_next_lpl) != cpstart);
2428                 }
2429
2430                 if (bestcpu && (tpri > bestpri)) {
2431                         ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2432                         return (bestcpu);
2433                 }
2434                 if (besthomecpu == NULL)
2435                         besthomecpu = bestcpu;
2436                 /*
2437                  * Add the lgrps we just considered to the "done" set
2438                  */
2439                 klgrpset_or(done, cur_set);
2440
2441         } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2442
2443         /*
2444          * The specified priority isn't high enough to run immediately
2445          * anywhere, so just return the best CPU from the home lgroup.
2446          */
2447         ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2448         return (besthomecpu);
2449 }
2450
2451 /*
2452  * This routine provides the generic idle cpu function for all processors;
2453  * its body is empty. If a processor has specific code to execute when idle
2454  * (say, to stop the pipeline and save power), that routine should be
2455  * defined in the processor-specific code (module_xx.c) and the global
2456  * variable idle_cpu set to that function.
2457  */
2458 static void
2459 generic_idle_cpu(void)
2460 {
2461 }
2462
2463 /*ARGSUSED*/
2464 static void
2465 generic_enq_thread(cpu_t *cpu, int bound)
2466 {       /* intentionally empty: cpu and bound are both ignored */
2467 }