5285 pass in cpu_pause_func via pause_cpus
[illumos-gate.git] / usr / src / uts / common / disp / disp.c
blob0c2c0b49935db248c08d1118dd62e2ad9e0ac47d
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/sysmacros.h>
33 #include <sys/signal.h>
34 #include <sys/user.h>
35 #include <sys/systm.h>
36 #include <sys/sysinfo.h>
37 #include <sys/var.h>
38 #include <sys/errno.h>
39 #include <sys/cmn_err.h>
40 #include <sys/debug.h>
41 #include <sys/inline.h>
42 #include <sys/disp.h>
43 #include <sys/class.h>
44 #include <sys/bitmap.h>
45 #include <sys/kmem.h>
46 #include <sys/cpuvar.h>
47 #include <sys/vtrace.h>
48 #include <sys/tnf.h>
49 #include <sys/cpupart.h>
50 #include <sys/lgrp.h>
51 #include <sys/pg.h>
52 #include <sys/cmt.h>
53 #include <sys/bitset.h>
54 #include <sys/schedctl.h>
55 #include <sys/atomic.h>
56 #include <sys/dtrace.h>
57 #include <sys/sdt.h>
58 #include <sys/archsystm.h>
60 #include <vm/as.h>
62 #define BOUND_CPU 0x1
63 #define BOUND_PARTITION 0x2
64 #define BOUND_INTR 0x4
66 /* Dispatch queue allocation structure and functions */
/*
 * Scratch record used while replacing one dispatch queue's arrays.
 * disp_dq_alloc() fills in the "new" members and dp, disp_dq_assign()
 * saves the "old" members before switching dp over to the new arrays,
 * and disp_dq_free() releases whatever the "old" members point at.
 */
struct disp_queue_info {
	disp_t	*dp;		/* dispatch queue being resized */
	dispq_t *olddispq;	/* previous disp_q array, to be freed */
	dispq_t *newdispq;	/* freshly allocated disp_q array */
	ulong_t	*olddqactmap;	/* previous active-queue bitmap, to be freed */
	ulong_t	*newdqactmap;	/* freshly allocated active-queue bitmap */
	int	oldnglobpris;	/* priority count the old arrays were sized for */
};
75 static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
76 disp_t *dp);
77 static void disp_dq_assign(struct disp_queue_info *dptr, int numpris);
78 static void disp_dq_free(struct disp_queue_info *dptr);
80 /* platform-specific routine to call when processor is idle */
81 static void generic_idle_cpu();
82 void (*idle_cpu)() = generic_idle_cpu;
84 /* routines invoked when a CPU enters/exits the idle loop */
85 static void idle_enter();
86 static void idle_exit();
88 /* platform-specific routine to call when thread is enqueued */
89 static void generic_enq_thread(cpu_t *, int);
90 void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
92 pri_t kpreemptpri; /* priority where kernel preemption applies */
93 pri_t upreemptpri = 0; /* priority where normal preemption applies */
94 pri_t intr_pri; /* interrupt thread priority base level */
96 #define KPQPRI -1 /* pri where cpu affinity is dropped for kpq */
97 pri_t kpqpri = KPQPRI; /* can be set in /etc/system */
98 disp_t cpu0_disp; /* boot CPU's dispatch queue */
99 disp_lock_t swapped_lock; /* lock swapped threads and swap queue */
100 int nswapped; /* total number of swapped threads */
101 void disp_swapped_enq(kthread_t *tp);
102 static void disp_swapped_setrun(kthread_t *tp);
103 static void cpu_resched(cpu_t *cp, pri_t tpri);
106 * If this is set, only interrupt threads will cause kernel preemptions.
107 * This is done by changing the value of kpreemptpri. kpreemptpri
108 * will either be the max sysclass pri + 1 or the min interrupt pri.
110 int only_intr_kpreempt;
112 extern void set_idle_cpu(int cpun);
113 extern void unset_idle_cpu(int cpun);
114 static void setkpdq(kthread_t *tp, int borf);
115 #define SETKP_BACK 0
116 #define SETKP_FRONT 1
118 * Parameter that determines how recently a thread must have run
119 * on the CPU to be considered loosely-bound to that CPU to reduce
120 * cold cache effects. The interval is in clock ticks.
122 #define RECHOOSE_INTERVAL 3
123 int rechoose_interval = RECHOOSE_INTERVAL;
126 * Parameter that determines how long (in nanoseconds) a thread must
127 * be sitting on a run queue before it can be stolen by another CPU
128 * to reduce migrations. The interval is in nanoseconds.
130 * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval()
131 * to an appropriate value. nosteal_nsec is set to NOSTEAL_UNINITIALIZED
132 * here indicating it is uninitialized.
133 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
136 #define NOSTEAL_UNINITIALIZED (-1)
137 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
138 extern void cmp_set_nosteal_interval(void);
140 id_t defaultcid; /* system "default" class; see dispadmin(1M) */
142 disp_lock_t transition_lock; /* lock on transitioning threads */
143 disp_lock_t stop_lock; /* lock on stopped threads */
145 static void cpu_dispqalloc(int numpris);
148 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
149 * a thread because it was sitting on its run queue for a very short
150 * period of time.
152 #define T_DONTSTEAL (kthread_t *)(-1) /* returned by disp_getwork/getbest */
154 static kthread_t *disp_getwork(cpu_t *to);
155 static kthread_t *disp_getbest(disp_t *from);
156 static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq);
158 void swtch_to(kthread_t *);
161 * dispatcher and scheduler initialization
165 * disp_setup - Common code to calculate and allocate dispatcher
166 * variables and structures based on the maximum priority.
168 static void
169 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
171 pri_t newnglobpris;
173 ASSERT(MUTEX_HELD(&cpu_lock));
175 newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
177 if (newnglobpris > oldnglobpris) {
179 * Allocate new kp queues for each CPU partition.
181 cpupart_kpqalloc(newnglobpris);
184 * Allocate new dispatch queues for each CPU.
186 cpu_dispqalloc(newnglobpris);
189 * compute new interrupt thread base priority
191 intr_pri = maxglobpri;
192 if (only_intr_kpreempt) {
193 kpreemptpri = intr_pri + 1;
194 if (kpqpri == KPQPRI)
195 kpqpri = kpreemptpri;
197 v.v_nglobpris = newnglobpris;
202 * dispinit - Called to initialize all loaded classes and the
203 * dispatcher framework.
205 void
206 dispinit(void)
208 id_t cid;
209 pri_t maxglobpri;
210 pri_t cl_maxglobpri;
212 maxglobpri = -1;
215 * Initialize transition lock, which will always be set.
217 DISP_LOCK_INIT(&transition_lock);
218 disp_lock_enter_high(&transition_lock);
219 DISP_LOCK_INIT(&stop_lock);
221 mutex_enter(&cpu_lock);
222 CPU->cpu_disp->disp_maxrunpri = -1;
223 CPU->cpu_disp->disp_max_unbound_pri = -1;
226 * Initialize the default CPU partition.
228 cpupart_initialize_default();
230 * Call the class specific initialization functions for
231 * all pre-installed schedulers.
233 * We pass the size of a class specific parameter
234 * buffer to each of the initialization functions
235 * to try to catch problems with backward compatibility
236 * of class modules.
238 * For example a new class module running on an old system
239 * which didn't provide sufficiently large parameter buffers
240 * would be bad news. Class initialization modules can check for
241 * this and take action if they detect a problem.
244 for (cid = 0; cid < nclass; cid++) {
245 sclass_t *sc;
247 sc = &sclass[cid];
248 if (SCHED_INSTALLED(sc)) {
249 cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
250 &sc->cl_funcs);
251 if (cl_maxglobpri > maxglobpri)
252 maxglobpri = cl_maxglobpri;
255 kpreemptpri = (pri_t)v.v_maxsyspri + 1;
256 if (kpqpri == KPQPRI)
257 kpqpri = kpreemptpri;
259 ASSERT(maxglobpri >= 0);
260 disp_setup(maxglobpri, 0);
262 mutex_exit(&cpu_lock);
265 * Platform specific sticky scheduler setup.
267 if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
268 cmp_set_nosteal_interval();
271 * Get the default class ID; this may be later modified via
272 * dispadmin(1M). This will load the class (normally TS) and that will
273 * call disp_add(), which is why we had to drop cpu_lock first.
275 if (getcid(defaultclass, &defaultcid) != 0) {
276 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
277 defaultclass);
282 * disp_add - Called with class pointer to initialize the dispatcher
283 * for a newly loaded class.
285 void
286 disp_add(sclass_t *clp)
288 pri_t maxglobpri;
289 pri_t cl_maxglobpri;
291 mutex_enter(&cpu_lock);
293 * Initialize the scheduler class.
295 maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
296 cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
297 if (cl_maxglobpri > maxglobpri)
298 maxglobpri = cl_maxglobpri;
301 * Save old queue information. Since we're initializing a
302 * new scheduling class which has just been loaded, then
303 * the size of the dispq may have changed. We need to handle
304 * that here.
306 disp_setup(maxglobpri, v.v_nglobpris);
308 mutex_exit(&cpu_lock);
313 * For each CPU, allocate new dispatch queues
314 * with the stated number of priorities.
316 static void
317 cpu_dispqalloc(int numpris)
319 cpu_t *cpup;
320 struct disp_queue_info *disp_mem;
321 int i, num;
323 ASSERT(MUTEX_HELD(&cpu_lock));
325 disp_mem = kmem_zalloc(NCPU *
326 sizeof (struct disp_queue_info), KM_SLEEP);
329 * This routine must allocate all of the memory before stopping
330 * the cpus because it must not sleep in kmem_alloc while the
331 * CPUs are stopped. Locks they hold will not be freed until they
332 * are restarted.
334 i = 0;
335 cpup = cpu_list;
336 do {
337 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
338 i++;
339 cpup = cpup->cpu_next;
340 } while (cpup != cpu_list);
341 num = i;
343 pause_cpus(NULL, NULL);
344 for (i = 0; i < num; i++)
345 disp_dq_assign(&disp_mem[i], numpris);
346 start_cpus();
349 * I must free all of the memory after starting the cpus because
350 * I can not risk sleeping in kmem_free while the cpus are stopped.
352 for (i = 0; i < num; i++)
353 disp_dq_free(&disp_mem[i]);
355 kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
358 static void
359 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
361 dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
362 dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
363 sizeof (long), KM_SLEEP);
364 dptr->dp = dp;
367 static void
368 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
370 disp_t *dp;
372 dp = dptr->dp;
373 dptr->olddispq = dp->disp_q;
374 dptr->olddqactmap = dp->disp_qactmap;
375 dptr->oldnglobpris = dp->disp_npri;
377 ASSERT(dptr->oldnglobpris < numpris);
379 if (dptr->olddispq != NULL) {
381 * Use kcopy because bcopy is platform-specific
382 * and could block while we might have paused the cpus.
384 (void) kcopy(dptr->olddispq, dptr->newdispq,
385 dptr->oldnglobpris * sizeof (dispq_t));
386 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
387 ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
388 sizeof (long));
390 dp->disp_q = dptr->newdispq;
391 dp->disp_qactmap = dptr->newdqactmap;
392 dp->disp_q_limit = &dptr->newdispq[numpris];
393 dp->disp_npri = numpris;
396 static void
397 disp_dq_free(struct disp_queue_info *dptr)
399 if (dptr->olddispq != NULL)
400 kmem_free(dptr->olddispq,
401 dptr->oldnglobpris * sizeof (dispq_t));
402 if (dptr->olddqactmap != NULL)
403 kmem_free(dptr->olddqactmap,
404 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
408 * For a newly created CPU, initialize the dispatch queue.
409 * This is called before the CPU is known through cpu[] or on any lists.
411 void
412 disp_cpu_init(cpu_t *cp)
414 disp_t *dp;
415 dispq_t *newdispq;
416 ulong_t *newdqactmap;
418 ASSERT(MUTEX_HELD(&cpu_lock)); /* protect dispatcher queue sizes */
420 if (cp == cpu0_disp.disp_cpu)
421 dp = &cpu0_disp;
422 else
423 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
424 bzero(dp, sizeof (disp_t));
425 cp->cpu_disp = dp;
426 dp->disp_cpu = cp;
427 dp->disp_maxrunpri = -1;
428 dp->disp_max_unbound_pri = -1;
429 DISP_LOCK_INIT(&cp->cpu_thread_lock);
431 * Allocate memory for the dispatcher queue headers
432 * and the active queue bitmap.
434 newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
435 newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
436 sizeof (long), KM_SLEEP);
437 dp->disp_q = newdispq;
438 dp->disp_qactmap = newdqactmap;
439 dp->disp_q_limit = &newdispq[v.v_nglobpris];
440 dp->disp_npri = v.v_nglobpris;
443 void
444 disp_cpu_fini(cpu_t *cp)
446 ASSERT(MUTEX_HELD(&cpu_lock));
448 disp_kp_free(cp->cpu_disp);
449 if (cp->cpu_disp != &cpu0_disp)
450 kmem_free(cp->cpu_disp, sizeof (disp_t));
454 * Allocate new, larger kpreempt dispatch queue to replace the old one.
456 void
457 disp_kp_alloc(disp_t *dq, pri_t npri)
459 struct disp_queue_info mem_info;
461 if (npri > dq->disp_npri) {
463 * Allocate memory for the new array.
465 disp_dq_alloc(&mem_info, npri, dq);
468 * We need to copy the old structures to the new
469 * and free the old.
471 disp_dq_assign(&mem_info, npri);
472 disp_dq_free(&mem_info);
477 * Free dispatch queue.
478 * Used for the kpreempt queues for a removed CPU partition and
479 * for the per-CPU queues of deleted CPUs.
481 void
482 disp_kp_free(disp_t *dq)
484 struct disp_queue_info mem_info;
486 mem_info.olddispq = dq->disp_q;
487 mem_info.olddqactmap = dq->disp_qactmap;
488 mem_info.oldnglobpris = dq->disp_npri;
489 disp_dq_free(&mem_info);
493 * End dispatcher and scheduler initialization.
497 * See if there's anything to do other than remain idle.
498 * Return non-zero if there is.
500 * This function must be called with high spl, or with
501 * kernel preemption disabled to prevent the partition's
502 * active cpu list from changing while being traversed.
504 * This is essentially a simpler version of disp_getwork()
505 * to be called by CPUs preparing to "halt".
508 disp_anywork(void)
510 cpu_t *cp = CPU;
511 cpu_t *ocp;
512 volatile int *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
514 if (!(cp->cpu_flags & CPU_OFFLINE)) {
515 if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
516 return (1);
518 for (ocp = cp->cpu_next_part; ocp != cp;
519 ocp = ocp->cpu_next_part) {
520 ASSERT(CPU_ACTIVE(ocp));
523 * Something has appeared on the local run queue.
525 if (*local_nrunnable > 0)
526 return (1);
528 * If we encounter another idle CPU that will
529 * soon be trolling around through disp_anywork()
530 * terminate our walk here and let this other CPU
531 * patrol the next part of the list.
533 if (ocp->cpu_dispatch_pri == -1 &&
534 (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
535 return (0);
537 * Work can be taken from another CPU if:
538 * - There is unbound work on the run queue
539 * - That work isn't a thread undergoing a
540 * - context switch on an otherwise empty queue.
541 * - The CPU isn't running the idle loop.
543 if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
544 !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
545 ocp->cpu_disp->disp_nrunnable == 1) &&
546 ocp->cpu_dispatch_pri != -1)
547 return (1);
550 return (0);
554 * Called when CPU enters the idle loop
556 static void
557 idle_enter()
559 cpu_t *cp = CPU;
561 new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
562 CPU_STATS_ADDQ(cp, sys, idlethread, 1);
563 set_idle_cpu(cp->cpu_id); /* arch-dependent hook */
567 * Called when CPU exits the idle loop
569 static void
570 idle_exit()
572 cpu_t *cp = CPU;
574 new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
575 unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */
579 * Idle loop.
581 void
582 idle()
584 struct cpu *cp = CPU; /* pointer to this CPU */
585 kthread_t *t; /* taken thread */
587 idle_enter();
590 * Uniprocessor version of idle loop.
591 * Do this until notified that we're on an actual multiprocessor.
593 while (ncpus == 1) {
594 if (cp->cpu_disp->disp_nrunnable == 0) {
595 (*idle_cpu)();
596 continue;
598 idle_exit();
599 swtch();
601 idle_enter(); /* returned from swtch */
605 * Multiprocessor idle loop.
607 for (;;) {
609 * If CPU is completely quiesced by p_online(2), just wait
610 * here with minimal bus traffic until put online.
612 while (cp->cpu_flags & CPU_QUIESCED)
613 (*idle_cpu)();
615 if (cp->cpu_disp->disp_nrunnable != 0) {
616 idle_exit();
617 swtch();
618 } else {
619 if (cp->cpu_flags & CPU_OFFLINE)
620 continue;
621 if ((t = disp_getwork(cp)) == NULL) {
622 if (cp->cpu_chosen_level != -1) {
623 disp_t *dp = cp->cpu_disp;
624 disp_t *kpq;
626 disp_lock_enter(&dp->disp_lock);
628 * Set kpq under lock to prevent
629 * migration between partitions.
631 kpq = &cp->cpu_part->cp_kp_queue;
632 if (kpq->disp_maxrunpri == -1)
633 cp->cpu_chosen_level = -1;
634 disp_lock_exit(&dp->disp_lock);
636 (*idle_cpu)();
637 continue;
640 * If there was a thread but we couldn't steal
641 * it, then keep trying.
643 if (t == T_DONTSTEAL)
644 continue;
645 idle_exit();
646 swtch_to(t);
648 idle_enter(); /* returned from swtch/swtch_to */
654 * Preempt the currently running thread in favor of the highest
655 * priority thread. The class of the current thread controls
656 * where it goes on the dispatcher queues. If panicking, turn
657 * preemption off.
659 void
660 preempt()
662 kthread_t *t = curthread;
663 klwp_t *lwp = ttolwp(curthread);
665 if (panicstr)
666 return;
668 TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
670 thread_lock(t);
672 if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
674 * this thread has already been chosen to be run on
675 * another CPU. Clear kprunrun on this CPU since we're
676 * already headed for swtch().
678 CPU->cpu_kprunrun = 0;
679 thread_unlock_nopreempt(t);
680 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
681 } else {
682 if (lwp != NULL)
683 lwp->lwp_ru.nivcsw++;
684 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
685 THREAD_TRANSITION(t);
686 CL_PREEMPT(t);
687 DTRACE_SCHED(preempt);
688 thread_unlock_nopreempt(t);
690 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
692 swtch(); /* clears CPU->cpu_runrun via disp() */
696 extern kthread_t *thread_unpin();
699 * disp() - find the highest priority thread for this processor to run, and
700 * set it in TS_ONPROC state so that resume() can be called to run it.
702 static kthread_t *
703 disp()
705 cpu_t *cpup;
706 disp_t *dp;
707 kthread_t *tp;
708 dispq_t *dq;
709 int maxrunword;
710 pri_t pri;
711 disp_t *kpq;
713 TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
715 cpup = CPU;
717 * Find the highest priority loaded, runnable thread.
719 dp = cpup->cpu_disp;
721 reschedule:
723 * If there is more important work on the global queue with a better
724 * priority than the maximum on this CPU, take it now.
726 kpq = &cpup->cpu_part->cp_kp_queue;
727 while ((pri = kpq->disp_maxrunpri) >= 0 &&
728 pri >= dp->disp_maxrunpri &&
729 (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
730 (tp = disp_getbest(kpq)) != NULL) {
731 if (disp_ratify(tp, kpq) != NULL) {
732 TRACE_1(TR_FAC_DISP, TR_DISP_END,
733 "disp_end:tid %p", tp);
734 return (tp);
738 disp_lock_enter(&dp->disp_lock);
739 pri = dp->disp_maxrunpri;
742 * If there is nothing to run, look at what's runnable on other queues.
743 * Choose the idle thread if the CPU is quiesced.
744 * Note that CPUs that have the CPU_OFFLINE flag set can still run
745 * interrupt threads, which will be the only threads on the CPU's own
746 * queue, but cannot run threads from other queues.
748 if (pri == -1) {
749 if (!(cpup->cpu_flags & CPU_OFFLINE)) {
750 disp_lock_exit(&dp->disp_lock);
751 if ((tp = disp_getwork(cpup)) == NULL ||
752 tp == T_DONTSTEAL) {
753 tp = cpup->cpu_idle_thread;
754 (void) splhigh();
755 THREAD_ONPROC(tp, cpup);
756 cpup->cpu_dispthread = tp;
757 cpup->cpu_dispatch_pri = -1;
758 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
759 cpup->cpu_chosen_level = -1;
761 } else {
762 disp_lock_exit_high(&dp->disp_lock);
763 tp = cpup->cpu_idle_thread;
764 THREAD_ONPROC(tp, cpup);
765 cpup->cpu_dispthread = tp;
766 cpup->cpu_dispatch_pri = -1;
767 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
768 cpup->cpu_chosen_level = -1;
770 TRACE_1(TR_FAC_DISP, TR_DISP_END,
771 "disp_end:tid %p", tp);
772 return (tp);
775 dq = &dp->disp_q[pri];
776 tp = dq->dq_first;
778 ASSERT(tp != NULL);
779 ASSERT(tp->t_schedflag & TS_LOAD); /* thread must be swapped in */
781 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
784 * Found it so remove it from queue.
786 dp->disp_nrunnable--;
787 dq->dq_sruncnt--;
788 if ((dq->dq_first = tp->t_link) == NULL) {
789 ulong_t *dqactmap = dp->disp_qactmap;
791 ASSERT(dq->dq_sruncnt == 0);
792 dq->dq_last = NULL;
795 * The queue is empty, so the corresponding bit needs to be
796 * turned off in dqactmap. If nrunnable != 0 just took the
797 * last runnable thread off the
798 * highest queue, so recompute disp_maxrunpri.
800 maxrunword = pri >> BT_ULSHIFT;
801 dqactmap[maxrunword] &= ~BT_BIW(pri);
803 if (dp->disp_nrunnable == 0) {
804 dp->disp_max_unbound_pri = -1;
805 dp->disp_maxrunpri = -1;
806 } else {
807 int ipri;
809 ipri = bt_gethighbit(dqactmap, maxrunword);
810 dp->disp_maxrunpri = ipri;
811 if (ipri < dp->disp_max_unbound_pri)
812 dp->disp_max_unbound_pri = ipri;
814 } else {
815 tp->t_link = NULL;
819 * Set TS_DONT_SWAP flag to prevent another processor from swapping
820 * out this thread before we have a chance to run it.
821 * While running, it is protected against swapping by t_lock.
823 tp->t_schedflag |= TS_DONT_SWAP;
824 cpup->cpu_dispthread = tp; /* protected by spl only */
825 cpup->cpu_dispatch_pri = pri;
826 ASSERT(pri == DISP_PRIO(tp));
827 thread_onproc(tp, cpup); /* set t_state to TS_ONPROC */
828 disp_lock_exit_high(&dp->disp_lock); /* drop run queue lock */
830 ASSERT(tp != NULL);
831 TRACE_1(TR_FAC_DISP, TR_DISP_END,
832 "disp_end:tid %p", tp);
834 if (disp_ratify(tp, kpq) == NULL)
835 goto reschedule;
837 return (tp);
841 * swtch()
842 * Find best runnable thread and run it.
843 * Called with the current thread already switched to a new state,
844 * on a sleep queue, run queue, stopped, and not zombied.
845 * May be called at any spl level less than or equal to LOCK_LEVEL.
846 * Always drops spl to the base level (spl0()).
848 void
849 swtch()
851 kthread_t *t = curthread;
852 kthread_t *next;
853 cpu_t *cp;
855 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
857 if (t->t_flag & T_INTR_THREAD)
858 cpu_intr_swtch_enter(t);
860 if (t->t_intr != NULL) {
862 * We are an interrupt thread. Setup and return
863 * the interrupted thread to be resumed.
865 (void) splhigh(); /* block other scheduler action */
866 cp = CPU; /* now protected against migration */
867 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
868 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
869 CPU_STATS_ADDQ(cp, sys, intrblk, 1);
870 next = thread_unpin();
871 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
872 resume_from_intr(next);
873 } else {
874 #ifdef DEBUG
875 if (t->t_state == TS_ONPROC &&
876 t->t_disp_queue->disp_cpu == CPU &&
877 t->t_preempt == 0) {
878 thread_lock(t);
879 ASSERT(t->t_state != TS_ONPROC ||
880 t->t_disp_queue->disp_cpu != CPU ||
881 t->t_preempt != 0); /* cannot migrate */
882 thread_unlock_nopreempt(t);
884 #endif /* DEBUG */
885 cp = CPU;
886 next = disp(); /* returns with spl high */
887 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
889 /* OK to steal anything left on run queue */
890 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
892 if (next != t) {
893 hrtime_t now;
895 now = gethrtime_unscaled();
896 pg_ev_thread_swtch(cp, now, t, next);
899 * If t was previously in the TS_ONPROC state,
900 * setfrontdq and setbackdq won't have set its t_waitrq.
901 * Since we now finally know that we're switching away
902 * from this thread, set its t_waitrq if it is on a run
903 * queue.
905 if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
906 t->t_waitrq = now;
910 * restore mstate of thread that we are switching to
912 restore_mstate(next);
914 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
915 cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
916 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
918 if (dtrace_vtime_active)
919 dtrace_vtime_switch(next);
921 resume(next);
923 * The TR_RESUME_END and TR_SWTCH_END trace points
924 * appear at the end of resume(), because we may not
925 * return here
927 } else {
928 if (t->t_flag & T_INTR_THREAD)
929 cpu_intr_swtch_exit(t);
931 * Threads that enqueue themselves on a run queue defer
932 * setting t_waitrq. It is then either set in swtch()
933 * when the CPU is actually yielded, or not at all if it
934 * is remaining on the CPU.
935 * There is however a window between where the thread
936 * placed itself on a run queue, and where it selects
937 * itself in disp(), where a third party (eg. clock()
938 * doing tick processing) may have re-enqueued this
939 * thread, setting t_waitrq in the process. We detect
940 * this race by noticing that despite switching to
941 * ourself, our t_waitrq has been set, and should be
942 * cleared.
944 if (t->t_waitrq != 0)
945 t->t_waitrq = 0;
947 pg_ev_thread_remain(cp, t);
949 DTRACE_SCHED(remain__cpu);
950 TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
951 (void) spl0();
957 * swtch_from_zombie()
958 * Special case of swtch(), which allows checks for TS_ZOMB to be
959 * eliminated from normal resume.
960 * Find best runnable thread and run it.
961 * Called with the current thread zombied.
962 * Zombies cannot migrate, so CPU references are safe.
964 void
965 swtch_from_zombie()
967 kthread_t *next;
968 cpu_t *cpu = CPU;
970 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
972 ASSERT(curthread->t_state == TS_ZOMB);
974 next = disp(); /* returns with spl high */
975 ASSERT(CPU_ON_INTR(CPU) == 0); /* not called with PIL > 10 */
976 CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
977 ASSERT(next != curthread);
978 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
980 pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
982 restore_mstate(next);
984 if (dtrace_vtime_active)
985 dtrace_vtime_switch(next);
987 resume_from_zombie(next);
989 * The TR_RESUME_END and TR_SWTCH_END trace points
990 * appear at the end of resume(), because we certainly will not
991 * return here
995 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
998 * search_disp_queues()
999 * Search the given dispatch queues for thread tp.
1000 * Return 1 if tp is found, otherwise return 0.
1002 static int
1003 search_disp_queues(disp_t *dp, kthread_t *tp)
1005 dispq_t *dq;
1006 dispq_t *eq;
1008 disp_lock_enter_high(&dp->disp_lock);
1010 for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1011 kthread_t *rp;
1013 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1015 for (rp = dq->dq_first; rp; rp = rp->t_link)
1016 if (tp == rp) {
1017 disp_lock_exit_high(&dp->disp_lock);
1018 return (1);
1021 disp_lock_exit_high(&dp->disp_lock);
1023 return (0);
1027 * thread_on_queue()
1028 * Search all per-CPU dispatch queues and all partition-wide kpreempt
1029 * queues for thread tp. Return 1 if tp is found, otherwise return 0.
1031 static int
1032 thread_on_queue(kthread_t *tp)
1034 cpu_t *cp;
1035 struct cpupart *part;
1037 ASSERT(getpil() >= DISP_LEVEL);
1040 * Search the per-CPU dispatch queues for tp.
1042 cp = CPU;
1043 do {
1044 if (search_disp_queues(cp->cpu_disp, tp))
1045 return (1);
1046 } while ((cp = cp->cpu_next_onln) != CPU);
1049 * Search the partition-wide kpreempt queues for tp.
1051 part = CPU->cpu_part;
1052 do {
1053 if (search_disp_queues(&part->cp_kp_queue, tp))
1054 return (1);
1055 } while ((part = part->cp_next) != CPU->cpu_part);
1057 return (0);
1060 #else
1062 #define thread_on_queue(tp) 0 /* ASSERT must be !thread_on_queue */
1064 #endif /* DEBUG */
1067 * like swtch(), but switch to a specified thread taken from another CPU.
1068 * called with spl high..
1070 void
1071 swtch_to(kthread_t *next)
1073 cpu_t *cp = CPU;
1074 hrtime_t now;
1076 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1079 * Update context switch statistics.
1081 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1083 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1085 now = gethrtime_unscaled();
1086 pg_ev_thread_swtch(cp, now, curthread, next);
1088 /* OK to steal anything left on run queue */
1089 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1091 /* record last execution time */
1092 cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1095 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1096 * won't have set its t_waitrq. Since we now finally know that we're
1097 * switching away from this thread, set its t_waitrq if it is on a run
1098 * queue.
1100 if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1101 curthread->t_waitrq = now;
1104 /* restore next thread to previously running microstate */
1105 restore_mstate(next);
1107 if (dtrace_vtime_active)
1108 dtrace_vtime_switch(next);
1110 resume(next);
1112 * The TR_RESUME_END and TR_SWTCH_END trace points
1113 * appear at the end of resume(), because we may not
1114 * return here
1118 #define CPU_IDLING(pri) ((pri) == -1)
1120 static void
1121 cpu_resched(cpu_t *cp, pri_t tpri)
1123 int call_poke_cpu = 0;
1124 pri_t cpupri = cp->cpu_dispatch_pri;
1126 if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1127 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1128 "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1129 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1130 cp->cpu_runrun = 1;
1131 aston(cp->cpu_dispthread);
1132 if (tpri < kpreemptpri && cp != CPU)
1133 call_poke_cpu = 1;
1135 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1136 cp->cpu_kprunrun = 1;
1137 if (cp != CPU)
1138 call_poke_cpu = 1;
1143 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1145 membar_enter();
1147 if (call_poke_cpu)
1148 poke_cpu(cp->cpu_id);
1152 * setbackdq() keeps runqs balanced such that the difference in length
1153 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1154 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1155 * must match. When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1156 * try to keep runqs perfectly balanced regardless of the thread priority.
1158 #define RUNQ_MATCH_PRI 16 /* pri below which queue lengths must match */
1159 #define RUNQ_MAX_DIFF 2 /* maximum runq length difference */
1160 #define RUNQ_LEN(cp, pri) ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1163 * Macro that evaluates to true if it is likely that the thread has cache
1164 * warmth. This is based on the amount of time that has elapsed since the
1165 * thread last ran. If that amount of time is less than "rechoose_interval"
1166 * ticks, then we decide that the thread has enough cache warmth to warrant
1167 * some affinity for t->t_cpu.
1169 #define THREAD_HAS_CACHE_WARMTH(thread) \
1170 ((thread == curthread) || \
1171 ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
/*
 * Put the specified thread on the back of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 *
 * The target CPU is chosen here: a bound thread goes to the CPU it is
 * bound to, a high-priority (>= kpqpri) unbound thread is handed to
 * setkpdq(), and any other unbound thread goes through the affinity,
 * CMT and run queue balancing logic below.
 */
void
setbackdq(kthread_t *tp)
{
	dispq_t		*dq;
	disp_t		*dp;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;
	boolean_t	self;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	/* Remember whether the caller is enqueueing itself. */
	self = (tp == curthread);

	/* Either a strong or a weak binding pins the thread to one CPU. */
	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
		bound = 1;
	else
		bound = 0;

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!bound) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_BACK);
			return;
		}

		/*
		 * We'll generally let this thread continue to run where
		 * it last ran...but will consider migration if:
		 * - The thread probably doesn't have much cache warmth.
		 * - The CPU where it last ran is the target of an offline
		 *   request.
		 * - The thread last ran outside its home lgroup.
		 */
		if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
		    (tp->t_cpu == cpu_inmotion)) {
			cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
		} else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
			cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
			    self ? tp->t_cpu : NULL);
		} else {
			cp = tp->t_cpu;
		}

		if (tp->t_cpupart == cp->cpu_part) {
			int	qlen;

			/*
			 * Perform any CMT load balancing
			 */
			cp = cmt_balance(tp, cp);

			/*
			 * Balance across the run queues
			 *
			 * At or above RUNQ_MATCH_PRI (and without
			 * TS_RUNQMATCH) a difference of up to RUNQ_MAX_DIFF
			 * threads is tolerated before moving to the
			 * neighbouring CPU.
			 */
			qlen = RUNQ_LEN(cp, tpri);
			if (tpri >= RUNQ_MATCH_PRI &&
			    !(tp->t_schedflag & TS_RUNQMATCH))
				qlen -= RUNQ_MAX_DIFF;
			if (qlen > 0) {
				cpu_t *newcp;

				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
					newcp = cp->cpu_next_part;
				} else if ((newcp = cp->cpu_next_lpl) == cp) {
					newcp = cp->cpu_next_part;
				}

				if (RUNQ_LEN(newcp, tpri) < qlen) {
					DTRACE_PROBE3(runq__balance,
					    kthread_t *, tp,
					    cpu_t *, cp, cpu_t *, newcp);
					cp = newcp;
				}
			}
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
	}
	/*
	 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp.  If the thread we're placing on
	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement process is actually scheduled in swtch().  In this
	 * situation, curthread is the only thread that could be in the ONPROC
	 * state.
	 */
	if ((!self) && (tp->t_waitrq == 0)) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
	    tpri, cp, tp);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	tp->t_link = NULL;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	/* A newly queued unbound thread clears the queue's steal time. */
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		/* Queue already populated: append after the current tail. */
		ASSERT(dq->dq_first != NULL);
		dq->dq_last->t_link = tp;
		dq->dq_last = tp;
	} else {
		/* First thread at this priority: mark the level active. */
		ASSERT(dq->dq_first == NULL);
		ASSERT(dq->dq_last == NULL);
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}
/*
 * Put the specified thread on the front of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 *
 * CPU selection mirrors setbackdq() except that no CMT or run queue
 * length balancing is performed here.
 */
void
setfrontdq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	/*
	 * If thread is "swapped" or on the swap queue don't
	 * queue it, but wake sched.
	 */
	if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
		disp_swapped_setrun(tp);
		return;
	}

	/* Either a strong or a weak binding pins the thread to one CPU. */
	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
		bound = 1;
	else
		bound = 0;

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!bound) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_FRONT);
			return;
		}
		cp = tp->t_cpu;
		if (tp->t_cpupart == cp->cpu_part) {
			/*
			 * We'll generally let this thread continue to run
			 * where it last ran, but will consider migration if:
			 * - The thread last ran outside its home lgroup.
			 * - The CPU where it last ran is the target of an
			 *   offline request (a thread_nomigrate() on the in
			 *   motion CPU relies on this when forcing a preempt).
			 * - The thread isn't the highest priority thread where
			 *   it last ran, and it is considered not likely to
			 *   have significant cache warmth.
			 */
			if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
			    (cp == cpu_inmotion)) {
				cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
				    (tp == curthread) ? cp : NULL);
			} else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
			    (!THREAD_HAS_CACHE_WARMTH(tp))) {
				cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
				    NULL);
			}
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
	}

	/*
	 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp.  If the thread we're placing on
	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement process is actually scheduled in swtch().  In this
	 * situation, curthread is the only thread that could be in the ONPROC
	 * state.
	 */
	if ((tp != curthread) && (tp->t_waitrq == 0)) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
	tp->t_disp_queue = dp;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	/* A newly queued unbound thread clears the queue's steal time. */
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		/* Queue already populated: link in ahead of the old head. */
		ASSERT(dq->dq_last != NULL);
		tp->t_link = dq->dq_first;
		dq->dq_first = tp;
	} else {
		/* First thread at this priority: mark the level active. */
		ASSERT(dq->dq_last == NULL);
		ASSERT(dq->dq_first == NULL);
		tp->t_link = NULL;
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}
1523 * Put a high-priority unbound thread on the kp queue
1525 static void
1526 setkpdq(kthread_t *tp, int borf)
1528 dispq_t *dq;
1529 disp_t *dp;
1530 cpu_t *cp;
1531 pri_t tpri;
1533 tpri = DISP_PRIO(tp);
1535 dp = &tp->t_cpupart->cp_kp_queue;
1536 disp_lock_enter_high(&dp->disp_lock);
1538 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1540 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1541 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1542 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1543 tp->t_disp_queue = dp;
1544 dp->disp_nrunnable++;
1545 dq = &dp->disp_q[tpri];
1547 if (dq->dq_sruncnt++ != 0) {
1548 if (borf == SETKP_BACK) {
1549 ASSERT(dq->dq_first != NULL);
1550 tp->t_link = NULL;
1551 dq->dq_last->t_link = tp;
1552 dq->dq_last = tp;
1553 } else {
1554 ASSERT(dq->dq_last != NULL);
1555 tp->t_link = dq->dq_first;
1556 dq->dq_first = tp;
1558 } else {
1559 if (borf == SETKP_BACK) {
1560 ASSERT(dq->dq_first == NULL);
1561 ASSERT(dq->dq_last == NULL);
1562 dq->dq_first = dq->dq_last = tp;
1563 } else {
1564 ASSERT(dq->dq_last == NULL);
1565 ASSERT(dq->dq_first == NULL);
1566 tp->t_link = NULL;
1567 dq->dq_first = dq->dq_last = tp;
1569 BT_SET(dp->disp_qactmap, tpri);
1570 if (tpri > dp->disp_max_unbound_pri)
1571 dp->disp_max_unbound_pri = tpri;
1572 if (tpri > dp->disp_maxrunpri) {
1573 dp->disp_maxrunpri = tpri;
1574 membar_enter();
1578 cp = tp->t_cpu;
1579 if (tp->t_cpupart != cp->cpu_part) {
1580 /* migrate to a cpu in the new partition */
1581 cp = tp->t_cpupart->cp_cpulist;
1583 cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1584 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1585 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1587 #ifndef NPROBE
1588 /* Kernel probe */
1589 if (tnf_tracing_active)
1590 tnf_thread_queue(tp, cp, tpri);
1591 #endif /* NPROBE */
1593 if (cp->cpu_chosen_level < tpri)
1594 cp->cpu_chosen_level = tpri;
1595 cpu_resched(cp, tpri);
1596 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1597 (*disp_enq_thread)(cp, 0);
1601 * Remove a thread from the dispatcher queue if it is on it.
1602 * It is not an error if it is not found but we return whether
1603 * or not it was found in case the caller wants to check.
1606 dispdeq(kthread_t *tp)
1608 disp_t *dp;
1609 dispq_t *dq;
1610 kthread_t *rp;
1611 kthread_t *trp;
1612 kthread_t **ptp;
1613 int tpri;
1615 ASSERT(THREAD_LOCK_HELD(tp));
1617 if (tp->t_state != TS_RUN)
1618 return (0);
1621 * The thread is "swapped" or is on the swap queue and
1622 * hence no longer on the run queue, so return true.
1624 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1625 return (1);
1627 tpri = DISP_PRIO(tp);
1628 dp = tp->t_disp_queue;
1629 ASSERT(tpri < dp->disp_npri);
1630 dq = &dp->disp_q[tpri];
1631 ptp = &dq->dq_first;
1632 rp = *ptp;
1633 trp = NULL;
1635 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1638 * Search for thread in queue.
1639 * Double links would simplify this at the expense of disp/setrun.
1641 while (rp != tp && rp != NULL) {
1642 trp = rp;
1643 ptp = &trp->t_link;
1644 rp = trp->t_link;
1647 if (rp == NULL) {
1648 panic("dispdeq: thread not on queue");
1651 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1654 * Found it so remove it from queue.
1656 if ((*ptp = rp->t_link) == NULL)
1657 dq->dq_last = trp;
1659 dp->disp_nrunnable--;
1660 if (--dq->dq_sruncnt == 0) {
1661 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1662 if (dp->disp_nrunnable == 0) {
1663 dp->disp_max_unbound_pri = -1;
1664 dp->disp_maxrunpri = -1;
1665 } else if (tpri == dp->disp_maxrunpri) {
1666 int ipri;
1668 ipri = bt_gethighbit(dp->disp_qactmap,
1669 dp->disp_maxrunpri >> BT_ULSHIFT);
1670 if (ipri < dp->disp_max_unbound_pri)
1671 dp->disp_max_unbound_pri = ipri;
1672 dp->disp_maxrunpri = ipri;
1675 tp->t_link = NULL;
1676 THREAD_TRANSITION(tp); /* put in intermediate state */
1677 return (1);
/*
 * dq_sruninc and dq_srundec are public functions for
 * incrementing/decrementing the sruncnts when a thread on
 * a dispatcher queue is made schedulable/unschedulable by
 * resetting the TS_LOAD flag.
 *
 * The caller MUST have the thread lock and therefore the dispatcher
 * queue lock so that the operation which changes
 * the flag, the operation that checks the status of the thread to
 * determine if it's on a disp queue AND the call to this function
 * are one atomic operation with respect to interrupts.
 */

/*
 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
 *
 * The thread is put into transition state and then placed at the front
 * of a dispatch queue via setfrontdq().
 */
void
dq_sruninc(kthread_t *t)
{
	ASSERT(t->t_state == TS_RUN);
	ASSERT(t->t_schedflag & TS_LOAD);

	THREAD_TRANSITION(t);
	setfrontdq(t);
}
/*
 * See comment on calling conventions above.
 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
 *
 * The thread is taken off its dispatch queue (if it is on one) and then
 * handed to disp_swapped_enq().
 */
void
dq_srundec(kthread_t *t)
{
	ASSERT(t->t_schedflag & TS_LOAD);

	(void) dispdeq(t);
	disp_swapped_enq(t);
}
1721 * Change the dispatcher lock of thread to the "swapped_lock"
1722 * and return with thread lock still held.
1724 * Called with thread_lock held, in transition state, and at high spl.
1726 void
1727 disp_swapped_enq(kthread_t *tp)
1729 ASSERT(THREAD_LOCK_HELD(tp));
1730 ASSERT(tp->t_schedflag & TS_LOAD);
1732 switch (tp->t_state) {
1733 case TS_RUN:
1734 disp_lock_enter_high(&swapped_lock);
1735 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1736 break;
1737 case TS_ONPROC:
1738 disp_lock_enter_high(&swapped_lock);
1739 THREAD_TRANSITION(tp);
1740 wake_sched_sec = 1; /* tell clock to wake sched */
1741 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1742 break;
1743 default:
1744 panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1749 * This routine is called by setbackdq/setfrontdq if the thread is
1750 * not loaded or loaded and on the swap queue.
1752 * Thread state TS_SLEEP implies that a swapped thread
1753 * has been woken up and needs to be swapped in by the swapper.
1755 * Thread state TS_RUN, it implies that the priority of a swapped
1756 * thread is being increased by scheduling class (e.g. ts_update).
1758 static void
1759 disp_swapped_setrun(kthread_t *tp)
1761 ASSERT(THREAD_LOCK_HELD(tp));
1762 ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1764 switch (tp->t_state) {
1765 case TS_SLEEP:
1766 disp_lock_enter_high(&swapped_lock);
1768 * Wakeup sched immediately (i.e., next tick) if the
1769 * thread priority is above maxclsyspri.
1771 if (DISP_PRIO(tp) > maxclsyspri)
1772 wake_sched = 1;
1773 else
1774 wake_sched_sec = 1;
1775 THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1776 break;
1777 case TS_RUN: /* called from ts_update */
1778 break;
1779 default:
1780 panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1785 * Make a thread give up its processor. Find the processor on
1786 * which this thread is executing, and have that processor
1787 * preempt.
1789 * We allow System Duty Cycle (SDC) threads to be preempted even if
1790 * they are running at kernel priorities. To implement this, we always
1791 * set cpu_kprunrun; this ensures preempt() will be called. Since SDC
1792 * calls cpu_surrender() very often, we only preempt if there is anyone
1793 * competing with us.
1795 void
1796 cpu_surrender(kthread_t *tp)
1798 cpu_t *cpup;
1799 int max_pri;
1800 int max_run_pri;
1801 klwp_t *lwp;
1803 ASSERT(THREAD_LOCK_HELD(tp));
1805 if (tp->t_state != TS_ONPROC)
1806 return;
1807 cpup = tp->t_disp_queue->disp_cpu; /* CPU thread dispatched to */
1808 max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1809 max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1810 if (max_pri < max_run_pri)
1811 max_pri = max_run_pri;
1813 if (tp->t_cid == sysdccid) {
1814 uint_t t_pri = DISP_PRIO(tp);
1815 if (t_pri > max_pri)
1816 return; /* we are not competing w/ anyone */
1817 cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1818 } else {
1819 cpup->cpu_runrun = 1;
1820 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1821 cpup->cpu_kprunrun = 1;
1826 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1828 membar_enter();
1830 DTRACE_SCHED1(surrender, kthread_t *, tp);
1833 * Make the target thread take an excursion through trap()
1834 * to do preempt() (unless we're already in trap or post_syscall,
1835 * calling cpu_surrender via CL_TRAPRET).
1837 if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1838 lwp->lwp_state != LWP_USER) {
1839 aston(tp);
1840 if (cpup != CPU)
1841 poke_cpu(cpup->cpu_id);
1843 TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1844 "cpu_surrender:tid %p cpu %p", tp, cpup);
1848 * Commit to and ratify a scheduling decision
1850 /*ARGSUSED*/
1851 static kthread_t *
1852 disp_ratify(kthread_t *tp, disp_t *kpq)
1854 pri_t tpri, maxpri;
1855 pri_t maxkpri;
1856 cpu_t *cpup;
1858 ASSERT(tp != NULL);
1860 * Commit to, then ratify scheduling decision
1862 cpup = CPU;
1863 if (cpup->cpu_runrun != 0)
1864 cpup->cpu_runrun = 0;
1865 if (cpup->cpu_kprunrun != 0)
1866 cpup->cpu_kprunrun = 0;
1867 if (cpup->cpu_chosen_level != -1)
1868 cpup->cpu_chosen_level = -1;
1869 membar_enter();
1870 tpri = DISP_PRIO(tp);
1871 maxpri = cpup->cpu_disp->disp_maxrunpri;
1872 maxkpri = kpq->disp_maxrunpri;
1873 if (maxpri < maxkpri)
1874 maxpri = maxkpri;
1875 if (tpri < maxpri) {
1877 * should have done better
1878 * put this one back and indicate to try again
1880 cpup->cpu_dispthread = curthread; /* fixup dispthread */
1881 cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1882 thread_lock_high(tp);
1883 THREAD_TRANSITION(tp);
1884 setfrontdq(tp);
1885 thread_unlock_nopreempt(tp);
1887 tp = NULL;
1889 return (tp);
/*
 * See if there is any work on the dispatcher queue for other CPUs.
 * If there is, dequeue the best thread and return.
 *
 * cp is the CPU looking for work.  The partition's kp queue is tried
 * first; after that the run queues of other CPUs are scanned.
 *
 * Returns the thread to run, NULL if nothing was found (or work showed
 * up on the local queue in the meantime), or T_DONTSTEAL if candidates
 * exist but none may be stolen yet.
 */
static kthread_t *
disp_getwork(cpu_t *cp)
{
	cpu_t		*ocp;		/* other CPU */
	cpu_t		*ocp_start;
	cpu_t		*tcp;		/* target local CPU */
	kthread_t	*tp;
	kthread_t	*retval = NULL;
	pri_t		maxpri;
	disp_t		*kpq;		/* kp queue for this partition */
	lpl_t		*lpl, *lpl_leaf;
	int		leafidx, startidx;
	hrtime_t	stealtime;
	lgrp_id_t	local_id;

	maxpri = -1;
	tcp = NULL;

	kpq = &cp->cpu_part->cp_kp_queue;
	while (kpq->disp_maxrunpri >= 0) {
		/*
		 * Try to take a thread from the kp_queue.
		 */
		tp = (disp_getbest(kpq));
		if (tp)
			return (disp_ratify(tp, kpq));
	}

	kpreempt_disable();		/* protect the cpu_active list */

	/*
	 * Try to find something to do on another CPU's run queue.
	 * Loop through all other CPUs looking for the one with the highest
	 * priority unbound thread.
	 *
	 * On NUMA machines, the partition's CPUs are consulted in order of
	 * distance from the current CPU. This way, the first available
	 * work found is also the closest, and will suffer the least
	 * from being migrated.
	 */
	lpl = lpl_leaf = cp->cpu_lpl;
	local_id = lpl_leaf->lpl_lgrpid;
	leafidx = startidx = 0;

	/*
	 * This loop traverses the lpl hierarchy. Higher level lpls represent
	 * broader levels of locality
	 */
	do {
		/* This loop iterates over the lpl's leaves */
		do {
			/* In our own leaf, start with the CPU after us. */
			if (lpl_leaf != cp->cpu_lpl)
				ocp = lpl_leaf->lpl_cpus;
			else
				ocp = cp->cpu_next_lpl;

			/* This loop iterates over the CPUs in the leaf */
			ocp_start = ocp;
			do {
				pri_t pri;

				ASSERT(CPU_ACTIVE(ocp));

				/*
				 * End our stroll around this lpl if:
				 *
				 * - Something became runnable on the local
				 *   queue...which also ends our stroll around
				 *   the partition.
				 *
				 * - We happen across another idle CPU.
				 *   Since it is patrolling the next portion
				 *   of the lpl's list (assuming it's not
				 *   halted, or busy servicing an interrupt),
				 *   move to the next higher level of locality.
				 */
				if (cp->cpu_disp->disp_nrunnable != 0) {
					kpreempt_enable();
					return (NULL);
				}
				if (ocp->cpu_dispatch_pri == -1) {
					if (ocp->cpu_disp_flags &
					    CPU_DISP_HALTED ||
					    ocp->cpu_intr_actv != 0)
						continue;
					else
						goto next_level;
				}

				/*
				 * If there's only one thread and the CPU
				 * is in the middle of a context switch,
				 * or it's currently running the idle thread,
				 * don't steal it.
				 */
				if ((ocp->cpu_disp_flags &
				    CPU_DISP_DONTSTEAL) &&
				    ocp->cpu_disp->disp_nrunnable == 1)
					continue;

				pri = ocp->cpu_disp->disp_max_unbound_pri;
				if (pri > maxpri) {
					/*
					 * Don't steal threads that we attempted
					 * to steal recently until they're ready
					 * to be stolen again.
					 */
					stealtime = ocp->cpu_disp->disp_steal;
					if (stealtime == 0 ||
					    stealtime - gethrtime() <= 0) {
						maxpri = pri;
						tcp = ocp;
					} else {
						/*
						 * Don't update tcp, just set
						 * the retval to T_DONTSTEAL, so
						 * that if no acceptable CPUs
						 * are found the return value
						 * will be T_DONTSTEAL rather
						 * than NULL.
						 */
						retval = T_DONTSTEAL;
					}
				}
			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);

			/*
			 * Iterate to the next leaf lpl in the resource set
			 * at this level of locality. If we hit the end of
			 * the set, wrap back around to the beginning.
			 *
			 * Note: This iteration is NULL terminated for a reason
			 * see lpl_topo_bootstrap() in lgrp.c for details.
			 */
			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
				leafidx = 0;
				lpl_leaf = lpl->lpl_rset[leafidx];
			}
		} while (leafidx != startidx);

next_level:
		/*
		 * Expand the search to include farther away CPUs (next
		 * locality level). The closer CPUs that have already been
		 * checked will be checked again. In doing so, idle CPUs
		 * will tend to be more aggressive about stealing from CPUs
		 * that are closer (since the closer CPUs will be considered
		 * more often).
		 * Begin at this level with the CPUs local leaf lpl.
		 */
		if ((lpl = lpl->lpl_parent) != NULL) {
			leafidx = startidx = lpl->lpl_id2rset[local_id];
			lpl_leaf = lpl->lpl_rset[leafidx];
		}
	} while (!tcp && lpl);

	kpreempt_enable();

	/*
	 * If another queue looks good, and there is still nothing on
	 * the local queue, try to transfer one or more threads
	 * from it to our queue.
	 */
	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
		tp = disp_getbest(tcp->cpu_disp);
		if (tp == NULL || tp == T_DONTSTEAL)
			return (tp);
		return (disp_ratify(tp, kpq));
	}
	return (retval);
}
2070 * disp_fix_unbound_pri()
2071 * Determines the maximum priority of unbound threads on the queue.
2072 * The priority is kept for the queue, but is only increased, never
2073 * reduced unless some CPU is looking for something on that queue.
2075 * The priority argument is the known upper limit.
2077 * Perhaps this should be kept accurately, but that probably means
2078 * separate bitmaps for bound and unbound threads. Since only idled
2079 * CPUs will have to do this recalculation, it seems better this way.
2081 static void
2082 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2084 kthread_t *tp;
2085 dispq_t *dq;
2086 ulong_t *dqactmap = dp->disp_qactmap;
2087 ulong_t mapword;
2088 int wx;
2090 ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2092 ASSERT(pri >= 0); /* checked by caller */
2095 * Start the search at the next lowest priority below the supplied
2096 * priority. This depends on the bitmap implementation.
2098 do {
2099 wx = pri >> BT_ULSHIFT; /* index of word in map */
2102 * Form mask for all lower priorities in the word.
2104 mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2107 * Get next lower active priority.
2109 if (mapword != 0) {
2110 pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2111 } else if (wx > 0) {
2112 pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2113 if (pri < 0)
2114 break;
2115 } else {
2116 pri = -1;
2117 break;
2121 * Search the queue for unbound, runnable threads.
2123 dq = &dp->disp_q[pri];
2124 tp = dq->dq_first;
2126 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2127 tp = tp->t_link;
2131 * If a thread was found, set the priority and return.
2133 } while (tp == NULL);
2136 * pri holds the maximum unbound thread priority or -1.
2138 if (dp->disp_max_unbound_pri != pri)
2139 dp->disp_max_unbound_pri = pri;
2143 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2144 * check if the CPU to which is was previously bound should have
2145 * its disp_max_unbound_pri increased.
2147 void
2148 disp_adjust_unbound_pri(kthread_t *tp)
2150 disp_t *dp;
2151 pri_t tpri;
2153 ASSERT(THREAD_LOCK_HELD(tp));
2156 * Don't do anything if the thread is not bound, or
2157 * currently not runnable or swapped out.
2159 if (tp->t_bound_cpu == NULL ||
2160 tp->t_state != TS_RUN ||
2161 tp->t_schedflag & TS_ON_SWAPQ)
2162 return;
2164 tpri = DISP_PRIO(tp);
2165 dp = tp->t_bound_cpu->cpu_disp;
2166 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2167 if (tpri > dp->disp_max_unbound_pri)
2168 dp->disp_max_unbound_pri = tpri;
/*
 * disp_getbest()
 *   De-queue the highest priority unbound runnable thread.
 *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
 *   Returns NULL if nothing found.
 *   Returns T_DONTSTEAL if the thread was not stealable,
 *   so that the caller will try again later.
 *
 *   Passed a pointer to a dispatch queue not associated with this CPU.
 *   The queue may be a per-CPU queue (dp->disp_cpu != NULL) or one with
 *   no owning CPU, in which case the nosteal delay logic is skipped.
 */
static kthread_t *
disp_getbest(disp_t *dp)
{
	kthread_t	*tp;
	dispq_t		*dq;
	pri_t		pri;
	cpu_t		*cp, *tcp;
	boolean_t	allbound;

	disp_lock_enter(&dp->disp_lock);

	/*
	 * If there is nothing to run, or the CPU is in the middle of a
	 * context switch of the only thread, return NULL.
	 */
	tcp = dp->disp_cpu;
	cp = CPU;
	pri = dp->disp_max_unbound_pri;
	if (pri == -1 ||
	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
	    tcp->cpu_disp->disp_nrunnable == 1)) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (NULL);
	}

	dq = &dp->disp_q[pri];


	/*
	 * Assume that all threads are bound on this queue, and change it
	 * later when we find out that it is not the case.
	 */
	allbound = B_TRUE;
	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
		hrtime_t now, nosteal, rqtime;

		/*
		 * Skip over bound threads which could be here even
		 * though disp_max_unbound_pri indicated this level.
		 */
		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
			continue;

		/*
		 * We've got some unbound threads on this queue, so turn
		 * the allbound flag off now.
		 */
		allbound = B_FALSE;

		/*
		 * The thread is a candidate for stealing from its run queue. We
		 * don't want to steal threads that became runnable just a
		 * moment ago. This improves CPU affinity for threads that get
		 * preempted for short periods of time and go back on the run
		 * queue.
		 *
		 * We want to let it stay on its run queue if it was only placed
		 * there recently and it was running on the same CPU before that
		 * to preserve its cache investment. For the thread to remain on
		 * its run queue, ALL of the following conditions must be
		 * satisfied:
		 *
		 * - the disp queue should not be the kernel preemption queue
		 * - delayed idle stealing should not be disabled
		 * - nosteal_nsec should be non-zero
		 * - it should run with user priority
		 * - it should be on the run queue of the CPU where it was
		 *   running before being placed on the run queue
		 * - it should be the only thread on the run queue (to prevent
		 *   extra scheduling latency for other threads)
		 * - it should sit on the run queue for less than per-chip
		 *   nosteal interval or global nosteal interval
		 * - in case of CPUs with shared cache it should sit in a run
		 *   queue of a CPU from a different chip
		 *
		 * The checks are arranged so that the ones that are faster are
		 * placed earlier.
		 */
		if (tcp == NULL ||
		    pri >= minclsyspri ||
		    tp->t_cpu != tcp)
			break;

		/*
		 * Steal immediately if, due to CMT processor architecture
		 * migration between cp and tcp would incur no performance
		 * penalty.
		 */
		if (pg_cmt_can_migrate(cp, tcp))
			break;

		nosteal = nosteal_nsec;
		if (nosteal == 0)
			break;

		/*
		 * Calculate time spent sitting on run queue
		 */
		now = gethrtime_unscaled();
		rqtime = now - tp->t_waitrq;
		scalehrtime(&rqtime);

		/*
		 * Steal immediately if the time spent on this run queue is more
		 * than allowed nosteal delay.
		 *
		 * Negative rqtime check is needed here to avoid infinite
		 * stealing delays caused by unlikely but not impossible
		 * drifts between CPU times on different CPUs.
		 */
		if (rqtime > nosteal || rqtime < 0)
			break;

		DTRACE_PROBE4(nosteal, kthread_t *, tp,
		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
		scalehrtime(&now);
		/*
		 * Calculate when this thread becomes stealable
		 */
		now += (nosteal - rqtime);

		/*
		 * Calculate time when some thread becomes stealable
		 *
		 * NOTE(review): every write to disp_steal visible in this
		 * part of the file stores 0, and a positive 'now' is never
		 * less than 0, so this assignment looks unreachable unless
		 * disp_steal is set to a non-zero value elsewhere - confirm.
		 */
		if (now < dp->disp_steal)
			dp->disp_steal = now;
	}

	/*
	 * If there were no unbound threads on this queue, find the queue
	 * where they are and then return later. The value of
	 * disp_max_unbound_pri is not always accurate because it isn't
	 * reduced until another idle CPU looks for work.
	 */
	if (allbound)
		disp_fix_unbound_pri(dp, pri);

	/*
	 * If we reached the end of the queue and found no unbound threads
	 * then return NULL so that other CPUs will be considered.  If there
	 * are unbound threads but they cannot yet be stolen, then
	 * return T_DONTSTEAL and try again later.
	 */
	if (tp == NULL) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (allbound ? NULL : T_DONTSTEAL);
	}

	/*
	 * Found a runnable, unbound thread, so remove it from queue.
	 * dispdeq() requires that we have the thread locked, and we do,
	 * by virtue of holding the dispatch queue lock.  dispdeq() will
	 * put the thread in transition state, thereby dropping the dispq
	 * lock.
	 */

#ifdef DEBUG
	{
		int	thread_was_on_queue;

		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
		ASSERT(thread_was_on_queue);
	}

#else /* DEBUG */
	(void) dispdeq(tp);			/* drops disp_lock */
#endif /* DEBUG */

	/*
	 * Reset the disp_queue steal time - we do not know what is the smallest
	 * value across the queue is.
	 */
	dp->disp_steal = 0;

	tp->t_schedflag |= TS_DONT_SWAP;

	/*
	 * Setup thread to run on the current CPU.
	 */
	tp->t_disp_queue = cp->cpu_disp;

	cp->cpu_dispthread = tp;		/* protected by spl only */
	cp->cpu_dispatch_pri = pri;

	/*
	 * There can be a memory synchronization race between disp_getbest()
	 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
	 * to preempt the current thread to run the enqueued thread while
	 * disp_getbest() and disp_ratify() are changing the current thread
	 * to the stolen thread. This may lead to a situation where
	 * cpu_resched() tries to preempt the wrong thread and the
	 * stolen thread continues to run on the CPU which has been tagged
	 * for preemption.
	 * Later the clock thread gets enqueued but doesn't get to run on the
	 * CPU causing the system to hang.
	 *
	 * To avoid this, grabbing and dropping the disp_lock (which does
	 * a memory barrier) is needed to synchronize the execution of
	 * cpu_resched() with disp_getbest() and disp_ratify() and
	 * synchronize the memory read and written by cpu_resched(),
	 * disp_getbest(), and disp_ratify() with each other.
	 *  (see CR#6482861 for more details).
	 */
	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
	disp_lock_exit_high(&cp->cpu_disp->disp_lock);

	ASSERT(pri == DISP_PRIO(tp));

	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);

	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */

	/*
	 * Return with spl high so that swtch() won't need to raise it.
	 * The disp_lock was dropped by dispdeq().
	 */

	return (tp);
}
2403 * disp_bound_common() - common routine for higher level functions
2404 * that check for bound threads under certain conditions.
2405 * If 'threadlistsafe' is set then there is no need to acquire
2406 * pidlock to stop the thread list from changing (eg, if
2407 * disp_bound_* is called with cpus paused).
2409 static int
2410 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2412 int found = 0;
2413 kthread_t *tp;
2415 ASSERT(flag);
2417 if (!threadlistsafe)
2418 mutex_enter(&pidlock);
2419 tp = curthread; /* faster than allthreads */
2420 do {
2421 if (tp->t_state != TS_FREE) {
2423 * If an interrupt thread is busy, but the
2424 * caller doesn't care (i.e. BOUND_INTR is off),
2425 * then just ignore it and continue through.
2427 if ((tp->t_flag & T_INTR_THREAD) &&
2428 !(flag & BOUND_INTR))
2429 continue;
2432 * Skip the idle thread for the CPU
2433 * we're about to set offline.
2435 if (tp == cp->cpu_idle_thread)
2436 continue;
2439 * Skip the pause thread for the CPU
2440 * we're about to set offline.
2442 if (tp == cp->cpu_pause_thread)
2443 continue;
2445 if ((flag & BOUND_CPU) &&
2446 (tp->t_bound_cpu == cp ||
2447 tp->t_bind_cpu == cp->cpu_id ||
2448 tp->t_weakbound_cpu == cp)) {
2449 found = 1;
2450 break;
2453 if ((flag & BOUND_PARTITION) &&
2454 (tp->t_cpupart == cp->cpu_part)) {
2455 found = 1;
2456 break;
2459 } while ((tp = tp->t_next) != curthread && found == 0);
2460 if (!threadlistsafe)
2461 mutex_exit(&pidlock);
2462 return (found);
2466 * disp_bound_threads - return nonzero if threads are bound to the processor.
2467 * Called infrequently. Keep this simple.
2468 * Includes threads that are asleep or stopped but not onproc.
2471 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2473 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2477 * disp_bound_anythreads - return nonzero if _any_ threads are bound
2478 * to the given processor, including interrupt threads.
2481 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2483 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2487 * disp_bound_partition - return nonzero if threads are bound to the same
2488 * partition as the processor.
2489 * Called infrequently. Keep this simple.
2490 * Includes threads that are asleep or stopped but not onproc.
2493 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2495 return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2499 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2500 * threads to other CPUs.
2502 void
2503 disp_cpu_inactive(cpu_t *cp)
2505 kthread_t *tp;
2506 disp_t *dp = cp->cpu_disp;
2507 dispq_t *dq;
2508 pri_t pri;
2509 int wasonq;
2511 disp_lock_enter(&dp->disp_lock);
2512 while ((pri = dp->disp_max_unbound_pri) != -1) {
2513 dq = &dp->disp_q[pri];
2514 tp = dq->dq_first;
2517 * Skip over bound threads.
2519 while (tp != NULL && tp->t_bound_cpu != NULL) {
2520 tp = tp->t_link;
2523 if (tp == NULL) {
2524 /* disp_max_unbound_pri must be inaccurate, so fix it */
2525 disp_fix_unbound_pri(dp, pri);
2526 continue;
2529 wasonq = dispdeq(tp); /* drops disp_lock */
2530 ASSERT(wasonq);
2531 ASSERT(tp->t_weakbound_cpu == NULL);
2533 setbackdq(tp);
2535 * Called from cpu_offline:
2537 * cp has already been removed from the list of active cpus
2538 * and tp->t_cpu has been changed so there is no risk of
2539 * tp ending up back on cp.
2541 * Called from cpupart_move_cpu:
2543 * The cpu has moved to a new cpupart. Any threads that
2544 * were on it's dispatch queues before the move remain
2545 * in the old partition and can't run in the new partition.
2547 ASSERT(tp->t_cpu != cp);
2548 thread_unlock(tp);
2550 disp_lock_enter(&dp->disp_lock);
2552 disp_lock_exit(&dp->disp_lock);
2556 * disp_lowpri_cpu - find CPU running the lowest priority thread.
2557 * The hint passed in is used as a starting point so we don't favor
2558 * CPU 0 or any other CPU. The caller should pass in the most recently
2559 * used CPU for the thread.
2561 * The lgroup and priority are used to determine the best CPU to run on
2562 * in a NUMA machine. The lgroup specifies which CPUs are closest while
2563 * the thread priority will indicate whether the thread will actually run
2564 * there. To pick the best CPU, the CPUs inside and outside of the given
2565 * lgroup which are running the lowest priority threads are found. The
2566 * remote CPU is chosen only if the thread will not run locally on a CPU
2567 * within the lgroup, but will run on the remote CPU. If the thread
2568 * cannot immediately run on any CPU, the best local CPU will be chosen.
2570 * The lpl specified also identifies the cpu partition from which
2571 * disp_lowpri_cpu should select a CPU.
2573 * curcpu is used to indicate that disp_lowpri_cpu is being called on
2574 * behalf of the current thread. (curthread is looking for a new cpu)
2575 * In this case, cpu_dispatch_pri for this thread's cpu should be
2576 * ignored.
2578 * If a cpu is the target of an offline request then try to avoid it.
2580 * This function must be called at either high SPL, or with preemption
2581 * disabled, so that the "hint" CPU cannot be removed from the online
2582 * CPU list while we are traversing it.
2584 cpu_t *
2585 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2587 cpu_t *bestcpu;
2588 cpu_t *besthomecpu;
2589 cpu_t *cp, *cpstart;
2591 pri_t bestpri;
2592 pri_t cpupri;
2594 klgrpset_t done;
2595 klgrpset_t cur_set;
2597 lpl_t *lpl_iter, *lpl_leaf;
2598 int i;
2601 * Scan for a CPU currently running the lowest priority thread.
2602 * Cannot get cpu_lock here because it is adaptive.
2603 * We do not require lock on CPU list.
2605 ASSERT(hint != NULL);
2606 ASSERT(lpl != NULL);
2607 ASSERT(lpl->lpl_ncpu > 0);
2610 * First examine local CPUs. Note that it's possible the hint CPU
2611 * passed in in remote to the specified home lgroup. If our priority
2612 * isn't sufficient enough such that we can run immediately at home,
2613 * then examine CPUs remote to our home lgroup.
2614 * We would like to give preference to CPUs closest to "home".
2615 * If we can't find a CPU where we'll run at a given level
2616 * of locality, we expand our search to include the next level.
2618 bestcpu = besthomecpu = NULL;
2619 klgrpset_clear(done);
2620 /* start with lpl we were passed */
2622 lpl_iter = lpl;
2624 do {
2626 bestpri = SHRT_MAX;
2627 klgrpset_clear(cur_set);
2629 for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2630 lpl_leaf = lpl_iter->lpl_rset[i];
2631 if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2632 continue;
2634 klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2636 if (hint->cpu_lpl == lpl_leaf)
2637 cp = cpstart = hint;
2638 else
2639 cp = cpstart = lpl_leaf->lpl_cpus;
2641 do {
2642 if (cp == curcpu)
2643 cpupri = -1;
2644 else if (cp == cpu_inmotion)
2645 cpupri = SHRT_MAX;
2646 else
2647 cpupri = cp->cpu_dispatch_pri;
2648 if (cp->cpu_disp->disp_maxrunpri > cpupri)
2649 cpupri = cp->cpu_disp->disp_maxrunpri;
2650 if (cp->cpu_chosen_level > cpupri)
2651 cpupri = cp->cpu_chosen_level;
2652 if (cpupri < bestpri) {
2653 if (CPU_IDLING(cpupri)) {
2654 ASSERT((cp->cpu_flags &
2655 CPU_QUIESCED) == 0);
2656 return (cp);
2658 bestcpu = cp;
2659 bestpri = cpupri;
2661 } while ((cp = cp->cpu_next_lpl) != cpstart);
2664 if (bestcpu && (tpri > bestpri)) {
2665 ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2666 return (bestcpu);
2668 if (besthomecpu == NULL)
2669 besthomecpu = bestcpu;
2671 * Add the lgrps we just considered to the "done" set
2673 klgrpset_or(done, cur_set);
2675 } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2678 * The specified priority isn't high enough to run immediately
2679 * anywhere, so just return the best CPU from the home lgroup.
2681 ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2682 return (besthomecpu);
/*
 * Generic idle cpu function, usable by all processors; it does nothing.
 *
 * If a processor has some specific code to execute when idle (say, to stop
 * the pipeline and save power) then that routine should be defined in the
 * processor's specific code (module_xx.c) and the global variable idle_cpu
 * set to that function.
 */
static void
generic_idle_cpu(void)
{
	/* Intentionally empty. */
}
2697 /*ARGSUSED*/
2698 static void
2699 generic_enq_thread(cpu_t *cpu, int bound)