2 * Copyright (c) 1982, 1986, 1990, 1991, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
38 #include "opt_hwpmc_hooks.h"
39 #include "opt_sched.h"
40 #include "opt_kdtrace.h"
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/cpuset.h>
45 #include <sys/kernel.h>
48 #include <sys/kthread.h>
49 #include <sys/mutex.h>
51 #include <sys/resourcevar.h>
52 #include <sys/sched.h>
54 #include <sys/sysctl.h>
56 #include <sys/turnstile.h>
58 #include <machine/pcb.h>
59 #include <machine/smp.h>
62 #include <sys/pmckern.h>
66 #include <sys/dtrace_bsd.h>
67 int dtrace_vtime_active
;
68 dtrace_vtime_switch_func_t dtrace_vtime_switch_func
;
72 * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
73 * the range 100-256 Hz (approximately).
75 #define ESTCPULIM(e) \
76 min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
77 RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
79 #define INVERSE_ESTCPU_WEIGHT (8 * smp_cpus)
81 #define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). */
83 #define NICE_WEIGHT 1 /* Priorities per nice level. */
86 * The schedulable entity that runs a context.
87 * This is an extension to the thread structure and is tailored to
88 * the requirements of this scheduler
91 fixpt_t ts_pctcpu
; /* (j) %cpu during p_swtime. */
92 int ts_cpticks
; /* (j) Ticks of cpu time. */
93 int ts_slptime
; /* (j) Seconds !RUNNING. */
95 struct runq
*ts_runq
; /* runq the thread is currently on */
98 /* flags kept in td_flags */
99 #define TDF_DIDRUN TDF_SCHED0 /* thread actually ran. */
100 #define TDF_BOUND TDF_SCHED1 /* Bound to one CPU. */
102 /* flags kept in ts_flags */
103 #define TSF_AFFINITY 0x0001 /* Has a non-"full" CPU set. */
105 #define SKE_RUNQ_PCPU(ts) \
106 ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)
108 #define THREAD_CAN_SCHED(td, cpu) \
109 CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
111 static struct td_sched td_sched0
;
112 struct mtx sched_lock
;
114 static int sched_tdcnt
; /* Total runnable threads in the system. */
115 static int sched_quantum
; /* Roundrobin scheduling quantum in ticks. */
116 #define SCHED_QUANTUM (hz / 10) /* Default sched quantum */
118 static void setup_runqs(void);
119 static void schedcpu(void);
120 static void schedcpu_thread(void);
121 static void sched_priority(struct thread
*td
, u_char prio
);
122 static void sched_setup(void *dummy
);
123 static void maybe_resched(struct thread
*td
);
124 static void updatepri(struct thread
*td
);
125 static void resetpriority(struct thread
*td
);
126 static void resetpriority_thread(struct thread
*td
);
128 static int sched_pickcpu(struct thread
*td
);
129 static int forward_wakeup(int cpunum
);
130 static void kick_other_cpu(int pri
, int cpuid
);
133 static struct kproc_desc sched_kp
= {
138 SYSINIT(schedcpu
, SI_SUB_RUN_SCHEDULER
, SI_ORDER_FIRST
, kproc_start
,
140 SYSINIT(sched_setup
, SI_SUB_RUN_QUEUE
, SI_ORDER_FIRST
, sched_setup
, NULL
);
145 static struct runq runq
;
151 static struct runq runq_pcpu
[MAXCPU
];
152 long runq_length
[MAXCPU
];
161 for (i
= 0; i
< MAXCPU
; ++i
)
162 runq_init(&runq_pcpu
[i
]);
169 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS
)
173 new_val
= sched_quantum
* tick
;
174 error
= sysctl_handle_int(oidp
, &new_val
, 0, req
);
175 if (error
!= 0 || req
->newptr
== NULL
)
179 sched_quantum
= new_val
/ tick
;
180 hogticks
= 2 * sched_quantum
;
184 SYSCTL_NODE(_kern
, OID_AUTO
, sched
, CTLFLAG_RD
, 0, "Scheduler");
186 SYSCTL_STRING(_kern_sched
, OID_AUTO
, name
, CTLFLAG_RD
, "4BSD", 0,
189 SYSCTL_PROC(_kern_sched
, OID_AUTO
, quantum
, CTLTYPE_INT
| CTLFLAG_RW
,
190 0, sizeof sched_quantum
, sysctl_kern_quantum
, "I",
191 "Roundrobin scheduling quantum in microseconds");
194 /* Enable forwarding of wakeups to all other cpus */
195 SYSCTL_NODE(_kern_sched
, OID_AUTO
, ipiwakeup
, CTLFLAG_RD
, NULL
, "Kernel SMP");
197 static int runq_fuzz
= 1;
198 SYSCTL_INT(_kern_sched
, OID_AUTO
, runq_fuzz
, CTLFLAG_RW
, &runq_fuzz
, 0, "");
200 static int forward_wakeup_enabled
= 1;
201 SYSCTL_INT(_kern_sched_ipiwakeup
, OID_AUTO
, enabled
, CTLFLAG_RW
,
202 &forward_wakeup_enabled
, 0,
203 "Forwarding of wakeup to idle CPUs");
205 static int forward_wakeups_requested
= 0;
206 SYSCTL_INT(_kern_sched_ipiwakeup
, OID_AUTO
, requested
, CTLFLAG_RD
,
207 &forward_wakeups_requested
, 0,
208 "Requests for Forwarding of wakeup to idle CPUs");
210 static int forward_wakeups_delivered
= 0;
211 SYSCTL_INT(_kern_sched_ipiwakeup
, OID_AUTO
, delivered
, CTLFLAG_RD
,
212 &forward_wakeups_delivered
, 0,
213 "Completed Forwarding of wakeup to idle CPUs");
215 static int forward_wakeup_use_mask
= 1;
216 SYSCTL_INT(_kern_sched_ipiwakeup
, OID_AUTO
, usemask
, CTLFLAG_RW
,
217 &forward_wakeup_use_mask
, 0,
218 "Use the mask of idle cpus");
220 static int forward_wakeup_use_loop
= 0;
221 SYSCTL_INT(_kern_sched_ipiwakeup
, OID_AUTO
, useloop
, CTLFLAG_RW
,
222 &forward_wakeup_use_loop
, 0,
223 "Use a loop to find idle cpus");
225 static int forward_wakeup_use_single
= 0;
226 SYSCTL_INT(_kern_sched_ipiwakeup
, OID_AUTO
, onecpu
, CTLFLAG_RW
,
227 &forward_wakeup_use_single
, 0,
228 "Only signal one idle cpu");
230 static int forward_wakeup_use_htt
= 0;
231 SYSCTL_INT(_kern_sched_ipiwakeup
, OID_AUTO
, htt2
, CTLFLAG_RW
,
232 &forward_wakeup_use_htt
, 0,
237 static int sched_followon
= 0;
238 SYSCTL_INT(_kern_sched
, OID_AUTO
, followon
, CTLFLAG_RW
,
240 "allow threads to share a quantum");
247 CTR1(KTR_SCHED
, "global load: %d", sched_tdcnt
);
254 CTR1(KTR_SCHED
, "global load: %d", sched_tdcnt
);
257 * Arrange to reschedule if necessary, taking the priorities and
258 * schedulers into account.
261 maybe_resched(struct thread
*td
)
264 THREAD_LOCK_ASSERT(td
, MA_OWNED
);
265 if (td
->td_priority
< curthread
->td_priority
)
266 curthread
->td_flags
|= TDF_NEEDRESCHED
;
270 * This function is called when a thread is about to be put on run queue
271 * because it has been made runnable or its priority has been adjusted. It
272 * determines if the new thread should be immediately preempted to. If so,
273 * it switches to it and eventually returns true. If not, it returns false
274 * so that the caller may place the thread on an appropriate run queue.
277 maybe_preempt(struct thread
*td
)
284 * The new thread should not preempt the current thread if any of the
285 * following conditions are true:
287 * - The kernel is in the throes of crashing (panicstr).
288 * - The current thread has a higher (numerically lower) or
289 * equivalent priority. Note that this prevents curthread from
290 * trying to preempt to itself.
291 * - It is too early in the boot for context switches (cold is set).
292 * - The current thread has an inhibitor set or is in the process of
293 * exiting. In this case, the current thread is about to switch
294 * out anyways, so there's no point in preempting. If we did,
295 * the current thread would not be properly resumed as well, so
296 * just avoid that whole landmine.
297 * - If the new thread's priority is not a realtime priority and
298 * the current thread's priority is not an idle priority and
299 * FULL_PREEMPTION is disabled.
301 * If all of these conditions are false, but the current thread is in
302 * a nested critical section, then we have to defer the preemption
303 * until we exit the critical section. Otherwise, switch immediately
307 THREAD_LOCK_ASSERT(td
, MA_OWNED
);
308 KASSERT((td
->td_inhibitors
== 0),
309 ("maybe_preempt: trying to run inhibited thread"));
310 pri
= td
->td_priority
;
311 cpri
= ctd
->td_priority
;
312 if (panicstr
!= NULL
|| pri
>= cpri
|| cold
/* || dumping */ ||
313 TD_IS_INHIBITED(ctd
))
315 #ifndef FULL_PREEMPTION
316 if (pri
> PRI_MAX_ITHD
&& cpri
< PRI_MIN_IDLE
)
320 if (ctd
->td_critnest
> 1) {
321 CTR1(KTR_PROC
, "maybe_preempt: in critical section %d",
323 ctd
->td_owepreempt
= 1;
327 * Thread is runnable but not yet put on system run queue.
329 MPASS(ctd
->td_lock
== td
->td_lock
);
330 MPASS(TD_ON_RUNQ(td
));
332 CTR3(KTR_PROC
, "preempting to thread %p (pid %d, %s)\n", td
,
333 td
->td_proc
->p_pid
, td
->td_name
);
334 mi_switch(SW_INVOL
| SW_PREEMPT
| SWT_PREEMPT
, td
);
336 * td's lock pointer may have changed. We have to return with it
350 * Constants for digital decay and forget:
351 * 90% of (td_estcpu) usage in 5 * loadav time
352 * 95% of (ts_pctcpu) usage in 60 seconds (load insensitive)
353 * Note that, as ps(1) mentions, this can let percentages
354 * total over 100% (I've seen 137.9% for 3 processes).
356 * Note that schedclock() updates td_estcpu and p_cpticks asynchronously.
358 * We wish to decay away 90% of td_estcpu in (5 * loadavg) seconds.
359 * That is, the system wants to compute a value of decay such
360 * that the following for loop:
361 * for (i = 0; i < (5 * loadavg); i++)
362 * td_estcpu *= decay;
365 * for all values of loadavg:
367 * Mathematically this loop can be expressed by saying:
368 * decay ** (5 * loadavg) ~= .1
370 * The system computes decay as:
371 * decay = (2 * loadavg) / (2 * loadavg + 1)
373 * We wish to prove that the system's computation of decay
374 * will always fulfill the equation:
375 * decay ** (5 * loadavg) ~= .1
377 * If we compute b as:
380 * decay = b / (b + 1)
382 * We now need to prove two things:
383 * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
384 * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
387 * For x close to zero, exp(x) =~ 1 + x, since
388 * exp(x) = 0! + x**1/1! + x**2/2! + ... .
389 * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
390 * For x close to zero, ln(1+x) =~ x, since
391 * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1
392 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
396 * Solve (factor)**(power) =~ .1 given power (5*loadav):
397 * solving for factor,
398 * ln(factor) =~ -2.30 / (5 * loadav), or
399 * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
400 * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED
403 * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
405 * power*ln(b/(b+1)) =~ -2.30, or
406 * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED
408 * Actual power values for the implemented algorithm are as follows:
410 * power: 5.68 10.32 14.94 19.55
413 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
414 #define loadfactor(loadav) (2 * (loadav))
415 #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE))
417 /* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
418 static fixpt_t ccpu
= 0.95122942450071400909 * FSCALE
; /* exp(-1/20) */
419 SYSCTL_INT(_kern
, OID_AUTO
, ccpu
, CTLFLAG_RD
, &ccpu
, 0, "");
422 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
423 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
424 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
426 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
427 * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
429 * If you don't want to bother with the faster/more-accurate formula, you
430 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
431 * (more general) method of calculating the %age of CPU used by a process.
433 #define CCPU_SHIFT 11
436 * Recompute process priorities, every hz ticks.
437 * MP-safe, called without the Giant mutex.
443 register fixpt_t loadfac
= loadfactor(averunnable
.ldavg
[0]);
447 int awake
, realstathz
;
449 realstathz
= stathz
? stathz
: hz
;
450 sx_slock(&allproc_lock
);
451 FOREACH_PROC_IN_SYSTEM(p
) {
453 FOREACH_THREAD_IN_PROC(p
, td
) {
458 * Increment sleep time (if sleeping). We
459 * ignore overflow, as above.
462 * The td_sched slptimes are not touched in wakeup
463 * because the thread may not HAVE everything in
464 * memory? XXX I think this is out of date.
466 if (TD_ON_RUNQ(td
)) {
468 td
->td_flags
&= ~TDF_DIDRUN
;
469 } else if (TD_IS_RUNNING(td
)) {
471 /* Do not clear TDF_DIDRUN */
472 } else if (td
->td_flags
& TDF_DIDRUN
) {
474 td
->td_flags
&= ~TDF_DIDRUN
;
478 * ts_pctcpu is only for ps and ttyinfo().
480 ts
->ts_pctcpu
= (ts
->ts_pctcpu
* ccpu
) >> FSHIFT
;
482 * If the td_sched has been idle the entire second,
483 * stop recalculating its priority until
486 if (ts
->ts_cpticks
!= 0) {
487 #if (FSHIFT >= CCPU_SHIFT)
488 ts
->ts_pctcpu
+= (realstathz
== 100)
489 ? ((fixpt_t
) ts
->ts_cpticks
) <<
490 (FSHIFT
- CCPU_SHIFT
) :
491 100 * (((fixpt_t
) ts
->ts_cpticks
)
492 << (FSHIFT
- CCPU_SHIFT
)) / realstathz
;
494 ts
->ts_pctcpu
+= ((FSCALE
- ccpu
) *
496 FSCALE
/ realstathz
)) >> FSHIFT
;
501 * If there are ANY running threads in this process,
502 * then don't count it as sleeping.
503 * XXX: this is broken.
506 if (ts
->ts_slptime
> 1) {
508 * In an ideal world, this should not
509 * happen, because whoever woke us
510 * up from the long sleep should have
511 * unwound the slptime and reset our
512 * priority before we run at the stale
513 * priority. Should KASSERT at some
514 * point when all the cases are fixed.
521 if (ts
->ts_slptime
> 1) {
525 td
->td_estcpu
= decay_cpu(loadfac
, td
->td_estcpu
);
527 resetpriority_thread(td
);
532 sx_sunlock(&allproc_lock
);
536 * Main loop for a kthread that executes schedcpu once a second.
539 schedcpu_thread(void)
549 * Recalculate the priority of a process after it has slept for a while.
550 * For all load averages >= 1 and max td_estcpu of 255, sleeping for at
551 * least six times the loadfactor will decay td_estcpu to zero.
554 updatepri(struct thread
*td
)
561 loadfac
= loadfactor(averunnable
.ldavg
[0]);
562 if (ts
->ts_slptime
> 5 * loadfac
)
565 newcpu
= td
->td_estcpu
;
566 ts
->ts_slptime
--; /* was incremented in schedcpu() */
567 while (newcpu
&& --ts
->ts_slptime
)
568 newcpu
= decay_cpu(loadfac
, newcpu
);
569 td
->td_estcpu
= newcpu
;
574 * Compute the priority of a process when running in user mode.
575 * Arrange to reschedule if the resulting priority is better
576 * than that of the current process.
579 resetpriority(struct thread
*td
)
581 register unsigned int newpriority
;
583 if (td
->td_pri_class
== PRI_TIMESHARE
) {
584 newpriority
= PUSER
+ td
->td_estcpu
/ INVERSE_ESTCPU_WEIGHT
+
585 NICE_WEIGHT
* (td
->td_proc
->p_nice
- PRIO_MIN
);
586 newpriority
= min(max(newpriority
, PRI_MIN_TIMESHARE
),
588 sched_user_prio(td
, newpriority
);
593 * Update the thread's priority when the associated process's user
597 resetpriority_thread(struct thread
*td
)
600 /* Only change threads with a time sharing user priority. */
601 if (td
->td_priority
< PRI_MIN_TIMESHARE
||
602 td
->td_priority
> PRI_MAX_TIMESHARE
)
605 /* XXX the whole needresched thing is broken, but not silly. */
608 sched_prio(td
, td
->td_user_pri
);
613 sched_setup(void *dummy
)
617 if (sched_quantum
== 0)
618 sched_quantum
= SCHED_QUANTUM
;
619 hogticks
= 2 * sched_quantum
;
621 /* Account for thread0. */
625 /* External interfaces start here */
628 * Very early in the boot some setup of scheduler-specific
629 * parts of proc0 and of some scheduler resources needs to be done.
637 * Set up the scheduler specific parts of proc0.
639 proc0
.p_sched
= NULL
; /* XXX */
640 thread0
.td_sched
= &td_sched0
;
641 thread0
.td_lock
= &sched_lock
;
642 mtx_init(&sched_lock
, "sched lock", NULL
, MTX_SPIN
| MTX_RECURSE
);
649 return runq_check(&runq
) + runq_check(&runq_pcpu
[PCPU_GET(cpuid
)]);
651 return runq_check(&runq
);
656 sched_rr_interval(void)
658 if (sched_quantum
== 0)
659 sched_quantum
= SCHED_QUANTUM
;
660 return (sched_quantum
);
664 * We adjust the priority of the current process. The priority of
665 * a process gets worse as it accumulates CPU time. The cpu usage
666 * estimator (td_estcpu) is increased here. resetpriority() will
667 * compute a different priority each time td_estcpu increases by
668 * INVERSE_ESTCPU_WEIGHT
669 * (until MAXPRI is reached). The cpu usage estimator ramps up
670 * quite quickly when the process is running (linearly), and decays
671 * away exponentially, at a rate which is proportionally slower when
672 * the system is busy. The basic principle is that the system will
673 * 90% forget that the process used a lot of CPU time in 5 * loadav
674 * seconds. This causes the system to favor processes which haven't
675 * run much recently, and to round-robin among other processes.
678 sched_clock(struct thread
*td
)
682 THREAD_LOCK_ASSERT(td
, MA_OWNED
);
686 td
->td_estcpu
= ESTCPULIM(td
->td_estcpu
+ 1);
687 if ((td
->td_estcpu
% INVERSE_ESTCPU_WEIGHT
) == 0) {
689 resetpriority_thread(td
);
693 * Force a context switch if the current thread has used up a full
694 * quantum (default quantum is 100ms).
696 if (!TD_IS_IDLETHREAD(td
) &&
697 ticks
- PCPU_GET(switchticks
) >= sched_quantum
)
698 td
->td_flags
|= TDF_NEEDRESCHED
;
702 * Charge child's scheduling CPU usage to parent.
705 sched_exit(struct proc
*p
, struct thread
*td
)
708 CTR3(KTR_SCHED
, "sched_exit: %p(%s) prio %d",
709 td
, td
->td_name
, td
->td_priority
);
710 PROC_LOCK_ASSERT(p
, MA_OWNED
);
711 sched_exit_thread(FIRST_THREAD_IN_PROC(p
), td
);
715 sched_exit_thread(struct thread
*td
, struct thread
*child
)
718 CTR3(KTR_SCHED
, "sched_exit_thread: %p(%s) prio %d",
719 child
, child
->td_name
, child
->td_priority
);
721 td
->td_estcpu
= ESTCPULIM(td
->td_estcpu
+ child
->td_estcpu
);
723 mtx_lock_spin(&sched_lock
);
724 if ((child
->td_proc
->p_flag
& P_NOLOAD
) == 0)
726 mtx_unlock_spin(&sched_lock
);
730 sched_fork(struct thread
*td
, struct thread
*childtd
)
732 sched_fork_thread(td
, childtd
);
736 sched_fork_thread(struct thread
*td
, struct thread
*childtd
)
740 childtd
->td_estcpu
= td
->td_estcpu
;
741 childtd
->td_lock
= &sched_lock
;
742 childtd
->td_cpuset
= cpuset_ref(td
->td_cpuset
);
743 ts
= childtd
->td_sched
;
744 bzero(ts
, sizeof(*ts
));
745 ts
->ts_flags
|= (td
->td_sched
->ts_flags
& TSF_AFFINITY
);
749 sched_nice(struct proc
*p
, int nice
)
753 PROC_LOCK_ASSERT(p
, MA_OWNED
);
755 FOREACH_THREAD_IN_PROC(p
, td
) {
758 resetpriority_thread(td
);
764 sched_class(struct thread
*td
, int class)
766 THREAD_LOCK_ASSERT(td
, MA_OWNED
);
767 td
->td_pri_class
= class;
771 * Adjust the priority of a thread.
774 sched_priority(struct thread
*td
, u_char prio
)
776 CTR6(KTR_SCHED
, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
777 td
, td
->td_name
, td
->td_priority
, prio
, curthread
,
780 THREAD_LOCK_ASSERT(td
, MA_OWNED
);
781 if (td
->td_priority
== prio
)
783 td
->td_priority
= prio
;
784 if (TD_ON_RUNQ(td
) && td
->td_rqindex
!= (prio
/ RQ_PPQ
)) {
786 sched_add(td
, SRQ_BORING
);
791 * Update a thread's priority when it is lent another thread's
795 sched_lend_prio(struct thread
*td
, u_char prio
)
798 td
->td_flags
|= TDF_BORROWING
;
799 sched_priority(td
, prio
);
803 * Restore a thread's priority when priority propagation is
804 * over. The prio argument is the minimum priority the thread
805 * needs to have to satisfy other possible priority lending
806 * requests. If the thread's regular priority is less
807 * important than prio the thread will keep a priority boost
811 sched_unlend_prio(struct thread
*td
, u_char prio
)
815 if (td
->td_base_pri
>= PRI_MIN_TIMESHARE
&&
816 td
->td_base_pri
<= PRI_MAX_TIMESHARE
)
817 base_pri
= td
->td_user_pri
;
819 base_pri
= td
->td_base_pri
;
820 if (prio
>= base_pri
) {
821 td
->td_flags
&= ~TDF_BORROWING
;
822 sched_prio(td
, base_pri
);
824 sched_lend_prio(td
, prio
);
828 sched_prio(struct thread
*td
, u_char prio
)
832 /* First, update the base priority. */
833 td
->td_base_pri
= prio
;
836 * If the thread is borrowing another thread's priority, don't ever
837 * lower the priority.
839 if (td
->td_flags
& TDF_BORROWING
&& td
->td_priority
< prio
)
842 /* Change the real priority. */
843 oldprio
= td
->td_priority
;
844 sched_priority(td
, prio
);
847 * If the thread is on a turnstile, then let the turnstile update
850 if (TD_ON_LOCK(td
) && oldprio
!= prio
)
851 turnstile_adjust(td
, oldprio
);
855 sched_user_prio(struct thread
*td
, u_char prio
)
859 THREAD_LOCK_ASSERT(td
, MA_OWNED
);
860 td
->td_base_user_pri
= prio
;
861 if (td
->td_flags
& TDF_UBORROWING
&& td
->td_user_pri
<= prio
)
863 oldprio
= td
->td_user_pri
;
864 td
->td_user_pri
= prio
;
868 sched_lend_user_prio(struct thread
*td
, u_char prio
)
872 THREAD_LOCK_ASSERT(td
, MA_OWNED
);
873 td
->td_flags
|= TDF_UBORROWING
;
874 oldprio
= td
->td_user_pri
;
875 td
->td_user_pri
= prio
;
879 sched_unlend_user_prio(struct thread
*td
, u_char prio
)
883 THREAD_LOCK_ASSERT(td
, MA_OWNED
);
884 base_pri
= td
->td_base_user_pri
;
885 if (prio
>= base_pri
) {
886 td
->td_flags
&= ~TDF_UBORROWING
;
887 sched_user_prio(td
, base_pri
);
889 sched_lend_user_prio(td
, prio
);
894 sched_sleep(struct thread
*td
, int pri
)
897 THREAD_LOCK_ASSERT(td
, MA_OWNED
);
898 td
->td_slptick
= ticks
;
899 td
->td_sched
->ts_slptime
= 0;
902 if (TD_IS_SUSPENDED(td
) || pri
<= PSOCK
)
903 td
->td_flags
|= TDF_CANSWAP
;
907 sched_switch(struct thread
*td
, struct thread
*newtd
, int flags
)
915 THREAD_LOCK_ASSERT(td
, MA_OWNED
);
918 * Switch to the sched lock to fix things up and pick
921 if (td
->td_lock
!= &sched_lock
) {
922 mtx_lock_spin(&sched_lock
);
926 if ((p
->p_flag
& P_NOLOAD
) == 0)
930 newtd
->td_flags
|= (td
->td_flags
& TDF_NEEDRESCHED
);
932 td
->td_lastcpu
= td
->td_oncpu
;
933 td
->td_flags
&= ~TDF_NEEDRESCHED
;
934 td
->td_owepreempt
= 0;
935 td
->td_oncpu
= NOCPU
;
938 * At the last moment, if this thread is still marked RUNNING,
939 * then put it back on the run queue as it has not been suspended
940 * or stopped or any thing else similar. We never put the idle
941 * threads on the run queue, however.
943 if (td
->td_flags
& TDF_IDLETD
) {
946 idle_cpus_mask
&= ~PCPU_GET(cpumask
);
949 if (TD_IS_RUNNING(td
)) {
950 /* Put us back on the run queue. */
951 sched_add(td
, (flags
& SW_PREEMPT
) ?
952 SRQ_OURSELF
|SRQ_YIELDING
|SRQ_PREEMPTED
:
953 SRQ_OURSELF
|SRQ_YIELDING
);
958 * The thread we are about to run needs to be counted
959 * as if it had been added to the run queue and selected.
965 KASSERT((newtd
->td_inhibitors
== 0),
966 ("trying to run inhibited thread"));
967 newtd
->td_flags
|= TDF_DIDRUN
;
968 TD_SET_RUNNING(newtd
);
969 if ((newtd
->td_proc
->p_flag
& P_NOLOAD
) == 0)
972 newtd
= choosethread();
974 MPASS(newtd
->td_lock
== &sched_lock
);
978 if (PMC_PROC_IS_USING_PMCS(td
->td_proc
))
979 PMC_SWITCH_CONTEXT(td
, PMC_FN_CSW_OUT
);
982 lock_profile_release_lock(&sched_lock
.lock_object
);
985 * If DTrace has set the active vtime enum to anything
986 * other than INACTIVE (0), then it should have set the
989 if (dtrace_vtime_active
)
990 (*dtrace_vtime_switch_func
)(newtd
);
993 cpu_switch(td
, newtd
, td
->td_lock
);
994 lock_profile_obtain_lock_success(&sched_lock
.lock_object
,
995 0, 0, __FILE__
, __LINE__
);
997 * Where am I? What year is it?
998 * We are in the same thread that went to sleep above,
999 * but any amount of time may have passed. All our context
1000 * will still be available as will local variables.
1001 * PCPU values however may have changed as we may have
1002 * changed CPU so don't trust cached values of them.
1003 * New threads will go to fork_exit() instead of here
1004 * so if you change things here you may need to change
1007 * If the thread above was exiting it will never wake
1008 * up again here, so either it has saved everything it
1009 * needed to, or the thread_wait() or wait() will
1013 if (PMC_PROC_IS_USING_PMCS(td
->td_proc
))
1014 PMC_SWITCH_CONTEXT(td
, PMC_FN_CSW_IN
);
1019 if (td
->td_flags
& TDF_IDLETD
)
1020 idle_cpus_mask
|= PCPU_GET(cpumask
);
1022 sched_lock
.mtx_lock
= (uintptr_t)td
;
1023 td
->td_oncpu
= PCPU_GET(cpuid
);
1024 MPASS(td
->td_lock
== &sched_lock
);
1028 sched_wakeup(struct thread
*td
)
1030 struct td_sched
*ts
;
1032 THREAD_LOCK_ASSERT(td
, MA_OWNED
);
1034 td
->td_flags
&= ~TDF_CANSWAP
;
1035 if (ts
->ts_slptime
> 1) {
1039 td
->td_slptick
= ticks
;
1041 sched_add(td
, SRQ_BORING
);
1046 forward_wakeup(int cpunum
)
1049 cpumask_t dontuse
, id
, map
, map2
, map3
, me
;
1051 mtx_assert(&sched_lock
, MA_OWNED
);
1053 CTR0(KTR_RUNQ
, "forward_wakeup()");
1055 if ((!forward_wakeup_enabled
) ||
1056 (forward_wakeup_use_mask
== 0 && forward_wakeup_use_loop
== 0))
1058 if (!smp_started
|| cold
|| panicstr
)
1061 forward_wakeups_requested
++;
1064 * Check the idle mask we received against what we calculated
1065 * before in the old version.
1067 me
= PCPU_GET(cpumask
);
1069 /* Don't bother if we should be doing it ourselves. */
1070 if ((me
& idle_cpus_mask
) && (cpunum
== NOCPU
|| me
== (1 << cpunum
)))
1073 dontuse
= me
| stopped_cpus
| hlt_cpus_mask
;
1075 if (forward_wakeup_use_loop
) {
1076 SLIST_FOREACH(pc
, &cpuhead
, pc_allcpu
) {
1077 id
= pc
->pc_cpumask
;
1078 if ((id
& dontuse
) == 0 &&
1079 pc
->pc_curthread
== pc
->pc_idlethread
) {
1085 if (forward_wakeup_use_mask
) {
1087 map
= idle_cpus_mask
& ~dontuse
;
1089 /* If they are both on, compare and use loop if different. */
1090 if (forward_wakeup_use_loop
) {
1092 printf("map (%02X) != map3 (%02X)\n", map
,
1101 /* If we only allow a specific CPU, then mask off all the others. */
1102 if (cpunum
!= NOCPU
) {
1103 KASSERT((cpunum
<= mp_maxcpus
),("forward_wakeup: bad cpunum."));
1104 map
&= (1 << cpunum
);
1106 /* Try to choose an idle die. */
1107 if (forward_wakeup_use_htt
) {
1108 map2
= (map
& (map
>> 1)) & 0x5555;
1114 /* Set only one bit. */
1115 if (forward_wakeup_use_single
) {
1116 map
= map
& ((~map
) + 1);
1120 forward_wakeups_delivered
++;
1121 ipi_selected(map
, IPI_AST
);
1124 if (cpunum
== NOCPU
)
1125 printf("forward_wakeup: Idle processor not found\n");
1130 kick_other_cpu(int pri
, int cpuid
)
1135 pcpu
= pcpu_find(cpuid
);
1136 if (idle_cpus_mask
& pcpu
->pc_cpumask
) {
1137 forward_wakeups_delivered
++;
1138 ipi_selected(pcpu
->pc_cpumask
, IPI_AST
);
1142 cpri
= pcpu
->pc_curthread
->td_priority
;
1146 #if defined(IPI_PREEMPTION) && defined(PREEMPTION)
1147 #if !defined(FULL_PREEMPTION)
1148 if (pri
<= PRI_MAX_ITHD
)
1149 #endif /* ! FULL_PREEMPTION */
1151 ipi_selected(pcpu
->pc_cpumask
, IPI_PREEMPT
);
1154 #endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */
1156 pcpu
->pc_curthread
->td_flags
|= TDF_NEEDRESCHED
;
1157 ipi_selected(pcpu
->pc_cpumask
, IPI_AST
);
1164 sched_pickcpu(struct thread
*td
)
1168 mtx_assert(&sched_lock
, MA_OWNED
);
1170 if (THREAD_CAN_SCHED(td
, td
->td_lastcpu
))
1171 best
= td
->td_lastcpu
;
1174 for (cpu
= 0; cpu
<= mp_maxid
; cpu
++) {
1175 if (CPU_ABSENT(cpu
))
1177 if (!THREAD_CAN_SCHED(td
, cpu
))
1182 else if (runq_length
[cpu
] < runq_length
[best
])
1185 KASSERT(best
!= NOCPU
, ("no valid CPUs"));
1192 sched_add(struct thread
*td
, int flags
)
1195 struct td_sched
*ts
;
1201 THREAD_LOCK_ASSERT(td
, MA_OWNED
);
1202 KASSERT((td
->td_inhibitors
== 0),
1203 ("sched_add: trying to run inhibited thread"));
1204 KASSERT((TD_CAN_RUN(td
) || TD_IS_RUNNING(td
)),
1205 ("sched_add: bad thread state"));
1206 KASSERT(td
->td_flags
& TDF_INMEM
,
1207 ("sched_add: thread swapped out"));
1208 CTR5(KTR_SCHED
, "sched_add: %p(%s) prio %d by %p(%s)",
1209 td
, td
->td_name
, td
->td_priority
, curthread
,
1210 curthread
->td_name
);
1213 * Now that the thread is moving to the run-queue, set the lock
1214 * to the scheduler's lock.
1216 if (td
->td_lock
!= &sched_lock
) {
1217 mtx_lock_spin(&sched_lock
);
1218 thread_lock_set(td
, &sched_lock
);
1222 if (td
->td_pinned
!= 0) {
1223 cpu
= td
->td_lastcpu
;
1224 ts
->ts_runq
= &runq_pcpu
[cpu
];
1227 "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts
, td
,
1229 } else if (td
->td_flags
& TDF_BOUND
) {
1230 /* Find CPU from bound runq. */
1231 KASSERT(SKE_RUNQ_PCPU(ts
),
1232 ("sched_add: bound td_sched not on cpu runq"));
1233 cpu
= ts
->ts_runq
- &runq_pcpu
[0];
1236 "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts
, td
,
1238 } else if (ts
->ts_flags
& TSF_AFFINITY
) {
1239 /* Find a valid CPU for our cpuset */
1240 cpu
= sched_pickcpu(td
);
1241 ts
->ts_runq
= &runq_pcpu
[cpu
];
1244 "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts
, td
,
1248 "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts
,
1251 ts
->ts_runq
= &runq
;
1254 if (single_cpu
&& (cpu
!= PCPU_GET(cpuid
))) {
1255 kick_other_cpu(td
->td_priority
, cpu
);
1258 cpumask_t me
= PCPU_GET(cpumask
);
1259 cpumask_t idle
= idle_cpus_mask
& me
;
1261 if (!idle
&& ((flags
& SRQ_INTR
) == 0) &&
1262 (idle_cpus_mask
& ~(hlt_cpus_mask
| me
)))
1263 forwarded
= forward_wakeup(cpu
);
1267 if ((flags
& SRQ_YIELDING
) == 0 && maybe_preempt(td
))
1274 if ((td
->td_proc
->p_flag
& P_NOLOAD
) == 0)
1276 runq_add(ts
->ts_runq
, td
, flags
);
1282 struct td_sched
*ts
;
1285 THREAD_LOCK_ASSERT(td
, MA_OWNED
);
1286 KASSERT((td
->td_inhibitors
== 0),
1287 ("sched_add: trying to run inhibited thread"));
1288 KASSERT((TD_CAN_RUN(td
) || TD_IS_RUNNING(td
)),
1289 ("sched_add: bad thread state"));
1290 KASSERT(td
->td_flags
& TDF_INMEM
,
1291 ("sched_add: thread swapped out"));
1292 CTR5(KTR_SCHED
, "sched_add: %p(%s) prio %d by %p(%s)",
1293 td
, td
->td_name
, td
->td_priority
, curthread
,
1294 curthread
->td_name
);
1297 * Now that the thread is moving to the run-queue, set the lock
1298 * to the scheduler's lock.
1300 if (td
->td_lock
!= &sched_lock
) {
1301 mtx_lock_spin(&sched_lock
);
1302 thread_lock_set(td
, &sched_lock
);
1305 CTR2(KTR_RUNQ
, "sched_add: adding td_sched:%p (td:%p) to runq", ts
, td
);
1306 ts
->ts_runq
= &runq
;
1309 * If we are yielding (on the way out anyhow) or the thread
1310 * being saved is US, then don't try be smart about preemption
1311 * or kicking off another CPU as it won't help and may hinder.
1312 * In the YIELDING case, we are about to run whoever is being
1313 * put in the queue anyhow, and in the OURSELF case, we are
1314 * putting ourselves on the run queue which also only happens
1315 * when we are about to yield.
1317 if ((flags
& SRQ_YIELDING
) == 0) {
1318 if (maybe_preempt(td
))
1321 if ((td
->td_proc
->p_flag
& P_NOLOAD
) == 0)
1323 runq_add(ts
->ts_runq
, td
, flags
);
/*
 * Remove a thread from the run queue it is currently on.
 *
 * Asserts that the thread is resident (TDF_INMEM set), that it is on a
 * run queue, and that sched_lock is owned, then emits a KTR_SCHED trace
 * record before handing the thread to runq_remove().
 */
1329 sched_rem(struct thread
*td
)
1331 struct td_sched
*ts
;
1334 KASSERT(td
->td_flags
& TDF_INMEM
,
1335 ("sched_rem: thread swapped out"));
1336 KASSERT(TD_ON_RUNQ(td
),
1337 ("sched_rem: thread not on run queue"));
1338 mtx_assert(&sched_lock
, MA_OWNED
);
1339 CTR5(KTR_SCHED
, "sched_rem: %p(%s) prio %d by %p(%s)",
1340 td
, td
->td_name
, td
->td_priority
, curthread
,
1341 curthread
->td_name
);
/*
 * NOTE(review): the statement guarded by this P_NOLOAD test is not
 * visible in this excerpt; presumably load accounting -- confirm.
 */
1343 if ((td
->td_proc
->p_flag
& P_NOLOAD
) == 0)
/*
 * For a queue other than the global runq, the offset of ts_runq from
 * runq_pcpu is used as the index of the runq_length[] counter that is
 * decremented.
 */
1346 if (ts
->ts_runq
!= &runq
)
1347 runq_length
[ts
->ts_runq
- runq_pcpu
]--;
/* Unlink the thread from whichever queue ts_runq names. */
1349 runq_remove(ts
->ts_runq
, td
);
1354 * Select threads to run. Note that running threads still consume a
/*
 * Pick the next thread to run; sched_lock must be owned.
 *
 * Two selection paths are visible below: one that looks at both the
 * global runq and the current CPU's runq_pcpu[] entry, and one that
 * consults only the global runq.
 * NOTE(review): the preprocessor/branch structure separating those paths
 * is not visible in this excerpt -- confirm against the full file.
 */
1363 mtx_assert(&sched_lock
, MA_OWNED
);
1365 struct thread
*tdcpu
;
/* One candidate from the global queue, one from this CPU's queue. */
1368 td
= runq_choose_fuzz(&runq
, runq_fuzz
);
1369 tdcpu
= runq_choose(&runq_pcpu
[PCPU_GET(cpuid
)]);
/*
 * The per-CPU candidate is compared against the global one by
 * td_priority (numerically smaller is tested with '<').
 * NOTE(review): the leading part of this condition is not visible here.
 */
1373 tdcpu
->td_priority
< td
->td_priority
)) {
1374 CTR2(KTR_RUNQ
, "choosing td %p from pcpu runq %d", tdcpu
,
1377 rq
= &runq_pcpu
[PCPU_GET(cpuid
)];
1379 CTR1(KTR_RUNQ
, "choosing td_sched %p from main runq", td
);
/* Selection path that uses only the global queue. */
1384 td
= runq_choose(&runq
);
/* Drop the length counter for this CPU's queue. */
1390 runq_length
[PCPU_GET(cpuid
)]--;
/* Dequeue the winner from the queue it was chosen from and mark it. */
1392 runq_remove(rq
, td
);
1393 td
->td_flags
|= TDF_DIDRUN
;
1395 KASSERT(td
->td_flags
& TDF_INMEM
,
1396 ("sched_choose: thread swapped out"));
/*
 * Return this CPU's idle thread.
 * NOTE(review): presumably the fallback when no runnable thread was
 * found; the guarding condition is not visible here.
 */
1399 return (PCPU_GET(idlethread
));
/*
 * Preempt the given thread.
 *
 * When td_critnest is greater than 1 the preemption is only recorded by
 * setting td_owepreempt; the other visible action is an involuntary
 * context switch via mi_switch() flagged as a preemption.
 * NOTE(review): the else/locking lines between these statements are not
 * visible in this excerpt -- confirm how the two actions are related.
 */
1403 sched_preempt(struct thread
*td
)
1406 if (td
->td_critnest
> 1)
1407 td
->td_owepreempt
= 1;
1409 mi_switch(SW_INVOL
| SW_PREEMPT
| SWT_PREEMPT
, NULL
);
/*
 * Reset a thread's priority on its way back to user mode.
 *
 * Asserts that the thread is not running with a borrowed priority
 * (TDF_BORROWING clear).  If td_priority differs from td_user_pri, both
 * td_priority and td_base_pri are set to td_user_pri.
 */
1414 sched_userret(struct thread
*td
)
1417 * XXX we cheat slightly on the locking here to avoid locking in
1418 * the usual case. Setting td_priority here is essentially an
1419 * incomplete workaround for not setting it properly elsewhere.
1420 * Now that some interrupt handlers are threads, not setting it
1421 * properly elsewhere can clobber it in the window between setting
1422 * it here and returning to user mode, so don't waste time setting
1423 * it perfectly here.
1425 KASSERT((td
->td_flags
& TDF_BORROWING
) == 0,
1426 ("thread with borrowed priority returning to userland"));
/* Unlocked comparison: the common case of equal priorities does no work. */
1427 if (td
->td_priority
!= td
->td_user_pri
) {
/*
 * NOTE(review): any locking taken around these two stores is not
 * visible in this excerpt -- confirm.
 */
1429 td
->td_priority
= td
->td_user_pri
;
1430 td
->td_base_pri
= td
->td_user_pri
;
/*
 * Bind a running thread to the CPU numbered 'cpu'.
 *
 * The thread lock must be owned and the thread must be running.  The
 * thread is flagged TDF_BOUND and its ts_runq is pointed at the
 * runq_pcpu[] entry for 'cpu'.
 */
1436 sched_bind(struct thread
*td
, int cpu
)
1438 struct td_sched
*ts
;
1440 THREAD_LOCK_ASSERT(td
, MA_OWNED
);
1441 KASSERT(TD_IS_RUNNING(td
),
1442 ("sched_bind: cannot bind non-running thread"));
1446 td
->td_flags
|= TDF_BOUND
;
1448 ts
->ts_runq
= &runq_pcpu
[cpu
];
/*
 * NOTE(review): the statement taken when the current CPU already equals
 * 'cpu' is not visible here; presumably an early return so the switch
 * below is skipped -- confirm.
 */
1449 if (PCPU_GET(cpuid
) == cpu
)
/* Voluntarily switch away from the current CPU. */
1452 mi_switch(SW_VOL
, NULL
);
/*
 * Clear the TDF_BOUND flag on a thread.  The thread lock must be owned.
 */
1457 sched_unbind(struct thread
* td
)
1459 THREAD_LOCK_ASSERT(td
, MA_OWNED
);
1460 td
->td_flags
&= ~TDF_BOUND
;
/*
 * Report whether a thread has TDF_BOUND set.  The thread lock must be
 * owned.  The result is the masked flag bit itself (non-zero when set,
 * zero otherwise), not a value normalised to 1.
 */
1464 sched_is_bound(struct thread
*td
)
1466 THREAD_LOCK_ASSERT(td
, MA_OWNED
);
1467 return (td
->td_flags
& TDF_BOUND
);
/*
 * Give up the CPU: perform a voluntary context switch tagged
 * SWT_RELINQUISH.
 * NOTE(review): any locking around the mi_switch() call is not visible
 * in this excerpt -- confirm.
 */
1471 sched_relinquish(struct thread
*td
)
1474 mi_switch(SW_VOL
| SWT_RELINQUISH
, NULL
);
1481 return (sched_tdcnt
);
/*
 * Return the size in bytes of struct proc; no scheduler-private space
 * is added to it here.
 */
1485 sched_sizeof_proc(void)
1487 return (sizeof(struct proc
));
/*
 * Return the combined size in bytes of struct thread and the
 * scheduler's struct td_sched.
 */
1491 sched_sizeof_thread(void)
1493 return (sizeof(struct thread
) + sizeof(struct td_sched
));
/*
 * Return the ts_pctcpu value stored in a thread's scheduler data.
 * NOTE(review): the assignment of 'ts' is not visible in this excerpt;
 * presumably it is the td_sched belonging to 'td' -- confirm.
 */
1497 sched_pctcpu(struct thread
*td
)
1499 struct td_sched
*ts
;
1502 return (ts
->ts_pctcpu
);
1511 * The actual idle process.
/*
 * Body of the idle thread.  'dummy' is not referenced in the visible
 * code.
 * NOTE(review): the enclosing loop and the body of the wait loop are not
 * visible in this excerpt -- confirm.
 */
1514 sched_idletd(void *dummy
)
/* The idle thread must never be holding Giant. */
1518 mtx_assert(&Giant
, MA_NOTOWNED
);
/* Wait for as long as sched_runnable() reports nothing to run. */
1520 while (sched_runnable() == 0)
/* Work is available: switch away voluntarily under sched_lock. */
1523 mtx_lock_spin(&sched_lock
);
1524 mi_switch(SW_VOL
| SWT_IDLE
, NULL
);
1525 mtx_unlock_spin(&sched_lock
);
1530 * A CPU is entering for the first time or a thread is exiting.
/*
 * Discard the current context and jump straight into the next thread
 * chosen by choosethread().  Does not return.
 * NOTE(review): the conditions selecting between the lock-acquire path
 * and the lock-profile/MPASS path below are not visible in this excerpt
 * -- confirm.
 */
1533 sched_throw(struct thread
*td
)
1536 * Correct spinlock nesting. The idle thread context that we are
1537 * borrowing was created so that it would start out with a single
1538 * spin lock (sched_lock) held in fork_trampoline(). Since we've
1539 * explicitly acquired locks in this function, the nesting count
1540 * is now 2 rather than 1. Since we are nested, calling
1541 * spinlock_exit() will simply adjust the counts without allowing
1542 * spin lock using code to interrupt us.
1545 mtx_lock_spin(&sched_lock
);
1548 lock_profile_release_lock(&sched_lock
.lock_object
);
/* The departing thread must already be protected by sched_lock. */
1549 MPASS(td
->td_lock
== &sched_lock
);
/* sched_lock is held with a spinlock nesting count of exactly one. */
1551 mtx_assert(&sched_lock
, MA_OWNED
);
1552 KASSERT(curthread
->td_md
.md_spinlock_count
== 1, ("invalid count"));
/* Record the per-CPU switch timestamps before handing over the CPU. */
1553 PCPU_SET(switchtime
, cpu_ticks());
1554 PCPU_SET(switchticks
, ticks
);
1555 cpu_throw(td
, choosethread()); /* doesn't return */
/*
 * Finish scheduler-side setup for a thread that is starting to run.
 *
 * Records the current CPU in td_oncpu, writes the thread pointer into
 * sched_lock's mtx_lock word so the thread is the lock's owner, notifies
 * lock profiling of the acquisition, and asserts that the thread lock is
 * owned and not recursed.
 */
1559 sched_fork_exit(struct thread
*td
)
1563 * Finish setting up thread glue so that it begins execution in a
1564 * non-nested critical section with sched_lock held but not recursed.
1566 td
->td_oncpu
= PCPU_GET(cpuid
);
/* Ownership is assigned by a direct store, not by a lock operation. */
1567 sched_lock
.mtx_lock
= (uintptr_t)td
;
1568 lock_profile_obtain_lock_success(&sched_lock
.lock_object
,
1569 0, 0, __FILE__
, __LINE__
);
1570 THREAD_LOCK_ASSERT(td
, MA_OWNED
| MA_NOTRECURSED
);
/*
 * Re-evaluate a thread's CPU affinity; the thread lock must be owned.
 *
 * TSF_AFFINITY is cleared and then set again if a scan of CPU ids
 * 0..mp_maxid finds one the thread cannot be scheduled on.  Threads that
 * end up without the flag, or that are pinned or bound, get no further
 * handling.  Otherwise the action depends on td_state: one case
 * re-queues the thread with sched_add(), another flags the thread
 * TDF_NEEDRESCHED and may send an IPI_AST.
 * NOTE(review): case labels, early returns and loop exits are not
 * visible in this excerpt -- confirm the exact control flow.
 */
1574 sched_affinity(struct thread
*td
)
1577 struct td_sched
*ts
;
1580 THREAD_LOCK_ASSERT(td
, MA_OWNED
);
1583 * Set the TSF_AFFINITY flag if there is at least one CPU this
1584 * thread can't run on.
1587 ts
->ts_flags
&= ~TSF_AFFINITY
;
1588 for (cpu
= 0; cpu
<= mp_maxid
; cpu
++) {
1589 if (CPU_ABSENT(cpu
))
1591 if (!THREAD_CAN_SCHED(td
, cpu
)) {
1592 ts
->ts_flags
|= TSF_AFFINITY
;
1598 * If this thread can run on all CPUs, nothing else to do.
1600 if (!(ts
->ts_flags
& TSF_AFFINITY
))
1603 /* Pinned threads and bound threads should be left alone. */
1604 if (td
->td_pinned
!= 0 || td
->td_flags
& TDF_BOUND
)
1607 switch (td
->td_state
) {
1610 * If we are on a per-CPU runqueue that is in the set,
1611 * then nothing needs to be done.
1613 if (ts
->ts_runq
!= &runq
&&
1614 THREAD_CAN_SCHED(td
, ts
->ts_runq
- runq_pcpu
))
1617 /* Put this thread on a valid per-CPU runqueue. */
1619 sched_add(td
, SRQ_BORING
);
1623 * See if our current CPU is in the set. If not, force a
1626 if (THREAD_CAN_SCHED(td
, td
->td_oncpu
))
1629 td
->td_flags
|= TDF_NEEDRESCHED
;
/*
 * NOTE(review): the IPI mask is built from 'cpu', i.e. whatever value
 * the scan loop above left behind, whereas the check just before this
 * tests td->td_oncpu.  Looks like the target should be the CPU the
 * thread is running on -- confirm.
 */
1630 if (td
!= curthread
)
1631 ipi_selected(1 << cpu
, IPI_AST
);