2 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * $FreeBSD: src/sys/kern/kern_switch.c,v 1.3.2.1 2000/05/16 06:58:12 dillon Exp $
27 * $DragonFly: src/sys/kern/Attic/kern_switch.c,v 1.23 2004/07/24 20:37:04 dillon Exp $
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/kernel.h>
34 #include <sys/queue.h>
36 #include <sys/rtprio.h>
37 #include <sys/thread2.h>
39 #include <sys/sysctl.h>
40 #include <sys/resourcevar.h>
41 #include <machine/ipl.h>
42 #include <machine/cpu.h>
43 #include <machine/smp.h>
46 * debugging only YYY Remove me! define to schedule user processes only
47 * on the BSP. Interrupts can still be taken on the APs.
49 #undef ONLY_ONE_USER_CPU
52 * We have NQS (32) run queues per scheduling class. For the normal
53 * class, there are 128 priorities scaled onto these 32 queues. New
54 * processes are added to the last entry in each queue, and processes
55 * are selected for running by taking them from the head and maintaining
56 * a simple FIFO arrangement. Realtime and Idle priority processes have
57 * and explicit 0-31 priority which maps directly onto their class queue
58 * index. When a queue has something in it, the corresponding bit is
59 * set in the queuebits variable, allowing a single read to determine
60 * the state of all 32 queues and then a ffs() to find the first busy
63 static struct rq queues
[NQS
];
64 static struct rq rtqueues
[NQS
];
65 static struct rq idqueues
[NQS
];
66 static u_int32_t queuebits
;
67 static u_int32_t rtqueuebits
;
68 static u_int32_t idqueuebits
;
69 static cpumask_t curprocmask
= -1; /* currently running a user process */
70 static cpumask_t rdyprocmask
; /* ready to accept a user process */
76 SYSCTL_INT(_debug
, OID_AUTO
, runqcount
, CTLFLAG_RD
, &runqcount
, 0, "");
78 static int usched_nonoptimal
;
79 SYSCTL_INT(_debug
, OID_AUTO
, usched_nonoptimal
, CTLFLAG_RW
,
80 &usched_nonoptimal
, 0, "acquire_curproc() was not optimal");
81 static int usched_optimal
;
82 SYSCTL_INT(_debug
, OID_AUTO
, usched_optimal
, CTLFLAG_RW
,
83 &usched_optimal
, 0, "acquire_curproc() was optimal");
85 static int usched_debug
;
86 SYSCTL_INT(_debug
, OID_AUTO
, scdebug
, CTLFLAG_RW
, &usched_debug
, 0, "");
88 static int remote_resched
= 1;
89 static int remote_resched_nonaffinity
;
90 static int remote_resched_affinity
;
91 static int choose_affinity
;
92 SYSCTL_INT(_debug
, OID_AUTO
, remote_resched
, CTLFLAG_RW
,
93 &remote_resched
, 0, "Resched to another cpu");
94 SYSCTL_INT(_debug
, OID_AUTO
, remote_resched_nonaffinity
, CTLFLAG_RD
,
95 &remote_resched_nonaffinity
, 0, "Number of remote rescheds");
96 SYSCTL_INT(_debug
, OID_AUTO
, remote_resched_affinity
, CTLFLAG_RD
,
97 &remote_resched_affinity
, 0, "Number of remote rescheds");
98 SYSCTL_INT(_debug
, OID_AUTO
, choose_affinity
, CTLFLAG_RD
,
99 &choose_affinity
, 0, "chooseproc() was smart");
103 * Initialize the run queues at boot time.
110 for (i
= 0; i
< NQS
; i
++) {
111 TAILQ_INIT(&queues
[i
]);
112 TAILQ_INIT(&rtqueues
[i
]);
113 TAILQ_INIT(&idqueues
[i
]);
117 SYSINIT(runqueue
, SI_SUB_RUN_QUEUE
, SI_ORDER_FIRST
, rqinit
, NULL
)
120 * chooseproc() is called when a cpu needs a user process to LWKT schedule,
121 * it selects a user process and returns it. If chkp is non-NULL and chkp
122 * has a better or equal then the process that would otherwise be
123 * chosen, NULL is returned.
125 * Until we fix the RUNQ code the chkp test has to be strict or we may
126 * bounce between processes trying to acquire the current process designation.
130 chooseproc(struct proc
*chkp
)
138 pri
= bsfl(rtqueuebits
);
140 which
= &rtqueuebits
;
141 } else if (queuebits
) {
142 pri
= bsfl(queuebits
);
145 } else if (idqueuebits
) {
146 pri
= bsfl(idqueuebits
);
148 which
= &idqueuebits
;
153 KASSERT(p
, ("chooseproc: no proc on busy queue"));
156 * If the passed process <chkp> is reasonably close to the selected
157 * processed <p>, return NULL (indicating that <chkp> should be kept).
159 * Note that we must error on the side of <chkp> to avoid bouncing
160 * between threads in the acquire code.
163 if (chkp
->p_priority
< p
->p_priority
+ PPQ
)
169 * If the chosen process does not reside on this cpu spend a few
170 * cycles looking for a better candidate at the same priority level.
171 * This is a fallback check, setrunqueue() tries to wakeup the
172 * correct cpu and is our front-line affinity.
174 if (p
->p_thread
->td_gd
!= mycpu
&&
175 (chkp
= TAILQ_NEXT(p
, p_procq
)) != NULL
177 if (chkp
->p_thread
->td_gd
== mycpu
) {
184 TAILQ_REMOVE(q
, p
, p_procq
);
187 *which
&= ~(1 << pri
);
188 KASSERT((p
->p_flag
& P_ONRUNQ
) != 0, ("not on runq6!"));
189 p
->p_flag
&= ~P_ONRUNQ
;
195 * called via an ipi message to reschedule on another cpu.
199 need_user_resched_remote(void *dummy
)
207 * setrunqueue() 'wakes up' a 'user' process. GIANT must be held. The
208 * user process may represent any user process, including the current
211 * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
212 * cpus in an attempt to keep the process on the current cpu at least for
213 * a little while to take advantage of locality of reference (e.g. fork/exec
214 * or short fork/exit, and uio_yield()).
216 * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
217 * or (user level) preempt on the same cpu that a process was previously
218 * scheduled to. If we cannot do this but we are at enough of a higher
219 * priority then the processes running on other cpus, we will allow the
220 * process to be stolen by another cpu.
222 * WARNING! a thread can be acquired by another cpu the moment it is put
223 * on the user scheduler's run queue AND we release the MP lock. Since we
224 * release the MP lock before switching out another cpu may begin stealing
225 * our current thread before we are completely switched out! The
226 * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
227 * thread before stealing it.
229 * The associated thread must NOT be scheduled.
230 * The process must be runnable.
231 * This must be called at splhigh().
234 setrunqueue(struct proc
*p
)
237 struct globaldata
*gd
;
246 KASSERT(p
->p_stat
== SRUN
, ("setrunqueue: proc not SRUN"));
247 KASSERT((p
->p_flag
& P_ONRUNQ
) == 0,
248 ("process %d already on runq! flag %08x", p
->p_pid
, p
->p_flag
));
249 KKASSERT((p
->p_thread
->td_flags
& TDF_RUNQ
) == 0);
252 * Note: gd is the gd of the TARGET thread's cpu, not our cpu.
254 gd
= p
->p_thread
->td_gd
;
257 * We have not been released, make sure that we are not the currently
258 * designated process.
260 KKASSERT(gd
->gd_uschedcp
!= p
);
263 * Check cpu affinity. The associated thread is stable at the
264 * moment. Note that we may be checking another cpu here so we
265 * have to be careful. We are currently protected by the BGL.
267 * This allows us to avoid actually queueing the process.
268 * acquire_curproc() will handle any threads we mistakenly schedule.
270 cpuid
= gd
->gd_cpuid
;
272 if ((curprocmask
& (1 << cpuid
)) == 0) {
273 curprocmask
|= 1 << cpuid
;
276 printf("F%-7d", gd
->gd_uschedcp
->p_pid
);
277 gd
->gd_upri
= p
->p_priority
;
278 lwkt_schedule(p
->p_thread
);
279 /* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */
283 ++remote_resched_affinity
;
289 * gd and cpuid may still 'hint' at another cpu. Even so we have
290 * to place this process on the userland scheduler's run queue for
291 * action by the target cpu.
294 p
->p_flag
|= P_ONRUNQ
;
295 if (p
->p_rtprio
.type
== RTP_PRIO_NORMAL
) {
296 pri
= (p
->p_priority
& PRIMASK
) >> 2;
298 queuebits
|= 1 << pri
;
299 } else if (p
->p_rtprio
.type
== RTP_PRIO_REALTIME
||
300 p
->p_rtprio
.type
== RTP_PRIO_FIFO
) {
301 pri
= (u_int8_t
)p
->p_rtprio
.prio
;
303 rtqueuebits
|= 1 << pri
;
304 } else if (p
->p_rtprio
.type
== RTP_PRIO_IDLE
) {
305 pri
= (u_int8_t
)p
->p_rtprio
.prio
;
307 idqueuebits
|= 1 << pri
;
309 panic("setrunqueue: invalid rtprio type");
312 p
->p_rqindex
= pri
; /* remember the queue index */
313 TAILQ_INSERT_TAIL(q
, p
, p_procq
);
317 * Either wakeup other cpus user thread scheduler or request
318 * preemption on other cpus (which will also wakeup a HLT).
320 * NOTE! gd and cpuid may still be our 'hint', not our current
327 * Check cpu affinity for user preemption (when the curprocmask bit
328 * is set). Note that gd_upri is a speculative field (we modify
329 * another cpu's gd_upri to avoid sending ipiq storms).
332 if ((p
->p_thread
->td_flags
& TDF_NORESCHED
) == 0 &&
333 p
->p_priority
< gd
->gd_upri
- PPQ
) {
334 gd
->gd_upri
= p
->p_priority
;
338 } else if (remote_resched
) {
339 if (p
->p_priority
< gd
->gd_upri
- PPQ
) {
340 gd
->gd_upri
= p
->p_priority
;
341 lwkt_send_ipiq(gd
, need_user_resched_remote
, NULL
);
343 ++remote_resched_affinity
;
348 * No affinity, first schedule to any cpus that do not have a current
349 * process. If there is a free cpu we always schedule to it.
352 (mask
= ~curprocmask
& rdyprocmask
& mycpu
->gd_other_cpus
) != 0 &&
353 (p
->p_flag
& P_PASSIVE_ACQ
) == 0) {
355 printf("PROC %d nocpu to schedule it on\n", p
->p_pid
);
356 while (mask
&& count
) {
358 KKASSERT((curprocmask
& (1 << cpuid
)) == 0);
359 rdyprocmask
&= ~(1 << cpuid
);
360 lwkt_schedule(&globaldata_find(cpuid
)->gd_schedthread
);
362 mask
&= ~(1 << cpuid
);
367 * If there are still runnable processes try to wakeup a random
368 * cpu that is running a much lower priority process in order to
369 * preempt on it. Note that gd_upri is only a hint, so we can
370 * overwrite it from the wrong cpu. If we can't find one, we
373 * We depress the priority check so multiple cpu bound programs
374 * do not bounce between cpus. Remember that the clock interrupt
375 * will also cause all cpus to reschedule.
377 * We must mask against rdyprocmask or we will race in the boot
378 * code (before all cpus have working scheduler helpers), plus
379 * some cpus might not be operational and/or not configured to
380 * handle user processes.
382 if (count
&& remote_resched
&& ncpus
> 1) {
385 if (++cpuid
== ncpus
)
387 } while (cpuid
== mycpu
->gd_cpuid
);
390 if (rdyprocmask
& (1 << cpuid
)) {
391 gd
= globaldata_find(cpuid
);
393 if (p
->p_priority
< gd
->gd_upri
- PPQ
) {
394 gd
->gd_upri
= p
->p_priority
;
395 lwkt_send_ipiq(gd
, need_user_resched_remote
, NULL
);
396 ++remote_resched_nonaffinity
;
401 if ((p
->p_thread
->td_flags
& TDF_NORESCHED
) == 0 &&
402 p
->p_priority
< gd
->gd_upri
- PPQ
) {
403 gd
->gd_upri
= p
->p_priority
;
411 * remrunqueue() removes a given process from the run queue that it is on,
412 * clearing the queue busy bit if it becomes empty. This function is called
413 * when a userland process is selected for LWKT scheduling. Note that
414 * LWKT scheduling is an abstraction of 'curproc'.. there could very well be
415 * several userland processes whos threads are scheduled or otherwise in
416 * a special state, and such processes are NOT on the userland scheduler's
419 * This must be called at splhigh().
422 remrunqueue(struct proc
*p
)
429 KASSERT((p
->p_flag
& P_ONRUNQ
) != 0, ("not on runq4!"));
430 p
->p_flag
&= ~P_ONRUNQ
;
432 KKASSERT(runqcount
>= 0);
434 if (p
->p_rtprio
.type
== RTP_PRIO_NORMAL
) {
437 } else if (p
->p_rtprio
.type
== RTP_PRIO_REALTIME
||
438 p
->p_rtprio
.type
== RTP_PRIO_FIFO
) {
440 which
= &rtqueuebits
;
441 } else if (p
->p_rtprio
.type
== RTP_PRIO_IDLE
) {
443 which
= &idqueuebits
;
445 panic("remrunqueue: invalid rtprio type");
447 TAILQ_REMOVE(q
, p
, p_procq
);
448 if (TAILQ_EMPTY(q
)) {
449 KASSERT((*which
& (1 << pri
)) != 0,
450 ("remrunqueue: remove from empty queue"));
451 *which
&= ~(1 << pri
);
457 * Release the current process designation on p. P MUST BE CURPROC.
458 * Attempt to assign a new current process from the run queue.
460 * This function is called from exit1(), tsleep(), and the passive
461 * release code setup in <arch>/<arch>/trap.c
463 * If we do not have or cannot get the MP lock we just wakeup the userland
464 * helper scheduler thread for this cpu to do the work for us.
466 * WARNING! The MP lock may be in an unsynchronized state due to the
467 * way get_mplock() works and the fact that this function may be called
468 * from a passive release during a lwkt_switch(). try_mplock() will deal
469 * with this for us but you should be aware that td_mpcount may not be
473 release_curproc(struct proc
*p
)
476 globaldata_t gd
= mycpu
;
478 #ifdef ONLY_ONE_USER_CPU
479 KKASSERT(gd
->gd_cpuid
== 0 && p
->p_thread
->td_gd
== gd
);
481 KKASSERT(p
->p_thread
->td_gd
== gd
);
485 printf("c%-7d", p
->p_pid
);
487 cpuid
= gd
->gd_cpuid
;
489 if (gd
->gd_uschedcp
== p
) {
492 * YYY when the MP lock is not assumed (see else) we
493 * will have to check that gd_uschedcp is still == p
494 * after acquisition of the MP lock
496 gd
->gd_uschedcp
= NULL
;
497 gd
->gd_upri
= PRIBASE_NULL
;
501 KKASSERT(0); /* MP LOCK ALWAYS HELD AT THE MOMENT */
502 gd
->gd_uschedcp
= NULL
;
503 gd
->gd_upri
= PRIBASE_NULL
;
504 /* YYY uschedcp and curprocmask */
505 if (runqcount
&& (rdyprocmask
& (1 << cpuid
))) {
506 rdyprocmask
&= ~(1 << cpuid
);
507 lwkt_schedule(&mycpu
->gd_schedthread
);
515 * Select a new current process, potentially retaining gd_uschedcp. However,
516 * be sure to round-robin. This routine is generally only called if a
517 * reschedule is requested and that typically only occurs if a new process
518 * has a better priority or when we are round-robining.
520 * NOTE: Must be called with giant held and the current cpu's gd.
521 * NOTE: The caller must handle the situation where it loses a
522 * uschedcp designation that it previously held, typically by
523 * calling acquire_curproc() again.
524 * NOTE: May not block
527 select_curproc(globaldata_t gd
)
530 int cpuid
= gd
->gd_cpuid
;
533 clear_user_resched();
536 * Choose the next designated current user process.
537 * Note that we cannot schedule gd_schedthread
538 * if runqcount is 0 without creating a scheduling
541 * We do not clear the user resched request here,
542 * we need to test it later when we re-acquire.
544 * NOTE: chooseproc returns NULL if the chosen proc
545 * is gd_uschedcp. XXX needs cleanup.
547 old
= gd
->gd_uschedcp
;
548 if ((np
= chooseproc(gd
->gd_uschedcp
)) != NULL
) {
549 curprocmask
|= 1 << cpuid
;
550 gd
->gd_upri
= np
->p_priority
;
551 gd
->gd_uschedcp
= np
;
553 printf("A%-7d[%p,%p]", gd
->gd_uschedcp
->p_pid
, old
, np
);
555 lwkt_acquire(np
->p_thread
);
556 lwkt_schedule(np
->p_thread
);
557 } else if (gd
->gd_uschedcp
) {
558 gd
->gd_upri
= gd
->gd_uschedcp
->p_priority
;
559 KKASSERT(curprocmask
& (1 << cpuid
));
560 } else if (runqcount
&& (rdyprocmask
& (1 << cpuid
))) {
561 /*gd->gd_uschedcp = NULL;*/
562 curprocmask
&= ~(1 << cpuid
);
563 rdyprocmask
&= ~(1 << cpuid
);
564 lwkt_schedule(&gd
->gd_schedthread
);
566 /*gd->gd_uschedcp = NULL;*/
567 curprocmask
&= ~(1 << cpuid
);
572 * Acquire the current process designation on the CURRENT process only.
573 * This function is called at kernel-user priority (not userland priority)
574 * when curproc does not match gd_uschedcp.
577 acquire_curproc(struct proc
*p
)
579 globaldata_t gd
= mycpu
;
581 #ifdef ONLY_ONE_USER_CPU
582 KKASSERT(gd
->gd_cpuid
== 0);
586 * Loop until we become the current process.
589 ++p
->p_stats
->p_ru
.ru_nivcsw
;
591 KKASSERT(p
== gd
->gd_curthread
->td_proc
);
593 lwkt_deschedule_self(gd
->gd_curthread
);
600 * WE MAY HAVE BEEN MIGRATED TO ANOTHER CPU, RELOAD GD.
603 } while (gd
->gd_uschedcp
!= p
);
607 * That's it. Cleanup, we are done. The caller can return to
610 KKASSERT((p
->p_flag
& P_ONRUNQ
) == 0);
614 * Yield / synchronous reschedule. This is a bit tricky because the trap
615 * code might have set a lazy release on the switch function. Setting
616 * P_PASSIVE_ACQ will ensure that the lazy release executes when we call
617 * switch, and that we are given a greater chance of affinity with our
620 * We call lwkt_setpri_self() to rotate our thread to the end of the lwkt
621 * run queue. lwkt_switch() will also execute any assigned passive release
622 * (which usually calls release_curproc()), allowing a same/higher priority
623 * process to be designated as the current process.
625 * While it is possible for a lower priority process to be designated,
626 * it's call to lwkt_maybe_switch() in acquire_curproc() will likely
627 * round-robin back to us and we will be able to re-acquire the current
628 * process designation.
633 struct thread
*td
= curthread
;
634 struct proc
*p
= td
->td_proc
;
636 lwkt_setpri_self(td
->td_pri
& TDPRI_MASK
);
638 p
->p_flag
|= P_PASSIVE_ACQ
;
640 p
->p_flag
&= ~P_PASSIVE_ACQ
;
649 * For SMP systems a user scheduler helper thread is created for each
650 * cpu and is used to allow one cpu to wakeup another for the purposes of
651 * scheduling userland threads from setrunqueue(). UP systems do not
652 * need the helper since there is only one cpu. We can't use the idle
653 * thread for this because we need to hold the MP lock. Additionally,
654 * doing things this way allows us to HLT idle cpus on MP systems.
657 sched_thread(void *dummy
)
659 globaldata_t gd
= mycpu
;
660 int cpuid
= gd
->gd_cpuid
; /* doesn't change */
661 u_int32_t cpumask
= 1 << cpuid
; /* doesn't change */
663 #ifdef ONLY_ONE_USER_CPU
664 KKASSERT(cpuid
== 0);
667 get_mplock(); /* hold the MP lock */
671 lwkt_deschedule_self(gd
->gd_curthread
); /* interlock */
672 rdyprocmask
|= cpumask
;
673 crit_enter_quick(gd
->gd_curthread
);
674 if ((curprocmask
& cpumask
) == 0 && (np
= chooseproc(NULL
)) != NULL
) {
675 curprocmask
|= cpumask
;
676 gd
->gd_upri
= np
->p_priority
;
677 gd
->gd_uschedcp
= np
;
679 printf("E%-7d", gd
->gd_uschedcp
->p_pid
);
680 lwkt_acquire(np
->p_thread
);
681 lwkt_schedule(np
->p_thread
);
683 crit_exit_quick(gd
->gd_curthread
);
689 * Setup our scheduler helpers. Note that curprocmask bit 0 has already
690 * been cleared by rqinit() and we should not mess with it further.
693 sched_thread_cpu_init(void)
698 printf("start scheduler helpers on cpus:");
700 for (i
= 0; i
< ncpus
; ++i
) {
701 globaldata_t dgd
= globaldata_find(i
);
702 cpumask_t mask
= 1 << i
;
704 if ((mask
& smp_active_mask
) == 0)
710 lwkt_create(sched_thread
, NULL
, NULL
, &dgd
->gd_schedthread
,
711 TDF_STOPREQ
, i
, "usched %d", i
);
712 #ifdef ONLY_ONE_USER_CPU
714 curprocmask
|= mask
; /* DISABLE USER PROCS */
717 curprocmask
&= ~mask
; /* schedule user proc on cpu */
724 SYSINIT(uschedtd
, SI_SUB_FINISH_SMP
, SI_ORDER_ANY
, sched_thread_cpu_init
, NULL
)