/*
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.7 2005/12/23 10:08:14 dillon Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <machine/ipl.h>
#include <machine/cpu.h>
#include <machine/smp.h>
/*
 * Priorities.  Note that with 32 run queues per scheduler each queue
 * represents four priority levels.
 */

#define MAXPRI			128
#define PRIMASK			(MAXPRI - 1)
#define PRIBASE_REALTIME	0
#define PRIBASE_NORMAL		MAXPRI
#define PRIBASE_IDLE		(MAXPRI * 2)
#define PRIBASE_THREAD		(MAXPRI * 3)
#define PRIBASE_NULL		(MAXPRI * 4)

#define NQS	32			/* 32 run queues. */
#define PPQ	(MAXPRI / NQS)		/* priorities per queue */
/*
 * NICEPPQ	- number of nice units per priority queue
 * ESTCPURAMP	- number of scheduler ticks for estcpu to switch queues
 *
 * ESTCPUPPQ	- number of estcpu units per priority queue
 * ESTCPUMAX	- number of estcpu units
 * ESTCPUINCR	- amount we have to increment p_estcpu per scheduling tick at
 *		  100% cpu to ramp up by one queue in ESTCPURAMP ticks.
 */
#define ESTCPUMAX	(ESTCPUPPQ * NQS)
#define ESTCPUINCR	(ESTCPUPPQ / ESTCPURAMP)
#define PRIO_RANGE	(PRIO_MAX - PRIO_MIN + 1)

#define ESTCPULIM(v)	min((v), ESTCPUMAX)
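
/*
 * Illustrative note (editorial, not part of the original source): at 100%
 * cpu the scheduler clock adds ESTCPUINCR to lwp_estcpu each tick and clamps
 * it with ESTCPULIM(), e.g.
 *
 *	estcpu = ESTCPULIM(estcpu + ESTCPUINCR);
 *
 * After ESTCPURAMP such ticks estcpu has grown by ESTCPUPPQ, which is enough
 * to push the lwp down one of the NQS run queues; it can never exceed
 * ESTCPUMAX.
 */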
#define lwp_priority	lwp_usdata.bsd4.priority
#define lwp_rqindex	lwp_usdata.bsd4.rqindex
#define lwp_origcpu	lwp_usdata.bsd4.origcpu
#define lwp_estcpu	lwp_usdata.bsd4.estcpu
static void bsd4_acquire_curproc(struct lwp *lp);
static void bsd4_release_curproc(struct lwp *lp);
static void bsd4_select_curproc(globaldata_t gd);
static void bsd4_setrunqueue(struct lwp *lp);
static void bsd4_remrunqueue(struct lwp *lp);
static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
				sysclock_t cpstamp);
static void bsd4_resetpriority(struct lwp *lp);
static void bsd4_forking(struct lwp *plp, struct lwp *lp);
static void bsd4_exiting(struct lwp *plp, struct lwp *lp);
static void bsd4_recalculate_estcpu(struct lwp *lp);
struct usched usched_bsd4 = {
	{ NULL },
	"bsd4", "Original DragonFly Scheduler",
	NULL,			/* default registration */
	NULL,			/* default deregistration */
	bsd4_acquire_curproc,
	bsd4_release_curproc,
	bsd4_select_curproc,
	bsd4_setrunqueue,
	bsd4_remrunqueue,
	bsd4_schedulerclock,
	bsd4_recalculate_estcpu,
	bsd4_resetpriority,
	bsd4_forking,
	bsd4_exiting,
	NULL			/* setcpumask not supported */
};
/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then a ffs() to find the first busy
 * queue.
 */
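
/*
 * Illustrative sketch (editorial, not part of the original source): how a
 * normal-class priority is folded into a queue index and how the bitmask
 * locates the best non-empty queue.
 *
 *	pri = (lp->lwp_priority & PRIMASK) / PPQ;	queue index 0..31
 *	queuebits |= 1 << pri;				queue is now non-empty
 *	...
 *	pri = bsfl(queuebits);				lowest set bit == most
 *							urgent busy queue
 *
 * bsfl() is the bit-scan-forward helper used by chooseproc() below; lower
 * queue indices correspond to better priorities.
 */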
static struct rq queues[NQS];
static struct rq rtqueues[NQS];
static struct rq idqueues[NQS];
static u_int32_t queuebits;
static u_int32_t rtqueuebits;
static u_int32_t idqueuebits;
static cpumask_t curprocmask = -1;	/* currently running a user process */
static cpumask_t rdyprocmask;		/* ready to accept a user process */
static int	 runqcount;
SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, "");

static int usched_nonoptimal;
SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
	   &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
static int usched_optimal;
SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
	   &usched_optimal, 0, "acquire_curproc() was optimal");

static int usched_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, "");

static int remote_resched = 1;
static int remote_resched_nonaffinity;
static int remote_resched_affinity;
static int choose_affinity;
SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW,
	   &remote_resched, 0, "Resched to another cpu");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
	   &remote_resched_nonaffinity, 0, "Number of remote rescheds");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
	   &remote_resched_affinity, 0, "Number of remote rescheds");
SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
	   &choose_affinity, 0, "chooseproc() was smart");

static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
	   &usched_bsd4_rrinterval, 0, "");
static int usched_bsd4_decay = ESTCPUINCR / 2;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
	   &usched_bsd4_decay, 0, "");
/*
 * Initialize the run queues at boot time.
 */
static void
rqinit(void *dummy)
{
	int i;

	for (i = 0; i < NQS; i++) {
		TAILQ_INIT(&queues[i]);
		TAILQ_INIT(&rtqueues[i]);
		TAILQ_INIT(&idqueues[i]);
	}
	atomic_clear_int(&curprocmask, 1);
}
SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)
/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule;
 * it selects a user process and returns it.  If chklp is non-NULL and chklp
 * has a better or equal priority than the process that would otherwise be
 * chosen, NULL is returned.
 *
 * Until we fix the RUNQ code the chklp test has to be strict or we may
 * bounce between processes trying to acquire the current process designation.
 */
static struct lwp *
chooseproc(struct lwp *chklp)
{
	struct lwp *lp;
	struct rq *q;
	u_int32_t *which;
	u_int32_t pri;

	if (rtqueuebits) {
		pri = bsfl(rtqueuebits);
		q = &rtqueues[pri];
		which = &rtqueuebits;
	} else if (queuebits) {
		pri = bsfl(queuebits);
		q = &queues[pri];
		which = &queuebits;
	} else if (idqueuebits) {
		pri = bsfl(idqueuebits);
		q = &idqueues[pri];
		which = &idqueuebits;
	} else {
		return NULL;
	}
	lp = TAILQ_FIRST(q);
	KASSERT(lp, ("chooseproc: no lwp on busy queue"));

	/*
	 * If the passed lwp <chklp> is reasonably close to the selected
	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
	 *
	 * Note that we must error on the side of <chklp> to avoid bouncing
	 * between threads in the acquire code.
	 */
	if (chklp) {
		if (chklp->lwp_priority < lp->lwp_priority + PPQ)
			return(NULL);
	}

	/*
	 * If the chosen lwp does not reside on this cpu spend a few
	 * cycles looking for a better candidate at the same priority level.
	 * This is a fallback check, setrunqueue() tries to wakeup the
	 * correct cpu and is our front-line affinity.
	 */
	if (lp->lwp_thread->td_gd != mycpu &&
	    (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
	) {
		if (chklp->lwp_thread->td_gd == mycpu) {
			++choose_affinity;
			lp = chklp;
		}
	}

	TAILQ_REMOVE(q, lp, lwp_procq);
	--runqcount;
	if (TAILQ_EMPTY(q))
		*which &= ~(1 << pri);
	KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq6!"));
	lp->lwp_proc->p_flag &= ~P_ONRUNQ;
	return lp;
}
/*
 * Called via an ipi message to reschedule on another cpu.
 */
static void
need_user_resched_remote(void *dummy)
{
	need_user_resched();
}
/*
 * setrunqueue() 'wakes up' a 'user' process.  GIANT must be held.  The
 * user process may represent any user process, including the current
 * process.
 *
 * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
 * cpus in an attempt to keep the process on the current cpu at least for
 * a little while to take advantage of locality of reference (e.g. fork/exec
 * or short fork/exit, and uio_yield()).
 *
 * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
 * or (user level) preempt on the same cpu that a process was previously
 * scheduled to.  If we cannot do this but we are at enough of a higher
 * priority than the processes running on other cpus, we will allow the
 * process to be stolen by another cpu.
 *
 * WARNING! a thread can be acquired by another cpu the moment it is put
 * on the user scheduler's run queue AND we release the MP lock.  Since we
 * release the MP lock before switching out another cpu may begin stealing
 * our current thread before we are completely switched out!  The
 * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
 * thread before stealing it.
 *
 * NOTE on need_user_resched() calls: we have to call need_user_resched()
 * if the new process is more important than the current process, or if
 * the new process is the current process and is now less important than
 * other processes.
 *
 * The associated thread must NOT be scheduled.
 * The process must be runnable.
 * This must be called at splhigh().
 */
static void
bsd4_setrunqueue(struct lwp *lp)
{
	struct rq *q;
	struct globaldata *gd;
	int pri;
	int cpuid;
	u_int32_t needresched;
#ifdef SMP
	int count;
	cpumask_t mask;
#endif

	ASSERT_MP_LOCK_HELD(lp->lwp_thread);
	KASSERT(lp->lwp_proc->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
	KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0,
	    ("lwp %d/%d already on runq! flag %08x", lp->lwp_proc->p_pid,
	     lp->lwp_tid, lp->lwp_proc->p_flag));
	KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
	/*
	 * Note: gd is the gd of the TARGET thread's cpu, not our cpu.
	 */
	gd = lp->lwp_thread->td_gd;

	/*
	 * Because recalculate is only called once or twice for long sleeps,
	 * not every second forever while the process is sleeping, we have
	 * to manually call it to resynchronize p_cpbase on wakeup or it
	 * will wrap if the process was sleeping long enough (e.g. ~10 min
	 * with the ACPI timer) and really mess up the nticks calculation.
	 */
	if (lp->lwp_slptime) {
		bsd4_recalculate_estcpu(lp);
		lp->lwp_slptime = 0;
	}

	/*
	 * We have not been released, make sure that we are not the currently
	 * designated process.
	 */
	KKASSERT(gd->gd_uschedcp != lp);
	/*
	 * Check cpu affinity.  The associated thread is stable at the
	 * moment.  Note that we may be checking another cpu here so we
	 * have to be careful.  We are currently protected by the BGL.
	 *
	 * This allows us to avoid actually queueing the process.
	 * acquire_curproc() will handle any threads we mistakenly schedule.
	 */
	cpuid = gd->gd_cpuid;

	if ((curprocmask & (1 << cpuid)) == 0) {
		atomic_set_int(&curprocmask, 1 << cpuid);
		gd->gd_uschedcp = lp;
		gd->gd_upri = lp->lwp_priority;
		lwkt_schedule(lp->lwp_thread);
		/* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */
		if (gd != mycpu)
			++remote_resched_affinity;
		return;
	}
	/*
	 * gd and cpuid may still 'hint' at another cpu.  Even so we have
	 * to place this process on the userland scheduler's run queue for
	 * action by the target cpu.
	 */
	++runqcount;
	lp->lwp_proc->p_flag |= P_ONRUNQ;
	if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
		pri = (lp->lwp_priority & PRIMASK) / PPQ;
		q = &queues[pri];
		queuebits |= 1 << pri;
		needresched = (queuebits & ((1 << pri) - 1));
	} else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
		   lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
		pri = (u_int8_t)lp->lwp_rtprio.prio;
		q = &rtqueues[pri];
		rtqueuebits |= 1 << pri;
		needresched = (rtqueuebits & ((1 << pri) - 1));
	} else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
		pri = (u_int8_t)lp->lwp_rtprio.prio;
		q = &idqueues[pri];
		idqueuebits |= 1 << pri;
		needresched = (idqueuebits & ((1 << pri) - 1));
	} else {
		needresched = 0;
		panic("setrunqueue: invalid rtprio type");
	}
	lp->lwp_rqindex = pri;		/* remember the queue index */
	TAILQ_INSERT_TAIL(q, lp, lwp_procq);
	/*
	 * Either wakeup other cpus user thread scheduler or request
	 * preemption on other cpus (which will also wakeup a HLT).
	 *
	 * NOTE!  gd and cpuid may still be our 'hint', not our current
	 * cpu info.
	 */
	count = runqcount;

#ifdef SMP
	/*
	 * Check cpu affinity for user preemption (when the curprocmask bit
	 * is set).  Note that gd_upri is a speculative field (we modify
	 * another cpu's gd_upri to avoid sending ipiq storms).
	 */
	if (gd == mycpu) {
		if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
			if (lp->lwp_priority < gd->gd_upri - PPQ) {
				gd->gd_upri = lp->lwp_priority;
				need_user_resched();
				--count;
			} else if (gd->gd_uschedcp == lp && needresched) {
				need_user_resched();
				--count;
			}
		}
	} else if (remote_resched) {
		if (lp->lwp_priority < gd->gd_upri - PPQ) {
			gd->gd_upri = lp->lwp_priority;
			lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
			--count;
			++remote_resched_affinity;
		}
	}
	/*
	 * No affinity, first schedule to any cpus that do not have a current
	 * process.  If there is a free cpu we always schedule to it.
	 */
	if (count &&
	    (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 &&
	    (lp->lwp_proc->p_flag & P_PASSIVE_ACQ) == 0) {
		if (!mask)
			printf("lwp %d/%d nocpu to schedule it on\n",
			       lp->lwp_proc->p_pid, lp->lwp_tid);
		while (mask && count) {
			cpuid = bsfl(mask);
			KKASSERT((curprocmask & (1 << cpuid)) == 0);
			atomic_clear_int(&rdyprocmask, 1 << cpuid);
			lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
			--count;
			mask &= ~(1 << cpuid);
		}
	}
	/*
	 * If there are still runnable processes try to wakeup a random
	 * cpu that is running a much lower priority process in order to
	 * preempt on it.  Note that gd_upri is only a hint, so we can
	 * overwrite it from the wrong cpu.  If we can't find one, we
	 * are SOL.
	 *
	 * We depress the priority check so multiple cpu bound programs
	 * do not bounce between cpus.  Remember that the clock interrupt
	 * will also cause all cpus to reschedule.
	 *
	 * We must mask against rdyprocmask or we will race in the boot
	 * code (before all cpus have working scheduler helpers), plus
	 * some cpus might not be operational and/or not configured to
	 * handle user processes.
	 */
	if (count && remote_resched && ncpus > 1) {
		do {
			if (++cpuid == ncpus)
				cpuid = 0;
		} while (cpuid == mycpu->gd_cpuid);

		if (rdyprocmask & (1 << cpuid)) {
			gd = globaldata_find(cpuid);

			if (lp->lwp_priority < gd->gd_upri - PPQ) {
				gd->gd_upri = lp->lwp_priority;
				lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
				++remote_resched_nonaffinity;
			}
		}
	}
#else
	if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
		if (lp->lwp_priority < gd->gd_upri - PPQ) {
			gd->gd_upri = lp->lwp_priority;
			need_user_resched();
		} else if (gd->gd_uschedcp == lp && needresched) {
			need_user_resched();
		}
	}
#endif
}
/*
 * remrunqueue() removes a given process from the run queue that it is on,
 * clearing the queue busy bit if it becomes empty.  This function is called
 * when a userland process is selected for LWKT scheduling.  Note that
 * LWKT scheduling is an abstraction of 'curproc'.. there could very well be
 * several userland processes whose threads are scheduled or otherwise in
 * a special state, and such processes are NOT on the userland scheduler's
 * run queue.
 *
 * This must be called at splhigh().
 */
static void
bsd4_remrunqueue(struct lwp *lp)
{
	struct rq *q;
	u_int32_t *which;
	u_int8_t pri;

	ASSERT_MP_LOCK_HELD(lp->lwp_thread);
	KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq4!"));
	lp->lwp_proc->p_flag &= ~P_ONRUNQ;
	--runqcount;
	KKASSERT(runqcount >= 0);
	pri = lp->lwp_rqindex;
	if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
		q = &queues[pri];
		which = &queuebits;
	} else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
		   lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
		q = &rtqueues[pri];
		which = &rtqueuebits;
	} else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
		q = &idqueues[pri];
		which = &idqueuebits;
	} else {
		panic("remrunqueue: invalid rtprio type");
	}
	TAILQ_REMOVE(q, lp, lwp_procq);
	if (TAILQ_EMPTY(q)) {
		KASSERT((*which & (1 << pri)) != 0,
			("remrunqueue: remove from empty queue"));
		*which &= ~(1 << pri);
	}
}
/*
 * This routine is called from a systimer IPI.  It MUST be MP-safe and
 * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ.
 */
static void
bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
{
	globaldata_t gd = mycpu;

	/*
	 * Do we need to round-robin?  We round-robin 10 times a second.
	 * This should only occur for cpu-bound batch processes.
	 */
	if (++gd->gd_rrcount >= usched_bsd4_rrinterval) {
		gd->gd_rrcount = 0;
		need_user_resched();
	}

	/*
	 * As the process accumulates cpu time p_estcpu is bumped and may
	 * push the process into another scheduling queue.  It typically
	 * takes 4 ticks to bump the queue.
	 */
	lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);

	/*
	 * Reducing p_origcpu over time causes more of our estcpu to be
	 * returned to the parent when we exit.  This is a small tweak
	 * for the batch detection heuristic.
	 */
	if (lp->lwp_origcpu)
		--lp->lwp_origcpu;

	/* XXX optimize, avoid lock if no reset is required */
	if (try_mplock()) {
		bsd4_resetpriority(lp);
		rel_mplock();
	}
}
/*
 * Release the current process designation on p.  P MUST BE CURPROC.
 * Attempt to assign a new current process from the run queue.
 *
 * This function is called from exit1(), tsleep(), and the passive
 * release code setup in <arch>/<arch>/trap.c
 *
 * If we do not have or cannot get the MP lock we just wakeup the userland
 * helper scheduler thread for this cpu to do the work for us.
 *
 * WARNING!  The MP lock may be in an unsynchronized state due to the
 * way get_mplock() works and the fact that this function may be called
 * from a passive release during a lwkt_switch().   try_mplock() will deal
 * with this for us but you should be aware that td_mpcount may not be
 * useable.
 */
static void
bsd4_release_curproc(struct lwp *lp)
{
	int cpuid;
	globaldata_t gd = mycpu;

	KKASSERT(lp->lwp_thread->td_gd == gd);
	cpuid = gd->gd_cpuid;

	if (gd->gd_uschedcp == lp) {
		if (try_mplock()) {
			/*
			 * If we can obtain the MP lock we can directly
			 * select the next current process.
			 *
			 * bsd4_select_curproc() will adjust curprocmask
			 * for us.
			 */
			gd->gd_uschedcp = NULL;
			gd->gd_upri = PRIBASE_NULL;
			bsd4_select_curproc(gd);
			rel_mplock();
		} else {
			/*
			 * If we cannot obtain the MP lock schedule our
			 * helper thread to select the next current
			 * process.
			 *
			 * This is the only place where we adjust curprocmask
			 * and rdyprocmask without holding the MP lock.
			 */
			gd->gd_uschedcp = NULL;
			gd->gd_upri = PRIBASE_NULL;
			atomic_clear_int(&curprocmask, 1 << cpuid);
			if (runqcount && (rdyprocmask & (1 << cpuid))) {
				atomic_clear_int(&rdyprocmask, 1 << cpuid);
				lwkt_schedule(&mycpu->gd_schedthread);
			}
		}
	}
}
/*
 * Select a new current process, potentially retaining gd_uschedcp.  However,
 * be sure to round-robin.  This routine is generally only called if a
 * reschedule is requested and that typically only occurs if a new process
 * has a better priority or when we are round-robining.
 *
 * NOTE: Must be called with giant held and the current cpu's gd.
 * NOTE: The caller must handle the situation where it loses a
 *	uschedcp designation that it previously held, typically by
 *	calling acquire_curproc() again.
 * NOTE: May not block
 */
static void
bsd4_select_curproc(globaldata_t gd)
{
	struct lwp *nlp;
	struct lwp *old;
	int cpuid = gd->gd_cpuid;

	clear_user_resched();

	/*
	 * Choose the next designated current user process.
	 * Note that we cannot schedule gd_schedthread
	 * if runqcount is 0 without creating a scheduling
	 * loop.
	 *
	 * We do not clear the user resched request here,
	 * we need to test it later when we re-acquire.
	 *
	 * NOTE: chooseproc returns NULL if the chosen lwp
	 * is gd_uschedcp. XXX needs cleanup.
	 */
	old = gd->gd_uschedcp;
	if ((nlp = chooseproc(gd->gd_uschedcp)) != NULL) {
		atomic_set_int(&curprocmask, 1 << cpuid);
		gd->gd_upri = nlp->lwp_priority;
		gd->gd_uschedcp = nlp;
		lwkt_acquire(nlp->lwp_thread);
		lwkt_schedule(nlp->lwp_thread);
	} else if (gd->gd_uschedcp) {
		gd->gd_upri = gd->gd_uschedcp->lwp_priority;
		KKASSERT(curprocmask & (1 << cpuid));
	} else if (runqcount && (rdyprocmask & (1 << cpuid))) {
		/*gd->gd_uschedcp = NULL;*/
		atomic_clear_int(&curprocmask, 1 << cpuid);
		atomic_clear_int(&rdyprocmask, 1 << cpuid);
		lwkt_schedule(&gd->gd_schedthread);
	} else {
		/*gd->gd_uschedcp = NULL;*/
		atomic_clear_int(&curprocmask, 1 << cpuid);
	}
}
/*
 * Acquire the current process designation on the CURRENT process only.
 * This function is called at kernel-user priority (not userland priority)
 * when curlwp does not match gd_uschedcp.
 *
 * This function is only called just prior to returning to user mode.
 *
 * Basically we recalculate our estcpu to hopefully give us a more
 * favorable disposition, setrunqueue, then wait for the curlwp
 * designation to be handed to us (if the setrunqueue didn't do it).
 *
 * WARNING! THIS FUNCTION MAY CAUSE THE CURRENT THREAD TO MIGRATE TO
 * ANOTHER CPU!  Because most of the kernel assumes that no migration will
 * occur, this function is called only under very controlled circumstances.
 */
static void
bsd4_acquire_curproc(struct lwp *lp)
{
	globaldata_t gd = mycpu;

	/*
	 * Recalculate our priority and put us back on the userland
	 * scheduler's runq.
	 *
	 * Only increment the involuntary context switch count if the
	 * setrunqueue call did not immediately schedule us.
	 */
	KKASSERT(lp == gd->gd_curthread->td_lwp);
	bsd4_recalculate_estcpu(lp);
	lwkt_deschedule_self(gd->gd_curthread);
	bsd4_setrunqueue(lp);
	if ((gd->gd_curthread->td_flags & TDF_RUNQ) == 0)
		++lp->lwp_stats->p_ru.ru_nivcsw;
	lwkt_switch();

	/*
	 * Because we put ourselves back on the userland scheduler's run
	 * queue, WE MAY HAVE BEEN MIGRATED TO ANOTHER CPU
	 */
	gd = mycpu;

	/*
	 * We better be the current process when we wake up, and we had
	 * better not be on the run queue.
	 */
	KKASSERT(gd->gd_uschedcp == lp);
	KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
}
/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
static void
bsd4_resetpriority(struct lwp *lp)
{
	int newpriority;
	int opq;
	int npq;

	ASSERT_MP_LOCK_HELD(curthread);

	/*
	 * Set p_priority for general process comparisons
	 */
	switch(lp->lwp_rtprio.type) {
	case RTP_PRIO_REALTIME:
		lp->lwp_priority = PRIBASE_REALTIME + lp->lwp_rtprio.prio;
		return;
	case RTP_PRIO_NORMAL:
		break;
	case RTP_PRIO_IDLE:
		lp->lwp_priority = PRIBASE_IDLE + lp->lwp_rtprio.prio;
		return;
	case RTP_PRIO_THREAD:
		lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio;
		return;
	}
	/*
	 * NORMAL priorities fall through.  These are based on niceness
	 * and cpu use.  Lower numbers == higher priorities.
	 *
	 * Calculate our priority based on our niceness and estimated cpu.
	 * Note that the nice value adjusts the baseline, which affects
	 * cpu bursts but does not affect overall cpu use between cpu-bound
	 * processes.  The use of the nice field in the decay calculation
	 * controls the overall cpu use.
	 *
	 * This isn't an exact calculation.  We fit the full nice and
	 * estcpu range into the priority range so the actual PPQ value
	 * is incorrect, but it's still a reasonable way to think about it.
	 */
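	/*
	 * Worked example (editorial illustration, not part of the original
	 * source; NICEPPQ is assumed to be 2 here and PRIO_MIN/PRIO_MAX the
	 * usual -20/+20): a nice +0 process (p_nice - PRIO_MIN == 20) with
	 * estcpu == 0 gets
	 *
	 *	newpriority = 20 * PPQ / NICEPPQ = 40
	 *	newpriority = 40 * MAXPRI / (PRIO_RANGE * PPQ / NICEPPQ + NQS * PPQ)
	 *		    = 40 * 128 / 208 ~= 24
	 *	npq         = 24 / PPQ = 6
	 *
	 * so it starts around queue 6 and sinks toward queue 31 as estcpu
	 * accumulates.  Note that ESTCPUMAX * PPQ / ESTCPUPPQ == NQS * PPQ.
	 */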
	newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
	newpriority += lp->lwp_estcpu * PPQ / ESTCPUPPQ;
	newpriority = newpriority * MAXPRI /
		      (PRIO_RANGE * PPQ / NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
	newpriority = MIN(newpriority, MAXPRI - 1);	/* sanity */
	newpriority = MAX(newpriority, 0);		/* sanity */
	npq = newpriority / PPQ;

	opq = (lp->lwp_priority & PRIMASK) / PPQ;
	if (lp->lwp_proc->p_stat == SRUN &&
	    (lp->lwp_proc->p_flag & P_ONRUNQ) && opq != npq) {
		/*
		 * We have to move the process to another queue
		 */
		bsd4_remrunqueue(lp);
		lp->lwp_priority = PRIBASE_NORMAL + newpriority;
		bsd4_setrunqueue(lp);
	} else {
		/*
		 * We can just adjust the priority and it will be picked
		 * up later.
		 */
		KKASSERT(opq == npq || (lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
		lp->lwp_priority = PRIBASE_NORMAL + newpriority;
	}
}
/*
 * Called from fork1() when a new child process is being created.
 *
 * Give the child process an initial estcpu that is more batch than
 * its parent and dock the parent for the fork (but do not
 * reschedule the parent).  This comprises the main part of our batch
 * detection heuristic for both parallel forking and sequential execs.
 *
 * Interactive processes will decay the boosted estcpu quickly while batch
 * processes will tend to compound it.
 * XXX lwp should be "spawning" instead of "forking"
 */
static void
bsd4_forking(struct lwp *plp, struct lwp *lp)
{
	lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
	lp->lwp_origcpu = lp->lwp_estcpu;
	plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
}
/*
 * Called when the parent reaps a child.  Propagate cpu use by the child
 * back to the parent.
 */
static void
bsd4_exiting(struct lwp *plp, struct lwp *lp)
{
	int delta;

	if (plp->lwp_proc->p_pid != 1) {
		delta = lp->lwp_estcpu - lp->lwp_origcpu;
		if (delta > 0)
			plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + delta);
	}
}
/*
 * Called from acquire and from kern_synch's one-second timer with a
 * critical section held.
 *
 * Decay p_estcpu based on the number of ticks we haven't been running
 * and our p_nice.  As the load increases each process observes a larger
 * number of idle ticks (because other processes are running in them).
 * This observation leads to a larger correction which tends to make the
 * system more 'batchy'.
 *
 * Note that no recalculation occurs for a process which sleeps and wakes
 * up in the same tick.  That is, a system doing thousands of context
 * switches per second will still only do serious estcpu calculations
 * ESTCPUFREQ times per second.
 */
static void
bsd4_recalculate_estcpu(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	sysclock_t cpbase;
	int loadfac;
	int ndecay;
	int nticks;
	int nleft;

	ASSERT_MP_LOCK_HELD(curthread);

	/*
	 * We have to subtract periodic to get the last schedclock
	 * timeout time, otherwise we would get the upcoming timeout.
	 * Keep in mind that a process can migrate between cpus and
	 * while the scheduler clock should be very close, boundary
	 * conditions could lead to a small negative delta.
	 */
	cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;

	if (lp->lwp_slptime > 1) {
		/*
		 * Too much time has passed, do a coarse correction.
		 */
		lp->lwp_estcpu = lp->lwp_estcpu >> 1;
		bsd4_resetpriority(lp);
		lp->lwp_cpbase = cpbase;
		lp->lwp_cpticks = 0;
	} else if (lp->lwp_cpbase != cpbase) {
		/*
		 * Adjust estcpu if we are in a different tick.  Don't waste
		 * time if we are in the same tick.
		 *
		 * First calculate the number of ticks in the measurement
		 * interval.
		 */
		nticks = (cpbase - lp->lwp_cpbase) / gd->gd_schedclock.periodic;
		updatepcpu(lp, lp->lwp_cpticks, nticks);

		if ((nleft = nticks - lp->lwp_cpticks) < 0)
			nleft = 0;
		if (usched_debug == lp->lwp_proc->p_pid) {
			printf("pid %d tid %d estcpu %d cpticks %d nticks %d nleft %d",
			       lp->lwp_proc->p_pid, lp->lwp_tid, lp->lwp_estcpu,
			       lp->lwp_cpticks, nticks, nleft);
		}

		/*
		 * Calculate a decay value based on ticks remaining scaled
		 * down by the instantaneous load and p_nice.
		 */
		if ((loadfac = runqcount) < 2)
			loadfac = 2;
		ndecay = nleft * usched_bsd4_decay * 2 *
			 (PRIO_MAX * 2 - lp->lwp_proc->p_nice) / (loadfac * PRIO_MAX * 2);

		/*
		 * Adjust p_estcpu.  Handle a border case where batch jobs
		 * can get stalled long enough to decay to zero when they
		 * shouldn't.
		 */
		if (lp->lwp_estcpu > ndecay * 2)
			lp->lwp_estcpu -= ndecay;
		else
			lp->lwp_estcpu >>= 1;

		if (usched_debug == lp->lwp_proc->p_pid)
			printf(" ndecay %d estcpu %d\n", ndecay, lp->lwp_estcpu);

		bsd4_resetpriority(lp);
		lp->lwp_cpbase = cpbase;
		lp->lwp_cpticks = 0;
	}
}
/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wakeup another for the purposes of
 * scheduling userland threads from setrunqueue().  UP systems do not
 * need the helper since there is only one cpu.  We can't use the idle
 * thread for this because we need to hold the MP lock.  Additionally,
 * doing things this way allows us to HLT idle cpus on MP systems.
 */
#ifdef SMP

static void
sched_thread(void *dummy)
{
	globaldata_t gd = mycpu;
	int cpuid = gd->gd_cpuid;		/* doesn't change */
	u_int32_t cpumask = 1 << cpuid;		/* doesn't change */
	struct lwp *nlp;

	ASSERT_MP_LOCK_HELD(curthread);
	for (;;) {
		lwkt_deschedule_self(gd->gd_curthread);	/* interlock */
		atomic_set_int(&rdyprocmask, cpumask);
		crit_enter_quick(gd->gd_curthread);
		if ((curprocmask & cpumask) == 0 && (nlp = chooseproc(NULL)) != NULL) {
			atomic_set_int(&curprocmask, cpumask);
			gd->gd_upri = nlp->lwp_priority;
			gd->gd_uschedcp = nlp;
			lwkt_acquire(nlp->lwp_thread);
			lwkt_schedule(nlp->lwp_thread);
		}
		crit_exit_quick(gd->gd_curthread);
		lwkt_switch();
	}
}
/*
 * Setup our scheduler helpers.  Note that curprocmask bit 0 has already
 * been cleared by rqinit() and we should not mess with it further.
 */
static void
sched_thread_cpu_init(void)
{
	int i;

	printf("start scheduler helpers on cpus:");

	for (i = 0; i < ncpus; ++i) {
		globaldata_t dgd = globaldata_find(i);
		cpumask_t mask = 1 << i;

		if ((mask & smp_active_mask) == 0)
			continue;

		printf(" %d", i);

		lwkt_create(sched_thread, NULL, NULL, &dgd->gd_schedthread,
			    TDF_STOPREQ, i, "usched %d", i);

		/*
		 * Allow user scheduling on the target cpu.  cpu #0 has already
		 * been enabled in rqinit().
		 */
		atomic_clear_int(&curprocmask, mask);
		atomic_set_int(&rdyprocmask, mask);
	}
	printf("\n");
}

SYSINIT(uschedtd, SI_SUB_FINISH_SMP, SI_ORDER_ANY, sched_thread_cpu_init, NULL)

#endif