/*
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.7 2005/12/23 10:08:14 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/thread2.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <machine/ipl.h>
#include <machine/cpu.h>
#include <machine/smp.h>
/*
 * Priorities.  Note that with 32 run queues per scheduler each queue
 * represents four priority levels.
 */

#define MAXPRI                  128
#define PRIMASK                 (MAXPRI - 1)
#define PRIBASE_REALTIME        0
#define PRIBASE_NORMAL          MAXPRI
#define PRIBASE_IDLE            (MAXPRI * 2)
#define PRIBASE_THREAD          (MAXPRI * 3)
#define PRIBASE_NULL            (MAXPRI * 4)

#define NQS     32              /* 32 run queues. */
#define PPQ     (MAXPRI / NQS)  /* priorities per queue */
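
/*
 * Illustrative example of the mapping above: a normal-class lwp_priority
 * of 37 lands in queue (37 & PRIMASK) / PPQ = 9, and priorities 36-39 all
 * share that queue, i.e. four priority levels per run queue.
 */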

/*
 * NICEPPQ     - number of nice units per priority queue
 * ESTCPURAMP  - number of scheduler ticks for estcpu to switch queues
 *
 * ESTCPUPPQ   - number of estcpu units per priority queue
 * ESTCPUMAX   - number of estcpu units
 * ESTCPUINCR  - amount we have to increment p_estcpu per scheduling tick at
 *               100% cpu.
 */
#define NICEPPQ         2
#define ESTCPURAMP      4
#define ESTCPUPPQ       512
#define ESTCPUMAX       (ESTCPUPPQ * NQS)
#define ESTCPUINCR      (ESTCPUPPQ / ESTCPURAMP)
#define PRIO_RANGE      (PRIO_MAX - PRIO_MIN + 1)

#define ESTCPULIM(v)    min((v), ESTCPUMAX)
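
/*
 * With the defaults above, ESTCPUMAX = 512 * 32 = 16384 and
 * ESTCPUINCR = 512 / 4 = 128, so a fully cpu-bound lwp accumulates one
 * queue's worth of estcpu (ESTCPUPPQ) roughly every ESTCPURAMP (4)
 * scheduler ticks.
 */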

TAILQ_HEAD(rq, lwp);

#define lwp_priority    lwp_usdata.bsd4.priority
#define lwp_rqindex     lwp_usdata.bsd4.rqindex
#define lwp_origcpu     lwp_usdata.bsd4.origcpu
#define lwp_estcpu      lwp_usdata.bsd4.estcpu

static void bsd4_acquire_curproc(struct lwp *lp);
static void bsd4_release_curproc(struct lwp *lp);
static void bsd4_select_curproc(globaldata_t gd);
static void bsd4_setrunqueue(struct lwp *lp);
static void bsd4_remrunqueue(struct lwp *lp);
static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
                                sysclock_t cpstamp);
static void bsd4_resetpriority(struct lwp *lp);
static void bsd4_forking(struct lwp *plp, struct lwp *lp);
static void bsd4_exiting(struct lwp *plp, struct lwp *lp);

static void bsd4_recalculate_estcpu(struct lwp *lp);

struct usched usched_bsd4 = {
        { NULL },
        "bsd4", "Original DragonFly Scheduler",
        NULL,                   /* default registration */
        NULL,                   /* default deregistration */
        bsd4_acquire_curproc,
        bsd4_release_curproc,
        bsd4_select_curproc,
        bsd4_setrunqueue,
        bsd4_remrunqueue,
        bsd4_schedulerclock,
        bsd4_recalculate_estcpu,
        bsd4_resetpriority,
        bsd4_forking,
        bsd4_exiting,
        NULL                    /* setcpumask not supported */
};

/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then an ffs() to find the first busy
 * queue.
 */
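
/*
 * In brief, the bitmask protocol used by the code below: setrunqueue()
 * sets (1 << pri) in the class's queuebits word when it inserts an lwp,
 * chooseproc() picks the lowest busy index with bsfl(), and the bit is
 * cleared again when the queue is found empty after a removal.
 */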
static struct rq queues[NQS];
static struct rq rtqueues[NQS];
static struct rq idqueues[NQS];
static u_int32_t queuebits;
static u_int32_t rtqueuebits;
static u_int32_t idqueuebits;
static cpumask_t curprocmask = -1;      /* currently running a user process */
static cpumask_t rdyprocmask;           /* ready to accept a user process */
static int       runqcount;
#ifdef SMP
static int       scancpu;
#endif

SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, "");
#ifdef INVARIANTS
static int usched_nonoptimal;
SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
        &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
static int usched_optimal;
SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
        &usched_optimal, 0, "acquire_curproc() was optimal");
#endif
static int usched_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, "");
#ifdef SMP
static int remote_resched = 1;
static int remote_resched_nonaffinity;
static int remote_resched_affinity;
static int choose_affinity;
SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW,
        &remote_resched, 0, "Resched to another cpu");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
        &remote_resched_nonaffinity, 0, "Number of remote rescheds");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
        &remote_resched_affinity, 0, "Number of remote rescheds");
SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
        &choose_affinity, 0, "chooseproc() was smart");
#endif

static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
        &usched_bsd4_rrinterval, 0, "");
static int usched_bsd4_decay = ESTCPUINCR / 2;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
        &usched_bsd4_decay, 0, "");

/*
 * Initialize the run queues at boot time.
 */
static void
rqinit(void *dummy)
{
        int i;

        for (i = 0; i < NQS; i++) {
                TAILQ_INIT(&queues[i]);
                TAILQ_INIT(&rtqueues[i]);
                TAILQ_INIT(&idqueues[i]);
        }
        atomic_clear_int(&curprocmask, 1);
}
SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)

/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule:
 * it selects a user process and returns it.  If chklp is non-NULL and chklp
 * has a better or equal priority than the process that would otherwise be
 * chosen, NULL is returned.
 *
 * Until we fix the RUNQ code the chklp test has to be strict or we may
 * bounce between processes trying to acquire the current process designation.
 */
static
struct lwp *
chooseproc(struct lwp *chklp)
{
        struct lwp *lp;
        struct rq *q;
        u_int32_t *which;
        u_int32_t pri;

        if (rtqueuebits) {
                pri = bsfl(rtqueuebits);
                q = &rtqueues[pri];
                which = &rtqueuebits;
        } else if (queuebits) {
                pri = bsfl(queuebits);
                q = &queues[pri];
                which = &queuebits;
        } else if (idqueuebits) {
                pri = bsfl(idqueuebits);
                q = &idqueues[pri];
                which = &idqueuebits;
        } else {
                return NULL;
        }
        lp = TAILQ_FIRST(q);
        KASSERT(lp, ("chooseproc: no lwp on busy queue"));

        /*
         * If the passed lwp <chklp> is reasonably close to the selected
         * lwp <lp>, return NULL (indicating that <chklp> should be kept).
         *
         * Note that we must err on the side of <chklp> to avoid bouncing
         * between threads in the acquire code.
         */
        if (chklp) {
                if (chklp->lwp_priority < lp->lwp_priority + PPQ)
                        return(NULL);
        }

#ifdef SMP
        /*
         * If the chosen lwp does not reside on this cpu spend a few
         * cycles looking for a better candidate at the same priority level.
         * This is a fallback check, setrunqueue() tries to wakeup the
         * correct cpu and is our front-line affinity.
         */
        if (lp->lwp_thread->td_gd != mycpu &&
            (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
        ) {
                if (chklp->lwp_thread->td_gd == mycpu) {
                        ++choose_affinity;
                        lp = chklp;
                }
        }
#endif

        TAILQ_REMOVE(q, lp, lwp_procq);
        --runqcount;
        if (TAILQ_EMPTY(q))
                *which &= ~(1 << pri);
        KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq6!"));
        lp->lwp_proc->p_flag &= ~P_ONRUNQ;
        return lp;
}

#ifdef SMP
/*
 * called via an ipi message to reschedule on another cpu.
 */
static
void
need_user_resched_remote(void *dummy)
{
        need_user_resched();
}

#endif

/*
 * setrunqueue() 'wakes up' a 'user' process.  GIANT must be held.  The
 * user process may represent any user process, including the current
 * process.
 *
 * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
 * cpus in an attempt to keep the process on the current cpu at least for
 * a little while to take advantage of locality of reference (e.g. fork/exec
 * or short fork/exit, and uio_yield()).
 *
 * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
 * or (user level) preempt on the same cpu that a process was previously
 * scheduled to.  If we cannot do this but we are at enough of a higher
 * priority than the processes running on other cpus, we will allow the
 * process to be stolen by another cpu.
 *
 * WARNING! a thread can be acquired by another cpu the moment it is put
 * on the user scheduler's run queue AND we release the MP lock.  Since we
 * release the MP lock before switching out another cpu may begin stealing
 * our current thread before we are completely switched out!  The
 * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
 * thread before stealing it.
 *
 * NOTE on need_user_resched() calls: we have to call need_user_resched()
 * if the new process is more important than the current process, or if
 * the new process is the current process and is now less important than
 * other processes.
 *
 * The associated thread must NOT be scheduled.
 * The process must be runnable.
 * This must be called at splhigh().
 */
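
/*
 * Note on the priority tests below: "enough of a higher priority" means
 * lp->lwp_priority < gd->gd_upri - PPQ, i.e. the new lwp must be at least
 * one full run queue better before we preempt locally or poke a remote
 * cpu.  The PPQ slop keeps near-equal processes from ping-ponging
 * between cpus.
 */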
static void
bsd4_setrunqueue(struct lwp *lp)
{
        struct rq *q;
        struct globaldata *gd;
        int pri;
        int cpuid;
        u_int32_t needresched;
#ifdef SMP
        int count;
        cpumask_t mask;
#endif

        ASSERT_MP_LOCK_HELD(lp->lwp_thread);
        crit_enter();
        KASSERT(lp->lwp_proc->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
        KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0,
            ("lwp %d/%d already on runq! flag %08x", lp->lwp_proc->p_pid,
             lp->lwp_tid, lp->lwp_proc->p_flag));
        KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);

        /*
         * Note: gd is the gd of the TARGET thread's cpu, not our cpu.
         */
        gd = lp->lwp_thread->td_gd;

        /*
         * Because recalculate is only called once or twice for long sleeps,
         * not every second forever while the process is sleeping, we have
         * to manually call it to resynchronize p_cpbase on wakeup or it
         * will wrap if the process was sleeping long enough (e.g. ~10 min
         * with the ACPI timer) and really mess up the nticks calculation.
         */
        if (lp->lwp_slptime) {
                bsd4_recalculate_estcpu(lp);
                lp->lwp_slptime = 0;
        }

        /*
         * We have not been released, make sure that we are not the currently
         * designated process.
         */
        KKASSERT(gd->gd_uschedcp != lp);

        /*
         * Check cpu affinity.  The associated thread is stable at the
         * moment.  Note that we may be checking another cpu here so we
         * have to be careful.  We are currently protected by the BGL.
         *
         * This allows us to avoid actually queueing the process.
         * acquire_curproc() will handle any threads we mistakenly schedule.
         */
        cpuid = gd->gd_cpuid;

        if ((curprocmask & (1 << cpuid)) == 0) {
                atomic_set_int(&curprocmask, 1 << cpuid);
                gd->gd_uschedcp = lp;
                gd->gd_upri = lp->lwp_priority;
                lwkt_schedule(lp->lwp_thread);
                /* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */
                crit_exit();
#ifdef SMP
                if (gd != mycpu)
                        ++remote_resched_affinity;
#endif
                return;
        }

        /*
         * gd and cpuid may still 'hint' at another cpu.  Even so we have
         * to place this process on the userland scheduler's run queue for
         * action by the target cpu.
         */
        ++runqcount;
        lp->lwp_proc->p_flag |= P_ONRUNQ;
        if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
                pri = (lp->lwp_priority & PRIMASK) / PPQ;
                q = &queues[pri];
                queuebits |= 1 << pri;
                needresched = (queuebits & ((1 << pri) - 1));
        } else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
                   lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
                pri = (u_int8_t)lp->lwp_rtprio.prio;
                q = &rtqueues[pri];
                rtqueuebits |= 1 << pri;
                needresched = (rtqueuebits & ((1 << pri) - 1));
        } else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
                pri = (u_int8_t)lp->lwp_rtprio.prio;
                q = &idqueues[pri];
                idqueuebits |= 1 << pri;
                needresched = (idqueuebits & ((1 << pri) - 1));
        } else {
                needresched = 0;
                panic("setrunqueue: invalid rtprio type");
        }
        KKASSERT(pri < 32);
        lp->lwp_rqindex = pri;          /* remember the queue index */
        TAILQ_INSERT_TAIL(q, lp, lwp_procq);

#ifdef SMP
        /*
         * Either wakeup other cpus' user thread schedulers or request
         * preemption on other cpus (which will also wake a HLTed cpu).
         *
         * NOTE!  gd and cpuid may still be our 'hint', not our current
         * cpu info.
         */

        count = runqcount;

        /*
         * Check cpu affinity for user preemption (when the curprocmask bit
         * is set).  Note that gd_upri is a speculative field (we modify
         * another cpu's gd_upri to avoid sending ipiq storms).
         */
        if (gd == mycpu) {
                if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
                        if (lp->lwp_priority < gd->gd_upri - PPQ) {
                                gd->gd_upri = lp->lwp_priority;
                                gd->gd_rrcount = 0;
                                need_user_resched();
                                --count;
                        } else if (gd->gd_uschedcp == lp && needresched) {
                                gd->gd_rrcount = 0;
                                need_user_resched();
                                --count;
                        }
                }
        } else if (remote_resched) {
                if (lp->lwp_priority < gd->gd_upri - PPQ) {
                        gd->gd_upri = lp->lwp_priority;
                        lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
                        --count;
                        ++remote_resched_affinity;
                }
        }

        /*
         * No affinity, first schedule to any cpus that do not have a current
         * process.  If there is a free cpu we always schedule to it.
         */
        if (count &&
            (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 &&
            (lp->lwp_proc->p_flag & P_PASSIVE_ACQ) == 0) {
                if (!mask)
                        printf("lwp %d/%d nocpu to schedule it on\n",
                               lp->lwp_proc->p_pid, lp->lwp_tid);
                while (mask && count) {
                        cpuid = bsfl(mask);
                        KKASSERT((curprocmask & (1 << cpuid)) == 0);
                        atomic_clear_int(&rdyprocmask, 1 << cpuid);
                        lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
                        --count;
                        mask &= ~(1 << cpuid);
                }
        }

        /*
         * If there are still runnable processes try to wakeup a random
         * cpu that is running a much lower priority process in order to
         * preempt on it.  Note that gd_upri is only a hint, so we can
         * overwrite it from the wrong cpu.  If we can't find one, we
         * are SOL.
         *
         * We depress the priority check so multiple cpu bound programs
         * do not bounce between cpus.  Remember that the clock interrupt
         * will also cause all cpus to reschedule.
         *
         * We must mask against rdyprocmask or we will race in the boot
         * code (before all cpus have working scheduler helpers), plus
         * some cpus might not be operational and/or not configured to
         * handle user processes.
         */
        if (count && remote_resched && ncpus > 1) {
                cpuid = scancpu;
                do {
                        if (++cpuid == ncpus)
                                cpuid = 0;
                } while (cpuid == mycpu->gd_cpuid);
                scancpu = cpuid;

                if (rdyprocmask & (1 << cpuid)) {
                        gd = globaldata_find(cpuid);

                        if (lp->lwp_priority < gd->gd_upri - PPQ) {
                                gd->gd_upri = lp->lwp_priority;
                                lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
                                ++remote_resched_nonaffinity;
                        }
                }
        }
#else
        if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
                if (lp->lwp_priority < gd->gd_upri - PPQ) {
                        gd->gd_upri = lp->lwp_priority;
                        gd->gd_rrcount = 0;
                        need_user_resched();
                } else if (gd->gd_uschedcp == lp && needresched) {
                        gd->gd_rrcount = 0;
                        need_user_resched();
                }
        }
#endif
        crit_exit();
}

/*
 * remrunqueue() removes a given process from the run queue that it is on,
 * clearing the queue busy bit if it becomes empty.  This function is called
 * when a userland process is selected for LWKT scheduling.  Note that
 * LWKT scheduling is an abstraction of 'curproc'... there could very well be
 * several userland processes whose threads are scheduled or otherwise in
 * a special state, and such processes are NOT on the userland scheduler's
 * run queue.
 *
 * This must be called at splhigh().
 */
static void
bsd4_remrunqueue(struct lwp *lp)
{
        struct rq *q;
        u_int32_t *which;
        u_int8_t pri;

        ASSERT_MP_LOCK_HELD(lp->lwp_thread);
        crit_enter();
        KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq4!"));
        lp->lwp_proc->p_flag &= ~P_ONRUNQ;
        --runqcount;
        KKASSERT(runqcount >= 0);
        pri = lp->lwp_rqindex;
        if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
                q = &queues[pri];
                which = &queuebits;
        } else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
                   lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
                q = &rtqueues[pri];
                which = &rtqueuebits;
        } else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
                q = &idqueues[pri];
                which = &idqueuebits;
        } else {
                panic("remrunqueue: invalid rtprio type");
        }
        TAILQ_REMOVE(q, lp, lwp_procq);
        if (TAILQ_EMPTY(q)) {
                KASSERT((*which & (1 << pri)) != 0,
                    ("remrunqueue: remove from empty queue"));
                *which &= ~(1 << pri);
        }
        crit_exit();
}

/*
 * This routine is called from a systimer IPI.  It MUST be MP-safe and
 * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ.
 *
 * MPSAFE
 */
static
void
bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
{
        globaldata_t gd = mycpu;

        /*
         * Do we need to round-robin?  We round-robin 10 times a second.
         * This should only occur for cpu-bound batch processes.
         */
        if (++gd->gd_rrcount >= usched_bsd4_rrinterval) {
                gd->gd_rrcount = 0;
                need_user_resched();
        }

        /*
         * As the process accumulates cpu time p_estcpu is bumped and may
         * push the process into another scheduling queue.  It typically
         * takes 4 ticks to bump the queue.
         */
        lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);

        /*
         * Reducing p_origcpu over time causes more of our estcpu to be
         * returned to the parent when we exit.  This is a small tweak
         * for the batch detection heuristic.
         */
        if (lp->lwp_origcpu)
                --lp->lwp_origcpu;

        /* XXX optimize, avoid lock if no reset is required */
        if (try_mplock()) {
                bsd4_resetpriority(lp);
                rel_mplock();
        }
}

/*
 * Release the current process designation on p.  P MUST BE CURPROC.
 * Attempt to assign a new current process from the run queue.
 *
 * This function is called from exit1(), tsleep(), and the passive
 * release code setup in <arch>/<arch>/trap.c
 *
 * If we do not have or cannot get the MP lock we just wakeup the userland
 * helper scheduler thread for this cpu to do the work for us.
 *
 * WARNING!  The MP lock may be in an unsynchronized state due to the
 * way get_mplock() works and the fact that this function may be called
 * from a passive release during a lwkt_switch().  try_mplock() will deal
 * with this for us but you should be aware that td_mpcount may not be
 * usable.
 */
static void
bsd4_release_curproc(struct lwp *lp)
{
        int cpuid;
        globaldata_t gd = mycpu;

        KKASSERT(lp->lwp_thread->td_gd == gd);
        crit_enter();
        cpuid = gd->gd_cpuid;

        if (gd->gd_uschedcp == lp) {
                if (try_mplock()) {
                        /*
                         * If we can obtain the MP lock we can directly
                         * select the next current process.
                         *
                         * bsd4_select_curproc() will adjust curprocmask
                         * for us.
                         */
                        gd->gd_uschedcp = NULL;
                        gd->gd_upri = PRIBASE_NULL;
                        bsd4_select_curproc(gd);
                        rel_mplock();
                } else {
                        /*
                         * If we cannot obtain the MP lock schedule our
                         * helper thread to select the next current
                         * process.
                         *
                         * This is the only place where we adjust curprocmask
                         * and rdyprocmask without holding the MP lock.
                         */
                        gd->gd_uschedcp = NULL;
                        gd->gd_upri = PRIBASE_NULL;
                        atomic_clear_int(&curprocmask, 1 << cpuid);
                        if (runqcount && (rdyprocmask & (1 << cpuid))) {
                                atomic_clear_int(&rdyprocmask, 1 << cpuid);
                                lwkt_schedule(&mycpu->gd_schedthread);
                        }
                }
        }
        crit_exit();
}

/*
 * Select a new current process, potentially retaining gd_uschedcp.  However,
 * be sure to round-robin.  This routine is generally only called if a
 * reschedule is requested and that typically only occurs if a new process
 * has a better priority or when we are round-robining.
 *
 * NOTE: Must be called with giant held and the current cpu's gd.
 * NOTE: The caller must handle the situation where it loses a
 *       uschedcp designation that it previously held, typically by
 *       calling acquire_curproc() again.
 * NOTE: May not block
 */
static
void
bsd4_select_curproc(globaldata_t gd)
{
        struct lwp *nlp;
        int cpuid = gd->gd_cpuid;
        void *old;

        clear_user_resched();
        get_mplock();

        /*
         * Choose the next designated current user process.
         * Note that we cannot schedule gd_schedthread
         * if runqcount is 0 without creating a scheduling
         * loop.
         *
         * We do not clear the user resched request here,
         * we need to test it later when we re-acquire.
         *
         * NOTE: chooseproc returns NULL if the chosen lwp
         * is gd_uschedcp. XXX needs cleanup.
         */
        old = gd->gd_uschedcp;
        if ((nlp = chooseproc(gd->gd_uschedcp)) != NULL) {
                atomic_set_int(&curprocmask, 1 << cpuid);
                gd->gd_upri = nlp->lwp_priority;
                gd->gd_uschedcp = nlp;
                lwkt_acquire(nlp->lwp_thread);
                lwkt_schedule(nlp->lwp_thread);
        } else if (gd->gd_uschedcp) {
                gd->gd_upri = gd->gd_uschedcp->lwp_priority;
                KKASSERT(curprocmask & (1 << cpuid));
        } else if (runqcount && (rdyprocmask & (1 << cpuid))) {
                /*gd->gd_uschedcp = NULL;*/
                atomic_clear_int(&curprocmask, 1 << cpuid);
                atomic_clear_int(&rdyprocmask, 1 << cpuid);
                lwkt_schedule(&gd->gd_schedthread);
        } else {
                /*gd->gd_uschedcp = NULL;*/
                atomic_clear_int(&curprocmask, 1 << cpuid);
        }
        rel_mplock();
}

/*
 * Acquire the current process designation on the CURRENT process only.
 * This function is called at kernel-user priority (not userland priority)
 * when curlwp does not match gd_uschedcp.
 *
 * This function is only called just prior to returning to user mode.
 *
 * Basically we recalculate our estcpu to hopefully give us a more
 * favorable disposition, setrunqueue, then wait for the curlwp
 * designation to be handed to us (if the setrunqueue didn't do it).
 *
 * WARNING! THIS FUNCTION MAY CAUSE THE CURRENT THREAD TO MIGRATE TO
 * ANOTHER CPU!  Because most of the kernel assumes that no migration will
 * occur, this function is called only under very controlled circumstances.
 */
static void
bsd4_acquire_curproc(struct lwp *lp)
{
        globaldata_t gd = mycpu;

        get_mplock();
        crit_enter();

        /*
         * Recalculate our priority and put us back on the userland
         * scheduler's runq.
         *
         * Only increment the involuntary context switch count if the
         * setrunqueue call did not immediately schedule us.
         */
        KKASSERT(lp == gd->gd_curthread->td_lwp);
        bsd4_recalculate_estcpu(lp);
        lwkt_deschedule_self(gd->gd_curthread);
        bsd4_setrunqueue(lp);
        if ((gd->gd_curthread->td_flags & TDF_RUNQ) == 0)
                ++lp->lwp_stats->p_ru.ru_nivcsw;
        lwkt_switch();

        /*
         * Because we put ourselves back on the userland scheduler's run
         * queue, WE MAY HAVE BEEN MIGRATED TO ANOTHER CPU
         */
        gd = mycpu;

        /*
         * We better be the current process when we wake up, and we had
         * better not be on the run queue.
         */
        KKASSERT(gd->gd_uschedcp == lp);
        KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0);

        crit_exit();
        rel_mplock();
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
static void
bsd4_resetpriority(struct lwp *lp)
{
        int newpriority;
        int opq;
        int npq;

        ASSERT_MP_LOCK_HELD(curthread);

        /*
         * Set p_priority for general process comparisons
         */
        switch(lp->lwp_rtprio.type) {
        case RTP_PRIO_REALTIME:
                lp->lwp_priority = PRIBASE_REALTIME + lp->lwp_rtprio.prio;
                return;
        case RTP_PRIO_NORMAL:
                break;
        case RTP_PRIO_IDLE:
                lp->lwp_priority = PRIBASE_IDLE + lp->lwp_rtprio.prio;
                return;
        case RTP_PRIO_THREAD:
                lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio;
                return;
        }

        /*
         * NORMAL priorities fall through.  These are based on niceness
         * and cpu use.  Lower numbers == higher priorities.
         *
         * Calculate our priority based on our niceness and estimated cpu.
         * Note that the nice value adjusts the baseline, which affects
         * cpu bursts but does not affect overall cpu use between cpu-bound
         * processes.  The use of the nice field in the decay calculation
         * controls the overall cpu use.
         *
         * This isn't an exact calculation.  We fit the full nice and
         * estcpu range into the priority range so the actual PPQ value
         * is incorrect, but it's still a reasonable way to think about it.
         */
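
        /*
         * Worked example, assuming the standard PRIO_MIN (-20) and
         * PRIO_MAX (20) from <sys/resource.h>: the scale factor below is
         * MAXPRI / (41*4/2 + 16384*4/512) = 128 / 210.  A nice 0 process
         * with estcpu 0 gets newpriority = 40 * 128 / 210 = 24 (queue 6);
         * the same process at ESTCPUMAX gets (40 + 128) * 128 / 210 = 102
         * (queue 25).
         */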
        newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
        newpriority += lp->lwp_estcpu * PPQ / ESTCPUPPQ;
        newpriority = newpriority * MAXPRI /
                      (PRIO_RANGE * PPQ / NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
        newpriority = MIN(newpriority, MAXPRI - 1);     /* sanity */
        newpriority = MAX(newpriority, 0);              /* sanity */
        npq = newpriority / PPQ;
        crit_enter();
        opq = (lp->lwp_priority & PRIMASK) / PPQ;
        if (lp->lwp_proc->p_stat == SRUN &&
            (lp->lwp_proc->p_flag & P_ONRUNQ) && opq != npq) {
                /*
                 * We have to move the process to another queue
                 */
                bsd4_remrunqueue(lp);
                lp->lwp_priority = PRIBASE_NORMAL + newpriority;
                bsd4_setrunqueue(lp);
        } else {
                /*
                 * We can just adjust the priority and it will be picked
                 * up later.
                 */
                KKASSERT(opq == npq || (lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
                lp->lwp_priority = PRIBASE_NORMAL + newpriority;
        }
        crit_exit();
}

/*
 * Called from fork1() when a new child process is being created.
 *
 * Give the child process an initial estcpu that is batchier than its
 * parent's and dock the parent for the fork (but do not reschedule
 * the parent).  This comprises the main part of our batch detection
 * heuristic for both parallel forking and sequential execs.
 *
 * Interactive processes will decay the boosted estcpu quickly while batch
 * processes will tend to compound it.
 *
 * XXX lwp should be "spawning" instead of "forking"
 *
 * MPSAFE
 */
static void
bsd4_forking(struct lwp *plp, struct lwp *lp)
{
        lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
        lp->lwp_origcpu = lp->lwp_estcpu;
        plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
}
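
/*
 * Note: lwp_origcpu records the child's starting estcpu so that
 * bsd4_exiting() below only propagates the estcpu the child actually
 * accumulated (estcpu - origcpu) back to the parent, and
 * bsd4_schedulerclock() slowly reduces origcpu so longer-running
 * children return more of their estcpu when reaped.
 */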

/*
 * Called when the parent reaps a child.  Propagate cpu use by the child
 * back to the parent.
 *
 * MPSAFE
 */
static void
bsd4_exiting(struct lwp *plp, struct lwp *lp)
{
        int delta;

        if (plp->lwp_proc->p_pid != 1) {
                delta = lp->lwp_estcpu - lp->lwp_origcpu;
                if (delta > 0)
                        plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + delta);
        }
}

/*
 * Called from acquire and from kern_synch's one-second timer with a
 * critical section held.
 *
 * Decay p_estcpu based on the number of ticks we haven't been running
 * and our p_nice.  As the load increases each process observes a larger
 * number of idle ticks (because other processes are running in them).
 * This observation leads to a larger correction which tends to make the
 * system more 'batchy'.
 *
 * Note that no recalculation occurs for a process which sleeps and wakes
 * up in the same tick.  That is, a system doing thousands of context
 * switches per second will still only do serious estcpu calculations
 * ESTCPUFREQ times per second.
 */
static
void
bsd4_recalculate_estcpu(struct lwp *lp)
{
        globaldata_t gd = mycpu;
        sysclock_t cpbase;
        int loadfac;
        int ndecay;
        int nticks;
        int nleft;

        ASSERT_MP_LOCK_HELD(curthread);

        /*
         * We have to subtract periodic to get the last schedclock
         * timeout time, otherwise we would get the upcoming timeout.
         * Keep in mind that a process can migrate between cpus and
         * while the scheduler clock should be very close, boundary
         * conditions could lead to a small negative delta.
         */
        cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;

        if (lp->lwp_slptime > 1) {
                /*
                 * Too much time has passed, do a coarse correction.
                 */
                lp->lwp_estcpu = lp->lwp_estcpu >> 1;
                bsd4_resetpriority(lp);
                lp->lwp_cpbase = cpbase;
                lp->lwp_cpticks = 0;
        } else if (lp->lwp_cpbase != cpbase) {
                /*
                 * Adjust estcpu if we are in a different tick.  Don't waste
                 * time if we are in the same tick.
                 *
                 * First calculate the number of ticks in the measurement
                 * interval.
                 */
                nticks = (cpbase - lp->lwp_cpbase) / gd->gd_schedclock.periodic;
                updatepcpu(lp, lp->lwp_cpticks, nticks);

                if ((nleft = nticks - lp->lwp_cpticks) < 0)
                        nleft = 0;
                if (usched_debug == lp->lwp_proc->p_pid) {
                        printf("pid %d tid %d estcpu %d cpticks %d nticks %d nleft %d",
                               lp->lwp_proc->p_pid, lp->lwp_tid, lp->lwp_estcpu,
                               lp->lwp_cpticks, nticks, nleft);
                }

                /*
                 * Calculate a decay value based on ticks remaining scaled
                 * down by the instantaneous load and p_nice.
                 */
                if ((loadfac = runqcount) < 2)
                        loadfac = 2;
                ndecay = nleft * usched_bsd4_decay * 2 *
                         (PRIO_MAX * 2 - lp->lwp_proc->p_nice) /
                         (loadfac * PRIO_MAX * 2);
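
                /*
                 * Example with the defaults (usched_bsd4_decay =
                 * ESTCPUINCR / 2 = 64): for a nice 0 process on an otherwise
                 * idle system (loadfac clamped to 2) this reduces to
                 * ndecay = nleft * 64, i.e. one ESTCPUINCR worth of estcpu
                 * is forgiven for every two idle ticks.  A higher runqcount
                 * or a positive nice value shrinks the decay.
                 */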

                /*
                 * Adjust p_estcpu.  Handle a border case where batch jobs
                 * can get stalled long enough to decay to zero when they
                 * shouldn't.
                 */
                if (lp->lwp_estcpu > ndecay * 2)
                        lp->lwp_estcpu -= ndecay;
                else
                        lp->lwp_estcpu >>= 1;

                if (usched_debug == lp->lwp_proc->p_pid)
                        printf(" ndecay %d estcpu %d\n", ndecay, lp->lwp_estcpu);

                bsd4_resetpriority(lp);
                lp->lwp_cpbase = cpbase;
                lp->lwp_cpticks = 0;
        }
}

#ifdef SMP

/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wakeup another for the purposes of
 * scheduling userland threads from setrunqueue().  UP systems do not
 * need the helper since there is only one cpu.  We can't use the idle
 * thread for this because we need to hold the MP lock.  Additionally,
 * doing things this way allows us to HLT idle cpus on MP systems.
 */
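
/*
 * The loop below interlocks by descheduling itself before setting its bit
 * in rdyprocmask.  A remote cpu that picks this cpu in setrunqueue() (or
 * release_curproc()) clears the bit and lwkt_schedule()s the helper, which
 * then pulls work via chooseproc() if no user process currently owns this
 * cpu (curprocmask bit clear).
 */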
static void
sched_thread(void *dummy)
{
        globaldata_t gd = mycpu;
        int cpuid = gd->gd_cpuid;               /* doesn't change */
        u_int32_t cpumask = 1 << cpuid;         /* doesn't change */

        ASSERT_MP_LOCK_HELD(curthread);
        for (;;) {
                struct lwp *nlp;

                lwkt_deschedule_self(gd->gd_curthread); /* interlock */
                atomic_set_int(&rdyprocmask, cpumask);
                crit_enter_quick(gd->gd_curthread);
                if ((curprocmask & cpumask) == 0 &&
                    (nlp = chooseproc(NULL)) != NULL) {
                        atomic_set_int(&curprocmask, cpumask);
                        gd->gd_upri = nlp->lwp_priority;
                        gd->gd_uschedcp = nlp;
                        lwkt_acquire(nlp->lwp_thread);
                        lwkt_schedule(nlp->lwp_thread);
                }
                crit_exit_quick(gd->gd_curthread);
                lwkt_switch();
        }
}

/*
 * Setup our scheduler helpers.  Note that curprocmask bit 0 has already
 * been cleared by rqinit() and we should not mess with it further.
 */
static void
sched_thread_cpu_init(void)
{
        int i;

        if (bootverbose)
                printf("start scheduler helpers on cpus:");

        for (i = 0; i < ncpus; ++i) {
                globaldata_t dgd = globaldata_find(i);
                cpumask_t mask = 1 << i;

                if ((mask & smp_active_mask) == 0)
                        continue;

                if (bootverbose)
                        printf(" %d", i);

                lwkt_create(sched_thread, NULL, NULL, &dgd->gd_schedthread,
                            TDF_STOPREQ, i, "usched %d", i);

                /*
                 * Allow user scheduling on the target cpu.  cpu #0 has
                 * already been enabled in rqinit().
                 */
                if (i)
                        atomic_clear_int(&curprocmask, mask);
                atomic_set_int(&rdyprocmask, mask);
        }
        if (bootverbose)
                printf("\n");
}
SYSINIT(uschedtd, SI_SUB_FINISH_SMP, SI_ORDER_ANY, sched_thread_cpu_init, NULL)

#endif