kernel - Add usched_dfly algorithm, set as default for now (6)
[dragonfly.git] / sys / kern / usched_dfly.c
blob dff7d6a0555be01539e3d4636f3acb0894e03476
1 /*
2 * Copyright (c) 2012 The DragonFly Project. All rights reserved.
3 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>. All rights reserved.
5 * This code is derived from software contributed to The DragonFly Project
6 * by Matthew Dillon <dillon@backplane.com>,
7 * by Mihai Carabas <mihai.carabas@gmail.com>
8 * and many others.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in
18 * the documentation and/or other materials provided with the
19 * distribution.
20 * 3. Neither the name of The DragonFly Project nor the names of its
21 * contributors may be used to endorse or promote products derived
22 * from this software without specific, prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
27 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
28 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
29 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
30 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
34 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/lock.h>
41 #include <sys/queue.h>
42 #include <sys/proc.h>
43 #include <sys/rtprio.h>
44 #include <sys/uio.h>
45 #include <sys/sysctl.h>
46 #include <sys/resourcevar.h>
47 #include <sys/spinlock.h>
48 #include <sys/cpu_topology.h>
49 #include <sys/thread2.h>
50 #include <sys/spinlock2.h>
51 #include <sys/mplock2.h>
53 #include <sys/ktr.h>
55 #include <machine/cpu.h>
56 #include <machine/smp.h>
59 * Priorities. Note that with 32 run queues per scheduler each queue
60 * represents four priority levels.
63 int dfly_rebalanced;
65 #define MAXPRI 128
66 #define PRIMASK (MAXPRI - 1)
67 #define PRIBASE_REALTIME 0
68 #define PRIBASE_NORMAL MAXPRI
69 #define PRIBASE_IDLE (MAXPRI * 2)
70 #define PRIBASE_THREAD (MAXPRI * 3)
71 #define PRIBASE_NULL (MAXPRI * 4)
73 #define NQS 32 /* 32 run queues. */
74 #define PPQ (MAXPRI / NQS) /* priorities per queue */
75 #define PPQMASK (PPQ - 1)
78 * NICEPPQ - number of nice units per priority queue
79 * ESTCPUPPQ - number of estcpu units per priority queue
80 * ESTCPUMAX - number of estcpu units
82 #define NICEPPQ 2
83 #define ESTCPUPPQ 512
84 #define ESTCPUMAX (ESTCPUPPQ * NQS)
85 #define BATCHMAX (ESTCPUFREQ * 30)
86 #define PRIO_RANGE (PRIO_MAX - PRIO_MIN + 1)
88 #define ESTCPULIM(v) min((v), ESTCPUMAX)
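/*
 * Illustrative sketch (not part of the scheduler): MAXPRI (128)
 * priority levels fold onto NQS (32) queues, so each queue spans
 * PPQ (4) levels and a queue index is plain integer math on the
 * priority.  The helper name prio_to_rqindex() is hypothetical and
 * only mirrors the arithmetic dfly_resetpriority() performs when it
 * assigns lwp_rqindex.
 */
#if 0
static __inline int
prio_to_rqindex(int priority)
{
	/* e.g. PRIBASE_NORMAL + 77 -> (77 & PRIMASK) / PPQ == 19 */
	return ((priority & PRIMASK) / PPQ);
}
#endif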
90 TAILQ_HEAD(rq, lwp);
92 #define lwp_priority lwp_usdata.dfly.priority
93 #define lwp_forked lwp_usdata.dfly.forked
94 #define lwp_rqindex lwp_usdata.dfly.rqindex
95 #define lwp_estcpu lwp_usdata.dfly.estcpu
96 #define lwp_batch lwp_usdata.dfly.batch
97 #define lwp_rqtype lwp_usdata.dfly.rqtype
98 #define lwp_qcpu lwp_usdata.dfly.qcpu
100 struct usched_dfly_pcpu {
101 struct spinlock spin;
102 struct thread helper_thread;
103 short rrcount;
104 short upri;
105 int uload;
106 struct lwp *uschedcp;
107 struct rq queues[NQS];
108 struct rq rtqueues[NQS];
109 struct rq idqueues[NQS];
110 u_int32_t queuebits;
111 u_int32_t rtqueuebits;
112 u_int32_t idqueuebits;
113 int runqcount;
114 int cpuid;
115 cpumask_t cpumask;
116 #ifdef SMP
117 cpu_node_t *cpunode;
118 #endif
121 typedef struct usched_dfly_pcpu *dfly_pcpu_t;
123 static void dfly_acquire_curproc(struct lwp *lp);
124 static void dfly_release_curproc(struct lwp *lp);
125 static void dfly_select_curproc(globaldata_t gd);
126 static void dfly_setrunqueue(struct lwp *lp);
127 static void dfly_setrunqueue_dd(dfly_pcpu_t rdd, struct lwp *lp);
128 static void dfly_schedulerclock(struct lwp *lp, sysclock_t period,
129 sysclock_t cpstamp);
130 static void dfly_recalculate_estcpu(struct lwp *lp);
131 static void dfly_resetpriority(struct lwp *lp);
132 static void dfly_forking(struct lwp *plp, struct lwp *lp);
133 static void dfly_exiting(struct lwp *lp, struct proc *);
134 static void dfly_uload_update(struct lwp *lp);
135 static void dfly_yield(struct lwp *lp);
136 #ifdef SMP
137 static dfly_pcpu_t dfly_choose_best_queue(struct lwp *lp);
138 static dfly_pcpu_t dfly_choose_worst_queue(dfly_pcpu_t dd);
139 static dfly_pcpu_t dfly_choose_queue_simple(dfly_pcpu_t dd, struct lwp *lp);
140 #if 0
141 static void dfly_wakeup_random_helper(dfly_pcpu_t notdd);
142 #endif
143 #endif
145 #ifdef SMP
146 static void dfly_need_user_resched_remote(void *dummy);
147 #endif
148 static struct lwp *dfly_chooseproc_locked(dfly_pcpu_t dd, struct lwp *chklp,
149 int isremote);
150 static void dfly_remrunqueue_locked(dfly_pcpu_t dd, struct lwp *lp);
151 static void dfly_setrunqueue_locked(dfly_pcpu_t dd, struct lwp *lp);
153 struct usched usched_dfly = {
154 { NULL },
155 "dfly", "Original DragonFly Scheduler",
156 NULL, /* default registration */
157 NULL, /* default deregistration */
158 dfly_acquire_curproc,
159 dfly_release_curproc,
160 dfly_setrunqueue,
161 dfly_schedulerclock,
162 dfly_recalculate_estcpu,
163 dfly_resetpriority,
164 dfly_forking,
165 dfly_exiting,
166 dfly_uload_update,
167 NULL, /* setcpumask not supported */
168 dfly_yield
172 * We have NQS (32) run queues per scheduling class. For the normal
173 * class, there are 128 priorities scaled onto these 32 queues. New
174 * processes are added to the last entry in each queue, and processes
175 * are selected for running by taking them from the head and maintaining
176 * a simple FIFO arrangement. Realtime and Idle priority processes have
177 * an explicit 0-31 priority which maps directly onto their class queue
178 * index. When a queue has something in it, the corresponding bit is
179 * set in the queuebits variable, allowing a single read to determine
180 * the state of all 32 queues and then a ffs() to find the first busy
181 * queue.
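/*
 * Minimal sketch of the queuebits lookup described above, assuming a
 * 32-bit queuebits word; the helper name first_busy_queue() is
 * hypothetical, the real lookup is done inline by
 * dfly_chooseproc_locked() using bsfl().
 */
#if 0
static __inline int
first_busy_queue(u_int32_t qbits)
{
	/* bit N set means queue N is non-empty; lowest set bit wins */
	return (qbits ? (int)bsfl(qbits) : -1);
}
#endif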
183 static cpumask_t dfly_curprocmask = -1; /* currently running a user process */
184 static cpumask_t dfly_rdyprocmask; /* ready to accept a user process */
185 #ifdef SMP
186 static volatile int dfly_scancpu;
187 #endif
188 static struct usched_dfly_pcpu dfly_pcpu[MAXCPU];
189 static struct sysctl_ctx_list usched_dfly_sysctl_ctx;
190 static struct sysctl_oid *usched_dfly_sysctl_tree;
192 /* Debug info exposed through debug.* sysctl */
194 static int usched_dfly_debug = -1;
195 SYSCTL_INT(_debug, OID_AUTO, dfly_scdebug, CTLFLAG_RW,
196 &usched_dfly_debug, 0,
197 "Print debug information for this pid");
199 static int usched_dfly_pid_debug = -1;
200 SYSCTL_INT(_debug, OID_AUTO, dfly_pid_debug, CTLFLAG_RW,
201 &usched_dfly_pid_debug, 0,
202 "Print KTR debug information for this pid");
204 static int usched_dfly_chooser = 0;
205 SYSCTL_INT(_debug, OID_AUTO, dfly_chooser, CTLFLAG_RW,
206 &usched_dfly_chooser, 0,
207 "Print KTR debug information for this pid");
209 /* Tuning usched_dfly - configurable through kern.usched_dfly.* */
210 #ifdef SMP
211 static int usched_dfly_smt = 0;
212 static int usched_dfly_cache_coherent = 0;
213 static int usched_dfly_weight1 = 25; /* thread's current cpu */
214 static int usched_dfly_weight2 = 0; /* synchronous peer's current cpu */
215 /* XXX can cause cpu flapping */
216 static int usched_dfly_weight3 = 10; /* number of threads on queue */
217 static int usched_dfly_pull_enable = 1; /* allow pulls */
218 #endif
219 static int usched_dfly_rrinterval = (ESTCPUFREQ + 9) / 10;
220 static int usched_dfly_decay = 8;
221 static int usched_dfly_batch_time = 10;
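/*
 * Usage note (illustrative): these knobs are exported later in this
 * file under kern.usched_dfly.*, e.g. "sysctl kern.usched_dfly.weight1=25"
 * biases the chooser toward a thread's current cpu and
 * "sysctl kern.usched_dfly.pull_enable=0" disables pulling work into
 * empty queues.
 */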
223 /* KTR debug printings */
225 KTR_INFO_MASTER(usched);
227 #if !defined(KTR_USCHED_DFLY)
228 #define KTR_USCHED_DFLY KTR_ALL
229 #endif
231 #if 0
232 KTR_INFO(KTR_USCHED_DFLY, usched, dfly_acquire_curproc_urw, 0,
233 "USCHED_DFLY(dfly_acquire_curproc in user_reseched_wanted "
234 "after release: pid %d, cpuid %d, curr_cpuid %d)",
235 pid_t pid, int cpuid, int curr);
236 KTR_INFO(KTR_USCHED_DFLY, usched, dfly_acquire_curproc_before_loop, 0,
237 "USCHED_DFLY(dfly_acquire_curproc before loop: pid %d, cpuid %d, "
238 "curr_cpuid %d)",
239 pid_t pid, int cpuid, int curr);
240 KTR_INFO(KTR_USCHED_DFLY, usched, dfly_acquire_curproc_not, 0,
241 "USCHED_DFLY(dfly_acquire_curproc couldn't acquire after "
242 "dfly_setrunqueue: pid %d, cpuid %d, curr_lp pid %d, curr_cpuid %d)",
243 pid_t pid, int cpuid, pid_t curr_pid, int curr_cpuid);
244 KTR_INFO(KTR_USCHED_DFLY, usched, dfly_acquire_curproc_switch, 0,
245 "USCHED_DFLY(dfly_acquire_curproc after lwkt_switch: pid %d, "
246 "cpuid %d, curr_cpuid %d)",
247 pid_t pid, int cpuid, int curr);
249 KTR_INFO(KTR_USCHED_DFLY, usched, dfly_release_curproc, 0,
250 "USCHED_DFLY(dfly_release_curproc before select: pid %d, "
251 "cpuid %d, curr_cpuid %d)",
252 pid_t pid, int cpuid, int curr);
254 KTR_INFO(KTR_USCHED_DFLY, usched, dfly_select_curproc, 0,
255 "USCHED_DFLY(dfly_release_curproc before select: pid %d, "
256 "cpuid %d, old_pid %d, old_cpuid %d, curr_cpuid %d)",
257 pid_t pid, int cpuid, pid_t old_pid, int old_cpuid, int curr);
259 #ifdef SMP
260 KTR_INFO(KTR_USCHED_DFLY, usched, batchy_test_false, 0,
261 "USCHED_DFLY(batchy_looser_pri_test false: pid %d, "
262 "cpuid %d, verify_mask %lu)",
263 pid_t pid, int cpuid, cpumask_t mask);
264 KTR_INFO(KTR_USCHED_DFLY, usched, batchy_test_true, 0,
265 "USCHED_DFLY(batchy_looser_pri_test true: pid %d, "
266 "cpuid %d, verify_mask %lu)",
267 pid_t pid, int cpuid, cpumask_t mask);
269 KTR_INFO(KTR_USCHED_DFLY, usched, dfly_setrunqueue_fc_smt, 0,
270 "USCHED_DFLY(dfly_setrunqueue free cpus smt: pid %d, cpuid %d, "
271 "mask %lu, curr_cpuid %d)",
272 pid_t pid, int cpuid, cpumask_t mask, int curr);
273 KTR_INFO(KTR_USCHED_DFLY, usched, dfly_setrunqueue_fc_non_smt, 0,
274 "USCHED_DFLY(dfly_setrunqueue free cpus check non_smt: pid %d, "
275 "cpuid %d, mask %lu, curr_cpuid %d)",
276 pid_t pid, int cpuid, cpumask_t mask, int curr);
277 KTR_INFO(KTR_USCHED_DFLY, usched, dfly_setrunqueue_rc, 0,
278 "USCHED_DFLY(dfly_setrunqueue running cpus check: pid %d, "
279 "cpuid %d, mask %lu, curr_cpuid %d)",
280 pid_t pid, int cpuid, cpumask_t mask, int curr);
281 KTR_INFO(KTR_USCHED_DFLY, usched, dfly_setrunqueue_found, 0,
282 "USCHED_DFLY(dfly_setrunqueue found cpu: pid %d, cpuid %d, "
283 "mask %lu, found_cpuid %d, curr_cpuid %d)",
284 pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr);
285 KTR_INFO(KTR_USCHED_DFLY, usched, dfly_setrunqueue_not_found, 0,
286 "USCHED_DFLY(dfly_setrunqueue not found cpu: pid %d, cpuid %d, "
287 "try_cpuid %d, curr_cpuid %d)",
288 pid_t pid, int cpuid, int try_cpuid, int curr);
289 KTR_INFO(KTR_USCHED_DFLY, usched, dfly_setrunqueue_found_best_cpuid, 0,
290 "USCHED_DFLY(dfly_setrunqueue found cpu: pid %d, cpuid %d, "
291 "mask %lu, found_cpuid %d, curr_cpuid %d)",
292 pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr);
293 #endif
294 #endif
296 KTR_INFO(KTR_USCHED_DFLY, usched, chooseproc, 0,
297 "USCHED_DFLY(chooseproc: pid %d, old_cpuid %d, curr_cpuid %d)",
298 pid_t pid, int old_cpuid, int curr);
299 #ifdef SMP
300 #if 0
301 KTR_INFO(KTR_USCHED_DFLY, usched, chooseproc_cc, 0,
302 "USCHED_DFLY(chooseproc_cc: pid %d, old_cpuid %d, curr_cpuid %d)",
303 pid_t pid, int old_cpuid, int curr);
304 KTR_INFO(KTR_USCHED_DFLY, usched, chooseproc_cc_not_good, 0,
305 "USCHED_DFLY(chooseproc_cc not good: pid %d, old_cpumask %lu, "
306 "sibling_mask %lu, curr_cpumask %lu)",
307 pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr);
308 KTR_INFO(KTR_USCHED_DFLY, usched, chooseproc_cc_elected, 0,
309 "USCHED_DFLY(chooseproc_cc elected: pid %d, old_cpumask %lu, "
310 "sibling_mask %lu, curr_cpumask: %lu)",
311 pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr);
312 #endif
314 KTR_INFO(KTR_USCHED_DFLY, usched, sched_thread_no_process, 0,
315 "USCHED_DFLY(sched_thread %d no process scheduled: pid %d, old_cpuid %d)",
316 int id, pid_t pid, int cpuid);
317 KTR_INFO(KTR_USCHED_DFLY, usched, sched_thread_process, 0,
318 "USCHED_DFLY(sched_thread %d process scheduled: pid %d, old_cpuid %d)",
319 int id, pid_t pid, int cpuid);
320 #if 0
321 KTR_INFO(KTR_USCHED_DFLY, usched, sched_thread_no_process_found, 0,
322 "USCHED_DFLY(sched_thread %d no process found; tmpmask %lu)",
323 int id, cpumask_t tmpmask);
324 #endif
325 #endif
328 * DFLY_ACQUIRE_CURPROC
330 * This function is called when the kernel intends to return to userland.
331 * It is responsible for making the thread the current designated userland
332 * thread for this cpu, blocking if necessary.
334 * The kernel has already depressed our LWKT priority so we must not switch
335 * until we have either assigned or disposed of the thread.
337 * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
338 * TO ANOTHER CPU! Because most of the kernel assumes that no migration will
339 * occur, this function is called only under very controlled circumstances.
341 static void
342 dfly_acquire_curproc(struct lwp *lp)
344 globaldata_t gd;
345 dfly_pcpu_t dd;
346 dfly_pcpu_t rdd;
347 thread_t td;
350 * Make sure we aren't sitting on a tsleep queue.
352 td = lp->lwp_thread;
353 crit_enter_quick(td);
354 if (td->td_flags & TDF_TSLEEPQ)
355 tsleep_remove(td);
356 dfly_recalculate_estcpu(lp);
359 * If a reschedule was requested give another thread the
360 * driver's seat.
362 if (user_resched_wanted()) {
363 clear_user_resched();
364 dfly_release_curproc(lp);
368 * Loop until we are the current user thread
370 gd = mycpu;
371 dd = &dfly_pcpu[gd->gd_cpuid];
373 do {
375 * Process any pending events and higher priority threads.
377 lwkt_yield();
380 * Become the currently scheduled user thread for this cpu
381 * if we can do so trivially.
383 * We can steal another thread's current thread designation
384 * on this cpu since, if we are running, that other thread
385 * must not be, so we can safely deschedule it.
387 if (dd->uschedcp == lp) {
389 * We are already the current lwp (hot path).
391 dd->upri = lp->lwp_priority;
392 } else if ((rdd = dfly_choose_best_queue(lp)) != dd) {
393 lwkt_deschedule(lp->lwp_thread);
394 dfly_setrunqueue_dd(rdd, lp);
395 lwkt_switch();
396 gd = mycpu;
397 dd = &dfly_pcpu[gd->gd_cpuid];
398 } else if (dd->uschedcp == NULL) {
400 * We can trivially become the current lwp.
402 atomic_set_cpumask(&dfly_curprocmask, gd->gd_cpumask);
403 dd->uschedcp = lp;
404 dd->upri = lp->lwp_priority;
405 KKASSERT(lp->lwp_qcpu == dd->cpuid);
406 } else if (dd->uschedcp && (dd->upri & ~PPQMASK) >
407 (lp->lwp_priority & ~PPQMASK)) {
409 * We can steal the current cpu's lwp designation
410 * away simply by replacing it. The other thread
411 * will stall when it tries to return to userland,
412 * possibly rescheduling elsewhere when it calls
413 * setrunqueue.
415 * It is important to do a masked test to avoid the
416 * edge case where two near-equal-priority threads
417 * are constantly interrupting each other.
419 dd->uschedcp = lp;
420 dd->upri = lp->lwp_priority;
421 KKASSERT(lp->lwp_qcpu == dd->cpuid);
422 } else {
424 * We cannot become the current lwp, place the lp
425 * on the run-queue of this or another cpu and
426 * deschedule ourselves.
428 * When we are reactivated we will have another
429 * chance.
431 * Reload after a switch or setrunqueue/switch possibly
432 * moved us to another cpu.
434 lwkt_deschedule(lp->lwp_thread);
435 dfly_setrunqueue_dd(dd, lp);
436 lwkt_switch();
437 gd = mycpu;
438 dd = &dfly_pcpu[gd->gd_cpuid];
440 } while (dd->uschedcp != lp);
442 crit_exit_quick(td);
443 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
447 * DFLY_RELEASE_CURPROC
449 * This routine detaches the current thread from the userland scheduler,
450 * usually because the thread needs to run or block in the kernel (at
451 * kernel priority) for a while.
453 * This routine is also responsible for selecting a new thread to
454 * make the current thread.
456 * NOTE: This implementation differs from the dummy example in that
457 * dfly_select_curproc() is able to select the current process, whereas
458 * dummy_select_curproc() is not able to select the current process.
459 * This means we have to NULL out uschedcp.
461 * Additionally, note that we may already be on a run queue if releasing
462 * via the lwkt_switch() in dfly_setrunqueue().
464 static void
465 dfly_release_curproc(struct lwp *lp)
467 globaldata_t gd = mycpu;
468 dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];
471 * Make sure td_wakefromcpu is defaulted. This will be overwritten
472 * by wakeup().
474 lp->lwp_thread->td_wakefromcpu = gd->gd_cpuid;
476 if (dd->uschedcp == lp) {
477 spin_lock(&dd->spin);
478 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
480 dd->uschedcp = NULL; /* don't let lp be selected */
481 dd->upri = PRIBASE_NULL;
482 atomic_clear_cpumask(&dfly_curprocmask, gd->gd_cpumask);
483 spin_unlock(&dd->spin);
484 dfly_select_curproc(gd);
489 * DFLY_SELECT_CURPROC
491 * Select a new current process for this cpu and clear any pending user
492 * reschedule request. The cpu currently has no current process.
494 * This routine is also responsible for equal-priority round-robining,
495 * typically triggered from dfly_schedulerclock(). In our dummy example
496 * all the 'user' threads are LWKT scheduled all at once and we just
497 * call lwkt_switch().
499 * The calling process is not on the queue and cannot be selected.
501 static
502 void
503 dfly_select_curproc(globaldata_t gd)
505 dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];
506 struct lwp *nlp;
507 int cpuid = gd->gd_cpuid;
509 crit_enter_gd(gd);
511 spin_lock(&dd->spin);
512 nlp = dfly_chooseproc_locked(dd, dd->uschedcp, 1);
514 if (nlp) {
515 atomic_set_cpumask(&dfly_curprocmask, CPUMASK(cpuid));
516 dd->upri = nlp->lwp_priority;
517 dd->uschedcp = nlp;
518 dd->rrcount = 0; /* reset round robin */
519 spin_unlock(&dd->spin);
520 #ifdef SMP
521 lwkt_acquire(nlp->lwp_thread);
522 #endif
523 lwkt_schedule(nlp->lwp_thread);
524 } else {
525 spin_unlock(&dd->spin);
527 crit_exit_gd(gd);
531 * Place the specified lwp on the user scheduler's run queue. This routine
532 * must be called with the thread descheduled. The lwp must be runnable.
533 * It must not be possible for anyone else to explicitly schedule this thread.
535 * The thread may be the current thread as a special case.
537 static void
538 dfly_setrunqueue(struct lwp *lp)
540 dfly_pcpu_t rdd;
543 * First validate the process LWKT state.
545 KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
546 KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0,
547 ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
548 lp->lwp_tid, lp->lwp_proc->p_flags, lp->lwp_flags));
549 KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
552 * NOTE: rdd does not necessarily represent the current cpu.
553 * Instead it represents the cpu the thread was last
554 * scheduled on.
556 rdd = &dfly_pcpu[lp->lwp_qcpu];
559 * This process is not supposed to be scheduled anywhere or assigned
560 * as the current process anywhere. Assert the condition.
562 KKASSERT(rdd->uschedcp != lp);
564 #ifndef SMP
566 * If we are not SMP we do not have a scheduler helper to kick
567 * and must directly activate the process if none are scheduled.
569 * This is really only an issue when bootstrapping init since
570 * the caller in all other cases will be a user process, and
571 * even if released (rdd->uschedcp == NULL), that process will
572 * kickstart the scheduler when it returns to user mode from
573 * the kernel.
575 * NOTE: On SMP we can't just set some other cpu's uschedcp.
577 if (rdd->uschedcp == NULL) {
578 spin_lock(&rdd->spin);
579 if (rdd->uschedcp == NULL) {
580 atomic_set_cpumask(&dfly_curprocmask, 1);
581 rdd->uschedcp = lp;
582 rdd->upri = lp->lwp_priority;
583 spin_unlock(&rdd->spin);
584 lwkt_schedule(lp->lwp_thread);
585 return;
587 spin_unlock(&rdd->spin);
589 #endif
591 #ifdef SMP
593 * Ok, we have to setrunqueue some target cpu and request a reschedule
594 * if necessary.
596 * We have to choose the best target cpu. It might not be the current
597 * target even if the current cpu has no running user thread (for
598 * example, because the current cpu might be a hyperthread and its
599 * sibling has a thread assigned).
601 * If we just forked it is most optimal to run the child on the same
602 * cpu just in case the parent decides to wait for it (thus getting
603 * off that cpu). As long as there is nothing else runnable on the
604 * cpu, that is. If we did this unconditionally a parent forking
605 * multiple children before waiting (e.g. make -j N) leaves other
606 * cpus idle that could be working.
608 if (lp->lwp_forked) {
609 lp->lwp_forked = 0;
610 if (dfly_pcpu[lp->lwp_qcpu].runqcount)
611 rdd = dfly_choose_best_queue(lp);
612 else
613 rdd = &dfly_pcpu[lp->lwp_qcpu];
614 /* dfly_wakeup_random_helper(rdd); */
615 } else {
616 rdd = dfly_choose_best_queue(lp);
617 /* rdd = &dfly_pcpu[lp->lwp_qcpu]; */
619 #endif
620 dfly_setrunqueue_dd(rdd, lp);
623 static void
624 dfly_setrunqueue_dd(dfly_pcpu_t rdd, struct lwp *lp)
626 #ifdef SMP
627 globaldata_t rgd;
630 * We might be moving the lp to another cpu's run queue, and once
631 * on the runqueue (even if it is our cpu's), another cpu can rip
632 * it away from us.
634 * TDF_MIGRATING might already be set if this is part of a
635 * remrunqueue+setrunqueue sequence.
637 if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0)
638 lwkt_giveaway(lp->lwp_thread);
640 rgd = globaldata_find(rdd->cpuid);
643 * We lose control of the lp the moment we release the spinlock
644 * after having placed it on the queue. i.e. another cpu could pick
645 * it up, or it could exit, or its priority could be further
646 * adjusted, or something like that.
648 * WARNING! rdd can point to a foreign cpu!
650 spin_lock(&rdd->spin);
651 dfly_setrunqueue_locked(rdd, lp);
653 if (rgd == mycpu) {
654 if ((rdd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
655 spin_unlock(&rdd->spin);
656 if (rdd->uschedcp == NULL) {
657 wakeup_mycpu(&rdd->helper_thread); /* XXX */
658 need_user_resched();
659 } else {
660 need_user_resched();
662 } else {
663 spin_unlock(&rdd->spin);
665 } else {
666 atomic_clear_cpumask(&dfly_rdyprocmask, rgd->gd_cpumask);
667 if ((rdd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
668 spin_unlock(&rdd->spin);
669 lwkt_send_ipiq(rgd, dfly_need_user_resched_remote,
670 NULL);
671 } else {
672 spin_unlock(&rdd->spin);
673 wakeup(&rdd->helper_thread);
676 #else
678 * Request a reschedule if appropriate.
680 spin_lock(&rdd->spin);
681 dfly_setrunqueue_locked(rdd, lp);
682 spin_unlock(&rdd->spin);
683 if ((rdd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
684 need_user_resched();
686 #endif
689 #if 0
692 * This wakes up a random helper that might have no work on its cpu to do.
693 * The idea is to improve fork/fork-exec/fork-wait/exec and similar
694 * process-spawning sequences by first scheduling the forked process
695 * on the same cpu as the parent, in case the parent is just going to
696 * wait*(). But if the parent does not wait we want another cpu to pick
697 * the forked process up ASAP.
699 * The ipi/helper-scheduling sequence typically takes a lot longer to run
700 * than a return-from-procedure-call and the parent then entering a
701 * wait*(). There's a race here that we want the parent to win ONLY if
702 * it is going to wait*().
704 * If a process sticks around for long enough normal scheduling action
705 * will move it to the right place.
707 static
708 void
709 dfly_wakeup_random_helper(dfly_pcpu_t notdd)
711 cpumask_t tmpmask;
712 cpumask_t mask;
713 int cpuid;
715 mask = dfly_rdyprocmask & ~dfly_curprocmask & smp_active_mask &
716 usched_global_cpumask & ~notdd->cpumask;
717 ++dfly_scancpu;
718 cpuid = (dfly_scancpu & 0xFFFF) % ncpus;
720 if (mask) {
721 tmpmask = ~(CPUMASK(cpuid) - 1);
722 if (mask & tmpmask)
723 cpuid = BSFCPUMASK(mask & tmpmask);
724 else
725 cpuid = BSFCPUMASK(mask);
726 atomic_clear_cpumask(&dfly_rdyprocmask, CPUMASK(cpuid));
727 wakeup(&dfly_pcpu[cpuid].helper_thread);
731 #endif
734 * This routine is called from a systimer IPI. It MUST be MP-safe and
735 * the BGL IS NOT HELD ON ENTRY. This routine is called at ESTCPUFREQ on
736 * each cpu.
738 static
739 void
740 dfly_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
742 globaldata_t gd = mycpu;
743 dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];
746 * Do we need to round-robin? We round-robin 10 times a second.
747 * This should only occur for cpu-bound batch processes.
749 if (++dd->rrcount >= usched_dfly_rrinterval) {
750 dd->rrcount = 0;
751 need_user_resched();
755 * Adjust estcpu upward using a real time equivalent calculation.
757 lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1);
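/*
 * Worked rate (illustrative): each schedclock tick adds roughly
 * ESTCPUMAX / ESTCPUFREQ, and ESTCPUFREQ ticks arrive per second, so a
 * fully cpu-bound thread saturates at ESTCPUMAX (16384) in about one
 * second unless dfly_recalculate_estcpu() decays it back down.
 */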
760 * Spinlocks also hold a critical section so there should not be
761 * any active.
763 KKASSERT(gd->gd_spinlocks_wr == 0);
765 dfly_resetpriority(lp);
769 * Called from acquire and from kern_synch's one-second timer (one of the
770 * callout helper threads) with a critical section held.
772 * Decay p_estcpu based on the number of ticks we haven't been running
773 * and our p_nice. As the load increases each process observes a larger
774 * number of idle ticks (because other processes are running in them).
775 * This observation leads to a larger correction which tends to make the
776 * system more 'batchy'.
778 * Note that no recalculation occurs for a process which sleeps and wakes
779 * up in the same tick. That is, a system doing thousands of context
780 * switches per second will still only do serious estcpu calculations
781 * ESTCPUFREQ times per second.
783 static
784 void
785 dfly_recalculate_estcpu(struct lwp *lp)
787 globaldata_t gd = mycpu;
788 dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];
789 sysclock_t cpbase;
790 sysclock_t ttlticks;
791 int estcpu;
792 int decay_factor;
795 * We have to subtract periodic to get the last schedclock
796 * timeout time, otherwise we would get the upcoming timeout.
797 * Keep in mind that a process can migrate between cpus and
798 * while the scheduler clock should be very close, boundary
799 * conditions could lead to a small negative delta.
801 cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
803 if (lp->lwp_slptime > 1) {
805 * Too much time has passed, do a coarse correction.
807 lp->lwp_estcpu = lp->lwp_estcpu >> 1;
808 dfly_resetpriority(lp);
809 lp->lwp_cpbase = cpbase;
810 lp->lwp_cpticks = 0;
811 lp->lwp_batch -= ESTCPUFREQ;
812 if (lp->lwp_batch < 0)
813 lp->lwp_batch = 0;
814 } else if (lp->lwp_cpbase != cpbase) {
816 * Adjust estcpu if we are in a different tick. Don't waste
817 * time if we are in the same tick.
819 * First calculate the number of ticks in the measurement
820 * interval. The ttlticks calculation can wind up 0 due to
821 * a bug in the handling of lwp_slptime (as yet not found),
822 * so make sure we do not get a divide by 0 panic.
824 ttlticks = (cpbase - lp->lwp_cpbase) /
825 gd->gd_schedclock.periodic;
826 if (ttlticks < 0) {
827 ttlticks = 0;
828 lp->lwp_cpbase = cpbase;
830 if (ttlticks == 0)
831 return;
832 updatepcpu(lp, lp->lwp_cpticks, ttlticks);
835 * Calculate the percentage of one cpu used factoring in ncpus
836 * and the load and adjust estcpu. Handle degenerate cases
837 * by adding 1 to runqcount.
839 * estcpu is scaled by ESTCPUMAX.
841 * runqcount is the excess number of user processes
842 * that cannot be immediately scheduled to cpus. We want
843 * to count these as running to avoid range compression
844 * in the base calculation (which is the actual percentage
845 * of one cpu used).
847 estcpu = (lp->lwp_cpticks * ESTCPUMAX) *
848 (dd->runqcount + ncpus) / (ncpus * ttlticks);
851 * If estcpu is > 50% we become more batch-like
852 * If estcpu is <= 50% we become less batch-like
854 * It takes 30 cpu seconds to traverse the entire range.
856 if (estcpu > ESTCPUMAX / 2) {
857 lp->lwp_batch += ttlticks;
858 if (lp->lwp_batch > BATCHMAX)
859 lp->lwp_batch = BATCHMAX;
860 } else {
861 lp->lwp_batch -= ttlticks;
862 if (lp->lwp_batch < 0)
863 lp->lwp_batch = 0;
866 if (usched_dfly_debug == lp->lwp_proc->p_pid) {
867 kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d",
868 lp->lwp_proc->p_pid, lp,
869 estcpu, lp->lwp_estcpu,
870 lp->lwp_batch,
871 lp->lwp_cpticks, ttlticks);
875 * Adjust lp->lwp_estcpu. The decay factor determines how
876 * quickly lwp_estcpu collapses to its realtime calculation.
877 * A slower collapse gives us a more accurate number but
878 * can cause a cpu hog to eat too much cpu before the
879 * scheduler decides to downgrade it.
881 * NOTE: p_nice is accounted for in dfly_resetpriority(),
882 * and not here, but we must still ensure that a
883 * cpu-bound nice -20 process does not completely
884 * override a cpu-bound nice +20 process.
886 * NOTE: We must use ESTCPULIM() here to deal with any
887 * overshoot.
889 decay_factor = usched_dfly_decay;
890 if (decay_factor < 1)
891 decay_factor = 1;
892 if (decay_factor > 1024)
893 decay_factor = 1024;
895 lp->lwp_estcpu = ESTCPULIM(
896 (lp->lwp_estcpu * decay_factor + estcpu) /
897 (decay_factor + 1));
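/*
 * Worked example (illustrative numbers): with the default decay_factor
 * of 8, an lwp_estcpu of 8000 and a measured estcpu of 2000 blend to
 * ESTCPULIM((8000 * 8 + 2000) / 9) == 7333, i.e. lwp_estcpu moves
 * toward the measured value by about 1/9 of the difference per
 * recalculation.
 */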
899 if (usched_dfly_debug == lp->lwp_proc->p_pid)
900 kprintf(" finalestcpu %d\n", lp->lwp_estcpu);
901 dfly_resetpriority(lp);
902 lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic;
903 lp->lwp_cpticks = 0;
908 * Compute the priority of a process when running in user mode.
909 * Arrange to reschedule if the resulting priority is better
910 * than that of the current process.
912 * This routine may be called with any process.
914 * This routine is called by fork1() for initial setup with the process
915 * off the run queue, and also may be called normally with the process on or
916 * off the run queue.
918 static void
919 dfly_resetpriority(struct lwp *lp)
921 dfly_pcpu_t rdd;
922 int newpriority;
923 u_short newrqtype;
924 int rcpu;
925 int checkpri;
926 int estcpu;
928 crit_enter();
931 * Lock the scheduler that (lp) belongs to. This can be on a different
932 * cpu. Handle races. This loop breaks out with the appropriate
933 * rdd locked.
935 for (;;) {
936 rcpu = lp->lwp_qcpu;
937 cpu_ccfence();
938 rdd = &dfly_pcpu[rcpu];
939 spin_lock(&rdd->spin);
940 if (rcpu == lp->lwp_qcpu)
941 break;
942 spin_unlock(&rdd->spin);
946 * Calculate the new priority and queue type
948 newrqtype = lp->lwp_rtprio.type;
950 switch(newrqtype) {
951 case RTP_PRIO_REALTIME:
952 case RTP_PRIO_FIFO:
953 newpriority = PRIBASE_REALTIME +
954 (lp->lwp_rtprio.prio & PRIMASK);
955 break;
956 case RTP_PRIO_NORMAL:
958 * Detune estcpu based on batchiness. lwp_batch ranges
959 * from 0 to BATCHMAX. Limit estcpu for the sake of
960 * the priority calculation to between 50% and 100%.
962 estcpu = lp->lwp_estcpu * (lp->lwp_batch + BATCHMAX) /
963 (BATCHMAX * 2);
966 * p_nice piece Adds (0-40) * 2 0-80
967 * estcpu Adds 16384 * 4 / 512 0-128
969 newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
970 newpriority += estcpu * PPQ / ESTCPUPPQ;
971 newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ /
972 NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
973 newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK);
974 break;
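/*
 * Worked example (illustrative numbers): nice 0 and a detuned
 * estcpu of 8192 give (0 - PRIO_MIN) * PPQ / NICEPPQ == 40 plus
 * 8192 * PPQ / ESTCPUPPQ == 64, i.e. 104, which the scale factor
 * 128 / 210 reduces to 63; lwp_rqindex is later computed as
 * 63 / PPQ == 15 in the normal class.
 */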
975 case RTP_PRIO_IDLE:
976 newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
977 break;
978 case RTP_PRIO_THREAD:
979 newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
980 break;
981 default:
982 panic("Bad RTP_PRIO %d", newrqtype);
983 /* NOT REACHED */
987 * The newpriority incorporates the queue type so do a simple masked
988 * check to determine if the process has moved to another queue. If
989 * it has, and it is currently on a run queue, then move it.
991 * Since uload is ~PPQMASK masked, no modifications are necessary if
992 * we end up in the same run queue.
994 if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
995 int delta_uload;
998 * uload can change, calculate the adjustment to reduce
999 * edge cases since choosers scan the cpu topology without
1000 * locks.
1002 if (lp->lwp_mpflags & LWP_MP_ULOAD) {
1003 delta_uload =
1004 -((lp->lwp_priority & ~PPQMASK) & PRIMASK) +
1005 ((newpriority & ~PPQMASK) & PRIMASK);
1006 atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].uload,
1007 delta_uload);
1009 if (lp->lwp_mpflags & LWP_MP_ONRUNQ) {
1010 dfly_remrunqueue_locked(rdd, lp);
1011 lp->lwp_priority = newpriority;
1012 lp->lwp_rqtype = newrqtype;
1013 lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
1014 dfly_setrunqueue_locked(rdd, lp);
1015 checkpri = 1;
1016 } else {
1017 lp->lwp_priority = newpriority;
1018 lp->lwp_rqtype = newrqtype;
1019 lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
1020 checkpri = 0;
1022 } else {
1024 * In the same PPQ, uload cannot change.
1026 lp->lwp_priority = newpriority;
1027 checkpri = 1;
1028 rcpu = -1;
1032 * Determine if we need to reschedule the target cpu. This only
1033 * occurs if the LWP is already on a scheduler queue, which means
1034 * that idle cpu notification has already occurred. At most we
1035 * need only issue a need_user_resched() on the appropriate cpu.
1037 * The LWP may be owned by a CPU different from the current one,
1038 * in which case dd->uschedcp may be modified without an MP lock
1039 * or a spinlock held. The worst that happens is that the code
1040 * below causes a spurious need_user_resched() on the target CPU
1041 * and dd->pri to be wrong for a short period of time, both of
1042 * which are harmless.
1044 * If checkpri is 0 we are adjusting the priority of the current
1045 * process, possibly higher (less desirable), so ignore the upri
1046 * check which will fail in that case.
1048 if (rcpu >= 0) {
1049 if ((dfly_rdyprocmask & CPUMASK(rcpu)) &&
1050 (checkpri == 0 ||
1051 (rdd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) {
1052 #ifdef SMP
1053 if (rcpu == mycpu->gd_cpuid) {
1054 spin_unlock(&rdd->spin);
1055 need_user_resched();
1056 } else {
1057 atomic_clear_cpumask(&dfly_rdyprocmask,
1058 CPUMASK(rcpu));
1059 spin_unlock(&rdd->spin);
1060 lwkt_send_ipiq(globaldata_find(rcpu),
1061 dfly_need_user_resched_remote,
1062 NULL);
1064 #else
1065 spin_unlock(&rdd->spin);
1066 need_user_resched();
1067 #endif
1068 } else {
1069 spin_unlock(&rdd->spin);
1071 } else {
1072 spin_unlock(&rdd->spin);
1074 crit_exit();
1077 static
1078 void
1079 dfly_yield(struct lwp *lp)
1081 #if 0
1082 /* FUTURE (or something similar) */
1083 switch(lp->lwp_rqtype) {
1084 case RTP_PRIO_NORMAL:
1085 lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
1086 break;
1087 default:
1088 break;
1090 #endif
1091 need_user_resched();
1095 * Called from fork1() when a new child process is being created.
1097 * Give the child process an initial estcpu that is more batchy than
1098 * its parent and dock the parent for the fork (but do not
1099 * reschedule the parent). This comprises the main part of our batch
1100 * detection heuristic for both parallel forking and sequential execs.
1102 * XXX lwp should be "spawning" instead of "forking"
1104 static void
1105 dfly_forking(struct lwp *plp, struct lwp *lp)
1108 * Put the child 4 queue slots (out of 32) higher than the parent
1109 * (less desirable than the parent).
1111 lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4);
1112 lp->lwp_forked = 1;
1115 * The batch status of children always starts out centerline
1116 * and will inch-up or inch-down as appropriate. It takes roughly
1117 * ~15 seconds of >50% cpu to hit the limit.
1119 lp->lwp_batch = BATCHMAX / 2;
1122 * Dock the parent a cost for the fork, protecting us from fork
1123 * bombs. If the parent is forking quickly make the child more
1124 * batchy.
1126 plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ / 16);
1130 * Called when a lwp is being removed from this scheduler, typically
1131 * during lwp_exit().
1133 static void
1134 dfly_exiting(struct lwp *lp, struct proc *child_proc)
1136 dfly_pcpu_t dd = &dfly_pcpu[lp->lwp_qcpu];
1138 if (lp->lwp_mpflags & LWP_MP_ULOAD) {
1139 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
1140 atomic_add_int(&dd->uload,
1141 -((lp->lwp_priority & ~PPQMASK) & PRIMASK));
1146 * This function cannot block in any way
1148 static void
1149 dfly_uload_update(struct lwp *lp)
1151 dfly_pcpu_t dd = &dfly_pcpu[lp->lwp_qcpu];
1153 if (lp->lwp_thread->td_flags & TDF_RUNQ) {
1154 if ((lp->lwp_mpflags & LWP_MP_ULOAD) == 0) {
1155 atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
1156 atomic_add_int(&dd->uload,
1157 ((lp->lwp_priority & ~PPQMASK) & PRIMASK));
1159 } else {
1160 if (lp->lwp_mpflags & LWP_MP_ULOAD) {
1161 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
1162 atomic_add_int(&dd->uload,
1163 -((lp->lwp_priority & ~PPQMASK) & PRIMASK));
1169 * chooseproc() is called when a cpu needs a user process to LWKT schedule,
1170 * it selects a user process and returns it. If chklp is non-NULL and chklp
1171 * has a better or equal priority than the process that would otherwise be
1172 * chosen, NULL is returned.
1174 * Until we fix the RUNQ code the chklp test has to be strict or we may
1175 * bounce between processes trying to acquire the current process designation.
1177 * Must be called with dd->spin locked. The spinlock is left intact through
1178 * the entire routine.
1180 * If chklp is NULL this function will dive other cpus' queues looking
1181 * for work if the current queue is empty.
1183 static
1184 struct lwp *
1185 dfly_chooseproc_locked(dfly_pcpu_t dd, struct lwp *chklp, int isremote)
1187 #ifdef SMP
1188 dfly_pcpu_t xdd;
1189 #endif
1190 struct lwp *lp;
1191 struct rq *q;
1192 u_int32_t *which, *which2;
1193 u_int32_t pri;
1194 u_int32_t rtqbits;
1195 u_int32_t tsqbits;
1196 u_int32_t idqbits;
1198 rtqbits = dd->rtqueuebits;
1199 tsqbits = dd->queuebits;
1200 idqbits = dd->idqueuebits;
1202 if (rtqbits) {
1203 pri = bsfl(rtqbits);
1204 q = &dd->rtqueues[pri];
1205 which = &dd->rtqueuebits;
1206 which2 = &rtqbits;
1207 } else if (tsqbits) {
1208 pri = bsfl(tsqbits);
1209 q = &dd->queues[pri];
1210 which = &dd->queuebits;
1211 which2 = &tsqbits;
1212 } else if (idqbits) {
1213 pri = bsfl(idqbits);
1214 q = &dd->idqueues[pri];
1215 which = &dd->idqueuebits;
1216 which2 = &idqbits;
1217 } else
1218 #ifdef SMP
1219 if (isremote || usched_dfly_pull_enable == 0) {
1221 * Queue is empty, disallow remote->remote recursion and
1222 * do not pull if threads are active.
1224 return (NULL);
1225 } else {
1227 * Pull a runnable thread from a remote run queue. We have
1228 * to adjust qcpu and uload manually because the lp we return
1229 * might be assigned directly to uschedcp (setrunqueue might
1230 * not be called).
1232 xdd = dfly_choose_worst_queue(dd);
1233 if (xdd && xdd != dd && spin_trylock(&xdd->spin)) {
1234 lp = dfly_chooseproc_locked(xdd, NULL, 1);
1235 if (lp) {
1236 if (lp->lwp_mpflags & LWP_MP_ULOAD) {
1237 atomic_add_int(&xdd->uload,
1238 -((lp->lwp_priority & ~PPQMASK) &
1239 PRIMASK));
1241 lp->lwp_qcpu = dd->cpuid;
1242 atomic_add_int(&dd->uload,
1243 ((lp->lwp_priority & ~PPQMASK) & PRIMASK));
1244 atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
1246 spin_unlock(&xdd->spin);
1247 } else {
1248 lp = NULL;
1250 return (lp);
1252 #else
1254 return NULL;
1256 #endif
1257 lp = TAILQ_FIRST(q);
1258 KASSERT(lp, ("chooseproc: no lwp on busy queue"));
1261 * If the passed lwp <chklp> is reasonably close to the selected
1262 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
1264 * Note that we must error on the side of <chklp> to avoid bouncing
1265 * between threads in the acquire code.
1267 if (chklp) {
1268 if (chklp->lwp_priority < lp->lwp_priority + PPQ)
1269 return(NULL);
1272 KTR_COND_LOG(usched_chooseproc,
1273 lp->lwp_proc->p_pid == usched_dfly_pid_debug,
1274 lp->lwp_proc->p_pid,
1275 lp->lwp_thread->td_gd->gd_cpuid,
1276 mycpu->gd_cpuid);
1278 TAILQ_REMOVE(q, lp, lwp_procq);
1279 --dd->runqcount;
1280 if (TAILQ_EMPTY(q))
1281 *which &= ~(1 << pri);
1282 KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
1283 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1285 return lp;
1288 #ifdef SMP
1291 * USED TO PUSH RUNNABLE LWPS TO THE LEAST LOADED CPU.
1293 * Choose a cpu node to schedule lp on, hopefully nearby its current
1294 * node. We give the current node a modest advantage for obvious reasons.
1296 * We also give the node the thread was woken up FROM a slight advantage
1297 * in order to try to schedule paired threads which synchronize/block waiting
1298 * for each other fairly close to each other. Similarly in a network setting
1299 * this feature will also attempt to place a user process near the kernel
1300 * protocol thread that is feeding it data. THIS IS A CRITICAL PART of the
1301 * algorithm as it heuristically groups synchronizing processes for locality
1302 * of reference in multi-socket systems.
1304 * The caller will normally dfly_setrunqueue() lp on the returned queue.
1306 * When the topology is known choose a cpu whose group has, in aggregate,
1307 * the lowest weighted load.
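/*
 * Illustrative sketch of the per-node weighting used below; the helper
 * name node_weighted_load() is hypothetical and simply restates the
 * loop body: aggregate uload plus queue depth is averaged over the
 * node's cpus, then the node holding the thread's current cpu
 * (weight1) or its wakeup partner's cpu (weight2) gets a modest
 * advantage.
 */
#if 0
static int
node_weighted_load(int uload_sum, int runq_sum, int node_ncpus,
		   int is_cur_node, int is_wakefrom_node)
{
	int load;

	load = (uload_sum + runq_sum * usched_dfly_weight3) / node_ncpus;
	if (is_cur_node)
		load -= usched_dfly_weight1;
	else if (is_wakefrom_node)
		load -= usched_dfly_weight2;
	return (load);
}
#endif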
1309 static
1310 dfly_pcpu_t
1311 dfly_choose_best_queue(struct lwp *lp)
1313 cpumask_t mask;
1314 cpu_node_t *cpup;
1315 cpu_node_t *cpun;
1316 cpu_node_t *cpub;
1317 dfly_pcpu_t dd1 = &dfly_pcpu[lp->lwp_qcpu];
1318 dfly_pcpu_t dd2 = &dfly_pcpu[lp->lwp_thread->td_wakefromcpu];
1319 dfly_pcpu_t rdd;
1320 int cpuid;
1321 int n;
1322 int count;
1323 int load;
1324 int lowest_load;
1327 * When the topology is unknown choose a random cpu that is hopefully
1328 * idle.
1330 if (dd1->cpunode == NULL)
1331 return (dfly_choose_queue_simple(dd1, lp));
1334 * When the topology is known choose a cpu whose group has, in
1335 * aggregate, the lowest weighted load.
1337 cpup = root_cpu_node;
1338 rdd = dd1;
1340 while (cpup) {
1342 * Degenerate case super-root
1344 if (cpup->child_node && cpup->child_no == 1) {
1345 cpup = cpup->child_node;
1346 continue;
1350 * Terminal cpunode
1352 if (cpup->child_node == NULL) {
1353 rdd = &dfly_pcpu[BSFCPUMASK(cpup->members)];
1354 break;
1357 cpub = NULL;
1358 lowest_load = 0x7FFFFFFF;
1360 for (n = 0; n < cpup->child_no; ++n) {
1362 * Accumulate load information for all cpus
1363 * which are members of this node.
1365 cpun = &cpup->child_node[n];
1366 mask = cpun->members & usched_global_cpumask &
1367 smp_active_mask & lp->lwp_cpumask;
1368 if (mask == 0)
1369 continue;
1372 * Compensate if the lp is already accounted for in
1373 * the aggregate uload for this mask set. We want
1374 * to calculate the loads as if lp was not present.
1376 if ((lp->lwp_mpflags & LWP_MP_ULOAD) &&
1377 CPUMASK(lp->lwp_qcpu) & mask) {
1378 load = -((lp->lwp_priority & ~PPQMASK) &
1379 PRIMASK);
1380 } else {
1381 load = 0;
1384 count = 0;
1385 while (mask) {
1386 cpuid = BSFCPUMASK(mask);
1387 load += dfly_pcpu[cpuid].uload;
1388 load += dfly_pcpu[cpuid].runqcount *
1389 usched_dfly_weight3;
1390 mask &= ~CPUMASK(cpuid);
1391 ++count;
1393 load /= count;
1396 * Give a slight advantage to the cpu groups (lp)
1397 * belongs to, and a very slight advantage to the
1398 * cpu groups our synchronous partner belongs to.
1400 if (cpun->members & dd1->cpumask)
1401 load -= usched_dfly_weight1;
1402 else if (cpun->members & dd2->cpumask)
1403 load -= usched_dfly_weight2;
1406 * Calculate the best load
1408 if (cpub == NULL || lowest_load > load ||
1409 (lowest_load == load &&
1410 (cpun->members & dd1->cpumask))
1412 lowest_load = load;
1413 cpub = cpun;
1416 cpup = cpub;
1418 if (usched_dfly_chooser)
1419 kprintf("lp %02d->%02d %s\n",
1420 lp->lwp_qcpu, rdd->cpuid, lp->lwp_proc->p_comm);
1421 return (rdd);
1425 * USED TO PULL RUNNABLE LWPS FROM THE MOST LOADED CPU.
1427 * Choose the worst queue close to dd's cpu node with a non-empty runq.
1429 * This is used by the thread chooser when the current cpu's queues are
1430 * empty to steal a thread from another cpu's queue. We want to offload
1431 * the most heavily-loaded queue.
1433 static
1434 dfly_pcpu_t
1435 dfly_choose_worst_queue(dfly_pcpu_t dd)
1437 cpumask_t mask;
1438 cpu_node_t *cpup;
1439 cpu_node_t *cpun;
1440 cpu_node_t *cpub;
1441 dfly_pcpu_t rdd;
1442 int cpuid;
1443 int n;
1444 int count;
1445 int load;
1446 int highest_load;
1449 * When the topology is unknown there is no remote queue to pull
1450 * from; return NULL.
1452 if (dd->cpunode == NULL) {
1453 return (NULL);
1457 * When the topology is known choose a cpu whose group has, in
1458 * aggregate, the highest weighted load.
1460 cpup = root_cpu_node;
1461 rdd = dd;
1462 while (cpup) {
1464 * Degenerate case super-root
1466 if (cpup->child_node && cpup->child_no == 1) {
1467 cpup = cpup->child_node;
1468 continue;
1472 * Terminal cpunode
1474 if (cpup->child_node == NULL) {
1475 rdd = &dfly_pcpu[BSFCPUMASK(cpup->members)];
1476 break;
1479 cpub = NULL;
1480 highest_load = 0;
1482 for (n = 0; n < cpup->child_no; ++n) {
1484 * Accumulate load information for all cpus
1485 * which are members of this node.
1487 cpun = &cpup->child_node[n];
1488 mask = cpun->members & usched_global_cpumask &
1489 smp_active_mask;
1490 if (mask == 0)
1491 continue;
1492 count = 0;
1493 load = 0;
1494 while (mask) {
1495 cpuid = BSFCPUMASK(mask);
1496 load += dfly_pcpu[cpuid].uload;
1497 load += dfly_pcpu[cpuid].runqcount *
1498 usched_dfly_weight3;
1499 mask &= ~CPUMASK(cpuid);
1500 ++count;
1502 load /= count;
1505 * Give a slight advantage to nearby cpus.
1507 if (cpun->members & dd->cpumask)
1508 load += usched_dfly_weight1;
1511 * The best candidate is the one with the worst
1512 * (highest) load. Prefer candidates that are
1513 * closer to our cpu.
1515 if (cpub == NULL || highest_load < load ||
1516 (highest_load == load &&
1517 (cpun->members & dd->cpumask))
1519 highest_load = load;
1520 cpub = cpun;
1523 cpup = cpub;
1525 return (rdd);
1528 static
1529 dfly_pcpu_t
1530 dfly_choose_queue_simple(dfly_pcpu_t dd, struct lwp *lp)
1532 dfly_pcpu_t rdd;
1533 cpumask_t tmpmask;
1534 cpumask_t mask;
1535 int cpuid;
1538 * Fall back to the original heuristic: select a random cpu,
1539 * first checking cpus not currently running a user thread.
1541 ++dfly_scancpu;
1542 cpuid = (dfly_scancpu & 0xFFFF) % ncpus;
1543 mask = ~dfly_curprocmask & dfly_rdyprocmask & lp->lwp_cpumask &
1544 smp_active_mask & usched_global_cpumask;
1546 while (mask) {
1547 tmpmask = ~(CPUMASK(cpuid) - 1);
1548 if (mask & tmpmask)
1549 cpuid = BSFCPUMASK(mask & tmpmask);
1550 else
1551 cpuid = BSFCPUMASK(mask);
1552 rdd = &dfly_pcpu[cpuid];
1554 if ((rdd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK))
1555 goto found;
1556 mask &= ~CPUMASK(cpuid);
1560 * Then cpus which might have a currently running lp
1562 cpuid = (dfly_scancpu & 0xFFFF) % ncpus;
1563 mask = dfly_curprocmask & dfly_rdyprocmask &
1564 lp->lwp_cpumask & smp_active_mask & usched_global_cpumask;
1566 while (mask) {
1567 tmpmask = ~(CPUMASK(cpuid) - 1);
1568 if (mask & tmpmask)
1569 cpuid = BSFCPUMASK(mask & tmpmask);
1570 else
1571 cpuid = BSFCPUMASK(mask);
1572 rdd = &dfly_pcpu[cpuid];
1574 if ((rdd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
1575 goto found;
1576 mask &= ~CPUMASK(cpuid);
1580 * If we cannot find a suitable cpu we reload from dfly_scancpu
1581 * and round-robin. Other cpus will pick up as they release their
1582 * current lwps or become ready.
1584 * Avoid a degenerate system lockup case if usched_global_cpumask
1585 * is set to 0 or otherwise does not cover lwp_cpumask.
1587 * We only kick the target helper thread in this case, we do not
1588 * set the user resched flag because
1590 cpuid = (dfly_scancpu & 0xFFFF) % ncpus;
1591 if ((CPUMASK(cpuid) & usched_global_cpumask) == 0)
1592 cpuid = 0;
1593 rdd = &dfly_pcpu[cpuid];
1594 found:
1595 return (rdd);
1598 static
1599 void
1600 dfly_need_user_resched_remote(void *dummy)
1602 globaldata_t gd = mycpu;
1603 dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];
1605 need_user_resched();
1607 /* Call wakeup_mycpu to avoid sending IPIs to other CPUs */
1608 wakeup_mycpu(&dd->helper_thread);
1611 #endif
1614 * dfly_remrunqueue_locked() removes a given process from the run queue
1615 * that it is on, clearing the queue busy bit if it becomes empty.
1617 * Note that the user process scheduler is different from the LWKT scheduler.
1618 * The user process scheduler only manages user processes but it uses LWKT
1619 * underneath, and a user process operating in the kernel will often be
1620 * 'released' from our management.
1622 * uload is NOT adjusted here. It is only adjusted if the lwkt_thread goes
1623 * to sleep or the lwp is moved to a different runq.
1625 static void
1626 dfly_remrunqueue_locked(dfly_pcpu_t rdd, struct lwp *lp)
1628 struct rq *q;
1629 u_int32_t *which;
1630 u_int8_t pri;
1632 KKASSERT(lp->lwp_mpflags & LWP_MP_ONRUNQ);
1633 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1634 --rdd->runqcount;
1635 /*rdd->uload -= (lp->lwp_priority & ~PPQMASK) & PRIMASK;*/
1636 KKASSERT(rdd->runqcount >= 0);
1638 pri = lp->lwp_rqindex;
1639 switch(lp->lwp_rqtype) {
1640 case RTP_PRIO_NORMAL:
1641 q = &rdd->queues[pri];
1642 which = &rdd->queuebits;
1643 break;
1644 case RTP_PRIO_REALTIME:
1645 case RTP_PRIO_FIFO:
1646 q = &rdd->rtqueues[pri];
1647 which = &rdd->rtqueuebits;
1648 break;
1649 case RTP_PRIO_IDLE:
1650 q = &rdd->idqueues[pri];
1651 which = &rdd->idqueuebits;
1652 break;
1653 default:
1654 panic("remrunqueue: invalid rtprio type");
1655 /* NOT REACHED */
1657 TAILQ_REMOVE(q, lp, lwp_procq);
1658 if (TAILQ_EMPTY(q)) {
1659 KASSERT((*which & (1 << pri)) != 0,
1660 ("remrunqueue: remove from empty queue"));
1661 *which &= ~(1 << pri);
1666 * dfly_setrunqueue_locked()
1668 * Add a process whose rqtype and rqindex had previously been calculated
1669 * onto the appropriate run queue. Determine if the addition requires
1670 * a reschedule on a cpu and return the cpuid or -1.
1672 * NOTE: Lower priorities are better priorities.
1674 * NOTE ON ULOAD: This variable specifies the aggregate load on a cpu, the
1675 * sum of the rough lwp_priority for all running and runnable
1676 * processes. Lower priority processes (higher lwp_priority
1677 * values) actually DO count as more load, not less, because
1678 * these are the programs which require the most care with
1679 * regards to cpu selection.
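/*
 * Illustrative arithmetic: each runnable lwp contributes
 * (lwp_priority & ~PPQMASK) & PRIMASK to its cpu's uload, so a batch
 * thread near the bottom of the normal range adds close to PRIMASK
 * (127) while an interactive thread near the top adds close to 0;
 * cpus running less desirable threads therefore report more load.
 */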
1681 static void
1682 dfly_setrunqueue_locked(dfly_pcpu_t rdd, struct lwp *lp)
1684 struct rq *q;
1685 u_int32_t *which;
1686 int pri;
1688 if (lp->lwp_qcpu != rdd->cpuid) {
1689 if (lp->lwp_mpflags & LWP_MP_ULOAD) {
1690 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
1691 atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].uload,
1692 -((lp->lwp_priority & ~PPQMASK) & PRIMASK));
1694 lp->lwp_qcpu = rdd->cpuid;
1697 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
1698 atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1699 ++rdd->runqcount;
1700 if ((lp->lwp_mpflags & LWP_MP_ULOAD) == 0) {
1701 atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
1702 atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].uload,
1703 (lp->lwp_priority & ~PPQMASK) & PRIMASK);
1706 pri = lp->lwp_rqindex;
1708 switch(lp->lwp_rqtype) {
1709 case RTP_PRIO_NORMAL:
1710 q = &rdd->queues[pri];
1711 which = &rdd->queuebits;
1712 break;
1713 case RTP_PRIO_REALTIME:
1714 case RTP_PRIO_FIFO:
1715 q = &rdd->rtqueues[pri];
1716 which = &rdd->rtqueuebits;
1717 break;
1718 case RTP_PRIO_IDLE:
1719 q = &rdd->idqueues[pri];
1720 which = &rdd->idqueuebits;
1721 break;
1722 default:
1723 panic("setrunqueue: invalid rtprio type");
1724 /* NOT REACHED */
1728 * Add to the correct queue and set the appropriate bit. If no
1729 * lower priority (i.e. better) processes are in the queue then
1730 * we want a reschedule, calculate the best cpu for the job.
1732 * Always run reschedules on the LWP's original cpu.
1734 TAILQ_INSERT_TAIL(q, lp, lwp_procq);
1735 *which |= 1 << pri;
1738 #ifdef SMP
1741 * For SMP systems a user scheduler helper thread is created for each
1742 * cpu and is used to allow one cpu to wakeup another for the purposes of
1743 * scheduling userland threads from setrunqueue().
1745 * UP systems do not need the helper since there is only one cpu.
1747 * We can't use the idle thread for this because we might block.
1748 * Additionally, doing things this way allows us to HLT idle cpus
1749 * on MP systems.
1751 static void
1752 dfly_helper_thread(void *dummy)
1754 globaldata_t gd;
1755 dfly_pcpu_t dd;
1756 struct lwp *nlp;
1757 cpumask_t mask;
1758 int cpuid;
1760 gd = mycpu;
1761 cpuid = gd->gd_cpuid; /* doesn't change */
1762 mask = gd->gd_cpumask; /* doesn't change */
1763 dd = &dfly_pcpu[cpuid];
1766 * Since we only want to be woken up when no user processes
1767 * are scheduled on a cpu, run at an ultra low priority.
1769 lwkt_setpri_self(TDPRI_USER_SCHEDULER);
1771 tsleep(&dd->helper_thread, 0, "schslp", hz);
1773 for (;;) {
1775 * We use the LWKT deschedule-interlock trick to avoid racing
1776 * dfly_rdyprocmask. This means we cannot block through to the
1777 * manual lwkt_switch() call we make below.
1779 crit_enter_gd(gd);
1780 tsleep_interlock(&dd->helper_thread, 0);
1782 spin_lock(&dd->spin);
1784 atomic_set_cpumask(&dfly_rdyprocmask, mask);
1785 clear_user_resched(); /* This satisfies the reschedule request */
1786 dd->rrcount = 0; /* Reset the round-robin counter */
1788 if ((dfly_curprocmask & mask) == 0) {
1790 * No thread is currently scheduled.
1792 KKASSERT(dd->uschedcp == NULL);
1793 if ((nlp = dfly_chooseproc_locked(dd, NULL, 0)) != NULL) {
1794 KTR_COND_LOG(usched_sched_thread_no_process,
1795 nlp->lwp_proc->p_pid == usched_dfly_pid_debug,
1796 gd->gd_cpuid,
1797 nlp->lwp_proc->p_pid,
1798 nlp->lwp_thread->td_gd->gd_cpuid);
1800 atomic_set_cpumask(&dfly_curprocmask, mask);
1801 dd->upri = nlp->lwp_priority;
1802 dd->uschedcp = nlp;
1803 dd->rrcount = 0; /* reset round robin */
1804 spin_unlock(&dd->spin);
1805 lwkt_acquire(nlp->lwp_thread);
1806 lwkt_schedule(nlp->lwp_thread);
1807 } else {
1808 spin_unlock(&dd->spin);
1810 } else if (dd->runqcount) {
1812 * Possibly find a better process to schedule.
1814 nlp = dfly_chooseproc_locked(dd, dd->uschedcp, 0);
1815 if (nlp) {
1816 KTR_COND_LOG(usched_sched_thread_process,
1817 nlp->lwp_proc->p_pid == usched_dfly_pid_debug,
1818 gd->gd_cpuid,
1819 nlp->lwp_proc->p_pid,
1820 nlp->lwp_thread->td_gd->gd_cpuid);
1822 dd->upri = nlp->lwp_priority;
1823 dd->uschedcp = nlp;
1824 dd->rrcount = 0; /* reset round robin */
1825 spin_unlock(&dd->spin);
1826 lwkt_acquire(nlp->lwp_thread);
1827 lwkt_schedule(nlp->lwp_thread);
1828 } else {
1830 * Leave the thread on our run queue. Another
1831 * scheduler will try to pull it later.
1833 spin_unlock(&dd->spin);
1835 } else {
1837 * The runq is empty.
1839 spin_unlock(&dd->spin);
1843 * We're descheduled unless someone scheduled us. Switch away.
1844 * Exiting the critical section will cause splz() to be called
1845 * for us if interrupts and such are pending.
1847 crit_exit_gd(gd);
1848 tsleep(&dd->helper_thread, PINTERLOCKED, "schslp", hz);
1852 #if 0
1853 static int
1854 sysctl_usched_dfly_stick_to_level(SYSCTL_HANDLER_ARGS)
1856 int error, new_val;
1858 new_val = usched_dfly_stick_to_level;
1860 error = sysctl_handle_int(oidp, &new_val, 0, req);
1861 if (error != 0 || req->newptr == NULL)
1862 return (error);
1863 if (new_val > cpu_topology_levels_number - 1 || new_val < 0)
1864 return (EINVAL);
1865 usched_dfly_stick_to_level = new_val;
1866 return (0);
1868 #endif
1871 * Setup our scheduler helpers. Note that curprocmask bit 0 has already
1872 * been cleared by rqinit() and we should not mess with it further.
1874 static void
1875 dfly_helper_thread_cpu_init(void)
1877 int i;
1878 int j;
1879 int cpuid;
1880 int smt_not_supported = 0;
1881 int cache_coherent_not_supported = 0;
1883 if (bootverbose)
1884 kprintf("Start scheduler helpers on cpus:\n");
1886 sysctl_ctx_init(&usched_dfly_sysctl_ctx);
1887 usched_dfly_sysctl_tree =
1888 SYSCTL_ADD_NODE(&usched_dfly_sysctl_ctx,
1889 SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
1890 "usched_dfly", CTLFLAG_RD, 0, "");
1892 for (i = 0; i < ncpus; ++i) {
1893 dfly_pcpu_t dd = &dfly_pcpu[i];
1894 cpumask_t mask = CPUMASK(i);
1896 if ((mask & smp_active_mask) == 0)
1897 continue;
1899 spin_init(&dd->spin);
1900 dd->cpunode = get_cpu_node_by_cpuid(i);
1901 dd->cpuid = i;
1902 dd->cpumask = CPUMASK(i);
1903 for (j = 0; j < NQS; j++) {
1904 TAILQ_INIT(&dd->queues[j]);
1905 TAILQ_INIT(&dd->rtqueues[j]);
1906 TAILQ_INIT(&dd->idqueues[j]);
1908 atomic_clear_cpumask(&dfly_curprocmask, 1);
1910 if (dd->cpunode == NULL) {
1911 smt_not_supported = 1;
1912 cache_coherent_not_supported = 1;
1913 if (bootverbose)
1914 kprintf ("\tcpu%d - WARNING: No CPU NODE "
1915 "found for cpu\n", i);
1916 } else {
1917 switch (dd->cpunode->type) {
1918 case THREAD_LEVEL:
1919 if (bootverbose)
1920 kprintf ("\tcpu%d - HyperThreading "
1921 "available. Core siblings: ",
1923 break;
1924 case CORE_LEVEL:
1925 smt_not_supported = 1;
1927 if (bootverbose)
1928 kprintf ("\tcpu%d - No HT available, "
1929 "multi-core/physical "
1930 "cpu. Physical siblings: ",
1932 break;
1933 case CHIP_LEVEL:
1934 smt_not_supported = 1;
1936 if (bootverbose)
1937 kprintf ("\tcpu%d - No HT available, "
1938 "single-core/physical cpu. "
1939 "Package Siblings: ",
1941 break;
1942 default:
1943 /* Let's go for safe defaults here */
1944 smt_not_supported = 1;
1945 cache_coherent_not_supported = 1;
1946 if (bootverbose)
1947 kprintf ("\tcpu%d - Unknown cpunode->"
1948 "type=%u. Siblings: ",
1950 (u_int)dd->cpunode->type);
1951 break;
1954 if (bootverbose) {
1955 if (dd->cpunode->parent_node != NULL) {
1956 CPUSET_FOREACH(cpuid, dd->cpunode->parent_node->members)
1957 kprintf("cpu%d ", cpuid);
1958 kprintf("\n");
1959 } else {
1960 kprintf(" no siblings\n");
1965 lwkt_create(dfly_helper_thread, NULL, NULL, &dd->helper_thread,
1966 0, i, "usched %d", i);
1969 * Allow user scheduling on the target cpu. cpu #0 has already
1970 * been enabled in rqinit().
1972 if (i)
1973 atomic_clear_cpumask(&dfly_curprocmask, mask);
1974 atomic_set_cpumask(&dfly_rdyprocmask, mask);
1975 dd->upri = PRIBASE_NULL;
1979 /* usched_dfly sysctl configurable parameters */
1981 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
1982 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
1983 OID_AUTO, "rrinterval", CTLFLAG_RW,
1984 &usched_dfly_rrinterval, 0, "");
1985 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
1986 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
1987 OID_AUTO, "decay", CTLFLAG_RW,
1988 &usched_dfly_decay, 0, "Extra decay when not running");
1989 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
1990 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
1991 OID_AUTO, "batch_time", CTLFLAG_RW,
1992 &usched_dfly_batch_time, 0, "Min batch counter value");
1994 /* Add enable/disable option for SMT scheduling if supported */
1995 if (smt_not_supported) {
1996 usched_dfly_smt = 0;
1997 SYSCTL_ADD_STRING(&usched_dfly_sysctl_ctx,
1998 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
1999 OID_AUTO, "smt", CTLFLAG_RD,
2000 "NOT SUPPORTED", 0, "SMT NOT SUPPORTED");
2001 } else {
2002 usched_dfly_smt = 1;
2003 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2004 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2005 OID_AUTO, "smt", CTLFLAG_RW,
2006 &usched_dfly_smt, 0, "Enable SMT scheduling");
2010 * Add enable/disable option for cache coherent scheduling
2011 * if supported
2013 if (cache_coherent_not_supported) {
2014 usched_dfly_cache_coherent = 0;
2015 SYSCTL_ADD_STRING(&usched_dfly_sysctl_ctx,
2016 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2017 OID_AUTO, "cache_coherent", CTLFLAG_RD,
2018 "NOT SUPPORTED", 0,
2019 "Cache coherence NOT SUPPORTED");
2020 } else {
2021 usched_dfly_cache_coherent = 1;
2022 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2023 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2024 OID_AUTO, "cache_coherent", CTLFLAG_RW,
2025 &usched_dfly_cache_coherent, 0,
2026 "Enable/Disable cache coherent scheduling");
2028 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2029 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2030 OID_AUTO, "weight1", CTLFLAG_RW,
2031 &usched_dfly_weight1, 10,
2032 "Weight selection for current cpu");
2034 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2035 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2036 OID_AUTO, "weight2", CTLFLAG_RW,
2037 &usched_dfly_weight2, 5,
2038 "Weight selection for wakefrom cpu");
2040 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2041 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2042 OID_AUTO, "weight3", CTLFLAG_RW,
2043 &usched_dfly_weight3, 50,
2044 "Weight selection for num threads on queue");
2046 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2047 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2048 OID_AUTO, "pull_enable", CTLFLAG_RW,
2049 &usched_dfly_pull_enable, 1,
2050 "Allow pulls into empty queues");
2053 #if 0
2054 SYSCTL_ADD_PROC(&usched_dfly_sysctl_ctx,
2055 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2056 OID_AUTO, "stick_to_level",
2057 CTLTYPE_INT | CTLFLAG_RW,
2058 NULL, sizeof usched_dfly_stick_to_level,
2059 sysctl_usched_dfly_stick_to_level, "I",
2060 "Stick a process to this level. See sysctl"
2061 "paremter hw.cpu_topology.level_description");
2062 #endif
2065 SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
2066 dfly_helper_thread_cpu_init, NULL)
2068 #else /* No SMP options - just add the configurable parameters to sysctl */
2070 static void
2071 sched_sysctl_tree_init(void)
2073 sysctl_ctx_init(&usched_dfly_sysctl_ctx);
2074 usched_dfly_sysctl_tree =
2075 SYSCTL_ADD_NODE(&usched_dfly_sysctl_ctx,
2076 SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
2077 "usched_dfly", CTLFLAG_RD, 0, "");
2079 /* usched_dfly sysctl configurable parameters */
2080 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2081 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2082 OID_AUTO, "rrinterval", CTLFLAG_RW,
2083 &usched_dfly_rrinterval, 0, "");
2084 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2085 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2086 OID_AUTO, "decay", CTLFLAG_RW,
2087 &usched_dfly_decay, 0, "Extra decay when not running");
2088 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2089 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2090 OID_AUTO, "batch_time", CTLFLAG_RW,
2091 &usched_dfly_batch_time, 0, "Min batch counter value");
2093 SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
2094 sched_sysctl_tree_init, NULL)
2095 #endif