1 /* $NetBSD: sched_m2.c,v 1.26 2008/10/07 09:48:27 rmind Exp $ */
4 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * - Implementation of fair share queue;
35 #include <sys/cdefs.h>
36 __KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.26 2008/10/07 09:48:27 rmind Exp $");
38 #include <sys/param.h>
40 #include <sys/bitops.h>
42 #include <sys/callout.h>
43 #include <sys/errno.h>
44 #include <sys/kernel.h>
47 #include <sys/mutex.h>
51 #include <sys/resource.h>
52 #include <sys/resourcevar.h>
53 #include <sys/sched.h>
54 #include <sys/syscallargs.h>
55 #include <sys/sysctl.h>
56 #include <sys/types.h>
59 * Priority related defintions.
61 #define PRI_TS_COUNT (NPRI_USER)
62 #define PRI_RT_COUNT (PRI_COUNT - PRI_TS_COUNT)
63 #define PRI_HTS_RANGE (PRI_TS_COUNT / 10)
65 #define PRI_HIGHEST_TS (MAXPRI_USER)
68 * Time-slices and priorities.
70 static u_int min_ts
; /* Minimal time-slice */
71 static u_int max_ts
; /* Maximal time-slice */
72 static u_int rt_ts
; /* Real-time time-slice */
73 static u_int ts_map
[PRI_COUNT
]; /* Map of time-slices */
74 static pri_t high_pri
[PRI_COUNT
]; /* Map for priority increase */
76 static void sched_precalcts(void);
79 * Initialization and setup.
/*
 * NOTE(review): this extract is corrupted - the original file's line
 * numbers are fused into the text and whole lines are missing, including
 * this function's return type, name and braces.  The panic() string below
 * identifies it as sched_rqinit(); restore the full body from the
 * upstream sched_m2.c rather than from this fragment.
 */
85 struct cpu_info
*ci
= curcpu();
/* Presumably reached when hz is below some threshold; a very low HZ
 * would make the ~20 ms minimal slice round to zero ticks - TODO confirm
 * the guarding condition against upstream. */
88 panic("sched_rqinit: value of HZ is too low\n");
91 /* Default timing ranges */
92 min_ts
= mstohz(20); /* ~20 ms */
93 max_ts
= mstohz(150); /* ~150 ms */
94 rt_ts
= mstohz(100); /* ~100 ms */
97 /* Attach the primary CPU here */
/* Register the boot LWP (lwp0) with the scheduler. */
100 sched_lwp_fork(NULL
, &lwp0
);
104 /* Pre-calculate the time-slices for the priorities */
/*
 * NOTE(review): lines are missing from this mangled extract - e.g. the
 * left-hand side of the ts_map[] assignment and the body of the
 * real-time loop.  The visible fragments are kept byte-for-byte.
 */
106 sched_precalcts(void)
110 /* Time-sharing range */
111 for (p
= 0; p
<= PRI_HIGHEST_TS
; p
++) {
/* The expression below scales p/(PRI_TS_COUNT - 1) into the
 * [0 .. max_ts - min_ts] tick range (linear interpolation across
 * the time-sharing priorities). */
113 (p
* 100 / (PRI_TS_COUNT
- 1) * (max_ts
- min_ts
) / 100);
/* high_pri[p]: boosted priority target (used e.g. by sched_wakeup());
 * maps the whole TS range into the top PRI_HTS_RANGE priorities. */
114 high_pri
[p
] = (PRI_HIGHEST_TS
- PRI_HTS_RANGE
) +
115 ((p
* PRI_HTS_RANGE
) / (PRI_TS_COUNT
- 1));
118 /* Real-time range */
119 for (p
= (PRI_HIGHEST_TS
+ 1); p
< PRI_COUNT
; p
++) {
/*
 * sched_proc_fork: scheduler hook run when a process forks; walks the
 * child's LWP list.  NOTE(review): the loop body is missing from this
 * mangled extract - per upstream it refreshes each LWP's time-slice
 * under the LWP lock; verify before restoring.
 */
130 sched_proc_fork(struct proc
*parent
, struct proc
*child
)
134 LIST_FOREACH(l
, &child
->p_lwps
, l_sibling
) {
/*
 * sched_proc_exit: scheduler hook invoked on process exit.
 * Empty for M2 - no per-process scheduler state to release here.
 */
void
sched_proc_exit(struct proc *child, struct proc *parent)
{

}
/*
 * sched_lwp_fork: scheduler hook invoked when LWP l2 is forked from l1.
 * Empty for M2 - no scheduler-specific state to initialize here.
 */
void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

}
/*
 * sched_lwp_collect: scheduler hook invoked when an exited LWP is
 * collected.  Empty for M2 - nothing to clean up here.
 */
void
sched_lwp_collect(struct lwp *l)
{

}
/*
 * sched_setrunnable: scheduler hook invoked when an LWP is made
 * runnable.  Empty for M2 - no extra work needed here.
 */
void
sched_setrunnable(struct lwp *l)
{

}
/*
 * sched_schedclock: scheduler clock hook.
 * Empty for M2 - tick processing is done in sched_tick() instead.
 */
void
sched_schedclock(struct lwp *l)
{

}
172 * Priorities and time-slice.
/*
 * sched_nice: re-prioritize a process's time-sharing LWPs after a nice
 * value change.  Caller must hold p->p_lock (asserted below).
 * NOTE(review): mangled extract - missing lines include the store of the
 * new nice value, an early return for a zero delta, and the LWP
 * lock/unlock around lwp_changepri(); restore from upstream.
 */
176 sched_nice(struct proc
*p
, int prio
)
181 KASSERT(mutex_owned(p
->p_lock
));
/* Scale the nice delta into priority steps (4 nice units ~ 1 step). */
184 n
= (prio
- NZERO
) >> 2;
/* Only SCHED_OTHER (time-sharing) LWPs are affected by nice. */
188 LIST_FOREACH(l
, &p
->p_lwps
, l_sibling
) {
190 if (l
->l_class
== SCHED_OTHER
) {
191 pri_t pri
= l
->l_priority
- n
;
/* Clamp: never above PRI_HIGHEST_TS when raising, never below 0. */
192 pri
= (n
< 0) ? min(pri
, PRI_HIGHEST_TS
) : imax(pri
, 0);
193 lwp_changepri(l
, pri
);
199 /* Recalculate the time-slice */
201 sched_newts(struct lwp
*l
)
204 l
->l_sched
.timeslice
= ts_map
[lwp_eprio(l
)];
/*
 * sched_slept: scheduler hook invoked when an LWP goes to sleep.
 * NOTE(review): mangled extract - the tail of this function is missing
 * (per upstream, an else-branch that bumps l_priority by one for
 * processes without a negative nice); verify before restoring.
 */
208 sched_slept(struct lwp
*l
)
212 * If thread is in time-sharing queue and batch flag is not marked,
213 * increase the the priority, and run with the lower time-quantum.
215 if (l
->l_priority
< PRI_HIGHEST_TS
&& (l
->l_flag
& LW_BATCH
) == 0) {
216 struct proc
*p
= l
->l_proc
;
218 KASSERT(l
->l_class
== SCHED_OTHER
);
/* Negative nice (favoured process): boost by a scaled amount,
 * at least 1, clamped to the top of the time-sharing range. */
219 if (__predict_false(p
->p_nice
< NZERO
)) {
220 const int n
= max((NZERO
- p
->p_nice
) >> 2, 1);
221 l
->l_priority
= min(l
->l_priority
+ n
, PRI_HIGHEST_TS
);
229 sched_wakeup(struct lwp
*l
)
232 /* If thread was sleeping a second or more - set a high priority */
233 if (l
->l_slptime
>= 1)
234 l
->l_priority
= high_pri
[l
->l_priority
];
/*
 * sched_pstats_hook: periodic priority estimation for time-sharing LWPs;
 * called with the LWP locked (asserted below).
 * NOTE(review): mangled extract - missing lines include the early
 * "return" after the PRI_HIGHEST_TS check, the priority decrement for
 * the CPU-bound (batch) case, and the closing else-braces.
 */
238 sched_pstats_hook(struct lwp
*l
, int batch
)
243 * Estimate threads on time-sharing queue only, however,
244 * exclude the highest priority for performance purposes.
246 KASSERT(lwp_locked(l
, NULL
));
247 if (l
->l_priority
>= PRI_HIGHEST_TS
)
249 KASSERT(l
->l_class
== SCHED_OTHER
);
251 /* If it is CPU-bound not a first time - decrease the priority */
252 prio
= l
->l_priority
;
253 if (batch
&& prio
!= 0)
256 /* If thread was not ran a second or more - set a high priority */
257 if (l
->l_stat
== LSRUN
) {
/* Runnable, but has not actually run for >= hz ticks (~1s): boost. */
258 if (l
->l_rticks
&& (hardclock_ticks
- l
->l_rticks
>= hz
))
259 prio
= high_pri
[prio
];
260 /* Re-enqueue the thread if priority has changed */
261 if (prio
!= l
->l_priority
)
262 lwp_changepri(l
, prio
);
264 /* In other states, change the priority directly */
265 l
->l_priority
= prio
;
270 sched_oncpu(lwp_t
*l
)
272 struct schedstate_percpu
*spc
= &l
->l_cpu
->ci_schedstate
;
274 /* Update the counters */
275 KASSERT(l
->l_sched
.timeslice
>= min_ts
);
276 KASSERT(l
->l_sched
.timeslice
<= max_ts
);
277 spc
->spc_ticks
= l
->l_sched
.timeslice
;
281 * Time-driven events.
285 * Called once per time-quantum. This routine is CPU-local and runs at
286 * IPL_SCHED, thus the locking is not needed.
/*
 * NOTE(review): mangled extract - missing lines include the opening
 * brace, the declaration of "p", the early "return" for the idle case,
 * the switch case labels (SCHED_FIFO / SCHED_OTHER), "break"/"return"
 * statements and the closing else; restore from upstream.
 */
289 sched_tick(struct cpu_info
*ci
)
291 struct schedstate_percpu
*spc
= &ci
->ci_schedstate
;
292 struct lwp
*l
= curlwp
;
/* Idle LWP consumed this tick - nothing to account for. */
295 if (__predict_false(CURCPU_IDLE_P()))
298 switch (l
->l_class
) {
301 * Update the time-quantum, and continue running,
302 * if thread runs on FIFO real-time policy.
/* FIFO threads always sit above the time-sharing priority range. */
304 KASSERT(l
->l_priority
> PRI_HIGHEST_TS
);
305 spc
->spc_ticks
= l
->l_sched
.timeslice
;
309 * If thread is in time-sharing queue, decrease the priority,
310 * and run with a higher time-quantum.
312 KASSERT(l
->l_priority
<= PRI_HIGHEST_TS
);
313 if (l
->l_priority
== 0)
/* Positive nice: demote faster - by at least one priority step. */
317 if (__predict_false(p
->p_nice
> NZERO
)) {
318 const int n
= max((p
->p_nice
- NZERO
) >> 2, 1);
319 l
->l_priority
= imax(l
->l_priority
- n
, 0);
326 * If there are higher priority threads or threads in the same queue,
327 * mark that thread should yield, otherwise, continue running.
329 if (lwp_eprio(l
) <= spc
->spc_maxpriority
|| l
->l_target_cpu
) {
330 spc
->spc_flags
|= SPCF_SHOULDYIELD
;
331 cpu_need_resched(ci
, 0);
333 spc
->spc_ticks
= l
->l_sched
.timeslice
;
337 * Sysctl nodes and initialization.
341 sysctl_sched_rtts(SYSCTLFN_ARGS
)
343 struct sysctlnode node
;
344 int rttsms
= hztoms(rt_ts
);
347 node
.sysctl_data
= &rttsms
;
348 return sysctl_lookup(SYSCTLFN_CALL(&node
));
/*
 * sysctl_sched_mints: read/update the minimal time quantum
 * (milliseconds) via sysctl.
 * NOTE(review): mangled extract - missing lines include the local
 * declarations (ci, error, newsize), "node = *rnode;", the error and
 * EINVAL returns, the spc lock/unlock loop bodies and the update of
 * min_ts followed by a time-slice recalculation; restore from upstream.
 */
352 sysctl_sched_mints(SYSCTLFN_ARGS
)
354 struct sysctlnode node
;
357 CPU_INFO_ITERATOR cii
;
360 node
.sysctl_data
= &newsize
;
/* Report the current value in ms; sysctl_lookup() copies in any new one. */
362 newsize
= hztoms(min_ts
);
363 error
= sysctl_lookup(SYSCTLFN_CALL(&node
));
364 if (error
|| newp
== NULL
)
/* Convert back to ticks and validate: >= 1 tick, <= hz, < max_ts. */
367 newsize
= mstohz(newsize
);
368 if (newsize
< 1 || newsize
> hz
|| newsize
>= max_ts
)
371 /* It is safe to do this in such order */
372 for (CPU_INFO_FOREACH(cii
, ci
))
378 for (CPU_INFO_FOREACH(cii
, ci
))
/*
 * sysctl_sched_maxts: read/update the maximal time quantum
 * (milliseconds) via sysctl.
 * NOTE(review): mangled extract - missing lines include the local
 * declarations (ci, error, newsize), "node = *rnode;", the error and
 * EINVAL returns, the spc lock/unlock loop bodies and the update of
 * max_ts followed by a time-slice recalculation; restore from upstream.
 */
385 sysctl_sched_maxts(SYSCTLFN_ARGS
)
387 struct sysctlnode node
;
390 CPU_INFO_ITERATOR cii
;
393 node
.sysctl_data
= &newsize
;
/* Report the current value in ms; sysctl_lookup() copies in any new one. */
395 newsize
= hztoms(max_ts
);
396 error
= sysctl_lookup(SYSCTLFN_CALL(&node
));
397 if (error
|| newp
== NULL
)
/* Convert back to ticks and validate: >= 10 ticks, <= hz, > min_ts. */
400 newsize
= mstohz(newsize
);
401 if (newsize
< 10 || newsize
> hz
|| newsize
<= min_ts
)
404 /* It is safe to do this in such order */
405 for (CPU_INFO_FOREACH(cii
, ci
))
411 for (CPU_INFO_FOREACH(cii
, ci
))
/*
 * SYSCTL_SETUP: register the kern.sched sysctl subtree for the M2
 * scheduler ("name", round-robin quantum, "maxts" and "mints" nodes).
 * NOTE(review): mangled extract - several argument lines of the
 * sysctl_createv() calls (CTLFLAG_* flags, NULL/size fields, the
 * round-robin node's type/name line) are missing; restore from upstream.
 */
417 SYSCTL_SETUP(sysctl_sched_m2_setup
, "sysctl sched setup")
419 const struct sysctlnode
*node
= NULL
;
421 sysctl_createv(clog
, 0, NULL
, NULL
,
423 CTLTYPE_NODE
, "kern", NULL
,
426 sysctl_createv(clog
, 0, NULL
, &node
,
428 CTLTYPE_NODE
, "sched",
429 SYSCTL_DESCR("Scheduler options"),
431 CTL_KERN
, CTL_CREATE
, CTL_EOL
);
/* Scheduler name node: constant string "M2". */
436 sysctl_createv(NULL
, 0, &node
, NULL
,
438 CTLTYPE_STRING
, "name", NULL
,
439 NULL
, 0, __UNCONST("M2"), 0,
440 CTL_CREATE
, CTL_EOL
);
/* Round-robin quantum, handled by sysctl_sched_rtts(). */
441 sysctl_createv(NULL
, 0, &node
, NULL
,
444 SYSCTL_DESCR("Round-robin time quantum (in miliseconds)"),
445 sysctl_sched_rtts
, 0, NULL
, 0,
446 CTL_CREATE
, CTL_EOL
);
/* Maximal quantum, read-write via sysctl_sched_maxts(). */
447 sysctl_createv(NULL
, 0, &node
, NULL
,
448 CTLFLAG_PERMANENT
| CTLFLAG_READWRITE
,
449 CTLTYPE_INT
, "maxts",
450 SYSCTL_DESCR("Maximal time quantum (in miliseconds)"),
451 sysctl_sched_maxts
, 0, &max_ts
, 0,
452 CTL_CREATE
, CTL_EOL
);
/* Minimal quantum, read-write via sysctl_sched_mints(). */
453 sysctl_createv(NULL
, 0, &node
, NULL
,
454 CTLFLAG_PERMANENT
| CTLFLAG_READWRITE
,
455 CTLTYPE_INT
, "mints",
456 SYSCTL_DESCR("Minimal time quantum (in miliseconds)"),
457 sysctl_sched_mints
, 0, &min_ts
, 0,
458 CTL_CREATE
, CTL_EOL
);