1 /* $NetBSD: sched_m2.c,v 1.26 2008/10/07 09:48:27 rmind Exp $ */
4 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * - Implementation of fair share queue;
35 #include <sys/cdefs.h>
36 __KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.26 2008/10/07 09:48:27 rmind Exp $");
38 #include <sys/param.h>
40 #include <sys/bitops.h>
42 #include <sys/callout.h>
43 #include <sys/errno.h>
44 #include <sys/kernel.h>
47 #include <sys/mutex.h>
51 #include <sys/resource.h>
52 #include <sys/resourcevar.h>
53 #include <sys/sched.h>
54 #include <sys/syscallargs.h>
55 #include <sys/sysctl.h>
56 #include <sys/types.h>
59 * Priority related defintions.
61 #define PRI_TS_COUNT (NPRI_USER)
62 #define PRI_RT_COUNT (PRI_COUNT - PRI_TS_COUNT)
63 #define PRI_HTS_RANGE (PRI_TS_COUNT / 10)
65 #define PRI_HIGHEST_TS (MAXPRI_USER)
68 * Time-slices and priorities.
70 static u_int min_ts
; /* Minimal time-slice */
71 static u_int max_ts
; /* Maximal time-slice */
72 static u_int rt_ts
; /* Real-time time-slice */
73 static u_int ts_map
[PRI_COUNT
]; /* Map of time-slices */
74 static pri_t high_pri
[PRI_COUNT
]; /* Map for priority increase */
76 static void sched_precalcts(void);
79 * Initialization and setup.
/*
 * NOTE(review): this extract is corrupted - the original file's line
 * numbers are fused into the text and whole lines are missing, including
 * this function's return type, name and braces.  The panic() string below
 * identifies it as sched_rqinit(); restore the full body from the
 * upstream sched_m2.c rather than from this fragment.
 */
85 struct cpu_info
*ci
= curcpu();
/* Presumably reached when hz is below some threshold; a very low HZ
 * would make the ~20 ms minimal slice round to zero ticks - TODO confirm
 * the guarding condition against upstream. */
88 panic("sched_rqinit: value of HZ is too low\n");
91 /* Default timing ranges */
92 min_ts
= mstohz(20); /* ~20 ms */
93 max_ts
= mstohz(150); /* ~150 ms */
94 rt_ts
= mstohz(100); /* ~100 ms */
97 /* Attach the primary CPU here */
/* Register the boot LWP (lwp0) with the scheduler. */
100 sched_lwp_fork(NULL
, &lwp0
);
104 /* Pre-calculate the time-slices for the priorities */
/*
 * NOTE(review): lines are missing from this mangled extract - e.g. the
 * left-hand side of the ts_map[] assignment and the body of the
 * real-time loop.  The visible fragments are kept byte-for-byte.
 */
106 sched_precalcts(void)
110 /* Time-sharing range */
111 for (p
= 0; p
<= PRI_HIGHEST_TS
; p
++) {
/* The expression below scales p/(PRI_TS_COUNT - 1) into the
 * [0 .. max_ts - min_ts] tick range (linear interpolation across
 * the time-sharing priorities). */
113 (p
* 100 / (PRI_TS_COUNT
- 1) * (max_ts
- min_ts
) / 100);
/* high_pri[p]: boosted priority target (used e.g. by sched_wakeup());
 * maps the whole TS range into the top PRI_HTS_RANGE priorities. */
114 high_pri
[p
] = (PRI_HIGHEST_TS
- PRI_HTS_RANGE
) +
115 ((p
* PRI_HTS_RANGE
) / (PRI_TS_COUNT
- 1));
118 /* Real-time range */
119 for (p
= (PRI_HIGHEST_TS
+ 1); p
< PRI_COUNT
; p
++) {
/*
 * sched_proc_fork: scheduler hook run when a process forks; walks the
 * child's LWP list.  NOTE(review): the loop body is missing from this
 * mangled extract - per upstream it refreshes each LWP's time-slice
 * under the LWP lock; verify before restoring.
 */
130 sched_proc_fork(struct proc
*parent
, struct proc
*child
)
134 LIST_FOREACH(l
, &child
->p_lwps
, l_sibling
) {
/*
 * sched_proc_exit: scheduler hook invoked on process exit.
 * Empty for M2 - no per-process scheduler state to release here.
 */
void
sched_proc_exit(struct proc *child, struct proc *parent)
{

}
/*
 * sched_lwp_fork: scheduler hook invoked when LWP l2 is forked from l1.
 * Empty for M2 - no scheduler-specific state to initialize here.
 */
void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

}
/*
 * sched_lwp_collect: scheduler hook invoked when an exited LWP is
 * collected.  Empty for M2 - nothing to clean up here.
 */
void
sched_lwp_collect(struct lwp *l)
{

}
/*
 * sched_setrunnable: scheduler hook invoked when an LWP is made
 * runnable.  Empty for M2 - no extra work needed here.
 */
void
sched_setrunnable(struct lwp *l)
{

}
/*
 * sched_schedclock: scheduler clock hook.
 * Empty for M2 - tick processing is done in sched_tick() instead.
 */
void
sched_schedclock(struct lwp *l)
{

}
172 * Priorities and time-slice.
/*
 * sched_nice: re-prioritize a process's time-sharing LWPs after a nice
 * value change.  Caller must hold p->p_lock (asserted below).
 * NOTE(review): mangled extract - missing lines include the store of the
 * new nice value, an early return for a zero delta, and the LWP
 * lock/unlock around lwp_changepri(); restore from upstream.
 */
176 sched_nice(struct proc
*p
, int prio
)
181 KASSERT(mutex_owned(p
->p_lock
));
/* Scale the nice delta into priority steps (4 nice units ~ 1 step). */
184 n
= (prio
- NZERO
) >> 2;
/* Only SCHED_OTHER (time-sharing) LWPs are affected by nice. */
188 LIST_FOREACH(l
, &p
->p_lwps
, l_sibling
) {
190 if (l
->l_class
== SCHED_OTHER
) {
191 pri_t pri
= l
->l_priority
- n
;
/* Clamp: never above PRI_HIGHEST_TS when raising, never below 0. */
192 pri
= (n
< 0) ? min(pri
, PRI_HIGHEST_TS
) : imax(pri
, 0);
193 lwp_changepri(l
, pri
);
199 /* Recalculate the time-slice */
201 sched_newts(struct lwp
*l
)
204 l
->l_sched
.timeslice
= ts_map
[lwp_eprio(l
)];
/*
 * sched_slept: scheduler hook invoked when an LWP goes to sleep.
 * NOTE(review): mangled extract - the tail of this function is missing
 * (per upstream, an else-branch that bumps l_priority by one for
 * processes without a negative nice); verify before restoring.
 */
208 sched_slept(struct lwp
*l
)
212 * If thread is in time-sharing queue and batch flag is not marked,
213 * increase the the priority, and run with the lower time-quantum.
215 if (l
->l_priority
< PRI_HIGHEST_TS
&& (l
->l_flag
& LW_BATCH
) == 0) {
216 struct proc
*p
= l
->l_proc
;
218 KASSERT(l
->l_class
== SCHED_OTHER
);
/* Negative nice (favoured process): boost by a scaled amount,
 * at least 1, clamped to the top of the time-sharing range. */
219 if (__predict_false(p
->p_nice
< NZERO
)) {
220 const int n
= max((NZERO
- p
->p_nice
) >> 2, 1);
221 l
->l_priority
= min(l
->l_priority
+ n
, PRI_HIGHEST_TS
);
229 sched_wakeup(struct lwp
*l
)
232 /* If thread was sleeping a second or more - set a high priority */
233 if (l
->l_slptime
>= 1)
234 l
->l_priority
= high_pri
[l
->l_priority
];
/*
 * sched_pstats_hook: periodic priority estimation for time-sharing LWPs;
 * called with the LWP locked (asserted below).
 * NOTE(review): mangled extract - missing lines include the early
 * "return" after the PRI_HIGHEST_TS check, the priority decrement for
 * the CPU-bound (batch) case, and the closing else-braces.
 */
238 sched_pstats_hook(struct lwp
*l
, int batch
)
243 * Estimate threads on time-sharing queue only, however,
244 * exclude the highest priority for performance purposes.
246 KASSERT(lwp_locked(l
, NULL
));
247 if (l
->l_priority
>= PRI_HIGHEST_TS
)
249 KASSERT(l
->l_class
== SCHED_OTHER
);
251 /* If it is CPU-bound not a first time - decrease the priority */
252 prio
= l
->l_priority
;
253 if (batch
&& prio
!= 0)
256 /* If thread was not ran a second or more - set a high priority */
257 if (l
->l_stat
== LSRUN
) {
/* Runnable, but has not actually run for >= hz ticks (~1s): boost. */
258 if (l
->l_rticks
&& (hardclock_ticks
- l
->l_rticks
>= hz
))
259 prio
= high_pri
[prio
];
260 /* Re-enqueue the thread if priority has changed */
261 if (prio
!= l
->l_priority
)
262 lwp_changepri(l
, prio
);
264 /* In other states, change the priority directly */
265 l
->l_priority
= prio
;
270 sched_oncpu(lwp_t
*l
)
272 struct schedstate_percpu
*spc
= &l
->l_cpu
->ci_schedstate
;
274 /* Update the counters */
275 KASSERT(l
->l_sched
.timeslice
>= min_ts
);
276 KASSERT(l
->l_sched
.timeslice
<= max_ts
);
277 spc
->spc_ticks
= l
->l_sched
.timeslice
;
281 * Time-driven events.
285 * Called once per time-quantum. This routine is CPU-local and runs at
286 * IPL_SCHED, thus the locking is not needed.
/*
 * NOTE(review): mangled extract - missing lines include the opening
 * brace, the declaration of "p", the early "return" for the idle case,
 * the switch case labels (SCHED_FIFO / SCHED_OTHER), "break"/"return"
 * statements and the closing else; restore from upstream.
 */
289 sched_tick(struct cpu_info
*ci
)
291 struct schedstate_percpu
*spc
= &ci
->ci_schedstate
;
292 struct lwp
*l
= curlwp
;
/* Idle LWP consumed this tick - nothing to account for. */
295 if (__predict_false(CURCPU_IDLE_P()))
298 switch (l
->l_class
) {
301 * Update the time-quantum, and continue running,
302 * if thread runs on FIFO real-time policy.
/* FIFO threads always sit above the time-sharing priority range. */
304 KASSERT(l
->l_priority
> PRI_HIGHEST_TS
);
305 spc
->spc_ticks
= l
->l_sched
.timeslice
;
309 * If thread is in time-sharing queue, decrease the priority,
310 * and run with a higher time-quantum.
312 KASSERT(l
->l_priority
<= PRI_HIGHEST_TS
);
313 if (l
->l_priority
== 0)
/* Positive nice: demote faster - by at least one priority step. */
317 if (__predict_false(p
->p_nice
> NZERO
)) {
318 const int n
= max((p
->p_nice
- NZERO
) >> 2, 1);
319 l
->l_priority
= imax(l
->l_priority
- n
, 0);
326 * If there are higher priority threads or threads in the same queue,
327 * mark that thread should yield, otherwise, continue running.
329 if (lwp_eprio(l
) <= spc
->spc_maxpriority
|| l
->l_target_cpu
) {
330 spc
->spc_flags
|= SPCF_SHOULDYIELD
;
331 cpu_need_resched(ci
, 0);
333 spc
->spc_ticks
= l
->l_sched
.timeslice
;
337 * Sysctl nodes and initialization.
341 sysctl_sched_rtts(SYSCTLFN_ARGS
)
343 struct sysctlnode node
;
344 int rttsms
= hztoms(rt_ts
);
347 node
.sysctl_data
= &rttsms
;
348 return sysctl_lookup(SYSCTLFN_CALL(&node
));
/*
 * sysctl_sched_mints: read/update the minimal time quantum
 * (milliseconds) via sysctl.
 * NOTE(review): mangled extract - missing lines include the local
 * declarations (ci, error, newsize), "node = *rnode;", the error and
 * EINVAL returns, the spc lock/unlock loop bodies and the update of
 * min_ts followed by a time-slice recalculation; restore from upstream.
 */
352 sysctl_sched_mints(SYSCTLFN_ARGS
)
354 struct sysctlnode node
;
357 CPU_INFO_ITERATOR cii
;
360 node
.sysctl_data
= &newsize
;
/* Report the current value in ms; sysctl_lookup() copies in any new one. */
362 newsize
= hztoms(min_ts
);
363 error
= sysctl_lookup(SYSCTLFN_CALL(&node
));
364 if (error
|| newp
== NULL
)
/* Convert back to ticks and validate: >= 1 tick, <= hz, < max_ts. */
367 newsize
= mstohz(newsize
);
368 if (newsize
< 1 || newsize
> hz
|| newsize
>= max_ts
)
371 /* It is safe to do this in such order */
372 for (CPU_INFO_FOREACH(cii
, ci
))
378 for (CPU_INFO_FOREACH(cii
, ci
))
/*
 * sysctl_sched_maxts: read/update the maximal time quantum
 * (milliseconds) via sysctl.
 * NOTE(review): mangled extract - missing lines include the local
 * declarations (ci, error, newsize), "node = *rnode;", the error and
 * EINVAL returns, the spc lock/unlock loop bodies and the update of
 * max_ts followed by a time-slice recalculation; restore from upstream.
 */
385 sysctl_sched_maxts(SYSCTLFN_ARGS
)
387 struct sysctlnode node
;
390 CPU_INFO_ITERATOR cii
;
393 node
.sysctl_data
= &newsize
;
/* Report the current value in ms; sysctl_lookup() copies in any new one. */
395 newsize
= hztoms(max_ts
);
396 error
= sysctl_lookup(SYSCTLFN_CALL(&node
));
397 if (error
|| newp
== NULL
)
/* Convert back to ticks and validate: >= 10 ticks, <= hz, > min_ts. */
400 newsize
= mstohz(newsize
);
401 if (newsize
< 10 || newsize
> hz
|| newsize
<= min_ts
)
404 /* It is safe to do this in such order */
405 for (CPU_INFO_FOREACH(cii
, ci
))
411 for (CPU_INFO_FOREACH(cii
, ci
))
/*
 * SYSCTL_SETUP: register the kern.sched sysctl subtree for the M2
 * scheduler ("name", round-robin quantum, "maxts" and "mints" nodes).
 * NOTE(review): mangled extract - several argument lines of the
 * sysctl_createv() calls (CTLFLAG_* flags, NULL/size fields, the
 * round-robin node's type/name line) are missing; restore from upstream.
 */
417 SYSCTL_SETUP(sysctl_sched_m2_setup
, "sysctl sched setup")
419 const struct sysctlnode
*node
= NULL
;
421 sysctl_createv(clog
, 0, NULL
, NULL
,
423 CTLTYPE_NODE
, "kern", NULL
,
426 sysctl_createv(clog
, 0, NULL
, &node
,
428 CTLTYPE_NODE
, "sched",
429 SYSCTL_DESCR("Scheduler options"),
431 CTL_KERN
, CTL_CREATE
, CTL_EOL
);
/* Scheduler name node: constant string "M2". */
436 sysctl_createv(NULL
, 0, &node
, NULL
,
438 CTLTYPE_STRING
, "name", NULL
,
439 NULL
, 0, __UNCONST("M2"), 0,
440 CTL_CREATE
, CTL_EOL
);
/* Round-robin quantum, handled by sysctl_sched_rtts(). */
441 sysctl_createv(NULL
, 0, &node
, NULL
,
444 SYSCTL_DESCR("Round-robin time quantum (in miliseconds)"),
445 sysctl_sched_rtts
, 0, NULL
, 0,
446 CTL_CREATE
, CTL_EOL
);
/* Maximal quantum, read-write via sysctl_sched_maxts(). */
447 sysctl_createv(NULL
, 0, &node
, NULL
,
448 CTLFLAG_PERMANENT
| CTLFLAG_READWRITE
,
449 CTLTYPE_INT
, "maxts",
450 SYSCTL_DESCR("Maximal time quantum (in miliseconds)"),
451 sysctl_sched_maxts
, 0, &max_ts
, 0,
452 CTL_CREATE
, CTL_EOL
);
/* Minimal quantum, read-write via sysctl_sched_mints(). */
453 sysctl_createv(NULL
, 0, &node
, NULL
,
454 CTLFLAG_PERMANENT
| CTLFLAG_READWRITE
,
455 CTLTYPE_INT
, "mints",
456 SYSCTL_DESCR("Minimal time quantum (in miliseconds)"),
457 sysctl_sched_mints
, 0, &min_ts
, 0,
458 CTL_CREATE
, CTL_EOL
);