sys/kern/usched_bsd4.c

   1 /*
   2  * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  *
  26  * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.7 2005/12/23 10:08:14 dillon Exp $
  27  */
  28
  29 #include <sys/param.h>
  30 #include <sys/systm.h>
  31 #include <sys/kernel.h>
  32 #include <sys/lock.h>
  33 #include <sys/queue.h>
  34 #include <sys/proc.h>
  35 #include <sys/rtprio.h>
  36 #include <sys/thread2.h>
  37 #include <sys/uio.h>
  38 #include <sys/sysctl.h>
  39 #include <sys/resourcevar.h>
  40 #include <machine/ipl.h>
  41 #include <machine/cpu.h>
  42 #include <machine/smp.h>
  43
  44 /*
  45  * Priorities.  Note that with 32 run queues per scheduler each queue
  46  * represents four priority levels.
  47  */
  48
  49 #define MAXPRI                  128
  50 #define PRIMASK                 (MAXPRI - 1)
  51 #define PRIBASE_REALTIME        0
  52 #define PRIBASE_NORMAL          MAXPRI
  53 #define PRIBASE_IDLE            (MAXPRI * 2)
  54 #define PRIBASE_THREAD          (MAXPRI * 3)
  55 #define PRIBASE_NULL            (MAXPRI * 4)
  56
  57 #define NQS     32                      /* 32 run queues. */
  58 #define PPQ     (MAXPRI / NQS)          /* priorities per queue */
  59
  60 /*
  61  * NICEPPQ      - number of nice units per priority queue
  62  * ESTCPURAMP   - number of scheduler ticks for estcpu to switch queues
  63  *
  64  * ESTCPUPPQ    - number of estcpu units per priority queue
  65  * ESTCPUMAX    - number of estcpu units
  66  * ESTCPUINCR   - amount we have to increment p_estcpu per scheduling tick at
  67  *                100% cpu.
  68  */
  69 #define NICEPPQ         2
  70 #define ESTCPURAMP      4
  71 #define ESTCPUPPQ       512
  72 #define ESTCPUMAX       (ESTCPUPPQ * NQS)
  73 #define ESTCPUINCR      (ESTCPUPPQ / ESTCPURAMP)
  74 #define PRIO_RANGE      (PRIO_MAX - PRIO_MIN + 1)
  75
  76 #define ESTCPULIM(v)    min((v), ESTCPUMAX)
  77
  78 TAILQ_HEAD(rq, lwp);
  79
  80 #define lwp_priority    lwp_usdata.bsd4.priority
  81 #define lwp_rqindex     lwp_usdata.bsd4.rqindex
  82 #define lwp_origcpu     lwp_usdata.bsd4.origcpu
  83 #define lwp_estcpu      lwp_usdata.bsd4.estcpu
  84
  85 static void bsd4_acquire_curproc(struct lwp *lp);
  86 static void bsd4_release_curproc(struct lwp *lp);
  87 static void bsd4_select_curproc(globaldata_t gd);
  88 static void bsd4_setrunqueue(struct lwp *lp);
  89 static void bsd4_remrunqueue(struct lwp *lp);
  90 static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
  91                                 sysclock_t cpstamp);
  92 static void bsd4_resetpriority(struct lwp *lp);
  93 static void bsd4_forking(struct lwp *plp, struct lwp *lp);
  94 static void bsd4_exiting(struct lwp *plp, struct lwp *lp);
  95
  96 static void bsd4_recalculate_estcpu(struct lwp *lp);
  97
  98 struct usched usched_bsd4 = {
  99         { NULL },
 100         "bsd4", "Original DragonFly Scheduler",
 101         NULL,                   /* default registration */
 102         NULL,                   /* default deregistration */
 103         bsd4_acquire_curproc,
 104         bsd4_release_curproc,
 105         bsd4_select_curproc,
 106         bsd4_setrunqueue,
 107         bsd4_remrunqueue,
 108         bsd4_schedulerclock,
 109         bsd4_recalculate_estcpu,
 110         bsd4_resetpriority,
 111         bsd4_forking,
 112         bsd4_exiting,
 113         NULL                    /* setcpumask not supported */
 114 };
 115
 116 /*
 117  * We have NQS (32) run queues per scheduling class.  For the normal
 118  * class, there are 128 priorities scaled onto these 32 queues.  New
 119  * processes are added to the last entry in each queue, and processes
 120  * are selected for running by taking them from the head and maintaining
 121  * a simple FIFO arrangement.  Realtime and Idle priority processes have
 122  * and explicit 0-31 priority which maps directly onto their class queue
 123  * index.  When a queue has something in it, the corresponding bit is
 124  * set in the queuebits variable, allowing a single read to determine
 125  * the state of all 32 queues and then a ffs() to find the first busy
 126  * queue.
 127  */
 128 static struct rq queues[NQS];
 129 static struct rq rtqueues[NQS];
 130 static struct rq idqueues[NQS];
 131 static u_int32_t queuebits;
 132 static u_int32_t rtqueuebits;
 133 static u_int32_t idqueuebits;
 134 static cpumask_t curprocmask = -1;      /* currently running a user process */
 135 static cpumask_t rdyprocmask;           /* ready to accept a user process */
 136 static int       runqcount;
 137 #ifdef SMP
 138 static int       scancpu;
 139 #endif
 140
 141 SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, "");
 142 #ifdef INVARIANTS
 143 static int usched_nonoptimal;
 144 SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
 145         &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
 146 static int usched_optimal;
 147 SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
 148         &usched_optimal, 0, "acquire_curproc() was optimal");
 149 #endif
 150 static int usched_debug = -1;
 151 SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, "");
 152 #ifdef SMP
 153 static int remote_resched = 1;
 154 static int remote_resched_nonaffinity;
 155 static int remote_resched_affinity;
 156 static int choose_affinity;
 157 SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW,
 158         &remote_resched, 0, "Resched to another cpu");
 159 SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
 160         &remote_resched_nonaffinity, 0, "Number of remote rescheds");
 161 SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
 162         &remote_resched_affinity, 0, "Number of remote rescheds");
 163 SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
 164         &choose_affinity, 0, "chooseproc() was smart");
 165 #endif
 166
 167 static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
 168 SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
 169         &usched_bsd4_rrinterval, 0, "");
 170 static int usched_bsd4_decay = ESTCPUINCR / 2;
 171 SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
 172         &usched_bsd4_decay, 0, "");
 173
 174 /*
 175  * Initialize the run queues at boot time.
 176  */
 177 static void
 178 rqinit(void *dummy)
 179 {
 180         int i;
 181
 182         for (i = 0; i < NQS; i++) {
 183                 TAILQ_INIT(&queues[i]);
 184                 TAILQ_INIT(&rtqueues[i]);
 185                 TAILQ_INIT(&idqueues[i]);
 186         }
 187         atomic_clear_int(&curprocmask, 1);
 188 }
 189 SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)
 190
 191 /*
 192  * chooseproc() is called when a cpu needs a user process to LWKT schedule,
 193  * it selects a user process and returns it.  If chklp is non-NULL and chklp
 194  * has a better or equal priority then the process that would otherwise be
 195  * chosen, NULL is returned.
 196  *
 197  * Until we fix the RUNQ code the chklp test has to be strict or we may
 198  * bounce between processes trying to acquire the current process designation.
 199  */
 200 static
 201 struct lwp *
 202 chooseproc(struct lwp *chklp)
 203 {
 204         struct lwp *lp;
 205         struct rq *q;
 206         u_int32_t *which;
 207         u_int32_t pri;
 208
 209         if (rtqueuebits) {
 210                 pri = bsfl(rtqueuebits);
 211                 q = &rtqueues[pri];
 212                 which = &rtqueuebits;
 213         } else if (queuebits) {
 214                 pri = bsfl(queuebits);
 215                 q = &queues[pri];
 216                 which = &queuebits;
 217         } else if (idqueuebits) {
 218                 pri = bsfl(idqueuebits);
 219                 q = &idqueues[pri];
 220                 which = &idqueuebits;
 221         } else {
 222                 return NULL;
 223         }
 224         lp = TAILQ_FIRST(q);
 225         KASSERT(lp, ("chooseproc: no lwp on busy queue"));
 226
 227         /*
 228          * If the passed lwp <chklp> is reasonably close to the selected
 229          * lwp <lp>, return NULL (indicating that <chklp> should be kept).
 230          *
 231          * Note that we must error on the side of <chklp> to avoid bouncing
 232          * between threads in the acquire code.
 233          */
 234         if (chklp) {
 235                 if (chklp->lwp_priority < lp->lwp_priority + PPQ)
 236                         return(NULL);
 237         }
 238
 239 #ifdef SMP
 240         /*
 241          * If the chosen lwp does not reside on this cpu spend a few
 242          * cycles looking for a better candidate at the same priority level.
 243          * This is a fallback check, setrunqueue() tries to wakeup the
 244          * correct cpu and is our front-line affinity.
 245          */
 246         if (lp->lwp_thread->td_gd != mycpu &&
 247             (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
 248         ) {
 249                 if (chklp->lwp_thread->td_gd == mycpu) {
 250                         ++choose_affinity;
 251                         lp = chklp;
 252                 }
 253         }
 254 #endif
 255
 256         TAILQ_REMOVE(q, lp, lwp_procq);
 257         --runqcount;
 258         if (TAILQ_EMPTY(q))
 259                 *which &= ~(1 << pri);
 260         KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq6!"));
 261         lp->lwp_proc->p_flag &= ~P_ONRUNQ;
 262         return lp;
 263 }
 264
 265 #ifdef SMP
 266 /*
 267  * called via an ipi message to reschedule on another cpu.
 268  */
 269 static
 270 void
 271 need_user_resched_remote(void *dummy)
 272 {
 273         need_user_resched();
 274 }
 275
 276 #endif
 277
 278 /*
 279  * setrunqueue() 'wakes up' a 'user' process.  GIANT must be held.  The
 280  * user process may represent any user process, including the current
 281  * process.
 282  *
 283  * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
 284  * cpus in an attempt to keep the process on the current cpu at least for
 285  * a little while to take advantage of locality of reference (e.g. fork/exec
 286  * or short fork/exit, and uio_yield()).
 287  *
 288  * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
 289  * or (user level) preempt on the same cpu that a process was previously
 290  * scheduled to.  If we cannot do this but we are at enough of a higher
 291  * priority then the processes running on other cpus, we will allow the
 292  * process to be stolen by another cpu.
 293  *
 294  * WARNING! a thread can be acquired by another cpu the moment it is put
 295  * on the user scheduler's run queue AND we release the MP lock.  Since we
 296  * release the MP lock before switching out another cpu may begin stealing
 297  * our current thread before we are completely switched out!  The
 298  * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
 299  * thread before stealing it.
 300  *
 301  * NOTE on need_user_resched() calls: we have to call need_user_resched()
 302  * if the new process is more important then the current process, or if
 303  * the new process is the current process and is now less important then
 304  * other processes.
 305  *
 306  * The associated thread must NOT be scheduled.
 307  * The process must be runnable.
 308  * This must be called at splhigh().
 309  */
 310 static void
 311 bsd4_setrunqueue(struct lwp *lp)
 312 {
 313         struct rq *q;
 314         struct globaldata *gd;
 315         int pri;
 316         int cpuid;
 317         u_int32_t needresched;
 318 #ifdef SMP
 319         int count;
 320         cpumask_t mask;
 321 #endif
 322
 323         ASSERT_MP_LOCK_HELD(lp->lwp_thread);
 324         crit_enter();
 325         KASSERT(lp->lwp_proc->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
 326         KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0,
 327             ("lwp %d/%d already on runq! flag %08x", lp->lwp_proc->p_pid,
 328              lp->lwp_tid, lp->lwp_proc->p_flag));
 329         KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
 330
 331         /*
 332          * Note: gd is the gd of the TARGET thread's cpu, not our cpu.
 333          */
 334         gd = lp->lwp_thread->td_gd;
 335
 336         /*
 337          * Because recalculate is only called once or twice for long sleeps,
 338          * not every second forever while the process is sleeping, we have
 339          * to manually call it to resynchronize p_cpbase on wakeup or it
 340          * will wrap if the process was sleeping long enough (e.g. ~10 min
 341          * with the ACPI timer) and really mess up the nticks calculation.
 342          */
 343         if (lp->lwp_slptime) {
 344             bsd4_recalculate_estcpu(lp);
 345             lp->lwp_slptime = 0;
 346         }
 347         /*
 348          * We have not been released, make sure that we are not the currently
 349          * designated process.
 350          */
 351         KKASSERT(gd->gd_uschedcp != lp);
 352
 353         /*
 354          * Check cpu affinity.  The associated thread is stable at the
 355          * moment.  Note that we may be checking another cpu here so we
 356          * have to be careful.  We are currently protected by the BGL.
 357          *
 358          * This allows us to avoid actually queueing the process.
 359          * acquire_curproc() will handle any threads we mistakenly schedule.
 360          */
 361         cpuid = gd->gd_cpuid;
 362
 363         if ((curprocmask & (1 << cpuid)) == 0) {
 364                 atomic_set_int(&curprocmask, 1 << cpuid);
 365                 gd->gd_uschedcp = lp;
 366                 gd->gd_upri = lp->lwp_priority;
 367                 lwkt_schedule(lp->lwp_thread);
 368                 /* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */
 369                 crit_exit();
 370 #ifdef SMP
 371                 if (gd != mycpu)
 372                         ++remote_resched_affinity;
 373 #endif
 374                 return;
 375         }
 376
 377         /*
 378          * gd and cpuid may still 'hint' at another cpu.  Even so we have
 379          * to place this process on the userland scheduler's run queue for
 380          * action by the target cpu.
 381          */
 382         ++runqcount;
 383         lp->lwp_proc->p_flag |= P_ONRUNQ;
 384         if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
 385                 pri = (lp->lwp_priority & PRIMASK) / PPQ;
 386                 q = &queues[pri];
 387                 queuebits |= 1 << pri;
 388                 needresched = (queuebits & ((1 << pri) - 1));
 389         } else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
 390                    lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
 391                 pri = (u_int8_t)lp->lwp_rtprio.prio;
 392                 q = &rtqueues[pri];
 393                 rtqueuebits |= 1 << pri;
 394                 needresched = (rtqueuebits & ((1 << pri) - 1));
 395         } else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
 396                 pri = (u_int8_t)lp->lwp_rtprio.prio;
 397                 q = &idqueues[pri];
 398                 idqueuebits |= 1 << pri;
 399                 needresched = (idqueuebits & ((1 << pri) - 1));
 400         } else {
 401                 needresched = 0;
 402                 panic("setrunqueue: invalid rtprio type");
 403         }
 404         KKASSERT(pri < 32);
 405         lp->lwp_rqindex = pri;          /* remember the queue index */
 406         TAILQ_INSERT_TAIL(q, lp, lwp_procq);
 407
 408 #ifdef SMP
 409         /*
 410          * Either wakeup other cpus user thread scheduler or request
 411          * preemption on other cpus (which will also wakeup a HLT).
 412          *
 413          * NOTE!  gd and cpuid may still be our 'hint', not our current
 414          * cpu info.
 415          */
 416
 417         count = runqcount;
 418
 419         /*
 420          * Check cpu affinity for user preemption (when the curprocmask bit
 421          * is set).  Note that gd_upri is a speculative field (we modify
 422          * another cpu's gd_upri to avoid sending ipiq storms).
 423          */
 424         if (gd == mycpu) {
 425                 if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
 426                         if (lp->lwp_priority < gd->gd_upri - PPQ) {
 427                                 gd->gd_upri = lp->lwp_priority;
 428                                 gd->gd_rrcount = 0;
 429                                 need_user_resched();
 430                                 --count;
 431                         } else if (gd->gd_uschedcp == lp && needresched) {
 432                                 gd->gd_rrcount = 0;
 433                                 need_user_resched();
 434                                 --count;
 435                         }
 436                 }
 437         } else if (remote_resched) {
 438                 if (lp->lwp_priority < gd->gd_upri - PPQ) {
 439                         gd->gd_upri = lp->lwp_priority;
 440                         lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
 441                         --count;
 442                         ++remote_resched_affinity;
 443                 }
 444         }
 445
 446         /*
 447          * No affinity, first schedule to any cpus that do not have a current
 448          * process.  If there is a free cpu we always schedule to it.
 449          */
 450         if (count &&
 451             (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 &&
 452             (lp->lwp_proc->p_flag & P_PASSIVE_ACQ) == 0) {
 453                 if (!mask)
 454                         printf("lwp %d/%d nocpu to schedule it on\n",
 455                                lp->lwp_proc->p_pid, lp->lwp_tid);
 456                 while (mask && count) {
 457                         cpuid = bsfl(mask);
 458                         KKASSERT((curprocmask & (1 << cpuid)) == 0);
 459                         atomic_clear_int(&rdyprocmask, 1 << cpuid);
 460                         lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
 461                         --count;
 462                         mask &= ~(1 << cpuid);
 463                 }
 464         }
 465
 466         /*
 467          * If there are still runnable processes try to wakeup a random
 468          * cpu that is running a much lower priority process in order to
 469          * preempt on it.  Note that gd_upri is only a hint, so we can
 470          * overwrite it from the wrong cpu.   If we can't find one, we
 471          * are SOL.
 472          *
 473          * We depress the priority check so multiple cpu bound programs
 474          * do not bounce between cpus.  Remember that the clock interrupt
 475          * will also cause all cpus to reschedule.
 476          *
 477          * We must mask against rdyprocmask or we will race in the boot
 478          * code (before all cpus have working scheduler helpers), plus
 479          * some cpus might not be operational and/or not configured to
 480          * handle user processes.
 481          */
 482         if (count && remote_resched && ncpus > 1) {
 483                 cpuid = scancpu;
 484                 do {
 485                         if (++cpuid == ncpus)
 486                                 cpuid = 0;
 487                 } while (cpuid == mycpu->gd_cpuid);
 488                 scancpu = cpuid;
 489
 490                 if (rdyprocmask & (1 << cpuid)) {
 491                         gd = globaldata_find(cpuid);
 492
 493                         if (lp->lwp_priority < gd->gd_upri - PPQ) {
 494                                 gd->gd_upri = lp->lwp_priority;
 495                                 lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
 496                                 ++remote_resched_nonaffinity;
 497                         }
 498                 }
 499         }
 500 #else
 501         if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
 502                 if (lp->lwp_priority < gd->gd_upri - PPQ) {
 503                         gd->gd_upri = lp->lwp_priority;
 504                         gd->gd_rrcount = 0;
 505                         need_user_resched();
 506                 } else if (gd->gd_uschedcp == lp && needresched) {
 507                         gd->gd_rrcount = 0;
 508                         need_user_resched();
 509                 }
 510         }
 511 #endif
 512         crit_exit();
 513 }
 514
 515 /*
 516  * remrunqueue() removes a given process from the run queue that it is on,
 517  * clearing the queue busy bit if it becomes empty.  This function is called
 518  * when a userland process is selected for LWKT scheduling.  Note that
 519  * LWKT scheduling is an abstraction of 'curproc'.. there could very well be
 520  * several userland processes whos threads are scheduled or otherwise in
 521  * a special state, and such processes are NOT on the userland scheduler's
 522  * run queue.
 523  *
 524  * This must be called at splhigh().
 525  */
 526 static void
 527 bsd4_remrunqueue(struct lwp *lp)
 528 {
 529         struct rq *q;
 530         u_int32_t *which;
 531         u_int8_t pri;
 532
 533         ASSERT_MP_LOCK_HELD(lp->lwp_thread);
 534         crit_enter();
 535         KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq4!"));
 536         lp->lwp_proc->p_flag &= ~P_ONRUNQ;
 537         --runqcount;
 538         KKASSERT(runqcount >= 0);
 539         pri = lp->lwp_rqindex;
 540         if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
 541                 q = &queues[pri];
 542                 which = &queuebits;
 543         } else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
 544                    lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
 545                 q = &rtqueues[pri];
 546                 which = &rtqueuebits;
 547         } else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
 548                 q = &idqueues[pri];
 549                 which = &idqueuebits;
 550         } else {
 551                 panic("remrunqueue: invalid rtprio type");
 552         }
 553         TAILQ_REMOVE(q, lp, lwp_procq);
 554         if (TAILQ_EMPTY(q)) {
 555                 KASSERT((*which & (1 << pri)) != 0,
 556                         ("remrunqueue: remove from empty queue"));
 557                 *which &= ~(1 << pri);
 558         }
 559         crit_exit();
 560 }
 561
 562 /*
 563  * This routine is called from a systimer IPI.  It MUST be MP-safe and
 564  * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ.
 565  *
 566  * MPSAFE
 567  */
 568 static
 569 void
 570 bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
 571 {
 572         globaldata_t gd = mycpu;
 573
 574         /*
 575          * Do we need to round-robin?  We round-robin 10 times a second.
 576          * This should only occur for cpu-bound batch processes.
 577          */
 578         if (++gd->gd_rrcount >= usched_bsd4_rrinterval) {
 579                 gd->gd_rrcount = 0;
 580                 need_user_resched();
 581         }
 582
 583         /*
 584          * As the process accumulates cpu time p_estcpu is bumped and may
 585          * push the process into another scheduling queue.  It typically
 586          * takes 4 ticks to bump the queue.
 587          */
 588         lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
 589
 590         /*
 591          * Reducing p_origcpu over time causes more of our estcpu to be
 592          * returned to the parent when we exit.  This is a small tweak
 593          * for the batch detection heuristic.
 594          */
 595         if (lp->lwp_origcpu)
 596                 --lp->lwp_origcpu;
 597
 598         /* XXX optimize, avoid lock if no reset is required */
 599         if (try_mplock()) {
 600                 bsd4_resetpriority(lp);
 601                 rel_mplock();
 602         }
 603 }
 604
 605 /*
 606  * Release the current process designation on p.  P MUST BE CURPROC.
 607  * Attempt to assign a new current process from the run queue.
 608  *
 609  * This function is called from exit1(), tsleep(), and the passive
 610  * release code setup in <arch>/<arch>/trap.c
 611  *
 612  * If we do not have or cannot get the MP lock we just wakeup the userland
 613  * helper scheduler thread for this cpu to do the work for us.
 614  *
 615  * WARNING!  The MP lock may be in an unsynchronized state due to the
 616  * way get_mplock() works and the fact that this function may be called
 617  * from a passive release during a lwkt_switch().   try_mplock() will deal
 618  * with this for us but you should be aware that td_mpcount may not be
 619  * useable.
 620  */
 621 static void
 622 bsd4_release_curproc(struct lwp *lp)
 623 {
 624         int cpuid;
 625         globaldata_t gd = mycpu;
 626
 627         KKASSERT(lp->lwp_thread->td_gd == gd);
 628         crit_enter();
 629         cpuid = gd->gd_cpuid;
 630
 631         if (gd->gd_uschedcp == lp) {
 632                 if (try_mplock()) {
 633                         /*
 634                          * If we can obtain the MP lock we can directly
 635                          * select the next current process.
 636                          *
 637                          * bsd4_select_curproc() will adjust curprocmask
 638                          * for us.
 639                          */
 640                         gd->gd_uschedcp = NULL;
 641                         gd->gd_upri = PRIBASE_NULL;
 642                         bsd4_select_curproc(gd);
 643                         rel_mplock();
 644                 } else {
 645                         /*
 646                          * If we cannot obtain the MP lock schedule our
 647                          * helper thread to select the next current
 648                          * process.
 649                          *
 650                          * This is the only place where we adjust curprocmask
 651                          * and rdyprocmask without holding the MP lock.
 652                          */
 653                         gd->gd_uschedcp = NULL;
 654                         gd->gd_upri = PRIBASE_NULL;
 655                         atomic_clear_int(&curprocmask, 1 << cpuid);
 656                         if (runqcount && (rdyprocmask & (1 << cpuid))) {
 657                                 atomic_clear_int(&rdyprocmask, 1 << cpuid);
 658                                 lwkt_schedule(&mycpu->gd_schedthread);
 659                         }
 660                 }
 661         }
 662         crit_exit();
 663 }
 664
 665 /*
 666  * Select a new current process, potentially retaining gd_uschedcp.  However,
 667  * be sure to round-robin.  This routine is generally only called if a
 668  * reschedule is requested and that typically only occurs if a new process
 669  * has a better priority or when we are round-robining.
 670  *
 671  * NOTE: Must be called with giant held and the current cpu's gd.
 672  * NOTE: The caller must handle the situation where it loses a
 673  *      uschedcp designation that it previously held, typically by
 674  *      calling acquire_curproc() again.
 675  * NOTE: May not block
 676  */
 677 static
 678 void
 679 bsd4_select_curproc(globaldata_t gd)
 680 {
 681         struct lwp *nlp;
 682         int cpuid = gd->gd_cpuid;
 683         void *old;
 684
 685         clear_user_resched();
 686         get_mplock();
 687
 688         /*
 689          * Choose the next designated current user process.
 690          * Note that we cannot schedule gd_schedthread
 691          * if runqcount is 0 without creating a scheduling
 692          * loop.
 693          *
 694          * We do not clear the user resched request here,
 695          * we need to test it later when we re-acquire.
 696          *
 697          * NOTE: chooseproc returns NULL if the chosen lwp
 698          * is gd_uschedcp. XXX needs cleanup.
 699          */
 700         old = gd->gd_uschedcp;
 701         if ((nlp = chooseproc(gd->gd_uschedcp)) != NULL) {
 702                 atomic_set_int(&curprocmask, 1 << cpuid);
 703                 gd->gd_upri = nlp->lwp_priority;
 704                 gd->gd_uschedcp = nlp;
 705                 lwkt_acquire(nlp->lwp_thread);
 706                 lwkt_schedule(nlp->lwp_thread);
 707         } else if (gd->gd_uschedcp) {
 708                 gd->gd_upri = gd->gd_uschedcp->lwp_priority;
 709                 KKASSERT(curprocmask & (1 << cpuid));
 710         } else if (runqcount && (rdyprocmask & (1 << cpuid))) {
 711                 /*gd->gd_uschedcp = NULL;*/
 712                 atomic_clear_int(&curprocmask, 1 << cpuid);
 713                 atomic_clear_int(&rdyprocmask, 1 << cpuid);
 714                 lwkt_schedule(&gd->gd_schedthread);
 715         } else {
 716                 /*gd->gd_uschedcp = NULL;*/
 717                 atomic_clear_int(&curprocmask, 1 << cpuid);
 718         }
 719         rel_mplock();
 720 }
 721
 722 /*
 723  * Acquire the current process designation on the CURRENT process only.
 724  * This function is called at kernel-user priority (not userland priority)
 725  * when curlwp does not match gd_uschedcp.
 726  *
 727  * This function is only called just prior to returning to user mode.
 728  *
 729  * Basically we recalculate our estcpu to hopefully give us a more
 730  * favorable disposition, setrunqueue, then wait for the curlwp
 731  * designation to be handed to us (if the setrunqueue didn't do it).
 732  *
 733  * WARNING! THIS FUNCTION MAY CAUSE THE CURRENT THREAD TO MIGRATE TO
 734  * ANOTHER CPU!  Because most of the kernel assumes that no migration will
 735  * occur, this function is called only under very controlled circumstances.
 736  */
 737 static void
 738 bsd4_acquire_curproc(struct lwp *lp)
 739 {
 740         globaldata_t gd = mycpu;
 741
 742         get_mplock();
 743         crit_enter();
 744
 745         /*
 746          * Recalculate our priority and put us back on the userland
 747          * scheduler's runq.
 748          *
 749          * Only increment the involuntary context switch count if the
 750          * setrunqueue call did not immediately schedule us.
 751          */
 752         KKASSERT(lp == gd->gd_curthread->td_lwp);
 753         bsd4_recalculate_estcpu(lp);
 754         lwkt_deschedule_self(gd->gd_curthread);
 755         bsd4_setrunqueue(lp);
 756         if ((gd->gd_curthread->td_flags & TDF_RUNQ) == 0)
 757                 ++lp->lwp_stats->p_ru.ru_nivcsw;
 758         lwkt_switch();
 759
 760         /*
 761          * Because we put ourselves back on the userland scheduler's run
 762          * queue, WE MAY HAVE BEEN MIGRATED TO ANOTHER CPU
 763          */
 764         gd = mycpu;
 765
 766         /*
 767          * We better be the current process when we wake up, and we had
 768          * better not be on the run queue.
 769          */
 770         KKASSERT(gd->gd_uschedcp == lp);
 771         KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
 772
 773         crit_exit();
 774         rel_mplock();
 775 }
 776
 777 /*
 778  * Compute the priority of a process when running in user mode.
 779  * Arrange to reschedule if the resulting priority is better
 780  * than that of the current process.
 781  */
 782 static void
 783 bsd4_resetpriority(struct lwp *lp)
 784 {
 785         int newpriority;
 786         int opq;
 787         int npq;
 788
 789         ASSERT_MP_LOCK_HELD(curthread);
 790
 791         /*
 792          * Set p_priority for general process comparisons
 793          */
 794         switch(lp->lwp_rtprio.type) {
 795         case RTP_PRIO_REALTIME:
 796                 lp->lwp_priority = PRIBASE_REALTIME + lp->lwp_rtprio.prio;
 797                 return;
 798         case RTP_PRIO_NORMAL:
 799                 break;
 800         case RTP_PRIO_IDLE:
 801                 lp->lwp_priority = PRIBASE_IDLE + lp->lwp_rtprio.prio;
 802                 return;
 803         case RTP_PRIO_THREAD:
 804                 lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio;
 805                 return;
 806         }
 807
 808         /*
 809          * NORMAL priorities fall through.  These are based on niceness
 810          * and cpu use.  Lower numbers == higher priorities.
 811          *
 812          * Calculate our priority based on our niceness and estimated cpu.
 813          * Note that the nice value adjusts the baseline, which effects
 814          * cpu bursts but does not effect overall cpu use between cpu-bound
 815          * processes.  The use of the nice field in the decay calculation
 816          * controls the overall cpu use.
 817          *
 818          * This isn't an exact calculation.  We fit the full nice and
 819          * estcpu range into the priority range so the actual PPQ value
 820          * is incorrect, but it's still a reasonable way to think about it.
 821          */
 822         newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
 823         newpriority += lp->lwp_estcpu * PPQ / ESTCPUPPQ;
 824         newpriority = newpriority * MAXPRI /
 825                     (PRIO_RANGE * PPQ / NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
 826         newpriority = MIN(newpriority, MAXPRI - 1);     /* sanity */
 827         newpriority = MAX(newpriority, 0);              /* sanity */
 828         npq = newpriority / PPQ;
 829         crit_enter();
 830         opq = (lp->lwp_priority & PRIMASK) / PPQ;
 831         if (lp->lwp_proc->p_stat == SRUN && (lp->lwp_proc->p_flag & P_ONRUNQ) && opq != npq) {
 832                 /*
 833                  * We have to move the process to another queue
 834                  */
 835                 bsd4_remrunqueue(lp);
 836                 lp->lwp_priority = PRIBASE_NORMAL + newpriority;
 837                 bsd4_setrunqueue(lp);
 838         } else {
 839                 /*
 840                  * We can just adjust the priority and it will be picked
 841                  * up later.
 842                  */
 843                 KKASSERT(opq == npq || (lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
 844                 lp->lwp_priority = PRIBASE_NORMAL + newpriority;
 845         }
 846         crit_exit();
 847 }
 848
 849 /*
 850  * Called from fork1() when a new child process is being created.
 851  *
 852  * Give the child process an initial estcpu that is more batch then
 853  * its parent and dock the parent for the fork (but do not
 854  * reschedule the parent).   This comprises the main part of our batch
 855  * detection heuristic for both parallel forking and sequential execs.
 856  *
 857  * Interactive processes will decay the boosted estcpu quickly while batch
 858  * processes will tend to compound it.
 859  * XXX lwp should be "spawning" instead of "forking"
 860  *
 861  * MPSAFE
 862  */
 863 static void
 864 bsd4_forking(struct lwp *plp, struct lwp *lp)
 865 {
 866         lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
 867         lp->lwp_origcpu = lp->lwp_estcpu;
 868         plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
 869 }
 870
 871 /*
 872  * Called when the parent reaps a child.   Propogate cpu use by the child
 873  * back to the parent.
 874  *
 875  * MPSAFE
 876  */
 877 static void
 878 bsd4_exiting(struct lwp *plp, struct lwp *lp)
 879 {
 880         int delta;
 881
 882         if (plp->lwp_proc->p_pid != 1) {
 883                 delta = lp->lwp_estcpu - lp->lwp_origcpu;
 884                 if (delta > 0)
 885                         plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + delta);
 886         }
 887 }
 888
 889 /*
 890  * Called from acquire and from kern_synch's one-second timer with a
 891  * critical section held.
 892  *
 893  * Decay p_estcpu based on the number of ticks we haven't been running
 894  * and our p_nice.  As the load increases each process observes a larger
 895  * number of idle ticks (because other processes are running in them).
 896  * This observation leads to a larger correction which tends to make the
 897  * system more 'batchy'.
 898  *
 899  * Note that no recalculation occurs for a process which sleeps and wakes
 900  * up in the same tick.  That is, a system doing thousands of context
 901  * switches per second will still only do serious estcpu calculations
 902  * ESTCPUFREQ times per second.
 903  */
 904 static
 905 void
 906 bsd4_recalculate_estcpu(struct lwp *lp)
 907 {
 908         globaldata_t gd = mycpu;
 909         sysclock_t cpbase;
 910         int loadfac;
 911         int ndecay;
 912         int nticks;
 913         int nleft;
 914
 915         ASSERT_MP_LOCK_HELD(curthread);
 916
 917         /*
 918          * We have to subtract periodic to get the last schedclock
 919          * timeout time, otherwise we would get the upcoming timeout.
 920          * Keep in mind that a process can migrate between cpus and
 921          * while the scheduler clock should be very close, boundary
 922          * conditions could lead to a small negative delta.
 923          */
 924         cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
 925
 926         if (lp->lwp_slptime > 1) {
 927                 /*
 928                  * Too much time has passed, do a coarse correction.
 929                  */
 930                 lp->lwp_estcpu = lp->lwp_estcpu >> 1;
 931                 bsd4_resetpriority(lp);
 932                 lp->lwp_cpbase = cpbase;
 933                 lp->lwp_cpticks = 0;
 934         } else if (lp->lwp_cpbase != cpbase) {
 935                 /*
 936                  * Adjust estcpu if we are in a different tick.  Don't waste
 937                  * time if we are in the same tick.
 938                  *
 939                  * First calculate the number of ticks in the measurement
 940                  * interval.
 941                  */
 942                 nticks = (cpbase - lp->lwp_cpbase) / gd->gd_schedclock.periodic;
 943                 updatepcpu(lp, lp->lwp_cpticks, nticks);
 944
 945                 if ((nleft = nticks - lp->lwp_cpticks) < 0)
 946                         nleft = 0;
 947                 if (usched_debug == lp->lwp_proc->p_pid) {
 948                         printf("pid %d tid %d estcpu %d cpticks %d nticks %d nleft %d",
 949                                 lp->lwp_proc->p_pid, lp->lwp_tid, lp->lwp_estcpu,
 950                                 lp->lwp_cpticks, nticks, nleft);
 951                 }
 952
 953                 /*
 954                  * Calculate a decay value based on ticks remaining scaled
 955                  * down by the instantanious load and p_nice.
 956                  */
 957                 if ((loadfac = runqcount) < 2)
 958                         loadfac = 2;
 959                 ndecay = nleft * usched_bsd4_decay * 2 *
 960                         (PRIO_MAX * 2 - lp->lwp_proc->p_nice) / (loadfac * PRIO_MAX * 2);
 961
 962                 /*
 963                  * Adjust p_estcpu.  Handle a border case where batch jobs
 964                  * can get stalled long enough to decay to zero when they
 965                  * shouldn't.
 966                  */
 967                 if (lp->lwp_estcpu > ndecay * 2)
 968                         lp->lwp_estcpu -= ndecay;
 969                 else
 970                         lp->lwp_estcpu >>= 1;
 971
 972                 if (usched_debug == lp->lwp_proc->p_pid)
 973                         printf(" ndecay %d estcpu %d\n", ndecay, lp->lwp_estcpu);
 974
 975                 bsd4_resetpriority(lp);
 976                 lp->lwp_cpbase = cpbase;
 977                 lp->lwp_cpticks = 0;
 978         }
 979 }
 980
 981 #ifdef SMP
 982
 983 /*
 984  * For SMP systems a user scheduler helper thread is created for each
 985  * cpu and is used to allow one cpu to wakeup another for the purposes of
 986  * scheduling userland threads from setrunqueue().  UP systems do not
 987  * need the helper since there is only one cpu.  We can't use the idle
 988  * thread for this because we need to hold the MP lock.  Additionally,
 989  * doing things this way allows us to HLT idle cpus on MP systems.
 990  */
 991 static void
 992 sched_thread(void *dummy)
 993 {
 994     globaldata_t gd = mycpu;
 995     int cpuid = gd->gd_cpuid;           /* doesn't change */
 996     u_int32_t cpumask = 1 << cpuid;     /* doesn't change */
 997
 998     ASSERT_MP_LOCK_HELD(curthread);
 999     for (;;) {
1000         struct lwp *nlp;
1001
1002         lwkt_deschedule_self(gd->gd_curthread); /* interlock */
1003         atomic_set_int(&rdyprocmask, cpumask);
1004         crit_enter_quick(gd->gd_curthread);
1005         if ((curprocmask & cpumask) == 0 && (nlp = chooseproc(NULL)) != NULL) {
1006             atomic_set_int(&curprocmask, cpumask);
1007             gd->gd_upri = nlp->lwp_priority;
1008             gd->gd_uschedcp = nlp;
1009             lwkt_acquire(nlp->lwp_thread);
1010             lwkt_schedule(nlp->lwp_thread);
1011         }
1012         crit_exit_quick(gd->gd_curthread);
1013         lwkt_switch();
1014     }
1015 }
1016
1017 /*
1018  * Setup our scheduler helpers.  Note that curprocmask bit 0 has already
1019  * been cleared by rqinit() and we should not mess with it further.
1020  */
1021 static void
1022 sched_thread_cpu_init(void)
1023 {
1024     int i;
1025
1026     if (bootverbose)
1027         printf("start scheduler helpers on cpus:");
1028
1029     for (i = 0; i < ncpus; ++i) {
1030         globaldata_t dgd = globaldata_find(i);
1031         cpumask_t mask = 1 << i;
1032
1033         if ((mask & smp_active_mask) == 0)
1034             continue;
1035
1036         if (bootverbose)
1037             printf(" %d", i);
1038
1039         lwkt_create(sched_thread, NULL, NULL, &dgd->gd_schedthread,
1040                     TDF_STOPREQ, i, "usched %d", i);
1041
1042         /*
1043          * Allow user scheduling on the target cpu.  cpu #0 has already
1044          * been enabled in rqinit().
1045          */
1046         if (i)
1047             atomic_clear_int(&curprocmask, mask);
1048         atomic_set_int(&rdyprocmask, mask);
1049     }
1050     if (bootverbose)
1051         printf("\n");
1052 }
1053 SYSINIT(uschedtd, SI_SUB_FINISH_SMP, SI_ORDER_ANY, sched_thread_cpu_init, NULL)
1054
1055 #endif
1056