sys/kern/kern_switch.c

   1 /*
   2  * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  *
  26  * $FreeBSD: src/sys/kern/kern_switch.c,v 1.3.2.1 2000/05/16 06:58:12 dillon Exp $
  27  * $DragonFly: src/sys/kern/Attic/kern_switch.c,v 1.23 2004/07/24 20:37:04 dillon Exp $
  28  */
  29
  30 #include <sys/param.h>
  31 #include <sys/systm.h>
  32 #include <sys/kernel.h>
  33 #include <sys/lock.h>
  34 #include <sys/queue.h>
  35 #include <sys/proc.h>
  36 #include <sys/rtprio.h>
  37 #include <sys/thread2.h>
  38 #include <sys/uio.h>
  39 #include <sys/sysctl.h>
  40 #include <sys/resourcevar.h>
  41 #include <machine/ipl.h>
  42 #include <machine/cpu.h>
  43 #include <machine/smp.h>
  44
  45 /*
  46  * debugging only YYY Remove me!   define to schedule user processes only
  47  * on the BSP.  Interrupts can still be taken on the APs.
  48  */
  49 #undef ONLY_ONE_USER_CPU
  50
  51 /*
  52  * We have NQS (32) run queues per scheduling class.  For the normal
  53  * class, there are 128 priorities scaled onto these 32 queues.  New
  54  * processes are added to the last entry in each queue, and processes
  55  * are selected for running by taking them from the head and maintaining
  56  * a simple FIFO arrangement.  Realtime and Idle priority processes have
  57  * and explicit 0-31 priority which maps directly onto their class queue
  58  * index.  When a queue has something in it, the corresponding bit is
  59  * set in the queuebits variable, allowing a single read to determine
  60  * the state of all 32 queues and then a ffs() to find the first busy
  61  * queue.
  62  */
  63 static struct rq queues[NQS];
  64 static struct rq rtqueues[NQS];
  65 static struct rq idqueues[NQS];
  66 static u_int32_t queuebits;
  67 static u_int32_t rtqueuebits;
  68 static u_int32_t idqueuebits;
  69 static cpumask_t curprocmask = -1;      /* currently running a user process */
  70 static cpumask_t rdyprocmask;           /* ready to accept a user process */
  71 static int       runqcount;
  72 #ifdef SMP
  73 static int       scancpu;
  74 #endif
  75
  76 SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, "");
  77 #ifdef INVARIANTS
  78 static int usched_nonoptimal;
  79 SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
  80         &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
  81 static int usched_optimal;
  82 SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
  83         &usched_optimal, 0, "acquire_curproc() was optimal");
  84 #endif
  85 static int usched_debug;
  86 SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, "");
  87 #ifdef SMP
  88 static int remote_resched = 1;
  89 static int remote_resched_nonaffinity;
  90 static int remote_resched_affinity;
  91 static int choose_affinity;
  92 SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW,
  93         &remote_resched, 0, "Resched to another cpu");
  94 SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
  95         &remote_resched_nonaffinity, 0, "Number of remote rescheds");
  96 SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
  97         &remote_resched_affinity, 0, "Number of remote rescheds");
  98 SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
  99         &choose_affinity, 0, "chooseproc() was smart");
 100 #endif
 101
 102 /*
 103  * Initialize the run queues at boot time.
 104  */
 105 static void
 106 rqinit(void *dummy)
 107 {
 108         int i;
 109
 110         for (i = 0; i < NQS; i++) {
 111                 TAILQ_INIT(&queues[i]);
 112                 TAILQ_INIT(&rtqueues[i]);
 113                 TAILQ_INIT(&idqueues[i]);
 114         }
 115         curprocmask &= ~1;
 116 }
 117 SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)
 118
 119 /*
 120  * chooseproc() is called when a cpu needs a user process to LWKT schedule,
 121  * it selects a user process and returns it.  If chkp is non-NULL and chkp
 122  * has a better or equal then the process that would otherwise be
 123  * chosen, NULL is returned.
 124  *
 125  * Until we fix the RUNQ code the chkp test has to be strict or we may
 126  * bounce between processes trying to acquire the current process designation.
 127  */
 128 static
 129 struct proc *
 130 chooseproc(struct proc *chkp)
 131 {
 132         struct proc *p;
 133         struct rq *q;
 134         u_int32_t *which;
 135         u_int32_t pri;
 136
 137         if (rtqueuebits) {
 138                 pri = bsfl(rtqueuebits);
 139                 q = &rtqueues[pri];
 140                 which = &rtqueuebits;
 141         } else if (queuebits) {
 142                 pri = bsfl(queuebits);
 143                 q = &queues[pri];
 144                 which = &queuebits;
 145         } else if (idqueuebits) {
 146                 pri = bsfl(idqueuebits);
 147                 q = &idqueues[pri];
 148                 which = &idqueuebits;
 149         } else {
 150                 return NULL;
 151         }
 152         p = TAILQ_FIRST(q);
 153         KASSERT(p, ("chooseproc: no proc on busy queue"));
 154
 155         /*
 156          * If the passed process <chkp> is reasonably close to the selected
 157          * processed <p>, return NULL (indicating that <chkp> should be kept).
 158          *
 159          * Note that we must error on the side of <chkp> to avoid bouncing
 160          * between threads in the acquire code.
 161          */
 162         if (chkp) {
 163                 if (chkp->p_priority < p->p_priority + PPQ)
 164                         return(NULL);
 165         }
 166
 167 #ifdef SMP
 168         /*
 169          * If the chosen process does not reside on this cpu spend a few
 170          * cycles looking for a better candidate at the same priority level.
 171          * This is a fallback check, setrunqueue() tries to wakeup the
 172          * correct cpu and is our front-line affinity.
 173          */
 174         if (p->p_thread->td_gd != mycpu &&
 175             (chkp = TAILQ_NEXT(p, p_procq)) != NULL
 176         ) {
 177                 if (chkp->p_thread->td_gd == mycpu) {
 178                         ++choose_affinity;
 179                         p = chkp;
 180                 }
 181         }
 182 #endif
 183
 184         TAILQ_REMOVE(q, p, p_procq);
 185         --runqcount;
 186         if (TAILQ_EMPTY(q))
 187                 *which &= ~(1 << pri);
 188         KASSERT((p->p_flag & P_ONRUNQ) != 0, ("not on runq6!"));
 189         p->p_flag &= ~P_ONRUNQ;
 190         return p;
 191 }
 192
 193 #ifdef SMP
 194 /*
 195  * called via an ipi message to reschedule on another cpu.
 196  */
 197 static
 198 void
 199 need_user_resched_remote(void *dummy)
 200 {
 201         need_user_resched();
 202 }
 203
 204 #endif
 205
 206 /*
 207  * setrunqueue() 'wakes up' a 'user' process.  GIANT must be held.  The
 208  * user process may represent any user process, including the current
 209  * process.
 210  *
 211  * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
 212  * cpus in an attempt to keep the process on the current cpu at least for
 213  * a little while to take advantage of locality of reference (e.g. fork/exec
 214  * or short fork/exit, and uio_yield()).
 215  *
 216  * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
 217  * or (user level) preempt on the same cpu that a process was previously
 218  * scheduled to.  If we cannot do this but we are at enough of a higher
 219  * priority then the processes running on other cpus, we will allow the
 220  * process to be stolen by another cpu.
 221  *
 222  * WARNING! a thread can be acquired by another cpu the moment it is put
 223  * on the user scheduler's run queue AND we release the MP lock.  Since we
 224  * release the MP lock before switching out another cpu may begin stealing
 225  * our current thread before we are completely switched out!  The
 226  * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
 227  * thread before stealing it.
 228  *
 229  * The associated thread must NOT be scheduled.
 230  * The process must be runnable.
 231  * This must be called at splhigh().
 232  */
 233 void
 234 setrunqueue(struct proc *p)
 235 {
 236         struct rq *q;
 237         struct globaldata *gd;
 238         int pri;
 239         int cpuid;
 240 #ifdef SMP
 241         int count;
 242         cpumask_t mask;
 243 #endif
 244
 245         crit_enter();
 246         KASSERT(p->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
 247         KASSERT((p->p_flag & P_ONRUNQ) == 0,
 248             ("process %d already on runq! flag %08x", p->p_pid, p->p_flag));
 249         KKASSERT((p->p_thread->td_flags & TDF_RUNQ) == 0);
 250
 251         /*
 252          * Note: gd is the gd of the TARGET thread's cpu, not our cpu.
 253          */
 254         gd = p->p_thread->td_gd;
 255
 256         /*
 257          * We have not been released, make sure that we are not the currently
 258          * designated process.
 259          */
 260         KKASSERT(gd->gd_uschedcp != p);
 261
 262         /*
 263          * Check cpu affinity.  The associated thread is stable at the
 264          * moment.  Note that we may be checking another cpu here so we
 265          * have to be careful.  We are currently protected by the BGL.
 266          *
 267          * This allows us to avoid actually queueing the process.
 268          * acquire_curproc() will handle any threads we mistakenly schedule.
 269          */
 270         cpuid = gd->gd_cpuid;
 271
 272         if ((curprocmask & (1 << cpuid)) == 0) {
 273                 curprocmask |= 1 << cpuid;
 274                 gd->gd_uschedcp = p;
 275                 if (usched_debug)
 276                         printf("F%-7d", gd->gd_uschedcp->p_pid);
 277                 gd->gd_upri = p->p_priority;
 278                 lwkt_schedule(p->p_thread);
 279                 /* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */
 280                 crit_exit();
 281 #ifdef SMP
 282                 if (gd != mycpu)
 283                         ++remote_resched_affinity;
 284 #endif
 285                 return;
 286         }
 287
 288         /*
 289          * gd and cpuid may still 'hint' at another cpu.  Even so we have
 290          * to place this process on the userland scheduler's run queue for
 291          * action by the target cpu.
 292          */
 293         ++runqcount;
 294         p->p_flag |= P_ONRUNQ;
 295         if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
 296                 pri = (p->p_priority & PRIMASK) >> 2;
 297                 q = &queues[pri];
 298                 queuebits |= 1 << pri;
 299         } else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
 300                    p->p_rtprio.type == RTP_PRIO_FIFO) {
 301                 pri = (u_int8_t)p->p_rtprio.prio;
 302                 q = &rtqueues[pri];
 303                 rtqueuebits |= 1 << pri;
 304         } else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
 305                 pri = (u_int8_t)p->p_rtprio.prio;
 306                 q = &idqueues[pri];
 307                 idqueuebits |= 1 << pri;
 308         } else {
 309                 panic("setrunqueue: invalid rtprio type");
 310         }
 311         KKASSERT(pri < 32);
 312         p->p_rqindex = pri;             /* remember the queue index */
 313         TAILQ_INSERT_TAIL(q, p, p_procq);
 314
 315 #ifdef SMP
 316         /*
 317          * Either wakeup other cpus user thread scheduler or request
 318          * preemption on other cpus (which will also wakeup a HLT).
 319          *
 320          * NOTE!  gd and cpuid may still be our 'hint', not our current
 321          * cpu info.
 322          */
 323
 324         count = runqcount;
 325
 326         /*
 327          * Check cpu affinity for user preemption (when the curprocmask bit
 328          * is set).  Note that gd_upri is a speculative field (we modify
 329          * another cpu's gd_upri to avoid sending ipiq storms).
 330          */
 331         if (gd == mycpu) {
 332                 if ((p->p_thread->td_flags & TDF_NORESCHED) == 0 &&
 333                     p->p_priority < gd->gd_upri - PPQ) {
 334                         gd->gd_upri = p->p_priority;
 335                         need_user_resched();
 336                         --count;
 337                 }
 338         } else if (remote_resched) {
 339                 if (p->p_priority < gd->gd_upri - PPQ) {
 340                         gd->gd_upri = p->p_priority;
 341                         lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
 342                         --count;
 343                         ++remote_resched_affinity;
 344                 }
 345         }
 346
 347         /*
 348          * No affinity, first schedule to any cpus that do not have a current
 349          * process.  If there is a free cpu we always schedule to it.
 350          */
 351         if (count &&
 352             (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 &&
 353             (p->p_flag & P_PASSIVE_ACQ) == 0) {
 354                 if (!mask)
 355                         printf("PROC %d nocpu to schedule it on\n", p->p_pid);
 356                 while (mask && count) {
 357                         cpuid = bsfl(mask);
 358                         KKASSERT((curprocmask & (1 << cpuid)) == 0);
 359                         rdyprocmask &= ~(1 << cpuid);
 360                         lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
 361                         --count;
 362                         mask &= ~(1 << cpuid);
 363                 }
 364         }
 365
 366         /*
 367          * If there are still runnable processes try to wakeup a random
 368          * cpu that is running a much lower priority process in order to
 369          * preempt on it.  Note that gd_upri is only a hint, so we can
 370          * overwrite it from the wrong cpu.   If we can't find one, we
 371          * are SOL.
 372          *
 373          * We depress the priority check so multiple cpu bound programs
 374          * do not bounce between cpus.  Remember that the clock interrupt
 375          * will also cause all cpus to reschedule.
 376          *
 377          * We must mask against rdyprocmask or we will race in the boot
 378          * code (before all cpus have working scheduler helpers), plus
 379          * some cpus might not be operational and/or not configured to
 380          * handle user processes.
 381          */
 382         if (count && remote_resched && ncpus > 1) {
 383                 cpuid = scancpu;
 384                 do {
 385                         if (++cpuid == ncpus)
 386                                 cpuid = 0;
 387                 } while (cpuid == mycpu->gd_cpuid);
 388                 scancpu = cpuid;
 389
 390                 if (rdyprocmask & (1 << cpuid)) {
 391                         gd = globaldata_find(cpuid);
 392
 393                         if (p->p_priority < gd->gd_upri - PPQ) {
 394                                 gd->gd_upri = p->p_priority;
 395                                 lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
 396                                 ++remote_resched_nonaffinity;
 397                         }
 398                 }
 399         }
 400 #else
 401         if ((p->p_thread->td_flags & TDF_NORESCHED) == 0 &&
 402             p->p_priority < gd->gd_upri - PPQ) {
 403                 gd->gd_upri = p->p_priority;
 404                 need_user_resched();
 405         }
 406 #endif
 407         crit_exit();
 408 }
 409
 410 /*
 411  * remrunqueue() removes a given process from the run queue that it is on,
 412  * clearing the queue busy bit if it becomes empty.  This function is called
 413  * when a userland process is selected for LWKT scheduling.  Note that
 414  * LWKT scheduling is an abstraction of 'curproc'.. there could very well be
 415  * several userland processes whos threads are scheduled or otherwise in
 416  * a special state, and such processes are NOT on the userland scheduler's
 417  * run queue.
 418  *
 419  * This must be called at splhigh().
 420  */
 421 void
 422 remrunqueue(struct proc *p)
 423 {
 424         struct rq *q;
 425         u_int32_t *which;
 426         u_int8_t pri;
 427
 428         crit_enter();
 429         KASSERT((p->p_flag & P_ONRUNQ) != 0, ("not on runq4!"));
 430         p->p_flag &= ~P_ONRUNQ;
 431         --runqcount;
 432         KKASSERT(runqcount >= 0);
 433         pri = p->p_rqindex;
 434         if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
 435                 q = &queues[pri];
 436                 which = &queuebits;
 437         } else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
 438                    p->p_rtprio.type == RTP_PRIO_FIFO) {
 439                 q = &rtqueues[pri];
 440                 which = &rtqueuebits;
 441         } else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
 442                 q = &idqueues[pri];
 443                 which = &idqueuebits;
 444         } else {
 445                 panic("remrunqueue: invalid rtprio type");
 446         }
 447         TAILQ_REMOVE(q, p, p_procq);
 448         if (TAILQ_EMPTY(q)) {
 449                 KASSERT((*which & (1 << pri)) != 0,
 450                         ("remrunqueue: remove from empty queue"));
 451                 *which &= ~(1 << pri);
 452         }
 453         crit_exit();
 454 }
 455
 456 /*
 457  * Release the current process designation on p.  P MUST BE CURPROC.
 458  * Attempt to assign a new current process from the run queue.
 459  *
 460  * This function is called from exit1(), tsleep(), and the passive
 461  * release code setup in <arch>/<arch>/trap.c
 462  *
 463  * If we do not have or cannot get the MP lock we just wakeup the userland
 464  * helper scheduler thread for this cpu to do the work for us.
 465  *
 466  * WARNING!  The MP lock may be in an unsynchronized state due to the
 467  * way get_mplock() works and the fact that this function may be called
 468  * from a passive release during a lwkt_switch().   try_mplock() will deal
 469  * with this for us but you should be aware that td_mpcount may not be
 470  * useable.
 471  */
 472 void
 473 release_curproc(struct proc *p)
 474 {
 475         int cpuid;
 476         globaldata_t gd = mycpu;
 477
 478 #ifdef ONLY_ONE_USER_CPU
 479         KKASSERT(gd->gd_cpuid == 0 && p->p_thread->td_gd == gd);
 480 #else
 481         KKASSERT(p->p_thread->td_gd == gd);
 482 #endif
 483         crit_enter();
 484         if (usched_debug) {
 485             printf("c%-7d", p->p_pid);
 486         }
 487         cpuid = gd->gd_cpuid;
 488
 489         if (gd->gd_uschedcp == p) {
 490                 if (try_mplock()) {
 491                         /*
 492                          * YYY when the MP lock is not assumed (see else) we
 493                          * will have to check that gd_uschedcp is still == p
 494                          * after acquisition of the MP lock
 495                          */
 496                         gd->gd_uschedcp = NULL;
 497                         gd->gd_upri = PRIBASE_NULL;
 498                         select_curproc(gd);
 499                         rel_mplock();
 500                 } else {
 501                         KKASSERT(0);    /* MP LOCK ALWAYS HELD AT THE MOMENT */
 502                         gd->gd_uschedcp = NULL;
 503                         gd->gd_upri = PRIBASE_NULL;
 504                         /* YYY uschedcp and curprocmask */
 505                         if (runqcount && (rdyprocmask & (1 << cpuid))) {
 506                                 rdyprocmask &= ~(1 << cpuid);
 507                                 lwkt_schedule(&mycpu->gd_schedthread);
 508                         }
 509                 }
 510         }
 511         crit_exit();
 512 }
 513
 514 /*
 515  * Select a new current process, potentially retaining gd_uschedcp.  However,
 516  * be sure to round-robin.  This routine is generally only called if a
 517  * reschedule is requested and that typically only occurs if a new process
 518  * has a better priority or when we are round-robining.
 519  *
 520  * NOTE: Must be called with giant held and the current cpu's gd.
 521  * NOTE: The caller must handle the situation where it loses a
 522  *      uschedcp designation that it previously held, typically by
 523  *      calling acquire_curproc() again.
 524  * NOTE: May not block
 525  */
 526 void
 527 select_curproc(globaldata_t gd)
 528 {
 529         struct proc *np;
 530         int cpuid = gd->gd_cpuid;
 531         void *old;
 532
 533         clear_user_resched();
 534
 535         /*
 536          * Choose the next designated current user process.
 537          * Note that we cannot schedule gd_schedthread
 538          * if runqcount is 0 without creating a scheduling
 539          * loop.
 540          *
 541          * We do not clear the user resched request here,
 542          * we need to test it later when we re-acquire.
 543          *
 544          * NOTE: chooseproc returns NULL if the chosen proc
 545          * is gd_uschedcp. XXX needs cleanup.
 546          */
 547         old = gd->gd_uschedcp;
 548         if ((np = chooseproc(gd->gd_uschedcp)) != NULL) {
 549                 curprocmask |= 1 << cpuid;
 550                 gd->gd_upri = np->p_priority;
 551                 gd->gd_uschedcp = np;
 552                 if (usched_debug) {
 553                     printf("A%-7d[%p,%p]", gd->gd_uschedcp->p_pid, old, np);
 554                 }
 555                 lwkt_acquire(np->p_thread);
 556                 lwkt_schedule(np->p_thread);
 557         } else if (gd->gd_uschedcp) {
 558                 gd->gd_upri = gd->gd_uschedcp->p_priority;
 559                 KKASSERT(curprocmask & (1 << cpuid));
 560         } else if (runqcount && (rdyprocmask & (1 << cpuid))) {
 561                 /*gd->gd_uschedcp = NULL;*/
 562                 curprocmask &= ~(1 << cpuid);
 563                 rdyprocmask &= ~(1 << cpuid);
 564                 lwkt_schedule(&gd->gd_schedthread);
 565         } else {
 566                 /*gd->gd_uschedcp = NULL;*/
 567                 curprocmask &= ~(1 << cpuid);
 568         }
 569 }
 570
 571 /*
 572  * Acquire the current process designation on the CURRENT process only.
 573  * This function is called at kernel-user priority (not userland priority)
 574  * when curproc does not match gd_uschedcp.
 575  */
 576 void
 577 acquire_curproc(struct proc *p)
 578 {
 579         globaldata_t gd = mycpu;
 580
 581 #ifdef ONLY_ONE_USER_CPU
 582         KKASSERT(gd->gd_cpuid == 0);
 583 #endif
 584
 585         /*
 586          * Loop until we become the current process.
 587          */
 588         crit_enter();
 589         ++p->p_stats->p_ru.ru_nivcsw;
 590         do {
 591                 KKASSERT(p == gd->gd_curthread->td_proc);
 592
 593                 lwkt_deschedule_self(gd->gd_curthread);
 594                 setrunqueue(p);
 595                 lwkt_switch();
 596                 if (usched_debug)
 597                     printf("a");
 598
 599                 /*
 600                  * WE MAY HAVE BEEN MIGRATED TO ANOTHER CPU, RELOAD GD.
 601                  */
 602                 gd = mycpu;
 603         } while (gd->gd_uschedcp != p);
 604         crit_exit();
 605
 606         /*
 607          * That's it.  Cleanup, we are done.  The caller can return to
 608          * user mode now.
 609          */
 610         KKASSERT((p->p_flag & P_ONRUNQ) == 0);
 611 }
 612
 613 /*
 614  * Yield / synchronous reschedule.  This is a bit tricky because the trap
 615  * code might have set a lazy release on the switch function.   Setting
 616  * P_PASSIVE_ACQ will ensure that the lazy release executes when we call
 617  * switch, and that we are given a greater chance of affinity with our
 618  * current cpu.
 619  *
 620  * We call lwkt_setpri_self() to rotate our thread to the end of the lwkt
 621  * run queue.  lwkt_switch() will also execute any assigned passive release
 622  * (which usually calls release_curproc()), allowing a same/higher priority
 623  * process to be designated as the current process.
 624  *
 625  * While it is possible for a lower priority process to be designated,
 626  * it's call to lwkt_maybe_switch() in acquire_curproc() will likely
 627  * round-robin back to us and we will be able to re-acquire the current
 628  * process designation.
 629  */
 630 void
 631 uio_yield(void)
 632 {
 633         struct thread *td = curthread;
 634         struct proc *p = td->td_proc;
 635
 636         lwkt_setpri_self(td->td_pri & TDPRI_MASK);
 637         if (p) {
 638                 p->p_flag |= P_PASSIVE_ACQ;
 639                 lwkt_switch();
 640                 p->p_flag &= ~P_PASSIVE_ACQ;
 641         } else {
 642                 lwkt_switch();
 643         }
 644 }
 645
 646 #ifdef SMP
 647
 648 /*
 649  * For SMP systems a user scheduler helper thread is created for each
 650  * cpu and is used to allow one cpu to wakeup another for the purposes of
 651  * scheduling userland threads from setrunqueue().  UP systems do not
 652  * need the helper since there is only one cpu.  We can't use the idle
 653  * thread for this because we need to hold the MP lock.  Additionally,
 654  * doing things this way allows us to HLT idle cpus on MP systems.
 655  */
 656 static void
 657 sched_thread(void *dummy)
 658 {
 659     globaldata_t gd = mycpu;
 660     int cpuid = gd->gd_cpuid;           /* doesn't change */
 661     u_int32_t cpumask = 1 << cpuid;     /* doesn't change */
 662
 663 #ifdef ONLY_ONE_USER_CPU
 664     KKASSERT(cpuid == 0);
 665 #endif
 666
 667     get_mplock();                       /* hold the MP lock */
 668     for (;;) {
 669         struct proc *np;
 670
 671         lwkt_deschedule_self(gd->gd_curthread); /* interlock */
 672         rdyprocmask |= cpumask;
 673         crit_enter_quick(gd->gd_curthread);
 674         if ((curprocmask & cpumask) == 0 && (np = chooseproc(NULL)) != NULL) {
 675             curprocmask |= cpumask;
 676             gd->gd_upri = np->p_priority;
 677             gd->gd_uschedcp = np;
 678             if (usched_debug)
 679                 printf("E%-7d", gd->gd_uschedcp->p_pid);
 680             lwkt_acquire(np->p_thread);
 681             lwkt_schedule(np->p_thread);
 682         }
 683         crit_exit_quick(gd->gd_curthread);
 684         lwkt_switch();
 685     }
 686 }
 687
 688 /*
 689  * Setup our scheduler helpers.  Note that curprocmask bit 0 has already
 690  * been cleared by rqinit() and we should not mess with it further.
 691  */
 692 static void
 693 sched_thread_cpu_init(void)
 694 {
 695     int i;
 696
 697     if (bootverbose)
 698         printf("start scheduler helpers on cpus:");
 699
 700     for (i = 0; i < ncpus; ++i) {
 701         globaldata_t dgd = globaldata_find(i);
 702         cpumask_t mask = 1 << i;
 703
 704         if ((mask & smp_active_mask) == 0)
 705             continue;
 706
 707         if (bootverbose)
 708             printf(" %d", i);
 709
 710         lwkt_create(sched_thread, NULL, NULL, &dgd->gd_schedthread,
 711                     TDF_STOPREQ, i, "usched %d", i);
 712 #ifdef ONLY_ONE_USER_CPU
 713         if (i)
 714             curprocmask |= mask;        /* DISABLE USER PROCS */
 715 #else
 716         if (i)
 717             curprocmask &= ~mask;       /* schedule user proc on cpu */
 718 #endif
 719         rdyprocmask |= mask;
 720     }
 721     if (bootverbose)
 722         printf("\n");
 723 }
 724 SYSINIT(uschedtd, SI_SUB_FINISH_SMP, SI_ORDER_ANY, sched_thread_cpu_init, NULL)
 725
 726 #endif
 727