sys/kern/kern_synch.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1990, 1991, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. All advertising materials mentioning features or use of this software
  19  *    must display the following acknowledgement:
  20  *      This product includes software developed by the University of
  21  *      California, Berkeley and its contributors.
  22  * 4. Neither the name of the University nor the names of its contributors
  23  *    may be used to endorse or promote products derived from this software
  24  *    without specific prior written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  *
  38  *      @(#)kern_synch.c        8.9 (Berkeley) 5/19/95
  39  * $FreeBSD: src/sys/kern/kern_synch.c,v 1.87.2.6 2002/10/13 07:29:53 kbyanc Exp $
  40  * $DragonFly: src/sys/kern/kern_synch.c,v 1.85 2007/05/24 20:51:16 dillon Exp $
  41  */
  42
  43 #include "opt_ktrace.h"
  44
  45 #include <sys/param.h>
  46 #include <sys/systm.h>
  47 #include <sys/proc.h>
  48 #include <sys/kernel.h>
  49 #include <sys/signalvar.h>
  50 #include <sys/signal2.h>
  51 #include <sys/resourcevar.h>
  52 #include <sys/vmmeter.h>
  53 #include <sys/sysctl.h>
  54 #include <sys/lock.h>
  55 #ifdef KTRACE
  56 #include <sys/uio.h>
  57 #include <sys/ktrace.h>
  58 #endif
  59 #include <sys/xwait.h>
  60 #include <sys/ktr.h>
  61
  62 #include <sys/thread2.h>
  63 #include <sys/spinlock2.h>
  64
  65 #include <machine/cpu.h>
  66 #include <machine/smp.h>
  67
  68 TAILQ_HEAD(tslpque, thread);
  69
  70 static void sched_setup (void *dummy);
  71 SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
  72
  73 int     hogticks;
  74 int     lbolt;
  75 int     lbolt_syncer;
  76 int     sched_quantum;          /* Roundrobin scheduling quantum in ticks. */
  77 int     ncpus;
  78 int     ncpus2, ncpus2_shift, ncpus2_mask;
  79 int     ncpus_fit, ncpus_fit_mask;
  80 int     safepri;
  81 int     tsleep_now_works;
  82
  83 static struct callout loadav_callout;
  84 static struct callout schedcpu_callout;
  85 MALLOC_DEFINE(M_TSLEEP, "tslpque", "tsleep queues");
  86
  87 #if !defined(KTR_TSLEEP)
  88 #define KTR_TSLEEP      KTR_ALL
  89 #endif
  90 KTR_INFO_MASTER(tsleep);
  91 KTR_INFO(KTR_TSLEEP, tsleep, tsleep_beg, 0, "tsleep enter", 0);
  92 KTR_INFO(KTR_TSLEEP, tsleep, tsleep_end, 0, "tsleep exit", 0);
  93 KTR_INFO(KTR_TSLEEP, tsleep, wakeup_beg, 0, "wakeup enter", 0);
  94 KTR_INFO(KTR_TSLEEP, tsleep, wakeup_end, 0, "wakeup exit", 0);
  95 #define logtsleep(name) KTR_LOG(tsleep_ ## name)
  96
  97 struct loadavg averunnable =
  98         { {0, 0, 0}, FSCALE };  /* load average, of runnable procs */
  99 /*
 100  * Constants for averages over 1, 5, and 15 minutes
 101  * when sampling at 5 second intervals.
 102  */
 103 static fixpt_t cexp[3] = {
 104         0.9200444146293232 * FSCALE,    /* exp(-1/12) */
 105         0.9834714538216174 * FSCALE,    /* exp(-1/60) */
 106         0.9944598480048967 * FSCALE,    /* exp(-1/180) */
 107 };
 108
 109 static void     endtsleep (void *);
 110 static void     unsleep_and_wakeup_thread(struct thread *td);
 111 static void     loadav (void *arg);
 112 static void     schedcpu (void *arg);
 113
 114 /*
 115  * Adjust the scheduler quantum.  The quantum is specified in microseconds.
 116  * Note that 'tick' is in microseconds per tick.
 117  */
 118 static int
 119 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
 120 {
 121         int error, new_val;
 122
 123         new_val = sched_quantum * tick;
 124         error = sysctl_handle_int(oidp, &new_val, 0, req);
 125         if (error != 0 || req->newptr == NULL)
 126                 return (error);
 127         if (new_val < tick)
 128                 return (EINVAL);
 129         sched_quantum = new_val / tick;
 130         hogticks = 2 * sched_quantum;
 131         return (0);
 132 }
 133
 134 SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
 135         0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
 136
 137 /*
 138  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 139  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 140  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 141  *
 142  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 143  *     1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 144  *
 145  * If you don't want to bother with the faster/more-accurate formula, you
 146  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 147  * (more general) method of calculating the %age of CPU used by a process.
 148  *
 149  * decay 95% of `lwp_pctcpu' in 60 seconds; see CCPU_SHIFT before changing
 150  */
 151 #define CCPU_SHIFT      11
 152
 153 static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
 154 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
 155
 156 /*
 157  * kernel uses `FSCALE', userland (SHOULD) use kern.fscale
 158  */
 159 int     fscale __unused = FSCALE;       /* exported to systat */
 160 SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
 161
 162 /*
 163  * Recompute process priorities, once a second.
 164  *
 165  * Since the userland schedulers are typically event oriented, if the
 166  * estcpu calculation at wakeup() time is not sufficient to make a
 167  * process runnable relative to other processes in the system we have
 168  * a 1-second recalc to help out.
 169  *
 170  * This code also allows us to store sysclock_t data in the process structure
 171  * without fear of an overrun, since sysclock_t are guarenteed to hold
 172  * several seconds worth of count.
 173  *
 174  * WARNING!  callouts can preempt normal threads.  However, they will not
 175  * preempt a thread holding a spinlock so we *can* safely use spinlocks.
 176  */
 177 static int schedcpu_stats(struct proc *p, void *data __unused);
 178 static int schedcpu_resource(struct proc *p, void *data __unused);
 179
 180 static void
 181 schedcpu(void *arg)
 182 {
 183         allproc_scan(schedcpu_stats, NULL);
 184         allproc_scan(schedcpu_resource, NULL);
 185         wakeup((caddr_t)&lbolt);
 186         wakeup((caddr_t)&lbolt_syncer);
 187         callout_reset(&schedcpu_callout, hz, schedcpu, NULL);
 188 }
 189
 190 /*
 191  * General process statistics once a second
 192  */
 193 static int
 194 schedcpu_stats(struct proc *p, void *data __unused)
 195 {
 196         struct lwp *lp;
 197
 198         crit_enter();
 199         p->p_swtime++;
 200         FOREACH_LWP_IN_PROC(lp, p) {
 201                 if (lp->lwp_stat == LSSLEEP)
 202                         lp->lwp_slptime++;
 203
 204                 /*
 205                  * Only recalculate processes that are active or have slept
 206                  * less then 2 seconds.  The schedulers understand this.
 207                  */
 208                 if (lp->lwp_slptime <= 1) {
 209                         p->p_usched->recalculate(lp);
 210                 } else {
 211                         lp->lwp_pctcpu = (lp->lwp_pctcpu * ccpu) >> FSHIFT;
 212                 }
 213         }
 214         crit_exit();
 215         return(0);
 216 }
 217
 218 /*
 219  * Resource checks.  XXX break out since ksignal/killproc can block,
 220  * limiting us to one process killed per second.  There is probably
 221  * a better way.
 222  */
 223 static int
 224 schedcpu_resource(struct proc *p, void *data __unused)
 225 {
 226         u_int64_t ttime;
 227         struct lwp *lp;
 228
 229         crit_enter();
 230         if (p->p_stat == SIDL ||
 231             p->p_stat == SZOMB ||
 232             p->p_limit == NULL
 233         ) {
 234                 crit_exit();
 235                 return(0);
 236         }
 237
 238         ttime = 0;
 239         FOREACH_LWP_IN_PROC(lp, p) {
 240                 ttime += lp->lwp_thread->td_sticks;
 241                 ttime += lp->lwp_thread->td_uticks;
 242         }
 243
 244         switch(plimit_testcpulimit(p->p_limit, ttime)) {
 245         case PLIMIT_TESTCPU_KILL:
 246                 killproc(p, "exceeded maximum CPU limit");
 247                 break;
 248         case PLIMIT_TESTCPU_XCPU:
 249                 if ((p->p_flag & P_XCPU) == 0) {
 250                         p->p_flag |= P_XCPU;
 251                         ksignal(p, SIGXCPU);
 252                 }
 253                 break;
 254         default:
 255                 break;
 256         }
 257         crit_exit();
 258         return(0);
 259 }
 260
 261 /*
 262  * This is only used by ps.  Generate a cpu percentage use over
 263  * a period of one second.
 264  *
 265  * MPSAFE
 266  */
 267 void
 268 updatepcpu(struct lwp *lp, int cpticks, int ttlticks)
 269 {
 270         fixpt_t acc;
 271         int remticks;
 272
 273         acc = (cpticks << FSHIFT) / ttlticks;
 274         if (ttlticks >= ESTCPUFREQ) {
 275                 lp->lwp_pctcpu = acc;
 276         } else {
 277                 remticks = ESTCPUFREQ - ttlticks;
 278                 lp->lwp_pctcpu = (acc * ttlticks + lp->lwp_pctcpu * remticks) /
 279                                 ESTCPUFREQ;
 280         }
 281 }
 282
 283 /*
 284  * We're only looking at 7 bits of the address; everything is
 285  * aligned to 4, lots of things are aligned to greater powers
 286  * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
 287  */
 288 #define TABLESIZE       128
 289 #define LOOKUP(x)       (((intptr_t)(x) >> 8) & (TABLESIZE - 1))
 290
 291 static cpumask_t slpque_cpumasks[TABLESIZE];
 292
 293 /*
 294  * General scheduler initialization.  We force a reschedule 25 times
 295  * a second by default.  Note that cpu0 is initialized in early boot and
 296  * cannot make any high level calls.
 297  *
 298  * Each cpu has its own sleep queue.
 299  */
 300 void
 301 sleep_gdinit(globaldata_t gd)
 302 {
 303         static struct tslpque slpque_cpu0[TABLESIZE];
 304         int i;
 305
 306         if (gd->gd_cpuid == 0) {
 307                 sched_quantum = (hz + 24) / 25;
 308                 hogticks = 2 * sched_quantum;
 309
 310                 gd->gd_tsleep_hash = slpque_cpu0;
 311         } else {
 312                 gd->gd_tsleep_hash = kmalloc(sizeof(slpque_cpu0),
 313                                             M_TSLEEP, M_WAITOK | M_ZERO);
 314         }
 315         for (i = 0; i < TABLESIZE; ++i)
 316                 TAILQ_INIT(&gd->gd_tsleep_hash[i]);
 317 }
 318
 319 /*
 320  * General sleep call.  Suspends the current process until a wakeup is
 321  * performed on the specified identifier.  The process will then be made
 322  * runnable with the specified priority.  Sleeps at most timo/hz seconds
 323  * (0 means no timeout).  If flags includes PCATCH flag, signals are checked
 324  * before and after sleeping, else signals are not checked.  Returns 0 if
 325  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 326  * signal needs to be delivered, ERESTART is returned if the current system
 327  * call should be restarted if possible, and EINTR is returned if the system
 328  * call should be interrupted by the signal (return EINTR).
 329  *
 330  * Note that if we are a process, we release_curproc() before messing with
 331  * the LWKT scheduler.
 332  *
 333  * During autoconfiguration or after a panic, a sleep will simply
 334  * lower the priority briefly to allow interrupts, then return.
 335  */
 336 int
 337 tsleep(void *ident, int flags, const char *wmesg, int timo)
 338 {
 339         struct thread *td = curthread;
 340         struct lwp *lp = td->td_lwp;
 341         struct proc *p = td->td_proc;           /* may be NULL */
 342         globaldata_t gd;
 343         int sig;
 344         int catch;
 345         int id;
 346         int error;
 347         int oldpri;
 348         struct callout thandle;
 349
 350         /*
 351          * NOTE: removed KTRPOINT, it could cause races due to blocking
 352          * even in stable.  Just scrap it for now.
 353          */
 354         if (tsleep_now_works == 0 || panicstr) {
 355                 /*
 356                  * After a panic, or before we actually have an operational
 357                  * softclock, just give interrupts a chance, then just return;
 358                  *
 359                  * don't run any other procs or panic below,
 360                  * in case this is the idle process and already asleep.
 361                  */
 362                 splz();
 363                 oldpri = td->td_pri & TDPRI_MASK;
 364                 lwkt_setpri_self(safepri);
 365                 lwkt_switch();
 366                 lwkt_setpri_self(oldpri);
 367                 return (0);
 368         }
 369         logtsleep(tsleep_beg);
 370         gd = td->td_gd;
 371         KKASSERT(td != &gd->gd_idlethread);     /* you must be kidding! */
 372
 373         /*
 374          * NOTE: all of this occurs on the current cpu, including any
 375          * callout-based wakeups, so a critical section is a sufficient
 376          * interlock.
 377          *
 378          * The entire sequence through to where we actually sleep must
 379          * run without breaking the critical section.
 380          */
 381         id = LOOKUP(ident);
 382         catch = flags & PCATCH;
 383         error = 0;
 384         sig = 0;
 385
 386         crit_enter_quick(td);
 387
 388         KASSERT(ident != NULL, ("tsleep: no ident"));
 389         KASSERT(lp == NULL ||
 390                 lp->lwp_stat == LSRUN ||        /* Obvious */
 391                 lp->lwp_stat == LSSTOP,         /* Set in tstop */
 392                 ("tsleep %p %s %d",
 393                         ident, wmesg, lp->lwp_stat));
 394
 395         /*
 396          * Setup for the current process (if this is a process).
 397          */
 398         if (lp) {
 399                 if (catch) {
 400                         /*
 401                          * Early termination if PCATCH was set and a
 402                          * signal is pending, interlocked with the
 403                          * critical section.
 404                          *
 405                          * Early termination only occurs when tsleep() is
 406                          * entered while in a normal LSRUN state.
 407                          */
 408                         if ((sig = CURSIG(lp)) != 0)
 409                                 goto resume;
 410
 411                         /*
 412                          * Early termination if PCATCH was set and a
 413                          * mailbox signal was possibly delivered prior to
 414                          * the system call even being made, in order to
 415                          * allow the user to interlock without having to
 416                          * make additional system calls.
 417                          */
 418                         if (p->p_flag & P_MAILBOX)
 419                                 goto resume;
 420
 421                         /*
 422                          * Causes ksignal to wake us up when.
 423                          */
 424                         lp->lwp_flag |= LWP_SINTR;
 425                 }
 426
 427                 /*
 428                  * Make sure the current process has been untangled from
 429                  * the userland scheduler and initialize slptime to start
 430                  * counting.
 431                  */
 432                 if (flags & PNORESCHED)
 433                         td->td_flags |= TDF_NORESCHED;
 434                 p->p_usched->release_curproc(lp);
 435                 lp->lwp_slptime = 0;
 436         }
 437
 438         /*
 439          * Move our thread to the correct queue and setup our wchan, etc.
 440          */
 441         lwkt_deschedule_self(td);
 442         td->td_flags |= TDF_TSLEEPQ;
 443         TAILQ_INSERT_TAIL(&gd->gd_tsleep_hash[id], td, td_threadq);
 444         atomic_set_int(&slpque_cpumasks[id], gd->gd_cpumask);
 445
 446         td->td_wchan = ident;
 447         td->td_wmesg = wmesg;
 448         td->td_wdomain = flags & PDOMAIN_MASK;
 449
 450         /*
 451          * Setup the timeout, if any
 452          */
 453         if (timo) {
 454                 callout_init(&thandle);
 455                 callout_reset(&thandle, timo, endtsleep, td);
 456         }
 457
 458         /*
 459          * Beddy bye bye.
 460          */
 461         if (lp) {
 462                 /*
 463                  * Ok, we are sleeping.  Place us in the SSLEEP state.
 464                  */
 465                 KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0);
 466                 /*
 467                  * tstop() sets LSSTOP, so don't fiddle with that.
 468                  */
 469                 if (lp->lwp_stat != LSSTOP)
 470                         lp->lwp_stat = LSSLEEP;
 471                 lp->lwp_ru.ru_nvcsw++;
 472                 lwkt_switch();
 473
 474                 /*
 475                  * And when we are woken up, put us back in LSRUN.  If we
 476                  * slept for over a second, recalculate our estcpu.
 477                  */
 478                 lp->lwp_stat = LSRUN;
 479                 if (lp->lwp_slptime)
 480                         p->p_usched->recalculate(lp);
 481                 lp->lwp_slptime = 0;
 482         } else {
 483                 lwkt_switch();
 484         }
 485
 486         /*
 487          * Make sure we haven't switched cpus while we were asleep.  It's
 488          * not supposed to happen.  Cleanup our temporary flags.
 489          */
 490         KKASSERT(gd == td->td_gd);
 491         td->td_flags &= ~TDF_NORESCHED;
 492
 493         /*
 494          * Cleanup the timeout.
 495          */
 496         if (timo) {
 497                 if (td->td_flags & TDF_TIMEOUT) {
 498                         td->td_flags &= ~TDF_TIMEOUT;
 499                         error = EWOULDBLOCK;
 500                 } else {
 501                         callout_stop(&thandle);
 502                 }
 503         }
 504
 505         /*
 506          * Since td_threadq is used both for our run queue AND for the
 507          * tsleep hash queue, we can't still be on it at this point because
 508          * we've gotten cpu back.
 509          */
 510         KASSERT((td->td_flags & TDF_TSLEEPQ) == 0, ("tsleep: impossible thread flags %08x", td->td_flags));
 511         td->td_wchan = NULL;
 512         td->td_wmesg = NULL;
 513         td->td_wdomain = 0;
 514
 515         /*
 516          * Figure out the correct error return.  If interrupted by a
 517          * signal we want to return EINTR or ERESTART.
 518          *
 519          * If P_MAILBOX is set no automatic system call restart occurs
 520          * and we return EINTR.  P_MAILBOX is meant to be used as an
 521          * interlock, the user must poll it prior to any system call
 522          * that it wishes to interlock a mailbox signal against since
 523          * the flag is cleared on *any* system call that sleeps.
 524          */
 525 resume:
 526         if (p) {
 527                 if (catch && error == 0) {
 528                         if ((p->p_flag & P_MAILBOX) && sig == 0) {
 529                                 error = EINTR;
 530                         } else if (sig != 0 || (sig = CURSIG(lp))) {
 531                                 if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
 532                                         error = EINTR;
 533                                 else
 534                                         error = ERESTART;
 535                         }
 536                 }
 537                 lp->lwp_flag &= ~(LWP_BREAKTSLEEP | LWP_SINTR);
 538                 p->p_flag &= ~P_MAILBOX;
 539         }
 540         logtsleep(tsleep_end);
 541         crit_exit_quick(td);
 542         return (error);
 543 }
 544
 545 /*
 546  * This is a dandy function that allows us to interlock tsleep/wakeup
 547  * operations with unspecified upper level locks, such as lockmgr locks,
 548  * simply by holding a critical section.  The sequence is:
 549  *
 550  *      (enter critical section)
 551  *      (acquire upper level lock)
 552  *      tsleep_interlock(blah)
 553  *      (release upper level lock)
 554  *      tsleep(blah, ...)
 555  *      (exit critical section)
 556  *
 557  * Basically this function sets our cpumask for the ident which informs
 558  * other cpus that our cpu 'might' be waiting (or about to wait on) the
 559  * hash index related to the ident.  The critical section prevents another
 560  * cpu's wakeup() from being processed on our cpu until we are actually
 561  * able to enter the tsleep().  Thus, no race occurs between our attempt
 562  * to release a resource and sleep, and another cpu's attempt to acquire
 563  * a resource and call wakeup.
 564  *
 565  * There isn't much of a point to this function unless you call it while
 566  * holding a critical section.
 567  */
 568 static __inline void
 569 _tsleep_interlock(globaldata_t gd, void *ident)
 570 {
 571         int id = LOOKUP(ident);
 572
 573         atomic_set_int(&slpque_cpumasks[id], gd->gd_cpumask);
 574 }
 575
 576 void
 577 tsleep_interlock(void *ident)
 578 {
 579         _tsleep_interlock(mycpu, ident);
 580 }
 581
 582 /*
 583  * Interlocked spinlock sleep.  An exclusively held spinlock must
 584  * be passed to msleep().  The function will atomically release the
 585  * spinlock and tsleep on the ident, then reacquire the spinlock and
 586  * return.
 587  *
 588  * This routine is fairly important along the critical path, so optimize it
 589  * heavily.
 590  */
 591 int
 592 msleep(void *ident, struct spinlock *spin, int flags,
 593        const char *wmesg, int timo)
 594 {
 595         globaldata_t gd = mycpu;
 596         int error;
 597
 598         crit_enter_gd(gd);
 599         _tsleep_interlock(gd, ident);
 600         spin_unlock_wr_quick(gd, spin);
 601         error = tsleep(ident, flags, wmesg, timo);
 602         spin_lock_wr_quick(gd, spin);
 603         crit_exit_gd(gd);
 604
 605         return (error);
 606 }
 607
 608 /*
 609  * Directly block on the LWKT thread by descheduling it.  This
 610  * is much faster then tsleep(), but the only legal way to wake
 611  * us up is to directly schedule the thread.
 612  *
 613  * Setting TDF_SINTR will cause new signals to directly schedule us.
 614  *
 615  * This routine is typically called while in a critical section.
 616  */
 617 int
 618 lwkt_sleep(const char *wmesg, int flags)
 619 {
 620         thread_t td = curthread;
 621         int sig;
 622
 623         if ((flags & PCATCH) == 0 || td->td_lwp == NULL) {
 624                 td->td_flags |= TDF_BLOCKED;
 625                 td->td_wmesg = wmesg;
 626                 lwkt_deschedule_self(td);
 627                 lwkt_switch();
 628                 td->td_wmesg = NULL;
 629                 td->td_flags &= ~TDF_BLOCKED;
 630                 return(0);
 631         }
 632         if ((sig = CURSIG(td->td_lwp)) != 0) {
 633                 if (SIGISMEMBER(td->td_proc->p_sigacts->ps_sigintr, sig))
 634                         return(EINTR);
 635                 else
 636                         return(ERESTART);
 637
 638         }
 639         td->td_flags |= TDF_BLOCKED | TDF_SINTR;
 640         td->td_wmesg = wmesg;
 641         lwkt_deschedule_self(td);
 642         lwkt_switch();
 643         td->td_flags &= ~(TDF_BLOCKED | TDF_SINTR);
 644         td->td_wmesg = NULL;
 645         return(0);
 646 }
 647
 648 /*
 649  * Implement the timeout for tsleep.
 650  *
 651  * We set LWP_BREAKTSLEEP to indicate that an event has occured, but
 652  * we only call setrunnable if the process is not stopped.
 653  *
 654  * This type of callout timeout is scheduled on the same cpu the process
 655  * is sleeping on.  Also, at the moment, the MP lock is held.
 656  */
 657 static void
 658 endtsleep(void *arg)
 659 {
 660         thread_t td = arg;
 661         struct lwp *lp;
 662
 663         ASSERT_MP_LOCK_HELD(curthread);
 664         crit_enter();
 665
 666         /*
 667          * cpu interlock.  Thread flags are only manipulated on
 668          * the cpu owning the thread.  proc flags are only manipulated
 669          * by the older of the MP lock.  We have both.
 670          */
 671         if (td->td_flags & TDF_TSLEEPQ) {
 672                 td->td_flags |= TDF_TIMEOUT;
 673
 674                 if ((lp = td->td_lwp) != NULL) {
 675                         lp->lwp_flag |= LWP_BREAKTSLEEP;
 676                         if (lp->lwp_proc->p_stat != SSTOP)
 677                                 setrunnable(lp);
 678                 } else {
 679                         unsleep_and_wakeup_thread(td);
 680                 }
 681         }
 682         crit_exit();
 683 }
 684
 685 /*
 686  * Unsleep and wakeup a thread.  This function runs without the MP lock
 687  * which means that it can only manipulate thread state on the owning cpu,
 688  * and cannot touch the process state at all.
 689  */
 690 static
 691 void
 692 unsleep_and_wakeup_thread(struct thread *td)
 693 {
 694         globaldata_t gd = mycpu;
 695         int id;
 696
 697 #ifdef SMP
 698         if (td->td_gd != gd) {
 699                 lwkt_send_ipiq(td->td_gd, (ipifunc1_t)unsleep_and_wakeup_thread, td);
 700                 return;
 701         }
 702 #endif
 703         crit_enter();
 704         if (td->td_flags & TDF_TSLEEPQ) {
 705                 td->td_flags &= ~TDF_TSLEEPQ;
 706                 id = LOOKUP(td->td_wchan);
 707                 TAILQ_REMOVE(&gd->gd_tsleep_hash[id], td, td_threadq);
 708                 if (TAILQ_FIRST(&gd->gd_tsleep_hash[id]) == NULL)
 709                         atomic_clear_int(&slpque_cpumasks[id], gd->gd_cpumask);
 710                 lwkt_schedule(td);
 711         }
 712         crit_exit();
 713 }
 714
 715 /*
 716  * Make all processes sleeping on the specified identifier runnable.
 717  * count may be zero or one only.
 718  *
  719  * The domain encodes the sleep/wakeup domain AND the first cpu to check
  720  * (which is always the current cpu), so cpus already visited can be skipped.
 721  *
 722  * This call may run without the MP lock held.  We can only manipulate thread
 723  * state on the cpu owning the thread.  We CANNOT manipulate process state
 724  * at all.
 725  */
 726 static void
 727 _wakeup(void *ident, int domain)
 728 {
 729         struct tslpque *qp;
 730         struct thread *td;
 731         struct thread *ntd;
 732         globaldata_t gd;
 733 #ifdef SMP
 734         cpumask_t mask;
 735         cpumask_t tmask;
 736         int startcpu;
 737         int nextcpu;
 738 #endif
 739         int id;
 740
 741         crit_enter();
 742         logtsleep(wakeup_beg);
 743         gd = mycpu;
 744         id = LOOKUP(ident);
 745         qp = &gd->gd_tsleep_hash[id];
 746 restart:
 747         for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) {
 748                 ntd = TAILQ_NEXT(td, td_threadq);
 749                 if (td->td_wchan == ident &&
 750                     td->td_wdomain == (domain & PDOMAIN_MASK)
 751                 ) {
 752                         KKASSERT(td->td_flags & TDF_TSLEEPQ);
 753                         td->td_flags &= ~TDF_TSLEEPQ;
 754                         TAILQ_REMOVE(qp, td, td_threadq);
 755                         if (TAILQ_FIRST(qp) == NULL) {
 756                                 atomic_clear_int(&slpque_cpumasks[id],
 757                                                  gd->gd_cpumask);
 758                         }
 759                         lwkt_schedule(td);
 760                         if (domain & PWAKEUP_ONE)
 761                                 goto done;
 762                         goto restart;
 763                 }
 764         }
 765
 766 #ifdef SMP
 767         /*
 768          * We finished checking the current cpu but there still may be
 769          * more work to do.  Either wakeup_one was requested and no matching
 770          * thread was found, or a normal wakeup was requested and we have
 771          * to continue checking cpus.
 772          *
 773          * The cpu that started the wakeup sequence is encoded in the domain.
 774          * We use this information to determine which cpus still need to be
 775          * checked, locate a candidate cpu, and chain the wakeup
 776          * asynchronously with an IPI message.
 777          *
 778          * It should be noted that this scheme is actually less expensive then
 779          * the old scheme when waking up multiple threads, since we send
 780          * only one IPI message per target candidate which may then schedule
 781          * multiple threads.  Before we could have wound up sending an IPI
 782          * message for each thread on the target cpu (!= current cpu) that
 783          * needed to be woken up.
 784          *
 785          * NOTE: Wakeups occuring on remote cpus are asynchronous.  This
 786          * should be ok since we are passing idents in the IPI rather then
 787          * thread pointers.
 788          */
 789         if ((domain & PWAKEUP_MYCPU) == 0 &&
 790             (mask = slpque_cpumasks[id]) != 0
 791         ) {
 792                 /*
 793                  * Look for a cpu that might have work to do.  Mask out cpus
 794                  * which have already been processed.
 795                  *
 796                  * 31xxxxxxxxxxxxxxxxxxxxxxxxxxxxx0
 797                  *        ^        ^           ^
 798                  *      start   currentcpu    start
 799                  *      case2                 case1
 800                  *        *        *           *
 801                  * 11111111111111110000000000000111     case1
 802                  * 00000000111111110000000000000000     case2
 803                  *
 804                  * case1:  We started at start_case1 and processed through
 805                  *         to the current cpu.  We have to check any bits
 806                  *         after the current cpu, then check bits before
 807                  *         the starting cpu.
 808                  *
 809                  * case2:  We have already checked all the bits from
 810                  *         start_case2 to the end, and from 0 to the current
 811                  *         cpu.  We just have the bits from the current cpu
 812                  *         to start_case2 left to check.
 813                  */
 814                 startcpu = PWAKEUP_DECODE(domain);
 815                 if (gd->gd_cpuid >= startcpu) {
 816                         /*
 817                          * CASE1
 818                          */
 819                         tmask = mask & ~((gd->gd_cpumask << 1) - 1);
 820                         if (mask & tmask) {
 821                                 nextcpu = bsfl(mask & tmask);
 822                                 lwkt_send_ipiq2(globaldata_find(nextcpu),
 823                                                 _wakeup, ident, domain);
 824                         } else {
 825                                 tmask = (1 << startcpu) - 1;
 826                                 if (mask & tmask) {
 827                                         nextcpu = bsfl(mask & tmask);
 828                                         lwkt_send_ipiq2(
 829                                                     globaldata_find(nextcpu),
 830                                                     _wakeup, ident, domain);
 831                                 }
 832                         }
 833                 } else {
 834                         /*
 835                          * CASE2
 836                          */
 837                         tmask = ~((gd->gd_cpumask << 1) - 1) &
 838                                  ((1 << startcpu) - 1);
 839                         if (mask & tmask) {
 840                                 nextcpu = bsfl(mask & tmask);
 841                                 lwkt_send_ipiq2(globaldata_find(nextcpu),
 842                                                 _wakeup, ident, domain);
 843                         }
 844                 }
 845         }
 846 #endif
 847 done:
 848         logtsleep(wakeup_end);
 849         crit_exit();
 850 }
 851
 852 /*
 853  * Wakeup all threads tsleep()ing on the specified ident, on all cpus
 854  */
 855 void
 856 wakeup(void *ident)
 857 {
 858     _wakeup(ident, PWAKEUP_ENCODE(0, mycpu->gd_cpuid));
 859 }
 860
 861 /*
 862  * Wakeup one thread tsleep()ing on the specified ident, on any cpu.
 863  */
 864 void
 865 wakeup_one(void *ident)
 866 {
 867     /* XXX potentially round-robin the first responding cpu */
 868     _wakeup(ident, PWAKEUP_ENCODE(0, mycpu->gd_cpuid) | PWAKEUP_ONE);
 869 }
 870
 871 /*
 872  * Wakeup threads tsleep()ing on the specified ident on the current cpu
 873  * only.
 874  */
 875 void
 876 wakeup_mycpu(void *ident)
 877 {
 878     _wakeup(ident, PWAKEUP_MYCPU);
 879 }
 880
 881 /*
 882  * Wakeup one thread tsleep()ing on the specified ident on the current cpu
 883  * only.
 884  */
 885 void
 886 wakeup_mycpu_one(void *ident)
 887 {
 888     /* XXX potentially round-robin the first responding cpu */
 889     _wakeup(ident, PWAKEUP_MYCPU|PWAKEUP_ONE);
 890 }
 891
 892 /*
 893  * Wakeup all thread tsleep()ing on the specified ident on the specified cpu
 894  * only.
 895  */
 896 void
 897 wakeup_oncpu(globaldata_t gd, void *ident)
 898 {
 899 #ifdef SMP
 900     if (gd == mycpu) {
 901         _wakeup(ident, PWAKEUP_MYCPU);
 902     } else {
 903         lwkt_send_ipiq2(gd, _wakeup, ident, PWAKEUP_MYCPU);
 904     }
 905 #else
 906     _wakeup(ident, PWAKEUP_MYCPU);
 907 #endif
 908 }
 909
 910 /*
 911  * Wakeup one thread tsleep()ing on the specified ident on the specified cpu
 912  * only.
 913  */
 914 void
 915 wakeup_oncpu_one(globaldata_t gd, void *ident)
 916 {
 917 #ifdef SMP
 918     if (gd == mycpu) {
 919         _wakeup(ident, PWAKEUP_MYCPU | PWAKEUP_ONE);
 920     } else {
 921         lwkt_send_ipiq2(gd, _wakeup, ident, PWAKEUP_MYCPU | PWAKEUP_ONE);
 922     }
 923 #else
 924     _wakeup(ident, PWAKEUP_MYCPU | PWAKEUP_ONE);
 925 #endif
 926 }
 927
 928 /*
 929  * Wakeup all threads waiting on the specified ident that slept using
 930  * the specified domain, on all cpus.
 931  */
 932 void
 933 wakeup_domain(void *ident, int domain)
 934 {
 935     _wakeup(ident, PWAKEUP_ENCODE(domain, mycpu->gd_cpuid));
 936 }
 937
 938 /*
 939  * Wakeup one thread waiting on the specified ident that slept using
 940  * the specified  domain, on any cpu.
 941  */
 942 void
 943 wakeup_domain_one(void *ident, int domain)
 944 {
 945     /* XXX potentially round-robin the first responding cpu */
 946     _wakeup(ident, PWAKEUP_ENCODE(domain, mycpu->gd_cpuid) | PWAKEUP_ONE);
 947 }
 948
 949 /*
 950  * setrunnable()
 951  *
 952  * Make a process runnable.  The MP lock must be held on call.  This only
 953  * has an effect if we are in SSLEEP.  We only break out of the
 954  * tsleep if LWP_BREAKTSLEEP is set, otherwise we just fix-up the state.
 955  *
 956  * NOTE: With the MP lock held we can only safely manipulate the process
 957  * structure.  We cannot safely manipulate the thread structure.
 958  */
 959 void
 960 setrunnable(struct lwp *lp)
 961 {
 962         crit_enter();
 963         ASSERT_MP_LOCK_HELD(curthread);
 964         if (lp->lwp_stat == LSSTOP)
 965                 lp->lwp_stat = LSSLEEP;
 966         if (lp->lwp_stat == LSSLEEP && (lp->lwp_flag & LWP_BREAKTSLEEP))
 967                 unsleep_and_wakeup_thread(lp->lwp_thread);
 968         crit_exit();
 969 }
 970
 971 /*
 972  * The process is stopped due to some condition, usually because p_stat is
 973  * set to SSTOP, but also possibly due to being traced.
 974  *
 975  * NOTE!  If the caller sets SSTOP, the caller must also clear P_WAITED
 976  * because the parent may check the child's status before the child actually
 977  * gets to this routine.
 978  *
 979  * This routine is called with the current lwp only, typically just
 980  * before returning to userland.
 981  *
 982  * Setting LWP_BREAKTSLEEP before entering the tsleep will cause a passive
 983  * SIGCONT to break out of the tsleep.
 984  */
 985 void
 986 tstop(void)
 987 {
 988         struct lwp *lp = curthread->td_lwp;
 989         struct proc *p = lp->lwp_proc;
 990
 991         lp->lwp_flag |= LWP_BREAKTSLEEP;
 992         lp->lwp_stat = LSSTOP;
 993         crit_enter();
 994         /*
 995          * If LWP_WSTOP is set, we were sleeping
 996          * while our process was stopped.  At this point
 997          * we were already counted as stopped.
 998          */
 999         if ((lp->lwp_flag & LWP_WSTOP) == 0) {
1000                 /*
1001                  * If we're the last thread to stop, signal
1002                  * our parent.
1003                  */
1004                 p->p_nstopped++;
1005                 lp->lwp_flag |= LWP_WSTOP;
1006                 if (p->p_nstopped == p->p_nthreads) {
1007                         p->p_flag &= ~P_WAITED;
1008                         wakeup(p->p_pptr);
1009                         if ((p->p_pptr->p_sigacts->ps_flag & PS_NOCLDSTOP) == 0)
1010                                 ksignal(p->p_pptr, SIGCHLD);
1011                 }
1012         }
1013         tsleep(lp->lwp_proc, 0, "stop", 0);
1014         p->p_nstopped--;
1015         crit_exit();
1016 }
1017
1018 /*
1019  * Yield / synchronous reschedule.  This is a bit tricky because the trap
1020  * code might have set a lazy release on the switch function.   Setting
1021  * P_PASSIVE_ACQ will ensure that the lazy release executes when we call
1022  * switch, and that we are given a greater chance of affinity with our
1023  * current cpu.
1024  *
1025  * We call lwkt_setpri_self() to rotate our thread to the end of the lwkt
1026  * run queue.  lwkt_switch() will also execute any assigned passive release
1027  * (which usually calls release_curproc()), allowing a same/higher priority
1028  * process to be designated as the current process.
1029  *
1030  * While it is possible for a lower priority process to be designated,
1031  * it's call to lwkt_maybe_switch() in acquire_curproc() will likely
1032  * round-robin back to us and we will be able to re-acquire the current
1033  * process designation.
1034  */
1035 void
1036 uio_yield(void)
1037 {
1038         struct thread *td = curthread;
1039         struct proc *p = td->td_proc;
1040
1041         lwkt_setpri_self(td->td_pri & TDPRI_MASK);
1042         if (p) {
1043                 p->p_flag |= P_PASSIVE_ACQ;
1044                 lwkt_switch();
1045                 p->p_flag &= ~P_PASSIVE_ACQ;
1046         } else {
1047                 lwkt_switch();
1048         }
1049 }
1050
1051 /*
1052  * Compute a tenex style load average of a quantity on
1053  * 1, 5 and 15 minute intervals.
1054  */
1055 static int loadav_count_runnable(struct lwp *p, void *data);
1056
1057 static void
1058 loadav(void *arg)
1059 {
1060         struct loadavg *avg;
1061         int i, nrun;
1062
1063         nrun = 0;
1064         alllwp_scan(loadav_count_runnable, &nrun);
1065         avg = &averunnable;
1066         for (i = 0; i < 3; i++) {
1067                 avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
1068                     nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
1069         }
1070
1071         /*
1072          * Schedule the next update to occur after 5 seconds, but add a
1073          * random variation to avoid synchronisation with processes that
1074          * run at regular intervals.
1075          */
1076         callout_reset(&loadav_callout, hz * 4 + (int)(krandom() % (hz * 2 + 1)),
1077                       loadav, NULL);
1078 }
1079
1080 static int
1081 loadav_count_runnable(struct lwp *lp, void *data)
1082 {
1083         int *nrunp = data;
1084         thread_t td;
1085
1086         switch (lp->lwp_stat) {
1087         case LSRUN:
1088                 if ((td = lp->lwp_thread) == NULL)
1089                         break;
1090                 if (td->td_flags & TDF_BLOCKED)
1091                         break;
1092                 ++*nrunp;
1093                 break;
1094         default:
1095                 break;
1096         }
1097         return(0);
1098 }
1099
1100 /* ARGSUSED */
1101 static void
1102 sched_setup(void *dummy)
1103 {
1104         callout_init(&loadav_callout);
1105         callout_init(&schedcpu_callout);
1106
1107         /* Kick off timeout driven events by calling first time. */
1108         schedcpu(NULL);
1109         loadav(NULL);
1110 }
1111