sys/kern/kern_synch.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1990, 1991, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)kern_synch.c        8.9 (Berkeley) 5/19/95
  35  * $FreeBSD: src/sys/kern/kern_synch.c,v 1.87.2.6 2002/10/13 07:29:53 kbyanc Exp $
  36  */
  37
  38 #include "opt_ktrace.h"
  39
  40 #include <sys/param.h>
  41 #include <sys/systm.h>
  42 #include <sys/proc.h>
  43 #include <sys/kernel.h>
  44 #include <sys/signalvar.h>
  45 #include <sys/resourcevar.h>
  46 #include <sys/vmmeter.h>
  47 #include <sys/sysctl.h>
  48 #include <sys/lock.h>
  49 #include <sys/uio.h>
  50 #include <sys/kcollect.h>
  51 #ifdef KTRACE
  52 #include <sys/ktrace.h>
  53 #endif
  54 #include <sys/ktr.h>
  55 #include <sys/serialize.h>
  56
  57 #include <sys/signal2.h>
  58 #include <sys/thread2.h>
  59 #include <sys/spinlock2.h>
  60 #include <sys/mutex2.h>
  61
  62 #include <machine/cpu.h>
  63 #include <machine/smp.h>
  64
   65 TAILQ_HEAD(tslpque, thread);    /* struct tslpque: tail queue of struct thread */
   66
   67 static void sched_setup (void *dummy);
   68 SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL);
   69 static void sched_dyninit (void *dummy);
   70 SYSINIT(sched_dyninit, SI_BOOT1_DYNALLOC, SI_ORDER_FIRST, sched_dyninit, NULL);
   71
   72 int     lbolt;                  /* &lbolt is a wakeup ident, woken by schedcpu() on cpu 0 */
   73 void    *lbolt_syncer;          /* wakeup ident, woken right after &lbolt by schedcpu() */
   74 int     ncpus;
   75 int     ncpus2, ncpus2_shift, ncpus2_mask;      /* note: mask not cpumask_t */
   76 int     ncpus_fit, ncpus_fit_mask;              /* note: mask not cpumask_t */
   77 int     safepri;                /* priority tsleep() drops to on its early-return path */
   78 int     tsleep_now_works;       /* while 0, tsleep() just switches briefly and returns 0 */
   79 int     tsleep_crypto_dump = 0; /* non-zero disables tsleep()'s early-return path */
   80
   81 MALLOC_DEFINE(M_TSLEEP, "tslpque", "tsleep queues");
  82
  83 #define __DEALL(ident)  __DEQUALIFY(void *, ident)
  84
   85 #if !defined(KTR_TSLEEP)
   86 #define KTR_TSLEEP      KTR_ALL /* fallback mask when KTR_TSLEEP is not already defined */
   87 #endif
   88 KTR_INFO_MASTER(tsleep);
   89 KTR_INFO(KTR_TSLEEP, tsleep, tsleep_beg, 0, "tsleep enter %p", const volatile void *ident);
   90 KTR_INFO(KTR_TSLEEP, tsleep, tsleep_end, 1, "tsleep exit");
   91 KTR_INFO(KTR_TSLEEP, tsleep, wakeup_beg, 2, "wakeup enter %p", const volatile void *ident);
   92 KTR_INFO(KTR_TSLEEP, tsleep, wakeup_end, 3, "wakeup exit");
   93 KTR_INFO(KTR_TSLEEP, tsleep, ilockfail,  4, "interlock failed %p", const volatile void *ident);
   94
      /*
       * Trace helpers.  Both paste the "tsleep_" prefix onto the event name
       * and pass the result to KTR_LOG(); logtsleep2() additionally forwards
       * one argument (the events above that take an ident).
       */
   95 #define logtsleep1(name)        KTR_LOG(tsleep_ ## name)
   96 #define logtsleep2(name, val)   KTR_LOG(tsleep_ ## name, val)
  97
  98 struct loadavg averunnable =
  99         { {0, 0, 0}, FSCALE };  /* load average, of runnable procs */
 100 /*
 101  * Constants for averages over 1, 5, and 15 minutes
 102  * when sampling at 5 second intervals.
 103  */
 104 static fixpt_t cexp[3] = {
 105         0.9200444146293232 * FSCALE,    /* exp(-1/12) */
 106         0.9834714538216174 * FSCALE,    /* exp(-1/60) */
 107         0.9944598480048967 * FSCALE,    /* exp(-1/180) */
 108 };
 109
  110 static void     endtsleep (void *);
  111 static void     loadav (void *arg);
  112 static void     schedcpu (void *arg);
  113
      /*
       * Decay divisor applied once a second by schedcpu_stats() to the
       * lwp_pctcpu of lwps whose lwp_slptime exceeds 1:
       *
       *      pctcpu = pctcpu * (decay - 1) / decay
       *
       * The value is clamped to [1, 100] at the point of use, so a setting
       * of 1 (or less) zeroes pctcpu on the first pass.
       */
  114 static int pctcpu_decay = 10;
  115 SYSCTL_INT(_kern, OID_AUTO, pctcpu_decay, CTLFLAG_RW, &pctcpu_decay, 0, "");
  116
  117 /*
  118  * kernel uses `FSCALE', userland (SHOULD) use kern.fscale
  119  */
  120 int     fscale __unused = FSCALE;       /* exported to systat */
  121 SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
 122
 123 /*
 124  * Recompute process priorities, once a second.
 125  *
 126  * Since the userland schedulers are typically event oriented, if the
 127  * estcpu calculation at wakeup() time is not sufficient to make a
 128  * process runnable relative to other processes in the system we have
 129  * a 1-second recalc to help out.
 130  *
 131  * This code also allows us to store sysclock_t data in the process structure
  132  * without fear of an overrun, since sysclock_t are guaranteed to hold
 133  * several seconds worth of count.
 134  *
 135  * WARNING!  callouts can preempt normal threads.  However, they will not
 136  * preempt a thread holding a spinlock so we *can* safely use spinlocks.
 137  */
 138 static int schedcpu_stats(struct proc *p, void *data __unused);
 139 static int schedcpu_resource(struct proc *p, void *data __unused);
 140
 141 static void
 142 schedcpu(void *arg)
 143 {
 144         allproc_scan(schedcpu_stats, NULL, 1);
 145         allproc_scan(schedcpu_resource, NULL, 1);
 146         if (mycpu->gd_cpuid == 0) {
 147                 wakeup((caddr_t)&lbolt);
 148                 wakeup(lbolt_syncer);
 149         }
 150         callout_reset(&mycpu->gd_schedcpu_callout, hz, schedcpu, NULL);
 151 }
 152
 153 /*
 154  * General process statistics once a second
 155  */
 156 static int
 157 schedcpu_stats(struct proc *p, void *data __unused)
 158 {
 159         struct lwp *lp;
 160
 161         /*
 162          * Threads may not be completely set up if process in SIDL state.
 163          */
 164         if (p->p_stat == SIDL)
 165                 return(0);
 166
 167         PHOLD(p);
 168         if (lwkt_trytoken(&p->p_token) == FALSE) {
 169                 PRELE(p);
 170                 return(0);
 171         }
 172
 173         p->p_swtime++;
 174         FOREACH_LWP_IN_PROC(lp, p) {
 175                 if (lp->lwp_stat == LSSLEEP) {
 176                         ++lp->lwp_slptime;
 177                         if (lp->lwp_slptime == 1)
 178                                 p->p_usched->uload_update(lp);
 179                 }
 180
 181                 /*
 182                  * Only recalculate processes that are active or have slept
 183                  * less then 2 seconds.  The schedulers understand this.
 184                  * Otherwise decay by 50% per second.
 185                  */
 186                 if (lp->lwp_slptime <= 1) {
 187                         p->p_usched->recalculate(lp);
 188                 } else {
 189                         int decay;
 190
 191                         decay = pctcpu_decay;
 192                         cpu_ccfence();
 193                         if (decay <= 1)
 194                                 decay = 1;
 195                         if (decay > 100)
 196                                 decay = 100;
 197                         lp->lwp_pctcpu = (lp->lwp_pctcpu * (decay - 1)) / decay;
 198                 }
 199         }
 200         lwkt_reltoken(&p->p_token);
 201         lwkt_yield();
 202         PRELE(p);
 203         return(0);
 204 }
 205
 206 /*
 207  * Resource checks.  XXX break out since ksignal/killproc can block,
 208  * limiting us to one process killed per second.  There is probably
 209  * a better way.
 210  */
 211 static int
 212 schedcpu_resource(struct proc *p, void *data __unused)
 213 {
 214         u_int64_t ttime;
 215         struct lwp *lp;
 216
 217         if (p->p_stat == SIDL)
 218                 return(0);
 219
 220         PHOLD(p);
 221         if (lwkt_trytoken(&p->p_token) == FALSE) {
 222                 PRELE(p);
 223                 return(0);
 224         }
 225
 226         if (p->p_stat == SZOMB || p->p_limit == NULL) {
 227                 lwkt_reltoken(&p->p_token);
 228                 PRELE(p);
 229                 return(0);
 230         }
 231
 232         ttime = 0;
 233         FOREACH_LWP_IN_PROC(lp, p) {
 234                 /*
 235                  * We may have caught an lp in the middle of being
 236                  * created, lwp_thread can be NULL.
 237                  */
 238                 if (lp->lwp_thread) {
 239                         ttime += lp->lwp_thread->td_sticks;
 240                         ttime += lp->lwp_thread->td_uticks;
 241                 }
 242         }
 243
 244         switch(plimit_testcpulimit(p->p_limit, ttime)) {
 245         case PLIMIT_TESTCPU_KILL:
 246                 killproc(p, "exceeded maximum CPU limit");
 247                 break;
 248         case PLIMIT_TESTCPU_XCPU:
 249                 if ((p->p_flags & P_XCPU) == 0) {
 250                         p->p_flags |= P_XCPU;
 251                         ksignal(p, SIGXCPU);
 252                 }
 253                 break;
 254         default:
 255                 break;
 256         }
 257         lwkt_reltoken(&p->p_token);
 258         lwkt_yield();
 259         PRELE(p);
 260         return(0);
 261 }
 262
 263 /*
 264  * This is only used by ps.  Generate a cpu percentage use over
 265  * a period of one second.
 266  */
 267 void
 268 updatepcpu(struct lwp *lp, int cpticks, int ttlticks)
 269 {
 270         fixpt_t acc;
 271         int remticks;
 272
 273         acc = (cpticks << FSHIFT) / ttlticks;
 274         if (ttlticks >= ESTCPUFREQ) {
 275                 lp->lwp_pctcpu = acc;
 276         } else {
 277                 remticks = ESTCPUFREQ - ttlticks;
 278                 lp->lwp_pctcpu = (acc * ttlticks + lp->lwp_pctcpu * remticks) /
 279                                 ESTCPUFREQ;
 280         }
 281 }
 282
  283 /*
  284  * Handy macros to calculate hash indices.  LOOKUP() calculates the
  285  * global cpumask hash index, TCHASHSHIFT() converts that into the
  286  * pcpu hash index.
  287  *
  288  * By making the pcpu hash arrays smaller we save a significant amount
  289  * of memory at very low cost.  The real cost is in IPIs, which are handled
  290  * by the much larger global cpumask hash table.
       *
       * LOOKUP() truncates the ident pointer to u_int and reduces it modulo
       * slpque_tablesize; the result indexes slpque_cpumasks[].
       * TCHASHSHIFT() divides that index by 16, so 16 adjacent global slots
       * share one entry of a cpu's gd_tsleep_hash[] array.
  291  */
  292 #define LOOKUP(x)       (((u_int)(uintptr_t)(x)) % slpque_tablesize)
  293 #define TCHASHSHIFT(x)  ((x) >> 4)
  294
  295 static uint32_t slpque_tablesize;       /* modulus used by LOOKUP() */
  296 static cpumask_t *slpque_cpumasks;      /* per-slot mask of cpus with queued sleepers */
 297
  298 /*
  299  * This is a dandy function that allows us to interlock tsleep/wakeup
  300  * operations with unspecified upper level locks, such as lockmgr locks,
  301  * simply by holding a critical section.  The sequence is:
  302  *
  303  *      (acquire upper level lock)
  304  *      tsleep_interlock(blah)
  305  *      (release upper level lock)
  306  *      tsleep(blah, ...)
  307  *
  308  * Basically this function queues us on the tsleep queue without actually
  309  * descheduling us.  When tsleep() is later called with PINTERLOCKED it
  310  * assumes the thread was already queued, otherwise it queues it there.
  311  *
  312  * Thus it is possible to receive the wakeup prior to going to sleep and
  313  * the race conditions are covered.
       *
       * gd    - globaldata of the calling cpu; the current thread is taken
       *         from gd->gd_curthread.
       * ident - wakeup channel to queue on, recorded in td_wchan.
       * flags - only the PDOMAIN_MASK bits are used, recorded in td_wdomain.
  314  */
  315 static __inline void
  316 _tsleep_interlock(globaldata_t gd, const volatile void *ident, int flags)
  317 {
  318         thread_t td = gd->gd_curthread;
  319         uint32_t cid;
  320         uint32_t gid;
  321
  322         crit_enter_quick(td);
  323         if (td->td_flags & TDF_TSLEEPQ) {
                      /*
                       * Still queued from an earlier interlock: unhook from
                       * the old ident's queue and, if that queue is now
                       * empty, clear our cpu bit in the old global slot.
                       */
  324                 cid = LOOKUP(td->td_wchan);
  325                 gid = TCHASHSHIFT(cid);
  326                 TAILQ_REMOVE(&gd->gd_tsleep_hash[gid], td, td_sleepq);
  327                 if (TAILQ_FIRST(&gd->gd_tsleep_hash[gid]) == NULL) {
  328                         ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid],
  329                                                gd->gd_cpuid);
  330                 }
  331         } else {
  332                 td->td_flags |= TDF_TSLEEPQ;
  333         }
              /*
               * Queue on the new ident's per-cpu queue and advertise this
               * cpu in the matching global cpumask slot.
               */
  334         cid = LOOKUP(ident);
  335         gid = TCHASHSHIFT(cid);
  336         TAILQ_INSERT_TAIL(&gd->gd_tsleep_hash[gid], td, td_sleepq);
  337         ATOMIC_CPUMASK_ORBIT(slpque_cpumasks[cid], gd->gd_cpuid);
  338         td->td_wchan = ident;
  339         td->td_wdomain = flags & PDOMAIN_MASK;
  340         crit_exit_quick(td);
  341 }
 342
 343 void
 344 tsleep_interlock(const volatile void *ident, int flags)
 345 {
 346         _tsleep_interlock(mycpu, ident, flags);
 347 }
 348
 349 /*
 350  * Remove thread from sleepq.  Must be called with a critical section held.
 351  * The thread must not be migrating.
 352  */
 353 static __inline void
 354 _tsleep_remove(thread_t td)
 355 {
 356         globaldata_t gd = mycpu;
 357         uint32_t cid;
 358         uint32_t gid;
 359
 360         KKASSERT(td->td_gd == gd && IN_CRITICAL_SECT(td));
 361         KKASSERT((td->td_flags & TDF_MIGRATING) == 0);
 362         if (td->td_flags & TDF_TSLEEPQ) {
 363                 td->td_flags &= ~TDF_TSLEEPQ;
 364                 cid = LOOKUP(td->td_wchan);
 365                 gid = TCHASHSHIFT(cid);
 366                 TAILQ_REMOVE(&gd->gd_tsleep_hash[gid], td, td_sleepq);
 367                 if (TAILQ_FIRST(&gd->gd_tsleep_hash[gid]) == NULL) {
 368                         ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid],
 369                                                gd->gd_cpuid);
 370                 }
 371                 td->td_wchan = NULL;
 372                 td->td_wdomain = 0;
 373         }
 374 }
 375
 376 void
 377 tsleep_remove(thread_t td)
 378 {
 379         _tsleep_remove(td);
 380 }
 381
  382 /*
  383  * General sleep call.  Suspends the current process until a wakeup is
  384  * performed on the specified identifier.  The process will then be made
  385  * runnable with the specified priority.  Sleeps at most timo/hz seconds
  386  * (0 means no timeout).  If flags includes PCATCH flag, signals are checked
  387  * before and after sleeping, else signals are not checked.  Returns 0 if
  388  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
  389  * signal needs to be delivered, ERESTART is returned if the current system
  390  * call should be restarted if possible, and EINTR is returned if the system
  391  * call should be interrupted by the signal (return EINTR).
  392  *
  393  * Note that if we are a process, we release_curproc() before messing with
  394  * the LWKT scheduler.
  395  *
  396  * During autoconfiguration or after a panic, a sleep will simply
  397  * lower the priority briefly to allow interrupts, then return.
  398  *
  399  * WARNING!  This code can't block (short of switching away), or bad things
  400  *           will happen.  No getting tokens, no blocking locks, etc.
       *
       * ident - wakeup channel; must not be NULL (asserted).
       * flags - PCATCH enables the signal checks; PINTERLOCKED says the
       *         caller already queued the thread with tsleep_interlock();
       *         the PDOMAIN_MASK bits are recorded by _tsleep_interlock().
       * wmesg - wait message, stored in td_wmesg while descheduled.
       * timo  - timeout in ticks, 0 for none.
       *
       * NOTE(review): the function takes no priority argument, so the
       * "specified priority" wording above looks historical - confirm.
  401  */
  402 int
  403 tsleep(const volatile void *ident, int flags, const char *wmesg, int timo)
  404 {
  405         struct thread *td = curthread;
  406         struct lwp *lp = td->td_lwp;
  407         struct proc *p = td->td_proc;           /* may be NULL */
  408         globaldata_t gd;
  409         int sig;
  410         int catch;
  411         int error;
  412         int oldpri;
  413         struct callout thandle;
  414
  415         /*
  416          * Currently a severe hack.  Make sure any delayed wakeups
  417          * are flushed before we sleep or we might deadlock on whatever
  418          * event we are sleeping on.
  419          */
  420         if (td->td_flags & TDF_DELAYED_WAKEUP)
  421                 wakeup_end_delayed();
  422
  423         /*
  424          * NOTE: removed KTRPOINT, it could cause races due to blocking
  425          * even in stable.  Just scrap it for now.
  426          */
  427         if (!tsleep_crypto_dump && (tsleep_now_works == 0 || panicstr)) {
  428                 /*
  429                  * After a panic, or before we actually have an operational
  430                  * softclock, just give interrupts a chance, then just return;
  431                  *
  432                  * don't run any other procs or panic below,
  433                  * in case this is the idle process and already asleep.
  434                  */
  435                 splz();
  436                 oldpri = td->td_pri;
  437                 lwkt_setpri_self(safepri);
  438                 lwkt_switch();
  439                 lwkt_setpri_self(oldpri);
  440                 return (0);
  441         }
  442         logtsleep2(tsleep_beg, ident);
  443         gd = td->td_gd;
  444         KKASSERT(td != &gd->gd_idlethread);     /* you must be kidding! */
  445         td->td_wakefromcpu = -1;                /* overwritten by _wakeup */
  446
  447         /*
  448          * NOTE: all of this occurs on the current cpu, including any
  449          * callout-based wakeups, so a critical section is a sufficient
  450          * interlock.
  451          *
  452          * The entire sequence through to where we actually sleep must
  453          * run without breaking the critical section.
  454          */
  455         catch = flags & PCATCH;
  456         error = 0;
  457         sig = 0;
  458
  459         crit_enter_quick(td);
  460
  461         KASSERT(ident != NULL, ("tsleep: no ident"));
  462         KASSERT(lp == NULL ||
  463                 lp->lwp_stat == LSRUN ||        /* Obvious */
  464                 lp->lwp_stat == LSSTOP,         /* Set in tstop */
  465                 ("tsleep %p %s %d",
  466                         ident, wmesg, lp->lwp_stat));
  467
  468         /*
  469          * We interlock the sleep queue if the caller has not already done
  470          * it for us.  This must be done before we potentially acquire any
  471          * tokens or we can lose the wakeup.
  472          */
  473         if ((flags & PINTERLOCKED) == 0) {
  474                 _tsleep_interlock(gd, ident, flags);
  475         }
  476
  477         /*
  478          * Setup for the current process (if this is a process).  We must
  479          * interlock with lwp_token to avoid remote wakeup races via
  480          * setrunnable()
  481          */
  482         if (lp) {
  483                 lwkt_gettoken(&lp->lwp_token);
  484
  485                 /*
  486                  * If the umbrella process is in the SCORE state then
  487                  * make sure that the thread is flagged going into a
  488                  * normal sleep to allow the core dump to proceed, otherwise
  489                  * the coredump can end up waiting forever.  If the normal
  490                  * sleep is woken up, the thread will enter a stopped state
  491                  * upon return to userland.
  492                  *
  493                  * We do not want to interrupt or cause a thread exit at
  494                  * this juncture because that will mess-up the state the
  495                  * coredump is trying to save.
  496                  */
  497                 if (p->p_stat == SCORE &&
  498                     (lp->lwp_mpflags & LWP_MP_WSTOP) == 0) {
  499                         atomic_set_int(&lp->lwp_mpflags, LWP_MP_WSTOP);
  500                         ++p->p_nstopped;
  501                 }
  502
  503                 /*
  504                  * PCATCH requested.
  505                  */
  506                 if (catch) {
  507                         /*
  508                          * Early termination if PCATCH was set and a
  509                          * signal is pending, interlocked with the
  510                          * critical section.
  511                          *
  512                          * Early termination only occurs when tsleep() is
  513                          * entered while in a normal LSRUN state.
                               *
                               * NOTE: jumping to resume skips the
                               * _tsleep_remove() below, so the thread can
                               * return still flagged TDF_TSLEEPQ; a later
                               * _tsleep_interlock() handles that case by
                               * unhooking the stale entry first.
  514                          */
  515                         if ((sig = CURSIG(lp)) != 0)
  516                                 goto resume;
  517
  518                         /*
  519                          * Causes ksignal to wake us up if a signal is
  520                          * received (interlocked with lp->lwp_token).
  521                          */
  522                         lp->lwp_flags |= LWP_SINTR;
  523                 }
  524         } else {
  525                 KKASSERT(p == NULL);
  526         }
  527
  528         /*
  529          * Make sure the current process has been untangled from
  530          * the userland scheduler and initialize slptime to start
  531          * counting.
  532          *
  533          * NOTE: td->td_wakefromcpu is pre-set by the release function
  534          *       for the dfly scheduler, and then adjusted by _wakeup()
  535          */
  536         if (lp) {
  537                 p->p_usched->release_curproc(lp);
  538                 lp->lwp_slptime = 0;
  539         }
  540
  541         /*
  542          * If the interlocked flag is set but our cpu bit in the slpqueue
  543          * is no longer set, then a wakeup was processed in between the
  544          * tsleep_interlock() (ours or the callers), and here.  This can
  545          * occur under numerous circumstances including when we release the
  546          * current process.
  547          *
  548          * Extreme loads can cause the sending of an IPI (e.g. wakeup()'s)
  549          * to process incoming IPIs, thus draining incoming wakeups.
  550          */
  551         if ((td->td_flags & TDF_TSLEEPQ) == 0) {
  552                 logtsleep2(ilockfail, ident);
  553                 goto resume;
  554         }
  555
  556         /*
  557          * scheduling is blocked while in a critical section.  Coincide
  558          * the descheduled-by-tsleep flag with the descheduling of the
  559          * lwkt.
  560          *
  561          * The timer callout is localized on our cpu and interlocked by
  562          * our critical section.
  563          */
  564         lwkt_deschedule_self(td);
  565         td->td_flags |= TDF_TSLEEP_DESCHEDULED;
  566         td->td_wmesg = wmesg;
  567
  568         /*
  569          * Setup the timeout, if any.  The timeout is only operable while
  570          * the thread is flagged descheduled.
  571          */
  572         KKASSERT((td->td_flags & TDF_TIMEOUT) == 0);
  573         if (timo) {
  574                 callout_init_mp(&thandle);
  575                 callout_reset(&thandle, timo, endtsleep, td);
  576         }
  577
  578         /*
  579          * Beddy bye bye.
  580          */
  581         if (lp) {
  582                 /*
  583                  * Ok, we are sleeping.  Place us in the SSLEEP state.
  584                  */
  585                 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
  586
  587                 /*
  588                  * tstop() sets LSSTOP, so don't fiddle with that.
  589                  */
  590                 if (lp->lwp_stat != LSSTOP)
  591                         lp->lwp_stat = LSSLEEP;
  592                 lp->lwp_ru.ru_nvcsw++;
  593                 p->p_usched->uload_update(lp);
  594                 lwkt_switch();
  595
  596                 /*
  597                  * And when we are woken up, put us back in LSRUN.  If we
  598                  * slept for over a second, recalculate our estcpu.
  599                  */
  600                 lp->lwp_stat = LSRUN;
  601                 if (lp->lwp_slptime) {
  602                         p->p_usched->uload_update(lp);
  603                         p->p_usched->recalculate(lp);
  604                 }
  605                 lp->lwp_slptime = 0;
  606         } else {
  607                 lwkt_switch();
  608         }
  609
  610         /*
  611          * Make sure we haven't switched cpus while we were asleep.  It's
  612          * not supposed to happen.  Cleanup our temporary flags.
  613          */
  614         KKASSERT(gd == td->td_gd);
  615
  616         /*
  617          * Cleanup the timeout.  If the timeout has already occurred thandle
  618          * has already been stopped, otherwise stop thandle.  If the timeout
  619          * is running (the callout thread must be blocked trying to get
  620          * lwp_token) then wait for us to get scheduled.
  621          */
  622         if (timo) {
  623                 while (td->td_flags & TDF_TIMEOUT_RUNNING) {
                              /*
                               * NOTE(review): lp is dereferenced without a
                               * NULL check.  Presumably the flag can only be
                               * observed set for threads with an lwp, since
                               * endtsleep() only blocks when it acquires
                               * lwp_token - confirm.
                               */
  624                         /* else we won't get rescheduled! */
  625                         if (lp->lwp_stat != LSSTOP)
  626                                 lp->lwp_stat = LSSLEEP;
  627                         lwkt_deschedule_self(td);
  628                         td->td_wmesg = "tsrace";
  629                         lwkt_switch();
  630                         kprintf("td %p %s: timeout race\n", td, td->td_comm);
  631                 }
  632                 if (td->td_flags & TDF_TIMEOUT) {
  633                         td->td_flags &= ~TDF_TIMEOUT;
  634                         error = EWOULDBLOCK;
  635                 } else {
  636                         /* does not block when on same cpu */
  637                         callout_stop(&thandle);
  638                 }
  639         }
  640         td->td_flags &= ~TDF_TSLEEP_DESCHEDULED;
  641
  642         /*
  643          * Make sure we have been removed from the sleepq.  In most
  644          * cases this will have been done for us already but it is
  645          * possible for a scheduling IPI to be in-flight from a
  646          * previous tsleep/tsleep_interlock() or due to a straight-out
  647          * call to lwkt_schedule() (in the case of an interrupt thread),
  648          * causing a spurious wakeup.
  649          */
  650         _tsleep_remove(td);
  651         td->td_wmesg = NULL;
  652
  653         /*
  654          * Figure out the correct error return.  If interrupted by a
  655          * signal we want to return EINTR or ERESTART.
               *
               * A timeout (error already EWOULDBLOCK) takes precedence over
               * a pending signal because the signal check requires error == 0.
  656          */
  657 resume:
  658         if (lp) {
  659                 if (catch && error == 0) {
  660                         if (sig != 0 || (sig = CURSIG(lp))) {
  661                                 if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
  662                                         error = EINTR;
  663                                 else
  664                                         error = ERESTART;
  665                         }
  666                 }
  667
  668                 lp->lwp_flags &= ~LWP_SINTR;
  669
  670                 /*
  671                  * Unconditionally set us to LSRUN on resume.  lwp_stat could
  672                  * be in a weird state due to the goto resume, particularly
  673                  * when tsleep() is called from tstop().
  674                  */
  675                 lp->lwp_stat = LSRUN;
  676                 lwkt_reltoken(&lp->lwp_token);
  677         }
  678         logtsleep1(tsleep_end);
  679         crit_exit_quick(td);
  680         return (error);
  681 }
 682
 683 /*
 684  * Interlocked spinlock sleep.  An exclusively held spinlock must
 685  * be passed to ssleep().  The function will atomically release the
 686  * spinlock and tsleep on the ident, then reacquire the spinlock and
 687  * return.
 688  *
 689  * This routine is fairly important along the critical path, so optimize it
 690  * heavily.
 691  */
 692 int
 693 ssleep(const volatile void *ident, struct spinlock *spin, int flags,
 694        const char *wmesg, int timo)
 695 {
 696         globaldata_t gd = mycpu;
 697         int error;
 698
 699         _tsleep_interlock(gd, ident, flags);
 700         spin_unlock_quick(gd, spin);
 701         error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
 702         _spin_lock_quick(gd, spin, wmesg);
 703
 704         return (error);
 705 }
 706
 707 int
 708 lksleep(const volatile void *ident, struct lock *lock, int flags,
 709         const char *wmesg, int timo)
 710 {
 711         globaldata_t gd = mycpu;
 712         int error;
 713
 714         _tsleep_interlock(gd, ident, flags);
 715         lockmgr(lock, LK_RELEASE);
 716         error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
 717         lockmgr(lock, LK_EXCLUSIVE);
 718
 719         return (error);
 720 }
 721
 722 /*
 723  * Interlocked mutex sleep.  An exclusively held mutex must be passed
 724  * to mtxsleep().  The function will atomically release the mutex
 725  * and tsleep on the ident, then reacquire the mutex and return.
 726  */
 727 int
 728 mtxsleep(const volatile void *ident, struct mtx *mtx, int flags,
 729          const char *wmesg, int timo)
 730 {
 731         globaldata_t gd = mycpu;
 732         int error;
 733
 734         _tsleep_interlock(gd, ident, flags);
 735         mtx_unlock(mtx);
 736         error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
 737         mtx_lock_ex_quick(mtx);
 738
 739         return (error);
 740 }
 741
 742 /*
 743  * Interlocked serializer sleep.  An exclusively held serializer must
 744  * be passed to zsleep().  The function will atomically release
 745  * the serializer and tsleep on the ident, then reacquire the serializer
 746  * and return.
 747  */
 748 int
 749 zsleep(const volatile void *ident, struct lwkt_serialize *slz, int flags,
 750        const char *wmesg, int timo)
 751 {
 752         globaldata_t gd = mycpu;
 753         int ret;
 754
 755         ASSERT_SERIALIZED(slz);
 756
 757         _tsleep_interlock(gd, ident, flags);
 758         lwkt_serialize_exit(slz);
 759         ret = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
 760         lwkt_serialize_enter(slz);
 761
 762         return ret;
 763 }
 764
 765 /*
 766  * Directly block on the LWKT thread by descheduling it.  This
 767  * is much faster then tsleep(), but the only legal way to wake
 768  * us up is to directly schedule the thread.
 769  *
 770  * Setting TDF_SINTR will cause new signals to directly schedule us.
 771  *
 772  * This routine must be called while in a critical section.
 773  */
 774 int
 775 lwkt_sleep(const char *wmesg, int flags)
 776 {
 777         thread_t td = curthread;
 778         int sig;
 779
 780         if ((flags & PCATCH) == 0 || td->td_lwp == NULL) {
 781                 td->td_flags |= TDF_BLOCKED;
 782                 td->td_wmesg = wmesg;
 783                 lwkt_deschedule_self(td);
 784                 lwkt_switch();
 785                 td->td_wmesg = NULL;
 786                 td->td_flags &= ~TDF_BLOCKED;
 787                 return(0);
 788         }
 789         if ((sig = CURSIG(td->td_lwp)) != 0) {
 790                 if (SIGISMEMBER(td->td_proc->p_sigacts->ps_sigintr, sig))
 791                         return(EINTR);
 792                 else
 793                         return(ERESTART);
 794
 795         }
 796         td->td_flags |= TDF_BLOCKED | TDF_SINTR;
 797         td->td_wmesg = wmesg;
 798         lwkt_deschedule_self(td);
 799         lwkt_switch();
 800         td->td_flags &= ~(TDF_BLOCKED | TDF_SINTR);
 801         td->td_wmesg = NULL;
 802         return(0);
 803 }
 804
  805 /*
  806  * Implement the timeout for tsleep.
  807  *
  808  * This type of callout timeout is scheduled on the same cpu the process
  809  * is sleeping on.  Also, at the moment, the MP lock is held.
       *
       * arg is the sleeping thread, as passed to callout_reset() by tsleep().
       * The thread is flagged TDF_TIMEOUT, which tsleep() turns into an
       * EWOULDBLOCK return, and is then made runnable again: through
       * setrunnable() when it has an lwp, otherwise by removing it from the
       * sleep queue and scheduling it directly.
  810  */
  811 static void
  812 endtsleep(void *arg)
  813 {
  814         thread_t td = arg;
  815         struct lwp *lp;
  816
  817         /*
  818          * We are going to have to get the lwp_token, which means we might
  819          * block.  This can race a tsleep getting woken up by other means
  820          * so set TDF_TIMEOUT_RUNNING to force the tsleep to wait for our
  821          * processing to complete (sorry tsleep!).
  822          *
  823          * We can safely set td_flags because td MUST be on the same cpu
  824          * as we are.
  825          */
  826         KKASSERT(td->td_gd == mycpu);
  827         crit_enter();
  828         td->td_flags |= TDF_TIMEOUT_RUNNING | TDF_TIMEOUT;
  829
  830         /*
  831          * This can block but TDF_TIMEOUT_RUNNING will prevent the thread
  832          * from exiting the tsleep on us.  The flag is interlocked by virtue
  833          * of lp being on the same cpu as we are.
  834          */
  835         if ((lp = td->td_lwp) != NULL)
  836                 lwkt_gettoken(&lp->lwp_token);
  837
              /* The timeout is only valid while tsleep() has the thread descheduled. */
  838         KKASSERT(td->td_flags & TDF_TSLEEP_DESCHEDULED);
  839
  840         if (lp) {
  841                 /*
  842                  * callout timer should normally never be set in tstop()
  843                  * because it passes a timeout of 0.  However, there is a
  844                  * case during thread exit (which SSTOP's all the threads)
  845                  * for which tstop() must break out and can (properly) leave
  846                  * the thread in LSSTOP.
  847                  */
  848                 KKASSERT(lp->lwp_stat != LSSTOP ||
  849                          (lp->lwp_mpflags & LWP_MP_WEXIT));
  850                 setrunnable(lp);
  851                 lwkt_reltoken(&lp->lwp_token);
  852         } else {
                      /* Pure kernel thread: dequeue and schedule it ourselves. */
  853                 _tsleep_remove(td);
  854                 lwkt_schedule(td);
  855         }
  856         KKASSERT(td->td_gd == mycpu);
              /* Done; tsleep() may now leave its timeout race loop. */
  857         td->td_flags &= ~TDF_TIMEOUT_RUNNING;
  858         crit_exit();
  859 }
 860
 861 /*
 862  * Make all processes sleeping on the specified identifier runnable.
 863  * count may be zero or one only.
 864  *
 865  * The domain encodes the sleep/wakeup domain, flags, plus the originating
 866  * cpu.
 867  *
 868  * This call may run without the MP lock held.  We can only manipulate thread
 869  * state on the cpu owning the thread.  We CANNOT manipulate process state
 870  * at all.
 871  *
 872  * _wakeup() can be passed to an IPI so we can't use (const volatile
 873  * void *ident).
 874  */
 875 static void
 876 _wakeup(void *ident, int domain)
 877 {
 878         struct tslpque *qp;
 879         struct thread *td;
 880         struct thread *ntd;
 881         globaldata_t gd;
 882         cpumask_t mask;
 883         uint32_t cid;
 884         uint32_t gid;
 885
 886         crit_enter();
 887         logtsleep2(wakeup_beg, ident);
 888         gd = mycpu;
 889         cid = LOOKUP(ident);
 890         gid = TCHASHSHIFT(cid);
 891         qp = &gd->gd_tsleep_hash[gid];
 892 restart:
 893         for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) {
 894                 ntd = TAILQ_NEXT(td, td_sleepq);
 895                 if (td->td_wchan == ident &&
 896                     td->td_wdomain == (domain & PDOMAIN_MASK)
 897                 ) {
 898                         KKASSERT(td->td_gd == gd);
 899                         _tsleep_remove(td);
 900                         td->td_wakefromcpu = PWAKEUP_DECODE(domain);
 901                         if (td->td_flags & TDF_TSLEEP_DESCHEDULED) {
 902                                 lwkt_schedule(td);
 903                                 if (domain & PWAKEUP_ONE)
 904                                         goto done;
 905                         }
 906                         goto restart;
 907                 }
 908         }
 909
 910         /*
 911          * Because a bunch of cpumask array entries cover the same queue, it
 912          * is possible for our bit to remain set in some of them and cause
 913          * spurious wakeup IPIs later on.  Make sure that the bit is cleared
 914          * when a spurious IPI occurs to prevent further spurious IPIs.
 915          */
 916         if (TAILQ_FIRST(qp) == NULL) {
 917                 ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid], gd->gd_cpuid);
 918         }
 919
 920         /*
 921          * We finished checking the current cpu but there still may be
 922          * more work to do.  Either wakeup_one was requested and no matching
 923          * thread was found, or a normal wakeup was requested and we have
 924          * to continue checking cpus.
 925          *
 926          * It should be noted that this scheme is actually less expensive then
 927          * the old scheme when waking up multiple threads, since we send
 928          * only one IPI message per target candidate which may then schedule
 929          * multiple threads.  Before we could have wound up sending an IPI
 930          * message for each thread on the target cpu (!= current cpu) that
 931          * needed to be woken up.
 932          *
 933          * NOTE: Wakeups occuring on remote cpus are asynchronous.  This
 934          *       should be ok since we are passing idents in the IPI rather
 935          *       then thread pointers.
 936          *
 937          * NOTE: We MUST mfence (or use an atomic op) prior to reading
 938          *       the cpumask, as another cpu may have written to it in
 939          *       a fashion interlocked with whatever the caller did before
 940          *       calling wakeup().  Otherwise we might miss the interaction
 941          *       (kern_mutex.c can cause this problem).
 942          *
 943          *       lfence is insufficient as it may allow a written state to
 944          *       reorder around the cpumask load.
 945          */
 946         if ((domain & PWAKEUP_MYCPU) == 0) {
 947                 cpu_mfence();
 948                 mask = slpque_cpumasks[cid];
 949                 CPUMASK_ANDMASK(mask, gd->gd_other_cpus);
 950                 if (CPUMASK_TESTNZERO(mask)) {
 951                         lwkt_send_ipiq2_mask(mask, _wakeup, ident,
 952                                              domain | PWAKEUP_MYCPU);
 953                 }
 954         }
 955 done:
 956         logtsleep1(wakeup_end);
 957         crit_exit();
 958 }
 959
 960 /*
 961  * Wakeup all threads tsleep()ing on the specified ident, on all cpus
 962  */
 963 void
 964 wakeup(const volatile void *ident)
 965 {
 966     globaldata_t gd = mycpu;
 967     thread_t td = gd->gd_curthread;
 968
 969     if (td && (td->td_flags & TDF_DELAYED_WAKEUP)) {
 970         /*
 971          * If we are in a delayed wakeup section, record up to two wakeups in
 972          * a per-CPU queue and issue them when we block or exit the delayed
 973          * wakeup section.
 974          */
 975         if (atomic_cmpset_ptr(&gd->gd_delayed_wakeup[0], NULL, ident))
 976                 return;
 977         if (atomic_cmpset_ptr(&gd->gd_delayed_wakeup[1], NULL, ident))
 978                 return;
 979
 980         ident = atomic_swap_ptr(__DEQUALIFY(volatile void **, &gd->gd_delayed_wakeup[1]),
 981                                 __DEALL(ident));
 982         ident = atomic_swap_ptr(__DEQUALIFY(volatile void **, &gd->gd_delayed_wakeup[0]),
 983                                 __DEALL(ident));
 984     }
 985
 986     _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, gd->gd_cpuid));
 987 }
 988
 989 /*
 990  * Wakeup one thread tsleep()ing on the specified ident, on any cpu.
 991  */
 992 void
 993 wakeup_one(const volatile void *ident)
 994 {
 995     /* XXX potentially round-robin the first responding cpu */
 996     _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
 997                             PWAKEUP_ONE);
 998 }
 999
1000 /*
1001  * Wakeup threads tsleep()ing on the specified ident on the current cpu
1002  * only.
1003  */
1004 void
1005 wakeup_mycpu(const volatile void *ident)
1006 {
1007     _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
1008                             PWAKEUP_MYCPU);
1009 }
1010
1011 /*
1012  * Wakeup one thread tsleep()ing on the specified ident on the current cpu
1013  * only.
1014  */
1015 void
1016 wakeup_mycpu_one(const volatile void *ident)
1017 {
1018     /* XXX potentially round-robin the first responding cpu */
1019     _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
1020                             PWAKEUP_MYCPU | PWAKEUP_ONE);
1021 }
1022
1023 /*
1024  * Wakeup all thread tsleep()ing on the specified ident on the specified cpu
1025  * only.
1026  */
1027 void
1028 wakeup_oncpu(globaldata_t gd, const volatile void *ident)
1029 {
1030     globaldata_t mygd = mycpu;
1031     if (gd == mycpu) {
1032         _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
1033                                 PWAKEUP_MYCPU);
1034     } else {
1035         lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident),
1036                         PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
1037                         PWAKEUP_MYCPU);
1038     }
1039 }
1040
1041 /*
1042  * Wakeup one thread tsleep()ing on the specified ident on the specified cpu
1043  * only.
1044  */
1045 void
1046 wakeup_oncpu_one(globaldata_t gd, const volatile void *ident)
1047 {
1048     globaldata_t mygd = mycpu;
1049     if (gd == mygd) {
1050         _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
1051                                 PWAKEUP_MYCPU | PWAKEUP_ONE);
1052     } else {
1053         lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident),
1054                         PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
1055                         PWAKEUP_MYCPU | PWAKEUP_ONE);
1056     }
1057 }
1058
1059 /*
1060  * Wakeup all threads waiting on the specified ident that slept using
1061  * the specified domain, on all cpus.
1062  */
1063 void
1064 wakeup_domain(const volatile void *ident, int domain)
1065 {
1066     _wakeup(__DEALL(ident), PWAKEUP_ENCODE(domain, mycpu->gd_cpuid));
1067 }
1068
1069 /*
1070  * Wakeup one thread waiting on the specified ident that slept using
1071  * the specified  domain, on any cpu.
1072  */
1073 void
1074 wakeup_domain_one(const volatile void *ident, int domain)
1075 {
1076     /* XXX potentially round-robin the first responding cpu */
1077     _wakeup(__DEALL(ident),
1078             PWAKEUP_ENCODE(domain, mycpu->gd_cpuid) | PWAKEUP_ONE);
1079 }
1080
1081 void
1082 wakeup_start_delayed(void)
1083 {
1084     globaldata_t gd = mycpu;
1085
1086     crit_enter();
1087     gd->gd_curthread->td_flags |= TDF_DELAYED_WAKEUP;
1088     crit_exit();
1089 }
1090
1091 void
1092 wakeup_end_delayed(void)
1093 {
1094     globaldata_t gd = mycpu;
1095
1096     if (gd->gd_curthread->td_flags & TDF_DELAYED_WAKEUP) {
1097         crit_enter();
1098         gd->gd_curthread->td_flags &= ~TDF_DELAYED_WAKEUP;
1099         if (gd->gd_delayed_wakeup[0] || gd->gd_delayed_wakeup[1]) {
1100             if (gd->gd_delayed_wakeup[0]) {
1101                     wakeup(gd->gd_delayed_wakeup[0]);
1102                     gd->gd_delayed_wakeup[0] = NULL;
1103             }
1104             if (gd->gd_delayed_wakeup[1]) {
1105                     wakeup(gd->gd_delayed_wakeup[1]);
1106                     gd->gd_delayed_wakeup[1] = NULL;
1107             }
1108         }
1109         crit_exit();
1110     }
1111 }
1112
1113 /*
1114  * setrunnable()
1115  *
1116  * Make a process runnable.  lp->lwp_token must be held on call and this
1117  * function must be called from the cpu owning lp.
1118  *
1119  * This only has an effect if we are in LSSTOP or LSSLEEP.
1120  */
1121 void
1122 setrunnable(struct lwp *lp)
1123 {
1124         thread_t td = lp->lwp_thread;
1125
1126         ASSERT_LWKT_TOKEN_HELD(&lp->lwp_token);
1127         KKASSERT(td->td_gd == mycpu);
1128         crit_enter();
1129         if (lp->lwp_stat == LSSTOP)
1130                 lp->lwp_stat = LSSLEEP;
1131         if (lp->lwp_stat == LSSLEEP) {
1132                 _tsleep_remove(td);
1133                 lwkt_schedule(td);
1134         } else if (td->td_flags & TDF_SINTR) {
1135                 lwkt_schedule(td);
1136         }
1137         crit_exit();
1138 }
1139
1140 /*
1141  * The process is stopped due to some condition, usually because p_stat is
1142  * set to SSTOP, but also possibly due to being traced.
1143  *
1144  * Caller must hold p->p_token
1145  *
1146  * NOTE!  If the caller sets SSTOP, the caller must also clear P_WAITED
1147  * because the parent may check the child's status before the child actually
1148  * gets to this routine.
1149  *
1150  * This routine is called with the current lwp only, typically just
1151  * before returning to userland if the process state is detected as
1152  * possibly being in a stopped state.
1153  */
1154 void
1155 tstop(void)
1156 {
1157         struct lwp *lp = curthread->td_lwp;
1158         struct proc *p = lp->lwp_proc;
1159         struct proc *q;
1160
1161         lwkt_gettoken(&lp->lwp_token);
1162         crit_enter();
1163
1164         /*
1165          * If LWP_MP_WSTOP is set, we were sleeping
1166          * while our process was stopped.  At this point
1167          * we were already counted as stopped.
1168          */
1169         if ((lp->lwp_mpflags & LWP_MP_WSTOP) == 0) {
1170                 /*
1171                  * If we're the last thread to stop, signal
1172                  * our parent.
1173                  */
1174                 p->p_nstopped++;
1175                 atomic_set_int(&lp->lwp_mpflags, LWP_MP_WSTOP);
1176                 wakeup(&p->p_nstopped);
1177                 if (p->p_nstopped == p->p_nthreads) {
1178                         /*
1179                          * Token required to interlock kern_wait()
1180                          */
1181                         q = p->p_pptr;
1182                         PHOLD(q);
1183                         lwkt_gettoken(&q->p_token);
1184                         p->p_flags &= ~P_WAITED;
1185                         wakeup(p->p_pptr);
1186                         if ((q->p_sigacts->ps_flag & PS_NOCLDSTOP) == 0)
1187                                 ksignal(q, SIGCHLD);
1188                         lwkt_reltoken(&q->p_token);
1189                         PRELE(q);
1190                 }
1191         }
1192
1193         /*
1194          * Wait here while in a stopped state, interlocked with lwp_token.
1195          * We must break-out if the whole process is trying to exit.
1196          */
1197         while (STOPLWP(p, lp)) {
1198                 lp->lwp_stat = LSSTOP;
1199                 tsleep(p, 0, "stop", 0);
1200         }
1201         p->p_nstopped--;
1202         atomic_clear_int(&lp->lwp_mpflags, LWP_MP_WSTOP);
1203         crit_exit();
1204         lwkt_reltoken(&lp->lwp_token);
1205 }
1206
1207 /*
1208  * Compute a tenex style load average of a quantity on
1209  * 1, 5 and 15 minute intervals.  This is a pcpu callout.
1210  *
1211  * We segment the lwp scan on a pcpu basis.  This does NOT
1212  * mean the associated lwps are on this cpu, it is done
1213  * just to break the work up.
1214  *
1215  * The callout on cpu0 rolls up the stats from the other
1216  * cpus.
1217  */
1218 static int loadav_count_runnable(struct lwp *p, void *data);
1219
1220 static void
1221 loadav(void *arg)
1222 {
1223         globaldata_t gd = mycpu;
1224         struct loadavg *avg;
1225         int i, nrun;
1226
1227         nrun = 0;
1228         alllwp_scan(loadav_count_runnable, &nrun, 1);
1229         gd->gd_loadav_nrunnable = nrun;
1230         if (gd->gd_cpuid == 0) {
1231                 avg = &averunnable;
1232                 nrun = 0;
1233                 for (i = 0; i < ncpus; ++i)
1234                         nrun += globaldata_find(i)->gd_loadav_nrunnable;
1235                 for (i = 0; i < 3; i++) {
1236                         avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
1237                             (long)nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
1238                 }
1239         }
1240
1241         /*
1242          * Schedule the next update to occur after 5 seconds, but add a
1243          * random variation to avoid synchronisation with processes that
1244          * run at regular intervals.
1245          */
1246         callout_reset(&gd->gd_loadav_callout,
1247                       hz * 4 + (int)(krandom() % (hz * 2 + 1)),
1248                       loadav, NULL);
1249 }
1250
1251 static int
1252 loadav_count_runnable(struct lwp *lp, void *data)
1253 {
1254         int *nrunp = data;
1255         thread_t td;
1256
1257         switch (lp->lwp_stat) {
1258         case LSRUN:
1259                 if ((td = lp->lwp_thread) == NULL)
1260                         break;
1261                 if (td->td_flags & TDF_BLOCKED)
1262                         break;
1263                 ++*nrunp;
1264                 break;
1265         default:
1266                 break;
1267         }
1268         lwkt_yield();
1269         return(0);
1270 }
1271
1272 /*
1273  * Regular data collection
1274  */
1275 static uint64_t
1276 collect_load_callback(int n)
1277 {
1278         int fscale = averunnable.fscale;
1279
1280         return ((averunnable.ldavg[0] * 100 + (fscale >> 1)) / fscale);
1281 }
1282
1283 static void
1284 sched_setup(void *dummy __unused)
1285 {
1286         globaldata_t save_gd = mycpu;
1287         globaldata_t gd;
1288         int n;
1289
1290         kcollect_register(KCOLLECT_LOAD, "load", collect_load_callback,
1291                           KCOLLECT_SCALE(KCOLLECT_LOAD_FORMAT, 0));
1292
1293         /*
1294          * Kick off timeout driven events by calling first time.  We
1295          * split the work across available cpus to help scale it,
1296          * it can eat a lot of cpu when there are a lot of processes
1297          * on the system.
1298          */
1299         for (n = 0; n < ncpus; ++n) {
1300                 gd = globaldata_find(n);
1301                 lwkt_setcpu_self(gd);
1302                 callout_init_mp(&gd->gd_loadav_callout);
1303                 callout_init_mp(&gd->gd_schedcpu_callout);
1304                 schedcpu(NULL);
1305                 loadav(NULL);
1306         }
1307         lwkt_setcpu_self(save_gd);
1308 }
1309
1310 /*
1311  * Extremely early initialization, dummy-up the tables so we don't have
1312  * to conditionalize for NULL in _wakeup() and tsleep_interlock().  Even
1313  * though the system isn't blocking this early, these functions still
1314  * try to access the hash table.
1315  *
1316  * This setup will be overridden once sched_dyninit() -> sleep_gdinit()
1317  * is called.
1318  */
1319 void
1320 sleep_early_gdinit(globaldata_t gd)
1321 {
1322         static struct tslpque   dummy_slpque;
1323         static cpumask_t dummy_cpumasks;
1324
1325         slpque_tablesize = 1;
1326         gd->gd_tsleep_hash = &dummy_slpque;
1327         slpque_cpumasks = &dummy_cpumasks;
1328         TAILQ_INIT(&dummy_slpque);
1329 }
1330
1331 /*
1332  * PCPU initialization.  Called after KMALLOC is operational, by
1333  * sched_dyninit() for cpu 0, and by mi_gdinit() for other cpus later.
1334  *
1335  * WARNING! The pcpu hash table is smaller than the global cpumask
1336  *          hash table, which can save us a lot of memory when maxproc
1337  *          is set high.
1338  */
1339 void
1340 sleep_gdinit(globaldata_t gd)
1341 {
1342         struct thread *td;
1343         uint32_t n;
1344         uint32_t i;
1345
1346         /*
1347          * This shouldn't happen, that is there shouldn't be any threads
1348          * waiting on the dummy tsleep queue this early in the boot.
1349          */
1350         if (gd->gd_cpuid == 0) {
1351                 TAILQ_FOREACH(td, &gd->gd_tsleep_hash[0], td_sleepq) {
1352                         kprintf("SLEEP_GDINIT SWITCH %s\n", td->td_comm);
1353                 }
1354         }
1355
1356         /*
1357          * Note that we have to allocate one extra slot because we are
1358          * shifting a modulo value.  TCHASHSHIFT(slpque_tablesize - 1) can
1359          * return the same value as TCHASHSHIFT(slpque_tablesize).
1360          */
1361         n = TCHASHSHIFT(slpque_tablesize) + 1;
1362
1363         gd->gd_tsleep_hash = kmalloc(sizeof(struct tslpque) * n,
1364                                      M_TSLEEP, M_WAITOK | M_ZERO);
1365         for (i = 0; i < n; ++i)
1366                 TAILQ_INIT(&gd->gd_tsleep_hash[i]);
1367 }
1368
1369 /*
1370  * Dynamic initialization after the memory system is operational.
1371  */
1372 static void
1373 sched_dyninit(void *dummy __unused)
1374 {
1375         int tblsize;
1376         int tblsize2;
1377         int n;
1378
1379         /*
1380          * Calculate table size for slpque hash.  We want a prime number
1381          * large enough to avoid overloading slpque_cpumasks when the
1382          * system has a large number of sleeping processes, which will
1383          * spam IPIs on wakeup().
1384          *
1385          * While it is true this is really a per-lwp factor, generally
1386          * speaking the maxproc limit is a good metric to go by.
1387          */
1388         for (tblsize = maxproc | 1; ; tblsize += 2) {
1389                 if (tblsize % 3 == 0)
1390                         continue;
1391                 if (tblsize % 5 == 0)
1392                         continue;
1393                 tblsize2 = (tblsize / 2) | 1;
1394                 for (n = 7; n < tblsize2; n += 2) {
1395                         if (tblsize % n == 0)
1396                                 break;
1397                 }
1398                 if (n == tblsize2)
1399                         break;
1400         }
1401
1402         /*
1403          * PIDs are currently limited to 6 digits.  Cap the table size
1404          * at double this.
1405          */
1406         if (tblsize > 2000003)
1407                 tblsize = 2000003;
1408
1409         slpque_tablesize = tblsize;
1410         slpque_cpumasks = kmalloc(sizeof(*slpque_cpumasks) * slpque_tablesize,
1411                                   M_TSLEEP, M_WAITOK | M_ZERO);
1412         sleep_gdinit(mycpu);
1413 }