sys/kern/kern_synch.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1990, 1991, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)kern_synch.c        8.9 (Berkeley) 5/19/95
  35  * $FreeBSD: src/sys/kern/kern_synch.c,v 1.87.2.6 2002/10/13 07:29:53 kbyanc Exp $
  36  */
  37
  38 #include "opt_ktrace.h"
  39
  40 #include <sys/param.h>
  41 #include <sys/systm.h>
  42 #include <sys/proc.h>
  43 #include <sys/kernel.h>
  44 #include <sys/signalvar.h>
  45 #include <sys/resourcevar.h>
  46 #include <sys/vmmeter.h>
  47 #include <sys/sysctl.h>
  48 #include <sys/lock.h>
  49 #include <sys/uio.h>
  50 #include <sys/kcollect.h>
  51 #ifdef KTRACE
  52 #include <sys/ktrace.h>
  53 #endif
  54 #include <sys/ktr.h>
  55 #include <sys/serialize.h>
  56
  57 #include <sys/signal2.h>
  58 #include <sys/thread2.h>
  59 #include <sys/spinlock2.h>
  60 #include <sys/mutex2.h>
  61
  62 #include <machine/cpu.h>
  63 #include <machine/smp.h>
  64
   65 TAILQ_HEAD(tslpque, thread);            /* struct tslpque: tail-queue head of struct thread */
   66
   67 static void sched_setup (void *dummy);
   68 SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL);
   69 static void sched_dyninit (void *dummy);
   70 SYSINIT(sched_dyninit, SI_BOOT1_DYNALLOC, SI_ORDER_FIRST, sched_dyninit, NULL);
   71
      /*
       * lbolt and lbolt_syncer are sleep identifiers: schedcpu() issues a
       * wakeup() on &lbolt and on lbolt_syncer each time it runs on cpu 0.
       */
   72 int     lbolt;
   73 void    *lbolt_syncer;
   74 int     ncpus;
   75 int     ncpus2, ncpus2_shift, ncpus2_mask;      /* note: mask not cpumask_t */
   76 int     ncpus_fit, ncpus_fit_mask;              /* note: mask not cpumask_t */
   77 int     safepri;                /* priority tsleep() drops to in its early-boot/panic path */
   78 int     tsleep_now_works;       /* while 0, tsleep() only yields and returns 0 */
   79 int     tsleep_crypto_dump = 0; /* non-zero: tsleep() really sleeps even if panicstr is set */
   80
   81 MALLOC_DEFINE(M_TSLEEP, "tslpque", "tsleep queues");
   82
   83 #define __DEALL(ident)  __DEQUALIFY(void *, ident)
  84
   85 #if !defined(KTR_TSLEEP)
   86 #define KTR_TSLEEP      KTR_ALL
   87 #endif
      /*
       * Trace events for the tsleep/wakeup paths.  In the code visible here
       * tsleep() logs tsleep_beg on entry, ilockfail when it finds itself
       * already removed from the sleep queue, and tsleep_end on exit.
       * wakeup_beg/wakeup_end are not logged in this part of the file.
       */
   88 KTR_INFO_MASTER(tsleep);
   89 KTR_INFO(KTR_TSLEEP, tsleep, tsleep_beg, 0, "tsleep enter %p", const volatile void *ident);
   90 KTR_INFO(KTR_TSLEEP, tsleep, tsleep_end, 1, "tsleep exit");
   91 KTR_INFO(KTR_TSLEEP, tsleep, wakeup_beg, 2, "wakeup enter %p", const volatile void *ident);
   92 KTR_INFO(KTR_TSLEEP, tsleep, wakeup_end, 3, "wakeup exit");
   93 KTR_INFO(KTR_TSLEEP, tsleep, ilockfail,  4, "interlock failed %p", const volatile void *ident);
   94
      /*
       * Logging shorthands: both paste a "tsleep_" prefix onto the event
       * name before handing it to KTR_LOG(); logtsleep2 also passes one value.
       */
   95 #define logtsleep1(name)        KTR_LOG(tsleep_ ## name)
   96 #define logtsleep2(name, val)   KTR_LOG(tsleep_ ## name, val)
  97
   98 struct loadavg averunnable =
   99         { {0, 0, 0}, FSCALE };  /* load average, of runnable procs */
  100 /*
  101  * Constants for averages over 1, 5, and 15 minutes
  102  * when sampling at 5 second intervals.
       *
       * Each entry is the decay factor exp(-5s / period) scaled by FSCALE
       * (fixed point): 5/60 = 1/12, 5/300 = 1/60, 5/900 = 1/180.
  103  */
  104 static fixpt_t cexp[3] = {
  105         0.9200444146293232 * FSCALE,    /* exp(-1/12) */
  106         0.9834714538216174 * FSCALE,    /* exp(-1/60) */
  107         0.9944598480048967 * FSCALE,    /* exp(-1/180) */
  108 };
  109
  110 static void     endtsleep (void *);
  111 static void     loadav (void *arg);
  112 static void     schedcpu (void *arg);
  113
      /*
       * kern.pctcpu_decay: divisor used by schedcpu_stats() to decay
       * lwp_pctcpu once per pass for lwps whose lwp_slptime exceeds 1.
       * The value is clamped to the range 1..100 where it is used.
       */
  114 static int pctcpu_decay = 10;
  115 SYSCTL_INT(_kern, OID_AUTO, pctcpu_decay, CTLFLAG_RW, &pctcpu_decay, 0, "");
  116
  117 /*
  118  * kernel uses `FSCALE', userland (SHOULD) use kern.fscale
  119  */
  120 int     fscale __unused = FSCALE;       /* exported to systat */
  121 SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
 122
  123 /*
  124  * Recompute process priorities, once a second.
  125  *
  126  * Since the userland schedulers are typically event oriented, if the
  127  * estcpu calculation at wakeup() time is not sufficient to make a
  128  * process runnable relative to other processes in the system we have
  129  * a 1-second recalc to help out.
  130  *
  131  * This code also allows us to store sysclock_t data in the process structure
  132  * without fear of an overrun, since sysclock_t are guaranteed to hold
  133  * several seconds worth of count.
  134  *
  135  * WARNING!  callouts can preempt normal threads.  However, they will not
  136  * preempt a thread holding a spinlock so we *can* safely use spinlocks.
       *
       * schedcpu() is a callout handler ('arg' is unused).  It runs the two
       * allproc scans below (statistics first, then resource limits), wakes
       * sleepers on &lbolt and lbolt_syncer when executing on cpu 0, and
       * finally re-arms this cpu's gd_schedcpu_callout to fire again after
       * hz ticks.
  137  */
  138 static int schedcpu_stats(struct proc *p, void *data __unused);
  139 static int schedcpu_resource(struct proc *p, void *data __unused);
  140
  141 static void
  142 schedcpu(void *arg)
  143 {
  144         allproc_scan(schedcpu_stats, NULL, 1);
  145         allproc_scan(schedcpu_resource, NULL, 1);
              /* only cpu 0 issues the lbolt wakeups */
  146         if (mycpu->gd_cpuid == 0) {
  147                 wakeup((caddr_t)&lbolt);
  148                 wakeup(lbolt_syncer);
  149         }
  150         callout_reset(&mycpu->gd_schedcpu_callout, hz, schedcpu, NULL);
  151 }
 152
  153 /*
  154  * General process statistics once a second
       *
       * allproc_scan() callback.  Bumps p_swtime, ages the sleep time of
       * every sleeping lwp in the process and either asks the userland
       * scheduler to recalculate the lwp or decays its lwp_pctcpu.
       *
       * Processes in SIDL state, and processes whose p_token cannot be
       * acquired without blocking, are skipped for this pass.  Always
       * returns 0.
  155  */
  156 static int
  157 schedcpu_stats(struct proc *p, void *data __unused)
  158 {
  159         struct lwp *lp;
  160
  161         /*
  162          * Threads may not be completely set up if process in SIDL state.
  163          */
  164         if (p->p_stat == SIDL)
  165                 return(0);
  166
              /*
               * Hold the process across the scan and use a non-blocking
               * token attempt; on failure drop the hold and skip it.
               */
  167         PHOLD(p);
  168         if (lwkt_trytoken(&p->p_token) == FALSE) {
  169                 PRELE(p);
  170                 return(0);
  171         }
  172
  173         p->p_swtime++;
  174         FOREACH_LWP_IN_PROC(lp, p) {
                      /*
                       * Count seconds asleep; notify the scheduler's load
                       * accounting the first time the count reaches 1.
                       */
  175                 if (lp->lwp_stat == LSSLEEP) {
  176                         ++lp->lwp_slptime;
  177                         if (lp->lwp_slptime == 1)
  178                                 p->p_usched->uload_update(lp);
  179                 }
  180
  181                 /*
  182                  * Only recalculate processes that are active or have slept
  183                  * less than 2 seconds.  The schedulers understand this.
  184                  * Otherwise decay lwp_pctcpu by 1/decay per pass, where
                       * decay is the kern.pctcpu_decay sysctl clamped to 1..100
                       * (default 10, i.e. a 10% reduction; 1 zeroes the value).
  185                  */
  186                 if (lp->lwp_slptime <= 1) {
  187                         p->p_usched->recalculate(lp);
  188                 } else {
  189                         int decay;
  190
                              /*
                               * Snapshot the sysctl once; the compiler fence
                               * keeps the clamp below working on the local
                               * copy rather than re-reading the variable.
                               */
  191                         decay = pctcpu_decay;
  192                         cpu_ccfence();
  193                         if (decay <= 1)
  194                                 decay = 1;
  195                         if (decay > 100)
  196                                 decay = 100;
  197                         lp->lwp_pctcpu = (lp->lwp_pctcpu * (decay - 1)) / decay;
  198                 }
  199         }
  200         lwkt_reltoken(&p->p_token);
  201         lwkt_yield();
  202         PRELE(p);
  203         return(0);
  204 }
 205
  206 /*
  207  * Resource checks.  XXX break out since ksignal/killproc can block,
  208  * limiting us to one process killed per second.  There is probably
  209  * a better way.
       *
       * allproc_scan() callback.  Sums td_sticks + td_uticks over every lwp
       * of the process and hands the total to plimit_testcpulimit(); the
       * process is killed or sent SIGXCPU according to the result.
       *
       * Processes in SIDL or SZOMB state, processes without a p_limit, and
       * processes whose p_token cannot be acquired without blocking are
       * skipped.  Always returns 0.
  210  */
  211 static int
  212 schedcpu_resource(struct proc *p, void *data __unused)
  213 {
  214         u_int64_t ttime;
  215         struct lwp *lp;
  216
  217         if (p->p_stat == SIDL)
  218                 return(0);
  219
  220         PHOLD(p);
  221         if (lwkt_trytoken(&p->p_token) == FALSE) {
  222                 PRELE(p);
  223                 return(0);
  224         }
  225
  226         if (p->p_stat == SZOMB || p->p_limit == NULL) {
  227                 lwkt_reltoken(&p->p_token);
  228                 PRELE(p);
  229                 return(0);
  230         }
  231
  232         ttime = 0;
  233         FOREACH_LWP_IN_PROC(lp, p) {
  234                 /*
  235                  * We may have caught an lp in the middle of being
  236                  * created, lwp_thread can be NULL.
  237                  */
  238                 if (lp->lwp_thread) {
  239                         ttime += lp->lwp_thread->td_sticks;
  240                         ttime += lp->lwp_thread->td_uticks;
  241                 }
  242         }
  243
  244         switch(plimit_testcpulimit(p->p_limit, ttime)) {
  245         case PLIMIT_TESTCPU_KILL:
  246                 killproc(p, "exceeded maximum CPU limit");
  247                 break;
  248         case PLIMIT_TESTCPU_XCPU:
                      /* P_XCPU latches so SIGXCPU is not re-sent while it is set */
  249                 if ((p->p_flags & P_XCPU) == 0) {
  250                         p->p_flags |= P_XCPU;
  251                         ksignal(p, SIGXCPU);
  252                 }
  253                 break;
  254         default:
  255                 break;
  256         }
  257         lwkt_reltoken(&p->p_token);
  258         lwkt_yield();
  259         PRELE(p);
  260         return(0);
  261 }
 262
  263 /*
  264  * This is only used by ps.  Generate a cpu percentage use over
  265  * a period of one second.
       *
       * lp       - lwp whose lwp_pctcpu is updated
       * cpticks  - ticks consumed during the sample
       * ttlticks - total ticks covered by the sample
       *
       * The sample ratio cpticks/ttlticks is computed in FSHIFT fixed point.
       * A sample covering at least ESTCPUFREQ ticks replaces lwp_pctcpu
       * outright; a shorter one is blended with the previous value, weighted
       * by ttlticks against the remaining (ESTCPUFREQ - ttlticks) ticks.
       *
       * NOTE(review): ttlticks is used as a divisor without a check here, so
       * callers presumably guarantee it is non-zero -- confirm.
  266  */
  267 void
  268 updatepcpu(struct lwp *lp, int cpticks, int ttlticks)
  269 {
  270         fixpt_t acc;
  271         int remticks;
  272
  273         acc = (cpticks << FSHIFT) / ttlticks;
  274         if (ttlticks >= ESTCPUFREQ) {
  275                 lp->lwp_pctcpu = acc;
  276         } else {
  277                 remticks = ESTCPUFREQ - ttlticks;
  278                 lp->lwp_pctcpu = (acc * ttlticks + lp->lwp_pctcpu * remticks) /
  279                                 ESTCPUFREQ;
  280         }
  281 }
 282
  283 /*
  284  * Handy macros to calculate hash indices.  LOOKUP() calculates the
  285  * global cpumask hash index, TCHASHSHIFT() converts that into the
  286  * pcpu hash index.
  287  *
  288  * By making the pcpu hash arrays smaller we save a significant amount
  289  * of memory at very low cost.  The real cost is in IPIs, which are handled
  290  * by the much larger global cpumask hash table.
       *
       * LOOKUP() truncates the ident pointer to u_int and reduces it modulo
       * slpque_tablesize; the result indexes slpque_cpumasks[].
       * TCHASHSHIFT() shifts that index right by 4, so 16 consecutive global
       * indices share one entry of a cpu's gd_tsleep_hash[].
       *
       * NOTE(review): slpque_tablesize must be non-zero before LOOKUP() is
       * used; it is not initialized in this part of the file -- presumably
       * done by sched_dyninit(), confirm.
  291  */
  292 #define LOOKUP(x)       (((u_int)(uintptr_t)(x)) % slpque_tablesize)
  293 #define TCHASHSHIFT(x)  ((x) >> 4)
  294
  295 static uint32_t slpque_tablesize;
  296 static cpumask_t *slpque_cpumasks;
 297
  298 /*
  299  * This is a dandy function that allows us to interlock tsleep/wakeup
  300  * operations with unspecified upper level locks, such as lockmgr locks,
  301  * simply by holding a critical section.  The sequence is:
  302  *
  303  *      (acquire upper level lock)
  304  *      tsleep_interlock(blah)
  305  *      (release upper level lock)
  306  *      tsleep(blah, ...)
  307  *
  308  * Basically this function queues us on the tsleep queue without actually
  309  * descheduling us.  When tsleep() is later called with PINTERLOCKED it
  310  * assumes the thread was already queued, otherwise it queues it there.
  311  *
  312  * Thus it is possible to receive the wakeup prior to going to sleep and
  313  * the race conditions are covered.
       *
       * gd    - globaldata of the current cpu; gd->gd_curthread is queued
       * ident - sleep identifier, recorded in td_wchan
       * flags - only the PDOMAIN_MASK bits are used, recorded in td_wdomain
       *
       * If the thread is still queued from an earlier interlock
       * (TDF_TSLEEPQ set) it is first removed from the queue of its old
       * td_wchan, and this cpu's bit in the old slpque_cpumasks[] entry is
       * cleared when that per-cpu queue becomes empty.
  314  */
  315 static __inline void
  316 _tsleep_interlock(globaldata_t gd, const volatile void *ident, int flags)
  317 {
  318         thread_t td = gd->gd_curthread;
  319         uint32_t cid;
  320         uint32_t gid;
  321
  322         crit_enter_quick(td);
  323         if (td->td_flags & TDF_TSLEEPQ) {
                      /* re-interlock: drop the stale queue entry first */
  324                 cid = LOOKUP(td->td_wchan);
  325                 gid = TCHASHSHIFT(cid);
  326                 TAILQ_REMOVE(&gd->gd_tsleep_hash[gid], td, td_sleepq);
  327                 if (TAILQ_FIRST(&gd->gd_tsleep_hash[gid]) == NULL) {
  328                         ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid],
  329                                                gd->gd_cpuid);
  330                 }
  331         } else {
  332                 td->td_flags |= TDF_TSLEEPQ;
  333         }
              /* queue on the new ident and advertise this cpu in its mask */
  334         cid = LOOKUP(ident);
  335         gid = TCHASHSHIFT(cid);
  336         TAILQ_INSERT_TAIL(&gd->gd_tsleep_hash[gid], td, td_sleepq);
  337         ATOMIC_CPUMASK_ORBIT(slpque_cpumasks[cid], gd->gd_cpuid);
  338         td->td_wchan = ident;
  339         td->td_wdomain = flags & PDOMAIN_MASK;
  340         crit_exit_quick(td);
  341 }
 342
  343 void
  344 tsleep_interlock(const volatile void *ident, int flags)
  345 {
              /*
               * Non-inline entry point: queue the current thread of the
               * current cpu (mycpu) on ident's sleep queue.  A later tsleep()
               * on the same ident should pass PINTERLOCKED.
               */
  346         _tsleep_interlock(mycpu, ident, flags);
  347 }
 348
  349 /*
  350  * Remove thread from sleepq.  Must be called with a critical section held.
  351  * The thread must not be migrating.
       *
       * td must belong to the current cpu (asserted).  If td is not on a
       * sleep queue (TDF_TSLEEPQ clear) this is a no-op.  Otherwise the
       * thread is unlinked from the per-cpu queue selected by td_wchan, this
       * cpu's bit in the matching slpque_cpumasks[] entry is cleared when
       * that queue becomes empty, and td_wchan/td_wdomain are reset.
  352  */
  353 static __inline void
  354 _tsleep_remove(thread_t td)
  355 {
  356         globaldata_t gd = mycpu;
  357         uint32_t cid;
  358         uint32_t gid;
  359
  360         KKASSERT(td->td_gd == gd && IN_CRITICAL_SECT(td));
  361         KKASSERT((td->td_flags & TDF_MIGRATING) == 0);
  362         if (td->td_flags & TDF_TSLEEPQ) {
  363                 td->td_flags &= ~TDF_TSLEEPQ;
  364                 cid = LOOKUP(td->td_wchan);
  365                 gid = TCHASHSHIFT(cid);
  366                 TAILQ_REMOVE(&gd->gd_tsleep_hash[gid], td, td_sleepq);
  367                 if (TAILQ_FIRST(&gd->gd_tsleep_hash[gid]) == NULL) {
  368                         ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid],
  369                                                gd->gd_cpuid);
  370                 }
  371                 td->td_wchan = NULL;
  372                 td->td_wdomain = 0;
  373         }
  374 }
 375
  376 void
  377 tsleep_remove(thread_t td)
  378 {
              /*
               * Non-inline entry point for _tsleep_remove(); the same
               * requirements apply (critical section held, td on the
               * current cpu and not migrating).
               */
  379         _tsleep_remove(td);
  380 }
 381
  382 /*
  383  * General sleep call.  Suspends the current process until a wakeup is
  384  * performed on the specified identifier.  The process will then be made
  385  * runnable with the specified priority.  Sleeps at most timo/hz seconds
  386  * (0 means no timeout).  If flags includes PCATCH flag, signals are checked
  387  * before and after sleeping, else signals are not checked.  Returns 0 if
  388  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
  389  * signal needs to be delivered, ERESTART is returned if the current system
  390  * call should be restarted if possible, and EINTR is returned if the system
  391  * call should be interrupted by the signal.
  392  *
  393  * Note that if we are a process, we release_curproc() before messing with
  394  * the LWKT scheduler.
  395  *
  396  * During autoconfiguration or after a panic, a sleep will simply
  397  * lower the priority briefly to allow interrupts, then return.
  398  *
  399  * WARNING!  This code can't block (short of switching away), or bad things
  400  *           will happen.  No getting tokens, no blocking locks, etc.
       *
       * ident - sleep identifier, must not be NULL (asserted)
       * flags - PCATCH enables the signal checks; PINTERLOCKED means the
       *         caller already queued the thread with tsleep_interlock();
       *         the PDOMAIN_MASK bits are recorded by the interlock
       * wmesg - wait message stored in td_wmesg while descheduled
       * timo  - timeout in ticks, 0 for none
       *
       * The post-sleep signal check only runs when no error has been set,
       * so an expired timeout (EWOULDBLOCK) takes precedence over a signal.
  401  */
  402 int
  403 tsleep(const volatile void *ident, int flags, const char *wmesg, int timo)
  404 {
  405         struct thread *td = curthread;
  406         struct lwp *lp = td->td_lwp;
  407         struct proc *p = td->td_proc;           /* may be NULL */
  408         globaldata_t gd;
  409         int sig;
  410         int catch;
  411         int error;
  412         int oldpri;
  413         struct callout thandle;
  414
  415         /*
  416          * Currently a severe hack.  Make sure any delayed wakeups
  417          * are flushed before we sleep or we might deadlock on whatever
  418          * event we are sleeping on.
  419          */
  420         if (td->td_flags & TDF_DELAYED_WAKEUP)
  421                 wakeup_end_delayed();
  422
  423         /*
  424          * NOTE: removed KTRPOINT, it could cause races due to blocking
  425          * even in stable.  Just scrap it for now.
  426          */
  427         if (!tsleep_crypto_dump && (tsleep_now_works == 0 || panicstr)) {
  428                 /*
  429                  * After a panic, or before we actually have an operational
  430                  * softclock, just give interrupts a chance, then just return;
  431                  *
  432                  * don't run any other procs or panic below,
  433                  * in case this is the idle process and already asleep.
  434                  */
  435                 splz();
  436                 oldpri = td->td_pri;
  437                 lwkt_setpri_self(safepri);
  438                 lwkt_switch();
  439                 lwkt_setpri_self(oldpri);
  440                 return (0);
  441         }
  442         logtsleep2(tsleep_beg, ident);
  443         gd = td->td_gd;
  444         KKASSERT(td != &gd->gd_idlethread);     /* you must be kidding! */
  445         td->td_wakefromcpu = -1;                /* overwritten by _wakeup */
  446
  447         /*
  448          * NOTE: all of this occurs on the current cpu, including any
  449          * callout-based wakeups, so a critical section is a sufficient
  450          * interlock.
  451          *
  452          * The entire sequence through to where we actually sleep must
  453          * run without breaking the critical section.
  454          */
  455         catch = flags & PCATCH;
  456         error = 0;
  457         sig = 0;
  458
  459         crit_enter_quick(td);
  460
  461         KASSERT(ident != NULL, ("tsleep: no ident"));
  462         KASSERT(lp == NULL ||
  463                 lp->lwp_stat == LSRUN ||        /* Obvious */
  464                 lp->lwp_stat == LSSTOP,         /* Set in tstop */
  465                 ("tsleep %p %s %d",
  466                         ident, wmesg, lp->lwp_stat));
  467
  468         /*
  469          * We interlock the sleep queue if the caller has not already done
  470          * it for us.  This must be done before we potentially acquire any
  471          * tokens or we can lose the wakeup.
  472          */
  473         if ((flags & PINTERLOCKED) == 0) {
  474                 _tsleep_interlock(gd, ident, flags);
  475         }
  476
  477         /*
  478          * Setup for the current process (if this is a process).  We must
  479          * interlock with lwp_token to avoid remote wakeup races via
  480          * setrunnable()
  481          */
  482         if (lp) {
  483                 lwkt_gettoken(&lp->lwp_token);
  484
  485                 /*
  486                  * If the umbrella process is in the SCORE state then
  487                  * make sure that the thread is flagged going into a
  488                  * normal sleep to allow the core dump to proceed, otherwise
  489                  * the coredump can end up waiting forever.  If the normal
  490                  * sleep is woken up, the thread will enter a stopped state
  491                  * upon return to userland.
  492                  *
  493                  * We do not want to interrupt or cause a thread exit at
  494                  * this juncture because that will mess-up the state the
  495                  * coredump is trying to save.
  496                  */
  497                 if (p->p_stat == SCORE &&
  498                     (lp->lwp_mpflags & LWP_MP_WSTOP) == 0) {
  499                         atomic_set_int(&lp->lwp_mpflags, LWP_MP_WSTOP);
  500                         ++p->p_nstopped;
  501                 }
  502
  503                 /*
  504                  * PCATCH requested.
  505                  */
  506                 if (catch) {
  507                         /*
  508                          * Early termination if PCATCH was set and a
  509                          * signal is pending, interlocked with the
  510                          * critical section.
  511                          *
  512                          * Early termination only occurs when tsleep() is
  513                          * entered while in a normal LSRUN state.
  514                          */
  515                         if ((sig = CURSIG(lp)) != 0)
  516                                 goto resume;
  517
  518                         /*
  519                          * Causes ksignal to wake us up if a signal is
  520                          * received (interlocked with lp->lwp_token).
  521                          */
  522                         lp->lwp_flags |= LWP_SINTR;
  523                 }
  524         } else {
  525                 KKASSERT(p == NULL);
  526         }
  527
  528         /*
  529          * Make sure the current process has been untangled from
  530          * the userland scheduler and initialize slptime to start
  531          * counting.
  532          *
  533          * NOTE: td->td_wakefromcpu is pre-set by the release function
  534          *       for the dfly scheduler, and then adjusted by _wakeup()
  535          */
  536         if (lp) {
  537                 p->p_usched->release_curproc(lp);
  538                 lp->lwp_slptime = 0;
  539         }
  540
  541         /*
  542          * If the interlocked flag is set but our cpu bit in the slpqueue
  543          * is no longer set, then a wakeup was processed in between the
  544          * tsleep_interlock() (ours or the callers), and here.  This can
  545          * occur under numerous circumstances including when we release the
  546          * current process.
  547          *
  548          * Extreme loads can cause the sending of an IPI (e.g. wakeup()'s)
  549          * to process incoming IPIs, thus draining incoming wakeups.
  550          */
  551         if ((td->td_flags & TDF_TSLEEPQ) == 0) {
  552                 logtsleep2(ilockfail, ident);
  553                 goto resume;
  554         }
  555
  556         /*
  557          * scheduling is blocked while in a critical section.  Coincide
  558          * the descheduled-by-tsleep flag with the descheduling of the
  559          * lwkt.
  560          *
  561          * The timer callout is localized on our cpu and interlocked by
  562          * our critical section.
  563          */
  564         lwkt_deschedule_self(td);
  565         td->td_flags |= TDF_TSLEEP_DESCHEDULED;
  566         td->td_wmesg = wmesg;
  567
  568         /*
  569          * Setup the timeout, if any.  The timeout is only operable while
  570          * the thread is flagged descheduled.
  571          */
  572         KKASSERT((td->td_flags & TDF_TIMEOUT) == 0);
  573         if (timo) {
                      /* thandle lives on this stack frame; it is stopped below */
  574                 callout_init_mp(&thandle);
  575                 callout_reset(&thandle, timo, endtsleep, td);
  576         }
  577
  578         /*
  579          * Beddy bye bye.
  580          */
  581         if (lp) {
  582                 /*
  583                  * Ok, we are sleeping.  Place us in the SSLEEP state.
  584                  */
  585                 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
  586
  587                 /*
  588                  * tstop() sets LSSTOP, so don't fiddle with that.
  589                  */
  590                 if (lp->lwp_stat != LSSTOP)
  591                         lp->lwp_stat = LSSLEEP;
  592                 lp->lwp_ru.ru_nvcsw++;
  593                 p->p_usched->uload_update(lp);
  594                 lwkt_switch();
  595
  596                 /*
  597                  * And when we are woken up, put us back in LSRUN.  If we
  598                  * slept for over a second, recalculate our estcpu.
  599                  */
  600                 lp->lwp_stat = LSRUN;
  601                 if (lp->lwp_slptime) {
  602                         p->p_usched->uload_update(lp);
  603                         p->p_usched->recalculate(lp);
  604                 }
  605                 lp->lwp_slptime = 0;
  606         } else {
  607                 lwkt_switch();
  608         }
  609
  610         /*
  611          * Make sure we haven't switched cpus while we were asleep.  It's
  612          * not supposed to happen.  Cleanup our temporary flags.
  613          */
  614         KKASSERT(gd == td->td_gd);
  615
  616         /*
  617          * Cleanup the timeout.  If the timeout has already occurred thandle
  618          * has already been stopped, otherwise stop thandle.  If the timeout
  619          * is running (the callout thread must be blocked trying to get
  620          * lwp_token) then wait for us to get scheduled.
  621          */
  622         if (timo) {
  623                 while (td->td_flags & TDF_TIMEOUT_RUNNING) {
  624                         /* else we won't get rescheduled! */
                              /*
                               * NOTE(review): lp is dereferenced here without
                               * a NULL check.  endtsleep() only acquires (and
                               * can block on) lwp_token when td_lwp is
                               * non-NULL, so presumably this loop body is
                               * never entered for a thread without an lwp --
                               * confirm.
                               */
  625                         if (lp->lwp_stat != LSSTOP)
  626                                 lp->lwp_stat = LSSLEEP;
  627                         lwkt_deschedule_self(td);
  628                         td->td_wmesg = "tsrace";
  629                         lwkt_switch();
  630                         kprintf("td %p %s: timeout race\n", td, td->td_comm);
  631                 }
  632                 if (td->td_flags & TDF_TIMEOUT) {
  633                         td->td_flags &= ~TDF_TIMEOUT;
  634                         error = EWOULDBLOCK;
  635                 } else {
  636                         /* does not block when on same cpu */
  637                         callout_stop(&thandle);
  638                 }
  639         }
  640         td->td_flags &= ~TDF_TSLEEP_DESCHEDULED;
  641
  642         /*
  643          * Make sure we have been removed from the sleepq.  In most
  644          * cases this will have been done for us already but it is
  645          * possible for a scheduling IPI to be in-flight from a
  646          * previous tsleep/tsleep_interlock() or due to a straight-out
  647          * call to lwkt_schedule() (in the case of an interrupt thread),
  648          * causing a spurious wakeup.
  649          */
  650         _tsleep_remove(td);
  651         td->td_wmesg = NULL;
  652
  653         /*
  654          * Figure out the correct error return.  If interrupted by a
  655          * signal we want to return EINTR or ERESTART.
  656          */
  657 resume:
  658         if (lp) {
  659                 if (catch && error == 0) {
  660                         if (sig != 0 || (sig = CURSIG(lp))) {
  661                                 if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
  662                                         error = EINTR;
  663                                 else
  664                                         error = ERESTART;
  665                         }
  666                 }
  667
  668                 lp->lwp_flags &= ~LWP_SINTR;
  669
  670                 /*
  671                  * Unconditionally set us to LSRUN on resume.  lwp_stat could
  672                  * be in a weird state due to the goto resume, particularly
  673                  * when tsleep() is called from tstop().
  674                  */
  675                 lp->lwp_stat = LSRUN;
  676                 lwkt_reltoken(&lp->lwp_token);
  677         }
  678         logtsleep1(tsleep_end);
  679         crit_exit_quick(td);
  680         return (error);
  681 }
 682
  683 /*
  684  * Interlocked spinlock sleep.  An exclusively held spinlock must
  685  * be passed to ssleep().  The function will atomically release the
  686  * spinlock and tsleep on the ident, then reacquire the spinlock and
  687  * return.
  688  *
  689  * This routine is fairly important along the critical path, so optimize it
  690  * heavily.
       *
       * The thread is queued on ident before the spinlock is dropped, and
       * tsleep() is told so via PINTERLOCKED, which is what closes the
       * window between the unlock and the sleep.  Returns tsleep()'s result;
       * the spinlock is reacquired regardless of that result.
  691  */
  692 int
  693 ssleep(const volatile void *ident, struct spinlock *spin, int flags,
  694        const char *wmesg, int timo)
  695 {
  696         globaldata_t gd = mycpu;
  697         int error;
  698
  699         _tsleep_interlock(gd, ident, flags);
  700         spin_unlock_quick(gd, spin);
  701         error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
  702         _spin_lock_quick(gd, spin, wmesg);
  703
  704         return (error);
  705 }
 706
  707 int
  708 lksleep(const volatile void *ident, struct lock *lock, int flags,
  709         const char *wmesg, int timo)
  710 {
              /*
               * Interlocked lockmgr-lock sleep.  Queue the thread on ident,
               * release 'lock' with LK_RELEASE, sleep with PINTERLOCKED so
               * a wakeup arriving after the release is not missed, then
               * reacquire the lock with LK_EXCLUSIVE.  Returns tsleep()'s
               * result; the lock is reacquired regardless of that result.
               */
  711         globaldata_t gd = mycpu;
  712         int error;
  713
  714         _tsleep_interlock(gd, ident, flags);
  715         lockmgr(lock, LK_RELEASE);
  716         error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
  717         lockmgr(lock, LK_EXCLUSIVE);
  718
  719         return (error);
  720 }
 721
  722 /*
  723  * Interlocked mutex sleep.  An exclusively held mutex must be passed
  724  * to mtxsleep().  The function will atomically release the mutex
  725  * and tsleep on the ident, then reacquire the mutex and return.
       *
       * The thread is queued on ident before mtx_unlock() and tsleep() is
       * called with PINTERLOCKED.  The mutex is reacquired with
       * mtx_lock_ex_quick() regardless of tsleep()'s result, which is
       * returned to the caller.
  726  */
  727 int
  728 mtxsleep(const volatile void *ident, struct mtx *mtx, int flags,
  729          const char *wmesg, int timo)
  730 {
  731         globaldata_t gd = mycpu;
  732         int error;
  733
  734         _tsleep_interlock(gd, ident, flags);
  735         mtx_unlock(mtx);
  736         error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
  737         mtx_lock_ex_quick(mtx);
  738
  739         return (error);
  740 }
 741
  742 /*
  743  * Interlocked serializer sleep.  An exclusively held serializer must
  744  * be passed to zsleep().  The function will atomically release
  745  * the serializer and tsleep on the ident, then reacquire the serializer
  746  * and return.
       *
       * Ownership of the serializer is asserted on entry.  The thread is
       * queued on ident before lwkt_serialize_exit() and tsleep() is called
       * with PINTERLOCKED.  Returns tsleep()'s result; the serializer is
       * re-entered regardless of that result.
  747  */
  748 int
  749 zsleep(const volatile void *ident, struct lwkt_serialize *slz, int flags,
  750        const char *wmesg, int timo)
  751 {
  752         globaldata_t gd = mycpu;
  753         int ret;
  754
  755         ASSERT_SERIALIZED(slz);
  756
  757         _tsleep_interlock(gd, ident, flags);
  758         lwkt_serialize_exit(slz);
  759         ret = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
  760         lwkt_serialize_enter(slz);
  761
  762         return ret;
  763 }
 764
  765 /*
  766  * Directly block on the LWKT thread by descheduling it.  This
  767  * is much faster than tsleep(), but the only legal way to wake
  768  * us up is to directly schedule the thread.
  769  *
  770  * Setting TDF_SINTR will cause new signals to directly schedule us.
  771  *
  772  * This routine must be called while in a critical section.
       *
       * wmesg - wait message stored in td_wmesg while blocked
       * flags - only PCATCH is examined
       *
       * Without PCATCH, or for a thread that has no lwp, the thread simply
       * blocks with TDF_BLOCKED set and 0 is returned once it runs again.
       * With PCATCH, a signal already pending on entry returns EINTR or
       * ERESTART (chosen by ps_sigintr membership) without blocking;
       * otherwise the thread blocks with TDF_BLOCKED | TDF_SINTR and 0 is
       * returned after it is rescheduled.  Signals are not re-checked after
       * the thread resumes.
  773  */
  774 int
  775 lwkt_sleep(const char *wmesg, int flags)
  776 {
  777         thread_t td = curthread;
  778         int sig;
  779
  780         if ((flags & PCATCH) == 0 || td->td_lwp == NULL) {
  781                 td->td_flags |= TDF_BLOCKED;
  782                 td->td_wmesg = wmesg;
  783                 lwkt_deschedule_self(td);
  784                 lwkt_switch();
  785                 td->td_wmesg = NULL;
  786                 td->td_flags &= ~TDF_BLOCKED;
  787                 return(0);
  788         }
  789         if ((sig = CURSIG(td->td_lwp)) != 0) {
  790                 if (SIGISMEMBER(td->td_proc->p_sigacts->ps_sigintr, sig))
  791                         return(EINTR);
  792                 else
  793                         return(ERESTART);
  794
  795         }
  796         td->td_flags |= TDF_BLOCKED | TDF_SINTR;
  797         td->td_wmesg = wmesg;
  798         lwkt_deschedule_self(td);
  799         lwkt_switch();
  800         td->td_flags &= ~(TDF_BLOCKED | TDF_SINTR);
  801         td->td_wmesg = NULL;
  802         return(0);
  803 }
 804
  805 /*
  806  * Implement the timeout for tsleep.
  807  *
  808  * This type of callout timeout is scheduled on the same cpu the process
  809  * is sleeping on.  Also, at the moment, the MP lock is held.
       *
       * arg is the sleeping thread, as passed to callout_reset() by
       * tsleep().  TDF_TIMEOUT is left set so that tsleep() returns
       * EWOULDBLOCK; TDF_TIMEOUT_RUNNING is set only for the duration of
       * this function so that tsleep() waits for it to finish.
  810  */
  811 static void
  812 endtsleep(void *arg)
  813 {
  814         thread_t td = arg;
  815         struct lwp *lp;
  816
  817         /*
  818          * We are going to have to get the lwp_token, which means we might
  819          * block.  This can race a tsleep getting woken up by other means
  820          * so set TDF_TIMEOUT_RUNNING to force the tsleep to wait for our
  821          * processing to complete (sorry tsleep!).
  822          *
  823          * We can safely set td_flags because td MUST be on the same cpu
  824          * as we are.
  825          */
  826         KKASSERT(td->td_gd == mycpu);
  827         crit_enter();
  828         td->td_flags |= TDF_TIMEOUT_RUNNING | TDF_TIMEOUT;
  829
  830         /*
  831          * This can block but TDF_TIMEOUT_RUNNING will prevent the thread
  832          * from exiting the tsleep on us.  The flag is interlocked by virtue
  833          * of lp being on the same cpu as we are.
  834          */
  835         if ((lp = td->td_lwp) != NULL)
  836                 lwkt_gettoken(&lp->lwp_token);
  837
  838         KKASSERT(td->td_flags & TDF_TSLEEP_DESCHEDULED);
  839
  840         if (lp) {
  841                 /*
  842                  * callout timer should normally never be set in tstop()
  843                  * because it passes a timeout of 0.  However, there is a
  844                  * case during thread exit (which SSTOP's all the threads)
  845                  * for which tstop() must break out and can (properly) leave
  846                  * the thread in LSSTOP.
  847                  */
  848                 KKASSERT(lp->lwp_stat != LSSTOP ||
  849                          (lp->lwp_mpflags & LWP_MP_WEXIT));
  850                 setrunnable(lp);
  851                 lwkt_reltoken(&lp->lwp_token);
  852         } else {
                      /* thread without an lwp: dequeue and schedule it directly */
  853                 _tsleep_remove(td);
  854                 lwkt_schedule(td);
  855         }
  856         KKASSERT(td->td_gd == mycpu);
  857         td->td_flags &= ~TDF_TIMEOUT_RUNNING;
  858         crit_exit();
  859 }
 860
 861 /*
 862  * Make all processes sleeping on the specified identifier runnable.
 863  * count may be zero or one only.
 864  *
 865  * The domain encodes the sleep/wakeup domain, flags, plus the originating
 866  * cpu.
 867  *
 868  * This call may run without the MP lock held.  We can only manipulate thread
 869  * state on the cpu owning the thread.  We CANNOT manipulate process state
 870  * at all.
 871  *
 872  * _wakeup() can be passed to an IPI so we can't use (const volatile
 873  * void *ident).
 874  */
 875 static void
 876 _wakeup(void *ident, int domain)
 877 {
 878         struct tslpque *qp;
 879         struct thread *td;
 880         struct thread *ntd;
 881         globaldata_t gd;
 882         cpumask_t mask;
 883         uint32_t cid;
 884         uint32_t gid;
 885
 886         crit_enter();
 887         logtsleep2(wakeup_beg, ident);
 888         gd = mycpu;
 889         cid = LOOKUP(ident);
 890         gid = TCHASHSHIFT(cid);
 891         qp = &gd->gd_tsleep_hash[gid];
 892 restart:
 893         for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) {
 894                 ntd = TAILQ_NEXT(td, td_sleepq);
 895                 if (td->td_wchan == ident &&
 896                     td->td_wdomain == (domain & PDOMAIN_MASK)
 897                 ) {
 898                         KKASSERT(td->td_gd == gd);
 899                         _tsleep_remove(td);
 900                         td->td_wakefromcpu = PWAKEUP_DECODE(domain);
 901                         if (td->td_flags & TDF_TSLEEP_DESCHEDULED) {
 902                                 lwkt_schedule(td);
 903                                 if (domain & PWAKEUP_ONE)
 904                                         goto done;
 905                         }
 906                         goto restart;
 907                 }
 908         }
 909
 910         /*
 911          * Because a bunch of cpumask array entries cover the same queue, it
 912          * is possible for our bit to remain set in some of them and cause
 913          * spurious wakeup IPIs later on.  Make sure that the bit is cleared
 914          * when a spurious IPI occurs to prevent further spurious IPIs.
 915          */
 916         if (TAILQ_FIRST(qp) == NULL) {
 917                 ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid], gd->gd_cpuid);
 918         }
 919
 920         /*
 921          * We finished checking the current cpu but there still may be
 922          * more work to do.  Either wakeup_one was requested and no matching
 923          * thread was found, or a normal wakeup was requested and we have
 924          * to continue checking cpus.
 925          *
 926          * It should be noted that this scheme is actually less expensive then
 927          * the old scheme when waking up multiple threads, since we send
 928          * only one IPI message per target candidate which may then schedule
 929          * multiple threads.  Before we could have wound up sending an IPI
 930          * message for each thread on the target cpu (!= current cpu) that
 931          * needed to be woken up.
 932          *
 933          * NOTE: Wakeups occuring on remote cpus are asynchronous.  This
 934          * should be ok since we are passing idents in the IPI rather then
 935          * thread pointers.
 936          */
 937         if ((domain & PWAKEUP_MYCPU) == 0) {
 938                 mask = slpque_cpumasks[cid];
 939                 CPUMASK_ANDMASK(mask, gd->gd_other_cpus);
 940                 if (CPUMASK_TESTNZERO(mask)) {
 941                         lwkt_send_ipiq2_mask(mask, _wakeup, ident,
 942                                              domain | PWAKEUP_MYCPU);
 943                 }
 944         }
 945 done:
 946         logtsleep1(wakeup_end);
 947         crit_exit();
 948 }
 949
 950 /*
 951  * Wakeup all threads tsleep()ing on the specified ident, on all cpus
 952  */
 953 void
 954 wakeup(const volatile void *ident)
 955 {
 956     globaldata_t gd = mycpu;
 957     thread_t td = gd->gd_curthread;
 958
 959     if (td && (td->td_flags & TDF_DELAYED_WAKEUP)) {
 960         /*
 961          * If we are in a delayed wakeup section, record up to two wakeups in
 962          * a per-CPU queue and issue them when we block or exit the delayed
 963          * wakeup section.
 964          */
 965         if (atomic_cmpset_ptr(&gd->gd_delayed_wakeup[0], NULL, ident))
 966                 return;
 967         if (atomic_cmpset_ptr(&gd->gd_delayed_wakeup[1], NULL, ident))
 968                 return;
 969
 970         ident = atomic_swap_ptr(__DEQUALIFY(volatile void **, &gd->gd_delayed_wakeup[1]),
 971                                 __DEALL(ident));
 972         ident = atomic_swap_ptr(__DEQUALIFY(volatile void **, &gd->gd_delayed_wakeup[0]),
 973                                 __DEALL(ident));
 974     }
 975
 976     _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, gd->gd_cpuid));
 977 }
 978
 979 /*
 980  * Wakeup one thread tsleep()ing on the specified ident, on any cpu.
 981  */
 982 void
 983 wakeup_one(const volatile void *ident)
 984 {
 985     /* XXX potentially round-robin the first responding cpu */
 986     _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
 987                             PWAKEUP_ONE);
 988 }
 989
 990 /*
 991  * Wakeup threads tsleep()ing on the specified ident on the current cpu
 992  * only.
 993  */
 994 void
 995 wakeup_mycpu(const volatile void *ident)
 996 {
 997     _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
 998                             PWAKEUP_MYCPU);
 999 }
1000
1001 /*
1002  * Wakeup one thread tsleep()ing on the specified ident on the current cpu
1003  * only.
1004  */
1005 void
1006 wakeup_mycpu_one(const volatile void *ident)
1007 {
1008     /* XXX potentially round-robin the first responding cpu */
1009     _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
1010                             PWAKEUP_MYCPU | PWAKEUP_ONE);
1011 }
1012
1013 /*
1014  * Wakeup all thread tsleep()ing on the specified ident on the specified cpu
1015  * only.
1016  */
1017 void
1018 wakeup_oncpu(globaldata_t gd, const volatile void *ident)
1019 {
1020     globaldata_t mygd = mycpu;
1021     if (gd == mycpu) {
1022         _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
1023                                 PWAKEUP_MYCPU);
1024     } else {
1025         lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident),
1026                         PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
1027                         PWAKEUP_MYCPU);
1028     }
1029 }
1030
1031 /*
1032  * Wakeup one thread tsleep()ing on the specified ident on the specified cpu
1033  * only.
1034  */
1035 void
1036 wakeup_oncpu_one(globaldata_t gd, const volatile void *ident)
1037 {
1038     globaldata_t mygd = mycpu;
1039     if (gd == mygd) {
1040         _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
1041                                 PWAKEUP_MYCPU | PWAKEUP_ONE);
1042     } else {
1043         lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident),
1044                         PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
1045                         PWAKEUP_MYCPU | PWAKEUP_ONE);
1046     }
1047 }
1048
1049 /*
1050  * Wakeup all threads waiting on the specified ident that slept using
1051  * the specified domain, on all cpus.
1052  */
1053 void
1054 wakeup_domain(const volatile void *ident, int domain)
1055 {
1056     _wakeup(__DEALL(ident), PWAKEUP_ENCODE(domain, mycpu->gd_cpuid));
1057 }
1058
1059 /*
1060  * Wakeup one thread waiting on the specified ident that slept using
1061  * the specified  domain, on any cpu.
1062  */
1063 void
1064 wakeup_domain_one(const volatile void *ident, int domain)
1065 {
1066     /* XXX potentially round-robin the first responding cpu */
1067     _wakeup(__DEALL(ident),
1068             PWAKEUP_ENCODE(domain, mycpu->gd_cpuid) | PWAKEUP_ONE);
1069 }
1070
1071 void
1072 wakeup_start_delayed(void)
1073 {
1074     globaldata_t gd = mycpu;
1075
1076     crit_enter();
1077     gd->gd_curthread->td_flags |= TDF_DELAYED_WAKEUP;
1078     crit_exit();
1079 }
1080
1081 void
1082 wakeup_end_delayed(void)
1083 {
1084     globaldata_t gd = mycpu;
1085
1086     if (gd->gd_curthread->td_flags & TDF_DELAYED_WAKEUP) {
1087         crit_enter();
1088         gd->gd_curthread->td_flags &= ~TDF_DELAYED_WAKEUP;
1089         if (gd->gd_delayed_wakeup[0] || gd->gd_delayed_wakeup[1]) {
1090             if (gd->gd_delayed_wakeup[0]) {
1091                     wakeup(gd->gd_delayed_wakeup[0]);
1092                     gd->gd_delayed_wakeup[0] = NULL;
1093             }
1094             if (gd->gd_delayed_wakeup[1]) {
1095                     wakeup(gd->gd_delayed_wakeup[1]);
1096                     gd->gd_delayed_wakeup[1] = NULL;
1097             }
1098         }
1099         crit_exit();
1100     }
1101 }
1102
1103 /*
1104  * setrunnable()
1105  *
1106  * Make a process runnable.  lp->lwp_token must be held on call and this
1107  * function must be called from the cpu owning lp.
1108  *
1109  * This only has an effect if we are in LSSTOP or LSSLEEP.
1110  */
1111 void
1112 setrunnable(struct lwp *lp)
1113 {
1114         thread_t td = lp->lwp_thread;
1115
1116         ASSERT_LWKT_TOKEN_HELD(&lp->lwp_token);
1117         KKASSERT(td->td_gd == mycpu);
1118         crit_enter();
1119         if (lp->lwp_stat == LSSTOP)
1120                 lp->lwp_stat = LSSLEEP;
1121         if (lp->lwp_stat == LSSLEEP) {
1122                 _tsleep_remove(td);
1123                 lwkt_schedule(td);
1124         } else if (td->td_flags & TDF_SINTR) {
1125                 lwkt_schedule(td);
1126         }
1127         crit_exit();
1128 }
1129
1130 /*
1131  * The process is stopped due to some condition, usually because p_stat is
1132  * set to SSTOP, but also possibly due to being traced.
1133  *
1134  * Caller must hold p->p_token
1135  *
1136  * NOTE!  If the caller sets SSTOP, the caller must also clear P_WAITED
1137  * because the parent may check the child's status before the child actually
1138  * gets to this routine.
1139  *
1140  * This routine is called with the current lwp only, typically just
1141  * before returning to userland if the process state is detected as
1142  * possibly being in a stopped state.
1143  */
1144 void
1145 tstop(void)
1146 {
1147         struct lwp *lp = curthread->td_lwp;
1148         struct proc *p = lp->lwp_proc;
1149         struct proc *q;
1150
1151         lwkt_gettoken(&lp->lwp_token);
1152         crit_enter();
1153
1154         /*
1155          * If LWP_MP_WSTOP is set, we were sleeping
1156          * while our process was stopped.  At this point
1157          * we were already counted as stopped.
1158          */
1159         if ((lp->lwp_mpflags & LWP_MP_WSTOP) == 0) {
1160                 /*
1161                  * If we're the last thread to stop, signal
1162                  * our parent.
1163                  */
1164                 p->p_nstopped++;
1165                 atomic_set_int(&lp->lwp_mpflags, LWP_MP_WSTOP);
1166                 wakeup(&p->p_nstopped);
1167                 if (p->p_nstopped == p->p_nthreads) {
1168                         /*
1169                          * Token required to interlock kern_wait()
1170                          */
1171                         q = p->p_pptr;
1172                         PHOLD(q);
1173                         lwkt_gettoken(&q->p_token);
1174                         p->p_flags &= ~P_WAITED;
1175                         wakeup(p->p_pptr);
1176                         if ((q->p_sigacts->ps_flag & PS_NOCLDSTOP) == 0)
1177                                 ksignal(q, SIGCHLD);
1178                         lwkt_reltoken(&q->p_token);
1179                         PRELE(q);
1180                 }
1181         }
1182
1183         /*
1184          * Wait here while in a stopped state, interlocked with lwp_token.
1185          * We must break-out if the whole process is trying to exit.
1186          */
1187         while (STOPLWP(p, lp)) {
1188                 lp->lwp_stat = LSSTOP;
1189                 tsleep(p, 0, "stop", 0);
1190         }
1191         p->p_nstopped--;
1192         atomic_clear_int(&lp->lwp_mpflags, LWP_MP_WSTOP);
1193         crit_exit();
1194         lwkt_reltoken(&lp->lwp_token);
1195 }
1196
1197 /*
1198  * Compute a tenex style load average of a quantity on
1199  * 1, 5 and 15 minute intervals.  This is a pcpu callout.
1200  *
1201  * We segment the lwp scan on a pcpu basis.  This does NOT
1202  * mean the associated lwps are on this cpu, it is done
1203  * just to break the work up.
1204  *
1205  * The callout on cpu0 rolls up the stats from the other
1206  * cpus.
1207  */
1208 static int loadav_count_runnable(struct lwp *p, void *data);
1209
1210 static void
1211 loadav(void *arg)
1212 {
1213         globaldata_t gd = mycpu;
1214         struct loadavg *avg;
1215         int i, nrun;
1216
1217         nrun = 0;
1218         alllwp_scan(loadav_count_runnable, &nrun, 1);
1219         gd->gd_loadav_nrunnable = nrun;
1220         if (gd->gd_cpuid == 0) {
1221                 avg = &averunnable;
1222                 nrun = 0;
1223                 for (i = 0; i < ncpus; ++i)
1224                         nrun += globaldata_find(i)->gd_loadav_nrunnable;
1225                 for (i = 0; i < 3; i++) {
1226                         avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
1227                             (long)nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
1228                 }
1229         }
1230
1231         /*
1232          * Schedule the next update to occur after 5 seconds, but add a
1233          * random variation to avoid synchronisation with processes that
1234          * run at regular intervals.
1235          */
1236         callout_reset(&gd->gd_loadav_callout,
1237                       hz * 4 + (int)(krandom() % (hz * 2 + 1)),
1238                       loadav, NULL);
1239 }
1240
1241 static int
1242 loadav_count_runnable(struct lwp *lp, void *data)
1243 {
1244         int *nrunp = data;
1245         thread_t td;
1246
1247         switch (lp->lwp_stat) {
1248         case LSRUN:
1249                 if ((td = lp->lwp_thread) == NULL)
1250                         break;
1251                 if (td->td_flags & TDF_BLOCKED)
1252                         break;
1253                 ++*nrunp;
1254                 break;
1255         default:
1256                 break;
1257         }
1258         lwkt_yield();
1259         return(0);
1260 }
1261
1262 /*
1263  * Regular data collection
1264  */
1265 static uint64_t
1266 collect_load_callback(int n)
1267 {
1268         int fscale = averunnable.fscale;
1269
1270         return ((averunnable.ldavg[0] * 100 + (fscale >> 1)) / fscale);
1271 }
1272
1273 static void
1274 sched_setup(void *dummy __unused)
1275 {
1276         globaldata_t save_gd = mycpu;
1277         globaldata_t gd;
1278         int n;
1279
1280         kcollect_register(KCOLLECT_LOAD, "load", collect_load_callback,
1281                           KCOLLECT_SCALE(KCOLLECT_LOAD_FORMAT, 0));
1282
1283         /*
1284          * Kick off timeout driven events by calling first time.  We
1285          * split the work across available cpus to help scale it,
1286          * it can eat a lot of cpu when there are a lot of processes
1287          * on the system.
1288          */
1289         for (n = 0; n < ncpus; ++n) {
1290                 gd = globaldata_find(n);
1291                 lwkt_setcpu_self(gd);
1292                 callout_init_mp(&gd->gd_loadav_callout);
1293                 callout_init_mp(&gd->gd_schedcpu_callout);
1294                 schedcpu(NULL);
1295                 loadav(NULL);
1296         }
1297         lwkt_setcpu_self(save_gd);
1298 }
1299
1300 /*
1301  * Extremely early initialization, dummy-up the tables so we don't have
1302  * to conditionalize for NULL in _wakeup() and tsleep_interlock().  Even
1303  * though the system isn't blocking this early, these functions still
1304  * try to access the hash table.
1305  *
1306  * This setup will be overridden once sched_dyninit() -> sleep_gdinit()
1307  * is called.
1308  */
1309 void
1310 sleep_early_gdinit(globaldata_t gd)
1311 {
1312         static struct tslpque   dummy_slpque;
1313         static cpumask_t dummy_cpumasks;
1314
1315         slpque_tablesize = 1;
1316         gd->gd_tsleep_hash = &dummy_slpque;
1317         slpque_cpumasks = &dummy_cpumasks;
1318         TAILQ_INIT(&dummy_slpque);
1319 }
1320
1321 /*
1322  * PCPU initialization.  Called after KMALLOC is operational, by
1323  * sched_dyninit() for cpu 0, and by mi_gdinit() for other cpus later.
1324  *
1325  * WARNING! The pcpu hash table is smaller than the global cpumask
1326  *          hash table, which can save us a lot of memory when maxproc
1327  *          is set high.
1328  */
1329 void
1330 sleep_gdinit(globaldata_t gd)
1331 {
1332         struct thread *td;
1333         uint32_t n;
1334         uint32_t i;
1335
1336         /*
1337          * This shouldn't happen, that is there shouldn't be any threads
1338          * waiting on the dummy tsleep queue this early in the boot.
1339          */
1340         if (gd->gd_cpuid == 0) {
1341                 TAILQ_FOREACH(td, &gd->gd_tsleep_hash[0], td_sleepq) {
1342                         kprintf("SLEEP_GDINIT SWITCH %s\n", td->td_comm);
1343                 }
1344         }
1345
1346         /*
1347          * Note that we have to allocate one extra slot because we are
1348          * shifting a modulo value.  TCHASHSHIFT(slpque_tablesize - 1) can
1349          * return the same value as TCHASHSHIFT(slpque_tablesize).
1350          */
1351         n = TCHASHSHIFT(slpque_tablesize) + 1;
1352
1353         gd->gd_tsleep_hash = kmalloc(sizeof(struct tslpque) * n,
1354                                      M_TSLEEP, M_WAITOK | M_ZERO);
1355         for (i = 0; i < n; ++i)
1356                 TAILQ_INIT(&gd->gd_tsleep_hash[i]);
1357 }
1358
1359 /*
1360  * Dynamic initialization after the memory system is operational.
1361  */
1362 static void
1363 sched_dyninit(void *dummy __unused)
1364 {
1365         int tblsize;
1366         int tblsize2;
1367         int n;
1368
1369         /*
1370          * Calculate table size for slpque hash.  We want a prime number
1371          * large enough to avoid overloading slpque_cpumasks when the
1372          * system has a large number of sleeping processes, which will
1373          * spam IPIs on wakeup().
1374          *
1375          * While it is true this is really a per-lwp factor, generally
1376          * speaking the maxproc limit is a good metric to go by.
1377          */
1378         for (tblsize = maxproc | 1; ; tblsize += 2) {
1379                 if (tblsize % 3 == 0)
1380                         continue;
1381                 if (tblsize % 5 == 0)
1382                         continue;
1383                 tblsize2 = (tblsize / 2) | 1;
1384                 for (n = 7; n < tblsize2; n += 2) {
1385                         if (tblsize % n == 0)
1386                                 break;
1387                 }
1388                 if (n == tblsize2)
1389                         break;
1390         }
1391
1392         /*
1393          * PIDs are currently limited to 6 digits.  Cap the table size
1394          * at double this.
1395          */
1396         if (tblsize > 2000003)
1397                 tblsize = 2000003;
1398
1399         slpque_tablesize = tblsize;
1400         slpque_cpumasks = kmalloc(sizeof(*slpque_cpumasks) * slpque_tablesize,
1401                                   M_TSLEEP, M_WAITOK | M_ZERO);
1402         sleep_gdinit(mycpu);
1403 }