/*
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 * $FreeBSD: src/sys/kern/kern_synch.c,v 1.87.2.6 2002/10/13 07:29:53 kbyanc Exp $
 */
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <sys/kcollect.h>
#include <sys/ktrace.h>
#include <sys/serialize.h>

#include <sys/signal2.h>
#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <sys/mutex2.h>

#include <machine/cpu.h>
#include <machine/smp.h>

#include <vm/vm_extern.h>
struct tslpque {
        TAILQ_HEAD(, thread)    queue;
        const volatile void     *ident0;
        const volatile void     *ident1;
        const volatile void     *ident2;
        const volatile void     *ident3;
};
static void sched_setup (void *dummy);
SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL);
static void sched_dyninit (void *dummy);
SYSINIT(sched_dyninit, SI_BOOT1_DYNALLOC, SI_ORDER_FIRST, sched_dyninit, NULL);
int     ncpus_fit, ncpus_fit_mask;      /* note: mask not cpumask_t */
int     tsleep_crypto_dump = 0;

MALLOC_DEFINE(M_TSLEEP, "tslpque", "tsleep queues");

#define __DEALL(ident)  __DEQUALIFY(void *, ident)
#if !defined(KTR_TSLEEP)
#define KTR_TSLEEP      KTR_ALL
#endif
KTR_INFO_MASTER(tsleep);
KTR_INFO(KTR_TSLEEP, tsleep, tsleep_beg, 0, "tsleep enter %p", const volatile void *ident);
KTR_INFO(KTR_TSLEEP, tsleep, tsleep_end, 1, "tsleep exit");
KTR_INFO(KTR_TSLEEP, tsleep, wakeup_beg, 2, "wakeup enter %p", const volatile void *ident);
KTR_INFO(KTR_TSLEEP, tsleep, wakeup_end, 3, "wakeup exit");
KTR_INFO(KTR_TSLEEP, tsleep, ilockfail, 4, "interlock failed %p", const volatile void *ident);

#define logtsleep1(name)        KTR_LOG(tsleep_ ## name)
#define logtsleep2(name, val)   KTR_LOG(tsleep_ ## name, val)
struct loadavg averunnable =
        { {0, 0, 0}, FSCALE };  /* load average, of runnable procs */
/*
 * Constants for averages over 1, 5, and 15 minutes
 * when sampling at 5 second intervals.
 */
static fixpt_t cexp[3] = {
        0.9200444146293232 * FSCALE,   /* exp(-1/12) */
        0.9834714538216174 * FSCALE,   /* exp(-1/60) */
        0.9944598480048967 * FSCALE,   /* exp(-1/180) */
};
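/*
 * Illustrative note (added commentary, not part of the original code):
 * with a 5-second sampling interval an N-minute average spans N*60/5
 * samples, so the per-sample decay factor is exp(-5/(N*60)), i.e.
 * exp(-1/12), exp(-1/60) and exp(-1/180) for the 1, 5 and 15 minute
 * averages.  The update performed in loadav() below is, in FSCALE
 * fixed point:
 *
 *      avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
 *                       nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
 *
 * which is avg = avg * coeff + nrun * (1 - coeff).
 */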
static void     endtsleep (void *);
static void     loadav (void *arg);
static void     schedcpu (void *arg);

static int pctcpu_decay = 10;
SYSCTL_INT(_kern, OID_AUTO, pctcpu_decay, CTLFLAG_RW,
           &pctcpu_decay, 0, "");
/*
 * kernel uses `FSCALE', userland (SHOULD) use kern.fscale
 */
int fscale __unused = FSCALE;   /* exported to systat */
SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
/*
 * Issue a wakeup() from userland (debugging)
 */
sysctl_wakeup(SYSCTL_HANDLER_ARGS)

        if (req->newptr != NULL) {
                if (priv_check(curthread, PRIV_ROOT))
                error = SYSCTL_IN(req, &ident, sizeof(ident));
                kprintf("issue wakeup %016jx\n", ident);
                wakeup((void *)(intptr_t)ident);
        }
        if (req->oldptr != NULL) {
                error = SYSCTL_OUT(req, &ident, sizeof(ident));
        }
sysctl_wakeup_umtx(SYSCTL_HANDLER_ARGS)

        if (req->newptr != NULL) {
                if (priv_check(curthread, PRIV_ROOT))
                error = SYSCTL_IN(req, &ident, sizeof(ident));
                kprintf("issue wakeup %016jx, PDOMAIN_UMTX\n", ident);
                wakeup_domain((void *)(intptr_t)ident, PDOMAIN_UMTX);
        }
        if (req->oldptr != NULL) {
                error = SYSCTL_OUT(req, &ident, sizeof(ident));
        }
SYSCTL_PROC(_debug, OID_AUTO, wakeup, CTLTYPE_UQUAD | CTLFLAG_RW, 0, 0,
            sysctl_wakeup, "Q", "issue wakeup(addr)");
SYSCTL_PROC(_debug, OID_AUTO, wakeup_umtx, CTLTYPE_UQUAD | CTLFLAG_RW, 0, 0,
            sysctl_wakeup_umtx, "Q", "issue wakeup(addr, PDOMAIN_UMTX)");
/*
 * Recompute process priorities, once a second.
 *
 * Since the userland schedulers are typically event oriented, if the
 * estcpu calculation at wakeup() time is not sufficient to make a
 * process runnable relative to other processes in the system we have
 * a 1-second recalc to help out.
 *
 * This code also allows us to store sysclock_t data in the process structure
 * without fear of an overrun, since a sysclock_t is guaranteed to hold
 * several seconds' worth of count.
 *
 * WARNING! callouts can preempt normal threads.  However, they will not
 * preempt a thread holding a spinlock so we *can* safely use spinlocks.
 */
static int schedcpu_stats(struct proc *p, void *data __unused);
static int schedcpu_resource(struct proc *p, void *data __unused);

        allproc_scan(schedcpu_stats, NULL, 1);
        allproc_scan(schedcpu_resource, NULL, 1);
        if (mycpu->gd_cpuid == 0) {
                wakeup((caddr_t)&lbolt);
                wakeup(lbolt_syncer);
        }
        callout_reset(&mycpu->gd_schedcpu_callout, hz, schedcpu, NULL);
/*
 * General process statistics once a second
 */
schedcpu_stats(struct proc *p, void *data __unused)

        /*
         * Threads may not be completely set up if process in SIDL state.
         */
        if (p->p_stat == SIDL)

        if (lwkt_trytoken(&p->p_token) == FALSE) {

        FOREACH_LWP_IN_PROC(lp, p) {
                if (lp->lwp_stat == LSSLEEP) {
                        if (lp->lwp_slptime == 1)
                                p->p_usched->uload_update(lp);

                /*
                 * Only recalculate processes that are active or have slept
                 * less than 2 seconds.  The schedulers understand this.
                 * Otherwise decay lwp_pctcpu by 1/pctcpu_decay per second
                 * (10% per second with the default setting).
                 */
                if (lp->lwp_slptime <= 1) {
                        p->p_usched->recalculate(lp);

                decay = pctcpu_decay;
                lp->lwp_pctcpu = (lp->lwp_pctcpu * (decay - 1)) / decay;

        lwkt_reltoken(&p->p_token);
/*
 * Resource checks.  XXX break out since ksignal/killproc can block,
 * limiting us to one process killed per second.  There is probably
 * a better way to handle this.
 */
schedcpu_resource(struct proc *p, void *data __unused)

        if (p->p_stat == SIDL)

        if (lwkt_trytoken(&p->p_token) == FALSE) {

        if (p->p_stat == SZOMB || p->p_limit == NULL) {
                lwkt_reltoken(&p->p_token);

        FOREACH_LWP_IN_PROC(lp, p) {
                /*
                 * We may have caught an lp in the middle of being
                 * created, lwp_thread can be NULL.
                 */
                if (lp->lwp_thread) {
                        ttime += lp->lwp_thread->td_sticks;
                        ttime += lp->lwp_thread->td_uticks;

        switch(plimit_testcpulimit(p, ttime)) {
        case PLIMIT_TESTCPU_KILL:
                killproc(p, "exceeded maximum CPU limit");
        case PLIMIT_TESTCPU_XCPU:
                if ((p->p_flags & P_XCPU) == 0) {
                        p->p_flags |= P_XCPU;

        lwkt_reltoken(&p->p_token);
/*
 * This is only used by ps.  Generate a CPU percentage of use over
 * a period of one second.
 */
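/*
 * Illustrative note (added commentary, not part of the original code):
 * acc below is the fraction cpticks/ttlticks in FSCALE fixed point, so
 * for example 25 cpu ticks out of 100 total ticks gives acc = FSCALE/4,
 * i.e. 25%.  When the sample window is shorter than ESTCPUFREQ ticks the
 * previous lwp_pctcpu is blended in, weighted by the remaining ticks of
 * the ESTCPUFREQ window (remticks = ESTCPUFREQ - ttlticks).
 */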
updatepcpu(struct lwp *lp, int cpticks, int ttlticks)

        acc = (cpticks << FSHIFT) / ttlticks;
        if (ttlticks >= ESTCPUFREQ) {
                lp->lwp_pctcpu = acc;
        } else {
                remticks = ESTCPUFREQ - ttlticks;
                lp->lwp_pctcpu = (acc * ttlticks + lp->lwp_pctcpu * remticks) /
                                 ESTCPUFREQ;
        }
/*
 * Handy macros to calculate hash indices.  LOOKUP() calculates the
 * global cpumask hash index, TCHASHSHIFT() converts that into the
 * pcpu hash index.
 *
 * By making the pcpu hash arrays smaller we save a significant amount
 * of memory at very low cost.  The real cost is in IPIs, which are handled
 * by the much larger global cpumask hash table.
 */
#define LOOKUP_PRIME    66555444443333333ULL
#define LOOKUP(x)       ((((uintptr_t)(x) + ((uintptr_t)(x) >> 18)) ^ \
                          LOOKUP_PRIME) % slpque_tablesize)
#define TCHASHSHIFT(x)  ((x) >> 4)
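/*
 * Illustrative note (added commentary, not part of the original code):
 * a wait channel address is first folded into the global cpumask hash
 * index and then shifted down to the smaller per-cpu queue index:
 *
 *      cid = LOOKUP(ident);            (index into slpque_cpumasks[])
 *      gid = TCHASHSHIFT(cid);         (index into gd->gd_tsleep_hash[])
 *
 * so 16 consecutive global hash slots share one per-cpu tslpque bucket.
 */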
static uint32_t slpque_tablesize;
static cpumask_t *slpque_cpumasks;

SYSCTL_UINT(_kern, OID_AUTO, slpque_tablesize, CTLFLAG_RD, &slpque_tablesize,
            0, "");
/*
 * This is a dandy function that allows us to interlock tsleep/wakeup
 * operations with unspecified upper level locks, such as lockmgr locks,
 * simply by holding a critical section.  The sequence is:
 *
 *      (acquire upper level lock)
 *      tsleep_interlock(blah)
 *      (release upper level lock)
 *      tsleep(blah, PINTERLOCKED, ...)
 *
 * Basically this function queues us on the tsleep queue without actually
 * descheduling us.  When tsleep() is later called with PINTERLOCKED it
 * assumes the thread was already queued, otherwise it queues it there.
 *
 * Thus it is possible to receive the wakeup prior to going to sleep and
 * the race conditions are covered.
 */
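/*
 * Illustrative sketch (added commentary, not part of the original code;
 * the lock and wait channel names are hypothetical).  A caller protecting
 * a condition with a lockmgr lock would typically do:
 *
 *      lockmgr(&foo_lock, LK_EXCLUSIVE);
 *      while (foo_busy) {
 *              tsleep_interlock(&foo_busy, 0);
 *              lockmgr(&foo_lock, LK_RELEASE);
 *              tsleep(&foo_busy, PINTERLOCKED, "foowait", 0);
 *              lockmgr(&foo_lock, LK_EXCLUSIVE);
 *      }
 *      lockmgr(&foo_lock, LK_RELEASE);
 *
 * A wakeup(&foo_busy) issued between the LK_RELEASE and the tsleep() is
 * not lost because the thread is already on the sleep queue.
 */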
_tsleep_interlock(globaldata_t gd, const volatile void *ident, int flags)

        thread_t td = gd->gd_curthread;

                kprintf("tsleep_interlock: NULL ident %s\n", td->td_comm);

        crit_enter_quick(td);
        if (td->td_flags & TDF_TSLEEPQ) {
                /*
                 * Shortcut if unchanged
                 */
                if (td->td_wchan == ident &&
                    td->td_wdomain == (flags & PDOMAIN_MASK)) {

                /*
                 * Remove current sleepq
                 */
                cid = LOOKUP(td->td_wchan);
                gid = TCHASHSHIFT(cid);
                qp = &gd->gd_tsleep_hash[gid];
                TAILQ_REMOVE(&qp->queue, td, td_sleepq);
                if (TAILQ_FIRST(&qp->queue) == NULL) {
                        ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid],
                                               gd->gd_cpuid);
                }
        }
        td->td_flags |= TDF_TSLEEPQ;

        gid = TCHASHSHIFT(cid);
        qp = &gd->gd_tsleep_hash[gid];
        TAILQ_INSERT_TAIL(&qp->queue, td, td_sleepq);
        if (qp->ident0 != ident && qp->ident1 != ident &&
            qp->ident2 != ident && qp->ident3 != ident) {
                if (qp->ident0 == NULL)
                else if (qp->ident1 == NULL)
                else if (qp->ident2 == NULL)
                else if (qp->ident3 == NULL)

                qp->ident0 = (void *)(intptr_t)-1;
        }
        ATOMIC_CPUMASK_ORBIT(slpque_cpumasks[cid], gd->gd_cpuid);
        td->td_wchan = ident;
        td->td_wdomain = flags & PDOMAIN_MASK;
tsleep_interlock(const volatile void *ident, int flags)

        _tsleep_interlock(mycpu, ident, flags);
/*
 * Remove thread from sleepq.  Must be called with a critical section held.
 * The thread must not be migrating.
 */
_tsleep_remove(thread_t td)

        globaldata_t gd = mycpu;

        KKASSERT(td->td_gd == gd && IN_CRITICAL_SECT(td));
        KKASSERT((td->td_flags & TDF_MIGRATING) == 0);
        if (td->td_flags & TDF_TSLEEPQ) {
                td->td_flags &= ~TDF_TSLEEPQ;
                cid = LOOKUP(td->td_wchan);
                gid = TCHASHSHIFT(cid);
                qp = &gd->gd_tsleep_hash[gid];
                TAILQ_REMOVE(&qp->queue, td, td_sleepq);
                if (TAILQ_FIRST(&qp->queue) == NULL) {
                        ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid],
                                               gd->gd_cpuid);
                }
        }
tsleep_remove(thread_t td)
/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If flags includes the PCATCH flag, signals are
 * checked before and after sleeping, else signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
 *
 * Note that if we are a process, we release_curproc() before messing with
 * the LWKT scheduler.
 *
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 *
 * WARNING! This code can't block (short of switching away), or bad things
 *          will happen.  No getting tokens, no blocking locks, etc.
 */
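/*
 * Illustrative sketch (added commentary, not part of the original code;
 * the wait channel and message are hypothetical).  A typical interruptible
 * wait with a one second timeout looks like:
 *
 *      error = tsleep(&foo_ready, PCATCH, "foowait", hz);
 *      if (error == EWOULDBLOCK)
 *              ... timed out, nobody called wakeup(&foo_ready) ...
 *      else if (error == EINTR || error == ERESTART)
 *              ... interrupted by a signal ...
 *      else
 *              ... woken up normally ...
 */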
tsleep(const volatile void *ident, int flags, const char *wmesg, int timo)

        struct thread *td = curthread;
        struct lwp *lp = td->td_lwp;
        struct proc *p = td->td_proc;   /* may be NULL */
        struct callout thandle;
        /*
         * Currently a severe hack.  Make sure any delayed wakeups
         * are flushed before we sleep or we might deadlock on whatever
         * event we are sleeping on.
         */
        if (td->td_flags & TDF_DELAYED_WAKEUP)
                wakeup_end_delayed();

        /*
         * NOTE: removed KTRPOINT, it could cause races due to blocking
         * even in stable.  Just scrap it for now.
         */
        if (!tsleep_crypto_dump && (tsleep_now_works == 0 || panicstr)) {
                /*
                 * After a panic, or before we actually have an operational
                 * softclock, just give interrupts a chance, then just return;
                 *
                 * don't run any other procs or panic below,
                 * in case this is the idle process and already asleep.
                 */
                lwkt_setpri_self(safepri);
                lwkt_setpri_self(oldpri);
        logtsleep2(tsleep_beg, ident);

        KKASSERT(td != &gd->gd_idlethread);     /* you must be kidding! */
        td->td_wakefromcpu = -1;                /* overwritten by _wakeup */

        /*
         * NOTE: all of this occurs on the current cpu, including any
         * callout-based wakeups, so a critical section is a sufficient
         * interlock.
         *
         * The entire sequence through to where we actually sleep must
         * run without breaking the critical section.
         */
        catch = flags & PCATCH;
        crit_enter_quick(td);

        KASSERT(ident != NULL, ("tsleep: no ident"));
        KASSERT(lp == NULL ||
                lp->lwp_stat == LSRUN ||        /* Obvious */
                lp->lwp_stat == LSSTOP,         /* Set in tstop */
                 ident, wmesg, lp->lwp_stat));
        /*
         * We interlock the sleep queue if the caller has not already done
         * it for us.  This must be done before we potentially acquire any
         * tokens or we can lose the wakeup.
         */
        if ((flags & PINTERLOCKED) == 0) {
                _tsleep_interlock(gd, ident, flags);
        }
        /*
         * Setup for the current process (if this is a process).  We must
         * interlock with lwp_token to avoid remote wakeup races.
         */
                lwkt_gettoken(&lp->lwp_token);

                /*
                 * If the umbrella process is in the SCORE state then
                 * make sure that the thread is flagged going into a
                 * normal sleep to allow the core dump to proceed, otherwise
                 * the coredump can end up waiting forever.  If the normal
                 * sleep is woken up, the thread will enter a stopped state
                 * upon return to userland.
                 *
                 * We do not want to interrupt or cause a thread exit at
                 * this juncture because that will mess-up the state the
                 * coredump is trying to save.
                 */
                if (p->p_stat == SCORE &&
                    (lp->lwp_mpflags & LWP_MP_WSTOP) == 0) {
                        atomic_set_int(&lp->lwp_mpflags, LWP_MP_WSTOP);

                /*
                 * Early termination if PCATCH was set and a
                 * signal is pending, interlocked with the
                 * critical section.
                 *
                 * Early termination only occurs when tsleep() is
                 * entered while in a normal LSRUN state.
                 */
                if ((sig = CURSIG(lp)) != 0)

                /*
                 * Causes ksignal to wake us up if a signal is
                 * received (interlocked with lp->lwp_token).
                 */
                lp->lwp_flags |= LWP_SINTR;

                /*
                 * Make sure the current process has been untangled from
                 * the userland scheduler and initialize slptime to start
                 * counting.
                 *
                 * NOTE: td->td_wakefromcpu is pre-set by the release function
                 *       for the dfly scheduler, and then adjusted by _wakeup()
                 */
                p->p_usched->release_curproc(lp);
        /*
         * For PINTERLOCKED operation, TDF_TSLEEPQ might not be set if
         * a wakeup() was processed before the thread could go to sleep.
         *
         * If TDF_TSLEEPQ is set, make sure the ident matches the recorded
         * ident.  If it does not then the thread slept in between the
         * caller's initial tsleep_interlock() call and the caller's tsleep()
         * call.
         *
         * Extreme loads can cause the sending of an IPI (e.g. wakeup()'s)
         * to process incoming IPIs, thus draining incoming wakeups.
         */
        if ((td->td_flags & TDF_TSLEEPQ) == 0) {
                logtsleep2(ilockfail, ident);
        } else if (td->td_wchan != ident ||
                   td->td_wdomain != (flags & PDOMAIN_MASK)) {
                logtsleep2(ilockfail, ident);
        }

        /*
         * Scheduling is blocked while in a critical section.  Coincide
         * the descheduled-by-tsleep flag with the descheduling of the
         * LWKT thread.
         *
         * The timer callout is localized on our cpu and interlocked by
         * our critical section.
         */
        lwkt_deschedule_self(td);
        td->td_flags |= TDF_TSLEEP_DESCHEDULED;
        td->td_wmesg = wmesg;
        /*
         * Setup the timeout, if any.  The timeout is only operable while
         * the thread is flagged descheduled.
         */
        KKASSERT((td->td_flags & TDF_TIMEOUT) == 0);
        callout_init_mp(&thandle);
        callout_reset(&thandle, timo, endtsleep, td);

        /*
         * Ok, we are sleeping.  Place us in the SSLEEP state.
         */
        KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);

        /*
         * tstop() sets LSSTOP, so don't fiddle with that.
         */
        if (lp->lwp_stat != LSSTOP)
                lp->lwp_stat = LSSLEEP;
        lp->lwp_ru.ru_nvcsw++;
        p->p_usched->uload_update(lp);
        /*
         * And when we are woken up, put us back in LSRUN.  If we
         * slept for over a second, recalculate our estcpu.
         */
        lp->lwp_stat = LSRUN;
        if (lp->lwp_slptime) {
                p->p_usched->uload_update(lp);
                p->p_usched->recalculate(lp);

        /*
         * Make sure we haven't switched cpus while we were asleep.  It's
         * not supposed to happen.  Cleanup our temporary flags.
         */
        KKASSERT(gd == td->td_gd);
        /*
         * Cleanup the timeout.  If the timeout has already occurred thandle
         * has already been stopped, otherwise stop thandle.  If the timeout
         * is running (the callout thread must be blocked trying to get
         * lwp_token) then wait for us to get scheduled.
         */
        while (td->td_flags & TDF_TIMEOUT_RUNNING) {
                /* else we won't get rescheduled! */
                if (lp->lwp_stat != LSSTOP)
                        lp->lwp_stat = LSSLEEP;
                lwkt_deschedule_self(td);
                td->td_wmesg = "tsrace";
                kprintf("td %p %s: timeout race\n", td, td->td_comm);
        }
        if (td->td_flags & TDF_TIMEOUT) {
                td->td_flags &= ~TDF_TIMEOUT;

        /* does not block when on same cpu */
        callout_stop(&thandle);

        td->td_flags &= ~TDF_TSLEEP_DESCHEDULED;
        /*
         * Make sure we have been removed from the sleepq.  In most
         * cases this will have been done for us already but it is
         * possible for a scheduling IPI to be in-flight from a
         * previous tsleep/tsleep_interlock() or due to a straight-out
         * call to lwkt_schedule() (in the case of an interrupt thread),
         * causing a spurious wakeup.
         */

        /*
         * Figure out the correct error return.  If interrupted by a
         * signal we want to return EINTR or ERESTART.
         */
        if (catch && error == 0) {
                if (sig != 0 || (sig = CURSIG(lp))) {
                        if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))

        lp->lwp_flags &= ~LWP_SINTR;

        /*
         * Unconditionally set us to LSRUN on resume.  lwp_stat could
         * be in a weird state due to the goto resume, particularly
         * when tsleep() is called from tstop().
         */
        lp->lwp_stat = LSRUN;
        lwkt_reltoken(&lp->lwp_token);

        logtsleep1(tsleep_end);
/*
 * Interlocked spinlock sleep.  An exclusively held spinlock must
 * be passed to ssleep().  The function will atomically release the
 * spinlock and tsleep on the ident, then reacquire the spinlock and
 * return.
 *
 * This routine is fairly important along the critical path, so optimize it
 * heavily.
 */
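/*
 * Illustrative sketch (added commentary, not part of the original code;
 * the spinlock and condition names are hypothetical):
 *
 *      spin_lock(&foo_spin);
 *      while (foo_count == 0)
 *              ssleep(&foo_count, &foo_spin, 0, "foowait", 0);
 *      --foo_count;
 *      spin_unlock(&foo_spin);
 *
 * The spinlock is dropped and the thread is queued on the ident atomically
 * with respect to wakeup(), so a producer holding foo_spin cannot slip a
 * wakeup(&foo_count) in between the test and the sleep.
 */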
ssleep(const volatile void *ident, struct spinlock *spin, int flags,
       const char *wmesg, int timo)

        globaldata_t gd = mycpu;

        _tsleep_interlock(gd, ident, flags);
        spin_unlock_quick(gd, spin);
        error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
        KKASSERT(gd == mycpu);
        _spin_lock_quick(gd, spin, wmesg);
lksleep(const volatile void *ident, struct lock *lock, int flags,
        const char *wmesg, int timo)

        globaldata_t gd = mycpu;

        _tsleep_interlock(gd, ident, flags);
        lockmgr(lock, LK_RELEASE);
        error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
        lockmgr(lock, LK_EXCLUSIVE);
/*
 * Interlocked mutex sleep.  An exclusively held mutex must be passed
 * to mtxsleep().  The function will atomically release the mutex
 * and tsleep on the ident, then reacquire the mutex and return.
 */
mtxsleep(const volatile void *ident, struct mtx *mtx, int flags,
         const char *wmesg, int timo)

        globaldata_t gd = mycpu;

        _tsleep_interlock(gd, ident, flags);
        error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
        mtx_lock_ex_quick(mtx);
/*
 * Interlocked serializer sleep.  An exclusively held serializer must
 * be passed to zsleep().  The function will atomically release
 * the serializer and tsleep on the ident, then reacquire the serializer
 * and return.
 */
zsleep(const volatile void *ident, struct lwkt_serialize *slz, int flags,
       const char *wmesg, int timo)

        globaldata_t gd = mycpu;

        ASSERT_SERIALIZED(slz);
        _tsleep_interlock(gd, ident, flags);
        lwkt_serialize_exit(slz);
        ret = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
        lwkt_serialize_enter(slz);
/*
 * Directly block on the LWKT thread by descheduling it.  This
 * is much faster than tsleep(), but the only legal way to wake
 * us up is to directly schedule the thread.
 *
 * Setting TDF_SINTR will cause new signals to directly schedule us.
 *
 * This routine must be called while in a critical section.
 */
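/*
 * Illustrative sketch (added commentary, not part of the original code):
 * a kernel thread that is woken strictly by lwkt_schedule() could block
 * itself with:
 *
 *      crit_enter();
 *      lwkt_sleep("idle", 0);
 *      crit_exit();
 *
 * and a producer would wake it by calling lwkt_schedule(td), where td is
 * the sleeping thread's thread_t.
 */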
lwkt_sleep(const char *wmesg, int flags)

        thread_t td = curthread;

        if ((flags & PCATCH) == 0 || td->td_lwp == NULL) {
                td->td_flags |= TDF_BLOCKED;
                td->td_wmesg = wmesg;
                lwkt_deschedule_self(td);
                td->td_flags &= ~TDF_BLOCKED;

        if ((sig = CURSIG(td->td_lwp)) != 0) {
                if (SIGISMEMBER(td->td_proc->p_sigacts->ps_sigintr, sig))

        td->td_flags |= TDF_BLOCKED | TDF_SINTR;
        td->td_wmesg = wmesg;
        lwkt_deschedule_self(td);
        td->td_flags &= ~(TDF_BLOCKED | TDF_SINTR);
/*
 * Implement the timeout for tsleep.
 *
 * This type of callout timeout is scheduled on the same cpu the process
 * is sleeping on.  Also, at the moment, the MP lock is held.
 */

        /*
         * We are going to have to get the lwp_token, which means we might
         * block.  This can race a tsleep getting woken up by other means
         * so set TDF_TIMEOUT_RUNNING to force the tsleep to wait for our
         * processing to complete (sorry tsleep!).
         *
         * We can safely set td_flags because td MUST be on the same cpu
         * as we are.
         */
        KKASSERT(td->td_gd == mycpu);
        td->td_flags |= TDF_TIMEOUT_RUNNING | TDF_TIMEOUT;

        /*
         * This can block but TDF_TIMEOUT_RUNNING will prevent the thread
         * from exiting the tsleep on us.  The flag is interlocked by virtue
         * of lp being on the same cpu as we are.
         */
        if ((lp = td->td_lwp) != NULL)
                lwkt_gettoken(&lp->lwp_token);

        KKASSERT(td->td_flags & TDF_TSLEEP_DESCHEDULED);

                /*
                 * The callout timer should normally never be set in tstop()
                 * because it passes a timeout of 0.  However, there is a
                 * case during thread exit (which SSTOP's all the threads)
                 * for which tstop() must break out and can (properly) leave
                 * the thread in LSSTOP.
                 */
                KKASSERT(lp->lwp_stat != LSSTOP ||
                         (lp->lwp_mpflags & LWP_MP_WEXIT));
                lwkt_reltoken(&lp->lwp_token);

        KKASSERT(td->td_gd == mycpu);
        td->td_flags &= ~TDF_TIMEOUT_RUNNING;
/*
 * Make all processes sleeping on the specified identifier runnable.
 * count may be zero or one only.
 *
 * The domain encodes the sleep/wakeup domain, flags, plus the originating
 * cpu.
 *
 * This call may run without the MP lock held.  We can only manipulate thread
 * state on the cpu owning the thread.  We CANNOT manipulate process state.
 *
 * _wakeup() can be passed to an IPI so we can't use (const volatile
 * void *) for the ident argument.
 */
_wakeup(void *ident, int domain)

        logtsleep2(wakeup_beg, ident);

        cid = LOOKUP(ident);
        gid = TCHASHSHIFT(cid);
        qp = &gd->gd_tsleep_hash[gid];
        for (td = TAILQ_FIRST(&qp->queue); td != NULL; td = ntd) {
                ntd = TAILQ_NEXT(td, td_sleepq);
                if (td->td_wchan == ident &&
                    td->td_wdomain == (domain & PDOMAIN_MASK)) {
                        KKASSERT(td->td_gd == gd);
                        td->td_wakefromcpu = PWAKEUP_DECODE(domain);
                        if (td->td_flags & TDF_TSLEEP_DESCHEDULED) {
                        if (domain & PWAKEUP_ONE)

                if (td->td_wchan == qp->ident0)
                else if (td->td_wchan == qp->ident1)
                else if (td->td_wchan == qp->ident2)
                else if (td->td_wchan == qp->ident3)

                wids |= 16;     /* force ident0 to be retained (-1) */
        /*
         * Because a bunch of cpumask array entries cover the same queue, it
         * is possible for our bit to remain set in some of them and cause
         * spurious wakeup IPIs later on.  Make sure that the bit is cleared
         * when a spurious IPI occurs to prevent further spurious IPIs.
         */
        if (TAILQ_FIRST(&qp->queue) == NULL) {
                ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid], gd->gd_cpuid);
        }

        if ((wids & 1) == 0) {
                if ((wids & 16) == 0) {

                KKASSERT(qp->ident0 == (void *)(intptr_t)-1);

        if ((wids & 2) == 0)
        if ((wids & 4) == 0)
        if ((wids & 8) == 0)
        /*
         * We finished checking the current cpu but there still may be
         * more work to do.  Either wakeup_one was requested and no matching
         * thread was found, or a normal wakeup was requested and we have
         * to continue checking cpus.
         *
         * It should be noted that this scheme is actually less expensive than
         * the old scheme when waking up multiple threads, since we send
         * only one IPI message per target candidate which may then schedule
         * multiple threads.  Before we could have wound up sending an IPI
         * message for each thread on the target cpu (!= current cpu) that
         * needed to be woken up.
         *
         * NOTE: Wakeups occurring on remote cpus are asynchronous.  This
         *       should be ok since we are passing idents in the IPI rather
         *       than thread pointers.
         *
         * NOTE: We MUST mfence (or use an atomic op) prior to reading
         *       the cpumask, as another cpu may have written to it in
         *       a fashion interlocked with whatever the caller did before
         *       calling wakeup().  Otherwise we might miss the interaction
         *       (kern_mutex.c can cause this problem).
         *
         *       lfence is insufficient as it may allow a written state to
         *       reorder around the cpumask load.
         */
        if ((domain & PWAKEUP_MYCPU) == 0) {
                const volatile void *id0;

                mask = slpque_cpumasks[cid];
                CPUMASK_ANDMASK(mask, gd->gd_other_cpus);
                while (CPUMASK_TESTNZERO(mask)) {
                        n = BSRCPUMASK(mask);
                        CPUMASK_NANDBIT(mask, n);
                        tgd = globaldata_find(n);

                        /*
                         * Both ident0 compares must be from a single load
                         * to avoid ident0 update races crossing the two
                         * compares.
                         */
                        qp = &tgd->gd_tsleep_hash[gid];
                        if (id0 == (void *)(intptr_t)-1) {
                                lwkt_send_ipiq2(tgd, _wakeup, ident,
                                                domain | PWAKEUP_MYCPU);
                                ++tgd->gd_cnt.v_wakeup_colls;
                        } else if (id0 == ident ||
                                   qp->ident1 == ident ||
                                   qp->ident2 == ident ||
                                   qp->ident3 == ident) {
                                lwkt_send_ipiq2(tgd, _wakeup, ident,
                                                domain | PWAKEUP_MYCPU);
                        }
                }
                if (CPUMASK_TESTNZERO(mask)) {
                        lwkt_send_ipiq2_mask(mask, _wakeup, ident,
                                             domain | PWAKEUP_MYCPU);
                }
        }

        logtsleep1(wakeup_end);
1136 wakeup(const volatile void *ident
)
1138 globaldata_t gd
= mycpu
;
1139 thread_t td
= gd
->gd_curthread
;
1141 if (td
&& (td
->td_flags
& TDF_DELAYED_WAKEUP
)) {
1143 * If we are in a delayed wakeup section, record up to two wakeups in
1144 * a per-CPU queue and issue them when we block or exit the delayed
1147 if (atomic_cmpset_ptr(&gd
->gd_delayed_wakeup
[0], NULL
, ident
))
1149 if (atomic_cmpset_ptr(&gd
->gd_delayed_wakeup
[1], NULL
, ident
))
1152 ident
= atomic_swap_ptr(__DEQUALIFY(volatile void **, &gd
->gd_delayed_wakeup
[1]),
1154 ident
= atomic_swap_ptr(__DEQUALIFY(volatile void **, &gd
->gd_delayed_wakeup
[0]),
1158 _wakeup(__DEALL(ident
), PWAKEUP_ENCODE(0, gd
->gd_cpuid
));
/*
 * Wakeup one thread tsleep()ing on the specified ident, on any cpu.
 */
wakeup_one(const volatile void *ident)

        /* XXX potentially round-robin the first responding cpu */
        _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
                PWAKEUP_ONE);
/*
 * Wakeup threads tsleep()ing on the specified ident on the current cpu
 * only.
 */
wakeup_mycpu(const volatile void *ident)

        _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
                PWAKEUP_MYCPU);
/*
 * Wakeup one thread tsleep()ing on the specified ident on the current cpu
 * only.
 */
wakeup_mycpu_one(const volatile void *ident)

        /* XXX potentially round-robin the first responding cpu */
        _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
                PWAKEUP_MYCPU | PWAKEUP_ONE);
/*
 * Wakeup all threads tsleep()ing on the specified ident on the specified cpu
 */
wakeup_oncpu(globaldata_t gd, const volatile void *ident)

        globaldata_t mygd = mycpu;

        _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
                PWAKEUP_MYCPU);
        lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident),
                        PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
                        PWAKEUP_MYCPU);
/*
 * Wakeup one thread tsleep()ing on the specified ident on the specified cpu
 */
wakeup_oncpu_one(globaldata_t gd, const volatile void *ident)

        globaldata_t mygd = mycpu;

        _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
                PWAKEUP_MYCPU | PWAKEUP_ONE);
        lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident),
                        PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
                        PWAKEUP_MYCPU | PWAKEUP_ONE);
/*
 * Wakeup all threads waiting on the specified ident that slept using
 * the specified domain, on all cpus.
 */
wakeup_domain(const volatile void *ident, int domain)

        _wakeup(__DEALL(ident), PWAKEUP_ENCODE(domain, mycpu->gd_cpuid));
/*
 * Wakeup one thread waiting on the specified ident that slept using
 * the specified domain, on any cpu.
 */
wakeup_domain_one(const volatile void *ident, int domain)

        /* XXX potentially round-robin the first responding cpu */
        _wakeup(__DEALL(ident),
                PWAKEUP_ENCODE(domain, mycpu->gd_cpuid) | PWAKEUP_ONE);
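/*
 * Illustrative note (added commentary, not part of the original code):
 * a sleeper and its waker must agree on both the ident and the domain.
 * A umtx-style wait, for example, pairs up as:
 *
 *      tsleep(waddr, PCATCH | PDOMAIN_UMTX, "uwait", timo);
 *      ...
 *      wakeup_domain(waddr, PDOMAIN_UMTX);
 *
 * while plain tsleep()/wakeup() implicitly use domain 0.  waddr and
 * "uwait" are hypothetical names used only for this example.
 */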
wakeup_start_delayed(void)

        globaldata_t gd = mycpu;

        gd->gd_curthread->td_flags |= TDF_DELAYED_WAKEUP;
wakeup_end_delayed(void)

        globaldata_t gd = mycpu;

        if (gd->gd_curthread->td_flags & TDF_DELAYED_WAKEUP) {
                gd->gd_curthread->td_flags &= ~TDF_DELAYED_WAKEUP;
                if (gd->gd_delayed_wakeup[0] || gd->gd_delayed_wakeup[1]) {
                        if (gd->gd_delayed_wakeup[0]) {
                                wakeup(gd->gd_delayed_wakeup[0]);
                                gd->gd_delayed_wakeup[0] = NULL;
                        }
                        if (gd->gd_delayed_wakeup[1]) {
                                wakeup(gd->gd_delayed_wakeup[1]);
                                gd->gd_delayed_wakeup[1] = NULL;
                        }
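/*
 * Illustrative sketch (added commentary, not part of the original code):
 * batching wakeups across a section that may issue several of them:
 *
 *      wakeup_start_delayed();
 *      ...update state, possibly calling wakeup() several times...
 *      wakeup_end_delayed();
 *
 * Up to two distinct idents are parked in gd_delayed_wakeup[] and issued
 * when the section ends, or earlier if the thread blocks in tsleep(),
 * which flushes them first.
 */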
/*
 * Make a process runnable.  lp->lwp_token must be held on call and this
 * function must be called from the cpu owning lp.
 *
 * This only has an effect if we are in LSSTOP or LSSLEEP.
 */
setrunnable(struct lwp *lp)

        thread_t td = lp->lwp_thread;

        ASSERT_LWKT_TOKEN_HELD(&lp->lwp_token);
        KKASSERT(td->td_gd == mycpu);
        if (lp->lwp_stat == LSSTOP)
                lp->lwp_stat = LSSLEEP;
        if (lp->lwp_stat == LSSLEEP) {

        } else if (td->td_flags & TDF_SINTR) {
/*
 * The process is stopped due to some condition, usually because p_stat is
 * set to SSTOP, but also possibly due to being traced.
 *
 * Caller must hold p->p_token
 *
 * NOTE! If the caller sets SSTOP, the caller must also clear P_WAITED
 * because the parent may check the child's status before the child actually
 * gets to this routine.
 *
 * This routine is called with the current lwp only, typically just
 * before returning to userland if the process state is detected as
 * possibly being in a stopped state.
 */
        struct lwp *lp = curthread->td_lwp;
        struct proc *p = lp->lwp_proc;

        lwkt_gettoken(&lp->lwp_token);

        /*
         * If LWP_MP_WSTOP is set, we were sleeping
         * while our process was stopped.  At this point
         * we were already counted as stopped.
         */
        if ((lp->lwp_mpflags & LWP_MP_WSTOP) == 0) {
                /*
                 * If we're the last thread to stop, signal
                 * the parent.
                 */
                atomic_set_int(&lp->lwp_mpflags, LWP_MP_WSTOP);
                wakeup(&p->p_nstopped);
                if (p->p_nstopped == p->p_nthreads) {
                        /*
                         * Token required to interlock kern_wait()
                         */
                        lwkt_gettoken(&q->p_token);
                        p->p_flags &= ~P_WAITED;
                        if ((q->p_sigacts->ps_flag & PS_NOCLDSTOP) == 0)
                                ksignal(q, SIGCHLD);
                        lwkt_reltoken(&q->p_token);
                }
        }

        /*
         * Wait here while in a stopped state, interlocked with lwp_token.
         * We must break-out if the whole process is trying to exit.
         */
        while (STOPLWP(p, lp)) {
                lp->lwp_stat = LSSTOP;
                tsleep(p, 0, "stop", 0);
        }

        atomic_clear_int(&lp->lwp_mpflags, LWP_MP_WSTOP);
        lwkt_reltoken(&lp->lwp_token);
/*
 * Compute a tenex style load average of a quantity on
 * 1, 5 and 15 minute intervals.  This is a pcpu callout.
 *
 * We segment the lwp scan on a pcpu basis.  This does NOT
 * mean the associated lwps are on this cpu, it is done
 * just to break the work up.
 *
 * The callout on cpu0 rolls up the stats from the other
 * cpus.
 */
static int loadav_count_runnable(struct lwp *p, void *data);
        globaldata_t gd = mycpu;
        struct loadavg *avg;

        alllwp_scan(loadav_count_runnable, &nrun, 1);
        gd->gd_loadav_nrunnable = nrun;
        if (gd->gd_cpuid == 0) {
                for (i = 0; i < ncpus; ++i)
                        nrun += globaldata_find(i)->gd_loadav_nrunnable;
                for (i = 0; i < 3; i++) {
                        avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
                            (long)nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
                }
        }

        /*
         * Schedule the next update to occur after 5 seconds, but add a
         * random variation to avoid synchronisation with processes that
         * run at regular intervals.
         */
        callout_reset(&gd->gd_loadav_callout,
                      hz * 4 + (int)(krandom() % (hz * 2 + 1)),
                      loadav, NULL);
loadav_count_runnable(struct lwp *lp, void *data)

        switch (lp->lwp_stat) {
                if ((td = lp->lwp_thread) == NULL)
                if (td->td_flags & TDF_BLOCKED)
/*
 * Regular data collection
 */
collect_load_callback(int n)

        int fscale = averunnable.fscale;

        return ((averunnable.ldavg[0] * 100 + (fscale >> 1)) / fscale);
sched_setup(void *dummy __unused)

        globaldata_t save_gd = mycpu;

        kcollect_register(KCOLLECT_LOAD, "load", collect_load_callback,
                          KCOLLECT_SCALE(KCOLLECT_LOAD_FORMAT, 0));

        /*
         * Kick off timeout driven events by calling first time.  We
         * split the work across available cpus to help scale it,
         * it can eat a lot of cpu when there are a lot of processes
         * on the system.
         */
        for (n = 0; n < ncpus; ++n) {
                gd = globaldata_find(n);
                lwkt_setcpu_self(gd);
                callout_init_mp(&gd->gd_loadav_callout);
                callout_init_mp(&gd->gd_schedcpu_callout);

        lwkt_setcpu_self(save_gd);
/*
 * Extremely early initialization, dummy-up the tables so we don't have
 * to conditionalize for NULL in _wakeup() and tsleep_interlock().  Even
 * though the system isn't blocking this early, these functions still
 * try to access the hash table.
 *
 * This setup will be overridden once sched_dyninit() -> sleep_gdinit()
 * is run.
 */
sleep_early_gdinit(globaldata_t gd)

        static struct tslpque dummy_slpque;
        static cpumask_t dummy_cpumasks;

        slpque_tablesize = 1;
        gd->gd_tsleep_hash = &dummy_slpque;
        slpque_cpumasks = &dummy_cpumasks;
        TAILQ_INIT(&dummy_slpque.queue);
/*
 * PCPU initialization.  Called after KMALLOC is operational, by
 * sched_dyninit() for cpu 0, and by mi_gdinit() for other cpus later.
 *
 * WARNING! The pcpu hash table is smaller than the global cpumask
 *          hash table, which can save us a lot of memory when maxproc
 *          is large.
 */
sleep_gdinit(globaldata_t gd)

        /*
         * This shouldn't happen, that is there shouldn't be any threads
         * waiting on the dummy tsleep queue this early in the boot.
         */
        if (gd->gd_cpuid == 0) {
                struct tslpque *qp = &gd->gd_tsleep_hash[0];
                TAILQ_FOREACH(td, &qp->queue, td_sleepq) {
                        kprintf("SLEEP_GDINIT SWITCH %s\n", td->td_comm);

        /*
         * Note that we have to allocate one extra slot because we are
         * shifting a modulo value.  TCHASHSHIFT(slpque_tablesize - 1) can
         * return the same value as TCHASHSHIFT(slpque_tablesize).
         */
        n = TCHASHSHIFT(slpque_tablesize) + 1;

        hash_size = sizeof(struct tslpque) * n;
        gd->gd_tsleep_hash = (void *)kmem_alloc3(&kernel_map, hash_size,
                                                 KM_CPU(gd->gd_cpuid));
        memset(gd->gd_tsleep_hash, 0, hash_size);
        for (i = 0; i < n; ++i)
                TAILQ_INIT(&gd->gd_tsleep_hash[i].queue);
/*
 * Dynamic initialization after the memory system is operational.
 */
sched_dyninit(void *dummy __unused)

        /*
         * Calculate table size for slpque hash.  We want a prime number
         * large enough to avoid overloading slpque_cpumasks when the
         * system has a large number of sleeping processes, which will
         * spam IPIs on wakeup().
         *
         * While it is true this is really a per-lwp factor, generally
         * speaking the maxproc limit is a good metric to go by.
         */
        for (tblsize = maxproc | 1; ; tblsize += 2) {
                if (tblsize % 3 == 0)
                if (tblsize % 5 == 0)
                tblsize2 = (tblsize / 2) | 1;
                for (n = 7; n < tblsize2; n += 2) {
                        if (tblsize % n == 0)

        /*
         * PIDs are currently limited to 6 digits.  Cap the table size
         * accordingly.
         */
        if (tblsize > 2000003)

        slpque_tablesize = tblsize;
        slpque_cpumasks = kmalloc(sizeof(*slpque_cpumasks) * slpque_tablesize,
                                  M_TSLEEP, M_WAITOK | M_ZERO);
        sleep_gdinit(mycpu);