4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright (c) 2016 by Delphix. All rights reserved.
28 * Big Theory Statement for mutual exclusion locking primitives.
30 * A mutex serializes multiple threads so that only one thread
31 * (the "owner" of the mutex) is active at a time. See mutex(9F)
32 * for a full description of the interfaces and programming model.
33 * The rest of this comment describes the implementation.
35 * Mutexes come in two flavors: adaptive and spin. mutex_init(9F)
36 * determines the type based solely on the iblock cookie (PIL) argument.
37 * PIL > LOCK_LEVEL implies a spin lock; everything else is adaptive.
39 * Spin mutexes block interrupts and spin until the lock becomes available.
40 * A thread may not sleep, or call any function that might sleep, while
41 * holding a spin mutex. With few exceptions, spin mutexes should only
42 * be used to synchronize with interrupt handlers.
44 * Adaptive mutexes (the default type) spin if the owner is running on
45 * another CPU and block otherwise. This policy is based on the assumption
46 * that mutex hold times are typically short enough that the time spent
47 * spinning is less than the time it takes to block. If you need mutual
48 * exclusion semantics with long hold times, consider an rwlock(9F) as
49 * RW_WRITER. Better still, reconsider the algorithm: if it requires
50 * mutual exclusion for long periods of time, it's probably not scalable.
52 * Adaptive mutexes are overwhelmingly more common than spin mutexes,
53 * so mutex_enter() assumes that the lock is adaptive. We get away
54 * with this by structuring mutexes so that an attempt to acquire a
55 * spin mutex as adaptive always fails. When mutex_enter() fails
56 * it punts to mutex_vector_enter(), which does all the hard stuff.
58 * mutex_vector_enter() first checks the type. If it's spin mutex,
59 * we just call lock_set_spl() and return. If it's an adaptive mutex,
60 * we check to see what the owner is doing. If the owner is running,
61 * we spin until the lock becomes available; if not, we mark the lock
62 * as having waiters and block.
64 * Blocking on a mutex is surprisingly delicate dance because, for speed,
65 * mutex_exit() doesn't use an atomic instruction. Thus we have to work
66 * a little harder in the (rarely-executed) blocking path to make sure
67 * we don't block on a mutex that's just been released -- otherwise we
68 * might never be woken up.
70 * The logic for synchronizing mutex_vector_enter() with mutex_exit()
71 * in the face of preemption and relaxed memory ordering is as follows:
73 * (1) Preemption in the middle of mutex_exit() must cause mutex_exit()
74 * to restart. Each platform must enforce this by checking the
75 * interrupted PC in the interrupt handler (or on return from trap --
76 * whichever is more convenient for the platform). If the PC
77 * lies within the critical region of mutex_exit(), the interrupt
78 * handler must reset the PC back to the beginning of mutex_exit().
79 * The critical region consists of all instructions up to, but not
80 * including, the store that clears the lock (which, of course,
81 * must never be executed twice.)
83 * This ensures that the owner will always check for waiters after
84 * resuming from a previous preemption.
86 * (2) A thread resuming in mutex_exit() does (at least) the following:
88 * when resuming: set CPU_THREAD = owner
91 * in mutex_exit: check waiters bit; do wakeup if set
92 * membar #LoadStore|#StoreStore
94 * (at this point, other threads may or may not grab
95 * the lock, and we may or may not reacquire it)
97 * when blocking: membar #StoreStore (due to disp_lock_enter())
98 * set CPU_THREAD = (possibly) someone else
100 * (3) A thread blocking in mutex_vector_enter() does the following:
103 * membar #StoreLoad (via membar_enter())
104 * check CPU_THREAD for owner's t_cpu
105 * continue if owner running
106 * membar #LoadLoad (via membar_consumer())
107 * check owner and waiters bit; abort if either changed
110 * Thus the global memory orderings for (2) and (3) are as follows:
112 * (2M) mutex_exit() memory order:
114 * STORE CPU_THREAD = owner
117 * STORE CPU_THREAD = (possibly) someone else
119 * (3M) mutex_vector_enter() memory order:
121 * STORE waiters bit = 1
122 * LOAD CPU_THREAD for each CPU
123 * LOAD owner and waiters bit
125 * It has been verified by exhaustive simulation that all possible global
126 * memory orderings of (2M) interleaved with (3M) result in correct
127 * behavior. Moreover, these ordering constraints are minimal: changing
128 * the ordering of anything in (2M) or (3M) breaks the algorithm, creating
129 * windows for missed wakeups. Note: the possibility that other threads
130 * may grab the lock after the owner drops it can be factored out of the
131 * memory ordering analysis because mutex_vector_enter() won't block
132 * if the lock isn't still owned by the same thread.
134 * The only requirements of code outside the mutex implementation are
135 * (1) mutex_exit() preemption fixup in interrupt handlers or trap return,
136 * (2) a membar #StoreLoad after setting CPU_THREAD in resume(),
137 * (3) mutex_owner_running() preemption fixup in interrupt handlers
139 * Note: idle threads cannot grab adaptive locks (since they cannot block),
140 * so the membar may be safely omitted when resuming an idle thread.
142 * When a mutex has waiters, mutex_vector_exit() has several options:
144 * (1) Choose a waiter and make that thread the owner before waking it;
145 * this is known as "direct handoff" of ownership.
147 * (2) Drop the lock and wake one waiter.
149 * (3) Drop the lock, clear the waiters bit, and wake all waiters.
151 * In many ways (1) is the cleanest solution, but if a lock is moderately
152 * contended it defeats the adaptive spin logic. If we make some other
153 * thread the owner, but it's not ONPROC yet, then all other threads on
154 * other cpus that try to get the lock will conclude that the owner is
155 * blocked, so they'll block too. And so on -- it escalates quickly,
156 * with every thread taking the blocking path rather than the spin path.
157 * Thus, direct handoff is *not* a good idea for adaptive mutexes.
159 * Option (2) is the next most natural-seeming option, but it has several
160 * annoying properties. If there's more than one waiter, we must preserve
161 * the waiters bit on an unheld lock. On cas-capable platforms, where
162 * the waiters bit is part of the lock word, this means that both 0x0
163 * and 0x1 represent unheld locks, so we have to cas against *both*.
164 * Priority inheritance also gets more complicated, because a lock can
165 * have waiters but no owner to whom priority can be willed. So while
166 * it is possible to make option (2) work, it's surprisingly vile.
168 * Option (3), the least-intuitive at first glance, is what we actually do.
169 * It has the advantage that because you always wake all waiters, you
170 * never have to preserve the waiters bit. Waking all waiters seems like
171 * begging for a thundering herd problem, but consider: under option (2),
172 * every thread that grabs and drops the lock will wake one waiter -- so
173 * if the lock is fairly active, all waiters will be awakened very quickly
174 * anyway. Moreover, this is how adaptive locks are *supposed* to work.
175 * The blocking case is rare; the more common case (by 3-4 orders of
176 * magnitude) is that one or more threads spin waiting to get the lock.
177 * Only direct handoff can prevent the thundering herd problem, but as
178 * mentioned earlier, that would tend to defeat the adaptive spin logic.
179 * In practice, option (3) works well because the blocking case is rare.
183 * delayed lock retry with exponential delay for spin locks
185 * It is noted above that for both the spin locks and the adaptive locks,
186 * spinning is the dominate mode of operation. So long as there is only
187 * one thread waiting on a lock, the naive spin loop works very well in
188 * cache based architectures. The lock data structure is pulled into the
189 * cache of the processor with the waiting/spinning thread and no further
190 * memory traffic is generated until the lock is released. Unfortunately,
191 * once two or more threads are waiting on a lock, the naive spin has
192 * the property of generating maximum memory traffic from each spinning
193 * thread as the spinning threads contend for the lock data structure.
195 * By executing a delay loop before retrying a lock, a waiting thread
196 * can reduce its memory traffic by a large factor, depending on the
197 * size of the delay loop. A large delay loop greatly reduced the memory
198 * traffic, but has the drawback of having a period of time when
199 * no thread is attempting to gain the lock even though several threads
200 * might be waiting. A small delay loop has the drawback of not
201 * much reduction in memory traffic, but reduces the potential idle time.
202 * The theory of the exponential delay code is to start with a short
203 * delay loop and double the waiting time on each iteration, up to
204 * a preselected maximum.
207 #include <sys/param.h>
208 #include <sys/time.h>
209 #include <sys/cpuvar.h>
210 #include <sys/thread.h>
211 #include <sys/debug.h>
212 #include <sys/cmn_err.h>
213 #include <sys/sobject.h>
214 #include <sys/turnstile.h>
215 #include <sys/systm.h>
216 #include <sys/mutex_impl.h>
218 #include <sys/lockstat.h>
219 #include <sys/atomic.h>
221 #include <sys/stack.h>
222 #include <sys/archsystm.h>
223 #include <sys/machsystm.h>
224 #include <sys/x_call.h>
227 * The sobj_ops vector exports a set of functions needed when a thread
228 * is asleep on a synchronization object of this type.
230 static sobj_ops_t mutex_sobj_ops
= {
231 SOBJ_MUTEX
, mutex_owner
, turnstile_stay_asleep
, turnstile_change_pri
235 * If the system panics on a mutex, save the address of the offending
236 * mutex in panic_mutex_addr, and save the contents in panic_mutex.
238 static mutex_impl_t panic_mutex
;
239 static mutex_impl_t
*panic_mutex_addr
;
242 mutex_panic(char *msg
, mutex_impl_t
*lp
)
247 if (atomic_cas_ptr(&panic_mutex_addr
, NULL
, lp
) == NULL
)
250 panic("%s, lp=%p owner=%p thread=%p",
251 msg
, (void *)lp
, (void *)MUTEX_OWNER(&panic_mutex
),
255 /* "tunables" for per-platform backoff constants. */
256 uint_t mutex_backoff_cap
= 0;
257 ushort_t mutex_backoff_base
= MUTEX_BACKOFF_BASE
;
258 ushort_t mutex_cap_factor
= MUTEX_CAP_FACTOR
;
259 uchar_t mutex_backoff_shift
= MUTEX_BACKOFF_SHIFT
;
267 /* calculate the backoff interval */
269 default_lock_backoff(uint_t backoff
)
271 uint_t cap
; /* backoff cap calculated */
274 backoff
= mutex_backoff_base
;
275 /* first call just sets the base */
280 if (mutex_backoff_cap
== 0) {
282 * For a contended lock, in the worst case a load + cas may
283 * be queued at the controller for each contending CPU.
284 * Therefore, to avoid queueing, the accesses for all CPUS must
285 * be spread out in time over an interval of (ncpu *
286 * cap-factor). Maximum backoff is set to this value, and
287 * actual backoff is a random number from 0 to the current max.
289 cap
= ncpus_online
* mutex_cap_factor
;
291 cap
= mutex_backoff_cap
;
294 /* calculate new backoff value */
295 backoff
<<= mutex_backoff_shift
; /* increase backoff */
297 if (cap
< mutex_backoff_base
)
298 backoff
= mutex_backoff_base
;
307 * default delay function for mutexes.
310 default_lock_delay(uint_t backoff
)
312 ulong_t rnd
; /* random factor */
313 uint_t cur_backoff
; /* calculated backoff */
317 * Modify backoff by a random amount to avoid lockstep, and to
318 * make it probable that some thread gets a small backoff, and
321 rnd
= (((long)curthread
>> PTR24_LSB
) ^ (long)MUTEX_GETTICK());
322 cur_backoff
= (uint_t
)(rnd
% (backoff
- mutex_backoff_base
+ 1)) +
326 * Delay before trying
327 * to touch the mutex data structure.
329 for (backctr
= cur_backoff
; backctr
; backctr
--) {
334 uint_t (*mutex_lock_backoff
)(uint_t
) = default_lock_backoff
;
335 void (*mutex_lock_delay
)(uint_t
) = default_lock_delay
;
336 void (*mutex_delay
)(void) = mutex_delay_default
;
339 * mutex_vector_enter() is called from the assembly mutex_enter() routine
340 * if the lock is held or is not of type MUTEX_ADAPTIVE.
343 mutex_vector_enter(mutex_impl_t
*lp
)
346 kthread_id_t lastowner
= MUTEX_NO_OWNER
; /* track owner changes */
347 hrtime_t sleep_time
= 0; /* how long we slept */
348 hrtime_t spin_time
= 0; /* how long we spun */
351 volatile mutex_impl_t
*vlp
= (volatile mutex_impl_t
*)lp
;
352 uint_t backoff
= 0; /* current backoff */
353 int changecnt
= 0; /* count of owner changes */
355 ASSERT_STACK_ALIGNED();
357 if (MUTEX_TYPE_SPIN(lp
)) {
358 lock_set_spl(&lp
->m_spin
.m_spinlock
, lp
->m_spin
.m_minspl
,
359 &lp
->m_spin
.m_oldspl
);
363 if (!MUTEX_TYPE_ADAPTIVE(lp
)) {
364 mutex_panic("mutex_enter: bad mutex", lp
);
369 * Adaptive mutexes must not be acquired from above LOCK_LEVEL.
370 * We can migrate after loading CPU but before checking CPU_ON_INTR,
371 * so we must verify by disabling preemption and loading CPU again.
374 if (CPU_ON_INTR(cpup
) && !panicstr
) {
376 if (CPU_ON_INTR(CPU
))
377 mutex_panic("mutex_enter: adaptive at high PIL", lp
);
381 CPU_STATS_ADDQ(cpup
, sys
, mutex_adenters
, 1);
383 spin_time
= LOCKSTAT_START_TIME(LS_MUTEX_ENTER_SPIN
);
385 backoff
= mutex_lock_backoff(0); /* set base backoff */
387 mutex_lock_delay(backoff
); /* backoff delay */
392 if ((owner
= MUTEX_OWNER(vlp
)) == NULL
) {
393 if (mutex_adaptive_tryenter(lp
)) {
396 /* increase backoff only on failed attempt. */
397 backoff
= mutex_lock_backoff(backoff
);
400 } else if (lastowner
!= owner
) {
402 backoff
= mutex_lock_backoff(backoff
);
406 if (changecnt
>= ncpus_online
) {
407 backoff
= mutex_lock_backoff(0);
411 if (owner
== curthread
)
412 mutex_panic("recursive mutex_enter", lp
);
415 * If lock is held but owner is not yet set, spin.
416 * (Only relevant for platforms that don't have cas.)
418 if (owner
== MUTEX_NO_OWNER
)
421 if (mutex_owner_running(lp
) != NULL
) {
426 * The owner appears not to be running, so block.
427 * See the Big Theory Statement for memory ordering issues.
429 ts
= turnstile_lookup(lp
);
430 MUTEX_SET_WAITERS(lp
);
434 * Recheck whether owner is running after waiters bit hits
435 * global visibility (above). If owner is running, spin.
437 if (mutex_owner_running(lp
) != NULL
) {
444 * If owner and waiters bit are unchanged, block.
446 if (MUTEX_OWNER(vlp
) == owner
&& MUTEX_HAS_WAITERS(vlp
)) {
447 sleep_time
-= gethrtime();
448 (void) turnstile_block(ts
, TS_WRITER_Q
, lp
,
449 &mutex_sobj_ops
, NULL
, NULL
);
450 sleep_time
+= gethrtime();
451 /* reset backoff after turnstile */
452 backoff
= mutex_lock_backoff(0);
458 ASSERT(MUTEX_OWNER(lp
) == curthread
);
460 if (sleep_time
!= 0) {
462 * Note, sleep time is the sum of all the sleeping we
465 LOCKSTAT_RECORD(LS_MUTEX_ENTER_BLOCK
, lp
, sleep_time
);
468 /* record spin time, don't count sleep time */
469 if (spin_time
!= 0) {
470 LOCKSTAT_RECORD_TIME(LS_MUTEX_ENTER_SPIN
, lp
,
471 spin_time
+ sleep_time
);
474 LOCKSTAT_RECORD0(LS_MUTEX_ENTER_ACQUIRE
, lp
);
478 * mutex_vector_tryenter() is called from the assembly mutex_tryenter()
479 * routine if the lock is held or is not of type MUTEX_ADAPTIVE.
482 mutex_vector_tryenter(mutex_impl_t
*lp
)
486 if (MUTEX_TYPE_ADAPTIVE(lp
))
487 return (0); /* we already tried in assembly */
489 if (!MUTEX_TYPE_SPIN(lp
)) {
490 mutex_panic("mutex_tryenter: bad mutex", lp
);
494 s
= splr(lp
->m_spin
.m_minspl
);
495 if (lock_try(&lp
->m_spin
.m_spinlock
)) {
496 lp
->m_spin
.m_oldspl
= (ushort_t
)s
;
504 * mutex_vector_exit() is called from mutex_exit() if the lock is not
505 * adaptive, has waiters, or is not owned by the current thread (panic).
508 mutex_vector_exit(mutex_impl_t
*lp
)
512 if (MUTEX_TYPE_SPIN(lp
)) {
513 lock_clear_splx(&lp
->m_spin
.m_spinlock
, lp
->m_spin
.m_oldspl
);
517 if (MUTEX_OWNER(lp
) != curthread
) {
518 mutex_panic("mutex_exit: not owner", lp
);
522 ts
= turnstile_lookup(lp
);
523 MUTEX_CLEAR_LOCK_AND_WAITERS(lp
);
527 turnstile_wakeup(ts
, TS_WRITER_Q
, ts
->ts_waiters
, NULL
);
528 LOCKSTAT_RECORD0(LS_MUTEX_EXIT_RELEASE
, lp
);
532 mutex_owned(const kmutex_t
*mp
)
534 const mutex_impl_t
*lp
= (const mutex_impl_t
*)mp
;
536 if (panicstr
|| quiesce_active
)
539 if (MUTEX_TYPE_ADAPTIVE(lp
))
540 return (MUTEX_OWNER(lp
) == curthread
);
541 return (LOCK_HELD(&lp
->m_spin
.m_spinlock
));
545 mutex_owner(const kmutex_t
*mp
)
547 const mutex_impl_t
*lp
= (const mutex_impl_t
*)mp
;
550 if (MUTEX_TYPE_ADAPTIVE(lp
) && (t
= MUTEX_OWNER(lp
)) != MUTEX_NO_OWNER
)
556 * The iblock cookie 'ibc' is the spl level associated with the lock;
557 * this alone determines whether the lock will be ADAPTIVE or SPIN.
559 * Adaptive mutexes created in zeroed memory do not need to call
560 * mutex_init() as their allocation in this fashion guarantees
561 * their initialization.
562 * eg adaptive mutexes created as static within the BSS or allocated
567 mutex_init(kmutex_t
*mp
, char *name
, kmutex_type_t type
, void *ibc
)
569 mutex_impl_t
*lp
= (mutex_impl_t
*)mp
;
571 ASSERT(ibc
< (void *)KERNELBASE
); /* see 1215173 */
573 if ((intptr_t)ibc
> ipltospl(LOCK_LEVEL
) && ibc
< (void *)KERNELBASE
) {
574 ASSERT(type
!= MUTEX_ADAPTIVE
&& type
!= MUTEX_DEFAULT
);
575 MUTEX_SET_TYPE(lp
, MUTEX_SPIN
);
576 LOCK_INIT_CLEAR(&lp
->m_spin
.m_spinlock
);
577 LOCK_INIT_HELD(&lp
->m_spin
.m_dummylock
);
578 lp
->m_spin
.m_minspl
= (int)(intptr_t)ibc
;
581 static int misalign_cnt
= 0;
583 if (((uintptr_t)lp
& (uintptr_t)(MUTEX_ALIGN
- 1)) &&
584 (misalign_cnt
< MUTEX_ALIGN_WARNINGS
)) {
586 * The mutex is not aligned and may cross a cache line.
587 * This is not supported and may cause a panic.
588 * Show a warning that the mutex is not aligned
589 * and attempt to identify the origin.
590 * Unaligned mutexes are not (supposed to be)
596 funcname
= modgetsymname((uintptr_t)caller(), &offset
);
597 cmn_err(CE_WARN
, "mutex_init: %p is not %d byte "
598 "aligned; caller %s+%lx in module %s. "
599 "This is unsupported and may cause a panic. "
600 "Please report this to the kernel module supplier.",
601 (void *)lp
, MUTEX_ALIGN
,
602 funcname
? funcname
: "unknown", offset
,
603 mod_containing_pc(caller()));
605 if (misalign_cnt
>= MUTEX_ALIGN_WARNINGS
) {
606 cmn_err(CE_WARN
, "mutex_init: further unaligned"
607 " mutex warnings will be suppressed.");
610 #endif /* MUTEX_ALIGN */
611 ASSERT(type
!= MUTEX_SPIN
);
613 MUTEX_SET_TYPE(lp
, MUTEX_ADAPTIVE
);
614 MUTEX_CLEAR_LOCK_AND_WAITERS(lp
);
619 mutex_destroy(kmutex_t
*mp
)
621 mutex_impl_t
*lp
= (mutex_impl_t
*)mp
;
623 if (lp
->m_owner
== 0 && !MUTEX_HAS_WAITERS(lp
)) {
625 } else if (MUTEX_TYPE_SPIN(lp
)) {
626 LOCKSTAT_RECORD0(LS_MUTEX_DESTROY_RELEASE
, lp
);
628 } else if (MUTEX_TYPE_ADAPTIVE(lp
)) {
629 LOCKSTAT_RECORD0(LS_MUTEX_DESTROY_RELEASE
, lp
);
630 if (MUTEX_OWNER(lp
) != curthread
)
631 mutex_panic("mutex_destroy: not owner", lp
);
632 if (MUTEX_HAS_WAITERS(lp
)) {
633 turnstile_t
*ts
= turnstile_lookup(lp
);
636 mutex_panic("mutex_destroy: has waiters", lp
);
640 mutex_panic("mutex_destroy: bad mutex", lp
);
645 * Simple C support for the cases where spin locks miss on the first try.
648 lock_set_spin(lock_t
*lp
)
651 uint_t backoff
= 0; /* current backoff */
652 hrtime_t spin_time
= 0; /* how long we spun */
658 panic("lock_set: %p lock held and only one CPU", (void *)lp
);
660 spin_time
= LOCKSTAT_START_TIME(LS_LOCK_SET_SPIN
);
662 while (LOCK_HELD(lp
) || !lock_spin_try(lp
)) {
667 if (ncpus_online
== loop_count
) {
668 backoff
= mutex_lock_backoff(0);
671 backoff
= mutex_lock_backoff(backoff
);
673 mutex_lock_delay(backoff
);
676 LOCKSTAT_RECORD_TIME(LS_LOCK_SET_SPIN
, lp
, spin_time
);
678 LOCKSTAT_RECORD0(LS_LOCK_SET_ACQUIRE
, lp
);
682 lock_set_spl_spin(lock_t
*lp
, int new_pil
, ushort_t
*old_pil_addr
, int old_pil
)
685 uint_t backoff
= 0; /* current backoff */
686 hrtime_t spin_time
= 0; /* how long we spun */
692 panic("lock_set_spl: %p lock held and only one CPU",
695 ASSERT(new_pil
> LOCK_LEVEL
);
697 spin_time
= LOCKSTAT_START_TIME(LS_LOCK_SET_SPL_SPIN
);
701 while (LOCK_HELD(lp
)) {
705 *old_pil_addr
= (ushort_t
)splr(new_pil
);
708 if (ncpus_online
== loop_count
) {
709 backoff
= mutex_lock_backoff(0);
712 backoff
= mutex_lock_backoff(backoff
);
714 mutex_lock_delay(backoff
);
716 old_pil
= splr(new_pil
);
717 } while (!lock_spin_try(lp
));
719 *old_pil_addr
= (ushort_t
)old_pil
;
721 LOCKSTAT_RECORD_TIME(LS_LOCK_SET_SPL_SPIN
, lp
, spin_time
);
723 LOCKSTAT_RECORD0(LS_LOCK_SET_SPL_ACQUIRE
, lp
);