/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
 */
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/sobject.h>
#include <sys/turnstile.h>
#include <sys/rwlock.h>
#include <sys/rwlock_impl.h>
#include <sys/atomic.h>
#include <sys/lockstat.h>
/*
 * Big Theory Statement for readers/writer locking primitives.
 *
 * An rwlock provides exclusive access to a single thread ("writer") or
 * concurrent access to multiple threads ("readers").  See rwlock(9F)
 * for a full description of the interfaces and programming model.
 * The rest of this comment describes the implementation.
 *
 * An rwlock is a single word with the following structure:
 *
 *	---------------------------------------------------------------------
 *	| OWNER (writer) or HOLD COUNT (readers)   | WRLOCK | WRWANT | WAIT |
 *	---------------------------------------------------------------------
 *
 * The waiters bit (0) indicates whether any threads are blocked waiting
 * for the lock.  The write-wanted bit (1) indicates whether any threads
 * are blocked waiting for write access.  The write-locked bit (2) indicates
 * whether the lock is held by a writer, which determines whether the upper
 * bits (3..31 in ILP32, 3..63 in LP64) should be interpreted as the owner
 * (thread pointer) or the hold count (number of readers).
 *
 * In the absence of any contention, a writer gets the lock by setting
 * this word to (curthread | RW_WRITE_LOCKED); a reader gets the lock
 * by incrementing the hold count (i.e. adding 8, aka RW_READ_LOCK).
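 *
 * For illustration only (this trace is an editorial addition, not part of
 * the implementation), the bit assignments above mean an uncontended lock
 * word takes on values such as:
 *
 *	0					lock is free
 *	curthread | RW_WRITE_LOCKED		held by one writer
 *	3 * RW_READ_LOCK	(0x18)		held by three readers
 *	3 * RW_READ_LOCK + 1	(0x19)		three readers, waiters present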
 *
 * A writer will fail to acquire the lock if any other thread owns it.
 * A reader will fail if the lock is either owned (in the RW_READER and
 * RW_READER_STARVEWRITER cases) or wanted by a writer (in the RW_READER
 * case).  rw_tryenter() returns 0 in these cases; rw_enter() blocks until
 * the lock becomes available.
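 *
 * As a usage sketch (the lock name and error handling here are hypothetical,
 * not part of this file), a caller that cannot afford to block might do:
 *
 *	if (rw_tryenter(&foo_lock, RW_READER) == 0)
 *		return (EAGAIN);	(lock owned or wanted by a writer)
 *	... read the data protected by foo_lock ...
 *	rw_exit(&foo_lock);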
74 * When a thread blocks it acquires the rwlock's hashed turnstile lock and
75 * attempts to set RW_HAS_WAITERS (and RW_WRITE_WANTED in the writer case)
76 * atomically *only if the lock still appears busy*. A thread must never
77 * accidentally block for an available lock since there would be no owner
78 * to awaken it. casip() provides the required atomicity. Once casip()
79 * succeeds, the decision to block becomes final and irreversible. The
80 * thread will not become runnable again until it has been granted ownership
81 * of the lock via direct handoff from a former owner as described below.
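 *
 * A minimal sketch of that protocol (the real loop, with backoff and
 * sanity checks, is in rw_enter_sleep() below):
 *
 *	ts = turnstile_lookup(lp);	(acquires the turnstile chain lock)
 *	do {
 *		if (((old = lp->rw_wwwh) & lock_busy) == 0)
 *			break;		(lock went free; do not block)
 *		new = old | lock_wait;
 *	} while (old != new && casip(&lp->rw_wwwh, old, new) != old);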
 *
 * In the absence of any waiters, rw_exit() just clears the lock (if it
 * is write-locked) or decrements the hold count (if it is read-locked).
 * Note that even if waiters are present, decrementing the hold count
 * to a non-zero value requires no special action since the lock is still
 * held by at least one other thread.
 *
 * On the "final exit" (transition to unheld state) of a lock with waiters,
 * rw_exit_wakeup() grabs the turnstile lock and transfers ownership directly
 * to the next writer or set of readers.  There are several advantages to this
 * approach: (1) it closes all windows for priority inversion (when a new
 * writer has grabbed the lock but has not yet inherited from blocked readers);
 * (2) it prevents starvation of equal-priority threads by granting the lock
 * in FIFO order; (3) it eliminates the need for a write-wanted count -- a
 * single bit suffices because the lock remains held until all waiting
 * writers are gone; (4) when we awaken N readers we can perform a single
 * "atomic_add(&x, N)" to set the total hold count rather than having all N
 * threads fight for the cache to perform an "atomic_add(&x, 1)" upon wakeup.
 *
 * The most interesting policy decision in rw_exit_wakeup() is which thread
 * to wake.  Starvation is always possible with priority-based scheduling,
 * but any sane wakeup policy should at least satisfy these requirements:
 *
 * (1) The highest-priority thread in the system should not starve.
 * (2) The highest-priority writer should not starve.
 * (3) No writer should starve due to lower-priority threads.
 * (4) No reader should starve due to lower-priority writers.
 * (5) If all threads have equal priority, none of them should starve.
 *
 * We used to employ a writers-always-win policy, which doesn't even
 * satisfy (1): a steady stream of low-priority writers can starve out
 * a real-time reader!  This is clearly a broken policy -- it violates
 * (1), (4), and (5) -- but it's how rwlocks always used to behave.
 *
 * A round-robin policy (exiting readers grant the lock to blocked writers
 * and vice versa) satisfies all but (3): a single high-priority writer
 * and many low-priority readers can starve out medium-priority writers.
 *
 * A strict priority policy (grant the lock to the highest priority blocked
 * thread) satisfies everything but (2): a steady stream of high-priority
 * readers can permanently starve the highest-priority writer.
 *
 * The reason we care about (2) is that it's important to process writers
 * reasonably quickly -- even if they're low priority -- because their very
 * presence causes all readers to take the slow (blocking) path through this
 * code.  There is also a general sense that writers deserve some degree of
 * deference because they're updating the data upon which all readers act.
 * Presumably this data should not be allowed to become arbitrarily stale
 * due to writer starvation.  Finally, it seems reasonable to level the
 * playing field a bit to compensate for the fact that it's so much harder
 * for a writer to get in when there are already many readers present.
 *
 * A hybrid of round-robin and strict priority can be made to satisfy
 * all five criteria.  In this "writer priority policy" exiting readers
 * always grant the lock to waiting writers, but exiting writers only
 * grant the lock to readers of the same or higher priority than the
 * highest-priority blocked writer.  Thus requirement (2) is satisfied,
 * necessarily, by a willful act of priority inversion: an exiting reader
 * will grant the lock to a blocked writer even if there are blocked
 * readers of higher priority.  The situation is mitigated by the fact
 * that writers always inherit priority from blocked readers, and the
 * writer will awaken those readers as soon as it exits the lock.
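 *
 * In rough pseudocode (the actual logic is rw_readers_to_wake() below),
 * an exiting writer decides whom to wake as follows:
 *
 *	wpri = priority of the highest-priority blocked writer (-1 if none)
 *	nreaders = number of blocked readers, taken in priority order,
 *	    whose priority is at least wpri
 *	if (nreaders > 0)
 *		wake those nreaders readers at once
 *	else
 *		hand the lock directly to the highest-priority blocked writer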
 *
 * Finally, note that this hybrid scheme -- and indeed, any scheme that
 * satisfies requirement (2) -- has an important consequence: if a lock is
 * held as reader and a writer subsequently becomes blocked, any further
 * readers must be blocked to avoid writer starvation.  This implementation
 * detail has ramifications for the semantics of rwlocks, as it prohibits
 * recursively acquiring an rwlock as reader: any writer that wishes to
 * acquire the lock after the first but before the second acquisition as
 * reader will block the second acquisition -- resulting in deadlock.  This
 * itself is not necessarily prohibitive, as it is often straightforward to
 * prevent a single thread from recursively acquiring an rwlock as reader.
 * However, a more subtle situation arises when both a traditional mutex and
 * a reader lock are acquired by two different threads in opposite order.
 * (That is, one thread first acquires the mutex and then the rwlock as
 * reader; the other acquires the rwlock as reader and then the mutex.)  As
 * with the single-threaded case, this is fine absent a blocked writer: the
 * thread that acquires the mutex before acquiring the rwlock as reader will
 * be able to successfully acquire the rwlock -- even as/if the other thread
 * has the rwlock as reader and is blocked on the held mutex.  However, if
 * an unrelated writer (that is, a third thread) becomes blocked on the
 * rwlock after the first thread acquires the rwlock as reader but before
 * it's able to acquire the mutex, the second thread -- with the mutex held
 * -- will not be able to acquire the rwlock as reader due to the waiting
 * writer, deadlocking the three threads.  Unlike the single-threaded
 * (recursive) rwlock acquisition case, this case can be quite a bit
 * thornier to fix, especially as there is nothing inherently wrong in the
 * locking strategy: the deadlock is really induced by requirement (2), not
 * the consumers of the rwlock.  To permit such consumers, we allow rwlock
 * acquirers to explicitly opt out of requirement (2) by specifying
 * RW_READER_STARVEWRITER when acquiring the rwlock.  This (obviously) means
 * that infinite readers can starve writers, but it also allows for
 * multiple readers in the presence of other synchronization primitives
 * without regard for lock-ordering.  And while certainly odd (and perhaps
 * unwise), RW_READER_STARVEWRITER can be safely used alongside RW_READER on
 * the same lock -- RW_READER_STARVEWRITER describes only the act of lock
 * acquisition with respect to waiting writers, not the lock itself.
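 *
 * As a hedged illustration of that opt-out (thread and lock names here are
 * hypothetical, not part of this file), consider the opposite-order case
 * described above:
 *
 *	Thread A:	mutex_enter(&m);
 *			rw_enter(&rw, RW_READER_STARVEWRITER);
 *	Thread B:	rw_enter(&rw, RW_READER);
 *			mutex_enter(&m);
 *
 * If a third thread blocks on the rwlock as RW_WRITER while thread B holds
 * it as reader and waits for the mutex, thread A's read acquisition ignores
 * the waiting writer and still succeeds, so the three-way deadlock described
 * above cannot form.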
 *
 * rw_downgrade() follows the same wakeup policy as an exiting writer.
 *
 * rw_tryupgrade() has the same failure mode as rw_tryenter() for a
 * write lock.  Both honor the WRITE_WANTED bit by specification.
 *
 * The following rules apply to manipulation of rwlock internal state:
 *
 * (1) The rwlock is only modified via the atomic primitives casip()
 *     and atomic_add_ip().
 *
 * (2) The waiters bit and write-wanted bit are only modified under
 *     turnstile_lookup().  This ensures that the turnstile is consistent
 *     with the rwlock.
 *
 * (3) Waiters receive the lock by direct handoff from the previous
 *     owner.  Therefore, waiters *always* wake up holding the lock.
 */
200 * The sobj_ops vector exports a set of functions needed when a thread
201 * is asleep on a synchronization object of a given type.
203 static sobj_ops_t rw_sobj_ops
= {
204 SOBJ_RWLOCK
, rw_owner
, turnstile_stay_asleep
, turnstile_change_pri
/*
 * If the system panics on an rwlock, save the address of the offending
 * rwlock in panic_rwlock_addr, and save the contents in panic_rwlock.
 */
static rwlock_impl_t panic_rwlock;
static rwlock_impl_t *panic_rwlock_addr;

static void
rw_panic(char *msg, rwlock_impl_t *lp)
{
	if (atomic_cas_ptr(&panic_rwlock_addr, NULL, lp) == NULL)
		panic_rwlock = *lp;

	panic("%s, lp=%p wwwh=%lx thread=%p",
	    msg, (void *)lp, panic_rwlock.rw_wwwh, (void *)curthread);
}
void
rw_init(krwlock_t *rwlp, char *name, krw_type_t type, void *arg)
{
	((rwlock_impl_t *)rwlp)->rw_wwwh = 0;
}
void
rw_destroy(krwlock_t *rwlp)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;

	if (lp->rw_wwwh != 0) {
		if ((lp->rw_wwwh & RW_DOUBLE_LOCK) == RW_DOUBLE_LOCK)
			rw_panic("rw_destroy: lock already destroyed", lp);
		else
			rw_panic("rw_destroy: lock still active", lp);
	}

	lp->rw_wwwh = RW_DOUBLE_LOCK;
}
/*
 * Verify that an rwlock is held correctly.
 */
static int
rw_locked(rwlock_impl_t *lp, krw_t rw)
{
	uintptr_t old = lp->rw_wwwh;

	if (rw == RW_READER || rw == RW_READER_STARVEWRITER)
		return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED));

	if (rw == RW_WRITER)
		return ((old & RW_OWNER) == (uintptr_t)curthread);

	return (0);
}
uint_t (*rw_lock_backoff)(uint_t) = NULL;
void (*rw_lock_delay)(uint_t) = NULL;
/*
 * Full-service implementation of rw_enter() to handle all the hard cases.
 * Called from the assembly version if anything complicated is going on.
 * The only semantic difference between calling rw_enter() and calling
 * rw_enter_sleep() directly is that we assume the caller has already done
 * a THREAD_KPRI_REQUEST() in the RW_READER cases.
 */
void
rw_enter_sleep(rwlock_impl_t *lp, krw_t rw)
{
	uintptr_t old, new, lock_value, lock_busy, lock_wait;
	hrtime_t sleep_time;
	turnstile_t *ts;
	uint_t backoff = 0;
	int loop_count = 0;

	if (rw == RW_READER) {
		lock_value = RW_READ_LOCK;
		lock_busy = RW_WRITE_CLAIMED;
		lock_wait = RW_HAS_WAITERS;
	} else if (rw == RW_READER_STARVEWRITER) {
		lock_value = RW_READ_LOCK;
		lock_busy = RW_WRITE_LOCKED;
		lock_wait = RW_HAS_WAITERS;
	} else {
		lock_value = RW_WRITE_LOCK(curthread);
		lock_busy = (uintptr_t)RW_LOCKED;
		lock_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
	}

	for (;;) {
		if (((old = lp->rw_wwwh) & lock_busy) == 0) {
			if (casip(&lp->rw_wwwh, old, old + lock_value) != old) {
				if (rw_lock_delay != NULL) {
					backoff = rw_lock_backoff(backoff);
					rw_lock_delay(backoff);
					if (++loop_count == ncpus_online) {
						backoff = 0;
						loop_count = 0;
					}
				}
				continue;
			}
			break;
		}

		if ((old & RW_DOUBLE_LOCK) == RW_DOUBLE_LOCK) {
			rw_panic("rw_enter: bad rwlock", lp);
			return;
		}

		if ((old & RW_OWNER) == (uintptr_t)curthread) {
			rw_panic("recursive rw_enter", lp);
			return;
		}

		ts = turnstile_lookup(lp);

		do {
			if (((old = lp->rw_wwwh) & lock_busy) == 0)
				break;
			new = old | lock_wait;
		} while (old != new && casip(&lp->rw_wwwh, old, new) != old);

		if ((old & lock_busy) == 0) {
			/*
			 * The lock appears free now; try the dance again
			 */
			turnstile_exit(lp);
			continue;
		}

		/*
		 * We really are going to block.  Bump the stats, and drop
		 * kpri if we're a reader.
		 */
		ASSERT(lp->rw_wwwh & lock_wait);
		ASSERT(lp->rw_wwwh & RW_LOCKED);

		sleep_time = -gethrtime();
		if (rw != RW_WRITER) {
			THREAD_KPRI_RELEASE();
			CPU_STATS_ADDQ(CPU, sys, rw_rdfails, 1);
			(void) turnstile_block(ts, TS_READER_Q, lp,
			    &rw_sobj_ops, NULL, NULL);
		} else {
			CPU_STATS_ADDQ(CPU, sys, rw_wrfails, 1);
			(void) turnstile_block(ts, TS_WRITER_Q, lp,
			    &rw_sobj_ops, NULL, NULL);
		}
		sleep_time += gethrtime();

		LOCKSTAT_RECORD4(LS_RW_ENTER_BLOCK, lp, sleep_time, rw,
		    (old & RW_WRITE_LOCKED) ? 1 : 0,
		    old >> RW_HOLD_COUNT_SHIFT);

		/*
		 * We wake up holding the lock (and having kpri if we're
		 * a reader) via direct handoff from the previous owner.
		 */
		break;
	}

	ASSERT(rw_locked(lp, rw));

	membar_enter();

	LOCKSTAT_RECORD(LS_RW_ENTER_ACQUIRE, lp, rw);
}
/*
 * Return the number of readers to wake, or zero if we should wake a writer.
 * Called only by exiting/downgrading writers (readers don't wake readers).
 */
static int
rw_readers_to_wake(turnstile_t *ts)
{
	kthread_t *next_writer = ts->ts_sleepq[TS_WRITER_Q].sq_first;
	kthread_t *next_reader = ts->ts_sleepq[TS_READER_Q].sq_first;
	pri_t wpri = (next_writer != NULL) ? DISP_PRIO(next_writer) : -1;
	int count = 0;

	while (next_reader != NULL) {
		if (DISP_PRIO(next_reader) < wpri)
			break;
		next_reader->t_kpri_req++;
		next_reader = next_reader->t_link;
		count++;
	}

	return (count);
}
/*
 * Full-service implementation of rw_exit() to handle all the hard cases.
 * Called from the assembly version if anything complicated is going on.
 * There is no semantic difference between calling rw_exit() and calling
 * rw_exit_wakeup() directly.
 */
void
rw_exit_wakeup(rwlock_impl_t *lp)
{
	turnstile_t *ts;
	uintptr_t old, new, lock_value;
	kthread_t *next_writer;
	int nreaders;
	uint_t backoff = 0;
	int loop_count = 0;

	membar_exit();

	old = lp->rw_wwwh;
	if (old & RW_WRITE_LOCKED) {
		if ((old & RW_OWNER) != (uintptr_t)curthread) {
			rw_panic("rw_exit: not owner", lp);
			return;
		}
		lock_value = RW_WRITE_LOCK(curthread);
	} else {
		if ((old & RW_LOCKED) == 0) {
			rw_panic("rw_exit: lock not held", lp);
			return;
		}
		lock_value = RW_READ_LOCK;
	}

	for (;;) {
		/*
		 * If this is *not* the final exit of a lock with waiters,
		 * just drop the lock -- there's nothing tricky going on.
		 */
		old = lp->rw_wwwh;
		new = old - lock_value;
		if ((new & (RW_LOCKED | RW_HAS_WAITERS)) != RW_HAS_WAITERS) {
			if (casip(&lp->rw_wwwh, old, new) != old) {
				if (rw_lock_delay != NULL) {
					backoff = rw_lock_backoff(backoff);
					rw_lock_delay(backoff);
					if (++loop_count == ncpus_online) {
						backoff = 0;
						loop_count = 0;
					}
				}
				continue;
			}
			break;
		}

		/*
		 * This appears to be the final exit of a lock with waiters.
		 * If we do not have the lock as writer (that is, if this is
		 * the last exit of a reader with waiting writers), we will
		 * grab the lock as writer to prevent additional readers.
		 * (This is required because a reader that is acquiring the
		 * lock via RW_READER_STARVEWRITER will not observe the
		 * RW_WRITE_WANTED bit -- and we could therefore be racing
		 * with such readers here.)
		 */
		if (!(old & RW_WRITE_LOCKED)) {
			new = RW_WRITE_LOCK(curthread) |
			    RW_HAS_WAITERS | RW_WRITE_WANTED;

			if (casip(&lp->rw_wwwh, old, new) != old)
				continue;
		}

		/*
		 * Perform the final exit of a lock that has waiters.
		 */
		ts = turnstile_lookup(lp);

		next_writer = ts->ts_sleepq[TS_WRITER_Q].sq_first;

		if ((old & RW_WRITE_LOCKED) &&
		    (nreaders = rw_readers_to_wake(ts)) > 0) {
			/*
			 * Don't drop the lock -- just set the hold count
			 * such that we grant the lock to all readers at once.
			 */
			new = nreaders * RW_READ_LOCK;
			if (ts->ts_waiters > nreaders)
				new |= RW_HAS_WAITERS;
			if (next_writer)
				new |= RW_WRITE_WANTED;
			lp->rw_wwwh = new;
			membar_enter();
			turnstile_wakeup(ts, TS_READER_Q, nreaders, NULL);
		} else {
			/*
			 * Don't drop the lock -- just transfer ownership
			 * directly to next_writer.  Note that there must
			 * be at least one waiting writer, because we get
			 * here only if (A) the lock is read-locked or
			 * (B) there are no waiting readers.  In case (A),
			 * since the lock is read-locked there would be no
			 * reason for other readers to have blocked unless
			 * the RW_WRITE_WANTED bit was set.  In case (B),
			 * since there are waiters but no waiting readers,
			 * they must all be waiting writers.
			 */
			ASSERT(lp->rw_wwwh & RW_WRITE_WANTED);
			new = RW_WRITE_LOCK(next_writer);
			if (ts->ts_waiters > 1)
				new |= RW_HAS_WAITERS;
			if (next_writer->t_link)
				new |= RW_WRITE_WANTED;
			lp->rw_wwwh = new;
			membar_enter();
			turnstile_wakeup(ts, TS_WRITER_Q, 1, next_writer);
		}
		break;
	}

	if (lock_value == RW_READ_LOCK) {
		THREAD_KPRI_RELEASE();
		LOCKSTAT_RECORD(LS_RW_EXIT_RELEASE, lp, RW_READER);
	} else {
		LOCKSTAT_RECORD(LS_RW_EXIT_RELEASE, lp, RW_WRITER);
	}
}
int
rw_tryenter(krwlock_t *rwlp, krw_t rw)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;
	uintptr_t old;

	if (rw != RW_WRITER) {
		uint_t backoff = 0;
		int loop_count = 0;
		THREAD_KPRI_REQUEST();
		for (;;) {
			if ((old = lp->rw_wwwh) & (rw == RW_READER ?
			    RW_WRITE_CLAIMED : RW_WRITE_LOCKED)) {
				THREAD_KPRI_RELEASE();
				return (0);
			}
			if (casip(&lp->rw_wwwh, old, old + RW_READ_LOCK) == old)
				break;
			if (rw_lock_delay != NULL) {
				backoff = rw_lock_backoff(backoff);
				rw_lock_delay(backoff);
				if (++loop_count == ncpus_online) {
					backoff = 0;
					loop_count = 0;
				}
			}
		}
		LOCKSTAT_RECORD(LS_RW_TRYENTER_ACQUIRE, lp, rw);
	} else {
		if (casip(&lp->rw_wwwh, 0, RW_WRITE_LOCK(curthread)) != 0)
			return (0);
		LOCKSTAT_RECORD(LS_RW_TRYENTER_ACQUIRE, lp, rw);
	}
	ASSERT(rw_locked(lp, rw));
	return (1);
}
void
rw_downgrade(krwlock_t *rwlp)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;

	THREAD_KPRI_REQUEST();
	membar_exit();

	if ((lp->rw_wwwh & RW_OWNER) != (uintptr_t)curthread) {
		rw_panic("rw_downgrade: not owner", lp);
		return;
	}

	if (atomic_add_ip_nv(&lp->rw_wwwh,
	    RW_READ_LOCK - RW_WRITE_LOCK(curthread)) & RW_HAS_WAITERS) {
		turnstile_t *ts = turnstile_lookup(lp);
		int nreaders = rw_readers_to_wake(ts);

		if (nreaders > 0) {
			uintptr_t delta = nreaders * RW_READ_LOCK;
			if (ts->ts_waiters == nreaders)
				delta -= RW_HAS_WAITERS;
			atomic_add_ip(&lp->rw_wwwh, delta);
		}

		turnstile_wakeup(ts, TS_READER_Q, nreaders, NULL);
	}
	ASSERT(rw_locked(lp, RW_READER));
	LOCKSTAT_RECORD0(LS_RW_DOWNGRADE_DOWNGRADE, lp);
}
int
rw_tryupgrade(krwlock_t *rwlp)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;
	uintptr_t old, new;

	ASSERT(rw_locked(lp, RW_READER));

	do {
		if (((old = lp->rw_wwwh) & ~RW_HAS_WAITERS) != RW_READ_LOCK)
			return (0);
		new = old + RW_WRITE_LOCK(curthread) - RW_READ_LOCK;
	} while (casip(&lp->rw_wwwh, old, new) != old);

	membar_enter();
	THREAD_KPRI_RELEASE();
	LOCKSTAT_RECORD0(LS_RW_TRYUPGRADE_UPGRADE, lp);
	ASSERT(rw_locked(lp, RW_WRITER));
	return (1);
}
int
rw_read_held(krwlock_t *rwlp)
{
	uintptr_t tmp;

	return (_RW_READ_HELD(rwlp, tmp));
}

int
rw_write_held(krwlock_t *rwlp)
{
	return (_RW_WRITE_HELD(rwlp));
}

int
rw_lock_held(krwlock_t *rwlp)
{
	return (_RW_LOCK_HELD(rwlp));
}

/*
 * Like rw_read_held(), but ASSERTs that the lock is currently held
 */
int
rw_read_locked(krwlock_t *rwlp)
{
	uintptr_t old = ((rwlock_impl_t *)rwlp)->rw_wwwh;

	ASSERT(old & RW_LOCKED);
	return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED));
}

/*
 * Returns non-zero if the lock is either held or desired by a writer
 */
int
rw_iswriter(krwlock_t *rwlp)
{
	return (_RW_ISWRITER(rwlp));
}

kthread_t *
rw_owner(krwlock_t *rwlp)
{
	uintptr_t old = ((rwlock_impl_t *)rwlp)->rw_wwwh;

	return ((old & RW_WRITE_LOCKED) ? (kthread_t *)(old & RW_OWNER) : NULL);
}