kernel/syscall/lwp_sobj.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 * Copyright 2015 Joyent, Inc.
28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 /* All Rights Reserved */
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/user.h>
37 #include <sys/errno.h>
38 #include <sys/file.h>
39 #include <sys/proc.h>
40 #include <sys/prsystm.h>
41 #include <sys/kmem.h>
42 #include <sys/sobject.h>
43 #include <sys/fault.h>
44 #include <sys/procfs.h>
45 #include <sys/watchpoint.h>
46 #include <sys/time.h>
47 #include <sys/cmn_err.h>
48 #include <sys/machlock.h>
49 #include <sys/debug.h>
50 #include <sys/synch.h>
51 #include <sys/synch32.h>
52 #include <sys/mman.h>
53 #include <sys/class.h>
54 #include <sys/schedctl.h>
55 #include <sys/sleepq.h>
56 #include <sys/policy.h>
57 #include <sys/tnf_probe.h>
58 #include <sys/lwpchan_impl.h>
59 #include <sys/turnstile.h>
60 #include <sys/atomic.h>
61 #include <sys/lwp_timer_impl.h>
62 #include <sys/lwp_upimutex_impl.h>
63 #include <vm/as.h>
64 #include <sys/sdt.h>
66 static kthread_t *lwpsobj_owner(caddr_t);
67 static void lwp_unsleep(kthread_t *t);
68 static void lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip);
69 static void lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg);
70 static void lwp_mutex_unregister(void *uaddr);
71 static void set_owner_pid(lwp_mutex_t *, uintptr_t, pid_t);
72 static int iswanted(kthread_t *, lwpchan_t *);
74 extern int lwp_cond_signal(lwp_cond_t *cv);
77 * Maximum number of user prio inheritance locks that can be held by a thread.
78 * Used to limit kmem for each thread. This is a per-thread limit that
79 * can be administered on a system-wide basis (using /etc/system).
81 * Also, if a limit (say, maxlwps) is placed on the number of lwps within a
82 * process, the per-thread limit automatically implies a process-wide limit
83 * on the number of upi locks held within that process:
84 * maxheldupimx = maxnestupimx * maxlwps;
86 static uint32_t maxnestupimx = 2000;
89 * The sobj_ops vector exports a set of functions needed when a thread
90 * is asleep on a synchronization object of this type.
92 static sobj_ops_t lwp_sobj_ops = {
93 SOBJ_USER, lwpsobj_owner, lwp_unsleep, lwp_change_pri
96 static kthread_t *lwpsobj_pi_owner(upimutex_t *up);
98 static sobj_ops_t lwp_sobj_pi_ops = {
99 SOBJ_USER_PI, lwpsobj_pi_owner, turnstile_unsleep,
100 turnstile_change_pri
103 static sleepq_head_t lwpsleepq[NSLEEPQ];
104 upib_t upimutextab[UPIMUTEX_TABSIZE];
106 #define LWPCHAN_LOCK_SHIFT 10 /* 1024 locks for each pool */
107 #define LWPCHAN_LOCK_SIZE (1 << LWPCHAN_LOCK_SHIFT)
110 * We know that both lc_wchan and lc_wchan0 are addresses that most
111 * likely are 8-byte aligned, so we shift off the low-order 3 bits.
112 * 'pool' is either 0 or 1.
114 #define LWPCHAN_LOCK_HASH(X, pool) \
115 (((((X) >> 3) ^ ((X) >> (LWPCHAN_LOCK_SHIFT + 3))) & \
116 (LWPCHAN_LOCK_SIZE - 1)) + ((pool)? LWPCHAN_LOCK_SIZE : 0))
118 static kmutex_t lwpchanlock[2 * LWPCHAN_LOCK_SIZE];
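/*
 * Editor's illustrative sketch (not part of this file): the hashing done by
 * LWPCHAN_LOCK_HASH() above, reproduced as a standalone user-level program
 * so the index computation can be experimented with. Names prefixed with
 * "sketch_" are hypothetical; only the arithmetic mirrors the kernel macro.
 */
#include <stdint.h>
#include <stdio.h>

#define	SKETCH_LOCK_SHIFT	10			/* 1024 locks per pool */
#define	SKETCH_LOCK_SIZE	(1 << SKETCH_LOCK_SHIFT)

static unsigned int
sketch_lock_hash(uintptr_t x, int pool)
{
	/* drop the low 3 bits (8-byte alignment), fold, mask to pool size */
	return ((((x >> 3) ^ (x >> (SKETCH_LOCK_SHIFT + 3))) &
	    (SKETCH_LOCK_SIZE - 1)) + (pool ? SKETCH_LOCK_SIZE : 0));
}

int
main(void)
{
	/* x is the xor of lc_wchan and lc_wchan0, as in lwpchan_lock() */
	uintptr_t x = (uintptr_t)0x7fffbeef5a40 ^ (uintptr_t)0x1000;

	printf("pool 0 index: %u\n", sketch_lock_hash(x, 0));	/* 0..1023 */
	printf("pool 1 index: %u\n", sketch_lock_hash(x, 1));	/* 1024..2047 */
	return (0);
}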
121 * Is this a POSIX threads user-level lock requiring priority inheritance?
123 #define UPIMUTEX(type) ((type) & LOCK_PRIO_INHERIT)
125 static sleepq_head_t *
126 lwpsqhash(lwpchan_t *lwpchan)
128 uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
129 return (&lwpsleepq[SQHASHINDEX(x)]);
133 * Lock an lwpchan.
134 * Keep this in sync with lwpchan_unlock(), below.
136 static void
137 lwpchan_lock(lwpchan_t *lwpchan, int pool)
139 uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
140 mutex_enter(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
144 * Unlock an lwpchan.
145 * Keep this in sync with lwpchan_lock(), above.
147 static void
148 lwpchan_unlock(lwpchan_t *lwpchan, int pool)
150 uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
151 mutex_exit(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
155 * Delete mappings from the lwpchan cache for pages that are being
156 * unmapped by as_unmap(). Given a range of addresses, "start" to "end",
157 * all mappings within the range are deleted from the lwpchan cache.
159 void
160 lwpchan_delete_mapping(proc_t *p, caddr_t start, caddr_t end)
162 lwpchan_data_t *lcp;
163 lwpchan_hashbucket_t *hashbucket;
164 lwpchan_hashbucket_t *endbucket;
165 lwpchan_entry_t *ent;
166 lwpchan_entry_t **prev;
167 caddr_t addr;
169 mutex_enter(&p->p_lcp_lock);
170 lcp = p->p_lcp;
171 hashbucket = lcp->lwpchan_cache;
172 endbucket = hashbucket + lcp->lwpchan_size;
173 for (; hashbucket < endbucket; hashbucket++) {
174 if (hashbucket->lwpchan_chain == NULL)
175 continue;
176 mutex_enter(&hashbucket->lwpchan_lock);
177 prev = &hashbucket->lwpchan_chain;
178 /* check entire chain */
179 while ((ent = *prev) != NULL) {
180 addr = ent->lwpchan_addr;
181 if (start <= addr && addr < end) {
182 *prev = ent->lwpchan_next;
184 * We do this only for the obsolete type
185 * USYNC_PROCESS_ROBUST. Otherwise robust
186 * locks do not draw ELOCKUNMAPPED or
187 * EOWNERDEAD due to being unmapped.
189 if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
190 (ent->lwpchan_type & USYNC_PROCESS_ROBUST))
191 lwp_mutex_cleanup(ent, LOCK_UNMAPPED);
193 * If there is a user-level robust lock
194 * registration, mark it as invalid.
196 if ((addr = ent->lwpchan_uaddr) != NULL)
197 lwp_mutex_unregister(addr);
198 kmem_free(ent, sizeof (*ent));
199 atomic_dec_32(&lcp->lwpchan_entries);
200 } else {
201 prev = &ent->lwpchan_next;
204 mutex_exit(&hashbucket->lwpchan_lock);
206 mutex_exit(&p->p_lcp_lock);
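/*
 * Editor's illustrative sketch (not part of this file): the unlink idiom
 * used in lwpchan_delete_mapping() above. Walking a singly linked chain
 * through a pointer-to-pointer ("prev") removes matching entries without a
 * special case for the head of the list. All names here are hypothetical
 * user-level stand-ins.
 */
#include <stdlib.h>

struct node {
	struct node	*next;
	char		*addr;
};

static void
remove_range(struct node **headp, char *start, char *end)
{
	struct node **prev = headp;
	struct node *n;

	while ((n = *prev) != NULL) {
		if (start <= n->addr && n->addr < end) {
			*prev = n->next;	/* unlink; prev stays put */
			free(n);
		} else {
			prev = &n->next;	/* step over a kept node */
		}
	}
}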
210 * Given an lwpchan cache pointer and a process virtual address,
211 * return a pointer to the corresponding lwpchan hash bucket.
213 static lwpchan_hashbucket_t *
214 lwpchan_bucket(lwpchan_data_t *lcp, uintptr_t addr)
216 uint_t i;
219 * All user-level sync object addresses are 8-byte aligned.
220 * Ignore the lowest 3 bits of the address and use the
221 * higher-order 2*lwpchan_bits bits for the hash index.
223 addr >>= 3;
224 i = (addr ^ (addr >> lcp->lwpchan_bits)) & lcp->lwpchan_mask;
225 return (lcp->lwpchan_cache + i);
229 * (Re)allocate the per-process lwpchan cache.
231 static void
232 lwpchan_alloc_cache(proc_t *p, uint_t bits)
234 lwpchan_data_t *lcp;
235 lwpchan_data_t *old_lcp;
236 lwpchan_hashbucket_t *hashbucket;
237 lwpchan_hashbucket_t *endbucket;
238 lwpchan_hashbucket_t *newbucket;
239 lwpchan_entry_t *ent;
240 lwpchan_entry_t *next;
241 uint_t count;
243 ASSERT(bits >= LWPCHAN_INITIAL_BITS && bits <= LWPCHAN_MAX_BITS);
245 lcp = kmem_alloc(sizeof (lwpchan_data_t), KM_SLEEP);
246 lcp->lwpchan_bits = bits;
247 lcp->lwpchan_size = 1 << lcp->lwpchan_bits;
248 lcp->lwpchan_mask = lcp->lwpchan_size - 1;
249 lcp->lwpchan_entries = 0;
250 lcp->lwpchan_cache = kmem_zalloc(lcp->lwpchan_size *
251 sizeof (lwpchan_hashbucket_t), KM_SLEEP);
252 lcp->lwpchan_next_data = NULL;
254 mutex_enter(&p->p_lcp_lock);
255 if ((old_lcp = p->p_lcp) != NULL) {
256 if (old_lcp->lwpchan_bits >= bits) {
257 /* someone beat us to it */
258 mutex_exit(&p->p_lcp_lock);
259 kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
260 sizeof (lwpchan_hashbucket_t));
261 kmem_free(lcp, sizeof (lwpchan_data_t));
262 return;
265 * Acquire all of the old hash table locks.
267 hashbucket = old_lcp->lwpchan_cache;
268 endbucket = hashbucket + old_lcp->lwpchan_size;
269 for (; hashbucket < endbucket; hashbucket++)
270 mutex_enter(&hashbucket->lwpchan_lock);
272 * Move all of the old hash table entries to the
273 * new hash table. The new hash table has not yet
274 * been installed so we don't need any of its locks.
276 count = 0;
277 hashbucket = old_lcp->lwpchan_cache;
278 for (; hashbucket < endbucket; hashbucket++) {
279 ent = hashbucket->lwpchan_chain;
280 while (ent != NULL) {
281 next = ent->lwpchan_next;
282 newbucket = lwpchan_bucket(lcp,
283 (uintptr_t)ent->lwpchan_addr);
284 ent->lwpchan_next = newbucket->lwpchan_chain;
285 newbucket->lwpchan_chain = ent;
286 ent = next;
287 count++;
289 hashbucket->lwpchan_chain = NULL;
291 lcp->lwpchan_entries = count;
295 * Retire the old hash table. We can't actually kmem_free() it
296 * now because someone may still have a pointer to it. Instead,
297 * we link it onto the new hash table's list of retired hash tables.
298 * The new hash table is double the size of the previous one, so
299 * the total size of all retired hash tables is less than the size
300 * of the new one. exit() and exec() free the retired hash tables
301 * (see lwpchan_destroy_cache(), below).
303 lcp->lwpchan_next_data = old_lcp;
306 * As soon as we store the new lcp, future locking operations will
307 * use it. Therefore, we must ensure that all the state we've just
308 * established reaches global visibility before the new lcp does.
310 membar_producer();
311 p->p_lcp = lcp;
313 if (old_lcp != NULL) {
315 * Release all of the old hash table locks.
317 hashbucket = old_lcp->lwpchan_cache;
318 for (; hashbucket < endbucket; hashbucket++)
319 mutex_exit(&hashbucket->lwpchan_lock);
321 mutex_exit(&p->p_lcp_lock);
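/*
 * Editor's illustrative sketch (not part of this file): the publication
 * pattern used by lwpchan_alloc_cache() above. The kernel orders the
 * initializing stores ahead of the pointer store with membar_producer();
 * this hypothetical user-level analogue uses a C11 release store, which a
 * reader pairs with an acquire load so every field written before
 * publication is visible once the pointer is observed.
 */
#include <stdatomic.h>
#include <stdlib.h>

struct table {
	unsigned int	size;
	unsigned int	mask;
	void		**slots;
};

static _Atomic(struct table *) current_table;

static int
publish_table(unsigned int size)
{
	struct table *t;

	if ((t = malloc(sizeof (*t))) == NULL)
		return (-1);
	t->size = size;
	t->mask = size - 1;
	if ((t->slots = calloc(size, sizeof (void *))) == NULL) {
		free(t);
		return (-1);
	}
	/* release: all of the stores above become visible first */
	atomic_store_explicit(&current_table, t, memory_order_release);
	return (0);
}

static struct table *
lookup_table(void)
{
	/* acquire: pairs with the release store in publish_table() */
	return (atomic_load_explicit(&current_table, memory_order_acquire));
}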
325 * Deallocate the lwpchan cache, and any dynamically allocated mappings.
326 * Called when the process exits or execs. All lwps except one have
327 * exited so we need no locks here.
329 void
330 lwpchan_destroy_cache(int exec)
332 proc_t *p = curproc;
333 lwpchan_hashbucket_t *hashbucket;
334 lwpchan_hashbucket_t *endbucket;
335 lwpchan_data_t *lcp;
336 lwpchan_entry_t *ent;
337 lwpchan_entry_t *next;
338 uint16_t lockflg;
340 lcp = p->p_lcp;
341 p->p_lcp = NULL;
343 lockflg = exec? LOCK_UNMAPPED : LOCK_OWNERDEAD;
344 hashbucket = lcp->lwpchan_cache;
345 endbucket = hashbucket + lcp->lwpchan_size;
346 for (; hashbucket < endbucket; hashbucket++) {
347 ent = hashbucket->lwpchan_chain;
348 hashbucket->lwpchan_chain = NULL;
349 while (ent != NULL) {
350 next = ent->lwpchan_next;
351 if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
352 (ent->lwpchan_type & (USYNC_PROCESS | LOCK_ROBUST))
353 == (USYNC_PROCESS | LOCK_ROBUST))
354 lwp_mutex_cleanup(ent, lockflg);
355 kmem_free(ent, sizeof (*ent));
356 ent = next;
360 while (lcp != NULL) {
361 lwpchan_data_t *next_lcp = lcp->lwpchan_next_data;
362 kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
363 sizeof (lwpchan_hashbucket_t));
364 kmem_free(lcp, sizeof (lwpchan_data_t));
365 lcp = next_lcp;
370 * Return zero when there is an entry in the lwpchan cache for the
371 * given process virtual address and non-zero when there is not.
372 * The returned non-zero value is the current length of the
373 * hash chain plus one. The caller holds the hash bucket lock.
375 static uint_t
376 lwpchan_cache_mapping(caddr_t addr, int type, int pool, lwpchan_t *lwpchan,
377 lwpchan_hashbucket_t *hashbucket)
379 lwpchan_entry_t *ent;
380 uint_t count = 1;
382 for (ent = hashbucket->lwpchan_chain; ent; ent = ent->lwpchan_next) {
383 if (ent->lwpchan_addr == addr) {
384 if (ent->lwpchan_type != type ||
385 ent->lwpchan_pool != pool) {
387 * This shouldn't happen, but might if the
388 * process reuses its memory for different
389 * types of sync objects. We test first
390 * to avoid grabbing the memory cache line.
392 ent->lwpchan_type = (uint16_t)type;
393 ent->lwpchan_pool = (uint16_t)pool;
395 *lwpchan = ent->lwpchan_lwpchan;
396 return (0);
398 count++;
400 return (count);
404 * Return the cached lwpchan mapping if cached, otherwise insert
405 * a virtual address to lwpchan mapping into the cache.
407 static int
408 lwpchan_get_mapping(struct as *as, caddr_t addr, caddr_t uaddr,
409 int type, lwpchan_t *lwpchan, int pool)
411 proc_t *p = curproc;
412 lwpchan_data_t *lcp;
413 lwpchan_hashbucket_t *hashbucket;
414 lwpchan_entry_t *ent;
415 memid_t memid;
416 uint_t count;
417 uint_t bits;
419 top:
420 /* initialize the lwpchan cache, if necessary */
421 if ((lcp = p->p_lcp) == NULL) {
422 lwpchan_alloc_cache(p, LWPCHAN_INITIAL_BITS);
423 goto top;
425 hashbucket = lwpchan_bucket(lcp, (uintptr_t)addr);
426 mutex_enter(&hashbucket->lwpchan_lock);
427 if (lcp != p->p_lcp) {
428 /* someone resized the lwpchan cache; start over */
429 mutex_exit(&hashbucket->lwpchan_lock);
430 goto top;
432 if (lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket) == 0) {
433 /* it's in the cache */
434 mutex_exit(&hashbucket->lwpchan_lock);
435 return (1);
437 mutex_exit(&hashbucket->lwpchan_lock);
438 if (as_getmemid(as, addr, &memid) != 0)
439 return (0);
440 lwpchan->lc_wchan0 = (caddr_t)(uintptr_t)memid.val[0];
441 lwpchan->lc_wchan = (caddr_t)(uintptr_t)memid.val[1];
442 ent = kmem_alloc(sizeof (lwpchan_entry_t), KM_SLEEP);
443 mutex_enter(&hashbucket->lwpchan_lock);
444 if (lcp != p->p_lcp) {
445 /* someone resized the lwpchan cache; start over */
446 mutex_exit(&hashbucket->lwpchan_lock);
447 kmem_free(ent, sizeof (*ent));
448 goto top;
450 count = lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket);
451 if (count == 0) {
452 /* someone else added this entry to the cache */
453 mutex_exit(&hashbucket->lwpchan_lock);
454 kmem_free(ent, sizeof (*ent));
455 return (1);
457 if (count > lcp->lwpchan_bits + 2 && /* larger table, longer chains */
458 (bits = lcp->lwpchan_bits) < LWPCHAN_MAX_BITS) {
459 /* hash chain too long; reallocate the hash table */
460 mutex_exit(&hashbucket->lwpchan_lock);
461 kmem_free(ent, sizeof (*ent));
462 lwpchan_alloc_cache(p, bits + 1);
463 goto top;
465 ent->lwpchan_addr = addr;
466 ent->lwpchan_uaddr = uaddr;
467 ent->lwpchan_type = (uint16_t)type;
468 ent->lwpchan_pool = (uint16_t)pool;
469 ent->lwpchan_lwpchan = *lwpchan;
470 ent->lwpchan_next = hashbucket->lwpchan_chain;
471 hashbucket->lwpchan_chain = ent;
472 atomic_inc_32(&lcp->lwpchan_entries);
473 mutex_exit(&hashbucket->lwpchan_lock);
474 return (1);
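/*
 * Editor's illustrative sketch (not part of this file): the shape of the
 * insertion path in lwpchan_get_mapping() above. A blocking allocation is
 * never done with the bucket lock held; the entry is allocated first, the
 * lock is reacquired, and the chain (and, in the kernel, the cache pointer
 * itself) is rechecked in case another thread raced in meanwhile. The
 * names below are hypothetical user-level stand-ins.
 */
#include <pthread.h>
#include <stdlib.h>
#include <stdint.h>

struct entry {
	struct entry	*next;
	uintptr_t	key;
	uintptr_t	val;
};

struct bucket {
	pthread_mutex_t	lock;
	struct entry	*chain;
};

static int
cache_insert(struct bucket *b, uintptr_t key, uintptr_t val)
{
	struct entry *e, *new;

	/* the allocation may block, so do it with no locks held */
	if ((new = malloc(sizeof (*new))) == NULL)
		return (-1);

	pthread_mutex_lock(&b->lock);
	for (e = b->chain; e != NULL; e = e->next) {
		if (e->key == key) {
			/* someone else inserted it while we allocated */
			pthread_mutex_unlock(&b->lock);
			free(new);
			return (0);
		}
	}
	new->key = key;
	new->val = val;
	new->next = b->chain;
	b->chain = new;
	pthread_mutex_unlock(&b->lock);
	return (0);
}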
478 * Return a unique pair of identifiers that corresponds to a
479 * synchronization object's virtual address. Process-shared
480 * sync objects usually get vnode/offset from as_getmemid().
482 static int
483 get_lwpchan(struct as *as, caddr_t addr, int type, lwpchan_t *lwpchan, int pool)
486 * If the lwp synch object is defined to be process-private,
487 * we just make the first field of the lwpchan be 'as' and
488 * the second field be the synch object's virtual address.
489 * (segvn_getmemid() does the same for MAP_PRIVATE mappings.)
490 * The lwpchan cache is used only for process-shared objects.
492 if (!(type & USYNC_PROCESS)) {
493 lwpchan->lc_wchan0 = (caddr_t)as;
494 lwpchan->lc_wchan = addr;
495 return (1);
498 return (lwpchan_get_mapping(as, addr, NULL, type, lwpchan, pool));
501 static void
502 lwp_block(lwpchan_t *lwpchan)
504 kthread_t *t = curthread;
505 klwp_t *lwp = ttolwp(t);
506 sleepq_head_t *sqh;
508 thread_lock(t);
509 t->t_flag |= T_WAKEABLE;
510 t->t_lwpchan = *lwpchan;
511 t->t_sobj_ops = &lwp_sobj_ops;
512 t->t_release = 0;
513 sqh = lwpsqhash(lwpchan);
514 disp_lock_enter_high(&sqh->sq_lock);
515 CL_SLEEP(t);
516 DTRACE_SCHED(sleep);
517 THREAD_SLEEP(t, &sqh->sq_lock);
518 sleepq_insert(&sqh->sq_queue, t);
519 thread_unlock(t);
520 lwp->lwp_asleep = 1;
521 lwp->lwp_sysabort = 0;
522 lwp->lwp_ru.nvcsw++;
523 (void) new_mstate(curthread, LMS_SLEEP);
526 static kthread_t *
527 lwpsobj_pi_owner(upimutex_t *up)
529 return (up->upi_owner);
532 static struct upimutex *
533 upi_get(upib_t *upibp, lwpchan_t *lcp)
535 struct upimutex *upip;
537 for (upip = upibp->upib_first; upip != NULL;
538 upip = upip->upi_nextchain) {
539 if (upip->upi_lwpchan.lc_wchan0 == lcp->lc_wchan0 &&
540 upip->upi_lwpchan.lc_wchan == lcp->lc_wchan)
541 break;
543 return (upip);
546 static void
547 upi_chain_add(upib_t *upibp, struct upimutex *upimutex)
549 ASSERT(MUTEX_HELD(&upibp->upib_lock));
552 * Insert upimutex at front of list. Maybe a bit unfair
553 * but assume that not many lwpchans hash to the same
554 * upimutextab bucket, i.e. the list of upimutexes from
555 * upib_first is not too long.
557 upimutex->upi_nextchain = upibp->upib_first;
558 upibp->upib_first = upimutex;
561 static void
562 upi_chain_del(upib_t *upibp, struct upimutex *upimutex)
564 struct upimutex **prev;
566 ASSERT(MUTEX_HELD(&upibp->upib_lock));
568 prev = &upibp->upib_first;
569 while (*prev != upimutex) {
570 prev = &(*prev)->upi_nextchain;
572 *prev = upimutex->upi_nextchain;
573 upimutex->upi_nextchain = NULL;
577 * Add upimutex to chain of upimutexes held by curthread.
578 * Returns number of upimutexes held by curthread.
580 static uint32_t
581 upi_mylist_add(struct upimutex *upimutex)
583 kthread_t *t = curthread;
586 * Insert upimutex at front of list of upimutexes owned by t. This
587 * would match typical LIFO order in which nested locks are acquired
588 * and released.
590 upimutex->upi_nextowned = t->t_upimutex;
591 t->t_upimutex = upimutex;
592 t->t_nupinest++;
593 ASSERT(t->t_nupinest > 0);
594 return (t->t_nupinest);
598 * Delete upimutex from list of upimutexes owned by curthread.
600 static void
601 upi_mylist_del(struct upimutex *upimutex)
603 kthread_t *t = curthread;
604 struct upimutex **prev;
607 * Since the order in which nested locks are acquired and released
608 * is typically LIFO, and typical nesting levels are not too deep, the
609 * following should not be expensive in the general case.
611 prev = &t->t_upimutex;
612 while (*prev != upimutex) {
613 prev = &(*prev)->upi_nextowned;
615 *prev = upimutex->upi_nextowned;
616 upimutex->upi_nextowned = NULL;
617 ASSERT(t->t_nupinest > 0);
618 t->t_nupinest--;
622 * Returns true if upimutex is owned. Should be called only when upim points
623 * to kmem which cannot disappear from underneath.
625 static int
626 upi_owned(upimutex_t *upim)
628 return (upim->upi_owner == curthread);
632 * Returns pointer to kernel object (upimutex_t *) if lp is owned.
634 static struct upimutex *
635 lwp_upimutex_owned(lwp_mutex_t *lp, uint8_t type)
637 lwpchan_t lwpchan;
638 upib_t *upibp;
639 struct upimutex *upimutex;
641 if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
642 &lwpchan, LWPCHAN_MPPOOL))
643 return (NULL);
645 upibp = &UPI_CHAIN(lwpchan);
646 mutex_enter(&upibp->upib_lock);
647 upimutex = upi_get(upibp, &lwpchan);
648 if (upimutex == NULL || upimutex->upi_owner != curthread) {
649 mutex_exit(&upibp->upib_lock);
650 return (NULL);
652 mutex_exit(&upibp->upib_lock);
653 return (upimutex);
657 * Unlocks upimutex, waking up waiters if any. upimutex kmem is freed if
658 * no lock hand-off occurs.
660 static void
661 upimutex_unlock(struct upimutex *upimutex, uint16_t flag)
663 turnstile_t *ts;
664 upib_t *upibp;
665 kthread_t *newowner;
667 upi_mylist_del(upimutex);
668 upibp = upimutex->upi_upibp;
669 mutex_enter(&upibp->upib_lock);
670 if (upimutex->upi_waiter != 0) { /* if waiters */
671 ts = turnstile_lookup(upimutex);
672 if (ts != NULL && !(flag & LOCK_NOTRECOVERABLE)) {
673 /* hand-off lock to highest prio waiter */
674 newowner = ts->ts_sleepq[TS_WRITER_Q].sq_first;
675 upimutex->upi_owner = newowner;
676 if (ts->ts_waiters == 1)
677 upimutex->upi_waiter = 0;
678 turnstile_wakeup(ts, TS_WRITER_Q, 1, newowner);
679 mutex_exit(&upibp->upib_lock);
680 return;
681 } else if (ts != NULL) {
682 /* LOCK_NOTRECOVERABLE: wakeup all */
683 turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
684 } else {
686 * Misleading w bit. Waiters might have been
687 * interrupted. No need to clear the w bit (upimutex
688 * will soon be freed). Re-calculate PI from existing
689 * waiters.
691 turnstile_exit(upimutex);
692 turnstile_pi_recalc();
696 * no waiters, or LOCK_NOTRECOVERABLE.
697 * remove from the bucket chain of upi mutexes.
698 * de-allocate kernel memory (upimutex).
700 upi_chain_del(upimutex->upi_upibp, upimutex);
701 mutex_exit(&upibp->upib_lock);
702 kmem_free(upimutex, sizeof (upimutex_t));
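/*
 * Editor's illustrative sketch (not part of this file): direct lock
 * hand-off as performed by upimutex_unlock() above. Instead of marking the
 * lock free and letting wakened threads race for it, ownership is
 * transferred to a chosen waiter while still holding the bucket lock, so
 * the lock is never observed unowned while waiters exist. The kernel hands
 * off to the highest-priority waiter; this hypothetical user-level
 * analogue simply uses FIFO order.
 */
#include <pthread.h>
#include <stddef.h>

typedef struct waiter {
	struct waiter	*next;
	pthread_cond_t	cv;
} waiter_t;

typedef struct {
	pthread_mutex_t	bucket;		/* analogue of upib_lock */
	waiter_t	*owner;		/* NULL when free */
	waiter_t	*head, *tail;	/* FIFO of blocked waiters */
} handoff_lock_t;

static void
handoff_lock(handoff_lock_t *hl, waiter_t *self)
{
	pthread_mutex_lock(&hl->bucket);
	if (hl->owner == NULL) {
		hl->owner = self;		/* uncontended: just take it */
	} else {
		pthread_cond_init(&self->cv, NULL);
		self->next = NULL;
		if (hl->tail != NULL)
			hl->tail->next = self;
		else
			hl->head = self;
		hl->tail = self;
		/* sleep until an unlocker makes us the owner */
		while (hl->owner != self)
			pthread_cond_wait(&self->cv, &hl->bucket);
	}
	pthread_mutex_unlock(&hl->bucket);
}

static void
handoff_unlock(handoff_lock_t *hl)
{
	waiter_t *next;

	pthread_mutex_lock(&hl->bucket);
	if ((next = hl->head) != NULL) {
		if ((hl->head = next->next) == NULL)
			hl->tail = NULL;
		hl->owner = next;		/* hand-off: never goes free */
		pthread_cond_signal(&next->cv);
	} else {
		hl->owner = NULL;
	}
	pthread_mutex_unlock(&hl->bucket);
}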
705 static int
706 lwp_upimutex_lock(lwp_mutex_t *lp, uint8_t type, int try, lwp_timer_t *lwptp)
708 label_t ljb;
709 int error = 0;
710 lwpchan_t lwpchan;
711 uint16_t flag;
712 upib_t *upibp;
713 volatile struct upimutex *upimutex = NULL;
714 turnstile_t *ts;
715 uint32_t nupinest;
716 volatile int upilocked = 0;
718 if (on_fault(&ljb)) {
719 if (upilocked)
720 upimutex_unlock((upimutex_t *)upimutex, 0);
721 error = EFAULT;
722 goto out;
724 if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
725 &lwpchan, LWPCHAN_MPPOOL)) {
726 error = EFAULT;
727 goto out;
729 upibp = &UPI_CHAIN(lwpchan);
730 retry:
731 mutex_enter(&upibp->upib_lock);
732 upimutex = upi_get(upibp, &lwpchan);
733 if (upimutex == NULL) {
734 /* lock available since lwpchan has no upimutex */
735 upimutex = kmem_zalloc(sizeof (upimutex_t), KM_SLEEP);
736 upi_chain_add(upibp, (upimutex_t *)upimutex);
737 upimutex->upi_owner = curthread; /* grab lock */
738 upimutex->upi_upibp = upibp;
739 upimutex->upi_vaddr = lp;
740 upimutex->upi_lwpchan = lwpchan;
741 mutex_exit(&upibp->upib_lock);
742 nupinest = upi_mylist_add((upimutex_t *)upimutex);
743 upilocked = 1;
744 fuword16_noerr(&lp->mutex_flag, &flag);
745 if (nupinest > maxnestupimx &&
746 secpolicy_resource(CRED()) != 0) {
747 upimutex_unlock((upimutex_t *)upimutex, flag);
748 error = ENOMEM;
749 goto out;
751 if (flag & LOCK_NOTRECOVERABLE) {
753 * Since the setting of LOCK_NOTRECOVERABLE
754 * was done under the high-level upi mutex,
755 * in lwp_upimutex_unlock(), this flag needs to
756 * be checked while holding the upi mutex.
757 * If set, this thread should return without
758 * the lock held, and with the right error code.
760 upimutex_unlock((upimutex_t *)upimutex, flag);
761 upilocked = 0;
762 error = ENOTRECOVERABLE;
763 } else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
764 if (flag & LOCK_OWNERDEAD)
765 error = EOWNERDEAD;
766 else if (type & USYNC_PROCESS_ROBUST)
767 error = ELOCKUNMAPPED;
768 else
769 error = EOWNERDEAD;
771 goto out;
774 * If a upimutex object exists, it must have an owner.
775 * This is due to lock hand-off, and to the release of the upimutex when no
776 * waiters are present at unlock time.
778 ASSERT(upimutex->upi_owner != NULL);
779 if (upimutex->upi_owner == curthread) {
781 * The user wrapper can check if the mutex type is
782 * ERRORCHECK: if not, it should stall at user-level.
783 * If so, it should return the error code.
785 mutex_exit(&upibp->upib_lock);
786 error = EDEADLK;
787 goto out;
789 if (try == UPIMUTEX_TRY) {
790 mutex_exit(&upibp->upib_lock);
791 error = EBUSY;
792 goto out;
795 * Block for the lock.
797 if ((error = lwptp->lwpt_time_error) != 0) {
799 * The SUSV3 Posix spec is very clear that we
800 * should get no error from validating the
801 * timer until we would actually sleep.
803 mutex_exit(&upibp->upib_lock);
804 goto out;
806 if (lwptp->lwpt_tsp != NULL) {
808 * Unlike the protocol for other lwp timedwait operations,
809 * we must drop t_delay_lock before going to sleep in
810 * turnstile_block() for a upi mutex.
811 * See the comments below and in turnstile.c
813 mutex_enter(&curthread->t_delay_lock);
814 (void) lwp_timer_enqueue(lwptp);
815 mutex_exit(&curthread->t_delay_lock);
818 * Now, set the waiter bit and block for the lock in turnstile_block().
819 * No need to preserve the previous wbit since a lock try is not
820 * attempted after setting the wait bit. Wait bit is set under
821 * the upib_lock, which is not released until the turnstile lock
822 * is acquired. Say, the upimutex is L:
824 * 1. upib_lock is held so the waiter does not have to retry L after
825 * setting the wait bit: since the owner has to grab the upib_lock
826 * to unlock L, it will certainly see the wait bit set.
827 * 2. upib_lock is not released until the turnstile lock is acquired.
828 * This is the key to preventing a missed wake-up. Otherwise, the
829 * owner could acquire the upib_lock, and the tc_lock, to call
830 * turnstile_wakeup(). All this, before the waiter gets tc_lock
831 * to sleep in turnstile_block(). turnstile_wakeup() will then not
832 * find this waiter, resulting in the missed wakeup.
833 * 3. The upib_lock, being a kernel mutex, cannot be released while
834 * holding the tc_lock (since mutex_exit() could need to acquire
835 * the same tc_lock)...and so is held when calling turnstile_block().
836 * The address of upib_lock is passed to turnstile_block() which
837 * releases it after releasing all turnstile locks, and before going
838 * to sleep in swtch().
839 * 4. The waiter value cannot be a count of waiters, because a waiter
840 * can be interrupted. The interrupt occurs under the tc_lock, at
841 * which point, the upib_lock cannot be locked, to decrement waiter
842 * count. So, just treat the waiter state as a bit, not a count.
844 ts = turnstile_lookup((upimutex_t *)upimutex);
845 upimutex->upi_waiter = 1;
846 error = turnstile_block(ts, TS_WRITER_Q, (upimutex_t *)upimutex,
847 &lwp_sobj_pi_ops, &upibp->upib_lock, lwptp);
849 * Hand-off implies that we wake up holding the lock, except when:
850 * - deadlock is detected
851 * - lock is not recoverable
852 * - we got an interrupt or timeout
853 * If we wake up due to an interrupt or timeout, we may
854 * or may not be holding the lock due to mutex hand-off.
855 * Use lwp_upimutex_owned() to check if we do hold the lock.
857 if (error != 0) {
858 if ((error == EINTR || error == ETIME) &&
859 (upimutex = lwp_upimutex_owned(lp, type))) {
861 * Unlock and return - the re-startable syscall will
862 * try the lock again if we got EINTR.
864 (void) upi_mylist_add((upimutex_t *)upimutex);
865 upimutex_unlock((upimutex_t *)upimutex, 0);
868 * The only other possible error is EDEADLK. If so, upimutex
869 * is valid, since its owner is deadlocked with curthread.
871 ASSERT(error == EINTR || error == ETIME ||
872 (error == EDEADLK && !upi_owned((upimutex_t *)upimutex)));
873 ASSERT(!lwp_upimutex_owned(lp, type));
874 goto out;
876 if (lwp_upimutex_owned(lp, type)) {
877 ASSERT(lwp_upimutex_owned(lp, type) == upimutex);
878 nupinest = upi_mylist_add((upimutex_t *)upimutex);
879 upilocked = 1;
882 * Now, need to read the user-level lp->mutex_flag to do the following:
884 * - if lock is held, check if EOWNERDEAD or ELOCKUNMAPPED
885 * should be returned.
886 * - if lock isn't held, check if ENOTRECOVERABLE should
887 * be returned.
889 * Now, either lp->mutex_flag is readable or it's not. If not
890 * readable, the on_fault path will cause a return with EFAULT
891 * as it should. If it is readable, the state of the flag
892 * encodes the robustness state of the lock:
894 * If the upimutex is locked here, the flag's LOCK_OWNERDEAD
895 * or LOCK_UNMAPPED setting will influence the return code
896 * appropriately. If the upimutex is not locked here, this
897 * could be due to a spurious wake-up or a NOTRECOVERABLE
898 * event. The flag's setting can be used to distinguish
899 * between these two events.
901 fuword16_noerr(&lp->mutex_flag, &flag);
902 if (upilocked) {
904 * If the thread wakes up from turnstile_block with the lock
905 * held, the flag could not be set to LOCK_NOTRECOVERABLE,
906 * since it would not have been handed-off the lock.
907 * So, no need to check for this case.
909 if (nupinest > maxnestupimx &&
910 secpolicy_resource(CRED()) != 0) {
911 upimutex_unlock((upimutex_t *)upimutex, flag);
912 upilocked = 0;
913 error = ENOMEM;
914 } else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
915 if (flag & LOCK_OWNERDEAD)
916 error = EOWNERDEAD;
917 else if (type & USYNC_PROCESS_ROBUST)
918 error = ELOCKUNMAPPED;
919 else
920 error = EOWNERDEAD;
922 } else {
924 * Wake-up without the upimutex held. Either this is a
925 * spurious wake-up (due to signals, forkall(), whatever), or
926 * it is a LOCK_NOTRECOVERABLE robustness event. The setting
927 * of the mutex flag can be used to distinguish between the
928 * two events.
930 if (flag & LOCK_NOTRECOVERABLE) {
931 error = ENOTRECOVERABLE;
932 } else {
934 * Here, the flag could be set to LOCK_OWNERDEAD or
935 * not. In both cases, this is a spurious wakeup,
936 * since the upi lock is not held, but the thread
937 * has returned from turnstile_block().
939 * The user flag could be LOCK_OWNERDEAD if, at the
940 * same time as curthread having been woken up
941 * spuriously, the owner (say Tdead) has died, marked
942 * the mutex flag accordingly, and handed off the lock
943 * to some other waiter (say Tnew). curthread just
944 * happened to read the flag while Tnew has yet to deal
945 * with the owner-dead event.
947 * In this event, curthread should retry the lock.
948 * If Tnew is able to clean up the lock, curthread
949 * will eventually get the lock with a zero error code.
950 * If Tnew is unable to clean up, its eventual call to
951 * unlock the lock will result in the mutex flag being
952 * set to LOCK_NOTRECOVERABLE, and the wake-up of
953 * all waiters, including curthread, which will then
954 * eventually return ENOTRECOVERABLE due to the above
955 * check.
957 * Of course, if the user-flag is not set with
958 * LOCK_OWNERDEAD, retrying is the thing to do, since
959 * this is definitely a spurious wakeup.
961 goto retry;
965 out:
966 no_fault();
967 return (error);
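/*
 * Editor's illustrative sketch (not part of this file): the missed-wakeup
 * rule described in the block comment above. The lock protecting the
 * waiter state must not be dropped until the sleeper is actually queued;
 * turnstile_block() achieves this by being handed upib_lock and releasing
 * it only after the thread is on the turnstile. The closest user-level
 * analogue is pthread_cond_wait(), which atomically queues the caller and
 * releases the mutex. All names below are hypothetical.
 */
#include <pthread.h>
#include <stdbool.h>

typedef struct {
	pthread_mutex_t	lock;		/* analogue of upib_lock */
	pthread_cond_t	cv;		/* analogue of the turnstile */
	bool		available;
} gate_t;

static void
gate_wait(gate_t *g)
{
	pthread_mutex_lock(&g->lock);
	while (!g->available) {
		/*
		 * The broken variant would be:
		 *	pthread_mutex_unlock(&g->lock);
		 *	... gate_post() can run entirely in this window,
		 *	    signalling no one ...
		 *	sleep-until-signalled();
		 * pthread_cond_wait() closes that window by releasing
		 * the mutex only once the caller is queued on the cv.
		 */
		pthread_cond_wait(&g->cv, &g->lock);
	}
	g->available = false;
	pthread_mutex_unlock(&g->lock);
}

static void
gate_post(gate_t *g)
{
	pthread_mutex_lock(&g->lock);
	g->available = true;
	pthread_cond_signal(&g->cv);
	pthread_mutex_unlock(&g->lock);
}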
971 static int
972 lwp_upimutex_unlock(lwp_mutex_t *lp, uint8_t type)
974 label_t ljb;
975 int error = 0;
976 lwpchan_t lwpchan;
977 uint16_t flag;
978 upib_t *upibp;
979 volatile struct upimutex *upimutex = NULL;
980 volatile int upilocked = 0;
982 if (on_fault(&ljb)) {
983 if (upilocked)
984 upimutex_unlock((upimutex_t *)upimutex, 0);
985 error = EFAULT;
986 goto out;
988 if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
989 &lwpchan, LWPCHAN_MPPOOL)) {
990 error = EFAULT;
991 goto out;
993 upibp = &UPI_CHAIN(lwpchan);
994 mutex_enter(&upibp->upib_lock);
995 upimutex = upi_get(upibp, &lwpchan);
997 * If the lock is not held, or the owner is not curthread, return
998 * error. The user-level wrapper can return this error or stall,
999 * depending on whether mutex is of ERRORCHECK type or not.
1001 if (upimutex == NULL || upimutex->upi_owner != curthread) {
1002 mutex_exit(&upibp->upib_lock);
1003 error = EPERM;
1004 goto out;
1006 mutex_exit(&upibp->upib_lock); /* release for user memory access */
1007 upilocked = 1;
1008 fuword16_noerr(&lp->mutex_flag, &flag);
1009 if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1011 * transition mutex to the LOCK_NOTRECOVERABLE state.
1013 flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
1014 flag |= LOCK_NOTRECOVERABLE;
1015 suword16_noerr(&lp->mutex_flag, flag);
1017 set_owner_pid(lp, 0, 0);
1018 upimutex_unlock((upimutex_t *)upimutex, flag);
1019 upilocked = 0;
1020 out:
1021 no_fault();
1022 return (error);
1026 * Set the owner and ownerpid fields of a user-level mutex. Note that this
1027 * function uses the suword*_noerr routines, which must be called between
1028 * on_fault()/no_fault(). This routine does not do the on_fault()/no_fault()
1029 * itself; it is assumed that all callers do so instead.
1031 static void
1032 set_owner_pid(lwp_mutex_t *lp, uintptr_t owner, pid_t pid)
1034 union {
1035 uint64_t word64;
1036 uint32_t word32[2];
1037 } un;
1039 un.word64 = (uint64_t)owner;
1041 suword32_noerr(&lp->mutex_ownerpid, pid);
1042 #if defined(_LP64)
1043 if (((uintptr_t)lp & (_LONG_LONG_ALIGNMENT - 1)) == 0) { /* aligned */
1044 suword64_noerr(&lp->mutex_owner, un.word64);
1045 return;
1047 #endif
1048 /* mutex is unaligned or we are running on a 32-bit kernel */
1049 suword32_noerr((uint32_t *)&lp->mutex_owner, un.word32[0]);
1050 suword32_noerr((uint32_t *)&lp->mutex_owner + 1, un.word32[1]);
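/*
 * Editor's illustrative sketch (not part of this file): the union trick
 * used by set_owner_pid() above, as a hypothetical user-level helper. A
 * 64-bit value is written with a single store when the destination is
 * naturally aligned, and as two 32-bit halves otherwise. Like the kernel
 * code, the split relies on the union's native byte order, so the halves
 * land exactly where the single 64-bit store would have put them.
 */
#include <stdint.h>

static void
store_owner64(void *dst, uint64_t owner)
{
	union {
		uint64_t	word64;
		uint32_t	word32[2];
	} un;

	un.word64 = owner;
	if (((uintptr_t)dst & (sizeof (uint64_t) - 1)) == 0) {
		/* naturally aligned: one 64-bit store */
		*(volatile uint64_t *)dst = un.word64;
	} else {
		/* unaligned target: two 32-bit stores */
		((volatile uint32_t *)dst)[0] = un.word32[0];
		((volatile uint32_t *)dst)[1] = un.word32[1];
	}
}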
1054 * Clear the contents of a user-level mutex; return the flags.
1055 * Used only by upi_dead() and lwp_mutex_cleanup(), below.
1057 static uint16_t
1058 lwp_clear_mutex(lwp_mutex_t *lp, uint16_t lockflg)
1060 uint16_t flag;
1062 fuword16_noerr(&lp->mutex_flag, &flag);
1063 if ((flag &
1064 (LOCK_OWNERDEAD | LOCK_UNMAPPED | LOCK_NOTRECOVERABLE)) == 0) {
1065 flag |= lockflg;
1066 suword16_noerr(&lp->mutex_flag, flag);
1068 set_owner_pid(lp, 0, 0);
1069 suword8_noerr(&lp->mutex_rcount, 0);
1071 return (flag);
1075 * Mark user mutex state, corresponding to kernel upimutex,
1076 * as LOCK_UNMAPPED or LOCK_OWNERDEAD, as appropriate
1078 static int
1079 upi_dead(upimutex_t *upip, uint16_t lockflg)
1081 label_t ljb;
1082 int error = 0;
1083 lwp_mutex_t *lp;
1085 if (on_fault(&ljb)) {
1086 error = EFAULT;
1087 goto out;
1090 lp = upip->upi_vaddr;
1091 (void) lwp_clear_mutex(lp, lockflg);
1092 suword8_noerr(&lp->mutex_lockw, 0);
1093 out:
1094 no_fault();
1095 return (error);
1099 * Unlock all upimutexes held by curthread, since curthread is dying.
1100 * For each upimutex, attempt to mark its corresponding user mutex object as
1101 * dead.
1103 void
1104 upimutex_cleanup()
1106 kthread_t *t = curthread;
1107 uint16_t lockflg = (ttoproc(t)->p_proc_flag & P_PR_EXEC)?
1108 LOCK_UNMAPPED : LOCK_OWNERDEAD;
1109 struct upimutex *upip;
1111 while ((upip = t->t_upimutex) != NULL) {
1112 if (upi_dead(upip, lockflg) != 0) {
1114 * If the user object associated with this upimutex is
1115 * unmapped, unlock upimutex with the
1116 * LOCK_NOTRECOVERABLE flag, so that all waiters are
1117 * woken up. Since user object is unmapped, it could
1118 * not be marked as dead or notrecoverable.
1119 * The waiters will now all wake up and return
1120 * ENOTRECOVERABLE, since they would find that the lock
1121 * has not been handed-off to them.
1122 * See lwp_upimutex_lock().
1124 upimutex_unlock(upip, LOCK_NOTRECOVERABLE);
1125 } else {
1127 * The user object has been updated as dead.
1128 * Unlock the upimutex: if no waiters, upip kmem will
1129 * be freed. If there is a waiter, the lock will be
1130 * handed off. If exit() is in progress, each existing
1131 * waiter will successively get the lock, as owners
1132 * die, and each new owner will call this routine as
1133 * it dies. The last owner will free kmem, since
1134 * it will find the upimutex has no waiters. So,
1135 * eventually, the kmem is guaranteed to be freed.
1137 upimutex_unlock(upip, 0);
1140 * Note that the call to upimutex_unlock() above will delete
1141 * upimutex from the t_upimutexes chain. And so the
1142 * while loop will eventually terminate.
1148 lwp_mutex_timedlock(lwp_mutex_t *lp, timespec_t *tsp, uintptr_t owner)
1150 kthread_t *t = curthread;
1151 klwp_t *lwp = ttolwp(t);
1152 proc_t *p = ttoproc(t);
1153 lwp_timer_t lwpt;
1154 caddr_t timedwait;
1155 int error = 0;
1156 int time_error;
1157 clock_t tim = -1;
1158 uchar_t waiters;
1159 volatile int locked = 0;
1160 volatile int watched = 0;
1161 label_t ljb;
1162 volatile uint8_t type = 0;
1163 lwpchan_t lwpchan;
1164 sleepq_head_t *sqh;
1165 uint16_t flag;
1166 int imm_timeout = 0;
1168 if ((caddr_t)lp >= p->p_as->a_userlimit)
1169 return (set_errno(EFAULT));
1172 * Put the lwp in an orderly state for debugging,
1173 * in case we are stopped while sleeping, below.
1175 prstop(PR_REQUESTED, 0);
1177 timedwait = (caddr_t)tsp;
1178 if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
1179 lwpt.lwpt_imm_timeout) {
1180 imm_timeout = 1;
1181 timedwait = NULL;
1185 * Although LMS_USER_LOCK implies "asleep waiting for user-mode lock",
1186 * this micro state is really a run state. If the thread indeed blocks,
1187 * this state becomes valid. If not, the state is converted back to
1188 * LMS_SYSTEM. So, it is OK to set the mstate here, instead of just
1189 * when blocking.
1191 (void) new_mstate(t, LMS_USER_LOCK);
1192 if (on_fault(&ljb)) {
1193 if (locked)
1194 lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1195 error = EFAULT;
1196 goto out;
1199 * Force Copy-on-write if necessary and ensure that the
1200 * synchronization object resides in read/write memory.
1201 * Cause an EFAULT return now if this is not so.
1203 fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1204 suword8_noerr(&lp->mutex_type, type);
1205 if (UPIMUTEX(type)) {
1206 no_fault();
1207 error = lwp_upimutex_lock(lp, type, UPIMUTEX_BLOCK, &lwpt);
1208 if (error == 0 || error == EOWNERDEAD ||
1209 error == ELOCKUNMAPPED) {
1210 volatile int locked = error != 0;
1211 if (on_fault(&ljb)) {
1212 if (locked != 0)
1213 error = lwp_upimutex_unlock(lp, type);
1214 else
1215 error = EFAULT;
1216 goto upierr;
1218 set_owner_pid(lp, owner,
1219 (type & USYNC_PROCESS)? p->p_pid : 0);
1220 no_fault();
1222 upierr:
1223 if (tsp && !time_error) /* copyout the residual time left */
1224 error = lwp_timer_copyout(&lwpt, error);
1225 if (error)
1226 return (set_errno(error));
1227 return (0);
1229 if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1230 &lwpchan, LWPCHAN_MPPOOL)) {
1231 error = EFAULT;
1232 goto out;
1234 lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1235 locked = 1;
1236 if (type & LOCK_ROBUST) {
1237 fuword16_noerr(&lp->mutex_flag, &flag);
1238 if (flag & LOCK_NOTRECOVERABLE) {
1239 lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1240 error = ENOTRECOVERABLE;
1241 goto out;
1244 fuword8_noerr(&lp->mutex_waiters, &waiters);
1245 suword8_noerr(&lp->mutex_waiters, 1);
1248 * If watchpoints are set, they need to be restored, since
1249 * atomic accesses of memory such as the call to ulock_try()
1250 * below cannot be watched.
1253 watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1255 while (!ulock_try(&lp->mutex_lockw)) {
1256 if (time_error) {
1258 * The SUSV3 Posix spec is very clear that we
1259 * should get no error from validating the
1260 * timer until we would actually sleep.
1262 error = time_error;
1263 break;
1266 if (watched) {
1267 watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1268 watched = 0;
1271 if (timedwait) {
1273 * If we successfully queue the timeout,
1274 * then don't drop t_delay_lock until
1275 * we are on the sleep queue (below).
1277 mutex_enter(&t->t_delay_lock);
1278 if (lwp_timer_enqueue(&lwpt) != 0) {
1279 mutex_exit(&t->t_delay_lock);
1280 imm_timeout = 1;
1281 timedwait = NULL;
1284 lwp_block(&lwpchan);
1286 * Nothing should happen to cause the lwp to go to
1287 * sleep again until after it returns from swtch().
1289 if (timedwait)
1290 mutex_exit(&t->t_delay_lock);
1291 locked = 0;
1292 lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1293 if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
1294 setrun(t);
1295 swtch();
1296 t->t_flag &= ~T_WAKEABLE;
1297 if (timedwait)
1298 tim = lwp_timer_dequeue(&lwpt);
1299 setallwatch();
1300 if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
1301 error = EINTR;
1302 else if (imm_timeout || (timedwait && tim == -1))
1303 error = ETIME;
1304 if (error) {
1305 lwp->lwp_asleep = 0;
1306 lwp->lwp_sysabort = 0;
1307 watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1308 S_WRITE);
1311 * Need to re-compute waiters bit. The waiters field in
1312 * the lock is not reliable. Either of two things could
1313 * have occurred: no lwp may have called lwp_release()
1314 * for me but I have woken up due to a signal or
1315 * timeout. In this case, the waiter bit is incorrect
1316 * since it is still set to 1, set above.
1317 * OR an lwp_release() did occur for some other lwp on
1318 * the same lwpchan. In this case, the waiter bit is
1319 * correct. One can't tell which event occurred,
1320 * so recompute.
1322 lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1323 locked = 1;
1324 sqh = lwpsqhash(&lwpchan);
1325 disp_lock_enter(&sqh->sq_lock);
1326 waiters = iswanted(sqh->sq_queue.sq_first, &lwpchan);
1327 disp_lock_exit(&sqh->sq_lock);
1328 break;
1330 lwp->lwp_asleep = 0;
1331 watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1332 S_WRITE);
1333 lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1334 locked = 1;
1335 fuword8_noerr(&lp->mutex_waiters, &waiters);
1336 suword8_noerr(&lp->mutex_waiters, 1);
1337 if (type & LOCK_ROBUST) {
1338 fuword16_noerr(&lp->mutex_flag, &flag);
1339 if (flag & LOCK_NOTRECOVERABLE) {
1340 error = ENOTRECOVERABLE;
1341 break;
1346 if (t->t_mstate == LMS_USER_LOCK)
1347 (void) new_mstate(t, LMS_SYSTEM);
1349 if (error == 0) {
1350 set_owner_pid(lp, owner, (type & USYNC_PROCESS)? p->p_pid : 0);
1351 if (type & LOCK_ROBUST) {
1352 fuword16_noerr(&lp->mutex_flag, &flag);
1353 if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1354 if (flag & LOCK_OWNERDEAD)
1355 error = EOWNERDEAD;
1356 else if (type & USYNC_PROCESS_ROBUST)
1357 error = ELOCKUNMAPPED;
1358 else
1359 error = EOWNERDEAD;
1363 suword8_noerr(&lp->mutex_waiters, waiters);
1364 locked = 0;
1365 lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1366 out:
1367 no_fault();
1368 if (watched)
1369 watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1370 if (tsp && !time_error) /* copyout the residual time left */
1371 error = lwp_timer_copyout(&lwpt, error);
1372 if (error)
1373 return (set_errno(error));
1374 return (0);
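/*
 * Editor's illustrative sketch (not part of this file): the shape of the
 * waiter-bit mutex that lwp_mutex_timedlock() implements. The waiter byte
 * is only advisory: it is set before blocking and recomputed from the
 * sleep queue after every wakeup, because a signal or timeout can leave it
 * stale. The sketch below is a hypothetical user-level approximation in
 * which lwp_block()/lwp_release() are replaced by a simple yield loop, so
 * only the state machine is illustrated.
 */
#include <stdatomic.h>
#include <sched.h>

typedef struct {
	atomic_uchar	lockw;		/* 0 = unlocked, 1 = locked */
	atomic_uchar	waiters;	/* advisory waiter indication */
} sketch_mutex_t;

static int
sketch_trylock(sketch_mutex_t *mp)
{
	unsigned char expect = 0;

	return (atomic_compare_exchange_strong(&mp->lockw, &expect, 1));
}

static void
sketch_lock(sketch_mutex_t *mp)
{
	while (!sketch_trylock(mp)) {
		/* advertise a (possible) waiter, as done before lwp_block() */
		atomic_store(&mp->waiters, 1);
		/* stand-in for lwp_block()/swtch() */
		sched_yield();
	}
	/*
	 * The kernel additionally recomputes the waiter bit here by
	 * scanning the sleep queue (see iswanted() below), since the
	 * stored value may be stale after a signal or timeout.
	 */
}

static void
sketch_unlock(sketch_mutex_t *mp)
{
	atomic_store(&mp->lockw, 0);
	if (atomic_load(&mp->waiters) != 0) {
		/* stand-in for lwp_mutex_wakeup()/lwp_release() */
		atomic_store(&mp->waiters, 0);
	}
}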
1377 static int
1378 iswanted(kthread_t *t, lwpchan_t *lwpchan)
1381 * The caller holds the dispatcher lock on the sleep queue.
1383 while (t != NULL) {
1384 if (t->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1385 t->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1386 return (1);
1387 t = t->t_link;
1389 return (0);
1393 * Return the highest priority thread sleeping on this lwpchan.
1395 static kthread_t *
1396 lwp_queue_waiter(lwpchan_t *lwpchan)
1398 sleepq_head_t *sqh;
1399 kthread_t *tp;
1401 sqh = lwpsqhash(lwpchan);
1402 disp_lock_enter(&sqh->sq_lock); /* lock the sleep queue */
1403 for (tp = sqh->sq_queue.sq_first; tp != NULL; tp = tp->t_link) {
1404 if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1405 tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1406 break;
1408 disp_lock_exit(&sqh->sq_lock);
1409 return (tp);
1412 static int
1413 lwp_release(lwpchan_t *lwpchan, uchar_t *waiters, int sync_type)
1415 sleepq_head_t *sqh;
1416 kthread_t *tp;
1417 kthread_t **tpp;
1419 sqh = lwpsqhash(lwpchan);
1420 disp_lock_enter(&sqh->sq_lock); /* lock the sleep queue */
1421 tpp = &sqh->sq_queue.sq_first;
1422 while ((tp = *tpp) != NULL) {
1423 if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1424 tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1426 * The following is typically false. It could be true
1427 * only if lwp_release() is called from
1428 * lwp_mutex_wakeup() after reading the waiters field
1429 * from memory in which the lwp lock used to be, but has
1430 * since been re-used to hold a lwp cv or lwp semaphore.
1431 * The thread "tp" found to match the lwp lock's wchan
1432 * is actually sleeping for the cv or semaphore which
1433 * now has the same wchan. In this case, lwp_release()
1434 * should return failure.
1436 if (sync_type != (tp->t_flag & T_WAITCVSEM)) {
1437 ASSERT(sync_type == 0);
1439 * assert that this can happen only for mutexes
1440 * i.e. sync_type == 0, for correctly written
1441 * user programs.
1443 disp_lock_exit(&sqh->sq_lock);
1444 return (0);
1446 *waiters = iswanted(tp->t_link, lwpchan);
1447 sleepq_unlink(tpp, tp);
1448 DTRACE_SCHED1(wakeup, kthread_t *, tp);
1449 tp->t_wchan0 = NULL;
1450 tp->t_wchan = NULL;
1451 tp->t_sobj_ops = NULL;
1452 tp->t_release = 1;
1453 THREAD_TRANSITION(tp); /* drops sleepq lock */
1454 CL_WAKEUP(tp);
1455 thread_unlock(tp); /* drop run queue lock */
1456 return (1);
1458 tpp = &tp->t_link;
1460 *waiters = 0;
1461 disp_lock_exit(&sqh->sq_lock);
1462 return (0);
1465 static void
1466 lwp_release_all(lwpchan_t *lwpchan)
1468 sleepq_head_t *sqh;
1469 kthread_t *tp;
1470 kthread_t **tpp;
1472 sqh = lwpsqhash(lwpchan);
1473 disp_lock_enter(&sqh->sq_lock); /* lock sleep q queue */
1474 tpp = &sqh->sq_queue.sq_first;
1475 while ((tp = *tpp) != NULL) {
1476 if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1477 tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1478 sleepq_unlink(tpp, tp);
1479 DTRACE_SCHED1(wakeup, kthread_t *, tp);
1480 tp->t_wchan0 = NULL;
1481 tp->t_wchan = NULL;
1482 tp->t_sobj_ops = NULL;
1483 CL_WAKEUP(tp);
1484 thread_unlock_high(tp); /* release run queue lock */
1485 } else {
1486 tpp = &tp->t_link;
1489 disp_lock_exit(&sqh->sq_lock); /* drop sleep q lock */
1493 * Unblock an lwp that is trying to acquire this mutex. The blocked
1494 * lwp resumes and retries to acquire the lock.
1497 lwp_mutex_wakeup(lwp_mutex_t *lp, int release_all)
1499 proc_t *p = ttoproc(curthread);
1500 lwpchan_t lwpchan;
1501 uchar_t waiters;
1502 volatile int locked = 0;
1503 volatile int watched = 0;
1504 volatile uint8_t type = 0;
1505 label_t ljb;
1506 int error = 0;
1508 if ((caddr_t)lp >= p->p_as->a_userlimit)
1509 return (set_errno(EFAULT));
1511 watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1513 if (on_fault(&ljb)) {
1514 if (locked)
1515 lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1516 error = EFAULT;
1517 goto out;
1520 * Force Copy-on-write if necessary and ensure that the
1521 * synchronization object resides in read/write memory.
1522 * Cause an EFAULT return now if this is not so.
1524 fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1525 suword8_noerr(&lp->mutex_type, type);
1526 if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1527 &lwpchan, LWPCHAN_MPPOOL)) {
1528 error = EFAULT;
1529 goto out;
1531 lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1532 locked = 1;
1534 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
1535 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
1536 * may fail. If it fails, do not write into the waiter bit.
1537 * The call to lwp_release() might fail due to one of three reasons:
1539 * 1. due to the thread which set the waiter bit not actually
1540 * sleeping since it got the lock on the re-try. The waiter
1541 * bit will then be correctly updated by that thread. This
1542 * window may be closed by reading the wait bit again here
1543 * and not calling lwp_release() at all if it is zero.
1544 * 2. the thread which set the waiter bit and went to sleep
1545 * was woken up by a signal. This time, the waiter recomputes
1546 * the wait bit in the return with EINTR code.
1547 * 3. the waiter bit read by lwp_mutex_wakeup() was in
1548 * memory that has been re-used after the lock was dropped.
1549 * In this case, writing into the waiter bit would cause data
1550 * corruption.
1552 if (release_all)
1553 lwp_release_all(&lwpchan);
1554 else if (lwp_release(&lwpchan, &waiters, 0))
1555 suword8_noerr(&lp->mutex_waiters, waiters);
1556 lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1557 out:
1558 no_fault();
1559 if (watched)
1560 watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1561 if (error)
1562 return (set_errno(error));
1563 return (0);
1567 * lwp_cond_wait() has four arguments, a pointer to a condition variable,
1568 * a pointer to a mutex, a pointer to a timespec for a timed wait and
1569 * a flag telling the kernel whether or not to honor the kernel/user
1570 * schedctl parking protocol (see schedctl_is_park() in schedctl.c).
1571 * The kernel puts the lwp to sleep on a unique pair of caddr_t's called an
1572 * lwpchan, returned by get_lwpchan(). If the timespec pointer is non-NULL,
1573 * it is used as an in/out parameter. On entry, it contains the relative
1574 * time until timeout. On exit, we copyout the residual time left to it.
1577 lwp_cond_wait(lwp_cond_t *cv, lwp_mutex_t *mp, timespec_t *tsp, int check_park)
1579 kthread_t *t = curthread;
1580 klwp_t *lwp = ttolwp(t);
1581 proc_t *p = ttoproc(t);
1582 lwp_timer_t lwpt;
1583 lwpchan_t cv_lwpchan;
1584 lwpchan_t m_lwpchan;
1585 caddr_t timedwait;
1586 volatile uint16_t type = 0;
1587 volatile uint8_t mtype = 0;
1588 uchar_t waiters;
1589 volatile int error;
1590 clock_t tim = -1;
1591 volatile int locked = 0;
1592 volatile int m_locked = 0;
1593 volatile int cvwatched = 0;
1594 volatile int mpwatched = 0;
1595 label_t ljb;
1596 volatile int no_lwpchan = 1;
1597 int imm_timeout = 0;
1598 int imm_unpark = 0;
1600 if ((caddr_t)cv >= p->p_as->a_userlimit ||
1601 (caddr_t)mp >= p->p_as->a_userlimit)
1602 return (set_errno(EFAULT));
1605 * Put the lwp in an orderly state for debugging,
1606 * in case we are stopped while sleeping, below.
1608 prstop(PR_REQUESTED, 0);
1610 timedwait = (caddr_t)tsp;
1611 if ((error = lwp_timer_copyin(&lwpt, tsp)) != 0)
1612 return (set_errno(error));
1613 if (lwpt.lwpt_imm_timeout) {
1614 imm_timeout = 1;
1615 timedwait = NULL;
1618 (void) new_mstate(t, LMS_USER_LOCK);
1620 if (on_fault(&ljb)) {
1621 if (no_lwpchan) {
1622 error = EFAULT;
1623 goto out;
1625 if (m_locked) {
1626 m_locked = 0;
1627 lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1629 if (locked) {
1630 locked = 0;
1631 lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1634 * set up another on_fault() for a possible fault
1635 * on the user lock accessed at "efault"
1637 if (on_fault(&ljb)) {
1638 if (m_locked) {
1639 m_locked = 0;
1640 lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1642 goto out;
1644 error = EFAULT;
1645 goto efault;
1649 * Force Copy-on-write if necessary and ensure that the
1650 * synchronization object resides in read/write memory.
1651 * Cause an EFAULT return now if this is not so.
1653 fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
1654 suword8_noerr(&mp->mutex_type, mtype);
1655 if (UPIMUTEX(mtype) == 0) {
1656 /* convert user level mutex, "mp", to a unique lwpchan */
1657 /* check if mtype is ok to use below, instead of type from cv */
1658 if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
1659 &m_lwpchan, LWPCHAN_MPPOOL)) {
1660 error = EFAULT;
1661 goto out;
1664 fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1665 suword16_noerr(&cv->cond_type, type);
1666 /* convert user level condition variable, "cv", to a unique lwpchan */
1667 if (!get_lwpchan(p->p_as, (caddr_t)cv, type,
1668 &cv_lwpchan, LWPCHAN_CVPOOL)) {
1669 error = EFAULT;
1670 goto out;
1672 no_lwpchan = 0;
1673 cvwatched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1674 if (UPIMUTEX(mtype) == 0)
1675 mpwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp),
1676 S_WRITE);
1679 * lwpchan_lock ensures that the calling lwp is put to sleep atomically
1680 * with respect to a possible wakeup which is a result of either
1681 * an lwp_cond_signal() or an lwp_cond_broadcast().
1683 * What's misleading is that the lwp is put to sleep after the
1684 * condition variable's mutex is released. This is OK as long as
1685 * the release operation is also done while holding lwpchan_lock.
1686 * The lwp is then put to sleep when the possibility of pagefaulting
1687 * or sleeping is completely eliminated.
1689 lwpchan_lock(&cv_lwpchan, LWPCHAN_CVPOOL);
1690 locked = 1;
1691 if (UPIMUTEX(mtype) == 0) {
1692 lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
1693 m_locked = 1;
1694 suword8_noerr(&cv->cond_waiters_kernel, 1);
1696 * unlock the condition variable's mutex. (pagefaults are
1697 * possible here.)
1699 set_owner_pid(mp, 0, 0);
1700 ulock_clear(&mp->mutex_lockw);
1701 fuword8_noerr(&mp->mutex_waiters, &waiters);
1702 if (waiters != 0) {
1704 * Given the locking of lwpchan_lock around the release
1705 * of the mutex and checking for waiters, the following
1706 * call to lwp_release() can fail ONLY if the lock
1707 * acquirer is interrupted after setting the waiter bit,
1708 * calling lwp_block() and releasing lwpchan_lock.
1709 * In this case, it could get pulled off the lwp sleep
1710 * q (via setrun()) before the following call to
1711 * lwp_release() occurs. In this case, the lock
1712 * requestor will update the waiter bit correctly by
1713 * re-evaluating it.
1715 if (lwp_release(&m_lwpchan, &waiters, 0))
1716 suword8_noerr(&mp->mutex_waiters, waiters);
1718 m_locked = 0;
1719 lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1720 } else {
1721 suword8_noerr(&cv->cond_waiters_kernel, 1);
1722 error = lwp_upimutex_unlock(mp, mtype);
1723 if (error) { /* if the upimutex unlock failed */
1724 locked = 0;
1725 lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1726 goto out;
1729 no_fault();
1731 if (mpwatched) {
1732 watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
1733 mpwatched = 0;
1735 if (cvwatched) {
1736 watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1737 cvwatched = 0;
1740 if (check_park && (!schedctl_is_park() || t->t_unpark)) {
1742 * We received a signal at user-level before calling here
1743 * or another thread wants us to return immediately
1744 * with EINTR. See lwp_unpark().
1746 imm_unpark = 1;
1747 t->t_unpark = 0;
1748 timedwait = NULL;
1749 } else if (timedwait) {
1751 * If we successfully queue the timeout,
1752 * then don't drop t_delay_lock until
1753 * we are on the sleep queue (below).
1755 mutex_enter(&t->t_delay_lock);
1756 if (lwp_timer_enqueue(&lwpt) != 0) {
1757 mutex_exit(&t->t_delay_lock);
1758 imm_timeout = 1;
1759 timedwait = NULL;
1762 t->t_flag |= T_WAITCVSEM;
1763 lwp_block(&cv_lwpchan);
1765 * Nothing should happen to cause the lwp to go to sleep
1766 * until after it returns from swtch().
1768 if (timedwait)
1769 mutex_exit(&t->t_delay_lock);
1770 locked = 0;
1771 lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1772 if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
1773 (imm_timeout | imm_unpark))
1774 setrun(t);
1775 swtch();
1776 t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
1777 if (timedwait)
1778 tim = lwp_timer_dequeue(&lwpt);
1779 if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
1780 MUSTRETURN(p, t) || imm_unpark)
1781 error = EINTR;
1782 else if (imm_timeout || (timedwait && tim == -1))
1783 error = ETIME;
1784 lwp->lwp_asleep = 0;
1785 lwp->lwp_sysabort = 0;
1786 setallwatch();
1788 if (t->t_mstate == LMS_USER_LOCK)
1789 (void) new_mstate(t, LMS_SYSTEM);
1791 if (tsp && check_park) /* copyout the residual time left */
1792 error = lwp_timer_copyout(&lwpt, error);
1794 /* the mutex is reacquired by the caller on return to user level */
1795 if (error) {
1797 * If we were concurrently lwp_cond_signal()d and we
1798 * received a UNIX signal or got a timeout, then perform
1799 * another lwp_cond_signal() to avoid consuming the wakeup.
1801 if (t->t_release)
1802 (void) lwp_cond_signal(cv);
1803 return (set_errno(error));
1805 return (0);
1807 efault:
1809 * make sure that the user level lock is dropped before
1810 * returning to caller, since the caller always re-acquires it.
1812 if (UPIMUTEX(mtype) == 0) {
1813 lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
1814 m_locked = 1;
1815 set_owner_pid(mp, 0, 0);
1816 ulock_clear(&mp->mutex_lockw);
1817 fuword8_noerr(&mp->mutex_waiters, &waiters);
1818 if (waiters != 0) {
1820 * See comment above on lock clearing and lwp_release()
1821 * success/failure.
1823 if (lwp_release(&m_lwpchan, &waiters, 0))
1824 suword8_noerr(&mp->mutex_waiters, waiters);
1826 m_locked = 0;
1827 lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1828 } else {
1829 (void) lwp_upimutex_unlock(mp, mtype);
1831 out:
1832 no_fault();
1833 if (mpwatched)
1834 watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
1835 if (cvwatched)
1836 watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1837 if (t->t_mstate == LMS_USER_LOCK)
1838 (void) new_mstate(t, LMS_SYSTEM);
1839 return (set_errno(error));
1843 * Wake up one lwp that's blocked on this condition variable.
1846 lwp_cond_signal(lwp_cond_t *cv)
1848 proc_t *p = ttoproc(curthread);
1849 lwpchan_t lwpchan;
1850 uchar_t waiters;
1851 volatile uint16_t type = 0;
1852 volatile int locked = 0;
1853 volatile int watched = 0;
1854 label_t ljb;
1855 int error = 0;
1857 if ((caddr_t)cv >= p->p_as->a_userlimit)
1858 return (set_errno(EFAULT));
1860 watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1862 if (on_fault(&ljb)) {
1863 if (locked)
1864 lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1865 error = EFAULT;
1866 goto out;
1869 * Force Copy-on-write if necessary and ensure that the
1870 * synchronization object resides in read/write memory.
1871 * Cause an EFAULT return now if this is not so.
1873 fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1874 suword16_noerr(&cv->cond_type, type);
1875 if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
1876 &lwpchan, LWPCHAN_CVPOOL)) {
1877 error = EFAULT;
1878 goto out;
1880 lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1881 locked = 1;
1882 fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
1883 if (waiters != 0) {
1885 * The following call to lwp_release() might fail but it is
1886 * OK to write into the waiters bit below, since the memory
1887 * could not have been re-used or unmapped (for correctly
1888 * written user programs) as in the case of lwp_mutex_wakeup().
1889 * For an incorrect program, we should not care about data
1890 * corruption since this is just one instance of other places
1891 * where corruption can occur for such a program. Of course
1892 * if the memory is unmapped, normal fault recovery occurs.
1894 (void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
1895 suword8_noerr(&cv->cond_waiters_kernel, waiters);
1897 lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1898 out:
1899 no_fault();
1900 if (watched)
1901 watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1902 if (error)
1903 return (set_errno(error));
1904 return (0);
1908  * Wake up every lwp that's blocked on this condition variable.
1910 int
1911 lwp_cond_broadcast(lwp_cond_t *cv)
1913 proc_t *p = ttoproc(curthread);
1914 lwpchan_t lwpchan;
1915 volatile uint16_t type = 0;
1916 volatile int locked = 0;
1917 volatile int watched = 0;
1918 label_t ljb;
1919 uchar_t waiters;
1920 int error = 0;
1922 if ((caddr_t)cv >= p->p_as->a_userlimit)
1923 return (set_errno(EFAULT));
1925 watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1927 if (on_fault(&ljb)) {
1928 if (locked)
1929 lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1930 error = EFAULT;
1931 goto out;
1934 * Force Copy-on-write if necessary and ensure that the
1935 * synchronization object resides in read/write memory.
1936 * Cause an EFAULT return now if this is not so.
1938 fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1939 suword16_noerr(&cv->cond_type, type);
1940 if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
1941 &lwpchan, LWPCHAN_CVPOOL)) {
1942 error = EFAULT;
1943 goto out;
1945 lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1946 locked = 1;
1947 fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
1948 if (waiters != 0) {
1949 lwp_release_all(&lwpchan);
1950 suword8_noerr(&cv->cond_waiters_kernel, 0);
1952 lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1953 out:
1954 no_fault();
1955 if (watched)
1956 watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1957 if (error)
1958 return (set_errno(error));
1959 return (0);
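/*
 * Wakeup handshake shared by lwp_cond_signal() and lwp_cond_broadcast(),
 * as implemented above: cond_waiters_kernel is only a hint that sleepers
 * may exist on the lwpchan.  signal uses lwp_release() to wake a single
 * T_WAITCVSEM sleeper and writes the recomputed waiters byte back out;
 * broadcast uses lwp_release_all() and simply clears the byte to zero.
 * A stale non-zero hint is harmless: releasing when nobody is queued
 * does no damage, as the comments above explain.
 */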
1962 int
1963 lwp_sema_trywait(lwp_sema_t *sp)
1965 kthread_t *t = curthread;
1966 proc_t *p = ttoproc(t);
1967 label_t ljb;
1968 volatile int locked = 0;
1969 volatile int watched = 0;
1970 volatile uint16_t type = 0;
1971 int count;
1972 lwpchan_t lwpchan;
1973 uchar_t waiters;
1974 int error = 0;
1976 if ((caddr_t)sp >= p->p_as->a_userlimit)
1977 return (set_errno(EFAULT));
1979 watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
1981 if (on_fault(&ljb)) {
1982 if (locked)
1983 lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1984 error = EFAULT;
1985 goto out;
1988 * Force Copy-on-write if necessary and ensure that the
1989 * synchronization object resides in read/write memory.
1990 * Cause an EFAULT return now if this is not so.
1992 fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
1993 suword16_noerr((void *)&sp->sema_type, type);
1994 if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
1995 &lwpchan, LWPCHAN_CVPOOL)) {
1996 error = EFAULT;
1997 goto out;
1999 lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2000 locked = 1;
2001 fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
2002 if (count == 0)
2003 error = EBUSY;
2004 else
2005 suword32_noerr((void *)&sp->sema_count, --count);
2006 if (count != 0) {
2007 fuword8_noerr(&sp->sema_waiters, &waiters);
2008 if (waiters != 0) {
2009 (void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2010 suword8_noerr(&sp->sema_waiters, waiters);
2013 lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2014 out:
2015 no_fault();
2016 if (watched)
2017 watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2018 if (error)
2019 return (set_errno(error));
2020 return (0);
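/*
 * Note on lwp_sema_trywait() above: when the decremented count is still
 * non-zero, one waiter (if any) is also released, since another unit of
 * the semaphore remains available.  This is safe because a released
 * waiter re-checks sema_count under the lwpchan lock in
 * lwp_sema_timedwait() and goes back to sleep if the count has already
 * been consumed.
 */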
2024 * See lwp_cond_wait(), above, for an explanation of the 'check_park' argument.
2026 int
2027 lwp_sema_timedwait(lwp_sema_t *sp, timespec_t *tsp, int check_park)
2029 kthread_t *t = curthread;
2030 klwp_t *lwp = ttolwp(t);
2031 proc_t *p = ttoproc(t);
2032 lwp_timer_t lwpt;
2033 caddr_t timedwait;
2034 clock_t tim = -1;
2035 label_t ljb;
2036 volatile int locked = 0;
2037 volatile int watched = 0;
2038 volatile uint16_t type = 0;
2039 int count;
2040 lwpchan_t lwpchan;
2041 uchar_t waiters;
2042 int error = 0;
2043 int time_error;
2044 int imm_timeout = 0;
2045 int imm_unpark = 0;
2047 if ((caddr_t)sp >= p->p_as->a_userlimit)
2048 return (set_errno(EFAULT));
2051 * Put the lwp in an orderly state for debugging,
2052 * in case we are stopped while sleeping, below.
2054 prstop(PR_REQUESTED, 0);
2056 timedwait = (caddr_t)tsp;
2057 if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
2058 lwpt.lwpt_imm_timeout) {
2059 imm_timeout = 1;
2060 timedwait = NULL;
2063 watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2065 if (on_fault(&ljb)) {
2066 if (locked)
2067 lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2068 error = EFAULT;
2069 goto out;
2072 * Force Copy-on-write if necessary and ensure that the
2073 * synchronization object resides in read/write memory.
2074 * Cause an EFAULT return now if this is not so.
2076 fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
2077 suword16_noerr((void *)&sp->sema_type, type);
2078 if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
2079 &lwpchan, LWPCHAN_CVPOOL)) {
2080 error = EFAULT;
2081 goto out;
2083 lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2084 locked = 1;
2085 fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
2086 while (error == 0 && count == 0) {
2087 if (time_error) {
2089  * The SUSV3 POSIX spec is very clear that we
2090 * should get no error from validating the
2091 * timer until we would actually sleep.
2093 error = time_error;
2094 break;
2096 suword8_noerr(&sp->sema_waiters, 1);
2097 if (watched)
2098 watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2099 if (check_park && (!schedctl_is_park() || t->t_unpark)) {
2101 * We received a signal at user-level before calling
2102 * here or another thread wants us to return
2103 * immediately with EINTR. See lwp_unpark().
2105 imm_unpark = 1;
2106 t->t_unpark = 0;
2107 timedwait = NULL;
2108 } else if (timedwait) {
2110 * If we successfully queue the timeout,
2111 * then don't drop t_delay_lock until
2112 * we are on the sleep queue (below).
2114 mutex_enter(&t->t_delay_lock);
2115 if (lwp_timer_enqueue(&lwpt) != 0) {
2116 mutex_exit(&t->t_delay_lock);
2117 imm_timeout = 1;
2118 timedwait = NULL;
2121 t->t_flag |= T_WAITCVSEM;
2122 lwp_block(&lwpchan);
2124 * Nothing should happen to cause the lwp to sleep
2125 * again until after it returns from swtch().
2127 if (timedwait)
2128 mutex_exit(&t->t_delay_lock);
2129 locked = 0;
2130 lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2131 if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
2132 (imm_timeout | imm_unpark))
2133 setrun(t);
2134 swtch();
2135 t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2136 if (timedwait)
2137 tim = lwp_timer_dequeue(&lwpt);
2138 setallwatch();
2139 if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
2140 MUSTRETURN(p, t) || imm_unpark)
2141 error = EINTR;
2142 else if (imm_timeout || (timedwait && tim == -1))
2143 error = ETIME;
2144 lwp->lwp_asleep = 0;
2145 lwp->lwp_sysabort = 0;
2146 watched = watch_disable_addr((caddr_t)sp,
2147 sizeof (*sp), S_WRITE);
2148 lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2149 locked = 1;
2150 fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
2152 if (error == 0)
2153 suword32_noerr((void *)&sp->sema_count, --count);
2154 if (count != 0) {
2155 (void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2156 suword8_noerr(&sp->sema_waiters, waiters);
2158 lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2159 out:
2160 no_fault();
2161 if (watched)
2162 watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2163 if (tsp && check_park && !time_error)
2164 error = lwp_timer_copyout(&lwpt, error);
2165 if (error)
2166 return (set_errno(error));
2167 return (0);
2170 int
2171 lwp_sema_post(lwp_sema_t *sp)
2173 proc_t *p = ttoproc(curthread);
2174 label_t ljb;
2175 volatile int locked = 0;
2176 volatile int watched = 0;
2177 volatile uint16_t type = 0;
2178 int count;
2179 lwpchan_t lwpchan;
2180 uchar_t waiters;
2181 int error = 0;
2183 if ((caddr_t)sp >= p->p_as->a_userlimit)
2184 return (set_errno(EFAULT));
2186 watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2188 if (on_fault(&ljb)) {
2189 if (locked)
2190 lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2191 error = EFAULT;
2192 goto out;
2195 * Force Copy-on-write if necessary and ensure that the
2196 * synchronization object resides in read/write memory.
2197 * Cause an EFAULT return now if this is not so.
2199 fuword16_noerr(&sp->sema_type, (uint16_t *)&type);
2200 suword16_noerr(&sp->sema_type, type);
2201 if (!get_lwpchan(curproc->p_as, (caddr_t)sp, type,
2202 &lwpchan, LWPCHAN_CVPOOL)) {
2203 error = EFAULT;
2204 goto out;
2206 lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2207 locked = 1;
2208 fuword32_noerr(&sp->sema_count, (uint32_t *)&count);
2209 if (count == _SEM_VALUE_MAX)
2210 error = EOVERFLOW;
2211 else
2212 suword32_noerr(&sp->sema_count, ++count);
2213 if (count == 1) {
2214 fuword8_noerr(&sp->sema_waiters, &waiters);
2215 if (waiters) {
2216 (void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2217 suword8_noerr(&sp->sema_waiters, waiters);
2220 lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2221 out:
2222 no_fault();
2223 if (watched)
2224 watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2225 if (error)
2226 return (set_errno(error));
2227 return (0);
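/*
 * lwp_sema_post() increments sema_count, failing with EOVERFLOW at
 * _SEM_VALUE_MAX, and wakes one waiter only on the 0 -> 1 transition.
 * Later wakeups are propagated by the consumers themselves:
 * lwp_sema_timedwait() and lwp_sema_trywait() release a further waiter
 * whenever the count is still non-zero after their own decrement.
 */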
2230 #define TRW_WANT_WRITE 0x1
2231 #define TRW_LOCK_GRANTED 0x2
2233 #define READ_LOCK 0
2234 #define WRITE_LOCK 1
2235 #define TRY_FLAG 0x10
2236 #define READ_LOCK_TRY (READ_LOCK | TRY_FLAG)
2237 #define WRITE_LOCK_TRY (WRITE_LOCK | TRY_FLAG)
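/*
 * The rwlock code below manipulates the user-level rwlock_readers word
 * ("rwstate") directly.  Its layout is defined in <sys/synch32.h>; only
 * these relationships are relied on here:
 *	URW_READERS_MASK	low-order bits holding the reader count
 *	URW_WRITE_LOCKED	the lock is write-held
 *	URW_HAS_WAITERS		waiters may be queued (possibly a false
 *				positive, see the comments below)
 */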
2240 * Release one writer or one or more readers. Compute the rwstate word to
2241 * reflect the new state of the queue. For a safe hand-off we copy the new
2242 * rwstate value back to userland before we wake any of the new lock holders.
2244 * Note that sleepq_insert() implements a prioritized FIFO (with writers
2245 * being given precedence over readers of the same priority).
2247 * If the first thread is a reader we scan the queue releasing all readers
2248 * until we hit a writer or the end of the queue. If the first thread is a
2249 * writer we still need to check for another writer.
2251 void
2252 lwp_rwlock_release(lwpchan_t *lwpchan, lwp_rwlock_t *rw)
2254 sleepq_head_t *sqh;
2255 kthread_t *tp;
2256 kthread_t **tpp;
2257 kthread_t *tpnext;
2258 kthread_t *wakelist = NULL;
2259 uint32_t rwstate = 0;
2260 int wcount = 0;
2261 int rcount = 0;
2263 sqh = lwpsqhash(lwpchan);
2264 disp_lock_enter(&sqh->sq_lock);
2265 tpp = &sqh->sq_queue.sq_first;
2266 while ((tp = *tpp) != NULL) {
2267 if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
2268 tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
2269 if (tp->t_writer & TRW_WANT_WRITE) {
2270 if ((wcount++ == 0) && (rcount == 0)) {
2271 rwstate |= URW_WRITE_LOCKED;
2273 /* Just one writer to wake. */
2274 sleepq_unlink(tpp, tp);
2275 wakelist = tp;
2277 /* tpp already set for next thread. */
2278 continue;
2279 } else {
2280 rwstate |= URW_HAS_WAITERS;
2281 /* We need look no further. */
2282 break;
2284 } else {
2285 rcount++;
2286 if (wcount == 0) {
2287 rwstate++;
2289 /* Add reader to wake list. */
2290 sleepq_unlink(tpp, tp);
2291 tp->t_link = wakelist;
2292 wakelist = tp;
2294 /* tpp already set for next thread. */
2295 continue;
2296 } else {
2297 rwstate |= URW_HAS_WAITERS;
2298 /* We need look no further. */
2299 break;
2303 tpp = &tp->t_link;
2306 /* Copy the new rwstate back to userland. */
2307 suword32_noerr(&rw->rwlock_readers, rwstate);
2309 /* Wake the new lock holder(s) up. */
2310 tp = wakelist;
2311 while (tp != NULL) {
2312 DTRACE_SCHED1(wakeup, kthread_t *, tp);
2313 tp->t_wchan0 = NULL;
2314 tp->t_wchan = NULL;
2315 tp->t_sobj_ops = NULL;
2316 tp->t_writer |= TRW_LOCK_GRANTED;
2317 tpnext = tp->t_link;
2318 tp->t_link = NULL;
2319 CL_WAKEUP(tp);
2320 thread_unlock_high(tp);
2321 tp = tpnext;
2324 disp_lock_exit(&sqh->sq_lock);
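/*
 * Worked example for lwp_rwlock_release(), assuming a sleep queue of
 * [ R1, R2, W1, R3 ] in priority/FIFO order (purely illustrative):
 * R1 and R2 are unlinked onto the wake list and the reader count in
 * rwstate becomes 2; W1 stops the scan, so URW_HAS_WAITERS is set on
 * behalf of W1 and R3; the new rwstate is copied out to userland before
 * R1 and R2 are actually woken, which is the safe hand-off described
 * above.
 */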
2328 * We enter here holding the user-level mutex, which we must release before
2329 * returning or blocking. Based on lwp_cond_wait().
2331 static int
2332 lwp_rwlock_lock(lwp_rwlock_t *rw, timespec_t *tsp, int rd_wr)
2334 lwp_mutex_t *mp = NULL;
2335 kthread_t *t = curthread;
2336 kthread_t *tp;
2337 klwp_t *lwp = ttolwp(t);
2338 proc_t *p = ttoproc(t);
2339 lwp_timer_t lwpt;
2340 lwpchan_t lwpchan;
2341 lwpchan_t mlwpchan;
2342 caddr_t timedwait;
2343 volatile uint16_t type = 0;
2344 volatile uint8_t mtype = 0;
2345 uchar_t mwaiters;
2346 volatile int error = 0;
2347 int time_error;
2348 clock_t tim = -1;
2349 volatile int locked = 0;
2350 volatile int mlocked = 0;
2351 volatile int watched = 0;
2352 volatile int mwatched = 0;
2353 label_t ljb;
2354 volatile int no_lwpchan = 1;
2355 int imm_timeout = 0;
2356 int try_flag;
2357 uint32_t rwstate;
2358 int acquired = 0;
2360 /* We only check rw because the mutex is included in it. */
2361 if ((caddr_t)rw >= p->p_as->a_userlimit)
2362 return (set_errno(EFAULT));
2365 * Put the lwp in an orderly state for debugging,
2366 * in case we are stopped while sleeping, below.
2368 prstop(PR_REQUESTED, 0);
2370 /* We must only report this error if we are about to sleep (later). */
2371 timedwait = (caddr_t)tsp;
2372 if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
2373 lwpt.lwpt_imm_timeout) {
2374 imm_timeout = 1;
2375 timedwait = NULL;
2378 (void) new_mstate(t, LMS_USER_LOCK);
2380 if (on_fault(&ljb)) {
2381 if (no_lwpchan) {
2382 error = EFAULT;
2383 goto out_nodrop;
2385 if (mlocked) {
2386 mlocked = 0;
2387 lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2389 if (locked) {
2390 locked = 0;
2391 lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2394 * Set up another on_fault() for a possible fault
2395 * on the user lock accessed at "out_drop".
2397 if (on_fault(&ljb)) {
2398 if (mlocked) {
2399 mlocked = 0;
2400 lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2402 error = EFAULT;
2403 goto out_nodrop;
2405 error = EFAULT;
2406 goto out_nodrop;
2409 /* Process rd_wr (including sanity check). */
2410 try_flag = (rd_wr & TRY_FLAG);
2411 rd_wr &= ~TRY_FLAG;
2412 if ((rd_wr != READ_LOCK) && (rd_wr != WRITE_LOCK)) {
2413 error = EINVAL;
2414 goto out_nodrop;
2418 * Force Copy-on-write if necessary and ensure that the
2419 * synchronization object resides in read/write memory.
2420 * Cause an EFAULT return now if this is not so.
2422 mp = &rw->mutex;
2423 fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
2424 fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2425 suword8_noerr(&mp->mutex_type, mtype);
2426 suword16_noerr(&rw->rwlock_type, type);
2428 /* We can only continue for simple USYNC_PROCESS locks. */
2429 if ((mtype != USYNC_PROCESS) || (type != USYNC_PROCESS)) {
2430 error = EINVAL;
2431 goto out_nodrop;
2434 /* Convert user level mutex, "mp", to a unique lwpchan. */
2435 if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
2436 &mlwpchan, LWPCHAN_MPPOOL)) {
2437 error = EFAULT;
2438 goto out_nodrop;
2441 /* Convert user level rwlock, "rw", to a unique lwpchan. */
2442 if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2443 &lwpchan, LWPCHAN_CVPOOL)) {
2444 error = EFAULT;
2445 goto out_nodrop;
2448 no_lwpchan = 0;
2449 watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2450 mwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2453 * lwpchan_lock() ensures that the calling LWP is put to sleep
2454 * atomically with respect to a possible wakeup which is a result
2455 * of lwp_rwlock_unlock().
2457 * What's misleading is that the LWP is put to sleep after the
2458 * rwlock's mutex is released. This is OK as long as the release
2459 * operation is also done while holding mlwpchan. The LWP is then
2460 * put to sleep when the possibility of pagefaulting or sleeping
2461 * has been completely eliminated.
2463 lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2464 locked = 1;
2465 lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2466 mlocked = 1;
2469 * Fetch the current rwlock state.
2471 * The possibility of spurious wake-ups or killed waiters means
2472 * rwstate's URW_HAS_WAITERS bit may indicate false positives.
2473 * We only fix these if they are important to us.
2475 * Although various error states can be observed here (e.g. the lock
2476  * is not held, but there are waiters) we assume these are application
2477 * errors and so we take no corrective action.
2479 fuword32_noerr(&rw->rwlock_readers, &rwstate);
2481 * We cannot legitimately get here from user-level
2482 * without URW_HAS_WAITERS being set.
2483 * Set it now to guard against user-level error.
2485 rwstate |= URW_HAS_WAITERS;
2488 * We can try only if the lock isn't held by a writer.
2490 if (!(rwstate & URW_WRITE_LOCKED)) {
2491 tp = lwp_queue_waiter(&lwpchan);
2492 if (tp == NULL) {
2494 * Hmmm, rwstate indicates waiters but there are
2495 * none queued. This could just be the result of a
2496 * spurious wakeup, so let's ignore it.
2498 * We now have a chance to acquire the lock
2499 * uncontended, but this is the last chance for
2500 * a writer to acquire the lock without blocking.
2502 if (rd_wr == READ_LOCK) {
2503 rwstate++;
2504 acquired = 1;
2505 } else if ((rwstate & URW_READERS_MASK) == 0) {
2506 rwstate |= URW_WRITE_LOCKED;
2507 acquired = 1;
2509 } else if (rd_wr == READ_LOCK) {
2511 * This is the last chance for a reader to acquire
2512 * the lock now, but it can only do so if there is
2513 * no writer of equal or greater priority at the
2514  * head of the queue.
2516 * It is also just possible that there is a reader
2517 * at the head of the queue. This may be the result
2518 * of a spurious wakeup or an application failure.
2519 * In this case we only acquire the lock if we have
2520 * equal or greater priority. It is not our job to
2521 * release spurious waiters.
2523 pri_t our_pri = DISP_PRIO(t);
2524 pri_t his_pri = DISP_PRIO(tp);
2526 if ((our_pri > his_pri) || ((our_pri == his_pri) &&
2527 !(tp->t_writer & TRW_WANT_WRITE))) {
2528 rwstate++;
2529 acquired = 1;
2534 if (acquired || try_flag || time_error) {
2536 * We're not going to block this time.
2538 suword32_noerr(&rw->rwlock_readers, rwstate);
2539 lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2540 locked = 0;
2542 if (acquired) {
2544 * Got the lock!
2546 error = 0;
2548 } else if (try_flag) {
2550 * We didn't get the lock and we're about to block.
2551 * If we're doing a trylock, return EBUSY instead.
2553 error = EBUSY;
2555 } else if (time_error) {
2557 * The SUSV3 POSIX spec is very clear that we should
2558 * get no error from validating the timer (above)
2559 * until we would actually sleep.
2561 error = time_error;
2564 goto out_drop;
2568 * We're about to block, so indicate what kind of waiter we are.
2570 t->t_writer = 0;
2571 if (rd_wr == WRITE_LOCK)
2572 t->t_writer = TRW_WANT_WRITE;
2573 suword32_noerr(&rw->rwlock_readers, rwstate);
2576 * Unlock the rwlock's mutex (pagefaults are possible here).
2578 set_owner_pid(mp, 0, 0);
2579 ulock_clear(&mp->mutex_lockw);
2580 fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2581 if (mwaiters != 0) {
2583 * Given the locking of mlwpchan around the release of
2584 * the mutex and checking for waiters, the following
2585 * call to lwp_release() can fail ONLY if the lock
2586 * acquirer is interrupted after setting the waiter bit,
2587 * calling lwp_block() and releasing mlwpchan.
2588 * In this case, it could get pulled off the LWP sleep
2589 * queue (via setrun()) before the following call to
2590 * lwp_release() occurs, and the lock requestor will
2591 * update the waiter bit correctly by re-evaluating it.
2593 if (lwp_release(&mlwpchan, &mwaiters, 0))
2594 suword8_noerr(&mp->mutex_waiters, mwaiters);
2596 lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2597 mlocked = 0;
2598 no_fault();
2600 if (mwatched) {
2601 watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2602 mwatched = 0;
2604 if (watched) {
2605 watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2606 watched = 0;
2609 if (timedwait) {
2611 * If we successfully queue the timeout,
2612 * then don't drop t_delay_lock until
2613 * we are on the sleep queue (below).
2615 mutex_enter(&t->t_delay_lock);
2616 if (lwp_timer_enqueue(&lwpt) != 0) {
2617 mutex_exit(&t->t_delay_lock);
2618 imm_timeout = 1;
2619 timedwait = NULL;
2622 t->t_flag |= T_WAITCVSEM;
2623 lwp_block(&lwpchan);
2626  * Nothing should happen to cause the LWP to go to sleep until after
2627 * it returns from swtch().
2629 if (timedwait)
2630 mutex_exit(&t->t_delay_lock);
2631 locked = 0;
2632 lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2633 if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
2634 setrun(t);
2635 swtch();
2638 * We're back, but we need to work out why. Were we interrupted? Did
2639 * we timeout? Were we granted the lock?
2641 error = EAGAIN;
2642 acquired = (t->t_writer & TRW_LOCK_GRANTED);
2643 t->t_writer = 0;
2644 t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2645 if (timedwait)
2646 tim = lwp_timer_dequeue(&lwpt);
2647 if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
2648 error = EINTR;
2649 else if (imm_timeout || (timedwait && tim == -1))
2650 error = ETIME;
2651 lwp->lwp_asleep = 0;
2652 lwp->lwp_sysabort = 0;
2653 setallwatch();
2656 * If we were granted the lock we don't care about EINTR or ETIME.
2658 if (acquired)
2659 error = 0;
2661 if (t->t_mstate == LMS_USER_LOCK)
2662 (void) new_mstate(t, LMS_SYSTEM);
2664 if (error)
2665 return (set_errno(error));
2666 return (0);
2668 out_drop:
2670 * Make sure that the user level lock is dropped before returning
2671 * to the caller.
2673 if (!mlocked) {
2674 lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2675 mlocked = 1;
2677 set_owner_pid(mp, 0, 0);
2678 ulock_clear(&mp->mutex_lockw);
2679 fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2680 if (mwaiters != 0) {
2682 * See comment above on lock clearing and lwp_release()
2683 * success/failure.
2685 if (lwp_release(&mlwpchan, &mwaiters, 0))
2686 suword8_noerr(&mp->mutex_waiters, mwaiters);
2688 lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2689 mlocked = 0;
2691 out_nodrop:
2692 no_fault();
2693 if (mwatched)
2694 watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2695 if (watched)
2696 watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2697 if (t->t_mstate == LMS_USER_LOCK)
2698 (void) new_mstate(t, LMS_SYSTEM);
2699 if (error)
2700 return (set_errno(error));
2701 return (0);
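/*
 * Summary of lwp_rwlock_lock() outcomes, as implemented above:
 * 0 when the lock is acquired, either immediately or because the
 * releaser granted it (TRW_LOCK_GRANTED); EBUSY for the try variants
 * when acquiring would block; EINTR or ETIME when interrupted or timed
 * out while blocked; EAGAIN when woken without a grant, a signal or a
 * timeout (e.g. a spurious wakeup); EINVAL or EFAULT for a bad rd_wr
 * argument, a non-USYNC_PROCESS lock, or inaccessible user memory.
 * Only the paths that reach out_drop release the user-level mutex
 * embedded in the rwlock; the out_nodrop paths leave it untouched.
 */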
2705 * We enter here holding the user-level mutex but, unlike lwp_rwlock_lock(),
2706 * we never drop the lock.
2708 static int
2709 lwp_rwlock_unlock(lwp_rwlock_t *rw)
2711 kthread_t *t = curthread;
2712 proc_t *p = ttoproc(t);
2713 lwpchan_t lwpchan;
2714 volatile uint16_t type = 0;
2715 volatile int error = 0;
2716 volatile int locked = 0;
2717 volatile int watched = 0;
2718 label_t ljb;
2719 volatile int no_lwpchan = 1;
2720 uint32_t rwstate;
2722 /* We only check rw because the mutex is included in it. */
2723 if ((caddr_t)rw >= p->p_as->a_userlimit)
2724 return (set_errno(EFAULT));
2726 if (on_fault(&ljb)) {
2727 if (no_lwpchan) {
2728 error = EFAULT;
2729 goto out_nodrop;
2731 if (locked) {
2732 locked = 0;
2733 lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2735 error = EFAULT;
2736 goto out_nodrop;
2740 * Force Copy-on-write if necessary and ensure that the
2741 * synchronization object resides in read/write memory.
2742 * Cause an EFAULT return now if this is not so.
2744 fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2745 suword16_noerr(&rw->rwlock_type, type);
2747 /* We can only continue for simple USYNC_PROCESS locks. */
2748 if (type != USYNC_PROCESS) {
2749 error = EINVAL;
2750 goto out_nodrop;
2753 /* Convert user level rwlock, "rw", to a unique lwpchan. */
2754 if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2755 &lwpchan, LWPCHAN_CVPOOL)) {
2756 error = EFAULT;
2757 goto out_nodrop;
2760 no_lwpchan = 0;
2761 watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2763 lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2764 locked = 1;
2767 * We can resolve multiple readers (except the last reader) here.
2768 * For the last reader or a writer we need lwp_rwlock_release(),
2769 * to which we also delegate the task of copying the new rwstate
2770 * back to userland (see the comment there).
2772 fuword32_noerr(&rw->rwlock_readers, &rwstate);
2773 if (rwstate & URW_WRITE_LOCKED)
2774 lwp_rwlock_release(&lwpchan, rw);
2775 else if ((rwstate & URW_READERS_MASK) > 0) {
2776 rwstate--;
2777 if ((rwstate & URW_READERS_MASK) == 0)
2778 lwp_rwlock_release(&lwpchan, rw);
2779 else
2780 suword32_noerr(&rw->rwlock_readers, rwstate);
2783 lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2784 locked = 0;
2785 error = 0;
2787 out_nodrop:
2788 no_fault();
2789 if (watched)
2790 watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2791 if (error)
2792 return (set_errno(error));
2793 return (0);
2796 int
2797 lwp_rwlock_sys(int subcode, lwp_rwlock_t *rwlp, timespec_t *tsp)
2799 switch (subcode) {
2800 case 0:
2801 return (lwp_rwlock_lock(rwlp, tsp, READ_LOCK));
2802 case 1:
2803 return (lwp_rwlock_lock(rwlp, tsp, WRITE_LOCK));
2804 case 2:
2805 return (lwp_rwlock_lock(rwlp, NULL, READ_LOCK_TRY));
2806 case 3:
2807 return (lwp_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY));
2808 case 4:
2809 return (lwp_rwlock_unlock(rwlp));
2811 return (set_errno(EINVAL));
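/*
 * The subcodes above select, in order: read lock, write lock, try-read,
 * try-write and unlock, all for USYNC_PROCESS rwlocks only.  For example,
 * lwp_rwlock_sys(2, rwlp, NULL) attempts a non-blocking read lock and
 * fails with EBUSY if a writer currently holds the lock.  Any other
 * subcode returns EINVAL.
 */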
2815 * Return the owner of the user-level s-object.
2816 * Since we can't really do this, return NULL.
2818 /* ARGSUSED */
2819 static kthread_t *
2820 lwpsobj_owner(caddr_t sobj)
2822 return (NULL);
2826 * Wake up a thread asleep on a user-level synchronization
2827 * object.
2829 static void
2830 lwp_unsleep(kthread_t *t)
2832 ASSERT(THREAD_LOCK_HELD(t));
2833 if (t->t_wchan0 != NULL) {
2834 sleepq_head_t *sqh;
2835 sleepq_t *sqp = t->t_sleepq;
2837 if (sqp != NULL) {
2838 sqh = lwpsqhash(&t->t_lwpchan);
2839 ASSERT(&sqh->sq_queue == sqp);
2840 sleepq_unsleep(t);
2841 disp_lock_exit_high(&sqh->sq_lock);
2842 CL_SETRUN(t);
2843 return;
2846 panic("lwp_unsleep: thread %p not on sleepq", (void *)t);
2850 * Change the priority of a thread asleep on a user-level
2851 * synchronization object. To maintain proper priority order,
2852 * we:
2853 * o dequeue the thread.
2854 * o change its priority.
2855 * o re-enqueue the thread.
2856 * Assumption: the thread is locked on entry.
2858 static void
2859 lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip)
2861 ASSERT(THREAD_LOCK_HELD(t));
2862 if (t->t_wchan0 != NULL) {
2863 sleepq_t *sqp = t->t_sleepq;
2865 sleepq_dequeue(t);
2866 *t_prip = pri;
2867 sleepq_insert(sqp, t);
2868 } else
2869 panic("lwp_change_pri: %p not on a sleep queue", (void *)t);
2873 * Clean up a left-over process-shared robust mutex
2875 static void
2876 lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg)
2878 uint16_t flag;
2879 uchar_t waiters;
2880 label_t ljb;
2881 pid_t owner_pid;
2882 lwp_mutex_t *lp;
2883 volatile int locked = 0;
2884 volatile int watched = 0;
2885 volatile struct upimutex *upimutex = NULL;
2886 volatile int upilocked = 0;
2888 if ((ent->lwpchan_type & (USYNC_PROCESS | LOCK_ROBUST))
2889 != (USYNC_PROCESS | LOCK_ROBUST))
2890 return;
2892 lp = (lwp_mutex_t *)ent->lwpchan_addr;
2893 watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2894 if (on_fault(&ljb)) {
2895 if (locked)
2896 lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2897 if (upilocked)
2898 upimutex_unlock((upimutex_t *)upimutex, 0);
2899 goto out;
2902 fuword32_noerr(&lp->mutex_ownerpid, (uint32_t *)&owner_pid);
2904 if (UPIMUTEX(ent->lwpchan_type)) {
2905 lwpchan_t lwpchan = ent->lwpchan_lwpchan;
2906 upib_t *upibp = &UPI_CHAIN(lwpchan);
2908 if (owner_pid != curproc->p_pid)
2909 goto out;
2910 mutex_enter(&upibp->upib_lock);
2911 upimutex = upi_get(upibp, &lwpchan);
2912 if (upimutex == NULL || upimutex->upi_owner != curthread) {
2913 mutex_exit(&upibp->upib_lock);
2914 goto out;
2916 mutex_exit(&upibp->upib_lock);
2917 upilocked = 1;
2918 flag = lwp_clear_mutex(lp, lockflg);
2919 suword8_noerr(&lp->mutex_lockw, 0);
2920 upimutex_unlock((upimutex_t *)upimutex, flag);
2921 } else {
2922 lwpchan_lock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2923 locked = 1;
2925 * Clear the spinners count because one of our
2926 * threads could have been spinning for this lock
2927 * at user level when the process was suddenly killed.
2928 * There is no harm in this since user-level libc code
2929 * will adapt to the sudden change in the spinner count.
2931 suword8_noerr(&lp->mutex_spinners, 0);
2932 if (owner_pid != curproc->p_pid) {
2934 * We are not the owner. There may or may not be one.
2935 * If there are waiters, we wake up one or all of them.
2936 * It doesn't hurt to wake them up in error since
2937 * they will just retry the lock and go to sleep
2938 * again if necessary.
2940 fuword8_noerr(&lp->mutex_waiters, &waiters);
2941 if (waiters != 0) { /* there are waiters */
2942 fuword16_noerr(&lp->mutex_flag, &flag);
2943 if (flag & LOCK_NOTRECOVERABLE) {
2944 lwp_release_all(&ent->lwpchan_lwpchan);
2945 suword8_noerr(&lp->mutex_waiters, 0);
2946 } else if (lwp_release(&ent->lwpchan_lwpchan,
2947 &waiters, 0)) {
2948 suword8_noerr(&lp->mutex_waiters,
2949 waiters);
2952 } else {
2954 * We are the owner. Release it.
2956 (void) lwp_clear_mutex(lp, lockflg);
2957 ulock_clear(&lp->mutex_lockw);
2958 fuword8_noerr(&lp->mutex_waiters, &waiters);
2959 if (waiters &&
2960 lwp_release(&ent->lwpchan_lwpchan, &waiters, 0))
2961 suword8_noerr(&lp->mutex_waiters, waiters);
2963 lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2965 out:
2966 no_fault();
2967 if (watched)
2968 watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
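/*
 * lwp_mutex_cleanup() handles both flavors of robust mutex: a
 * priority-inheriting (UPIMUTEX) lock is touched only if this process
 * and thread actually own it, and is dropped via upimutex_unlock() with
 * the flag computed by lwp_clear_mutex(); an ordinary robust lock is
 * either released outright (when we are the owner) or its waiters are
 * simply woken so they can re-evaluate the lock themselves.  The lockflg
 * argument is passed to lwp_clear_mutex(), which presumably records it
 * in the user-level mutex_flag word (LOCK_OWNERDEAD or LOCK_UNMAPPED,
 * depending on why the lwpchan entry is being cleaned up).
 */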
2972 * Register a process-shared robust mutex in the lwpchan cache.
2974 int
2975 lwp_mutex_register(lwp_mutex_t *lp, caddr_t uaddr)
2977 int error = 0;
2978 volatile int watched;
2979 label_t ljb;
2980 uint8_t type;
2981 lwpchan_t lwpchan;
2983 if ((caddr_t)lp >= (caddr_t)USERLIMIT)
2984 return (set_errno(EFAULT));
2986 watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2988 if (on_fault(&ljb)) {
2989 error = EFAULT;
2990 } else {
2992 * Force Copy-on-write if necessary and ensure that the
2993 * synchronization object resides in read/write memory.
2994 * Cause an EFAULT return now if this is not so.
2996 fuword8_noerr(&lp->mutex_type, &type);
2997 suword8_noerr(&lp->mutex_type, type);
2998 if ((type & (USYNC_PROCESS|LOCK_ROBUST))
2999 != (USYNC_PROCESS|LOCK_ROBUST)) {
3000 error = EINVAL;
3001 } else if (!lwpchan_get_mapping(curproc->p_as, (caddr_t)lp,
3002 uaddr, type, &lwpchan, LWPCHAN_MPPOOL)) {
3003 error = EFAULT;
3006 no_fault();
3007 if (watched)
3008 watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3009 if (error)
3010 return (set_errno(error));
3011 return (0);
3015 * There is a user-level robust lock registration in libc.
3016 * Mark it as invalid by storing -1 into the location of the pointer.
3018 static void
3019 lwp_mutex_unregister(void *uaddr)
3021 if (get_udatamodel() == DATAMODEL_NATIVE) {
3022 (void) sulword(uaddr, (ulong_t)-1);
3023 #ifdef _SYSCALL32_IMPL
3024 } else {
3025 (void) suword32(uaddr, (uint32_t)-1);
3026 #endif
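/*
 * The value stored by lwp_mutex_unregister() is a sentinel, not a valid
 * pointer: (ulong_t)-1 or (uint32_t)-1 depending on the data model of
 * the target process.  How libc interprets the sentinel is a user-level
 * detail outside this file; here the registration word is only made
 * unmistakably invalid.
 */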
3030 int
3031 lwp_mutex_trylock(lwp_mutex_t *lp, uintptr_t owner)
3033 kthread_t *t = curthread;
3034 proc_t *p = ttoproc(t);
3035 int error = 0;
3036 volatile int locked = 0;
3037 volatile int watched = 0;
3038 label_t ljb;
3039 volatile uint8_t type = 0;
3040 uint16_t flag;
3041 lwpchan_t lwpchan;
3043 if ((caddr_t)lp >= p->p_as->a_userlimit)
3044 return (set_errno(EFAULT));
3046 (void) new_mstate(t, LMS_USER_LOCK);
3048 if (on_fault(&ljb)) {
3049 if (locked)
3050 lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3051 error = EFAULT;
3052 goto out;
3055 * Force Copy-on-write if necessary and ensure that the
3056 * synchronization object resides in read/write memory.
3057 * Cause an EFAULT return now if this is not so.
3059 fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3060 suword8_noerr(&lp->mutex_type, type);
3061 if (UPIMUTEX(type)) {
3062 no_fault();
3063 error = lwp_upimutex_lock(lp, type, UPIMUTEX_TRY, NULL);
3064 if (error == 0 || error == EOWNERDEAD ||
3065 error == ELOCKUNMAPPED) {
3066 volatile int locked = error != 0;
3067 if (on_fault(&ljb)) {
3068 if (locked != 0)
3069 error = lwp_upimutex_unlock(lp, type);
3070 else
3071 error = EFAULT;
3072 goto upierr;
3074 set_owner_pid(lp, owner,
3075 (type & USYNC_PROCESS)? p->p_pid : 0);
3076 no_fault();
3079 upierr:
3080 if (error)
3081 return (set_errno(error));
3082 return (0);
3084 if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3085 &lwpchan, LWPCHAN_MPPOOL)) {
3086 error = EFAULT;
3087 goto out;
3089 lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3090 locked = 1;
3091 if (type & LOCK_ROBUST) {
3092 fuword16_noerr(&lp->mutex_flag, &flag);
3093 if (flag & LOCK_NOTRECOVERABLE) {
3094 lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3095 error = ENOTRECOVERABLE;
3096 goto out;
3100 watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3102 if (!ulock_try(&lp->mutex_lockw))
3103 error = EBUSY;
3104 else {
3105 set_owner_pid(lp, owner, (type & USYNC_PROCESS)? p->p_pid : 0);
3106 if (type & LOCK_ROBUST) {
3107 fuword16_noerr(&lp->mutex_flag, &flag);
3108 if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3109 if (flag & LOCK_OWNERDEAD)
3110 error = EOWNERDEAD;
3111 else if (type & USYNC_PROCESS_ROBUST)
3112 error = ELOCKUNMAPPED;
3113 else
3114 error = EOWNERDEAD;
3118 locked = 0;
3119 lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3120 out:
3122 if (t->t_mstate == LMS_USER_LOCK)
3123 (void) new_mstate(t, LMS_SYSTEM);
3125 no_fault();
3126 if (watched)
3127 watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3128 if (error)
3129 return (set_errno(error));
3130 return (0);
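/*
 * lwp_mutex_trylock() return values, as implemented above: 0 on success;
 * EBUSY when the lock word is already held; ENOTRECOVERABLE for a robust
 * lock already marked unrecoverable; EOWNERDEAD or ELOCKUNMAPPED when a
 * robust lock is acquired but its previous owner died or its mapping was
 * torn down (the caller does hold the lock in those two cases); and
 * EFAULT for inaccessible user memory.
 */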
3134  * Unlock the mutex and unblock lwps that are trying to acquire this mutex.
3135  * A blocked lwp resumes and retries acquiring the lock.
3137 int
3138 lwp_mutex_unlock(lwp_mutex_t *lp)
3140 proc_t *p = ttoproc(curthread);
3141 lwpchan_t lwpchan;
3142 uchar_t waiters;
3143 volatile int locked = 0;
3144 volatile int watched = 0;
3145 volatile uint8_t type = 0;
3146 label_t ljb;
3147 uint16_t flag;
3148 int error = 0;
3150 if ((caddr_t)lp >= p->p_as->a_userlimit)
3151 return (set_errno(EFAULT));
3153 if (on_fault(&ljb)) {
3154 if (locked)
3155 lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3156 error = EFAULT;
3157 goto out;
3161 * Force Copy-on-write if necessary and ensure that the
3162 * synchronization object resides in read/write memory.
3163 * Cause an EFAULT return now if this is not so.
3165 fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3166 suword8_noerr(&lp->mutex_type, type);
3168 if (UPIMUTEX(type)) {
3169 no_fault();
3170 error = lwp_upimutex_unlock(lp, type);
3171 if (error)
3172 return (set_errno(error));
3173 return (0);
3176 watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3178 if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3179 &lwpchan, LWPCHAN_MPPOOL)) {
3180 error = EFAULT;
3181 goto out;
3183 lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3184 locked = 1;
3185 if (type & LOCK_ROBUST) {
3186 fuword16_noerr(&lp->mutex_flag, &flag);
3187 if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3188 flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
3189 flag |= LOCK_NOTRECOVERABLE;
3190 suword16_noerr(&lp->mutex_flag, flag);
3193 set_owner_pid(lp, 0, 0);
3194 ulock_clear(&lp->mutex_lockw);
3196 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
3197 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
3198 * may fail. If it fails, do not write into the waiter bit.
3199 * The call to lwp_release() might fail due to one of three reasons:
3201 * 1. due to the thread which set the waiter bit not actually
3202 * sleeping since it got the lock on the re-try. The waiter
3203 * bit will then be correctly updated by that thread. This
3204 * window may be closed by reading the wait bit again here
3205 * and not calling lwp_release() at all if it is zero.
3206 * 2. the thread which set the waiter bit and went to sleep
3207  * was woken up by a signal. In this case, the waiter recomputes
3208  * the waiter bit on its EINTR return path.
3209 * 3. the waiter bit read by lwp_mutex_wakeup() was in
3210 * memory that has been re-used after the lock was dropped.
3211 * In this case, writing into the waiter bit would cause data
3212 * corruption.
3214 fuword8_noerr(&lp->mutex_waiters, &waiters);
3215 if (waiters) {
3216 if ((type & LOCK_ROBUST) &&
3217 (flag & LOCK_NOTRECOVERABLE)) {
3218 lwp_release_all(&lwpchan);
3219 suword8_noerr(&lp->mutex_waiters, 0);
3220 } else if (lwp_release(&lwpchan, &waiters, 0)) {
3221 suword8_noerr(&lp->mutex_waiters, waiters);
3225 lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3226 out:
3227 no_fault();
3228 if (watched)
3229 watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3230 if (error)
3231 return (set_errno(error));
3232 return (0);
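/*
 * Illustration of failure case 1 above (a sketch, not an exact trace):
 * the contending lwp sets mutex_waiters, retries, and wins the lock
 * before ever reaching lwp_block().  The lwp_release() call here then
 * finds nobody asleep on the lwpchan and fails, so mutex_waiters is
 * deliberately left alone; the thread that set the bit corrects it
 * itself, as described above.
 */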