usr/src/lib/libc/port/threads/synch.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright 2015, Joyent, Inc.
  26  */
  27
  28 #include "lint.h"
  29 #include "thr_uberdata.h"
  30 #include <sys/rtpriocntl.h>
  31 #include <sys/sdt.h>
  32 #include <atomic.h>
  33
  34 #if defined(THREAD_DEBUG)
  35 #define INCR32(x)       (((x) != UINT32_MAX)? (x)++ : 0)
  36 #define INCR(x)         ((x)++)
  37 #define DECR(x)         ((x)--)
  38 #define MAXINCR(m, x)   ((m < ++x)? (m = x) : 0)
  39 #else
  40 #define INCR32(x)
  41 #define INCR(x)
  42 #define DECR(x)
  43 #define MAXINCR(m, x)
  44 #endif
  45
/*
 * This mutex is initialized to be held by lwp#1.
 * It is used to block a thread that has returned from a mutex_lock()
 * of a LOCK_PRIO_INHERIT mutex with an unrecoverable error.
 * The lock byte and owner are set by mutex_setup(), below.
 */
mutex_t stall_mutex = DEFAULTMUTEX;

/*
 * Forward declarations of file-local (static) helpers.
 * mutex_queuelock_adaptive() is called from spin_lock_set(), below.
 */
static int shared_mutex_held(mutex_t *);
static int mutex_queuelock_adaptive(mutex_t *);
static void mutex_wakeup_all(mutex_t *);
  56
  57 /*
  58  * Lock statistics support functions.
  59  */
  60 void
  61 record_begin_hold(tdb_mutex_stats_t *msp)
  62 {
  63         tdb_incr(msp->mutex_lock);
  64         msp->mutex_begin_hold = gethrtime();
  65 }
  66
  67 hrtime_t
  68 record_hold_time(tdb_mutex_stats_t *msp)
  69 {
  70         hrtime_t now = gethrtime();
  71
  72         if (msp->mutex_begin_hold)
  73                 msp->mutex_hold_time += now - msp->mutex_begin_hold;
  74         msp->mutex_begin_hold = 0;
  75         return (now);
  76 }
  77
  78 /*
  79  * Called once at library initialization.
  80  */
  81 void
  82 mutex_setup(void)
  83 {
  84         if (set_lock_byte(&stall_mutex.mutex_lockw))
  85                 thr_panic("mutex_setup() cannot acquire stall_mutex");
  86         stall_mutex.mutex_owner = (uintptr_t)curthread;
  87 }
  88
/*
 * The default spin count of 1000 is experimentally determined.
 * On sun4u machines with any number of processors it could be raised
 * to 10,000 but that (experimentally) makes almost no difference.
 * The environment variable:
 *	_THREAD_ADAPTIVE_SPIN=count
 * can be used to override and set the count in the range [0 .. 1,000,000].
 */
int	thread_adaptive_spin = 1000;
/*
 * NOTE(review): presumably the limit handed to spinners_incr() as its
 * max_spinners argument -- confirm against the callers, which are not
 * in this part of the file.
 */
uint_t	thread_max_spinners = 100;
/*
 * When non-zero, QVERIFY() (THREAD_DEBUG builds only) also walks every
 * thread on the sleep queue; see the _THREAD_QUEUE_VERIFY note there.
 */
int	thread_queue_verify = 0;
/*
 * NOTE(review): not referenced in this part of the file; presumably a
 * cached processor count -- confirm where it is assigned.
 */
static	int	ncpus;

/*
 * Distinguish spinning for queue locks from spinning for regular locks.
 * We try harder to acquire queue locks by spinning.
 * The environment variable:
 *	_THREAD_QUEUE_SPIN=count
 * can be used to override and set the count in the range [0 .. 1,000,000].
 */
int	thread_queue_spin = 10000;
 110
 111 #define ALL_ATTRIBUTES                          \
 112         (LOCK_RECURSIVE | LOCK_ERRORCHECK |     \
 113         LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT | \
 114         LOCK_ROBUST)
 115
 116 /*
 117  * 'type' can be one of USYNC_THREAD, USYNC_PROCESS, or USYNC_PROCESS_ROBUST,
 118  * augmented by zero or more the flags:
 119  *      LOCK_RECURSIVE
 120  *      LOCK_ERRORCHECK
 121  *      LOCK_PRIO_INHERIT
 122  *      LOCK_PRIO_PROTECT
 123  *      LOCK_ROBUST
 124  */
 125 #pragma weak _mutex_init = mutex_init
 126 /* ARGSUSED2 */
 127 int
 128 mutex_init(mutex_t *mp, int type, void *arg)
 129 {
 130         int basetype = (type & ~ALL_ATTRIBUTES);
 131         const pcclass_t *pccp;
 132         int error = 0;
 133         int ceil;
 134
 135         if (basetype == USYNC_PROCESS_ROBUST) {
 136                 /*
 137                  * USYNC_PROCESS_ROBUST is a deprecated historical type.
 138                  * We change it into (USYNC_PROCESS | LOCK_ROBUST) but
 139                  * retain the USYNC_PROCESS_ROBUST flag so we can return
 140                  * ELOCKUNMAPPED when necessary (only USYNC_PROCESS_ROBUST
 141                  * mutexes will ever draw ELOCKUNMAPPED).
 142                  */
 143                 type |= (USYNC_PROCESS | LOCK_ROBUST);
 144                 basetype = USYNC_PROCESS;
 145         }
 146
 147         if (type & LOCK_PRIO_PROTECT)
 148                 pccp = get_info_by_policy(SCHED_FIFO);
 149         if ((basetype != USYNC_THREAD && basetype != USYNC_PROCESS) ||
 150             (type & (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT))
 151             == (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT) ||
 152             ((type & LOCK_PRIO_PROTECT) &&
 153             ((ceil = *(int *)arg) < pccp->pcc_primin ||
 154             ceil > pccp->pcc_primax))) {
 155                 error = EINVAL;
 156         } else if (type & LOCK_ROBUST) {
 157                 /*
 158                  * Callers of mutex_init() with the LOCK_ROBUST attribute
 159                  * are required to pass an initially all-zero mutex.
 160                  * Multiple calls to mutex_init() are allowed; all but
 161                  * the first return EBUSY.  A call to mutex_init() is
 162                  * allowed to make an inconsistent robust lock consistent
 163                  * (for historical usage, even though the proper interface
 164                  * for this is mutex_consistent()).  Note that we use
 165                  * atomic_or_16() to set the LOCK_INITED flag so as
 166                  * not to disturb surrounding bits (LOCK_OWNERDEAD, etc).
 167                  */
 168                 if (!(mp->mutex_flag & LOCK_INITED)) {
 169                         mp->mutex_type = (uint8_t)type;
 170                         atomic_or_16(&mp->mutex_flag, LOCK_INITED);
 171                         mp->mutex_magic = MUTEX_MAGIC;
 172                 } else if (type != mp->mutex_type ||
 173                     ((type & LOCK_PRIO_PROTECT) && mp->mutex_ceiling != ceil)) {
 174                         error = EINVAL;
 175                 } else if (mutex_consistent(mp) != 0) {
 176                         error = EBUSY;
 177                 }
 178                 /* register a process robust mutex with the kernel */
 179                 if (basetype == USYNC_PROCESS)
 180                         register_lock(mp);
 181         } else {
 182                 (void) memset(mp, 0, sizeof (*mp));
 183                 mp->mutex_type = (uint8_t)type;
 184                 mp->mutex_flag = LOCK_INITED;
 185                 mp->mutex_magic = MUTEX_MAGIC;
 186         }
 187
 188         if (error == 0 && (type & LOCK_PRIO_PROTECT)) {
 189                 mp->mutex_ceiling = ceil;
 190         }
 191
 192         /*
 193          * This should be at the beginning of the function,
 194          * but for the sake of old broken applications that
 195          * do not have proper alignment for their mutexes
 196          * (and don't check the return code from mutex_init),
 197          * we put it here, after initializing the mutex regardless.
 198          */
 199         if (error == 0 &&
 200             ((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
 201             curthread->ul_misaligned == 0)
 202                 error = EINVAL;
 203
 204         return (error);
 205 }
 206
 207 /*
 208  * Delete mp from list of ceiling mutexes owned by curthread.
 209  * Return 1 if the head of the chain was updated.
 210  */
 211 int
 212 _ceil_mylist_del(mutex_t *mp)
 213 {
 214         ulwp_t *self = curthread;
 215         mxchain_t **mcpp;
 216         mxchain_t *mcp;
 217
 218         for (mcpp = &self->ul_mxchain;
 219             (mcp = *mcpp) != NULL;
 220             mcpp = &mcp->mxchain_next) {
 221                 if (mcp->mxchain_mx == mp) {
 222                         *mcpp = mcp->mxchain_next;
 223                         lfree(mcp, sizeof (*mcp));
 224                         return (mcpp == &self->ul_mxchain);
 225                 }
 226         }
 227         return (0);
 228 }
 229
 230 /*
 231  * Add mp to the list of ceiling mutexes owned by curthread.
 232  * Return ENOMEM if no memory could be allocated.
 233  */
 234 int
 235 _ceil_mylist_add(mutex_t *mp)
 236 {
 237         ulwp_t *self = curthread;
 238         mxchain_t *mcp;
 239
 240         if ((mcp = lmalloc(sizeof (*mcp))) == NULL)
 241                 return (ENOMEM);
 242         mcp->mxchain_mx = mp;
 243         mcp->mxchain_next = self->ul_mxchain;
 244         self->ul_mxchain = mcp;
 245         return (0);
 246 }
 247
 248 /*
 249  * Helper function for _ceil_prio_inherit() and _ceil_prio_waive(), below.
 250  */
 251 static void
 252 set_rt_priority(ulwp_t *self, int prio)
 253 {
 254         pcparms_t pcparm;
 255
 256         pcparm.pc_cid = self->ul_rtclassid;
 257         ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = RT_NOCHANGE;
 258         ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
 259         (void) priocntl(P_LWPID, self->ul_lwpid, PC_SETPARMS, &pcparm);
 260 }
 261
 262 /*
 263  * Inherit priority from ceiling.
 264  * This changes the effective priority, not the assigned priority.
 265  */
 266 void
 267 _ceil_prio_inherit(int prio)
 268 {
 269         ulwp_t *self = curthread;
 270
 271         self->ul_epri = prio;
 272         set_rt_priority(self, prio);
 273 }
 274
 275 /*
 276  * Waive inherited ceiling priority.  Inherit from head of owned ceiling locks
 277  * if holding at least one ceiling lock.  If no ceiling locks are held at this
 278  * point, disinherit completely, reverting back to assigned priority.
 279  */
 280 void
 281 _ceil_prio_waive(void)
 282 {
 283         ulwp_t *self = curthread;
 284         mxchain_t *mcp = self->ul_mxchain;
 285         int prio;
 286
 287         if (mcp == NULL) {
 288                 prio = self->ul_pri;
 289                 self->ul_epri = 0;
 290         } else {
 291                 prio = mcp->mxchain_mx->mutex_ceiling;
 292                 self->ul_epri = prio;
 293         }
 294         set_rt_priority(self, prio);
 295 }
 296
 297 /*
 298  * Clear the lock byte.  Retain the waiters byte and the spinners byte.
 299  * Return the old value of the lock word.
 300  */
 301 static uint32_t
 302 clear_lockbyte(volatile uint32_t *lockword)
 303 {
 304         uint32_t old;
 305         uint32_t new;
 306
 307         do {
 308                 old = *lockword;
 309                 new = old & ~LOCKMASK;
 310         } while (atomic_cas_32(lockword, old, new) != old);
 311
 312         return (old);
 313 }
 314
 315 /*
 316  * Same as clear_lockbyte(), but operates on mutex_lockword64.
 317  * The mutex_ownerpid field is cleared along with the lock byte.
 318  */
 319 static uint64_t
 320 clear_lockbyte64(volatile uint64_t *lockword64)
 321 {
 322         uint64_t old;
 323         uint64_t new;
 324
 325         do {
 326                 old = *lockword64;
 327                 new = old & ~LOCKMASK64;
 328         } while (atomic_cas_64(lockword64, old, new) != old);
 329
 330         return (old);
 331 }
 332
 333 /*
 334  * Similar to set_lock_byte(), which only tries to set the lock byte.
 335  * Here, we attempt to set the lock byte AND the mutex_ownerpid, keeping
 336  * the remaining bytes constant.  This atomic operation is required for the
 337  * correctness of process-shared robust locks, otherwise there would be
 338  * a window or vulnerability in which the lock byte had been set but the
 339  * mutex_ownerpid had not yet been set.  If the process were to die in
 340  * this window of vulnerability (due to some other thread calling exit()
 341  * or the process receiving a fatal signal), the mutex would be left locked
 342  * but without a process-ID to determine which process was holding the lock.
 343  * The kernel would then be unable to mark the robust mutex as LOCK_OWNERDEAD
 344  * when the process died.  For all other cases of process-shared locks, this
 345  * operation is just a convenience, for the sake of common code.
 346  *
 347  * This operation requires process-shared robust locks to be properly
 348  * aligned on an 8-byte boundary, at least on sparc machines, lest the
 349  * operation incur an alignment fault.  This is automatic when locks
 350  * are declared properly using the mutex_t or pthread_mutex_t data types
 351  * and the application does not allocate dynamic memory on less than an
 352  * 8-byte boundary.  See the 'horrible hack' comments below for cases
 353  * dealing with such broken applications.
 354  */
 355 static int
 356 set_lock_byte64(volatile uint64_t *lockword64, pid_t ownerpid)
 357 {
 358         uint64_t old;
 359         uint64_t new;
 360
 361         old = *lockword64 & ~LOCKMASK64;
 362         new = old | ((uint64_t)(uint_t)ownerpid << PIDSHIFT) | LOCKBYTE64;
 363         if (atomic_cas_64(lockword64, old, new) == old)
 364                 return (LOCKCLEAR);
 365
 366         return (LOCKSET);
 367 }
 368
 369 /*
 370  * Increment the spinners count in the mutex lock word.
 371  * Return 0 on success.  Return -1 if the count would overflow.
 372  */
 373 static int
 374 spinners_incr(volatile uint32_t *lockword, uint8_t max_spinners)
 375 {
 376         uint32_t old;
 377         uint32_t new;
 378
 379         do {
 380                 old = *lockword;
 381                 if (((old & SPINNERMASK) >> SPINNERSHIFT) >= max_spinners)
 382                         return (-1);
 383                 new = old + (1 << SPINNERSHIFT);
 384         } while (atomic_cas_32(lockword, old, new) != old);
 385
 386         return (0);
 387 }
 388
 389 /*
 390  * Decrement the spinners count in the mutex lock word.
 391  * Return the new value of the lock word.
 392  */
 393 static uint32_t
 394 spinners_decr(volatile uint32_t *lockword)
 395 {
 396         uint32_t old;
 397         uint32_t new;
 398
 399         do {
 400                 new = old = *lockword;
 401                 if (new & SPINNERMASK)
 402                         new -= (1 << SPINNERSHIFT);
 403         } while (atomic_cas_32(lockword, old, new) != old);
 404
 405         return (new);
 406 }
 407
 408 /*
 409  * Non-preemptive spin locks.  Used by queue_lock().
 410  * No lock statistics are gathered for these locks.
 411  * No DTrace probes are provided for these locks.
 412  */
 413 void
 414 spin_lock_set(mutex_t *mp)
 415 {
 416         ulwp_t *self = curthread;
 417
 418         no_preempt(self);
 419         if (set_lock_byte(&mp->mutex_lockw) == 0) {
 420                 mp->mutex_owner = (uintptr_t)self;
 421                 return;
 422         }
 423         /*
 424          * Spin for a while, attempting to acquire the lock.
 425          */
 426         INCR32(self->ul_spin_lock_spin);
 427         if (mutex_queuelock_adaptive(mp) == 0 ||
 428             set_lock_byte(&mp->mutex_lockw) == 0) {
 429                 mp->mutex_owner = (uintptr_t)self;
 430                 return;
 431         }
 432         /*
 433          * Try harder if we were previously at a no premption level.
 434          */
 435         if (self->ul_preempt > 1) {
 436                 INCR32(self->ul_spin_lock_spin2);
 437                 if (mutex_queuelock_adaptive(mp) == 0 ||
 438                     set_lock_byte(&mp->mutex_lockw) == 0) {
 439                         mp->mutex_owner = (uintptr_t)self;
 440                         return;
 441                 }
 442         }
 443         /*
 444          * Give up and block in the kernel for the mutex.
 445          */
 446         INCR32(self->ul_spin_lock_sleep);
 447         (void) ___lwp_mutex_timedlock(mp, NULL, self);
 448 }
 449
 450 void
 451 spin_lock_clear(mutex_t *mp)
 452 {
 453         ulwp_t *self = curthread;
 454
 455         mp->mutex_owner = 0;
 456         if (atomic_swap_32(&mp->mutex_lockword, 0) & WAITERMASK) {
 457                 (void) ___lwp_mutex_wakeup(mp, 0);
 458                 INCR32(self->ul_spin_lock_wakeup);
 459         }
 460         preempt(self);
 461 }
 462
 463 /*
 464  * Allocate the sleep queue hash table.
 465  */
 466 void
 467 queue_alloc(void)
 468 {
 469         ulwp_t *self = curthread;
 470         uberdata_t *udp = self->ul_uberdata;
 471         queue_head_t *qp;
 472         void *data;
 473         int i;
 474
 475         /*
 476          * No locks are needed; we call here only when single-threaded.
 477          */
 478         ASSERT(self == udp->ulwp_one);
 479         ASSERT(!udp->uberflags.uf_mt);
 480         if ((data = mmap(NULL, 2 * QHASHSIZE * sizeof (queue_head_t),
 481             PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, (off_t)0))
 482             == MAP_FAILED)
 483                 thr_panic("cannot allocate thread queue_head table");
 484         udp->queue_head = qp = (queue_head_t *)data;
 485         for (i = 0; i < 2 * QHASHSIZE; qp++, i++) {
 486                 qp->qh_type = (i < QHASHSIZE)? MX : CV;
 487                 qp->qh_lock.mutex_flag = LOCK_INITED;
 488                 qp->qh_lock.mutex_magic = MUTEX_MAGIC;
 489                 qp->qh_hlist = &qp->qh_def_root;
 490 #if defined(THREAD_DEBUG)
 491                 qp->qh_hlen = 1;
 492                 qp->qh_hmax = 1;
 493 #endif
 494         }
 495 }
 496
#if defined(THREAD_DEBUG)

/*
 * Debugging: verify correctness of a sleep queue.
 *
 * 'qp' must be an element of the queue_head[] table and its qh_lock must
 * be owned by the calling thread.  Every check is an ASSERT(); the
 * function returns nothing.
 *
 * The inexpensive checks (always performed) walk only the list of queue
 * roots.  The per-thread checks are performed only when
 * thread_queue_verify is non-zero.
 */
void
QVERIFY(queue_head_t *qp)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	queue_root_t *qrp;
	ulwp_t *ulwp;
	ulwp_t *prev;
	uint_t index;
	uint32_t cnt;
	char qtype;
	void *wchan;

	ASSERT(qp >= udp->queue_head && (qp - udp->queue_head) < 2 * QHASHSIZE);
	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
	/* count the queue roots; head and tail must be both set or both NULL */
	for (cnt = 0, qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
		cnt++;
		ASSERT((qrp->qr_head != NULL && qrp->qr_tail != NULL) ||
		    (qrp->qr_head == NULL && qrp->qr_tail == NULL));
	}
	ASSERT(qp->qh_hlen == cnt && qp->qh_hmax >= cnt);
	/* the first QHASHSIZE heads are MX queues, the rest are CV queues */
	qtype = ((qp - udp->queue_head) < QHASHSIZE)? MX : CV;
	ASSERT(qp->qh_type == qtype);
	if (!thread_queue_verify)
		return;
	/* real expensive stuff, only for _THREAD_QUEUE_VERIFY */
	for (cnt = 0, qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
		for (prev = NULL, ulwp = qrp->qr_head; ulwp != NULL;
		    prev = ulwp, ulwp = ulwp->ul_link) {
			cnt++;
			/* a writer may only be preceded by another writer */
			if (ulwp->ul_writer)
				ASSERT(prev == NULL || prev->ul_writer);
			ASSERT(ulwp->ul_qtype == qtype);
			ASSERT(ulwp->ul_wchan != NULL);
			ASSERT(ulwp->ul_sleepq == qp);
			wchan = ulwp->ul_wchan;
			ASSERT(qrp->qr_wchan == wchan);
			/* the thread's wchan must hash to this queue head */
			index = QUEUE_HASH(wchan, qtype);
			ASSERT(&udp->queue_head[index] == qp);
		}
		/* prev is now the last thread visited, or NULL if none */
		ASSERT(qrp->qr_tail == prev);
	}
	/* total threads over all roots must match the head's queue length */
	ASSERT(qp->qh_qlen == cnt);
}

#else	/* THREAD_DEBUG */

/* Without THREAD_DEBUG, queue verification compiles away to nothing. */
#define	QVERIFY(qp)

#endif	/* THREAD_DEBUG */
 552
 553 /*
 554  * Acquire a queue head.
 555  */
 556 queue_head_t *
 557 queue_lock(void *wchan, int qtype)
 558 {
 559         uberdata_t *udp = curthread->ul_uberdata;
 560         queue_head_t *qp;
 561         queue_root_t *qrp;
 562
 563         ASSERT(qtype == MX || qtype == CV);
 564
 565         /*
 566          * It is possible that we could be called while still single-threaded.
 567          * If so, we call queue_alloc() to allocate the queue_head[] array.
 568          */
 569         if ((qp = udp->queue_head) == NULL) {
 570                 queue_alloc();
 571                 qp = udp->queue_head;
 572         }
 573         qp += QUEUE_HASH(wchan, qtype);
 574         spin_lock_set(&qp->qh_lock);
 575         for (qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next)
 576                 if (qrp->qr_wchan == wchan)
 577                         break;
 578         if (qrp == NULL && qp->qh_def_root.qr_head == NULL) {
 579                 /* the default queue root is available; use it */
 580                 qrp = &qp->qh_def_root;
 581                 qrp->qr_wchan = wchan;
 582                 ASSERT(qrp->qr_next == NULL);
 583                 ASSERT(qrp->qr_tail == NULL &&
 584                     qrp->qr_rtcount == 0 && qrp->qr_qlen == 0);
 585         }
 586         qp->qh_wchan = wchan;   /* valid until queue_unlock() is called */
 587         qp->qh_root = qrp;      /* valid until queue_unlock() is called */
 588         INCR32(qp->qh_lockcount);
 589         QVERIFY(qp);
 590         return (qp);
 591 }
 592
 593 /*
 594  * Release a queue head.
 595  */
 596 void
 597 queue_unlock(queue_head_t *qp)
 598 {
 599         QVERIFY(qp);
 600         spin_lock_clear(&qp->qh_lock);
 601 }
 602
/*
 * For rwlock queueing, we must queue writers ahead of readers of the
 * same priority.  We do this by making writers appear to have a half
 * point higher priority for purposes of priority comparisons below.
 * The priority is doubled and ul_writer is added as the low-order term.
 */
#define	CMP_PRIO(ulwp)	((real_priority(ulwp) << 1) + (ulwp)->ul_writer)

/*
 * Insert 'ulwp' onto the sleep queue selected by 'qp'.
 *
 * qp		queue head whose qh_lock is owned by the calling thread;
 *		qh_root and qh_wchan are used as found there (queue_lock()
 *		sets them).
 * ulwp		the thread to queue; it must not already be on 'qp'.
 * force_fifo	non-zero forces FIFO placement among equal priorities.
 *
 * If qh_root is NULL, the thread's own ul_queue_root is initialized and
 * pushed onto the head of the hash list to serve as the queue root.
 */
void
enqueue(queue_head_t *qp, ulwp_t *ulwp, int force_fifo)
{
	queue_root_t *qrp;
	ulwp_t **ulwpp;
	ulwp_t *next;
	int pri = CMP_PRIO(ulwp);

	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
	ASSERT(ulwp->ul_sleepq != qp);

	if ((qrp = qp->qh_root) == NULL) {
		/* use the thread's queue root for the linkage */
		qrp = &ulwp->ul_queue_root;
		qrp->qr_next = qp->qh_hlist;
		qrp->qr_prev = NULL;
		qrp->qr_head = NULL;
		qrp->qr_tail = NULL;
		qrp->qr_wchan = qp->qh_wchan;
		qrp->qr_rtcount = 0;
		qrp->qr_qlen = 0;
		qrp->qr_qmax = 0;
		qp->qh_hlist->qr_prev = qrp;
		qp->qh_hlist = qrp;
		qp->qh_root = qrp;
		MAXINCR(qp->qh_hmax, qp->qh_hlen);
	}

	/*
	 * LIFO queue ordering is unfair and can lead to starvation,
	 * but it gives better performance for heavily contended locks.
	 * We use thread_queue_fifo (range is 0..8) to determine
	 * the frequency of FIFO vs LIFO queuing:
	 *	0 : every 256th time	(almost always LIFO)
	 *	1 : every 128th time
	 *	2 : every 64th  time
	 *	3 : every 32nd  time
	 *	4 : every 16th  time	(the default value, mostly LIFO)
	 *	5 : every 8th   time
	 *	6 : every 4th   time
	 *	7 : every 2nd   time
	 *	8 : every time		(never LIFO, always FIFO)
	 * Note that there is always some degree of FIFO ordering.
	 * This breaks live lock conditions that occur in applications
	 * that are written assuming (incorrectly) that threads acquire
	 * locks fairly, that is, in roughly round-robin order.
	 * In any event, the queue is maintained in kernel priority order.
	 *
	 * If force_fifo is non-zero, fifo queueing is forced.
	 * SUSV3 requires this for semaphores.
	 *
	 * The value is read below from curthread->ul_queue_fifo.
	 */
	if (qrp->qr_head == NULL) {
		/*
		 * The queue is empty.  LIFO/FIFO doesn't matter.
		 */
		ASSERT(qrp->qr_tail == NULL);
		ulwpp = &qrp->qr_head;
	/*
	 * The bitwise '|' below is deliberate: qh_qcnt is incremented on
	 * every enqueue onto a non-empty queue, even when force_fifo is set.
	 */
	} else if (force_fifo |
	    (((++qp->qh_qcnt << curthread->ul_queue_fifo) & 0xff) == 0)) {
		/*
		 * Enqueue after the last thread whose priority is greater
		 * than or equal to the priority of the thread being queued.
		 * Attempt first to go directly onto the tail of the queue.
		 */
		if (pri <= CMP_PRIO(qrp->qr_tail))
			ulwpp = &qrp->qr_tail->ul_link;
		else {
			for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
			    ulwpp = &next->ul_link)
				if (pri > CMP_PRIO(next))
					break;
		}
	} else {
		/*
		 * Enqueue before the first thread whose priority is less
		 * than or equal to the priority of the thread being queued.
		 * Hopefully we can go directly onto the head of the queue.
		 */
		for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
		    ulwpp = &next->ul_link)
			if (pri >= CMP_PRIO(next))
				break;
	}
	/* splice in at *ulwpp; a NULL successor means we are the new tail */
	if ((ulwp->ul_link = *ulwpp) == NULL)
		qrp->qr_tail = ulwp;
	*ulwpp = ulwp;

	ulwp->ul_sleepq = qp;
	ulwp->ul_wchan = qp->qh_wchan;
	ulwp->ul_qtype = qp->qh_type;
	/*
	 * Count the thread as real-time queued if its schedctl class id
	 * equals its ul_rtclassid or if ul_pilocks is non-zero.
	 */
	if ((ulwp->ul_schedctl != NULL &&
	    ulwp->ul_schedctl->sc_cid == ulwp->ul_rtclassid) |
	    ulwp->ul_pilocks) {
		ulwp->ul_rtqueued = 1;
		qrp->qr_rtcount++;
	}
	/* queue-length statistics; these expand to nothing without THREAD_DEBUG */
	MAXINCR(qrp->qr_qmax, qrp->qr_qlen);
	MAXINCR(qp->qh_qmax, qp->qh_qlen);
}
 709
 710 /*
 711  * Helper function for queue_slot() and queue_slot_rt().
 712  * Try to find a non-suspended thread on the queue.
 713  */
 714 static ulwp_t **
 715 queue_slot_runnable(ulwp_t **ulwpp, ulwp_t **prevp, int rt)
 716 {
 717         ulwp_t *ulwp;
 718         ulwp_t **foundpp = NULL;
 719         int priority = -1;
 720         ulwp_t *prev;
 721         int tpri;
 722
 723         for (prev = NULL;
 724             (ulwp = *ulwpp) != NULL;
 725             prev = ulwp, ulwpp = &ulwp->ul_link) {
 726                 if (ulwp->ul_stop)      /* skip suspended threads */
 727                         continue;
 728                 tpri = rt? CMP_PRIO(ulwp) : 0;
 729                 if (tpri > priority) {
 730                         foundpp = ulwpp;
 731                         *prevp = prev;
 732                         priority = tpri;
 733                         if (!rt)
 734                                 break;
 735                 }
 736         }
 737         return (foundpp);
 738 }
 739
 740 /*
 741  * For real-time, we search the entire queue because the dispatch
 742  * (kernel) priorities may have changed since enqueueing.
 743  */
 744 static ulwp_t **
 745 queue_slot_rt(ulwp_t **ulwpp_org, ulwp_t **prevp)
 746 {
 747         ulwp_t **ulwpp = ulwpp_org;
 748         ulwp_t *ulwp = *ulwpp;
 749         ulwp_t **foundpp = ulwpp;
 750         int priority = CMP_PRIO(ulwp);
 751         ulwp_t *prev;
 752         int tpri;
 753
 754         for (prev = ulwp, ulwpp = &ulwp->ul_link;
 755             (ulwp = *ulwpp) != NULL;
 756             prev = ulwp, ulwpp = &ulwp->ul_link) {
 757                 tpri = CMP_PRIO(ulwp);
 758                 if (tpri > priority) {
 759                         foundpp = ulwpp;
 760                         *prevp = prev;
 761                         priority = tpri;
 762                 }
 763         }
 764         ulwp = *foundpp;
 765
 766         /*
 767          * Try not to return a suspended thread.
 768          * This mimics the old libthread's behavior.
 769          */
 770         if (ulwp->ul_stop &&
 771             (ulwpp = queue_slot_runnable(ulwpp_org, prevp, 1)) != NULL) {
 772                 foundpp = ulwpp;
 773                 ulwp = *foundpp;
 774         }
 775         ulwp->ul_rt = 1;
 776         return (foundpp);
 777 }
 778
 779 ulwp_t **
 780 queue_slot(queue_head_t *qp, ulwp_t **prevp, int *more)
 781 {
 782         queue_root_t *qrp;
 783         ulwp_t **ulwpp;
 784         ulwp_t *ulwp;
 785         int rt;
 786
 787         ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
 788
 789         if ((qrp = qp->qh_root) == NULL || (ulwp = qrp->qr_head) == NULL) {
 790                 *more = 0;
 791                 return (NULL);          /* no lwps on the queue */
 792         }
 793         rt = (qrp->qr_rtcount != 0);
 794         *prevp = NULL;
 795         if (ulwp->ul_link == NULL) {    /* only one lwp on the queue */
 796                 *more = 0;
 797                 ulwp->ul_rt = rt;
 798                 return (&qrp->qr_head);
 799         }
 800         *more = 1;
 801
 802         if (rt)         /* real-time queue */
 803                 return (queue_slot_rt(&qrp->qr_head, prevp));
 804         /*
 805          * Try not to return a suspended thread.
 806          * This mimics the old libthread's behavior.
 807          */
 808         if (ulwp->ul_stop &&
 809             (ulwpp = queue_slot_runnable(&qrp->qr_head, prevp, 0)) != NULL) {
 810                 ulwp = *ulwpp;
 811                 ulwp->ul_rt = 0;
 812                 return (ulwpp);
 813         }
 814         /*
 815          * The common case; just pick the first thread on the queue.
 816          */
 817         ulwp->ul_rt = 0;
 818         return (&qrp->qr_head);
 819 }
 820
/*
 * Common code for unlinking an lwp from a user-level sleep queue.
 *
 * qp		queue head; its qh_lock must be owned by the calling thread
 *		and qh_root/qh_wchan must identify the thread's queue.
 * ulwpp	address of the link that points at the thread to remove.
 * prev		the thread preceding it on the queue, or NULL if it is first.
 *
 * The thread's ul_link is cleared and its ul_rtqueued state is undone.
 * ul_sleepq and ul_wchan are not touched here; dequeue() clears them
 * after calling this function.
 */
void
queue_unlink(queue_head_t *qp, ulwp_t **ulwpp, ulwp_t *prev)
{
	queue_root_t *qrp = qp->qh_root;
	queue_root_t *nqrp;
	ulwp_t *ulwp = *ulwpp;
	ulwp_t *next;

	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
	ASSERT(qp->qh_wchan != NULL && ulwp->ul_wchan == qp->qh_wchan);

	/* queue-length statistics; these expand to nothing without THREAD_DEBUG */
	DECR(qp->qh_qlen);
	DECR(qrp->qr_qlen);
	if (ulwp->ul_rtqueued) {
		ulwp->ul_rtqueued = 0;
		qrp->qr_rtcount--;
	}
	/* unlink the thread; if it was the tail, its predecessor becomes tail */
	next = ulwp->ul_link;
	*ulwpp = next;
	ulwp->ul_link = NULL;
	if (qrp->qr_tail == ulwp)
		qrp->qr_tail = prev;
	if (qrp == &ulwp->ul_queue_root) {
		/*
		 * We can't continue to use the unlinked thread's
		 * queue root for the linkage.
		 */
		queue_root_t *qr_next = qrp->qr_next;
		queue_root_t *qr_prev = qrp->qr_prev;

		if (qrp->qr_tail) {
			/* switch to using the last thread's queue root */
			ASSERT(qrp->qr_qlen != 0);
			nqrp = &qrp->qr_tail->ul_queue_root;
			/* copy the whole root, then repoint the neighbors at it */
			*nqrp = *qrp;
			if (qr_next)
				qr_next->qr_prev = nqrp;
			if (qr_prev)
				qr_prev->qr_next = nqrp;
			else
				qp->qh_hlist = nqrp;
			qp->qh_root = nqrp;
		} else {
			/* empty queue root; just delete from the hash list */
			ASSERT(qrp->qr_qlen == 0);
			if (qr_next)
				qr_next->qr_prev = qr_prev;
			if (qr_prev)
				qr_prev->qr_next = qr_next;
			else
				qp->qh_hlist = qr_next;
			qp->qh_root = NULL;
			DECR(qp->qh_hlen);
		}
	}
}
 880
 881 ulwp_t *
 882 dequeue(queue_head_t *qp, int *more)
 883 {
 884         ulwp_t **ulwpp;
 885         ulwp_t *ulwp;
 886         ulwp_t *prev;
 887
 888         if ((ulwpp = queue_slot(qp, &prev, more)) == NULL)
 889                 return (NULL);
 890         ulwp = *ulwpp;
 891         queue_unlink(qp, ulwpp, prev);
 892         ulwp->ul_sleepq = NULL;
 893         ulwp->ul_wchan = NULL;
 894         return (ulwp);
 895 }
 896
 897 /*
 898  * Return a pointer to the highest priority thread sleeping on wchan.
 899  */
 900 ulwp_t *
 901 queue_waiter(queue_head_t *qp)
 902 {
 903         ulwp_t **ulwpp;
 904         ulwp_t *prev;
 905         int more;
 906
 907         if ((ulwpp = queue_slot(qp, &prev, &more)) == NULL)
 908                 return (NULL);
 909         return (*ulwpp);
 910 }
 911
 912 int
 913 dequeue_self(queue_head_t *qp)
 914 {
 915         ulwp_t *self = curthread;
 916         queue_root_t *qrp;
 917         ulwp_t **ulwpp;
 918         ulwp_t *ulwp;
 919         ulwp_t *prev;
 920         int found = 0;
 921
 922         ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
 923
 924         /* find self on the sleep queue */
 925         if ((qrp = qp->qh_root) != NULL) {
 926                 for (prev = NULL, ulwpp = &qrp->qr_head;
 927                     (ulwp = *ulwpp) != NULL;
 928                     prev = ulwp, ulwpp = &ulwp->ul_link) {
 929                         if (ulwp == self) {
 930                                 queue_unlink(qp, ulwpp, prev);
 931                                 self->ul_cvmutex = NULL;
 932                                 self->ul_sleepq = NULL;
 933                                 self->ul_wchan = NULL;
 934                                 found = 1;
 935                                 break;
 936                         }
 937                 }
 938         }
 939
 940         if (!found)
 941                 thr_panic("dequeue_self(): curthread not found on queue");
 942
 943         return ((qrp = qp->qh_root) != NULL && qrp->qr_head != NULL);
 944 }
 945
 946 /*
 947  * Called from call_user_handler() and _thrp_suspend() to take
 948  * ourself off of our sleep queue so we can grab locks.
 949  */
 950 void
 951 unsleep_self(void)
 952 {
 953         ulwp_t *self = curthread;
 954         queue_head_t *qp;
 955
 956         /*
 957          * Calling enter_critical()/exit_critical() here would lead
 958          * to recursion.  Just manipulate self->ul_critical directly.
 959          */
 960         self->ul_critical++;
 961         while (self->ul_sleepq != NULL) {
 962                 qp = queue_lock(self->ul_wchan, self->ul_qtype);
 963                 /*
 964                  * We may have been moved from a CV queue to a
 965                  * mutex queue while we were attempting queue_lock().
 966                  * If so, just loop around and try again.
 967                  * dequeue_self() clears self->ul_sleepq.
 968                  */
 969                 if (qp == self->ul_sleepq)
 970                         (void) dequeue_self(qp);
 971                 queue_unlock(qp);
 972         }
 973         self->ul_writer = 0;
 974         self->ul_critical--;
 975 }
 976
 977 /*
 978  * Common code for calling the the ___lwp_mutex_timedlock() system call.
 979  * Returns with mutex_owner and mutex_ownerpid set correctly.
 980  */
 981 static int
 982 mutex_lock_kernel(mutex_t *mp, timespec_t *tsp, tdb_mutex_stats_t *msp)
 983 {
 984         ulwp_t *self = curthread;
 985         uberdata_t *udp = self->ul_uberdata;
 986         int mtype = mp->mutex_type;
 987         hrtime_t begin_sleep;
 988         int acquired;
 989         int error;
 990
 991         self->ul_sp = stkptr();
 992         self->ul_wchan = mp;
 993         if (__td_event_report(self, TD_SLEEP, udp)) {
 994                 self->ul_td_evbuf.eventnum = TD_SLEEP;
 995                 self->ul_td_evbuf.eventdata = mp;
 996                 tdb_event(TD_SLEEP, udp);
 997         }
 998         if (msp) {
 999                 tdb_incr(msp->mutex_sleep);
1000                 begin_sleep = gethrtime();
1001         }
1002
1003         DTRACE_PROBE1(plockstat, mutex__block, mp);
1004
1005         for (;;) {
1006                 /*
1007                  * A return value of EOWNERDEAD or ELOCKUNMAPPED
1008                  * means we successfully acquired the lock.
1009                  */
1010                 if ((error = ___lwp_mutex_timedlock(mp, tsp, self)) != 0 &&
1011                     error != EOWNERDEAD && error != ELOCKUNMAPPED) {
1012                         acquired = 0;
1013                         break;
1014                 }
1015
1016                 if (mtype & USYNC_PROCESS) {
1017                         /*
1018                          * Defend against forkall().  We may be the child,
1019                          * in which case we don't actually own the mutex.
1020                          */
1021                         enter_critical(self);
1022                         if (mp->mutex_ownerpid == udp->pid) {
1023                                 exit_critical(self);
1024                                 acquired = 1;
1025                                 break;
1026                         }
1027                         exit_critical(self);
1028                 } else {
1029                         acquired = 1;
1030                         break;
1031                 }
1032         }
1033
1034         if (msp)
1035                 msp->mutex_sleep_time += gethrtime() - begin_sleep;
1036         self->ul_wchan = NULL;
1037         self->ul_sp = 0;
1038
1039         if (acquired) {
1040                 ASSERT(mp->mutex_owner == (uintptr_t)self);
1041                 DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
1042                 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1043         } else {
1044                 DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
1045                 DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1046         }
1047
1048         return (error);
1049 }
1050
1051 /*
1052  * Common code for calling the ___lwp_mutex_trylock() system call.
1053  * Returns with mutex_owner and mutex_ownerpid set correctly.
1054  */
1055 int
1056 mutex_trylock_kernel(mutex_t *mp)
1057 {
1058         ulwp_t *self = curthread;
1059         uberdata_t *udp = self->ul_uberdata;
1060         int mtype = mp->mutex_type;
1061         int error;
1062         int acquired;
1063
1064         for (;;) {
1065                 /*
1066                  * A return value of EOWNERDEAD or ELOCKUNMAPPED
1067                  * means we successfully acquired the lock.
1068                  */
1069                 if ((error = ___lwp_mutex_trylock(mp, self)) != 0 &&
1070                     error != EOWNERDEAD && error != ELOCKUNMAPPED) {
1071                         acquired = 0;
1072                         break;
1073                 }
1074
1075                 if (mtype & USYNC_PROCESS) {
1076                         /*
1077                          * Defend against forkall().  We may be the child,
1078                          * in which case we don't actually own the mutex.
1079                          */
1080                         enter_critical(self);
1081                         if (mp->mutex_ownerpid == udp->pid) {
1082                                 exit_critical(self);
1083                                 acquired = 1;
1084                                 break;
1085                         }
1086                         exit_critical(self);
1087                 } else {
1088                         acquired = 1;
1089                         break;
1090                 }
1091         }
1092
1093         if (acquired) {
1094                 ASSERT(mp->mutex_owner == (uintptr_t)self);
1095                 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1096         } else if (error != EBUSY) {
1097                 DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1098         }
1099
1100         return (error);
1101 }
1102
1103 volatile sc_shared_t *
1104 setup_schedctl(void)
1105 {
1106         ulwp_t *self = curthread;
1107         volatile sc_shared_t *scp;
1108         sc_shared_t *tmp;
1109
1110         if ((scp = self->ul_schedctl) == NULL && /* no shared state yet */
1111             !self->ul_vfork &&                  /* not a child of vfork() */
1112             !self->ul_schedctl_called) {        /* haven't been called before */
1113                 enter_critical(self);
1114                 self->ul_schedctl_called = &self->ul_uberdata->uberflags;
1115                 if ((tmp = __schedctl()) != (sc_shared_t *)(-1))
1116                         self->ul_schedctl = scp = tmp;
1117                 exit_critical(self);
1118         }
1119         /*
1120          * Unless the call to setup_schedctl() is surrounded
1121          * by enter_critical()/exit_critical(), the address
1122          * we are returning could be invalid due to a forkall()
1123          * having occurred in another thread.
1124          */
1125         return (scp);
1126 }
1127
1128 /*
1129  * Interfaces from libsched, incorporated into libc.
1130  * libsched.so.1 is now a filter library onto libc.
1131  */
1132 #pragma weak schedctl_lookup = schedctl_init
1133 schedctl_t *
1134 schedctl_init(void)
1135 {
1136         volatile sc_shared_t *scp = setup_schedctl();
1137         return ((scp == NULL)? NULL : (schedctl_t *)&scp->sc_preemptctl);
1138 }
1139
/*
 * Part of the libsched interface set; there is nothing to do here.
 */
void
schedctl_exit(void)
{
	/* intentionally empty */
}
1144
1145 /*
1146  * Contract private interface for java.
1147  * Set up the schedctl data if it doesn't exist yet.
1148  * Return a pointer to the pointer to the schedctl data.
1149  */
1150 volatile sc_shared_t *volatile *
1151 _thr_schedctl(void)
1152 {
1153         ulwp_t *self = curthread;
1154         volatile sc_shared_t *volatile *ptr;
1155
1156         if (self->ul_vfork)
1157                 return (NULL);
1158         if (*(ptr = &self->ul_schedctl) == NULL)
1159                 (void) setup_schedctl();
1160         return (ptr);
1161 }
1162
1163 /*
1164  * Block signals and attempt to block preemption.
1165  * no_preempt()/preempt() must be used in pairs but can be nested.
1166  */
1167 void
1168 no_preempt(ulwp_t *self)
1169 {
1170         volatile sc_shared_t *scp;
1171
1172         if (self->ul_preempt++ == 0) {
1173                 enter_critical(self);
1174                 if ((scp = self->ul_schedctl) != NULL ||
1175                     (scp = setup_schedctl()) != NULL) {
1176                         /*
1177                          * Save the pre-existing preempt value.
1178                          */
1179                         self->ul_savpreempt = scp->sc_preemptctl.sc_nopreempt;
1180                         scp->sc_preemptctl.sc_nopreempt = 1;
1181                 }
1182         }
1183 }
1184
1185 /*
1186  * Undo the effects of no_preempt().
1187  */
1188 void
1189 preempt(ulwp_t *self)
1190 {
1191         volatile sc_shared_t *scp;
1192
1193         ASSERT(self->ul_preempt > 0);
1194         if (--self->ul_preempt == 0) {
1195                 if ((scp = self->ul_schedctl) != NULL) {
1196                         /*
1197                          * Restore the pre-existing preempt value.
1198                          */
1199                         scp->sc_preemptctl.sc_nopreempt = self->ul_savpreempt;
1200                         if (scp->sc_preemptctl.sc_yield &&
1201                             scp->sc_preemptctl.sc_nopreempt == 0) {
1202                                 yield();
1203                                 if (scp->sc_preemptctl.sc_yield) {
1204                                         /*
1205                                          * Shouldn't happen.  This is either
1206                                          * a race condition or the thread
1207                                          * just entered the real-time class.
1208                                          */
1209                                         yield();
1210                                         scp->sc_preemptctl.sc_yield = 0;
1211                                 }
1212                         }
1213                 }
1214                 exit_critical(self);
1215         }
1216 }
1217
1218 /*
1219  * If a call to preempt() would cause the current thread to yield or to
1220  * take deferred actions in exit_critical(), then unpark the specified
1221  * lwp so it can run while we delay.  Return the original lwpid if the
1222  * unpark was not performed, else return zero.  The tests are a repeat
1223  * of some of the tests in preempt(), above.  This is a statistical
1224  * optimization solely for cond_sleep_queue(), below.
1225  */
1226 static lwpid_t
1227 preempt_unpark(ulwp_t *self, lwpid_t lwpid)
1228 {
1229         volatile sc_shared_t *scp = self->ul_schedctl;
1230
1231         ASSERT(self->ul_preempt == 1 && self->ul_critical > 0);
1232         if ((scp != NULL && scp->sc_preemptctl.sc_yield) ||
1233             (self->ul_curplease && self->ul_critical == 1)) {
1234                 (void) __lwp_unpark(lwpid);
1235                 lwpid = 0;
1236         }
1237         return (lwpid);
1238 }
1239
1240 /*
1241  * Spin for a while (if 'tryhard' is true), trying to grab the lock.
1242  * If this fails, return EBUSY and let the caller deal with it.
1243  * If this succeeds, return 0 with mutex_owner set to curthread.
1244  */
1245 static int
1246 mutex_trylock_adaptive(mutex_t *mp, int tryhard)
1247 {
1248         ulwp_t *self = curthread;
1249         int error = EBUSY;
1250         ulwp_t *ulwp;
1251         volatile sc_shared_t *scp;
1252         volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
1253         volatile uint64_t *ownerp = (volatile uint64_t *)&mp->mutex_owner;
1254         uint32_t new_lockword;
1255         int count = 0;
1256         int max_count;
1257         uint8_t max_spinners;
1258
1259         ASSERT(!(mp->mutex_type & USYNC_PROCESS));
1260
1261         if (MUTEX_OWNED(mp, self))
1262                 return (EBUSY);
1263
1264         enter_critical(self);
1265
1266         /* short-cut, not definitive (see below) */
1267         if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
1268                 ASSERT(mp->mutex_type & LOCK_ROBUST);
1269                 error = ENOTRECOVERABLE;
1270                 goto done;
1271         }
1272
1273         /*
1274          * Make one attempt to acquire the lock before
1275          * incurring the overhead of the spin loop.
1276          */
1277         if (set_lock_byte(lockp) == 0) {
1278                 *ownerp = (uintptr_t)self;
1279                 error = 0;
1280                 goto done;
1281         }
1282         if (!tryhard)
1283                 goto done;
1284         if (ncpus == 0)
1285                 ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
1286         if ((max_spinners = self->ul_max_spinners) >= ncpus)
1287                 max_spinners = ncpus - 1;
1288         max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
1289         if (max_count == 0)
1290                 goto done;
1291
1292         /*
1293          * This spin loop is unfair to lwps that have already dropped into
1294          * the kernel to sleep.  They will starve on a highly-contended mutex.
1295          * This is just too bad.  The adaptive spin algorithm is intended
1296          * to allow programs with highly-contended locks (that is, broken
1297          * programs) to execute with reasonable speed despite their contention.
1298          * Being fair would reduce the speed of such programs and well-written
1299          * programs will not suffer in any case.
1300          */
1301         if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1)
1302                 goto done;
1303         DTRACE_PROBE1(plockstat, mutex__spin, mp);
1304         for (count = 1; ; count++) {
1305                 if (*lockp == 0 && set_lock_byte(lockp) == 0) {
1306                         *ownerp = (uintptr_t)self;
1307                         error = 0;
1308                         break;
1309                 }
1310                 if (count == max_count)
1311                         break;
1312                 SMT_PAUSE();
1313                 /*
1314                  * Stop spinning if the mutex owner is not running on
1315                  * a processor; it will not drop the lock any time soon
1316                  * and we would just be wasting time to keep spinning.
1317                  *
1318                  * Note that we are looking at another thread (ulwp_t)
1319                  * without ensuring that the other thread does not exit.
1320                  * The scheme relies on ulwp_t structures never being
1321                  * deallocated by the library (the library employs a free
1322                  * list of ulwp_t structs that are reused when new threads
1323                  * are created) and on schedctl shared memory never being
1324                  * deallocated once created via __schedctl().
1325                  *
1326                  * Thus, the worst that can happen when the spinning thread
1327                  * looks at the owner's schedctl data is that it is looking
1328                  * at some other thread's schedctl data.  This almost never
1329                  * happens and is benign when it does.
1330                  */
1331                 if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
1332                     ((scp = ulwp->ul_schedctl) == NULL ||
1333                     scp->sc_state != SC_ONPROC))
1334                         break;
1335         }
1336         new_lockword = spinners_decr(&mp->mutex_lockword);
1337         if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
1338                 /*
1339                  * We haven't yet acquired the lock, the lock
1340                  * is free, and there are no other spinners.
1341                  * Make one final attempt to acquire the lock.
1342                  *
1343                  * This isn't strictly necessary since mutex_lock_queue()
1344                  * (the next action this thread will take if it doesn't
1345                  * acquire the lock here) makes one attempt to acquire
1346                  * the lock before putting the thread to sleep.
1347                  *
1348                  * If the next action for this thread (on failure here)
1349                  * were not to call mutex_lock_queue(), this would be
1350                  * necessary for correctness, to avoid ending up with an
1351                  * unheld mutex with waiters but no one to wake them up.
1352                  */
1353                 if (set_lock_byte(lockp) == 0) {
1354                         *ownerp = (uintptr_t)self;
1355                         error = 0;
1356                 }
1357                 count++;
1358         }
1359
1360 done:
1361         if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1362                 ASSERT(mp->mutex_type & LOCK_ROBUST);
1363                 /*
1364                  * We shouldn't own the mutex.
1365                  * Just clear the lock; everyone has already been waked up.
1366                  */
1367                 *ownerp = 0;
1368                 (void) clear_lockbyte(&mp->mutex_lockword);
1369                 error = ENOTRECOVERABLE;
1370         }
1371
1372         exit_critical(self);
1373
1374         if (error) {
1375                 if (count) {
1376                         DTRACE_PROBE3(plockstat, mutex__spun, mp, 0, count);
1377                 }
1378                 if (error != EBUSY) {
1379                         DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1380                 }
1381         } else {
1382                 if (count) {
1383                         DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
1384                 }
1385                 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
1386                 if (mp->mutex_flag & LOCK_OWNERDEAD) {
1387                         ASSERT(mp->mutex_type & LOCK_ROBUST);
1388                         error = EOWNERDEAD;
1389                 }
1390         }
1391
1392         return (error);
1393 }
1394
1395 /*
1396  * Same as mutex_trylock_adaptive(), except specifically for queue locks.
1397  * The owner field is not set here; the caller (spin_lock_set()) sets it.
1398  */
1399 static int
1400 mutex_queuelock_adaptive(mutex_t *mp)
1401 {
1402         ulwp_t *ulwp;
1403         volatile sc_shared_t *scp;
1404         volatile uint8_t *lockp;
1405         volatile uint64_t *ownerp;
1406         int count = curthread->ul_queue_spin;
1407
1408         ASSERT(mp->mutex_type == USYNC_THREAD);
1409
1410         if (count == 0)
1411                 return (EBUSY);
1412
1413         lockp = (volatile uint8_t *)&mp->mutex_lockw;
1414         ownerp = (volatile uint64_t *)&mp->mutex_owner;
1415         while (--count >= 0) {
1416                 if (*lockp == 0 && set_lock_byte(lockp) == 0)
1417                         return (0);
1418                 SMT_PAUSE();
1419                 if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
1420                     ((scp = ulwp->ul_schedctl) == NULL ||
1421                     scp->sc_state != SC_ONPROC))
1422                         break;
1423         }
1424
1425         return (EBUSY);
1426 }
1427
1428 /*
1429  * Like mutex_trylock_adaptive(), but for process-shared mutexes.
1430  * Spin for a while (if 'tryhard' is true), trying to grab the lock.
1431  * If this fails, return EBUSY and let the caller deal with it.
1432  * If this succeeds, return 0 with mutex_owner set to curthread
1433  * and mutex_ownerpid set to the current pid.
1434  */
1435 static int
1436 mutex_trylock_process(mutex_t *mp, int tryhard)
1437 {
1438         ulwp_t *self = curthread;
1439         uberdata_t *udp = self->ul_uberdata;
1440         int error = EBUSY;
1441         volatile uint64_t *lockp = (volatile uint64_t *)&mp->mutex_lockword64;
1442         uint32_t new_lockword;
1443         int count = 0;
1444         int max_count;
1445         uint8_t max_spinners;
1446
1447 #if defined(__sparc) && !defined(_LP64)
1448         /* horrible hack, necessary only on 32-bit sparc */
1449         int fix_alignment_problem =
1450             (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
1451             self->ul_misaligned && !(mp->mutex_type & LOCK_ROBUST));
1452 #endif
1453
1454         ASSERT(mp->mutex_type & USYNC_PROCESS);
1455
1456         if (shared_mutex_held(mp))
1457                 return (EBUSY);
1458
1459         enter_critical(self);
1460
1461         /* short-cut, not definitive (see below) */
1462         if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
1463                 ASSERT(mp->mutex_type & LOCK_ROBUST);
1464                 error = ENOTRECOVERABLE;
1465                 goto done;
1466         }
1467
1468         /*
1469          * Make one attempt to acquire the lock before
1470          * incurring the overhead of the spin loop.
1471          */
1472 #if defined(__sparc) && !defined(_LP64)
1473         /* horrible hack, necessary only on 32-bit sparc */
1474         if (fix_alignment_problem) {
1475                 if (set_lock_byte(&mp->mutex_lockw) == 0) {
1476                         mp->mutex_ownerpid = udp->pid;
1477                         mp->mutex_owner = (uintptr_t)self;
1478                         error = 0;
1479                         goto done;
1480                 }
1481         } else
1482 #endif
1483         if (set_lock_byte64(lockp, udp->pid) == 0) {
1484                 mp->mutex_owner = (uintptr_t)self;
1485                 /* mp->mutex_ownerpid was set by set_lock_byte64() */
1486                 error = 0;
1487                 goto done;
1488         }
1489         if (!tryhard)
1490                 goto done;
1491         if (ncpus == 0)
1492                 ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
1493         if ((max_spinners = self->ul_max_spinners) >= ncpus)
1494                 max_spinners = ncpus - 1;
1495         max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
1496         if (max_count == 0)
1497                 goto done;
1498
1499         /*
1500          * This is a process-shared mutex.
1501          * We cannot know if the owner is running on a processor.
1502          * We just spin and hope that it is on a processor.
1503          */
1504         if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1)
1505                 goto done;
1506         DTRACE_PROBE1(plockstat, mutex__spin, mp);
1507         for (count = 1; ; count++) {
1508 #if defined(__sparc) && !defined(_LP64)
1509                 /* horrible hack, necessary only on 32-bit sparc */
1510                 if (fix_alignment_problem) {
1511                         if ((*lockp & LOCKMASK64) == 0 &&
1512                             set_lock_byte(&mp->mutex_lockw) == 0) {
1513                                 mp->mutex_ownerpid = udp->pid;
1514                                 mp->mutex_owner = (uintptr_t)self;
1515                                 error = 0;
1516                                 break;
1517                         }
1518                 } else
1519 #endif
1520                 if ((*lockp & LOCKMASK64) == 0 &&
1521                     set_lock_byte64(lockp, udp->pid) == 0) {
1522                         mp->mutex_owner = (uintptr_t)self;
1523                         /* mp->mutex_ownerpid was set by set_lock_byte64() */
1524                         error = 0;
1525                         break;
1526                 }
1527                 if (count == max_count)
1528                         break;
1529                 SMT_PAUSE();
1530         }
1531         new_lockword = spinners_decr(&mp->mutex_lockword);
1532         if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
1533                 /*
1534                  * We haven't yet acquired the lock, the lock
1535                  * is free, and there are no other spinners.
1536                  * Make one final attempt to acquire the lock.
1537                  *
1538                  * This isn't strictly necessary since mutex_lock_kernel()
1539                  * (the next action this thread will take if it doesn't
1540                  * acquire the lock here) makes one attempt to acquire
1541                  * the lock before putting the thread to sleep.
1542                  *
1543                  * If the next action for this thread (on failure here)
1544                  * were not to call mutex_lock_kernel(), this would be
1545                  * necessary for correctness, to avoid ending up with an
1546                  * unheld mutex with waiters but no one to wake them up.
1547                  */
1548 #if defined(__sparc) && !defined(_LP64)
1549                 /* horrible hack, necessary only on 32-bit sparc */
1550                 if (fix_alignment_problem) {
1551                         if (set_lock_byte(&mp->mutex_lockw) == 0) {
1552                                 mp->mutex_ownerpid = udp->pid;
1553                                 mp->mutex_owner = (uintptr_t)self;
1554                                 error = 0;
1555                         }
1556                 } else
1557 #endif
1558                 if (set_lock_byte64(lockp, udp->pid) == 0) {
1559                         mp->mutex_owner = (uintptr_t)self;
1560                         /* mp->mutex_ownerpid was set by set_lock_byte64() */
1561                         error = 0;
1562                 }
1563                 count++;
1564         }
1565
1566 done:
1567         if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1568                 ASSERT(mp->mutex_type & LOCK_ROBUST);
1569                 /*
1570                  * We shouldn't own the mutex.
1571                  * Just clear the lock; everyone has already been waked up.
1572                  */
1573                 mp->mutex_owner = 0;
1574                 /* mp->mutex_ownerpid is cleared by clear_lockbyte64() */
1575                 (void) clear_lockbyte64(&mp->mutex_lockword64);
1576                 error = ENOTRECOVERABLE;
1577         }
1578
1579         exit_critical(self);
1580
1581         if (error) {
1582                 if (count) {
1583                         DTRACE_PROBE3(plockstat, mutex__spun, mp, 0, count);
1584                 }
1585                 if (error != EBUSY) {
1586                         DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1587                 }
1588         } else {
1589                 if (count) {
1590                         DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
1591                 }
1592                 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
1593                 if (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1594                         ASSERT(mp->mutex_type & LOCK_ROBUST);
1595                         if (mp->mutex_flag & LOCK_OWNERDEAD)
1596                                 error = EOWNERDEAD;
1597                         else if (mp->mutex_type & USYNC_PROCESS_ROBUST)
1598                                 error = ELOCKUNMAPPED;
1599                         else
1600                                 error = EOWNERDEAD;
1601                 }
1602         }
1603
1604         return (error);
1605 }
1606
1607 /*
1608  * Mutex wakeup code for releasing a USYNC_THREAD mutex.
1609  * Returns the lwpid of the thread that was dequeued, if any.
1610  * The caller of mutex_wakeup() must call __lwp_unpark(lwpid)
1611  * to wake up the specified lwp.
1612  */
1613 static lwpid_t
1614 mutex_wakeup(mutex_t *mp)
1615 {
1616         lwpid_t lwpid = 0;
1617         int more;
1618         queue_head_t *qp;
1619         ulwp_t *ulwp;
1620
1621         /*
1622          * Dequeue a waiter from the sleep queue.  Don't touch the mutex
1623          * waiters bit if no one was found on the queue because the mutex
1624          * might have been deallocated or reallocated for another purpose.
1625          */
1626         qp = queue_lock(mp, MX);
1627         if ((ulwp = dequeue(qp, &more)) != NULL) {
1628                 lwpid = ulwp->ul_lwpid;
1629                 mp->mutex_waiters = more;
1630         }
1631         queue_unlock(qp);
1632         return (lwpid);
1633 }
1634
1635 /*
1636  * Mutex wakeup code for releasing all waiters on a USYNC_THREAD mutex.
      *
      * Every waiter found on the MX sleep queue for mp is unlinked while
      * the queue lock is held; the collected lwpids are unparked only
      * after the queue lock has been dropped.  If at least one waiter was
      * found, mp->mutex_waiters is reset to 0 before the queue lock is
      * released; otherwise it is not written at all.
1637  */
1638 static void
1639 mutex_wakeup_all(mutex_t *mp)
1640 {
1641         queue_head_t *qp;
1642         queue_root_t *qrp;
1643         int nlwpid = 0;                 /* number of lwpids collected */
1644         int maxlwps = MAXLWPS;          /* current capacity of lwpid[] */
1645         ulwp_t *ulwp;
1646         lwpid_t buffer[MAXLWPS];        /* initial on-stack storage */
1647         lwpid_t *lwpid = buffer;        /* replaced on overflow */
1648
1649         /*
1650          * Walk the list of waiters and prepare to wake up all of them.
1651          * The waiters flag has already been cleared from the mutex.
1652          *
1653          * We keep track of lwpids that are to be unparked in lwpid[].
1654          * __lwp_unpark_all() is called to unpark all of them after
1655          * they have been removed from the sleep queue and the sleep
1656          * queue lock has been dropped.  If we run out of space in our
1657          * on-stack buffer, we need to allocate more but we can't call
1658          * lmalloc() because we are holding a queue lock when the overflow
1659          * occurs and lmalloc() acquires a lock.  We can't use alloca()
1660          * either because the application may have allocated a small
1661          * stack and we don't want to overrun the stack.  So we call
1662          * alloc_lwpids() to allocate a bigger buffer using the mmap()
1663          * system call directly since that path acquires no locks.
1664          */
1665         qp = queue_lock(mp, MX);
1666         for (;;) {
                     /* Stop once the queue has no root or no head entry. */
1667                 if ((qrp = qp->qh_root) == NULL ||
1668                     (ulwp = qrp->qr_head) == NULL)
1669                         break;
1670                 ASSERT(ulwp->ul_wchan == mp);
                     /* Unlink the current head and mark it as not sleeping. */
1671                 queue_unlink(qp, &qrp->qr_head, NULL);
1672                 ulwp->ul_sleepq = NULL;
1673                 ulwp->ul_wchan = NULL;
                     /* Grow the lwpid array before storing past its end. */
1674                 if (nlwpid == maxlwps)
1675                         lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
1676                 lwpid[nlwpid++] = ulwp->ul_lwpid;
1677         }
1678
1679         if (nlwpid == 0) {
                     /* Nobody was waiting; leave mutex_waiters untouched. */
1680                 queue_unlock(qp);
1681         } else {
1682                 mp->mutex_waiters = 0;
                     /*
                      * no_preempt()/preempt() bracket the queue unlock
                      * and the unpark call(s).
                      */
1683                 no_preempt(curthread);
1684                 queue_unlock(qp);
1685                 if (nlwpid == 1)
1686                         (void) __lwp_unpark(lwpid[0]);
1687                 else
1688                         (void) __lwp_unpark_all(lwpid, nlwpid);
1689                 preempt(curthread);
1690         }
1691
             /*
              * lwpid differs from buffer only if alloc_lwpids() substituted
              * a larger array for the on-stack one; unmap that array.
              */
1692         if (lwpid != buffer)
1693                 (void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
1694 }
1695
1696 /*
1697  * Release a process-private mutex.
1698  * As an optimization, if there are waiters but there are also spinners
1699  * attempting to acquire the mutex, then don't bother waking up a waiter;
1700  * one of the spinners will acquire the mutex soon and it would be a waste
1701  * of resources to wake up some thread just to have it spin for a while
1702  * and then possibly go back to sleep.  See mutex_trylock_adaptive().
      *
      * If release_all is non-zero and the waiters bit was set, all waiters
      * are awakened via mutex_wakeup_all() (spinners are not considered)
      * and 0 is returned.  Otherwise the return value is the lwpid that
      * mutex_wakeup() dequeued, or 0 if nothing was dequeued.
      *
      * NOTE(review): when a non-zero lwpid is returned, preempt(self) is
      * not called here, so the no_preempt() taken below is still in
      * effect on return; presumably the caller unparks the lwp and then
      * calls preempt() -- confirm against the callers.
1703  */
1704 static lwpid_t
1705 mutex_unlock_queue(mutex_t *mp, int release_all)
1706 {
1707         ulwp_t *self = curthread;
1708         lwpid_t lwpid = 0;
1709         uint32_t old_lockword;  /* lock word as it was before the clear */
1710
1711         DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
             /* sigoff()/sigon() bracket the whole release sequence */
1712         sigoff(self);
             /* Drop ownership first, then clear the lock byte. */
1713         mp->mutex_owner = 0;
1714         old_lockword = clear_lockbyte(&mp->mutex_lockword);
1715         if ((old_lockword & WAITERMASK) &&
1716             (release_all || (old_lockword & SPINNERMASK) == 0)) {
1717                 no_preempt(self);       /* ensure a prompt wakeup */
1718                 if (release_all)
1719                         mutex_wakeup_all(mp);
1720                 else
1721                         lwpid = mutex_wakeup(mp);
                     /* lwpid is still 0 on the release_all path */
1722                 if (lwpid == 0)
1723                         preempt(self);
1724         }
1725         sigon(self);
1726         return (lwpid);
1727 }
1728
1729 /*
1730  * Like mutex_unlock_queue(), but for process-shared mutexes.
      *
      * Ownership is dropped and the lock byte is cleared; if the old lock
      * word showed waiters and either release_all is non-zero or there
      * were no spinners, ___lwp_mutex_wakeup() is called to do the wakeup.
      * Nothing is returned.
1731  */
1732 static void
1733 mutex_unlock_process(mutex_t *mp, int release_all)
1734 {
1735         ulwp_t *self = curthread;
1736         uint64_t old_lockword64;
1737
1738         DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
             /* sigoff()/sigon() bracket the whole release sequence */
1739         sigoff(self);
1740         mp->mutex_owner = 0;
1741 #if defined(__sparc) && !defined(_LP64)
1742         /* horrible hack, necessary only on 32-bit sparc */
             /*
              * Taken only for a mutex that is not long-long aligned, when
              * ul_misaligned is set and the mutex is not LOCK_ROBUST.
              * The 32-bit lock word is cleared instead of the 64-bit one,
              * so mutex_ownerpid has to be cleared by hand.
              */
1743         if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
1744             self->ul_misaligned && !(mp->mutex_type & LOCK_ROBUST)) {
1745                 uint32_t old_lockword;
1746                 mp->mutex_ownerpid = 0;
1747                 old_lockword = clear_lockbyte(&mp->mutex_lockword);
1748                 if ((old_lockword & WAITERMASK) &&
1749                     (release_all || (old_lockword & SPINNERMASK) == 0)) {
1750                         no_preempt(self);       /* ensure a prompt wakeup */
1751                         (void) ___lwp_mutex_wakeup(mp, release_all);
1752                         preempt(self);
1753                 }
1754                 sigon(self);
1755                 return;
1756         }
1757 #endif
1758         /* mp->mutex_ownerpid is cleared by clear_lockbyte64() */
1759         old_lockword64 = clear_lockbyte64(&mp->mutex_lockword64);
1760         if ((old_lockword64 & WAITERMASK64) &&
1761             (release_all || (old_lockword64 & SPINNERMASK64) == 0)) {
1762                 no_preempt(self);       /* ensure a prompt wakeup */
1763                 (void) ___lwp_mutex_wakeup(mp, release_all);
1764                 preempt(self);
1765         }
1766         sigon(self);
1767 }
1768
     /*
      * Block the calling thread: loop forever trying to acquire stall_mutex
      * through mutex_lock_kernel().  stall_mutex is described at its
      * definition as being held by lwp#1.  There is no exit from the loop,
      * so this function does not return.
      */
1769 void
1770 stall(void)
1771 {
1772         for (;;)
1773                 (void) mutex_lock_kernel(&stall_mutex, NULL, NULL);
1774 }
1775
1776 /*
1777  * Acquire a USYNC_THREAD mutex via user-level sleep queues.
1778  * We failed set_lock_byte(&mp->mutex_lockw) before coming here.
1779  * If successful, returns with mutex_owner set correctly.
      *
      * self is the calling thread, msp is an optional statistics block
      * (may be NULL), and tsp is the timeout handed to __lwp_park()
      * (callers in this file pass NULL for an untimed lock).
      *
      * Returns 0 when the lock was acquired, EOWNERDEAD when it was
      * acquired and LOCK_OWNERDEAD is set in mutex_flag, ENOTRECOVERABLE
      * when LOCK_NOTRECOVERABLE is set (the lock is released again), or
      * the non-EINTR error from __lwp_park() (asserted below to be
      * EINVAL or ETIME), in which case the lock is not held.
1780  */
1781 int
1782 mutex_lock_queue(ulwp_t *self, tdb_mutex_stats_t *msp, mutex_t *mp,
1783         timespec_t *tsp)
1784 {
1785         uberdata_t *udp = curthread->ul_uberdata;
1786         queue_head_t *qp;
1787         hrtime_t begin_sleep;   /* assigned and read only when msp != NULL */
1788         int error = 0;
1789
             /* ul_sp is set for the duration of the sleep and zeroed below */
1790         self->ul_sp = stkptr();
1791         if (__td_event_report(self, TD_SLEEP, udp)) {
1792                 self->ul_wchan = mp;
1793                 self->ul_td_evbuf.eventnum = TD_SLEEP;
1794                 self->ul_td_evbuf.eventdata = mp;
1795                 tdb_event(TD_SLEEP, udp);
1796         }
1797         if (msp) {
1798                 tdb_incr(msp->mutex_sleep);
1799                 begin_sleep = gethrtime();
1800         }
1801
1802         DTRACE_PROBE1(plockstat, mutex__block, mp);
1803
1804         /*
1805          * Put ourself on the sleep queue, and while we are
1806          * unable to grab the lock, go park in the kernel.
1807          * Take ourself off the sleep queue after we acquire the lock.
1808          * The waiter bit can be set/cleared only while holding the queue lock.
1809          */
1810         qp = queue_lock(mp, MX);
1811         enqueue(qp, self, 0);
1812         mp->mutex_waiters = 1;
             /* Every exit from this loop is taken with the queue lock held. */
1813         for (;;) {
1814                 if (set_lock_byte(&mp->mutex_lockw) == 0) {
                             /* Got the lock while still on the queue. */
1815                         mp->mutex_owner = (uintptr_t)self;
1816                         mp->mutex_waiters = dequeue_self(qp);
1817                         break;
1818                 }
                     /* The queue lock is dropped before parking. */
1819                 set_parking_flag(self, 1);
1820                 queue_unlock(qp);
1821                 /*
1822                  * __lwp_park() will return the residual time in tsp
1823                  * if we are unparked before the timeout expires.
1824                  */
1825                 error = __lwp_park(tsp, 0);
1826                 set_parking_flag(self, 0);
1827                 /*
1828                  * We could have taken a signal or suspended ourself.
1829                  * If we did, then we removed ourself from the queue.
1830                  * Someone else may have removed us from the queue
1831                  * as a consequence of mutex_unlock().  We may have
1832                  * gotten a timeout from __lwp_park().  Or we may still
1833                  * be on the queue and this is just a spurious wakeup.
1834                  */
1835                 qp = queue_lock(mp, MX);
1836                 if (self->ul_sleepq == NULL) {
                             /* We are no longer on the sleep queue. */
1837                         if (error) {
                                     /* recompute the waiters flag */
1838                                 mp->mutex_waiters = queue_waiter(qp)? 1 : 0;
                                     /* EINTR is not reported; keep trying */
1839                                 if (error != EINTR)
1840                                         break;
1841                                 error = 0;
1842                         }
1843                         if (set_lock_byte(&mp->mutex_lockw) == 0) {
1844                                 mp->mutex_owner = (uintptr_t)self;
1845                                 break;
1846                         }
                             /* Still locked by someone else; queue up again. */
1847                         enqueue(qp, self, 0);
1848                         mp->mutex_waiters = 1;
1849                 }
1850                 ASSERT(self->ul_sleepq == qp &&
1851                     self->ul_qtype == MX &&
1852                     self->ul_wchan == mp);
1853                 if (error) {
                             /* Give up on any error other than EINTR. */
1854                         if (error != EINTR) {
1855                                 mp->mutex_waiters = dequeue_self(qp);
1856                                 break;
1857                         }
1858                         error = 0;
1859                 }
1860         }
1861         ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
1862             self->ul_wchan == NULL);
1863         self->ul_sp = 0;
1864
1865         ASSERT(error == 0 || error == EINVAL || error == ETIME);
1866
1867         if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1868                 ASSERT(mp->mutex_type & LOCK_ROBUST);
1869                 /*
1870                  * We shouldn't own the mutex.
1871                  * Just clear the lock; everyone has already been waked up.
1872                  */
1873                 mp->mutex_owner = 0;
1874                 (void) clear_lockbyte(&mp->mutex_lockword);
1875                 error = ENOTRECOVERABLE;
1876         }
1877
1878         queue_unlock(qp);
1879
             /* begin_sleep was set above under the same msp test */
1880         if (msp)
1881                 msp->mutex_sleep_time += gethrtime() - begin_sleep;
1882
1883         if (error) {
1884                 DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
1885                 DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1886         } else {
1887                 DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
1888                 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
                     /* The lock is held, but report that the owner died. */
1889                 if (mp->mutex_flag & LOCK_OWNERDEAD) {
1890                         ASSERT(mp->mutex_type & LOCK_ROBUST);
1891                         error = EOWNERDEAD;
1892                 }
1893         }
1894
1895         return (error);
1896 }
1897
     /*
      * Handle a lock request on a mutex that the caller already holds
      * (asserted).  mtype must include LOCK_RECURSIVE and/or
      * LOCK_ERRORCHECK, and try must be MUTEX_TRY or MUTEX_LOCK.
      *
      * LOCK_RECURSIVE: increment mutex_rcount and return 0, or return
      * EAGAIN if the count is already RECURSION_MAX.
      * Otherwise: return EDEADLK for MUTEX_LOCK and EBUSY for MUTEX_TRY.
      */
1898 static int
1899 mutex_recursion(mutex_t *mp, int mtype, int try)
1900 {
1901         ASSERT(mutex_held(mp));
1902         ASSERT(mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK));
1903         ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
1904
1905         if (mtype & LOCK_RECURSIVE) {
1906                 if (mp->mutex_rcount == RECURSION_MAX) {
1907                         DTRACE_PROBE2(plockstat, mutex__error, mp, EAGAIN);
1908                         return (EAGAIN);
1909                 }
1910                 mp->mutex_rcount++;
1911                 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 1, 0);
1912                 return (0);
1913         }
             /* Not recursive, so this is the LOCK_ERRORCHECK case. */
1914         if (try == MUTEX_LOCK) {
1915                 DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
1916                 return (EDEADLK);
1917         }
             /* MUTEX_TRY: no error probe is fired for the EBUSY result */
1918         return (EBUSY);
1919 }
1920
1921 /*
1922  * Register this USYNC_PROCESS|LOCK_ROBUST mutex with the kernel so
1923  * it can apply LOCK_OWNERDEAD|LOCK_UNMAPPED if it becomes necessary.
1924  * We use tdb_hash_lock here and in the synch object tracking code in
1925  * the tdb_agent.c file.  There is no conflict between these two usages.
      *
      * Registered mutexes are tracked in udp->robustlocks, a hash table
      * of LOCKHASHSZ chains indexed by LOCK_HASH(mp); every robust_t is
      * also linked onto udp->robustlist.  If mp is already present in
      * its hash chain, nothing is done.
      *
      * NOTE(review): both lmalloc() results below are used without a
      * NULL check; confirm that lmalloc() cannot return NULL here.
1926  */
1927 void
1928 register_lock(mutex_t *mp)
1929 {
1930         uberdata_t *udp = curthread->ul_uberdata;
1931         uint_t hash = LOCK_HASH(mp);
1932         robust_t *rlp;
1933         robust_t *invalid;      /* first reusable (INVALID_ADDR) entry */
1934         robust_t **rlpp;        /* link to update when appending */
1935         robust_t **table;
1936
             /*
              * Create the hash table on first use.  The pointer is tested
              * again once tdb_hash_lock is held, and the new table is
              * stored only after membar_producer().
              */
1937         if ((table = udp->robustlocks) == NULL) {
1938                 lmutex_lock(&udp->tdb_hash_lock);
1939                 if ((table = udp->robustlocks) == NULL) {
1940                         table = lmalloc(LOCKHASHSZ * sizeof (robust_t *));
1941                         membar_producer();
1942                         udp->robustlocks = table;
1943                 }
1944                 lmutex_unlock(&udp->tdb_hash_lock);
1945         }
1946         membar_consumer();
1947
1948         /*
1949          * First search the registered table with no locks held.
1950          * This is safe because the table never shrinks
1951          * and we can only get a false negative.
1952          */
1953         for (rlp = table[hash]; rlp != NULL; rlp = rlp->robust_next) {
1954                 if (rlp->robust_lock == mp)     /* already registered */
1955                         return;
1956         }
1957
1958         /*
1959          * The lock was not found.
1960          * Repeat the operation with tdb_hash_lock held.
1961          */
1962         lmutex_lock(&udp->tdb_hash_lock);
1963
1964         invalid = NULL;
             /* On a full walk, rlpp ends at the chain's terminating link. */
1965         for (rlpp = &table[hash];
1966             (rlp = *rlpp) != NULL;
1967             rlpp = &rlp->robust_next) {
1968                 if (rlp->robust_lock == mp) {   /* already registered */
1969                         lmutex_unlock(&udp->tdb_hash_lock);
1970                         return;
1971                 }
1972                 /* remember the first invalid entry, if any */
1973                 if (rlp->robust_lock == INVALID_ADDR && invalid == NULL)
1974                         invalid = rlp;
1975         }
1976
1977         /*
1978          * The lock has never been registered.
1979          * Add it to the table and register it now.
1980          */
1981         if ((rlp = invalid) != NULL) {
1982                 /*
1983                  * Reuse the invalid entry we found above.
1984                  * The linkages are still correct.
1985                  */
1986                 rlp->robust_lock = mp;
1987                 membar_producer();
1988         } else {
1989                 /*
1990                  * Allocate a new entry and add it to
1991                  * the hash table and to the global list.
1992                  */
1993                 rlp = lmalloc(sizeof (*rlp));
1994                 rlp->robust_lock = mp;
1995                 rlp->robust_next = NULL;
1996                 rlp->robust_list = udp->robustlist;
1997                 udp->robustlist = rlp;
                     /*
                      * The entry is filled in before it is linked into
                      * the hash chain, which is searched without the lock.
                      */
1998                 membar_producer();
1999                 *rlpp = rlp;
2000         }
2001
2002         lmutex_unlock(&udp->tdb_hash_lock);
2003
             /* The kernel call is made after tdb_hash_lock is released. */
2004         (void) ___lwp_mutex_register(mp, &rlp->robust_lock);
2005 }
2006
2007 /*
2008  * This is called in the child of fork()/forkall() to start over
2009  * with a clean slate.  (Each process must register its own locks.)
2010  * No locks are needed because all other threads are suspended or gone.
      *
      * Frees every robust_t on udp->robustlist and then the
      * udp->robustlocks hash table, leaving both pointers NULL.
2011  */
2012 void
2013 unregister_locks(void)
2014 {
2015         uberdata_t *udp = curthread->ul_uberdata;
2016         robust_t **table;
2017         robust_t *rlp;
2018         robust_t *next;
2019
2020         /*
2021          * Do this first, before calling lfree().
2022          */
2023         table = udp->robustlocks;
2024         udp->robustlocks = NULL;
2025         rlp = udp->robustlist;
2026         udp->robustlist = NULL;
2027
2028         /*
2029          * Do this by traversing the global list, not the hash table.
2030          */
2031         while (rlp != NULL) {
                     /* fetch the link before the entry is freed */
2032                 next = rlp->robust_list;
2033                 lfree(rlp, sizeof (*rlp));
2034                 rlp = next;
2035         }
             /* table is NULL if no table was ever created */
2036         if (table != NULL)
2037                 lfree(table, LOCKHASHSZ * sizeof (robust_t *));
2038 }
2039
2040 /*
2041  * Returns with mutex_owner set correctly.
      *
      * General ("long way") lock path used by mutex_lock_impl() and
      * mutex_trylock() when their fast paths do not apply.
      *
      * try is MUTEX_TRY or MUTEX_LOCK, optionally or'ed with MUTEX_NOCEIL,
      * which suppresses the LOCK_PRIO_PROTECT ceiling handling.  tsp is an
      * optional timeout (NULL for none).
      *
      * The mutex type selects the mechanism: LOCK_PRIO_INHERIT goes
      * straight to the kernel, USYNC_PROCESS uses mutex_trylock_process()
      * and then mutex_lock_kernel(), and everything else uses
      * mutex_trylock_adaptive() and then mutex_lock_queue().
      *
      * Returns 0, EOWNERDEAD or ELOCKUNMAPPED when the lock is held, and
      * any other value (for example EPERM, EINVAL, EBUSY, ETIME or
      * ENOTRECOVERABLE) when it is not.
2042  */
2043 int
2044 mutex_lock_internal(mutex_t *mp, timespec_t *tsp, int try)
2045 {
2046         ulwp_t *self = curthread;
2047         uberdata_t *udp = self->ul_uberdata;
2048         int mtype = mp->mutex_type;
2049         tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2050         int error = 0;
2051         int noceil = try & MUTEX_NOCEIL;
2052         uint8_t ceil;   /* set only in the LOCK_PRIO_PROTECT block */
2053         int myprio;     /* set only in the LOCK_PRIO_PROTECT block */
2054
             /* Strip the modifier so try is exactly MUTEX_TRY or MUTEX_LOCK. */
2055         try &= ~MUTEX_NOCEIL;
2056         ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
2057
2058         if (!self->ul_schedctl_called)
2059                 (void) setup_schedctl();
2060
2061         if (msp && try == MUTEX_TRY)
2062                 tdb_incr(msp->mutex_try);
2063
             /* Relocking a recursive or error-checking mutex we hold. */
2064         if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && mutex_held(mp))
2065                 return (mutex_recursion(mp, mtype, try));
2066
             /* With error detection on, report an untimed self-deadlock. */
2067         if (self->ul_error_detection && try == MUTEX_LOCK &&
2068             tsp == NULL && mutex_held(mp))
2069                 lock_error(mp, "mutex_lock", NULL, NULL);
2070
2071         if ((mtype & LOCK_PRIO_PROTECT) && noceil == 0) {
2072                 update_sched(self);
                     /* EPERM unless ul_cid matches ul_rtclassid */
2073                 if (self->ul_cid != self->ul_rtclassid) {
2074                         DTRACE_PROBE2(plockstat, mutex__error, mp, EPERM);
2075                         return (EPERM);
2076                 }
2077                 ceil = mp->mutex_ceiling;
                     /* use ul_epri when it is non-zero, else ul_pri */
2078                 myprio = self->ul_epri? self->ul_epri : self->ul_pri;
2079                 if (myprio > ceil) {
2080                         DTRACE_PROBE2(plockstat, mutex__error, mp, EINVAL);
2081                         return (EINVAL);
2082                 }
2083                 if ((error = _ceil_mylist_add(mp)) != 0) {
2084                         DTRACE_PROBE2(plockstat, mutex__error, mp, error);
2085                         return (error);
2086                 }
                     /* undone in the failure case of the final switch */
2087                 if (myprio < ceil)
2088                         _ceil_prio_inherit(ceil);
2089         }
2090
             /* Only mutexes that are both process-shared and robust. */
2091         if ((mtype & (USYNC_PROCESS | LOCK_ROBUST))
2092             == (USYNC_PROCESS | LOCK_ROBUST))
2093                 register_lock(mp);
2094
2095         if (mtype & LOCK_PRIO_INHERIT) {
2096                 /* go straight to the kernel */
2097                 if (try == MUTEX_TRY)
2098                         error = mutex_trylock_kernel(mp);
2099                 else    /* MUTEX_LOCK */
2100                         error = mutex_lock_kernel(mp, tsp, msp);
2101                 /*
2102                  * The kernel never sets or clears the lock byte
2103                  * for LOCK_PRIO_INHERIT mutexes.
2104                  * Set it here for consistency.
2105                  */
                     /* Errors not listed below pass through unchanged. */
2106                 switch (error) {
2107                 case 0:
2108                         self->ul_pilocks++;
2109                         mp->mutex_lockw = LOCKSET;
2110                         break;
2111                 case EOWNERDEAD:
2112                 case ELOCKUNMAPPED:
                             /* the lock is held in these cases as well */
2113                         self->ul_pilocks++;
2114                         mp->mutex_lockw = LOCKSET;
2115                         /* FALLTHROUGH */
2116                 case ENOTRECOVERABLE:
2117                         ASSERT(mtype & LOCK_ROBUST);
2118                         break;
2119                 case EDEADLK:
2120                         if (try == MUTEX_TRY) {
2121                                 error = EBUSY;
2122                         } else if (tsp != NULL) {       /* simulate a timeout */
2123                                 /*
2124                                  * Note: mutex_timedlock() never returns EINTR.
2125                                  */
2126                                 timespec_t ts = *tsp;
2127                                 timespec_t rts;
2128
                                     /* resume with the remaining time */
2129                                 while (__nanosleep(&ts, &rts) == EINTR)
2130                                         ts = rts;
2131                                 error = ETIME;
2132                         } else {                /* simulate a deadlock */
2133                                 stall();
2134                         }
2135                         break;
2136                 }
2137         } else if (mtype & USYNC_PROCESS) {
                     /* fall back to the kernel only for a blocking request */
2138                 error = mutex_trylock_process(mp, try == MUTEX_LOCK);
2139                 if (error == EBUSY && try == MUTEX_LOCK)
2140                         error = mutex_lock_kernel(mp, tsp, msp);
2141         } else {        /* USYNC_THREAD */
                     /* fall back to the sleep queue only when blocking */
2142                 error = mutex_trylock_adaptive(mp, try == MUTEX_LOCK);
2143                 if (error == EBUSY && try == MUTEX_LOCK)
2144                         error = mutex_lock_queue(self, msp, mp, tsp);
2145         }
2146
2147         switch (error) {
2148         case 0:
2149         case EOWNERDEAD:
2150         case ELOCKUNMAPPED:
                     /* The lock is held in all three of these cases. */
2151                 if (mtype & LOCK_ROBUST)
2152                         remember_lock(mp);
2153                 if (msp)
2154                         record_begin_hold(msp);
2155                 break;
2156         default:
                     /*
                      * The lock was not acquired.  Undo the ceiling setup;
                      * ceil and myprio were assigned above under this same
                      * condition.
                      */
2157                 if ((mtype & LOCK_PRIO_PROTECT) && noceil == 0) {
2158                         (void) _ceil_mylist_del(mp);
2159                         if (myprio < ceil)
2160                                 _ceil_prio_waive();
2161                 }
2162                 if (try == MUTEX_TRY) {
2163                         if (msp)
2164                                 tdb_incr(msp->mutex_try_fail);
2165                         if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2166                                 self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2167                                 tdb_event(TD_LOCK_TRY, udp);
2168                         }
2169                 }
2170                 break;
2171         }
2172
2173         return (error);
2174 }
2175
     /*
      * Fast path for a USYNC_PROCESS mutex whose only other type flags are
      * LOCK_RECURSIVE and/or LOCK_ERRORCHECK (asserted below).
      *
      * try is MUTEX_LOCK or MUTEX_TRY; tsp is the optional timeout that is
      * passed on to mutex_lock_kernel() in the MUTEX_LOCK case.
      *
      * Returns 0 when the lock is acquired, the result of
      * mutex_recursion() when the caller already holds it, the result of
      * mutex_lock_kernel() for a contended MUTEX_LOCK, or EBUSY for a
      * contended MUTEX_TRY.
      */
2176 int
2177 fast_process_lock(mutex_t *mp, timespec_t *tsp, int mtype, int try)
2178 {
2179         ulwp_t *self = curthread;
2180         uberdata_t *udp = self->ul_uberdata;
2181
2182         /*
2183          * We know that USYNC_PROCESS is set in mtype and that
2184          * zero, one, or both of the flags LOCK_RECURSIVE and
2185          * LOCK_ERRORCHECK are set, and that no other flags are set.
2186          */
2187         ASSERT((mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0);
             /* enter_critical()/exit_critical() bracket the first attempt */
2188         enter_critical(self);
2189 #if defined(__sparc) && !defined(_LP64)
2190         /* horrible hack, necessary only on 32-bit sparc */
             /*
              * For a misaligned mutex the lock byte is set on its own
              * and mutex_ownerpid is stored separately.
              */
2191         if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
2192             self->ul_misaligned) {
2193                 if (set_lock_byte(&mp->mutex_lockw) == 0) {
2194                         mp->mutex_ownerpid = udp->pid;
2195                         mp->mutex_owner = (uintptr_t)self;
2196                         exit_critical(self);
2197                         DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2198                         return (0);
2199                 }
2200         } else
2201 #endif
2202         if (set_lock_byte64(&mp->mutex_lockword64, udp->pid) == 0) {
2203                 mp->mutex_owner = (uintptr_t)self;
2204                 /* mp->mutex_ownerpid was set by set_lock_byte64() */
2205                 exit_critical(self);
2206                 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2207                 return (0);
2208         }
             /* The first attempt failed. */
2209         exit_critical(self);
2210
2211         if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && shared_mutex_held(mp))
2212                 return (mutex_recursion(mp, mtype, try));
2213
2214         if (try == MUTEX_LOCK) {
                     /* try once more in user code before blocking in the kernel */
2215                 if (mutex_trylock_process(mp, 1) == 0)
2216                         return (0);
2217                 return (mutex_lock_kernel(mp, tsp, NULL));
2218         }
2219
             /* MUTEX_TRY failed: post TD_LOCK_TRY if reporting is enabled. */
2220         if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2221                 self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2222                 tdb_event(TD_LOCK_TRY, udp);
2223         }
2224         return (EBUSY);
2225 }
2226
     /*
      * Common implementation behind mutex_lock() and the timed lock
      * functions.  tsp is NULL for an untimed lock, otherwise it is the
      * timeout handed down to the blocking primitives.
      *
      * Two fast paths are tried in order (single-threaded process, then
      * the common USYNC_THREAD/USYNC_PROCESS types); anything else goes
      * through mutex_lock_internal().  Returns 0 on success or an error
      * number.
      */
2227 static int
2228 mutex_lock_impl(mutex_t *mp, timespec_t *tsp)
2229 {
2230         ulwp_t *self = curthread;
2231         int mtype = mp->mutex_type;
2232         uberflags_t *gflags;
2233
             /* Reported only with error detection on and ul_misaligned 0. */
2234         if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
2235             self->ul_error_detection && self->ul_misaligned == 0)
2236                 lock_error(mp, "mutex_lock", NULL, "mutex is misaligned");
2237
2238         /*
2239          * Optimize the case of USYNC_THREAD, including
2240          * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2241          * no error detection, no lock statistics,
2242          * and the process has only a single thread.
2243          * (Most likely a traditional single-threaded application.)
2244          */
2245         if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2246             self->ul_uberdata->uberflags.uf_all) == 0) {
2247                 /*
2248                  * Only one thread exists so we don't need an atomic operation.
2249                  * We do, however, need to protect against signals.
2250                  */
2251                 if (mp->mutex_lockw == 0) {
2252                         sigoff(self);
2253                         mp->mutex_lockw = LOCKSET;
2254                         mp->mutex_owner = (uintptr_t)self;
2255                         sigon(self);
2256                         DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2257                         return (0);
2258                 }
                     /* non-zero mtype here: RECURSIVE and/or ERRORCHECK */
2259                 if (mtype && MUTEX_OWNER(mp) == self)
2260                         return (mutex_recursion(mp, mtype, MUTEX_LOCK));
2261                 /*
2262                  * We have reached a deadlock, probably because the
2263                  * process is executing non-async-signal-safe code in
2264                  * a signal handler and is attempting to acquire a lock
2265                  * that it already owns.  This is not surprising, given
2266                  * bad programming practices over the years that have
2267                  * resulted in applications calling printf() and such
2268                  * in their signal handlers.  Unless the user has told
2269                  * us that the signal handlers are safe by setting:
2270                  *      export _THREAD_ASYNC_SAFE=1
2271                  * we return EDEADLK rather than actually deadlocking.
2272                  */
2273                 if (tsp == NULL &&
2274                     MUTEX_OWNER(mp) == self && !self->ul_async_safe) {
2275                         DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
2276                         return (EDEADLK);
2277                 }
                     /* otherwise fall through to the code below */
2278         }
2279
2280         /*
2281          * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2282          * no error detection, and no lock statistics.
2283          * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2284          */
2285         if ((gflags = self->ul_schedctl_called) != NULL &&
2286             (gflags->uf_trs_ted |
2287             (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
2288                 if (mtype & USYNC_PROCESS)
2289                         return (fast_process_lock(mp, tsp, mtype, MUTEX_LOCK));
                     /* USYNC_THREAD: one attempt with signals deferred */
2290                 sigoff(self);
2291                 if (set_lock_byte(&mp->mutex_lockw) == 0) {
2292                         mp->mutex_owner = (uintptr_t)self;
2293                         sigon(self);
2294                         DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2295                         return (0);
2296                 }
2297                 sigon(self);
2298                 if (mtype && MUTEX_OWNER(mp) == self)
2299                         return (mutex_recursion(mp, mtype, MUTEX_LOCK));
                     /* adaptive attempt first, then block on the sleep queue */
2300                 if (mutex_trylock_adaptive(mp, 1) != 0)
2301                         return (mutex_lock_queue(self, NULL, mp, tsp));
2302                 return (0);
2303         }
2304
2305         /* else do it the long way */
2306         return (mutex_lock_internal(mp, tsp, MUTEX_LOCK));
2307 }
2308
     /*
      * Lock a mutex with no timeout.  Also exported under the weak names
      * pthread_mutex_lock and _mutex_lock.  Returns whatever
      * mutex_lock_impl() returns: 0 on success, else an error number.
      */
2309 #pragma weak pthread_mutex_lock = mutex_lock
2310 #pragma weak _mutex_lock = mutex_lock
2311 int
2312 mutex_lock(mutex_t *mp)
2313 {
             /* ul_critical must be zero here unless ul_bindflags is set */
2314         ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2315         return (mutex_lock_impl(mp, NULL));
2316 }
2317
     /*
      * Lock a mutex and treat every failure as fatal.  The attribute bits
      * of the mutex type (mutex_type & ALL_ATTRIBUTES) must be exactly
      * LOCK_ERRORCHECK or LOCK_ERRORCHECK|LOCK_RECURSIVE.  A bad type or
      * any non-zero result from mutex_lock() leads to mutex_panic().
      */
2318 void
2319 mutex_enter(mutex_t *mp)
2320 {
2321         int ret;
2322         int attr = mp->mutex_type & ALL_ATTRIBUTES;
2323
2324         /*
2325          * Require LOCK_ERRORCHECK, accept LOCK_RECURSIVE.
2326          */
2327         if (attr != LOCK_ERRORCHECK &&
2328             attr != (LOCK_ERRORCHECK | LOCK_RECURSIVE)) {
2329                 mutex_panic(mp, "mutex_enter: bad mutex type");
2330         }
2331         ret = mutex_lock(mp);
             /* Map each failure to its own panic message. */
2332         if (ret == EDEADLK) {
2333                 mutex_panic(mp, "recursive mutex_enter");
2334         } else if (ret == EAGAIN) {
2335                 mutex_panic(mp, "excessive recursive mutex_enter");
2336         } else if (ret != 0) {
2337                 mutex_panic(mp, "unknown mutex_enter failure");
2338         }
2339 }
2340
     /*
      * Lock a mutex, giving up at the absolute time abstime.  abstime is
      * converted to a relative timeout against CLOCK_REALTIME and handed
      * to mutex_lock_impl().  An ETIME result is reported to the caller
      * as ETIMEDOUT; every other result is returned unchanged.
      */
2341 int
2342 pthread_mutex_timedlock(pthread_mutex_t *_RESTRICT_KYWD mp,
2343         const struct timespec *_RESTRICT_KYWD abstime)
2344 {
2345         timespec_t tslocal;     /* relative timeout computed from abstime */
2346         int error;
2347
2348         ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2349         abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
2350         error = mutex_lock_impl((mutex_t *)mp, &tslocal);
2351         if (error == ETIME)
2352                 error = ETIMEDOUT;
2353         return (error);
2354 }
2355
     /*
      * Like pthread_mutex_timedlock(), but reltime is already a relative
      * timeout.  It is copied into a local timespec, and the address of
      * that copy, not of the caller's const structure, is what gets
      * passed down.  ETIME is reported to the caller as ETIMEDOUT.
      */
2356 int
2357 pthread_mutex_reltimedlock_np(pthread_mutex_t *_RESTRICT_KYWD mp,
2358         const struct timespec *_RESTRICT_KYWD reltime)
2359 {
2360         timespec_t tslocal;     /* private copy of *reltime */
2361         int error;
2362
2363         ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2364         tslocal = *reltime;
2365         error = mutex_lock_impl((mutex_t *)mp, &tslocal);
2366         if (error == ETIME)
2367                 error = ETIMEDOUT;
2368         return (error);
2369 }
2370
     /*
      * Try to lock a mutex without blocking.  Also exported under the weak
      * name pthread_mutex_trylock.  The structure mirrors
      * mutex_lock_impl(): a single-threaded fast path, a fast path for
      * the common mutex types, and mutex_lock_internal() with MUTEX_TRY
      * for everything else.
      *
      * Returns 0 when the lock is acquired.  On the two fast paths a busy
      * mutex yields EBUSY, or the result of mutex_recursion() when the
      * caller already owns a recursive/error-checking mutex.
      */
2371 #pragma weak pthread_mutex_trylock = mutex_trylock
2372 int
2373 mutex_trylock(mutex_t *mp)
2374 {
2375         ulwp_t *self = curthread;
2376         uberdata_t *udp = self->ul_uberdata;
2377         int mtype = mp->mutex_type;
2378         uberflags_t *gflags;
2379
2380         ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2381
2382         /*
2383          * Optimize the case of USYNC_THREAD, including
2384          * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2385          * no error detection, no lock statistics,
2386          * and the process has only a single thread.
2387          * (Most likely a traditional single-threaded application.)
2388          */
2389         if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2390             udp->uberflags.uf_all) == 0) {
2391                 /*
2392                  * Only one thread exists so we don't need an atomic operation.
2393                  * We do, however, need to protect against signals.
2394                  */
2395                 if (mp->mutex_lockw == 0) {
2396                         sigoff(self);
2397                         mp->mutex_lockw = LOCKSET;
2398                         mp->mutex_owner = (uintptr_t)self;
2399                         sigon(self);
2400                         DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2401                         return (0);
2402                 }
                     /* non-zero mtype here: RECURSIVE and/or ERRORCHECK */
2403                 if (mtype && MUTEX_OWNER(mp) == self)
2404                         return (mutex_recursion(mp, mtype, MUTEX_TRY));
                     /* no TD_LOCK_TRY event is posted on this path */
2405                 return (EBUSY);
2406         }
2407
2408         /*
2409          * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2410          * no error detection, and no lock statistics.
2411          * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2412          */
2413         if ((gflags = self->ul_schedctl_called) != NULL &&
2414             (gflags->uf_trs_ted |
2415             (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
2416                 if (mtype & USYNC_PROCESS)
2417                         return (fast_process_lock(mp, NULL, mtype, MUTEX_TRY));
                     /* USYNC_THREAD: one attempt with signals deferred */
2418                 sigoff(self);
2419                 if (set_lock_byte(&mp->mutex_lockw) == 0) {
2420                         mp->mutex_owner = (uintptr_t)self;
2421                         sigon(self);
2422                         DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2423                         return (0);
2424                 }
2425                 sigon(self);
2426                 if (mtype && MUTEX_OWNER(mp) == self)
2427                         return (mutex_recursion(mp, mtype, MUTEX_TRY));
                     /* Busy: post TD_LOCK_TRY if reporting is enabled. */
2428                 if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2429                         self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2430                         tdb_event(TD_LOCK_TRY, udp);
2431                 }
2432                 return (EBUSY);
2433         }
2434
2435         /* else do it the long way */
2436         return (mutex_lock_internal(mp, NULL, MUTEX_TRY));
2437 }
2438
/*
 * The "long way" mutex release.  mutex_unlock() falls back to this
 * (with retain_robust_flags == 0) when none of its fast paths apply,
 * and heldlock_exit() calls it with retain_robust_flags == 1.
 *
 * mp                   the mutex to release
 * retain_robust_flags  if zero, a non-LOCK_PRIO_INHERIT mutex that is
 *                      still flagged LOCK_OWNERDEAD or LOCK_UNMAPPED is
 *                      converted to LOCK_NOTRECOVERABLE before release;
 *                      if non-zero, those flags are left as they are.
 *
 * Returns EPERM if the mutex is LOCK_ERRORCHECK or LOCK_ROBUST and is
 * not held by the caller, 0 if only the recursion count was dropped,
 * the result of ___lwp_mutex_unlock() for LOCK_PRIO_INHERIT mutexes,
 * and 0 otherwise.
 */
int
mutex_unlock_internal(mutex_t *mp, int retain_robust_flags)
{
        ulwp_t *self = curthread;
        uberdata_t *udp = self->ul_uberdata;
        int mtype = mp->mutex_type;
        tdb_mutex_stats_t *msp;
        int error = 0;
        int release_all;
        lwpid_t lwpid;

        /* Error-checking and robust mutexes reject unlock by a non-owner. */
        if ((mtype & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
            !mutex_held(mp))
                return (EPERM);

        /*
         * For other mutex types, a non-owner unlock is only reported,
         * and only when error detection is on; if lock_error() returns,
         * the unlock proceeds.
         */
        if (self->ul_error_detection && !mutex_held(mp))
                lock_error(mp, "mutex_unlock", NULL, NULL);

        /* A nested hold of a recursive mutex: just drop one level. */
        if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
                mp->mutex_rcount--;
                DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
                return (0);
        }

        /* Lock statistics are only recorded when MUTEX_STATS() yields a record. */
        if ((msp = MUTEX_STATS(mp, udp)) != NULL)
                (void) record_hold_time(msp);

        /*
         * The mutex is being released while still marked
         * LOCK_OWNERDEAD/LOCK_UNMAPPED and the caller did not ask to
         * keep those flags: replace them with LOCK_NOTRECOVERABLE.
         */
        if (!retain_robust_flags && !(mtype & LOCK_PRIO_INHERIT) &&
            (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
                ASSERT(mtype & LOCK_ROBUST);
                mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
                mp->mutex_flag |= LOCK_NOTRECOVERABLE;
        }
        /* release_all is passed to the unlock helpers for unrecoverable locks. */
        release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);

        if (mtype & LOCK_PRIO_INHERIT) {
                /* Preemption is disabled across the whole release sequence. */
                no_preempt(self);
                mp->mutex_owner = 0;
                /* mp->mutex_ownerpid is cleared by ___lwp_mutex_unlock() */
                DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
                mp->mutex_lockw = LOCKCLEAR;
                self->ul_pilocks--;
                error = ___lwp_mutex_unlock(mp);
                preempt(self);
        } else if (mtype & USYNC_PROCESS) {
                mutex_unlock_process(mp, release_all);
        } else {        /* USYNC_THREAD */
                /* A non-zero lwpid is a waiter that must be unparked. */
                if ((lwpid = mutex_unlock_queue(mp, release_all)) != 0) {
                        (void) __lwp_unpark(lwpid);
                        preempt(self);
                }
        }

        /* Robust mutexes are tracked in the held-locks list; drop the entry. */
        if (mtype & LOCK_ROBUST)
                forget_lock(mp);

        if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
                _ceil_prio_waive();

        return (error);
}
2500
#pragma weak pthread_mutex_unlock = mutex_unlock
#pragma weak _mutex_unlock = mutex_unlock
/*
 * Release 'mp'.
 *
 * Two fast paths handle the common mutex types inline; everything else
 * is passed to mutex_unlock_internal(mp, 0).
 *
 * Returns EPERM on the fast paths if the mutex is LOCK_ERRORCHECK and
 * is not owned by the caller, 0 if the lock (or one recursion level of
 * a LOCK_RECURSIVE mutex) was released, and otherwise whatever
 * mutex_unlock_internal() returns.
 */
int
mutex_unlock(mutex_t *mp)
{
        ulwp_t *self = curthread;
        int mtype = mp->mutex_type;
        uberflags_t *gflags;
        lwpid_t lwpid;
        short el;

        /*
         * Optimize the case of USYNC_THREAD, including
         * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
         * no error detection, no lock statistics,
         * and the process has only a single thread.
         * (Most likely a traditional single-threaded application.)
         */
        if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
            self->ul_uberdata->uberflags.uf_all) == 0) {
                if (mtype) {
                        /*
                         * At this point we know that one or both of the
                         * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
                         */
                        if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
                                return (EPERM);
                        if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
                                mp->mutex_rcount--;
                                DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
                                return (0);
                        }
                }
                /*
                 * Only one thread exists so we don't need an atomic operation.
                 * Also, there can be no waiters.
                 * Signals are held off while the owner and the lock
                 * word are cleared.
                 */
                sigoff(self);
                mp->mutex_owner = 0;
                mp->mutex_lockword = 0;
                sigon(self);
                DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
                return (0);
        }

        /*
         * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
         * no error detection, and no lock statistics.
         * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
         */
        if ((gflags = self->ul_schedctl_called) != NULL) {
                /* 'el' caches uf_trs_ted for the tests that follow. */
                if (((el = gflags->uf_trs_ted) | mtype) == 0) {
                        /*
                         * Plain USYNC_THREAD mutex.  The label is also the
                         * target of the goto in the LOCK_RECURSIVE /
                         * LOCK_ERRORCHECK branch below.
                         */
fast_unlock:
                        if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
                                (void) __lwp_unpark(lwpid);
                                preempt(self);
                        }
                        return (0);
                }
                if (el)         /* error detection or lock statistics */
                        goto slow_unlock;
                if ((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
                        /*
                         * At this point we know that one or both of the
                         * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
                         */
                        if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
                                return (EPERM);
                        if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
                                mp->mutex_rcount--;
                                DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
                                return (0);
                        }
                        goto fast_unlock;
                }
                if ((mtype &
                    ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
                        /*
                         * At this point we know that zero, one, or both of the
                         * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set and
                         * that the USYNC_PROCESS flag is set.
                         * Ownership of a process-shared mutex is checked
                         * with shared_mutex_held(), which also compares
                         * the owner pid.
                         */
                        if ((mtype & LOCK_ERRORCHECK) && !shared_mutex_held(mp))
                                return (EPERM);
                        if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
                                mp->mutex_rcount--;
                                DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
                                return (0);
                        }
                        mutex_unlock_process(mp, 0);
                        return (0);
                }
        }

        /* else do it the long way */
slow_unlock:
        return (mutex_unlock_internal(mp, 0));
}
2599
2600 void
2601 mutex_exit(mutex_t *mp)
2602 {
2603         int ret;
2604         int attr = mp->mutex_type & ALL_ATTRIBUTES;
2605
2606         if (attr != LOCK_ERRORCHECK &&
2607             attr != (LOCK_ERRORCHECK | LOCK_RECURSIVE)) {
2608                 mutex_panic(mp, "mutex_exit: bad mutex type");
2609         }
2610         ret = mutex_unlock(mp);
2611         if (ret == EPERM) {
2612                 mutex_panic(mp, "mutex_exit: not owner");
2613         } else if (ret != 0) {
2614                 mutex_panic(mp, "unknown mutex_exit failure");
2615         }
2616
2617 }
2618
2619 /*
2620  * Internally to the library, almost all mutex lock/unlock actions
2621  * go through these lmutex_ functions, to protect critical regions.
2622  * We replicate a bit of code from mutex_lock() and mutex_unlock()
2623  * to make these functions faster since we know that the mutex type
2624  * of all internal locks is USYNC_THREAD.  We also know that internal
2625  * locking can never fail, so we panic if it does.
2626  */
2627 void
2628 lmutex_lock(mutex_t *mp)
2629 {
2630         ulwp_t *self = curthread;
2631         uberdata_t *udp = self->ul_uberdata;
2632
2633         ASSERT(mp->mutex_type == USYNC_THREAD);
2634
2635         enter_critical(self);
2636         /*
2637          * Optimize the case of no lock statistics and only a single thread.
2638          * (Most likely a traditional single-threaded application.)
2639          */
2640         if (udp->uberflags.uf_all == 0) {
2641                 /*
2642                  * Only one thread exists; the mutex must be free.
2643                  */
2644                 ASSERT(mp->mutex_lockw == 0);
2645                 mp->mutex_lockw = LOCKSET;
2646                 mp->mutex_owner = (uintptr_t)self;
2647                 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2648         } else {
2649                 tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2650
2651                 if (!self->ul_schedctl_called)
2652                         (void) setup_schedctl();
2653
2654                 if (set_lock_byte(&mp->mutex_lockw) == 0) {
2655                         mp->mutex_owner = (uintptr_t)self;
2656                         DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2657                 } else if (mutex_trylock_adaptive(mp, 1) != 0) {
2658                         (void) mutex_lock_queue(self, msp, mp, NULL);
2659                 }
2660
2661                 if (msp)
2662                         record_begin_hold(msp);
2663         }
2664 }
2665
2666 void
2667 lmutex_unlock(mutex_t *mp)
2668 {
2669         ulwp_t *self = curthread;
2670         uberdata_t *udp = self->ul_uberdata;
2671
2672         ASSERT(mp->mutex_type == USYNC_THREAD);
2673
2674         /*
2675          * Optimize the case of no lock statistics and only a single thread.
2676          * (Most likely a traditional single-threaded application.)
2677          */
2678         if (udp->uberflags.uf_all == 0) {
2679                 /*
2680                  * Only one thread exists so there can be no waiters.
2681                  */
2682                 mp->mutex_owner = 0;
2683                 mp->mutex_lockword = 0;
2684                 DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2685         } else {
2686                 tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2687                 lwpid_t lwpid;
2688
2689                 if (msp)
2690                         (void) record_hold_time(msp);
2691                 if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
2692                         (void) __lwp_unpark(lwpid);
2693                         preempt(self);
2694                 }
2695         }
2696         exit_critical(self);
2697 }
2698
2699 /*
2700  * For specialized code in libc, like the asynchronous i/o code,
2701  * the following sig_*() locking primitives are used in order
2702  * to make the code asynchronous signal safe.  Signals are
2703  * deferred while locks acquired by these functions are held.
2704  */
2705 void
2706 sig_mutex_lock(mutex_t *mp)
2707 {
2708         ulwp_t *self = curthread;
2709
2710         sigoff(self);
2711         (void) mutex_lock(mp);
2712 }
2713
2714 void
2715 sig_mutex_unlock(mutex_t *mp)
2716 {
2717         ulwp_t *self = curthread;
2718
2719         (void) mutex_unlock(mp);
2720         sigon(self);
2721 }
2722
2723 int
2724 sig_mutex_trylock(mutex_t *mp)
2725 {
2726         ulwp_t *self = curthread;
2727         int error;
2728
2729         sigoff(self);
2730         if ((error = mutex_trylock(mp)) != 0)
2731                 sigon(self);
2732         return (error);
2733 }
2734
2735 /*
2736  * sig_cond_wait() is a cancellation point.
2737  */
2738 int
2739 sig_cond_wait(cond_t *cv, mutex_t *mp)
2740 {
2741         int error;
2742
2743         ASSERT(curthread->ul_sigdefer != 0);
2744         pthread_testcancel();
2745         error = __cond_wait(cv, mp);
2746         if (error == EINTR && curthread->ul_cursig) {
2747                 sig_mutex_unlock(mp);
2748                 /* take the deferred signal here */
2749                 sig_mutex_lock(mp);
2750         }
2751         pthread_testcancel();
2752         return (error);
2753 }
2754
2755 /*
2756  * sig_cond_reltimedwait() is a cancellation point.
2757  */
2758 int
2759 sig_cond_reltimedwait(cond_t *cv, mutex_t *mp, const timespec_t *ts)
2760 {
2761         int error;
2762
2763         ASSERT(curthread->ul_sigdefer != 0);
2764         pthread_testcancel();
2765         error = __cond_reltimedwait(cv, mp, ts);
2766         if (error == EINTR && curthread->ul_cursig) {
2767                 sig_mutex_unlock(mp);
2768                 /* take the deferred signal here */
2769                 sig_mutex_lock(mp);
2770         }
2771         pthread_testcancel();
2772         return (error);
2773 }
2774
2775 /*
2776  * For specialized code in libc, like the stdio code.
2777  * the following cancel_safe_*() locking primitives are used in
2778  * order to make the code cancellation-safe.  Cancellation is
2779  * deferred while locks acquired by these functions are held.
2780  */
2781 void
2782 cancel_safe_mutex_lock(mutex_t *mp)
2783 {
2784         (void) mutex_lock(mp);
2785         curthread->ul_libc_locks++;
2786 }
2787
2788 int
2789 cancel_safe_mutex_trylock(mutex_t *mp)
2790 {
2791         int error;
2792
2793         if ((error = mutex_trylock(mp)) == 0)
2794                 curthread->ul_libc_locks++;
2795         return (error);
2796 }
2797
2798 void
2799 cancel_safe_mutex_unlock(mutex_t *mp)
2800 {
2801         ulwp_t *self = curthread;
2802
2803         ASSERT(self->ul_libc_locks != 0);
2804
2805         (void) mutex_unlock(mp);
2806
2807         /*
2808          * Decrement the count of locks held by cancel_safe_mutex_lock().
2809          * If we are then in a position to terminate cleanly and
2810          * if there is a pending cancellation and cancellation
2811          * is not disabled and we received EINTR from a recent
2812          * system call then perform the cancellation action now.
2813          */
2814         if (--self->ul_libc_locks == 0 &&
2815             !(self->ul_vfork | self->ul_nocancel |
2816             self->ul_critical | self->ul_sigdefer) &&
2817             cancel_active())
2818                 pthread_exit(PTHREAD_CANCELED);
2819 }
2820
2821 static int
2822 shared_mutex_held(mutex_t *mparg)
2823 {
2824         /*
2825          * The 'volatile' is necessary to make sure the compiler doesn't
2826          * reorder the tests of the various components of the mutex.
2827          * They must be tested in this order:
2828          *      mutex_lockw
2829          *      mutex_owner
2830          *      mutex_ownerpid
2831          * This relies on the fact that everywhere mutex_lockw is cleared,
2832          * mutex_owner and mutex_ownerpid are cleared before mutex_lockw
2833          * is cleared, and that everywhere mutex_lockw is set, mutex_owner
2834          * and mutex_ownerpid are set after mutex_lockw is set, and that
2835          * mutex_lockw is set or cleared with a memory barrier.
2836          */
2837         volatile mutex_t *mp = (volatile mutex_t *)mparg;
2838         ulwp_t *self = curthread;
2839         uberdata_t *udp = self->ul_uberdata;
2840
2841         return (MUTEX_OWNED(mp, self) && mp->mutex_ownerpid == udp->pid);
2842 }
2843
2844 #pragma weak _mutex_held = mutex_held
2845 int
2846 mutex_held(mutex_t *mparg)
2847 {
2848         volatile mutex_t *mp = (volatile mutex_t *)mparg;
2849
2850         if (mparg->mutex_type & USYNC_PROCESS)
2851                 return (shared_mutex_held(mparg));
2852         return (MUTEX_OWNED(mp, curthread));
2853 }
2854
2855 #pragma weak pthread_mutex_destroy = mutex_destroy
2856 #pragma weak _mutex_destroy = mutex_destroy
2857 int
2858 mutex_destroy(mutex_t *mp)
2859 {
2860         if (mp->mutex_type & USYNC_PROCESS)
2861                 forget_lock(mp);
2862         (void) memset(mp, 0, sizeof (*mp));
2863         tdb_sync_obj_deregister(mp);
2864         return (0);
2865 }
2866
2867 #pragma weak pthread_mutex_consistent_np = mutex_consistent
2868 #pragma weak pthread_mutex_consistent = mutex_consistent
2869 int
2870 mutex_consistent(mutex_t *mp)
2871 {
2872         /*
2873          * Do this only for an inconsistent, initialized robust lock
2874          * that we hold.  For all other cases, return EINVAL.
2875          */
2876         if (mutex_held(mp) &&
2877             (mp->mutex_type & LOCK_ROBUST) &&
2878             (mp->mutex_flag & LOCK_INITED) &&
2879             (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
2880                 mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
2881                 mp->mutex_rcount = 0;
2882                 return (0);
2883         }
2884         return (EINVAL);
2885 }
2886
2887 /*
2888  * Spin locks are separate from ordinary mutexes,
2889  * but we use the same data structure for them.
2890  */
2891
2892 int
2893 pthread_spin_init(pthread_spinlock_t *lock, int pshared)
2894 {
2895         mutex_t *mp = (mutex_t *)lock;
2896
2897         (void) memset(mp, 0, sizeof (*mp));
2898         if (pshared == PTHREAD_PROCESS_SHARED)
2899                 mp->mutex_type = USYNC_PROCESS;
2900         else
2901                 mp->mutex_type = USYNC_THREAD;
2902         mp->mutex_flag = LOCK_INITED;
2903         mp->mutex_magic = MUTEX_MAGIC;
2904
2905         /*
2906          * This should be at the beginning of the function,
2907          * but for the sake of old broken applications that
2908          * do not have proper alignment for their mutexes
2909          * (and don't check the return code from pthread_spin_init),
2910          * we put it here, after initializing the mutex regardless.
2911          */
2912         if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
2913             curthread->ul_misaligned == 0)
2914                 return (EINVAL);
2915
2916         return (0);
2917 }
2918
2919 int
2920 pthread_spin_destroy(pthread_spinlock_t *lock)
2921 {
2922         (void) memset(lock, 0, sizeof (*lock));
2923         return (0);
2924 }
2925
2926 int
2927 pthread_spin_trylock(pthread_spinlock_t *lock)
2928 {
2929         mutex_t *mp = (mutex_t *)lock;
2930         ulwp_t *self = curthread;
2931         int error = 0;
2932
2933         no_preempt(self);
2934         if (set_lock_byte(&mp->mutex_lockw) != 0)
2935                 error = EBUSY;
2936         else {
2937                 mp->mutex_owner = (uintptr_t)self;
2938                 if (mp->mutex_type == USYNC_PROCESS)
2939                         mp->mutex_ownerpid = self->ul_uberdata->pid;
2940                 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2941         }
2942         preempt(self);
2943         return (error);
2944 }
2945
/*
 * Acquire the spin lock, spinning for as long as it takes.
 * Always returns 0.
 */
int
pthread_spin_lock(pthread_spinlock_t *lock)
{
        mutex_t *mp = (mutex_t *)lock;
        ulwp_t *self = curthread;
        /* Volatile so that the lock byte is re-read on every iteration. */
        volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
        /* Number of failed iterations; reported to the plockstat probes. */
        int count = 0;

        ASSERT(!self->ul_critical || self->ul_bindflags);

        DTRACE_PROBE1(plockstat, mutex__spin, mp);

        /*
         * We don't care whether the owner is running on a processor.
         * We just spin because that's what this interface requires.
         */
        for (;;) {
                if (*lockp == 0) {      /* lock byte appears to be clear */
                        /*
                         * Preemption is turned off before the attempt.
                         * On success we leave the loop with it still
                         * off; it is turned back on below, after the
                         * owner has been recorded.
                         */
                        no_preempt(self);
                        if (set_lock_byte(lockp) == 0)
                                break;
                        preempt(self);
                }
                /* Saturate rather than overflow the iteration counter. */
                if (count < INT_MAX)
                        count++;
                SMT_PAUSE();
        }
        mp->mutex_owner = (uintptr_t)self;
        if (mp->mutex_type == USYNC_PROCESS)
                mp->mutex_ownerpid = self->ul_uberdata->pid;
        preempt(self);
        /* The "spun" probe fires only if at least one iteration failed. */
        if (count) {
                DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
        }
        DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
        return (0);
}
2983
2984 int
2985 pthread_spin_unlock(pthread_spinlock_t *lock)
2986 {
2987         mutex_t *mp = (mutex_t *)lock;
2988         ulwp_t *self = curthread;
2989
2990         no_preempt(self);
2991         mp->mutex_owner = 0;
2992         mp->mutex_ownerpid = 0;
2993         DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2994         (void) atomic_swap_32(&mp->mutex_lockword, 0);
2995         preempt(self);
2996         return (0);
2997 }
2998
2999 #define INITIAL_LOCKS   8       /* initial size of ul_heldlocks.array */
3000
3001 /*
3002  * Find/allocate an entry for 'lock' in our array of held locks.
3003  */
3004 static mutex_t **
3005 find_lock_entry(mutex_t *lock)
3006 {
3007         ulwp_t *self = curthread;
3008         mutex_t **remembered = NULL;
3009         mutex_t **lockptr;
3010         uint_t nlocks;
3011
3012         if ((nlocks = self->ul_heldlockcnt) != 0)
3013                 lockptr = self->ul_heldlocks.array;
3014         else {
3015                 nlocks = 1;
3016                 lockptr = &self->ul_heldlocks.single;
3017         }
3018
3019         for (; nlocks; nlocks--, lockptr++) {
3020                 if (*lockptr == lock)
3021                         return (lockptr);
3022                 if (*lockptr == NULL && remembered == NULL)
3023                         remembered = lockptr;
3024         }
3025         if (remembered != NULL) {
3026                 *remembered = lock;
3027                 return (remembered);
3028         }
3029
3030         /*
3031          * No entry available.  Allocate more space, converting
3032          * the single entry into an array of entries if necessary.
3033          */
3034         if ((nlocks = self->ul_heldlockcnt) == 0) {
3035                 /*
3036                  * Initial allocation of the array.
3037                  * Convert the single entry into an array.
3038                  */
3039                 self->ul_heldlockcnt = nlocks = INITIAL_LOCKS;
3040                 lockptr = lmalloc(nlocks * sizeof (mutex_t *));
3041                 /*
3042                  * The single entry becomes the first entry in the array.
3043                  */
3044                 *lockptr = self->ul_heldlocks.single;
3045                 self->ul_heldlocks.array = lockptr;
3046                 /*
3047                  * Return the next available entry in the array.
3048                  */
3049                 *++lockptr = lock;
3050                 return (lockptr);
3051         }
3052         /*
3053          * Reallocate the array, double the size each time.
3054          */
3055         lockptr = lmalloc(nlocks * 2 * sizeof (mutex_t *));
3056         (void) memcpy(lockptr, self->ul_heldlocks.array,
3057             nlocks * sizeof (mutex_t *));
3058         lfree(self->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
3059         self->ul_heldlocks.array = lockptr;
3060         self->ul_heldlockcnt *= 2;
3061         /*
3062          * Return the next available entry in the newly allocated array.
3063          */
3064         *(lockptr += nlocks) = lock;
3065         return (lockptr);
3066 }
3067
3068 /*
3069  * Insert 'lock' into our list of held locks.
3070  * Currently only used for LOCK_ROBUST mutexes.
3071  */
3072 void
3073 remember_lock(mutex_t *lock)
3074 {
3075         (void) find_lock_entry(lock);
3076 }
3077
3078 /*
3079  * Remove 'lock' from our list of held locks.
3080  * Currently only used for LOCK_ROBUST mutexes.
3081  */
3082 void
3083 forget_lock(mutex_t *lock)
3084 {
3085         *find_lock_entry(lock) = NULL;
3086 }
3087
3088 /*
3089  * Free the array of held locks.
3090  */
3091 void
3092 heldlock_free(ulwp_t *ulwp)
3093 {
3094         uint_t nlocks;
3095
3096         if ((nlocks = ulwp->ul_heldlockcnt) != 0)
3097                 lfree(ulwp->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
3098         ulwp->ul_heldlockcnt = 0;
3099         ulwp->ul_heldlocks.array = NULL;
3100 }
3101
3102 /*
3103  * Mark all held LOCK_ROBUST mutexes LOCK_OWNERDEAD.
3104  * Called from _thrp_exit() to deal with abandoned locks.
3105  */
3106 void
3107 heldlock_exit(void)
3108 {
3109         ulwp_t *self = curthread;
3110         mutex_t **lockptr;
3111         uint_t nlocks;
3112         mutex_t *mp;
3113
3114         if ((nlocks = self->ul_heldlockcnt) != 0)
3115                 lockptr = self->ul_heldlocks.array;
3116         else {
3117                 nlocks = 1;
3118                 lockptr = &self->ul_heldlocks.single;
3119         }
3120
3121         for (; nlocks; nlocks--, lockptr++) {
3122                 /*
3123                  * The kernel takes care of transitioning held
3124                  * LOCK_PRIO_INHERIT mutexes to LOCK_OWNERDEAD.
3125                  * We avoid that case here.
3126                  */
3127                 if ((mp = *lockptr) != NULL &&
3128                     mutex_held(mp) &&
3129                     (mp->mutex_type & (LOCK_ROBUST | LOCK_PRIO_INHERIT)) ==
3130                     LOCK_ROBUST) {
3131                         mp->mutex_rcount = 0;
3132                         if (!(mp->mutex_flag & LOCK_UNMAPPED))
3133                                 mp->mutex_flag |= LOCK_OWNERDEAD;
3134                         (void) mutex_unlock_internal(mp, 1);
3135                 }
3136         }
3137
3138         heldlock_free(self);
3139 }
3140
3141 #pragma weak _cond_init = cond_init
3142 /* ARGSUSED2 */
3143 int
3144 cond_init(cond_t *cvp, int type, void *arg)
3145 {
3146         if (type != USYNC_THREAD && type != USYNC_PROCESS)
3147                 return (EINVAL);
3148         (void) memset(cvp, 0, sizeof (*cvp));
3149         cvp->cond_type = (uint16_t)type;
3150         cvp->cond_magic = COND_MAGIC;
3151
3152         /*
3153          * This should be at the beginning of the function,
3154          * but for the sake of old broken applications that
3155          * do not have proper alignment for their condvars
3156          * (and don't check the return code from cond_init),
3157          * we put it here, after initializing the condvar regardless.
3158          */
3159         if (((uintptr_t)cvp & (_LONG_LONG_ALIGNMENT - 1)) &&
3160             curthread->ul_misaligned == 0)
3161                 return (EINVAL);
3162
3163         return (0);
3164 }
3165
/*
 * cond_sleep_queue(): utility function for cond_wait_queue().
 *
 * Go to sleep on a condvar sleep queue, expect to be waked up
 * by someone calling cond_signal() or cond_broadcast() or due
 * to receiving a UNIX signal or being cancelled, or just simply
 * due to a spurious wakeup (like someone calling forkall()).
 *
 * The associated mutex is *not* reacquired before returning.
 * That must be done by the caller of cond_sleep_queue().
 *
 * cvp  the condition variable to sleep on
 * mp   the mutex associated with the wait; it is released here
 * tsp  timeout handed to __lwp_park(), or NULL for no timeout
 *
 * Returns the result of the last __lwp_park() call.
 */
static int
cond_sleep_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
{
        ulwp_t *self = curthread;
        queue_head_t *qp;
        queue_head_t *mqp;
        lwpid_t lwpid;
        int signalled;
        int error;
        int cv_wake;
        int release_all;

        /*
         * Put ourself on the CV sleep queue, unlock the mutex, then
         * park ourself and unpark a candidate lwp to grab the mutex.
         * We must go onto the CV sleep queue before dropping the
         * mutex in order to guarantee atomicity of the operation.
         */
        self->ul_sp = stkptr();
        qp = queue_lock(cvp, CV);
        enqueue(qp, self, 0);
        cvp->cond_waiters_user = 1;
        self->ul_cvmutex = mp;
        /*
         * cv_wake is set exactly when a timeout was supplied.  When it
         * is set, the mutex sleep queue is never locked or examined in
         * this function, so 'mqp' is only assigned and used when
         * cv_wake is zero.
         */
        self->ul_cv_wake = cv_wake = (tsp != NULL);
        self->ul_signalled = 0;
        /* Releasing a mutex still marked LOCK_OWNERDEAD makes it unrecoverable. */
        if (mp->mutex_flag & LOCK_OWNERDEAD) {
                mp->mutex_flag &= ~LOCK_OWNERDEAD;
                mp->mutex_flag |= LOCK_NOTRECOVERABLE;
        }
        release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);
        lwpid = mutex_unlock_queue(mp, release_all);
        for (;;) {
                set_parking_flag(self, 1);
                queue_unlock(qp);
                if (lwpid != 0) {
                        lwpid = preempt_unpark(self, lwpid);
                        preempt(self);
                }
                /*
                 * We may have a deferred signal present,
                 * in which case we should return EINTR.
                 * Also, we may have received a SIGCANCEL; if so
                 * and we are cancelable we should return EINTR.
                 * We force an immediate EINTR return from
                 * __lwp_park() by turning our parking flag off.
                 */
                if (self->ul_cursig != 0 ||
                    (self->ul_cancelable && self->ul_cancel_pending))
                        set_parking_flag(self, 0);
                /*
                 * __lwp_park() will return the residual time in tsp
                 * if we are unparked before the timeout expires.
                 */
                error = __lwp_park(tsp, lwpid);
                set_parking_flag(self, 0);
                lwpid = 0;      /* unpark the other lwp only once */
                /*
                 * We were waked up by cond_signal(), cond_broadcast(),
                 * by an interrupt or timeout (EINTR or ETIME),
                 * or we may just have gotten a spurious wakeup.
                 */
                qp = queue_lock(cvp, CV);
                if (!cv_wake)
                        mqp = queue_lock(mp, MX);
                /* No longer on any sleep queue: the wait is over. */
                if (self->ul_sleepq == NULL)
                        break;
                /*
                 * We are on either the condvar sleep queue or the
                 * mutex sleep queue.  Break out of the sleep if we
                 * were interrupted or we timed out (EINTR or ETIME).
                 * Else this is a spurious wakeup; continue the loop.
                 */
                if (!cv_wake && self->ul_sleepq == mqp) { /* mutex queue */
                        if (error) {
                                mp->mutex_waiters = dequeue_self(mqp);
                                break;
                        }
                        tsp = NULL;     /* no more timeout */
                } else if (self->ul_sleepq == qp) {     /* condvar queue */
                        if (error) {
                                cvp->cond_waiters_user = dequeue_self(qp);
                                break;
                        }
                        /*
                         * Else a spurious wakeup on the condvar queue.
                         * __lwp_park() has already adjusted the timeout.
                         */
                } else {
                        thr_panic("cond_sleep_queue(): thread not on queue");
                }
                /*
                 * Going around again: the mutex queue lock is dropped
                 * here, while the condvar queue lock (qp) is dropped
                 * at the top of the loop.
                 */
                if (!cv_wake)
                        queue_unlock(mqp);
        }

        /*
         * Every 'break' above leaves the loop with qp locked and, when
         * cv_wake is zero, mqp locked as well; both are released below.
         */
        self->ul_sp = 0;
        self->ul_cv_wake = 0;
        ASSERT(self->ul_cvmutex == NULL);
        ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
            self->ul_wchan == NULL);

        signalled = self->ul_signalled;
        self->ul_signalled = 0;
        queue_unlock(qp);
        if (!cv_wake)
                queue_unlock(mqp);

        /*
         * If we were concurrently cond_signal()d and any of:
         * received a UNIX signal, were cancelled, or got a timeout,
         * then perform another cond_signal() to avoid consuming it.
         */
        if (error && signalled)
                (void) cond_signal(cvp);

        return (error);
}
3293
3294 static void
3295 cond_wait_check_alignment(cond_t *cvp, mutex_t *mp)
3296 {
3297         if ((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1))
3298                 lock_error(mp, "cond_wait", cvp, "mutex is misaligned");
3299         if ((uintptr_t)cvp & (_LONG_LONG_ALIGNMENT - 1))
3300                 lock_error(mp, "cond_wait", cvp, "condvar is misaligned");
3301 }
3302
3303 int
3304 cond_wait_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3305 {
3306         ulwp_t *self = curthread;
3307         int error;
3308         int merror;
3309
3310         if (self->ul_error_detection && self->ul_misaligned == 0)
3311                 cond_wait_check_alignment(cvp, mp);
3312
3313         /*
3314          * The old thread library was programmed to defer signals
3315          * while in cond_wait() so that the associated mutex would
3316          * be guaranteed to be held when the application signal
3317          * handler was invoked.
3318          *
3319          * We do not behave this way by default; the state of the
3320          * associated mutex in the signal handler is undefined.
3321          *
3322          * To accommodate applications that depend on the old
3323          * behavior, the _THREAD_COND_WAIT_DEFER environment
3324          * variable can be set to 1 and we will behave in the
3325          * old way with respect to cond_wait().
3326          */
3327         if (self->ul_cond_wait_defer)
3328                 sigoff(self);
3329
3330         error = cond_sleep_queue(cvp, mp, tsp);
3331
3332         /*
3333          * Reacquire the mutex.
3334          */
3335         if ((merror = mutex_lock_impl(mp, NULL)) != 0)
3336                 error = merror;
3337
3338         /*
3339          * Take any deferred signal now, after we have reacquired the mutex.
3340          */
3341         if (self->ul_cond_wait_defer)
3342                 sigon(self);
3343
3344         return (error);
3345 }
3346
/*
 * cond_sleep_kernel(): utility function for cond_wait_kernel().
 * See the comment ahead of cond_sleep_queue(), above.
 *
 * Gives up the user-level ownership state of mp and sleeps on cvp in
 * ___lwp_cond_wait().  The mutex is NOT reacquired here; the caller
 * (cond_wait_kernel()) does that after we return.
 *
 * Returns whatever ___lwp_cond_wait() returns.
 *
 * The order of the statements below matters: signals are turned off
 * before the owner field is cleared and are turned back on only after
 * the parking flag has been cleared following the system call.
 */
static int
cond_sleep_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
{
        int mtype = mp->mutex_type;
        ulwp_t *self = curthread;
        int error;

        /*
         * For a LOCK_PRIO_PROTECT mutex, take it off our ceiling list
         * and waive the ceiling priority if _ceil_mylist_del() says so.
         */
        if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
                _ceil_prio_waive();

        /* Record where we are sleeping and on what, then hold off signals. */
        self->ul_sp = stkptr();
        self->ul_wchan = cvp;
        sigoff(self);
        mp->mutex_owner = 0;
        /* mp->mutex_ownerpid is cleared by ___lwp_cond_wait() */
        if (mtype & LOCK_PRIO_INHERIT) {
                /* Drop the lock word here and account for one fewer PI lock. */
                mp->mutex_lockw = LOCKCLEAR;
                self->ul_pilocks--;
        }
        /*
         * ___lwp_cond_wait() returns immediately with EINTR if
         * set_parking_flag(self,0) is called on this lwp before it
         * goes to sleep in the kernel.  sigacthandler() calls this
         * when a deferred signal is noted.  This assures that we don't
         * get stuck in ___lwp_cond_wait() with all signals blocked
         * due to taking a deferred signal before going to sleep.
         */
        set_parking_flag(self, 1);
        /*
         * A signal is already pending (ul_cursig) or a cancellation is
         * pending while we are cancelable: clear the flag ourselves so
         * that the system call below returns right away.
         */
        if (self->ul_cursig != 0 ||
            (self->ul_cancelable && self->ul_cancel_pending))
                set_parking_flag(self, 0);
        /* NOTE(review): meaning of the final argument (1) is not visible here. */
        error = ___lwp_cond_wait(cvp, mp, tsp, 1);
        set_parking_flag(self, 0);
        sigon(self);
        /* We are no longer sleeping; clear the sleep bookkeeping. */
        self->ul_sp = 0;
        self->ul_wchan = NULL;
        return (error);
}
3389
3390 int
3391 cond_wait_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3392 {
3393         ulwp_t *self = curthread;
3394         int error;
3395         int merror;
3396
3397         if (self->ul_error_detection && self->ul_misaligned == 0)
3398                 cond_wait_check_alignment(cvp, mp);
3399
3400         /*
3401          * See the large comment in cond_wait_queue(), above.
3402          */
3403         if (self->ul_cond_wait_defer)
3404                 sigoff(self);
3405
3406         error = cond_sleep_kernel(cvp, mp, tsp);
3407
3408         /*
3409          * Override the return code from ___lwp_cond_wait()
3410          * with any non-zero return code from mutex_lock().
3411          * This addresses robust lock failures in particular;
3412          * the caller must see the EOWNERDEAD or ENOTRECOVERABLE
3413          * errors in order to take corrective action.
3414          */
3415         if ((merror = mutex_lock_impl(mp, NULL)) != 0)
3416                 error = merror;
3417
3418         /*
3419          * Take any deferred signal now, after we have reacquired the mutex.
3420          */
3421         if (self->ul_cond_wait_defer)
3422                 sigon(self);
3423
3424         return (error);
3425 }
3426
3427 /*
3428  * Common code for cond_wait() and cond_timedwait()
3429  */
3430 int
3431 cond_wait_common(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3432 {
3433         int mtype = mp->mutex_type;
3434         hrtime_t begin_sleep = 0;
3435         ulwp_t *self = curthread;
3436         uberdata_t *udp = self->ul_uberdata;
3437         tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3438         tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
3439         uint8_t rcount;
3440         int error = 0;
3441
3442         /*
3443          * The SUSV3 Posix spec for pthread_cond_timedwait() states:
3444          *      Except in the case of [ETIMEDOUT], all these error checks
3445          *      shall act as if they were performed immediately at the
3446          *      beginning of processing for the function and shall cause
3447          *      an error return, in effect, prior to modifying the state
3448          *      of the mutex specified by mutex or the condition variable
3449          *      specified by cond.
3450          * Therefore, we must return EINVAL now if the timout is invalid.
3451          */
3452         if (tsp != NULL &&
3453             (tsp->tv_sec < 0 || (ulong_t)tsp->tv_nsec >= NANOSEC))
3454                 return (EINVAL);
3455
3456         if (__td_event_report(self, TD_SLEEP, udp)) {
3457                 self->ul_sp = stkptr();
3458                 self->ul_wchan = cvp;
3459                 self->ul_td_evbuf.eventnum = TD_SLEEP;
3460                 self->ul_td_evbuf.eventdata = cvp;
3461                 tdb_event(TD_SLEEP, udp);
3462                 self->ul_sp = 0;
3463         }
3464         if (csp) {
3465                 if (tsp)
3466                         tdb_incr(csp->cond_timedwait);
3467                 else
3468                         tdb_incr(csp->cond_wait);
3469         }
3470         if (msp)
3471                 begin_sleep = record_hold_time(msp);
3472         else if (csp)
3473                 begin_sleep = gethrtime();
3474
3475         if (self->ul_error_detection) {
3476                 if (!mutex_held(mp))
3477                         lock_error(mp, "cond_wait", cvp, NULL);
3478                 if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0)
3479                         lock_error(mp, "recursive mutex in cond_wait",
3480                             cvp, NULL);
3481                 if (cvp->cond_type & USYNC_PROCESS) {
3482                         if (!(mtype & USYNC_PROCESS))
3483                                 lock_error(mp, "cond_wait", cvp,
3484                                     "condvar process-shared, "
3485                                     "mutex process-private");
3486                 } else {
3487                         if (mtype & USYNC_PROCESS)
3488                                 lock_error(mp, "cond_wait", cvp,
3489                                     "condvar process-private, "
3490                                     "mutex process-shared");
3491                 }
3492         }
3493
3494         /*
3495          * We deal with recursive mutexes by completely
3496          * dropping the lock and restoring the recursion
3497          * count after waking up.  This is arguably wrong,
3498          * but it obeys the principle of least astonishment.
3499          */
3500         rcount = mp->mutex_rcount;
3501         mp->mutex_rcount = 0;
3502         if ((mtype &
3503             (USYNC_PROCESS | LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT)) |
3504             (cvp->cond_type & USYNC_PROCESS))
3505                 error = cond_wait_kernel(cvp, mp, tsp);
3506         else
3507                 error = cond_wait_queue(cvp, mp, tsp);
3508         mp->mutex_rcount = rcount;
3509
3510         if (csp) {
3511                 hrtime_t lapse = gethrtime() - begin_sleep;
3512                 if (tsp == NULL)
3513                         csp->cond_wait_sleep_time += lapse;
3514                 else {
3515                         csp->cond_timedwait_sleep_time += lapse;
3516                         if (error == ETIME)
3517                                 tdb_incr(csp->cond_timedwait_timeout);
3518                 }
3519         }
3520         return (error);
3521 }
3522
3523 /*
3524  * cond_wait() is a cancellation point but __cond_wait() is not.
3525  * Internally, libc calls the non-cancellation version.
3526  * Other libraries need to use pthread_setcancelstate(), as appropriate,
3527  * since __cond_wait() is not exported from libc.
3528  */
3529 int
3530 __cond_wait(cond_t *cvp, mutex_t *mp)
3531 {
3532         ulwp_t *self = curthread;
3533         uberdata_t *udp = self->ul_uberdata;
3534         uberflags_t *gflags;
3535
3536         if ((mp->mutex_type & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
3537             !mutex_held(mp))
3538                 return (EPERM);
3539
3540         /*
3541          * Optimize the common case of USYNC_THREAD plus
3542          * no error detection, no lock statistics, and no event tracing.
3543          */
3544         if ((gflags = self->ul_schedctl_called) != NULL &&
3545             (cvp->cond_type | mp->mutex_type | gflags->uf_trs_ted |
3546             self->ul_td_events_enable |
3547             udp->tdb.tdb_ev_global_mask.event_bits[0]) == 0)
3548                 return (cond_wait_queue(cvp, mp, NULL));
3549
3550         /*
3551          * Else do it the long way.
3552          */
3553         return (cond_wait_common(cvp, mp, NULL));
3554 }
3555
3556 #pragma weak _cond_wait = cond_wait
3557 int
3558 cond_wait(cond_t *cvp, mutex_t *mp)
3559 {
3560         int error;
3561
3562         _cancelon();
3563         error = __cond_wait(cvp, mp);
3564         if (error == EINTR)
3565                 _canceloff();
3566         else
3567                 _canceloff_nocancel();
3568         return (error);
3569 }
3570
3571 /*
3572  * pthread_cond_wait() is a cancellation point.
3573  */
3574 int
3575 pthread_cond_wait(pthread_cond_t *_RESTRICT_KYWD cvp,
3576         pthread_mutex_t *_RESTRICT_KYWD mp)
3577 {
3578         int error;
3579
3580         error = cond_wait((cond_t *)cvp, (mutex_t *)mp);
3581         return ((error == EINTR)? 0 : error);
3582 }
3583
3584 /*
3585  * cond_timedwait() is a cancellation point but __cond_timedwait() is not.
3586  */
3587 int
3588 __cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3589 {
3590         clockid_t clock_id = cvp->cond_clockid;
3591         timespec_t reltime;
3592         int error;
3593
3594         if ((mp->mutex_type & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
3595             !mutex_held(mp))
3596                 return (EPERM);
3597
3598         if (clock_id != CLOCK_REALTIME && clock_id != CLOCK_HIGHRES)
3599                 clock_id = CLOCK_REALTIME;
3600         abstime_to_reltime(clock_id, abstime, &reltime);
3601         error = cond_wait_common(cvp, mp, &reltime);
3602         if (error == ETIME && clock_id == CLOCK_HIGHRES) {
3603                 /*
3604                  * Don't return ETIME if we didn't really get a timeout.
3605                  * This can happen if we return because someone resets
3606                  * the system clock.  Just return zero in this case,
3607                  * giving a spurious wakeup but not a timeout.
3608                  */
3609                 if ((hrtime_t)(uint32_t)abstime->tv_sec * NANOSEC +
3610                     abstime->tv_nsec > gethrtime())
3611                         error = 0;
3612         }
3613         return (error);
3614 }
3615
3616 int
3617 cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3618 {
3619         int error;
3620
3621         _cancelon();
3622         error = __cond_timedwait(cvp, mp, abstime);
3623         if (error == EINTR)
3624                 _canceloff();
3625         else
3626                 _canceloff_nocancel();
3627         return (error);
3628 }
3629
3630 /*
3631  * pthread_cond_timedwait() is a cancellation point.
3632  */
3633 int
3634 pthread_cond_timedwait(pthread_cond_t *_RESTRICT_KYWD cvp,
3635         pthread_mutex_t *_RESTRICT_KYWD mp,
3636         const struct timespec *_RESTRICT_KYWD abstime)
3637 {
3638         int error;
3639
3640         error = cond_timedwait((cond_t *)cvp, (mutex_t *)mp, abstime);
3641         if (error == ETIME)
3642                 error = ETIMEDOUT;
3643         else if (error == EINTR)
3644                 error = 0;
3645         return (error);
3646 }
3647
3648 /*
3649  * cond_reltimedwait() is a cancellation point but __cond_reltimedwait() is not.
3650  */
3651 int
3652 __cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
3653 {
3654         timespec_t tslocal = *reltime;
3655
3656         if ((mp->mutex_type & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
3657             !mutex_held(mp))
3658                 return (EPERM);
3659
3660         return (cond_wait_common(cvp, mp, &tslocal));
3661 }
3662
3663 int
3664 cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
3665 {
3666         int error;
3667
3668         _cancelon();
3669         error = __cond_reltimedwait(cvp, mp, reltime);
3670         if (error == EINTR)
3671                 _canceloff();
3672         else
3673                 _canceloff_nocancel();
3674         return (error);
3675 }
3676
3677 int
3678 pthread_cond_reltimedwait_np(pthread_cond_t *_RESTRICT_KYWD cvp,
3679         pthread_mutex_t *_RESTRICT_KYWD mp,
3680         const struct timespec *_RESTRICT_KYWD reltime)
3681 {
3682         int error;
3683
3684         error = cond_reltimedwait((cond_t *)cvp, (mutex_t *)mp, reltime);
3685         if (error == ETIME)
3686                 error = ETIMEDOUT;
3687         else if (error == EINTR)
3688                 error = 0;
3689         return (error);
3690 }
3691
3692 #pragma weak pthread_cond_signal = cond_signal
3693 #pragma weak _cond_signal = cond_signal
3694 int
3695 cond_signal(cond_t *cvp)
3696 {
3697         ulwp_t *self = curthread;
3698         uberdata_t *udp = self->ul_uberdata;
3699         tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3700         int error = 0;
3701         int more;
3702         lwpid_t lwpid;
3703         queue_head_t *qp;
3704         mutex_t *mp;
3705         queue_head_t *mqp;
3706         ulwp_t **ulwpp;
3707         ulwp_t *ulwp;
3708         ulwp_t *prev;
3709
3710         if (csp)
3711                 tdb_incr(csp->cond_signal);
3712
3713         if (cvp->cond_waiters_kernel)   /* someone sleeping in the kernel? */
3714                 error = _lwp_cond_signal(cvp);
3715
3716         if (!cvp->cond_waiters_user)    /* no one sleeping at user-level */
3717                 return (error);
3718
3719         /*
3720          * Move someone from the condvar sleep queue to the mutex sleep
3721          * queue for the mutex that he will acquire on being waked up.
3722          * We can do this only if we own the mutex he will acquire.
3723          * If we do not own the mutex, or if his ul_cv_wake flag
3724          * is set, just dequeue and unpark him.
3725          */
3726         qp = queue_lock(cvp, CV);
3727         ulwpp = queue_slot(qp, &prev, &more);
3728         cvp->cond_waiters_user = more;
3729         if (ulwpp == NULL) {    /* no one on the sleep queue */
3730                 queue_unlock(qp);
3731                 return (error);
3732         }
3733         ulwp = *ulwpp;
3734
3735         /*
3736          * Inform the thread that he was the recipient of a cond_signal().
3737          * This lets him deal with cond_signal() and, concurrently,
3738          * one or more of a cancellation, a UNIX signal, or a timeout.
3739          * These latter conditions must not consume a cond_signal().
3740          */
3741         ulwp->ul_signalled = 1;
3742
3743         /*
3744          * Dequeue the waiter but leave his ul_sleepq non-NULL
3745          * while we move him to the mutex queue so that he can
3746          * deal properly with spurious wakeups.
3747          */
3748         queue_unlink(qp, ulwpp, prev);
3749
3750         mp = ulwp->ul_cvmutex;          /* the mutex he will acquire */
3751         ulwp->ul_cvmutex = NULL;
3752         ASSERT(mp != NULL);
3753
3754         if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
3755                 /* just wake him up */
3756                 lwpid = ulwp->ul_lwpid;
3757                 no_preempt(self);
3758                 ulwp->ul_sleepq = NULL;
3759                 ulwp->ul_wchan = NULL;
3760                 queue_unlock(qp);
3761                 (void) __lwp_unpark(lwpid);
3762                 preempt(self);
3763         } else {
3764                 /* move him to the mutex queue */
3765                 mqp = queue_lock(mp, MX);
3766                 enqueue(mqp, ulwp, 0);
3767                 mp->mutex_waiters = 1;
3768                 queue_unlock(mqp);
3769                 queue_unlock(qp);
3770         }
3771
3772         return (error);
3773 }
3774
3775 /*
3776  * Utility function called by mutex_wakeup_all(), cond_broadcast(),
3777  * and rw_queue_release() to (re)allocate a big buffer to hold the
3778  * lwpids of all the threads to be set running after they are removed
3779  * from their sleep queues.  Since we are holding a queue lock, we
3780  * cannot call any function that might acquire a lock.  mmap(), munmap(),
3781  * lwp_unpark_all() are simple system calls and are safe in this regard.
3782  */
3783 lwpid_t *
3784 alloc_lwpids(lwpid_t *lwpid, int *nlwpid_ptr, int *maxlwps_ptr)
3785 {
3786         /*
3787          * Allocate NEWLWPS ids on the first overflow.
3788          * Double the allocation each time after that.
3789          */
3790         int nlwpid = *nlwpid_ptr;
3791         int maxlwps = *maxlwps_ptr;
3792         int first_allocation;
3793         int newlwps;
3794         void *vaddr;
3795
3796         ASSERT(nlwpid == maxlwps);
3797
3798         first_allocation = (maxlwps == MAXLWPS);
3799         newlwps = first_allocation? NEWLWPS : 2 * maxlwps;
3800         vaddr = mmap(NULL, newlwps * sizeof (lwpid_t),
3801             PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
3802
3803         if (vaddr == MAP_FAILED) {
3804                 /*
3805                  * Let's hope this never happens.
3806                  * If it does, then we have a terrible
3807                  * thundering herd on our hands.
3808                  */
3809                 (void) __lwp_unpark_all(lwpid, nlwpid);
3810                 *nlwpid_ptr = 0;
3811         } else {
3812                 (void) memcpy(vaddr, lwpid, maxlwps * sizeof (lwpid_t));
3813                 if (!first_allocation)
3814                         (void) munmap((caddr_t)lwpid,
3815                             maxlwps * sizeof (lwpid_t));
3816                 lwpid = vaddr;
3817                 *maxlwps_ptr = newlwps;
3818         }
3819
3820         return (lwpid);
3821 }
3822
3823 #pragma weak pthread_cond_broadcast = cond_broadcast
3824 #pragma weak _cond_broadcast = cond_broadcast
3825 int
3826 cond_broadcast(cond_t *cvp)
3827 {
3828         ulwp_t *self = curthread;
3829         uberdata_t *udp = self->ul_uberdata;
3830         tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3831         int error = 0;
3832         queue_head_t *qp;
3833         queue_root_t *qrp;
3834         mutex_t *mp;
3835         mutex_t *mp_cache = NULL;
3836         queue_head_t *mqp = NULL;
3837         ulwp_t *ulwp;
3838         int nlwpid = 0;
3839         int maxlwps = MAXLWPS;
3840         lwpid_t buffer[MAXLWPS];
3841         lwpid_t *lwpid = buffer;
3842
3843         if (csp)
3844                 tdb_incr(csp->cond_broadcast);
3845
3846         if (cvp->cond_waiters_kernel)   /* someone sleeping in the kernel? */
3847                 error = _lwp_cond_broadcast(cvp);
3848
3849         if (!cvp->cond_waiters_user)    /* no one sleeping at user-level */
3850                 return (error);
3851
3852         /*
3853          * Move everyone from the condvar sleep queue to the mutex sleep
3854          * queue for the mutex that they will acquire on being waked up.
3855          * We can do this only if we own the mutex they will acquire.
3856          * If we do not own the mutex, or if their ul_cv_wake flag
3857          * is set, just dequeue and unpark them.
3858          *
3859          * We keep track of lwpids that are to be unparked in lwpid[].
3860          * __lwp_unpark_all() is called to unpark all of them after
3861          * they have been removed from the sleep queue and the sleep
3862          * queue lock has been dropped.  If we run out of space in our
3863          * on-stack buffer, we need to allocate more but we can't call
3864          * lmalloc() because we are holding a queue lock when the overflow
3865          * occurs and lmalloc() acquires a lock.  We can't use alloca()
3866          * either because the application may have allocated a small
3867          * stack and we don't want to overrun the stack.  So we call
3868          * alloc_lwpids() to allocate a bigger buffer using the mmap()
3869          * system call directly since that path acquires no locks.
3870          */
3871         qp = queue_lock(cvp, CV);
3872         cvp->cond_waiters_user = 0;
3873         for (;;) {
3874                 if ((qrp = qp->qh_root) == NULL ||
3875                     (ulwp = qrp->qr_head) == NULL)
3876                         break;
3877                 ASSERT(ulwp->ul_wchan == cvp);
3878                 queue_unlink(qp, &qrp->qr_head, NULL);
3879                 mp = ulwp->ul_cvmutex;          /* his mutex */
3880                 ulwp->ul_cvmutex = NULL;
3881                 ASSERT(mp != NULL);
3882                 if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
3883                         /* just wake him up */
3884                         ulwp->ul_sleepq = NULL;
3885                         ulwp->ul_wchan = NULL;
3886                         if (nlwpid == maxlwps)
3887                                 lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
3888                         lwpid[nlwpid++] = ulwp->ul_lwpid;
3889                 } else {
3890                         /* move him to the mutex queue */
3891                         if (mp != mp_cache) {
3892                                 mp_cache = mp;
3893                                 if (mqp != NULL)
3894                                         queue_unlock(mqp);
3895                                 mqp = queue_lock(mp, MX);
3896                         }
3897                         enqueue(mqp, ulwp, 0);
3898                         mp->mutex_waiters = 1;
3899                 }
3900         }
3901         if (mqp != NULL)
3902                 queue_unlock(mqp);
3903         if (nlwpid == 0) {
3904                 queue_unlock(qp);
3905         } else {
3906                 no_preempt(self);
3907                 queue_unlock(qp);
3908                 if (nlwpid == 1)
3909                         (void) __lwp_unpark(lwpid[0]);
3910                 else
3911                         (void) __lwp_unpark_all(lwpid, nlwpid);
3912                 preempt(self);
3913         }
3914         if (lwpid != buffer)
3915                 (void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
3916         return (error);
3917 }
3918
/*
 * Destroy a condition variable: clear its cond_magic field and
 * deregister it with tdb_sync_obj_deregister().
 * No check for remaining waiters is made here.
 * Always returns 0.
 */
#pragma weak pthread_cond_destroy = cond_destroy
int
cond_destroy(cond_t *cvp)
{
        cvp->cond_magic = 0;
        tdb_sync_obj_deregister(cvp);
        return (0);
}
3927
3928 #if defined(THREAD_DEBUG)
/*
 * Debug-only check (compiled under THREAD_DEBUG): assert that the
 * calling thread either has ul_critical equal to zero or has
 * ul_bindflags set.
 * NOTE(review): presumably ul_critical being non-zero means that a
 * libc-internal lock is held -- confirm against its definition.
 */
void
assert_no_libc_locks_held(void)
{
        ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
}
3934
3935 /* protected by link_lock */
3936 uint64_t spin_lock_spin;
3937 uint64_t spin_lock_spin2;
3938 uint64_t spin_lock_sleep;
3939 uint64_t spin_lock_wakeup;
3940
3941 /*
3942  * Record spin lock statistics.
3943  * Called by a thread exiting itself in thrp_exit().
3944  * Also called via atexit() from the thread calling
3945  * exit() to do all the other threads as well.
3946  */
3947 void
3948 record_spin_locks(ulwp_t *ulwp)
3949 {
3950         spin_lock_spin += ulwp->ul_spin_lock_spin;
3951         spin_lock_spin2 += ulwp->ul_spin_lock_spin2;
3952         spin_lock_sleep += ulwp->ul_spin_lock_sleep;
3953         spin_lock_wakeup += ulwp->ul_spin_lock_wakeup;
3954         ulwp->ul_spin_lock_spin = 0;
3955         ulwp->ul_spin_lock_spin2 = 0;
3956         ulwp->ul_spin_lock_sleep = 0;
3957         ulwp->ul_spin_lock_wakeup = 0;
3958 }
3959
3960 /*
3961  * atexit function:  dump the queue statistics to stderr.
3962  */
3963 #include <stdio.h>
3964 void
3965 dump_queue_statistics(void)
3966 {
3967         uberdata_t *udp = curthread->ul_uberdata;
3968         queue_head_t *qp;
3969         int qn;
3970         uint64_t spin_lock_total = 0;
3971
3972         if (udp->queue_head == NULL || thread_queue_dump == 0)
3973                 return;
3974
3975         if (fprintf(stderr, "\n%5d mutex queues:\n", QHASHSIZE) < 0 ||
3976             fprintf(stderr, "queue#   lockcount    max qlen    max hlen\n") < 0)
3977                 return;
3978         for (qn = 0, qp = udp->queue_head; qn < QHASHSIZE; qn++, qp++) {
3979                 if (qp->qh_lockcount == 0)
3980                         continue;
3981                 spin_lock_total += qp->qh_lockcount;
3982                 if (fprintf(stderr, "%5d %12llu%12u%12u\n", qn,
3983                     (u_longlong_t)qp->qh_lockcount,
3984                     qp->qh_qmax, qp->qh_hmax) < 0)
3985                         return;
3986         }
3987
3988         if (fprintf(stderr, "\n%5d condvar queues:\n", QHASHSIZE) < 0 ||
3989             fprintf(stderr, "queue#   lockcount    max qlen    max hlen\n") < 0)
3990                 return;
3991         for (qn = 0; qn < QHASHSIZE; qn++, qp++) {
3992                 if (qp->qh_lockcount == 0)
3993                         continue;
3994                 spin_lock_total += qp->qh_lockcount;
3995                 if (fprintf(stderr, "%5d %12llu%12u%12u\n", qn,
3996                     (u_longlong_t)qp->qh_lockcount,
3997                     qp->qh_qmax, qp->qh_hmax) < 0)
3998                         return;
3999         }
4000
4001         (void) fprintf(stderr, "\n  spin_lock_total  = %10llu\n",
4002             (u_longlong_t)spin_lock_total);
4003         (void) fprintf(stderr, "  spin_lock_spin   = %10llu\n",
4004             (u_longlong_t)spin_lock_spin);
4005         (void) fprintf(stderr, "  spin_lock_spin2  = %10llu\n",
4006             (u_longlong_t)spin_lock_spin2);
4007         (void) fprintf(stderr, "  spin_lock_sleep  = %10llu\n",
4008             (u_longlong_t)spin_lock_sleep);
4009         (void) fprintf(stderr, "  spin_lock_wakeup = %10llu\n",
4010             (u_longlong_t)spin_lock_wakeup);
4011 }
4012 #endif