kernel - Fix excessive call stack depth on stuck interrupt
sys/kern/kern_mutex.c
/*
 * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Implement fast persistent locks based on atomic_cmpset_int() with
 * semantics similar to lockmgr locks but faster and taking up much less
 * space.  Taken from HAMMER's lock implementation.
 *
 * These are meant to complement our LWKT tokens.  Tokens are only held
 * while the thread is running.  Mutexes can be held across blocking
 * conditions.
 *
 * - Exclusive priority over shared to prevent SMP starvation.
 * - locks can be aborted (async callback, if any, will be made w/ENOLCK).
 * - locks can be asynchronous.
 * - synchronous fast path if no blocking occurs (async callback is not
 *   made in this case).
 *
 * Generally speaking any caller-supplied link state must be properly
 * initialized before use.
 *
 * Most of the support is in sys/mutex[2].h.  We mostly provide backoff
 * functions here.
 */
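/*
 * Example (illustrative sketch, not authoritative): a simple synchronous
 * acquire/release using the backoff functions defined below.  Callers
 * normally go through the mtx_*() inlines in sys/mutex2.h; "mymtx" is an
 * assumed, already-initialized mtx_t.
 *
 *      _mtx_lock_ex_quick(&mymtx);     (exclusive, no PCATCH, no timeout)
 *      ...modify the protected structure...
 *      _mtx_unlock(&mymtx);
 *
 *      _mtx_lock_sh_quick(&mymtx);     (shared, can be held across blocking)
 *      ...read the protected structure...
 *      _mtx_unlock(&mymtx);
 */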
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/thread.h>

#include <machine/cpufunc.h>

#include <sys/thread2.h>
#include <sys/mutex2.h>
static int64_t mtx_contention_count;
static int64_t mtx_collision_count;
static int64_t mtx_wakeup_count;

SYSCTL_QUAD(_kern, OID_AUTO, mtx_contention_count, CTLFLAG_RW,
            &mtx_contention_count, 0, "");
SYSCTL_QUAD(_kern, OID_AUTO, mtx_collision_count, CTLFLAG_RW,
            &mtx_collision_count, 0, "");
SYSCTL_QUAD(_kern, OID_AUTO, mtx_wakeup_count, CTLFLAG_RW,
            &mtx_wakeup_count, 0, "");
static int mtx_chain_link_ex(mtx_t *mtx, u_int olock);
static int mtx_chain_link_sh(mtx_t *mtx, u_int olock, int addcount);
static void mtx_delete_link(mtx_t *mtx, mtx_link_t *link);
/*
 * Exclusive-lock a mutex, block until acquired unless link is async.
 * Recursion is allowed.
 *
 * Returns 0 on success, the tsleep() return code on failure, EINPROGRESS
 * if async.  If immediately successful an async exclusive lock will return 0
 * and not issue the async callback or link the link structure.  The caller
 * must handle this case (typically this is an optimal code path).
 *
 * A tsleep() error can only be returned if PCATCH is specified in the flags.
 */
static __inline int
__mtx_lock_ex(mtx_t *mtx, mtx_link_t *link, int flags, int to)
{
        thread_t td;
        u_int lock;
        u_int nlock;
        int error;
        int isasync;

        for (;;) {
                lock = mtx->mtx_lock;
                cpu_ccfence();

                if (lock == 0) {
                        nlock = MTX_EXCLUSIVE | 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
                                mtx->mtx_owner = curthread;
                                link->state = MTX_LINK_ACQUIRED;
                                error = 0;
                                break;
                        }
                        continue;
                }
                if ((lock & MTX_EXCLUSIVE) && mtx->mtx_owner == curthread) {
                        KKASSERT((lock & MTX_MASK) != MTX_MASK);
                        nlock = lock + 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
                                link->state = MTX_LINK_ACQUIRED;
                                error = 0;
                                break;
                        }
                        continue;
                }
                /*
                 * We need MTX_LINKSPIN to manipulate exlink or
                 * shlink.
                 *
                 * We must set MTX_EXWANTED with MTX_LINKSPIN to indicate
                 * pending exclusive requests.  It cannot be set as a separate
                 * operation prior to acquiring MTX_LINKSPIN.
                 *
                 * To avoid unnecessary cpu cache traffic we poll
                 * for collisions.  It is also possible that EXWANTED
                 * state failing the above test was spurious, so all the
                 * tests must be repeated if we cannot obtain LINKSPIN
                 * with the prior state tests intact (i.e. don't reload
                 * the (lock) variable here, for heaven's sake!).
                 */
                if (lock & MTX_LINKSPIN) {
                        cpu_pause();
                        ++mtx_collision_count;
                        continue;
                }
                td = curthread;
                nlock = lock | MTX_EXWANTED | MTX_LINKSPIN;
                ++td->td_critcount;
                if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
                        --td->td_critcount;
                        continue;
                }

                /*
                 * Check for early abort.
                 */
                if (link->state == MTX_LINK_ABORTED) {
                        if (mtx->mtx_exlink == NULL) {
                                atomic_clear_int(&mtx->mtx_lock,
                                                 MTX_LINKSPIN |
                                                 MTX_EXWANTED);
                        } else {
                                atomic_clear_int(&mtx->mtx_lock,
                                                 MTX_LINKSPIN);
                        }
                        --td->td_critcount;
                        link->state = MTX_LINK_IDLE;
                        error = ENOLCK;
                        break;
                }

                /*
                 * Add our link to the exlink list and release LINKSPIN.
                 */
                link->owner = td;
                link->state = MTX_LINK_LINKED_EX;
                if (mtx->mtx_exlink) {
                        link->next = mtx->mtx_exlink;
                        link->prev = link->next->prev;
                        link->next->prev = link;
                        link->prev->next = link;
                } else {
                        link->next = link;
                        link->prev = link;
                        mtx->mtx_exlink = link;
                }
                isasync = (link->callback != NULL);
                atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
                --td->td_critcount;

                /*
                 * If asynchronous lock request return without
                 * blocking, leave link structure linked.
                 */
                if (isasync) {
                        error = EINPROGRESS;
                        break;
                }

                /*
                 * Wait for lock
                 */
                error = mtx_wait_link(mtx, link, flags, to);
                break;
        }
        return (error);
}
int
_mtx_lock_ex_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
{
        return(__mtx_lock_ex(mtx, link, flags, to));
}

int
_mtx_lock_ex(mtx_t *mtx, int flags, int to)
{
        mtx_link_t link;

        mtx_link_init(&link);
        return(__mtx_lock_ex(mtx, &link, flags, to));
}

int
_mtx_lock_ex_quick(mtx_t *mtx)
{
        mtx_link_t link;

        mtx_link_init(&link);
        return(__mtx_lock_ex(mtx, &link, 0, 0));
}
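/*
 * Example (illustrative sketch): an asynchronous exclusive request using
 * the link interface above.  The callback signature mirrors the calls made
 * by mtx_chain_link_ex() and mtx_abort_link() below; the myobj_* names are
 * hypothetical.
 *
 *      static void
 *      myobj_mtx_callback(mtx_link_t *link, void *arg, int error)
 *      {
 *              (error is 0 when the lock was granted, ENOLCK on abort)
 *      }
 *
 *      mtx_link_init(&link);
 *      link.callback = myobj_mtx_callback;
 *      link.arg = myobj;
 *      error = _mtx_lock_ex_link(&mtx, &link, 0, 0);
 *      if (error == 0) {
 *              (acquired synchronously, the callback is NOT made and the
 *               link was not left linked)
 *      } else if (error == EINPROGRESS) {
 *              (queued, the callback will be made when the lock is granted
 *               or the request is aborted)
 *      }
 */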
/*
 * Share-lock a mutex, block until acquired.  Recursion is allowed.
 *
 * Returns 0 on success, or the tsleep() return code on failure.
 * An error can only be returned if PCATCH is specified in the flags.
 *
 * NOTE: Shared locks get a mass-wakeup so if the tsleep fails we
 *       do not have to chain the wakeup().
 */
static __inline int
__mtx_lock_sh(mtx_t *mtx, mtx_link_t *link, int flags, int to)
{
        thread_t td;
        u_int lock;
        u_int nlock;
        int error;
        int isasync;

        for (;;) {
                lock = mtx->mtx_lock;
                cpu_ccfence();

                if (lock == 0) {
                        nlock = 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
                                error = 0;
                                link->state = MTX_LINK_ACQUIRED;
                                break;
                        }
                        continue;
                }
                if ((lock & (MTX_EXCLUSIVE | MTX_EXWANTED)) == 0) {
                        KKASSERT((lock & MTX_MASK) != MTX_MASK);
                        nlock = lock + 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
                                error = 0;
                                link->state = MTX_LINK_ACQUIRED;
                                break;
                        }
                        continue;
                }
                /*
                 * We need MTX_LINKSPIN to manipulate exlink or
                 * shlink.
                 *
                 * We must set MTX_SHWANTED with MTX_LINKSPIN to indicate
                 * pending shared requests.  It cannot be set as a separate
                 * operation prior to acquiring MTX_LINKSPIN.
                 *
                 * To avoid unnecessary cpu cache traffic we poll
                 * for collisions.  It is also possible that EXWANTED
                 * state failing the above test was spurious, so all the
                 * tests must be repeated if we cannot obtain LINKSPIN
                 * with the prior state tests intact (i.e. don't reload
                 * the (lock) variable here, for heaven's sake!).
                 */
                if (lock & MTX_LINKSPIN) {
                        cpu_pause();
                        ++mtx_collision_count;
                        continue;
                }
                td = curthread;
                nlock = lock | MTX_SHWANTED | MTX_LINKSPIN;
                ++td->td_critcount;
                if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
                        --td->td_critcount;
                        continue;
                }
                /*
                 * Check for early abort.
                 */
                if (link->state == MTX_LINK_ABORTED) {
                        if (mtx->mtx_shlink == NULL) {
                                atomic_clear_int(&mtx->mtx_lock,
                                                 MTX_LINKSPIN |
                                                 MTX_SHWANTED);
                        } else {
                                atomic_clear_int(&mtx->mtx_lock,
                                                 MTX_LINKSPIN);
                        }
                        --td->td_critcount;
                        link->state = MTX_LINK_IDLE;
                        error = ENOLCK;
                        break;
                }
                /*
                 * Add our link to the shlink list and release LINKSPIN.
                 */
                link->owner = td;
                link->state = MTX_LINK_LINKED_SH;
                if (mtx->mtx_shlink) {
                        link->next = mtx->mtx_shlink;
                        link->prev = link->next->prev;
                        link->next->prev = link;
                        link->prev->next = link;
                } else {
                        link->next = link;
                        link->prev = link;
                        mtx->mtx_shlink = link;
                }
                isasync = (link->callback != NULL);
                atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
                --td->td_critcount;

                /*
                 * If asynchronous lock request return without
                 * blocking, leave link structure linked.
                 */
                if (isasync) {
                        error = EINPROGRESS;
                        break;
                }

                /*
                 * Wait for lock
                 */
                error = mtx_wait_link(mtx, link, flags, to);
                break;
        }
        return (error);
}
int
_mtx_lock_sh_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
{
        return(__mtx_lock_sh(mtx, link, flags, to));
}

int
_mtx_lock_sh(mtx_t *mtx, int flags, int to)
{
        mtx_link_t link;

        mtx_link_init(&link);
        return(__mtx_lock_sh(mtx, &link, flags, to));
}

int
_mtx_lock_sh_quick(mtx_t *mtx)
{
        mtx_link_t link;

        mtx_link_init(&link);
        return(__mtx_lock_sh(mtx, &link, 0, 0));
}
/*
 * Get an exclusive spinlock the hard way.
 */
void
_mtx_spinlock(mtx_t *mtx)
{
        u_int lock;
        u_int nlock;
        int bb = 1;
        int bo;

        for (;;) {
                lock = mtx->mtx_lock;
                if (lock == 0) {
                        nlock = MTX_EXCLUSIVE | 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
                                mtx->mtx_owner = curthread;
                                break;
                        }
                } else if ((lock & MTX_EXCLUSIVE) &&
                           mtx->mtx_owner == curthread) {
                        KKASSERT((lock & MTX_MASK) != MTX_MASK);
                        nlock = lock + 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                                break;
                } else {
                        /* MWAIT here */
                        if (bb < 1000)
                                ++bb;
                        cpu_pause();
                        for (bo = 0; bo < bb; ++bo)
                                ;
                        ++mtx_contention_count;
                }
                cpu_pause();
                ++mtx_collision_count;
        }
}
/*
 * Attempt to acquire a spinlock, if we fail we must undo the
 * gd->gd_spinlocks/gd->gd_curthread->td_critcount predisposition.
 *
 * Returns 0 on success, EAGAIN on failure.
 */
int
_mtx_spinlock_try(mtx_t *mtx)
{
        globaldata_t gd = mycpu;
        u_int lock;
        u_int nlock;
        int res = 0;

        for (;;) {
                lock = mtx->mtx_lock;
                if (lock == 0) {
                        nlock = MTX_EXCLUSIVE | 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
                                mtx->mtx_owner = gd->gd_curthread;
                                break;
                        }
                } else if ((lock & MTX_EXCLUSIVE) &&
                           mtx->mtx_owner == gd->gd_curthread) {
                        KKASSERT((lock & MTX_MASK) != MTX_MASK);
                        nlock = lock + 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                                break;
                } else {
                        --gd->gd_spinlocks;
                        cpu_ccfence();
                        --gd->gd_curthread->td_critcount;
                        res = EAGAIN;
                        break;
                }
                cpu_pause();
                ++mtx_collision_count;
        }
        return res;
}
#if 0

void
_mtx_spinlock_sh(mtx_t *mtx)
{
        u_int lock;
        u_int nlock;
        int bb = 1;
        int bo;

        for (;;) {
                lock = mtx->mtx_lock;
                if ((lock & MTX_EXCLUSIVE) == 0) {
                        KKASSERT((lock & MTX_MASK) != MTX_MASK);
                        nlock = lock + 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                                break;
                } else {
                        /* MWAIT here */
                        if (bb < 1000)
                                ++bb;
                        cpu_pause();
                        for (bo = 0; bo < bb; ++bo)
                                ;
                        ++mtx_contention_count;
                }
                cpu_pause();
                ++mtx_collision_count;
        }
}

#endif
int
_mtx_lock_ex_try(mtx_t *mtx)
{
        u_int lock;
        u_int nlock;
        int error;

        for (;;) {
                lock = mtx->mtx_lock;
                if (lock == 0) {
                        nlock = MTX_EXCLUSIVE | 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
                                mtx->mtx_owner = curthread;
                                error = 0;
                                break;
                        }
                } else if ((lock & MTX_EXCLUSIVE) &&
                           mtx->mtx_owner == curthread) {
                        KKASSERT((lock & MTX_MASK) != MTX_MASK);
                        nlock = lock + 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
                                error = 0;
                                break;
                        }
                } else {
                        error = EAGAIN;
                        break;
                }
                cpu_pause();
                ++mtx_collision_count;
        }
        return (error);
}
int
_mtx_lock_sh_try(mtx_t *mtx)
{
        u_int lock;
        u_int nlock;
        int error = 0;

        for (;;) {
                lock = mtx->mtx_lock;
                if ((lock & MTX_EXCLUSIVE) == 0) {
                        KKASSERT((lock & MTX_MASK) != MTX_MASK);
                        nlock = lock + 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                                break;
                } else {
                        error = EAGAIN;
                        break;
                }
                cpu_pause();
                ++mtx_collision_count;
        }
        return (error);
}
/*
 * If the lock is held exclusively it must be owned by the caller.  If the
 * lock is already a shared lock this operation is a NOP.  A panic will
 * occur if the lock is not held either shared or exclusive.
 *
 * The exclusive count is converted to a shared count.
 */
void
_mtx_downgrade(mtx_t *mtx)
{
        u_int lock;
        u_int nlock;

        for (;;) {
                lock = mtx->mtx_lock;
                cpu_ccfence();

                /*
                 * NOP if already shared.
                 */
                if ((lock & MTX_EXCLUSIVE) == 0) {
                        KKASSERT((lock & MTX_MASK) > 0);
                        break;
                }

                /*
                 * Transfer count to shared.  Any additional pending shared
                 * waiters must be woken up.
                 */
                if (lock & MTX_SHWANTED) {
                        if (mtx_chain_link_sh(mtx, lock, 1))
                                break;
                        /* retry */
                } else {
                        nlock = lock & ~MTX_EXCLUSIVE;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                                break;
                        /* retry */
                }
                cpu_pause();
                ++mtx_collision_count;
        }
}
/*
 * Upgrade a shared lock to an exclusive lock.  The upgrade will fail if
 * the shared lock has a count other than 1.  Optimize the most likely case
 * but note that a single cmpset can fail due to WANTED races.
 *
 * If the lock is held exclusively it must be owned by the caller and
 * this function will simply return without doing anything.  A panic will
 * occur if the lock is held exclusively by someone other than the caller.
 *
 * Returns 0 on success, EDEADLK on failure.
 */
int
_mtx_upgrade_try(mtx_t *mtx)
{
        u_int lock;
        u_int nlock;
        int error = 0;

        for (;;) {
                lock = mtx->mtx_lock;

                if ((lock & ~MTX_EXWANTED) == 1) {
                        nlock = lock | MTX_EXCLUSIVE;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
                                mtx->mtx_owner = curthread;
                                break;
                        }
                } else if (lock & MTX_EXCLUSIVE) {
                        KKASSERT(mtx->mtx_owner == curthread);
                        break;
                } else {
                        error = EDEADLK;
                        break;
                }
                cpu_pause();
                ++mtx_collision_count;
        }
        return (error);
}
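/*
 * Example (illustrative sketch): the upgrade-or-relock pattern implied by
 * the EDEADLK semantics above.
 *
 *      _mtx_lock_sh_quick(&mtx);
 *      ...
 *      if (_mtx_upgrade_try(&mtx) == EDEADLK) {
 *              (another shared holder exists; drop the shared lock,
 *               reacquire exclusively, then revalidate any state examined
 *               while only the shared lock was held)
 *              _mtx_unlock(&mtx);
 *              _mtx_lock_ex_quick(&mtx);
 *      }
 *      ...
 *      _mtx_unlock(&mtx);
 */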
/*
 * Unlock a lock.  The caller must hold the lock either shared or exclusive.
 *
 * On the last release we handle any pending chains.
 */
void
_mtx_unlock(mtx_t *mtx)
{
        u_int lock;
        u_int nlock;

        for (;;) {
                lock = mtx->mtx_lock;
                cpu_ccfence();

                switch(lock) {
                case MTX_EXCLUSIVE | 1:
                        /*
                         * Last release, exclusive lock.
                         * No exclusive or shared requests pending.
                         */
                        mtx->mtx_owner = NULL;
                        nlock = 0;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                                goto done;
                        break;
                case MTX_EXCLUSIVE | MTX_EXWANTED | 1:
                case MTX_EXCLUSIVE | MTX_EXWANTED | MTX_SHWANTED | 1:
                        /*
                         * Last release, exclusive lock.
                         * Exclusive requests pending.
                         * Exclusive requests have priority over shared reqs.
                         */
                        if (mtx_chain_link_ex(mtx, lock))
                                goto done;
                        break;
                case MTX_EXCLUSIVE | MTX_SHWANTED | 1:
                        /*
                         * Last release, exclusive lock.
                         *
                         * Shared requests are pending.  Transfer our count (1)
                         * to the first shared request, wakeup all shared reqs.
                         */
                        if (mtx_chain_link_sh(mtx, lock, 0))
                                goto done;
                        break;
                case 1:
                        /*
                         * Last release, shared lock.
                         * No exclusive or shared requests pending.
                         */
                        nlock = 0;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                                goto done;
                        break;
                case MTX_EXWANTED | 1:
                case MTX_EXWANTED | MTX_SHWANTED | 1:
                        /*
                         * Last release, shared lock.
                         *
                         * Exclusive requests are pending.  Transfer our
                         * count (1) to the next exclusive request.
                         *
                         * Exclusive requests have priority over shared reqs.
                         */
                        if (mtx_chain_link_ex(mtx, lock))
                                goto done;
                        break;
                case MTX_SHWANTED | 1:
                        /*
                         * Last release, shared lock.
                         * Shared requests pending.
                         */
                        if (mtx_chain_link_sh(mtx, lock, 0))
                                goto done;
                        break;
                default:
                        /*
                         * We have to loop if this is the last release but
                         * someone is fiddling with LINKSPIN.
                         */
                        if ((lock & MTX_MASK) == 1) {
                                KKASSERT(lock & MTX_LINKSPIN);
                                break;
                        }

                        /*
                         * Not the last release (shared or exclusive)
                         */
                        nlock = lock - 1;
                        KKASSERT((nlock & MTX_MASK) != MTX_MASK);
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                                goto done;
                        break;
                }
                /* loop try again */
                cpu_pause();
                ++mtx_collision_count;
        }
done:
        ;
}
/*
 * Chain pending links.  Called on the last release of an exclusive or
 * shared lock when the appropriate WANTED bit is set.  mtx_lock old state
 * is passed in with the count left at 1, which we can inherit, and other
 * bits which we must adjust in a single atomic operation.
 *
 * Return non-zero on success, 0 if caller needs to retry.
 *
 * NOTE: It's ok if MTX_EXWANTED is in an indeterminate state while we are
 *       acquiring LINKSPIN as all other cases will also need to acquire
 *       LINKSPIN when handling the EXWANTED case.
 */
static int
mtx_chain_link_ex(mtx_t *mtx, u_int olock)
{
        thread_t td = curthread;
        mtx_link_t *link;
        u_int nlock;

        olock &= ~MTX_LINKSPIN;
        nlock = olock | MTX_LINKSPIN | MTX_EXCLUSIVE;
        ++td->td_critcount;
        if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
                link = mtx->mtx_exlink;
                KKASSERT(link != NULL);
                if (link->next == link) {
                        mtx->mtx_exlink = NULL;
                        nlock = MTX_LINKSPIN | MTX_EXWANTED;    /* to clear */
                } else {
                        mtx->mtx_exlink = link->next;
                        link->next->prev = link->prev;
                        link->prev->next = link->next;
                        nlock = MTX_LINKSPIN;                   /* to clear */
                }
                KKASSERT(link->state == MTX_LINK_LINKED_EX);
                mtx->mtx_owner = link->owner;
                cpu_sfence();

                /*
                 * WARNING! The callback can only be safely
                 *          made with LINKSPIN still held
                 *          and in a critical section.
                 *
                 * WARNING! The link can go away after the
                 *          state is set, or after the
                 *          callback.
                 */
                if (link->callback) {
                        link->state = MTX_LINK_CALLEDBACK;
                        link->callback(link, link->arg, 0);
                } else {
                        link->state = MTX_LINK_ACQUIRED;
                        wakeup(link);
                }
                atomic_clear_int(&mtx->mtx_lock, nlock);
                --td->td_critcount;
                ++mtx_wakeup_count;
                return 1;
        }
        /* retry */
        --td->td_critcount;
        return 0;
}
/*
 * Flush waiting shared locks.  The lock's prior state is passed in and must
 * be adjusted atomically only if it matches.
 *
 * If addcount is 0, the count for the first shared lock in the chain is
 * assumed to have already been accounted for.
 *
 * If addcount is 1, the count for the first shared lock in the chain has
 * not yet been accounted for.
 */
static int
mtx_chain_link_sh(mtx_t *mtx, u_int olock, int addcount)
{
        thread_t td = curthread;
        mtx_link_t *link;
        u_int nlock;

        olock &= ~MTX_LINKSPIN;
        nlock = olock | MTX_LINKSPIN;
        nlock &= ~MTX_EXCLUSIVE;
        ++td->td_critcount;
        if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
                KKASSERT(mtx->mtx_shlink != NULL);
                for (;;) {
                        link = mtx->mtx_shlink;
                        atomic_add_int(&mtx->mtx_lock, addcount);
                        KKASSERT(link->state == MTX_LINK_LINKED_SH);
                        if (link->next == link) {
                                mtx->mtx_shlink = NULL;
                                cpu_sfence();

                                /*
                                 * WARNING! The callback can only be safely
                                 *          made with LINKSPIN still held
                                 *          and in a critical section.
                                 *
                                 * WARNING! The link can go away after the
                                 *          state is set, or after the
                                 *          callback.
                                 */
                                if (link->callback) {
                                        link->state = MTX_LINK_CALLEDBACK;
                                        link->callback(link, link->arg, 0);
                                } else {
                                        link->state = MTX_LINK_ACQUIRED;
                                        wakeup(link);
                                }
                                ++mtx_wakeup_count;
                                break;
                        }
                        mtx->mtx_shlink = link->next;
                        link->next->prev = link->prev;
                        link->prev->next = link->next;
                        cpu_sfence();
                        link->state = MTX_LINK_ACQUIRED;
                        /* link can go away */
                        wakeup(link);
                        ++mtx_wakeup_count;
                        addcount = 1;
                }
                atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN |
                                                 MTX_SHWANTED);
                --td->td_critcount;
                return 1;
        }
        /* retry */
        --td->td_critcount;
        return 0;
}
/*
 * Delete a link structure after tsleep has failed.  This code is not
 * in the critical path as most exclusive waits are chained.
 */
static
void
mtx_delete_link(mtx_t *mtx, mtx_link_t *link)
{
        thread_t td = curthread;
        u_int lock;
        u_int nlock;

        /*
         * Acquire MTX_LINKSPIN.
         *
         * Do not use cmpxchg to wait for LINKSPIN to clear as this might
         * result in too much cpu cache traffic.
         */
        ++td->td_critcount;
        for (;;) {
                lock = mtx->mtx_lock;
                if (lock & MTX_LINKSPIN) {
                        cpu_pause();
                        ++mtx_collision_count;
                        continue;
                }
                nlock = lock | MTX_LINKSPIN;
                if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                        break;
                cpu_pause();
                ++mtx_collision_count;
        }

        /*
         * Delete the link and release LINKSPIN.
         */
        nlock = MTX_LINKSPIN;   /* to clear */

        switch(link->state) {
        case MTX_LINK_LINKED_EX:
                if (link->next == link) {
                        mtx->mtx_exlink = NULL;
                        nlock |= MTX_EXWANTED;  /* to clear */
                } else {
                        mtx->mtx_exlink = link->next;
                        link->next->prev = link->prev;
                        link->prev->next = link->next;
                }
                break;
        case MTX_LINK_LINKED_SH:
                if (link->next == link) {
                        mtx->mtx_shlink = NULL;
                        nlock |= MTX_SHWANTED;  /* to clear */
                } else {
                        mtx->mtx_shlink = link->next;
                        link->next->prev = link->prev;
                        link->prev->next = link->next;
                }
                break;
        default:
                /* no change */
                break;
        }
        atomic_clear_int(&mtx->mtx_lock, nlock);
        --td->td_critcount;
}
/*
 * Wait for async lock completion or abort.  Returns ENOLCK if an abort
 * occurred.
 */
int
mtx_wait_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
{
        int error;

        /*
         * Sleep.  Handle false wakeups, interruptions, etc.
         * The link may also have been aborted.
         */
        error = 0;
        while (link->state & MTX_LINK_LINKED) {
                tsleep_interlock(link, 0);
                cpu_lfence();
                if (link->state & MTX_LINK_LINKED) {
                        ++mtx_contention_count;
                        if (link->state & MTX_LINK_LINKED_SH)
                                mycpu->gd_cnt.v_lock_name[0] = 'S';
                        else
                                mycpu->gd_cnt.v_lock_name[0] = 'X';
                        strncpy(mycpu->gd_cnt.v_lock_name + 1,
                                mtx->mtx_ident,
                                sizeof(mycpu->gd_cnt.v_lock_name) - 2);
                        ++mycpu->gd_cnt.v_lock_colls;

                        error = tsleep(link, flags | PINTERLOCKED,
                                       mtx->mtx_ident, to);
                        if (error)
                                break;
                }
        }

        /*
         * We are done, make sure the link structure is unlinked.
         * It may still be on the list due to e.g. EINTR or
         * EWOULDBLOCK.
         *
         * It is possible for the tsleep to race an ABORT and cause
         * error to be 0.
         *
         * The tsleep() can be woken up for numerous reasons and error
         * might be zero in situations where we intend to return an error.
         *
         * (This is the synchronous case so state cannot be CALLEDBACK)
         */
        switch(link->state) {
        case MTX_LINK_ACQUIRED:
        case MTX_LINK_CALLEDBACK:
                error = 0;
                break;
        case MTX_LINK_ABORTED:
                error = ENOLCK;
                break;
        case MTX_LINK_LINKED_EX:
        case MTX_LINK_LINKED_SH:
                mtx_delete_link(mtx, link);
                /* fall through */
        default:
                if (error == 0)
                        error = EWOULDBLOCK;
                break;
        }

        /*
         * Clear state on status returned.
         */
        link->state = MTX_LINK_IDLE;

        return error;
}
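/*
 * Example (illustrative sketch): aborting a blocked lock request from
 * another context.  The waiting context blocks inside _mtx_lock_ex_link()
 * (which sleeps in mtx_wait_link() above); a second context that knows the
 * link structure can force it to return ENOLCK.
 *
 *      (waiting context)
 *      mtx_link_init(&link);
 *      error = _mtx_lock_ex_link(&mtx, &link, 0, 0);
 *      (error is 0 if the lock was acquired, ENOLCK if it was aborted)
 *
 *      (aborting context)
 *      mtx_abort_link(&mtx, &link);
 */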
/*
 * Abort a mutex locking operation, causing mtx_lock_ex_link() to
 * return ENOLCK.  This may be called at any time after the mtx_link
 * is initialized or the status from a previous lock has been
 * returned.  If called prior to the next (non-try) lock attempt, the
 * next lock attempt using this link structure will abort instantly.
 *
 * Caller must still wait for the operation to complete, either from a
 * blocking call that is still in progress or by calling mtx_wait_link().
 *
 * If an asynchronous lock request is possibly in-progress, the caller
 * should call mtx_wait_link() synchronously.  Note that the asynchronous
 * lock callback will NOT be called if a successful abort occurred. XXX
 */
void
mtx_abort_link(mtx_t *mtx, mtx_link_t *link)
{
        thread_t td = curthread;
        u_int lock;
        u_int nlock;

        /*
         * Acquire MTX_LINKSPIN
         */
        ++td->td_critcount;
        for (;;) {
                lock = mtx->mtx_lock;
                if (lock & MTX_LINKSPIN) {
                        cpu_pause();
                        ++mtx_collision_count;
                        continue;
                }
                nlock = lock | MTX_LINKSPIN;
                if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                        break;
                cpu_pause();
                ++mtx_collision_count;
        }

        /*
         * Do the abort.
         *
         * WARNING! Link structure can disappear once link->state is set.
         */
        nlock = MTX_LINKSPIN;   /* to clear */

        switch(link->state) {
        case MTX_LINK_IDLE:
                /*
                 * Link not started yet
                 */
                link->state = MTX_LINK_ABORTED;
                break;
        case MTX_LINK_LINKED_EX:
                /*
                 * de-link, mark aborted, and potentially wakeup the thread
                 * or issue the callback.
                 */
                if (link->next == link) {
                        if (mtx->mtx_exlink == link) {
                                mtx->mtx_exlink = NULL;
                                nlock |= MTX_EXWANTED;  /* to clear */
                        }
                } else {
                        if (mtx->mtx_exlink == link)
                                mtx->mtx_exlink = link->next;
                        link->next->prev = link->prev;
                        link->prev->next = link->next;
                }

                /*
                 * When aborting the async callback is still made.  We must
                 * not set the link status to ABORTED in the callback case
                 * since there is nothing else to clear its status if the
                 * link is reused.
                 */
                if (link->callback) {
                        link->state = MTX_LINK_CALLEDBACK;
                        link->callback(link, link->arg, ENOLCK);
                } else {
                        link->state = MTX_LINK_ABORTED;
                        wakeup(link);
                }
                ++mtx_wakeup_count;
                break;
        case MTX_LINK_LINKED_SH:
                /*
                 * de-link, mark aborted, and potentially wakeup the thread
                 * or issue the callback.
                 */
                if (link->next == link) {
                        if (mtx->mtx_shlink == link) {
                                mtx->mtx_shlink = NULL;
                                nlock |= MTX_SHWANTED;  /* to clear */
                        }
                } else {
                        if (mtx->mtx_shlink == link)
                                mtx->mtx_shlink = link->next;
                        link->next->prev = link->prev;
                        link->prev->next = link->next;
                }

                /*
                 * When aborting the async callback is still made.  We must
                 * not set the link status to ABORTED in the callback case
                 * since there is nothing else to clear its status if the
                 * link is reused.
                 */
                if (link->callback) {
                        link->state = MTX_LINK_CALLEDBACK;
                        link->callback(link, link->arg, ENOLCK);
                } else {
                        link->state = MTX_LINK_ABORTED;
                        wakeup(link);
                }
                ++mtx_wakeup_count;
                break;
        case MTX_LINK_ACQUIRED:
        case MTX_LINK_CALLEDBACK:
                /*
                 * Too late, the lock was acquired.  Let it complete.
                 */
                break;
        default:
                /*
                 * link already aborted, do nothing.
                 */
                break;
        }
        atomic_clear_int(&mtx->mtx_lock, nlock);
        --td->td_critcount;
}