kernel - Refactor smp collision statistics
[dragonfly.git] / sys / kern / kern_mutex.c
blob: 60cd51b0b80693d47b08b0e489f718c817d3b6ad
/*
 * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Implement fast persistent locks based on atomic_cmpset_int() with
 * semantics similar to lockmgr locks but faster and taking up much less
 * space.  Taken from HAMMER's lock implementation.
 *
 * These are meant to complement our LWKT tokens.  Tokens are only held
 * while the thread is running.  Mutexes can be held across blocking
 * conditions.
 *
 * - Exclusive priority over shared to prevent SMP starvation.
 * - locks can be aborted (async callback, if any, will be made w/ENOLCK).
 * - locks can be asynchronous.
 * - synchronous fast path if no blocking occurs (async callback is not
 *   made in this case).
 *
 * Generally speaking any caller-supplied link state must be properly
 * initialized before use.
 *
 * Most of the support is in sys/mutex[2].h.  We mostly provide backoff
 * functions here.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/indefinite.h>
#include <sys/thread.h>

#include <machine/cpufunc.h>

#include <sys/thread2.h>
#include <sys/mutex2.h>
#include <sys/indefinite2.h>

static int mtx_chain_link_ex(mtx_t *mtx, u_int olock);
static int mtx_chain_link_sh(mtx_t *mtx, u_int olock);
static void mtx_delete_link(mtx_t *mtx, mtx_link_t *link);
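
/*
 * Illustrative sketch (not compiled): basic synchronous usage of these
 * mutexes.  It assumes the usual sys/mutex2.h wrappers (mtx_init(),
 * mtx_lock_ex_quick(), mtx_lock_sh_quick(), mtx_unlock()); the example_*
 * identifiers are placeholders, not part of this file.
 */
#if 0
static mtx_t example_mtx;

static void
example_basic_usage(void)
{
        mtx_init(&example_mtx, "exmtx");

        /* Exclusive hold; may block, unlike an LWKT token. */
        mtx_lock_ex_quick(&example_mtx);
        /* ... modify shared state ... */
        mtx_unlock(&example_mtx);

        /* Shared hold; multiple readers may hold this concurrently. */
        mtx_lock_sh_quick(&example_mtx);
        /* ... read shared state ... */
        mtx_unlock(&example_mtx);
}
#endif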
/*
 * Exclusive-lock a mutex, block until acquired unless link is async.
 * Recursion is allowed.
 *
 * Returns 0 on success, the tsleep() return code on failure, EINPROGRESS
 * if async.  If immediately successful an async exclusive lock will return 0
 * and not issue the async callback or link the link structure.  The caller
 * must handle this case (typically this is an optimal code path).
 *
 * A tsleep() error can only be returned if PCATCH is specified in the flags.
 */
static __inline int
__mtx_lock_ex(mtx_t *mtx, mtx_link_t *link, int flags, int to)
{
        thread_t td;
        u_int   lock;
        u_int   nlock;
        int     error;
        int     isasync;

        for (;;) {
                lock = mtx->mtx_lock;
                cpu_ccfence();

                if (lock == 0) {
                        nlock = MTX_EXCLUSIVE | 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
                                mtx->mtx_owner = curthread;
                                cpu_sfence();
                                link->state = MTX_LINK_ACQUIRED;
                                error = 0;
                                break;
                        }
                        continue;
                }
                if ((lock & MTX_EXCLUSIVE) && mtx->mtx_owner == curthread) {
                        KKASSERT((lock & MTX_MASK) != MTX_MASK);
                        nlock = lock + 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
                                cpu_sfence();
                                link->state = MTX_LINK_ACQUIRED;
                                error = 0;
                                break;
                        }
                        continue;
                }

                /*
                 * We need MTX_LINKSPIN to manipulate exlink or
                 * shlink.
                 *
                 * We must set MTX_EXWANTED with MTX_LINKSPIN to indicate
                 * pending exclusive requests.  It cannot be set as a separate
                 * operation prior to acquiring MTX_LINKSPIN.
                 *
                 * To avoid unnecessary cpu cache traffic we poll
                 * for collisions.  It is also possible that EXWANTED
                 * state failing the above test was spurious, so all the
                 * tests must be repeated if we cannot obtain LINKSPIN
                 * with the prior state tests intact (i.e. don't reload
                 * the (lock) variable here, for heaven's sake!).
                 */
                if (lock & MTX_LINKSPIN) {
                        cpu_pause();
                        continue;
                }
                td = curthread;
                nlock = lock | MTX_EXWANTED | MTX_LINKSPIN;
                crit_enter_raw(td);
                if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
                        crit_exit_raw(td);
                        continue;
                }

                /*
                 * Check for early abort.
                 */
                if (link->state == MTX_LINK_ABORTED) {
                        if (mtx->mtx_exlink == NULL) {
                                atomic_clear_int(&mtx->mtx_lock,
                                                 MTX_LINKSPIN |
                                                 MTX_EXWANTED);
                        } else {
                                atomic_clear_int(&mtx->mtx_lock,
                                                 MTX_LINKSPIN);
                        }
                        crit_exit_raw(td);
                        link->state = MTX_LINK_IDLE;
                        error = ENOLCK;
                        break;
                }

                /*
                 * Add our link to the exlink list and release LINKSPIN.
                 */
                link->owner = td;
                link->state = MTX_LINK_LINKED_EX;
                if (mtx->mtx_exlink) {
                        link->next = mtx->mtx_exlink;
                        link->prev = link->next->prev;
                        link->next->prev = link;
                        link->prev->next = link;
                } else {
                        link->next = link;
                        link->prev = link;
                        mtx->mtx_exlink = link;
                }
                isasync = (link->callback != NULL);
                atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
                crit_exit_raw(td);

                /*
                 * If this is an asynchronous lock request, return without
                 * blocking and leave the link structure linked.
                 */
                if (isasync) {
                        error = EINPROGRESS;
                        break;
                }

                /*
                 * Wait for lock
                 */
                error = mtx_wait_link(mtx, link, flags, to);
                break;
        }
        return (error);
}

int
_mtx_lock_ex_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
{
        return(__mtx_lock_ex(mtx, link, flags, to));
}

int
_mtx_lock_ex(mtx_t *mtx, int flags, int to)
{
        mtx_link_t link;

        mtx_link_init(&link);
        return(__mtx_lock_ex(mtx, &link, flags, to));
}

int
_mtx_lock_ex_quick(mtx_t *mtx)
{
        mtx_link_t link;

        mtx_link_init(&link);
        return(__mtx_lock_ex(mtx, &link, 0, 0));
}
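
/*
 * Illustrative sketch (not compiled): the asynchronous exclusive-lock path.
 * The callback/arg fields shown are the ones __mtx_lock_ex() consults above;
 * whether they are filled in directly or via an async variant of
 * mtx_link_init() in sys/mutex2.h is left to the real API.  The example_*
 * identifiers are placeholders.
 */
#if 0
static void
example_ex_callback(mtx_link_t *link, void *arg, int error)
{
        /*
         * Issued with LINKSPIN still held and in a critical section, so
         * keep this short.  error is 0 on acquisition, ENOLCK on abort.
         */
}

static int
example_async_ex(mtx_t *mtx, mtx_link_t *link)
{
        int error;

        mtx_link_init(link);
        link->callback = example_ex_callback;
        link->arg = NULL;

        error = _mtx_lock_ex_link(mtx, link, 0, 0);
        if (error == 0) {
                /* Fast path: lock acquired, callback was NOT issued. */
        } else if (error == EINPROGRESS) {
                /* Queued: the callback will announce the acquisition. */
        }
        return (error);
}
#endif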
/*
 * Share-lock a mutex, block until acquired.  Recursion is allowed.
 *
 * Returns 0 on success, or the tsleep() return code on failure.
 * An error can only be returned if PCATCH is specified in the flags.
 *
 * NOTE: Shared locks get a mass-wakeup so if the tsleep fails we
 *	 do not have to chain the wakeup().
 */
static __inline int
__mtx_lock_sh(mtx_t *mtx, mtx_link_t *link, int flags, int to)
{
        thread_t td;
        u_int   lock;
        u_int   nlock;
        int     error;
        int     isasync;

        for (;;) {
                lock = mtx->mtx_lock;
                cpu_ccfence();

                if (lock == 0) {
                        nlock = 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
                                error = 0;
                                cpu_sfence();
                                link->state = MTX_LINK_ACQUIRED;
                                break;
                        }
                        continue;
                }
                if ((lock & (MTX_EXCLUSIVE | MTX_EXWANTED)) == 0) {
                        KKASSERT((lock & MTX_MASK) != MTX_MASK);
                        nlock = lock + 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
                                error = 0;
                                cpu_sfence();
                                link->state = MTX_LINK_ACQUIRED;
                                break;
                        }
                        continue;
                }

                /*
                 * We need MTX_LINKSPIN to manipulate exlink or
                 * shlink.
                 *
                 * We must set MTX_SHWANTED with MTX_LINKSPIN to indicate
                 * pending shared requests.  It cannot be set as a separate
                 * operation prior to acquiring MTX_LINKSPIN.
                 *
                 * To avoid unnecessary cpu cache traffic we poll
                 * for collisions.  It is also possible that EXWANTED
                 * state failing the above test was spurious, so all the
                 * tests must be repeated if we cannot obtain LINKSPIN
                 * with the prior state tests intact (i.e. don't reload
                 * the (lock) variable here, for heaven's sake!).
                 */
                if (lock & MTX_LINKSPIN) {
                        cpu_pause();
                        continue;
                }
                td = curthread;
                nlock = lock | MTX_SHWANTED | MTX_LINKSPIN;
                crit_enter_raw(td);
                if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
                        crit_exit_raw(td);
                        continue;
                }

                /*
                 * Check for early abort.  Other shared lock requestors
                 * could have sneaked in before we set LINKSPIN so make
                 * sure we undo the state properly.
                 */
                if (link->state == MTX_LINK_ABORTED) {
                        if (mtx->mtx_shlink) {
                                atomic_clear_int(&mtx->mtx_lock,
                                                 MTX_LINKSPIN);
                        } else {
                                atomic_clear_int(&mtx->mtx_lock,
                                                 MTX_LINKSPIN |
                                                 MTX_SHWANTED);
                        }
                        crit_exit_raw(td);
                        link->state = MTX_LINK_IDLE;
                        error = ENOLCK;
                        break;
                }

                /*
                 * Add our link to the shlink list and release LINKSPIN.
                 */
                link->owner = td;
                link->state = MTX_LINK_LINKED_SH;
                if (mtx->mtx_shlink) {
                        link->next = mtx->mtx_shlink;
                        link->prev = link->next->prev;
                        link->next->prev = link;
                        link->prev->next = link;
                } else {
                        link->next = link;
                        link->prev = link;
                        mtx->mtx_shlink = link;
                }
                isasync = (link->callback != NULL);
                atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
                crit_exit_raw(td);

                /*
                 * If this is an asynchronous lock request, return without
                 * blocking and leave the link structure linked.
                 */
                if (isasync) {
                        error = EINPROGRESS;
                        break;
                }

                /*
                 * Wait for lock
                 */
                error = mtx_wait_link(mtx, link, flags, to);
                break;
        }
        return (error);
}

int
_mtx_lock_sh_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
{
        return(__mtx_lock_sh(mtx, link, flags, to));
}

int
_mtx_lock_sh(mtx_t *mtx, int flags, int to)
{
        mtx_link_t link;

        mtx_link_init(&link);
        return(__mtx_lock_sh(mtx, &link, flags, to));
}

int
_mtx_lock_sh_quick(mtx_t *mtx)
{
        mtx_link_t link;

        mtx_link_init(&link);
        return(__mtx_lock_sh(mtx, &link, 0, 0));
}
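
/*
 * Illustrative sketch (not compiled): an interruptible, bounded shared-lock
 * attempt.  PCATCH and the tick count are ordinary tsleep() arguments which
 * _mtx_lock_sh() passes through to mtx_wait_link(); mtx_unlock() is the
 * sys/mutex2.h wrapper.  The example_* identifier is a placeholder.
 */
#if 0
static int
example_lock_sh_interruptible(mtx_t *mtx)
{
        int error;

        /* Allow signals to interrupt the sleep, give up after ~1 second. */
        error = _mtx_lock_sh(mtx, PCATCH, hz);
        if (error == 0) {
                /* ... read shared state ... */
                mtx_unlock(mtx);
        }
        /* error may be EINTR/ERESTART (signal) or EWOULDBLOCK (timeout). */
        return (error);
}
#endif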
/*
 * Get an exclusive spinlock the hard way.
 */
void
_mtx_spinlock(mtx_t *mtx)
{
        u_int   lock;
        u_int   nlock;
        int     bb = 1;
        int     bo;

        for (;;) {
                lock = mtx->mtx_lock;
                if (lock == 0) {
                        nlock = MTX_EXCLUSIVE | 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
                                mtx->mtx_owner = curthread;
                                break;
                        }
                } else if ((lock & MTX_EXCLUSIVE) &&
                           mtx->mtx_owner == curthread) {
                        KKASSERT((lock & MTX_MASK) != MTX_MASK);
                        nlock = lock + 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                                break;
                } else {
                        /* MWAIT here */
                        if (bb < 1000)
                                ++bb;
                        cpu_pause();
                        for (bo = 0; bo < bb; ++bo)
                                ;
                }
                cpu_pause();
        }
}

/*
 * Attempt to acquire a spinlock, if we fail we must undo the
 * gd->gd_spinlocks/gd->gd_curthread->td_critcount predisposition.
 *
 * Returns 0 on success, EAGAIN on failure.
 */
int
_mtx_spinlock_try(mtx_t *mtx)
{
        globaldata_t gd = mycpu;
        u_int   lock;
        u_int   nlock;
        int     res = 0;

        for (;;) {
                lock = mtx->mtx_lock;
                if (lock == 0) {
                        nlock = MTX_EXCLUSIVE | 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
                                mtx->mtx_owner = gd->gd_curthread;
                                break;
                        }
                } else if ((lock & MTX_EXCLUSIVE) &&
                           mtx->mtx_owner == gd->gd_curthread) {
                        KKASSERT((lock & MTX_MASK) != MTX_MASK);
                        nlock = lock + 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                                break;
                } else {
                        --gd->gd_spinlocks;
                        cpu_ccfence();
                        crit_exit_raw(gd->gd_curthread);
                        res = EAGAIN;
                        break;
                }
                cpu_pause();
        }
        return res;
}

#if 0

void
_mtx_spinlock_sh(mtx_t *mtx)
{
        u_int   lock;
        u_int   nlock;
        int     bb = 1;
        int     bo;

        for (;;) {
                lock = mtx->mtx_lock;
                if ((lock & MTX_EXCLUSIVE) == 0) {
                        KKASSERT((lock & MTX_MASK) != MTX_MASK);
                        nlock = lock + 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                                break;
                } else {
                        /* MWAIT here */
                        if (bb < 1000)
                                ++bb;
                        cpu_pause();
                        for (bo = 0; bo < bb; ++bo)
                                ;
                }
                cpu_pause();
        }
}

#endif
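
/*
 * Illustrative sketch (not compiled): the calling convention implied by the
 * comment above _mtx_spinlock_try().  The inline wrapper in sys/mutex2.h is
 * expected to predispose the state that _mtx_spinlock_try() undoes on
 * failure: enter a critical section and bump gd_spinlocks before trying the
 * lock.  The example_* identifier is a placeholder.
 */
#if 0
static int
example_spinlock_try(mtx_t *mtx)
{
        globaldata_t gd = mycpu;

        crit_enter_raw(gd->gd_curthread);
        ++gd->gd_spinlocks;
        cpu_ccfence();
        if (_mtx_spinlock_try(mtx) == 0) {
                /* Success: spinlock held, critical section remains. */
                return (0);
        }
        /* Failure: _mtx_spinlock_try() already undid the predisposition. */
        return (EAGAIN);
}
#endif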
int
_mtx_lock_ex_try(mtx_t *mtx)
{
        u_int   lock;
        u_int   nlock;
        int     error;

        for (;;) {
                lock = mtx->mtx_lock;
                if (lock == 0) {
                        nlock = MTX_EXCLUSIVE | 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
                                mtx->mtx_owner = curthread;
                                error = 0;
                                break;
                        }
                } else if ((lock & MTX_EXCLUSIVE) &&
                           mtx->mtx_owner == curthread) {
                        KKASSERT((lock & MTX_MASK) != MTX_MASK);
                        nlock = lock + 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
                                error = 0;
                                break;
                        }
                } else {
                        error = EAGAIN;
                        break;
                }
                cpu_pause();
        }
        return (error);
}

int
_mtx_lock_sh_try(mtx_t *mtx)
{
        u_int   lock;
        u_int   nlock;
        int     error = 0;

        for (;;) {
                lock = mtx->mtx_lock;
                if ((lock & MTX_EXCLUSIVE) == 0) {
                        KKASSERT((lock & MTX_MASK) != MTX_MASK);
                        nlock = lock + 1;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                                break;
                } else {
                        error = EAGAIN;
                        break;
                }
                cpu_pause();
        }
        return (error);
}
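
/*
 * Illustrative sketch (not compiled): the usual try-then-block pattern for
 * the non-blocking variants above.  mtx_unlock() is the sys/mutex2.h
 * wrapper; the example_* identifier is a placeholder.
 */
#if 0
static void
example_try_then_block(mtx_t *mtx)
{
        /*
         * Opportunistic attempt first; fall back to a blocking
         * acquisition if the lock is contended.
         */
        if (_mtx_lock_ex_try(mtx) == EAGAIN)
                _mtx_lock_ex_quick(mtx);
        /* ... modify shared state ... */
        mtx_unlock(mtx);
}
#endif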
/*
 * If the lock is held exclusively it must be owned by the caller.  If the
 * lock is already a shared lock this operation is a NOP.  A panic will
 * occur if the lock is not held either shared or exclusive.
 *
 * The exclusive count is converted to a shared count.
 */
void
_mtx_downgrade(mtx_t *mtx)
{
        u_int   lock;
        u_int   nlock;

        for (;;) {
                lock = mtx->mtx_lock;
                cpu_ccfence();

                /*
                 * NOP if already shared.
                 */
                if ((lock & MTX_EXCLUSIVE) == 0) {
                        KKASSERT((lock & MTX_MASK) > 0);
                        break;
                }

                /*
                 * Transfer count to shared.  Any additional pending shared
                 * waiters must be woken up.
                 */
                if (lock & MTX_SHWANTED) {
                        if (mtx_chain_link_sh(mtx, lock))
                                break;
                        /* retry */
                } else {
                        nlock = lock & ~MTX_EXCLUSIVE;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                                break;
                        /* retry */
                }
                cpu_pause();
        }
}

/*
 * Upgrade a shared lock to an exclusive lock.  The upgrade will fail if
 * the shared lock has a count other than 1.  Optimize the most likely case
 * but note that a single cmpset can fail due to WANTED races.
 *
 * If the lock is held exclusively it must be owned by the caller and
 * this function will simply return without doing anything.  A panic will
 * occur if the lock is held exclusively by someone other than the caller.
 *
 * Returns 0 on success, EDEADLK on failure.
 */
int
_mtx_upgrade_try(mtx_t *mtx)
{
        u_int   lock;
        u_int   nlock;
        int     error = 0;

        for (;;) {
                lock = mtx->mtx_lock;
                cpu_ccfence();

                if ((lock & ~MTX_EXWANTED) == 1) {
                        nlock = lock | MTX_EXCLUSIVE;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
                                mtx->mtx_owner = curthread;
                                break;
                        }
                } else if (lock & MTX_EXCLUSIVE) {
                        KKASSERT(mtx->mtx_owner == curthread);
                        break;
                } else {
                        error = EDEADLK;
                        break;
                }
                cpu_pause();
        }
        return (error);
}
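
/*
 * Illustrative sketch (not compiled): a read-mostly path that upgrades in
 * place when possible and otherwise reacquires exclusively.  mtx_unlock()
 * is the sys/mutex2.h wrapper; the example_* identifier is a placeholder.
 */
#if 0
static void
example_upgrade_downgrade(mtx_t *mtx)
{
        _mtx_lock_sh_quick(mtx);
        /* ... read shared state ... */
        if (_mtx_upgrade_try(mtx) == 0) {
                /* Now exclusive; modify, then hand the lock back to readers. */
                _mtx_downgrade(mtx);
        } else {
                /*
                 * EDEADLK: other shared holders exist.  Drop and reacquire
                 * exclusively; the data must be re-validated afterwards.
                 */
                mtx_unlock(mtx);
                _mtx_lock_ex_quick(mtx);
                /* ... re-check and modify ... */
                _mtx_downgrade(mtx);
        }
        /* ... continue reading ... */
        mtx_unlock(mtx);
}
#endif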
/*
 * Unlock a lock.  The caller must hold the lock either shared or exclusive.
 *
 * On the last release we handle any pending chains.
 */
void
_mtx_unlock(mtx_t *mtx)
{
        u_int   lock;
        u_int   nlock;

        for (;;) {
                lock = mtx->mtx_lock;
                cpu_ccfence();

                switch(lock) {
                case MTX_EXCLUSIVE | 1:
                        /*
                         * Last release, exclusive lock.
                         * No exclusive or shared requests pending.
                         */
                        mtx->mtx_owner = NULL;
                        nlock = 0;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                                goto done;
                        break;
                case MTX_EXCLUSIVE | MTX_EXWANTED | 1:
                case MTX_EXCLUSIVE | MTX_EXWANTED | MTX_SHWANTED | 1:
                        /*
                         * Last release, exclusive lock.
                         * Exclusive requests pending.
                         * Exclusive requests have priority over shared reqs.
                         */
                        if (mtx_chain_link_ex(mtx, lock))
                                goto done;
                        break;
                case MTX_EXCLUSIVE | MTX_SHWANTED | 1:
                        /*
                         * Last release, exclusive lock.
                         *
                         * Shared requests are pending.  Transfer our count (1)
                         * to the first shared request, wakeup all shared reqs.
                         */
                        if (mtx_chain_link_sh(mtx, lock))
                                goto done;
                        break;
                case 1:
                        /*
                         * Last release, shared lock.
                         * No exclusive or shared requests pending.
                         */
                        nlock = 0;
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                                goto done;
                        break;
                case MTX_EXWANTED | 1:
                case MTX_EXWANTED | MTX_SHWANTED | 1:
                        /*
                         * Last release, shared lock.
                         *
                         * Exclusive requests are pending.  Upgrade this
                         * final shared lock to exclusive and transfer our
                         * count (1) to the next exclusive request.
                         *
                         * Exclusive requests have priority over shared reqs.
                         */
                        if (mtx_chain_link_ex(mtx, lock))
                                goto done;
                        break;
                case MTX_SHWANTED | 1:
                        /*
                         * Last release, shared lock.
                         * Shared requests pending.
                         */
                        if (mtx_chain_link_sh(mtx, lock))
                                goto done;
                        break;
                default:
                        /*
                         * We have to loop if this is the last release but
                         * someone is fiddling with LINKSPIN.
                         */
                        if ((lock & MTX_MASK) == 1) {
                                KKASSERT(lock & MTX_LINKSPIN);
                                break;
                        }

                        /*
                         * Not the last release (shared or exclusive)
                         */
                        nlock = lock - 1;
                        KKASSERT((nlock & MTX_MASK) != MTX_MASK);
                        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                                goto done;
                        break;
                }
                /* loop try again */
                cpu_pause();
        }
done:
        ;
}
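
/*
 * Illustrative sketch (not compiled): the unlock path above dispatches
 * purely on the 32-bit lock word, whose low MTX_MASK bits hold the
 * shared/exclusive count and whose high bits are the EXCLUSIVE, EXWANTED,
 * SHWANTED and LINKSPIN flags.  A small debugging helper that decodes a
 * snapshot of the word; the example_* identifier is a placeholder.
 */
#if 0
static void
example_decode_lock_word(mtx_t *mtx)
{
        u_int lock = mtx->mtx_lock;

        kprintf("mtx %p: count=%u%s%s%s%s\n",
                mtx,
                lock & MTX_MASK,
                (lock & MTX_EXCLUSIVE) ? " EXCL" : "",
                (lock & MTX_EXWANTED)  ? " EXWANTED" : "",
                (lock & MTX_SHWANTED)  ? " SHWANTED" : "",
                (lock & MTX_LINKSPIN)  ? " LINKSPIN" : "");
}
#endif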
/*
 * Chain pending links.  Called on the last release of an exclusive or
 * shared lock when the appropriate WANTED bit is set.  mtx_lock old state
 * is passed in with the count left at 1, which we can inherit, and other
 * bits which we must adjust in a single atomic operation.
 *
 * Return non-zero on success, 0 if caller needs to retry.
 *
 * NOTE: It's ok if MTX_EXWANTED is in an indeterminate state while we are
 *	 acquiring LINKSPIN as all other cases will also need to acquire
 *	 LINKSPIN when handling the EXWANTED case.
 */
static int
mtx_chain_link_ex(mtx_t *mtx, u_int olock)
{
        thread_t td = curthread;
        mtx_link_t *link;
        u_int   nlock;

        olock &= ~MTX_LINKSPIN;
        nlock = olock | MTX_LINKSPIN | MTX_EXCLUSIVE;   /* upgrade if necessary */
        crit_enter_raw(td);
        if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
                link = mtx->mtx_exlink;
                KKASSERT(link != NULL);
                if (link->next == link) {
                        mtx->mtx_exlink = NULL;
                        nlock = MTX_LINKSPIN | MTX_EXWANTED;    /* to clear */
                } else {
                        mtx->mtx_exlink = link->next;
                        link->next->prev = link->prev;
                        link->prev->next = link->next;
                        nlock = MTX_LINKSPIN;                   /* to clear */
                }
                KKASSERT(link->state == MTX_LINK_LINKED_EX);
                mtx->mtx_owner = link->owner;
                cpu_sfence();

                /*
                 * WARNING! The callback can only be safely
                 *	    made with LINKSPIN still held
                 *	    and in a critical section.
                 *
                 * WARNING! The link can go away after the
                 *	    state is set, or after the
                 *	    callback.
                 */
                if (link->callback) {
                        link->state = MTX_LINK_CALLEDBACK;
                        link->callback(link, link->arg, 0);
                } else {
                        link->state = MTX_LINK_ACQUIRED;
                        wakeup(link);
                }
                atomic_clear_int(&mtx->mtx_lock, nlock);
                crit_exit_raw(td);
                return 1;
        }
        /* retry */
        crit_exit_raw(td);

        return 0;
}

/*
 * Flush waiting shared locks.  The lock's prior state is passed in and must
 * be adjusted atomically only if it matches and LINKSPIN is not set.
 *
 * IMPORTANT! The caller has left one active count on the lock for us to
 *	      consume.  We will apply this to the first link, but must add
 *	      additional counts for any other links.
 */
static int
mtx_chain_link_sh(mtx_t *mtx, u_int olock)
{
        thread_t td = curthread;
        mtx_link_t *link;
        u_int   addcount;
        u_int   nlock;

        olock &= ~MTX_LINKSPIN;
        nlock = olock | MTX_LINKSPIN;
        nlock &= ~MTX_EXCLUSIVE;
        crit_enter_raw(td);
        if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
                /*
                 * It should not be possible for SHWANTED to be set without
                 * any links pending.
                 */
                KKASSERT(mtx->mtx_shlink != NULL);

                /*
                 * We have to process the count for all shared locks before
                 * we process any of the links.  Count the additional shared
                 * locks beyond the first link (which is already accounted
                 * for) and associate the full count with the lock
                 * immediately.
                 */
                addcount = 0;
                for (link = mtx->mtx_shlink->next; link != mtx->mtx_shlink;
                     link = link->next) {
                        ++addcount;
                }
                if (addcount > 0)
                        atomic_add_int(&mtx->mtx_lock, addcount);

                /*
                 * We can wakeup all waiting shared locks.
                 */
                while ((link = mtx->mtx_shlink) != NULL) {
                        KKASSERT(link->state == MTX_LINK_LINKED_SH);
                        if (link->next == link) {
                                mtx->mtx_shlink = NULL;
                        } else {
                                mtx->mtx_shlink = link->next;
                                link->next->prev = link->prev;
                                link->prev->next = link->next;
                        }
                        link->next = NULL;
                        link->prev = NULL;
                        cpu_sfence();
                        if (link->callback) {
                                link->state = MTX_LINK_CALLEDBACK;
                                link->callback(link, link->arg, 0);
                        } else {
                                cpu_sfence();
                                link->state = MTX_LINK_ACQUIRED;
                                wakeup(link);
                        }
                }
                atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN |
                                                 MTX_SHWANTED);
                crit_exit_raw(td);
                return 1;
        }
        /* retry */
        crit_exit_raw(td);

        return 0;
}

/*
 * Delete a link structure after tsleep has failed.  This code is not
 * in the critical path as most exclusive waits are chained.
 */
static
void
mtx_delete_link(mtx_t *mtx, mtx_link_t *link)
{
        thread_t td = curthread;
        u_int   lock;
        u_int   nlock;

        /*
         * Acquire MTX_LINKSPIN.
         *
         * Do not use cmpxchg to wait for LINKSPIN to clear as this might
         * result in too much cpu cache traffic.
         */
        crit_enter_raw(td);
        for (;;) {
                lock = mtx->mtx_lock;
                if (lock & MTX_LINKSPIN) {
                        cpu_pause();
                        continue;
                }
                nlock = lock | MTX_LINKSPIN;
                if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                        break;
                cpu_pause();
        }

        /*
         * Delete the link and release LINKSPIN.
         */
        nlock = MTX_LINKSPIN;   /* to clear */

        switch(link->state) {
        case MTX_LINK_LINKED_EX:
                if (link->next == link) {
                        mtx->mtx_exlink = NULL;
                        nlock |= MTX_EXWANTED;  /* to clear */
                } else {
                        mtx->mtx_exlink = link->next;
                        link->next->prev = link->prev;
                        link->prev->next = link->next;
                }
                break;
        case MTX_LINK_LINKED_SH:
                if (link->next == link) {
                        mtx->mtx_shlink = NULL;
                        nlock |= MTX_SHWANTED;  /* to clear */
                } else {
                        mtx->mtx_shlink = link->next;
                        link->next->prev = link->prev;
                        link->prev->next = link->next;
                }
                break;
        default:
                /* no change */
                break;
        }
        atomic_clear_int(&mtx->mtx_lock, nlock);
        crit_exit_raw(td);
}
/*
 * Wait for async lock completion or abort.  Returns ENOLCK if an abort
 * occurred.
 */
int
mtx_wait_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
{
        thread_t td = curthread;
        int error;

        if ((mtx->mtx_flags & MTXF_NOCOLLSTATS) == 0) {
                indefinite_init(&td->td_indefinite, mtx->mtx_ident, 1,
                                ((link->state & MTX_LINK_LINKED_SH) ? 'm' : 'M'));
        }

        /*
         * Sleep.  Handle false wakeups, interruptions, etc.
         * The link may also have been aborted.  The LINKED
         * bit was set by this cpu so we can test it without
         * fences.
         */
        error = 0;
        while (link->state & MTX_LINK_LINKED) {
                tsleep_interlock(link, 0);
                cpu_lfence();
                if (link->state & MTX_LINK_LINKED) {
                        error = tsleep(link, flags | PINTERLOCKED,
                                       mtx->mtx_ident, to);
                        if (error)
                                break;
                }
                if ((mtx->mtx_flags & MTXF_NOCOLLSTATS) == 0)
                        indefinite_check(&td->td_indefinite);
        }

        /*
         * We need at least a lfence (load fence) to ensure our cpu does not
         * reorder loads (of data outside the lock structure) prior to the
         * remote cpu's release, since the above test may have run without
         * any atomic interactions.
         *
         * If we do not do this then state updated by the other cpu before
         * releasing its lock may not be read cleanly by our cpu when this
         * function returns.  Even though the other cpu ordered its stores,
         * our loads can still be out of order.
         */
        cpu_mfence();

        /*
         * We are done, make sure the link structure is unlinked.
         * It may still be on the list due to e.g. EINTR or
         * EWOULDBLOCK.
         *
         * It is possible for the tsleep to race an ABORT and cause
         * error to be 0.
         *
         * The tsleep() can be woken up for numerous reasons and error
         * might be zero in situations where we intend to return an error.
         *
         * (This is the synchronous case so state cannot be CALLEDBACK)
         */
        switch(link->state) {
        case MTX_LINK_ACQUIRED:
        case MTX_LINK_CALLEDBACK:
                error = 0;
                break;
        case MTX_LINK_ABORTED:
                error = ENOLCK;
                break;
        case MTX_LINK_LINKED_EX:
        case MTX_LINK_LINKED_SH:
                mtx_delete_link(mtx, link);
                /* fall through */
        default:
                if (error == 0)
                        error = EWOULDBLOCK;
                break;
        }

        /*
         * Clear state on status returned.
         */
        link->state = MTX_LINK_IDLE;

        if ((mtx->mtx_flags & MTXF_NOCOLLSTATS) == 0)
                indefinite_done(&td->td_indefinite);

        return error;
}
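
/*
 * Illustrative sketch (not compiled): an abortable synchronous acquisition.
 * The waiting side blocks in mtx_wait_link() via _mtx_lock_ex_link(); any
 * other thread that holds a pointer to the same mtx_link can cancel the
 * wait with mtx_abort_link() (assumed visible via sys/mutex2.h).
 * Coordinating the link's lifetime between the two threads is the caller's
 * problem; the example_* identifiers are placeholders.
 */
#if 0
static int
example_abortable_lock(mtx_t *mtx, mtx_link_t *link)
{
        int error;

        mtx_link_init(link);
        error = _mtx_lock_ex_link(mtx, link, PCATCH, 0);
        if (error == ENOLCK) {
                /* Some other thread called mtx_abort_link() on our link. */
        }
        return (error);
}

static void
example_cancel_waiter(mtx_t *mtx, mtx_link_t *link)
{
        /* Causes the blocked example_abortable_lock() to return ENOLCK. */
        mtx_abort_link(mtx, link);
}
#endif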
/*
 * Abort a mutex locking operation, causing mtx_lock_ex_link() to
 * return ENOLCK.  This may be called at any time after the mtx_link
 * is initialized or the status from a previous lock has been
 * returned.  If called prior to the next (non-try) lock attempt, the
 * next lock attempt using this link structure will abort instantly.
 *
 * Caller must still wait for the operation to complete, either from a
 * blocking call that is still in progress or by calling mtx_wait_link().
 *
 * If an asynchronous lock request is possibly in-progress, the caller
 * should call mtx_wait_link() synchronously.  Note that the asynchronous
 * lock callback will NOT be called if a successful abort occurred. XXX
 */
void
mtx_abort_link(mtx_t *mtx, mtx_link_t *link)
{
        thread_t td = curthread;
        u_int   lock;
        u_int   nlock;

        /*
         * Acquire MTX_LINKSPIN
         */
        crit_enter_raw(td);
        for (;;) {
                lock = mtx->mtx_lock;
                if (lock & MTX_LINKSPIN) {
                        cpu_pause();
                        continue;
                }
                nlock = lock | MTX_LINKSPIN;
                if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
                        break;
                cpu_pause();
        }

        /*
         * Do the abort.
         *
         * WARNING! Link structure can disappear once link->state is set.
         */
        nlock = MTX_LINKSPIN;   /* to clear */

        switch(link->state) {
        case MTX_LINK_IDLE:
                /*
                 * Link not started yet
                 */
                link->state = MTX_LINK_ABORTED;
                break;
        case MTX_LINK_LINKED_EX:
                /*
                 * de-link, mark aborted, and potentially wakeup the thread
                 * or issue the callback.
                 */
                if (link->next == link) {
                        if (mtx->mtx_exlink == link) {
                                mtx->mtx_exlink = NULL;
                                nlock |= MTX_EXWANTED;  /* to clear */
                        }
                } else {
                        if (mtx->mtx_exlink == link)
                                mtx->mtx_exlink = link->next;
                        link->next->prev = link->prev;
                        link->prev->next = link->next;
                }

                /*
                 * When aborting the async callback is still made.  We must
                 * not set the link status to ABORTED in the callback case
                 * since there is nothing else to clear its status if the
                 * link is reused.
                 */
                if (link->callback) {
                        link->state = MTX_LINK_CALLEDBACK;
                        link->callback(link, link->arg, ENOLCK);
                } else {
                        link->state = MTX_LINK_ABORTED;
                        wakeup(link);
                }
                break;
        case MTX_LINK_LINKED_SH:
                /*
                 * de-link, mark aborted, and potentially wakeup the thread
                 * or issue the callback.
                 */
                if (link->next == link) {
                        if (mtx->mtx_shlink == link) {
                                mtx->mtx_shlink = NULL;
                                nlock |= MTX_SHWANTED;  /* to clear */
                        }
                } else {
                        if (mtx->mtx_shlink == link)
                                mtx->mtx_shlink = link->next;
                        link->next->prev = link->prev;
                        link->prev->next = link->next;
                }

                /*
                 * When aborting the async callback is still made.  We must
                 * not set the link status to ABORTED in the callback case
                 * since there is nothing else to clear its status if the
                 * link is reused.
                 */
                if (link->callback) {
                        link->state = MTX_LINK_CALLEDBACK;
                        link->callback(link, link->arg, ENOLCK);
                } else {
                        link->state = MTX_LINK_ABORTED;
                        wakeup(link);
                }
                break;
        case MTX_LINK_ACQUIRED:
        case MTX_LINK_CALLEDBACK:
                /*
                 * Too late, the lock was acquired.  Let it complete.
                 */
                break;
        default:
                /*
                 * link already aborted, do nothing.
                 */
                break;
        }
        atomic_clear_int(&mtx->mtx_lock, nlock);
        crit_exit_raw(td);
}
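
/*
 * Illustrative sketch (not compiled): tearing down an asynchronous lock
 * request, following the comment above mtx_abort_link(): request the abort
 * first, then collect completion with mtx_wait_link().  The example_*
 * identifier is a placeholder.
 */
#if 0
static void
example_cancel_async_request(mtx_t *mtx, mtx_link_t *link)
{
        /*
         * Request the abort, then wait for the operation to reach a final
         * state.  For callback-style requests the callback (not this
         * return value) carries the definitive status: 0 if the lock was
         * won before the abort took effect, ENOLCK if the abort succeeded.
         */
        mtx_abort_link(mtx, link);
        (void)mtx_wait_link(mtx, link, 0, 0);
        /* link->state is now MTX_LINK_IDLE and the link may be reused. */
}
#endif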