usr/src/uts/common/vm/page_lock.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - page locking primitives
 */
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/lockstat.h>
#include <sys/sysmacros.h>
#include <sys/condvar_impl.h>
#include <vm/page.h>
#include <vm/seg_enum.h>
#include <vm/vm_dep.h>

/*
 * This global mutex is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 */
kmutex_t page_llock;

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 */
kmutex_t page_freelock;

/*
 * The hash table, page_hash[], the p_selock fields, and the
 * list of pages associated with vnodes are protected by arrays of mutexes.
 *
 * Unless the hashes are changed radically, the table sizes must be
 * a power of two.  Also, we typically need more mutexes for the
 * vnodes since these locks are occasionally held for long periods.
 * And since there seem to be two special vnodes (kvp and swapvp),
 * we make room for private mutexes for them.
 *
 * The pse_mutex[] array holds the mutexes to protect the p_selock
 * fields of all page_t structures.
 *
 * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
 * when given a pointer to a page_t.
 *
 * PIO_TABLE_SIZE must be a power of two.  One could argue that we
 * should go to the trouble of setting it up at run time and base it
 * on memory size rather than the number of compile time CPUs.
 *
 * XX64	We should be using physmem size to calculate PIO_SHIFT.
 *
 *	These might break in 64 bit world.
 */
#define	PIO_SHIFT	7	/* log2(sizeof(page_t)) */
#define	PIO_TABLE_SIZE	128	/* number of io mutexes to have */

pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
kmutex_t	pio_mutex[PIO_TABLE_SIZE];

#define	PAGE_IO_MUTEX(pp) \
	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]

/*
 * The pse_mutex[] array is allocated in the platform startup code
 * based on the size of the machine at startup.
 */
extern pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
extern size_t pse_table_size;		/* Number of mutexes in pse_mutex[] */
extern int pse_shift;			/* log2(pse_table_size) */
#define	PAGE_SE_MUTEX(pp)	&pse_mutex[				\
	((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) &	\
	(pse_table_size - 1)].pad_mutex

#define	PSZC_MTX_TABLE_SIZE	128
#define	PSZC_MTX_TABLE_SHIFT	7

static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];

#define	PAGE_SZC_MUTEX(_pp) \
	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex

/*
 * The vph_mutex[] array holds the mutexes to protect the vnode chains,
 * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
 * and p_vpnext).
 *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
 *
 * The VP_HASH_FUNC returns the index into the vph_mutex array given
 * an address of a vnode.
 *
 * XX64	VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
 *	Need to review again.
 */
#if defined(_LP64)
#define	VPH_TABLE_SIZE	(1 << (VP_SHIFT + 3))
#else	/* 32 bits */
#define	VPH_TABLE_SIZE	(2 << VP_SHIFT)
#endif

#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	    ((uintptr_t)(vp) >> 8) + \
	    ((uintptr_t)(vp) >> 10) + \
	    ((uintptr_t)(vp) >> 12)) \
	    & (VPH_TABLE_SIZE - 1))

extern	struct vnode kvp;

/*
 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
 * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
 * VPH_TABLE_SIZE + 1.
 */

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];

/*
 * Initialize the locks used by the Virtual Memory Management system.
 */
void
page_lock_init()
{
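	/*
	 * Nothing to do here: the locks in this file live in statically
	 * declared, zero-filled storage (pse_mutex[] is allocated and
	 * sized by the platform startup code), so there is no runtime
	 * setup to perform.
	 */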
}

/*
 * Return a value for pse_shift based on npg (the number of physical pages)
 * and ncpu (the maximum number of CPUs).  This is called by platform startup
 * code.
 *
 * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
 * locks grew approximately as the square of the number of threads executing.
 * So the primary scaling factor used is NCPU^2.  The size of the machine in
 * megabytes is used as an upper bound, particularly for sun4v machines which
 * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
 * (128) is used as a minimum.  Since the size of the table has to be a power
 * of two, the calculated size is rounded up to the next power of two.
 */
/*ARGSUSED*/
int
size_pse_array(pgcnt_t npg, int ncpu)
{
	size_t size;
	pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;
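
	/*
	 * Scale as NCPU^2, clamped between 128 and the machine size in
	 * megabytes, then round up to the next power of two: adding
	 * 2^(highbit - 1) - 1 bumps highbit() for any value that is not
	 * already a power of two, so highbit() - 1 of the sum is the
	 * log2 value that pse_shift wants.
	 */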
	size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
	size += (1 << (highbit(size) - 1)) - 1;
	return (highbit(size) - 1);
}

/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.  If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1
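
/*
 * p_selock thus encodes: 0 for unlocked, a positive count of SE_READER
 * units for that many shared holders, and a negative value (SE_WRITER
 * or SE_DELETED, both of which have INT_MIN set) for an exclusive
 * holder.  The SE_EWANTED bit may be OR-ed into any of these states,
 * so it is masked off before most of the comparisons below.
 */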

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function. The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define	SE_DELETED	(1 | INT_MIN)

#ifdef VM_STATS
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;
#endif /* VM_STATS */

#ifdef VM_STATS
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_retired;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

uint_t	page_trylock_locked;
uint_t	page_trylock_failed;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */

/*
 * Acquire the "shared/exclusive" lock on a page.
 *
 * Returns 1 on success and locks the page appropriately.
 * 0 on failure and does not lock the page.
 *
 * If `lock' is non-NULL, it will be dropped and reacquired in the
 * failure case.  This routine can block, and if it does
 * it will always return a failure since the page identity [vp, off]
 * or state may have changed.
 */
int
page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
{
	return (page_lock_es(pp, se, lock, reclaim, 0));
}

/*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may prevent shared-lock
 * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
 * In this case, when an exclusive lock cannot be acquired, p_selock's
 * SE_EWANTED bit is set. Shared-lock (reader) requests are also denied
 * if the page is slated for retirement.
 *
 * The se and es parameters determine if the lock should be granted
 * based on the following decision table:
 *
 * Lock wanted   es flags      p_selock/SE_EWANTED  Action
 * ----------- -------------- -------------------   ---------
 * SE_EXCL        any [1][2]      unlocked/any      grant lock, clear SE_EWANTED
 * SE_EXCL        SE_EWANTED      any lock/any      deny, set SE_EWANTED
 * SE_EXCL        none            any lock/any      deny
 * SE_SHARED      n/a [2]           shared/0        grant
 * SE_SHARED      n/a [2]         unlocked/0        grant
 * SE_SHARED      n/a               shared/1        deny
 * SE_SHARED      n/a             unlocked/1        deny
 * SE_SHARED      n/a                 excl/any      deny
 *
 * Notes:
 * [1] The code grants an exclusive lock to the caller and clears the bit
 *   SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 *   bit's value. This was deemed acceptable as we are not concerned about
 *   exclusive-lock starvation. If this ever becomes an issue, a priority or
 *   fifo mechanism should also be implemented. Meantime, the thread that
 *   set SE_EWANTED should be prepared to catch this condition and reset it.
 *
 * [2] Retired pages may not be locked at any time, regardless of the
 *   disposition of se, unless the es parameter has SE_RETIRED flag set.
 *
 * Notes on values of "es":
 *
 * es & 1: page_lookup_create will attempt page relocation
 * es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete
 * memory thread); this prevents reader-starvation of waiting
 * writer thread(s) by giving priority to writers over readers.
 * es & SE_RETIRED: caller wants to lock pages even if they are
 * retired.  Default is to deny the lock if the page is retired.
 *
 * And yes, we know, the semantics of this function are too complicated.
 * It's on the list to be cleaned up.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}
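
	/*
	 * es == 1 means only the relocation flag is set (see "es & 1"
	 * in the block comment above): page_lookup_create may attempt
	 * page relocation, so take a free, unlocked page exclusively
	 * rather than shared.
	 */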
	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, force the upgrade now.
			 * Again, we will fail to acquire p_selock if the
			 * page is not free and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		retval = 0;
		if (pp->p_selock >= 0) {
			if ((pp->p_selock & SE_EWANTED) == 0) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured, since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation) we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (ie, it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE:page_reclaim() releases the page lock (p_selock)
		 *	if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}

/*
 * Clear the SE_EWANTED bit from p_selock.  This function allows
 * callers of page_lock_es and page_try_reclaim_lock to clear
 * their setting of this bit if they decide they no longer wish
 * to gain exclusive access to the page.  Currently only
 * delete_memory_thread uses this when the delete memory
 * operation is cancelled.
 */
void
page_lock_clr_exclwanted(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	pp->p_selock &= ~SE_EWANTED;
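	/* wake any readers that were being held off only by SE_EWANTED */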
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

/*
 * Read the comments inside of page_lock_es() carefully.
 *
 * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 * This is used by threads subject to reader-starvation (eg. memory delete).
 *
 * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 * it is expected that it will retry at a later time.  Threads that will
 * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 * the bit is cleared.)
 */
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && old == 0) {
		se = SE_EXCL;
	}

	if (se == SE_SHARED) {
		if (!PP_ISFREE(pp)) {
			if (old >= 0) {
				/*
				 * Readers are not allowed when excl wanted
				 */
				if ((old & SE_EWANTED) == 0) {
					pp->p_selock = old + SE_READER;
					mutex_exit(pse);
					return (1);
				}
			}
			mutex_exit(pse);
			return (0);
		}
		/*
		 * The page is free, so we really want SE_EXCL (below)
		 */
		VM_STAT_ADD(page_try_reclaim_upgrade);
	}

	/*
	 * The caller wants a writer lock.  We try for it only if
	 * SE_EWANTED is not set, or if the caller specified
	 * SE_EXCL_WANTED.
	 */
	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
		if ((old & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears out our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}

	if (es & SE_EXCL_WANTED) {
		/* page is locked, set the SE_EWANTED bit */
		pp->p_selock |= SE_EWANTED;
	}
	mutex_exit(pse);
	return (0);
}

/*
 * Acquire a page's "shared/exclusive" lock, but never block.
 * Returns 1 on success, 0 on failure.
 */
int
page_trylock(page_t *pp, se_t se)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
	    (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
		/*
		 * Fail if another thread wants exclusive access
		 * (SE_EWANTED), if the page is retired, or if a shared
		 * lock is requested on a page slated for retirement.
		 */
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_EXCL) {
		if (pp->p_selock == 0) {
			THREAD_KPRI_REQUEST();
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	} else {
		if (pp->p_selock >= 0) {
			pp->p_selock += SE_READER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}

/*
 * Variant of page_unlock() specifically for the page freelist
 * code.  The mere existence of this code is a vile hack that
 * results from the backwards locking order of the page
 * freelist manager; please don't call it.
 */
void
page_unlock_nocapture(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
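
	/*
	 * Dispatch on the encoding of old (SE_EWANTED is masked off
	 * first, since it can ride along with any state): exactly one
	 * reader, the SE_DELETED marker, a writer (negative), several
	 * readers, or not locked at all, which is a panic.
	 */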
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
	} else if (old < 0) {
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock_nocapture: page %p is not locked",
		    (void *)pp);
	}

	mutex_exit(pse);
}

/*
 * Release the page's "shared/exclusive" lock and wake up anyone
 * who might be waiting for it.
 */
void
page_unlock(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock: page %p is deleted", (void *)pp);
	} else if (old < 0) {
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock: page %p is not locked", (void *)pp);
	}

	if (pp->p_selock == 0) {
		/*
		 * If the T_CAPTURING bit is set, that means that we should
		 * not try and capture the page again as we could recurse
		 * which could lead to a stack overflow panic or spending a
		 * relatively long time in the kernel making no progress.
		 */
		if ((pp->p_toxic & PR_CAPTURE) &&
		    !(curthread->t_flag & T_CAPTURING) &&
		    !PP_RETIRED(pp)) {
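			/*
			 * Retake the lock exclusively so that
			 * page_unlock_capture() is handed a locked page.
			 */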
			THREAD_KPRI_REQUEST();
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			page_unlock_capture(pp);
		} else {
			mutex_exit(pse);
		}
	} else {
		mutex_exit(pse);
	}
}

/*
 * Try to upgrade the lock on the page from a "shared" to an
 * "exclusive" lock.  Since this upgrade operation is done while
 * holding the mutex protecting this page, no one else can acquire this page's
 * lock and change the page. Thus, it is safe to drop the "shared"
 * lock and attempt to acquire the "exclusive" lock.
 *
 * Returns 1 on success, 0 on failure.
 */
int
page_tryupgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (!(pp->p_selock & SE_EWANTED)) {
		/* no threads want exclusive access, try upgrade */
		if (pp->p_selock == SE_READER) {
			THREAD_KPRI_REQUEST();
			/* convert to exclusive lock */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}

/*
 * Downgrade the "exclusive" lock on the page to a "shared" lock
 * while holding the mutex protecting this page's p_selock field.
 */
void
page_downgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	int excl_waiting;

	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
	ASSERT(PAGE_EXCL(pp));

	mutex_enter(pse);
	excl_waiting = pp->p_selock & SE_EWANTED;
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_READER | excl_waiting;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

void
page_lock_delete(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_vnode == NULL);
	ASSERT(pp->p_offset == (u_offset_t)-1);
	ASSERT(!PP_ISFREE(pp));

	mutex_enter(pse);
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_DELETED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

int
page_deleted(page_t *pp)
{
	return (pp->p_selock == SE_DELETED);
}

/*
 * Implement the io lock for pages
 */
void
page_iolock_init(page_t *pp)
{
	pp->p_iolock_state = 0;
	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
}

/*
 * Acquire the i/o lock on a page.
 */
void
page_io_lock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Release the i/o lock on a page.
 */
void
page_io_unlock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	cv_broadcast(&pp->p_io_cv);
	pp->p_iolock_state &= ~PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Try to acquire the i/o lock on a page without blocking.
 * Returns 1 on success, 0 on failure.
 */
int
page_io_trylock(page_t *pp)
{
	kmutex_t *pio;
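
	/*
	 * Cheap unlocked peek first, so a clearly busy lock fails
	 * without touching the mutex; the state is then re-checked
	 * under pio, since it may change before the mutex is taken.
	 */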
	if (pp->p_iolock_state & PAGE_IO_INUSE)
		return (0);

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);

	if (pp->p_iolock_state & PAGE_IO_INUSE) {
		mutex_exit(pio);
		return (0);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);

	return (1);
}

/*
 * Wait until the i/o lock is not held.
 */
void
page_io_wait(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	mutex_exit(pio);
}

/*
 * Returns non-zero if the i/o lock on the page is held.
 */
int
page_io_locked(page_t *pp)
{
	return (pp->p_iolock_state & PAGE_IO_INUSE);
}

/*
 * For use in ASSERT()s that the i/o lock on a page is held:
 * returns non-zero if it is, 0 if it is not.
 */
int
page_iolock_assert(page_t *pp)
{
	return (page_io_locked(pp));
}

/*
 * Wrapper exported to kernel routines that are built
 * platform-independent (the macro is platform-dependent;
 * the size of vph_mutex[] is based on NCPU).
 *
 * Note that you can do stress testing on this by setting the
 * variable page_vnode_mutex_stress to something other than
 * zero in a DEBUG kernel from a debugger when the kernel is loaded;
 * setting it once the kernel is up and running may not work correctly.
 */
#ifdef	DEBUG
static int page_vnode_mutex_stress = 0;
#endif

kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
	if (vp == &kvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 0]);

	if (vp == &zvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
#ifdef	DEBUG
	if (page_vnode_mutex_stress != 0)
		return (&vph_mutex[0]);
#endif

	return (&vph_mutex[VP_HASH_FUNC(vp)]);
}

kmutex_t *
page_se_mutex(page_t *pp)
{
	return (PAGE_SE_MUTEX(pp));
}

#ifdef VM_STATS
uint_t pszclck_stat[4];
#endif

/*
 * Find, take and return a mutex held by hat_page_demote().
 * Called by page_demote_vp_pages() before hat_page_demote() call and by
 * routines that want to block hat_page_demote() but can't do it
 * via locking all constituent pages.
 *
 * Return NULL if p_szc is 0.
 *
 * It should only be used for pages that can be demoted by hat_page_demote()
 * i.e. non swapfs file system pages.  The logic here is lifted from
 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 * since the page is locked and not free.
 *
 * Hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the location
 * of the root this routine relies on the fact that hat_page_demote() changes
 * root last.
 *
 * If NULL is returned pp's p_szc is guaranteed to be 0. If non NULL is
 * returned pp's p_szc may be any value.
 */
kmutex_t *
page_szc_lock(page_t *pp)
{
	kmutex_t	*mtx;
	page_t		*rootpp;
	uint_t		szc;
	uint_t		rszc;
	uint_t		pszc = pp->p_szc;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_vnode != NULL);
	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
	ASSERT(!PP_ISKAS(pp));

again:
	if (pszc == 0) {
		VM_STAT_ADD(pszclck_stat[0]);
		return (NULL);
	}

	/* The lock lives in the root page */

	rootpp = PP_GROUPLEADER(pp, pszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);

	/*
	 * Since p_szc can only decrease, if pp == rootpp then rootpp
	 * will always be the right root, regardless of rootpp->p_szc.
	 * Otherwise, if the location of pp's root didn't change after
	 * we took the lock, we have the right root; return the mutex
	 * hashed off of it.
	 */
	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
		VM_STAT_ADD(pszclck_stat[1]);
		return (mtx);
	}

	/*
	 * root location changed because page got demoted.
	 * locate the new root.
	 */
	if (rszc < pszc) {
		szc = pp->p_szc;
		ASSERT(szc < pszc);
		mutex_exit(mtx);
		pszc = szc;
		VM_STAT_ADD(pszclck_stat[2]);
		goto again;
	}

	VM_STAT_ADD(pszclck_stat[3]);
	/*
	 * current hat_page_demote not done yet.
	 * wait for it to finish.
	 */
	mutex_exit(mtx);
	rootpp = PP_GROUPLEADER(rootpp, rszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
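	/*
	 * Entering and immediately exiting the root's mutex simply
	 * blocks until the demoting thread, which holds it, drops it.
	 */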
	mutex_enter(mtx);
	mutex_exit(mtx);
	ASSERT(rootpp->p_szc < rszc);
	goto again;
}

int
page_szc_lock_assert(page_t *pp)
{
	page_t *rootpp = PP_PAGEROOT(pp);
	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);

	return (MUTEX_HELD(mtx));
}

/*
 * memseg locking
 */
static krwlock_t memsegslock;

/*
 * memlist (phys_install, phys_avail) locking.
 */
static krwlock_t memlists_lock;

void
memsegs_lock(int writer)
{
	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
}

/*ARGSUSED*/
void
memsegs_unlock(int writer)
{
	rw_exit(&memsegslock);
}

int
memsegs_lock_held(void)
{
	return (RW_LOCK_HELD(&memsegslock));
}

void
memlist_read_lock(void)
{
	rw_enter(&memlists_lock, RW_READER);
}

void
memlist_read_unlock(void)
{
	rw_exit(&memlists_lock);
}

void
memlist_write_lock(void)
{
	rw_enter(&memlists_lock, RW_WRITER);
}

void
memlist_write_unlock(void)
{
	rw_exit(&memlists_lock);
}