kernel/vm/page_lock.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24
  25
  26 /*
  27  * VM - page locking primitives
  28  */
  29 #include <sys/param.h>
  30 #include <sys/t_lock.h>
  31 #include <sys/vtrace.h>
  32 #include <sys/debug.h>
  33 #include <sys/cmn_err.h>
  34 #include <sys/bitmap.h>
  35 #include <sys/lockstat.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/condvar_impl.h>
  38 #include <vm/page.h>
  39 #include <vm/seg_enum.h>
  40 #include <vm/vm_dep.h>
  41 #include <vm/seg_kmem.h>
  42
/*
 * This global mutex array is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *      p_lckcnt
 *      p_cowcnt
 *
 * The array holds 8 * NCPU_P2 padded mutexes.
 * NOTE(review): how a page is mapped to a slot is not visible in this
 * part of the file -- confirm against the users of page_llocks[].
 */
pad_mutex_t page_llocks[8 * NCPU_P2];
  52
/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 *
 * NOTE(review): none of the routines visible in this part of the file
 * take this lock; its users live elsewhere.
 */
kmutex_t  page_freelock;
  59
  60 /*
  61  * The hash table, page_hash[], the p_selock fields, and the
  62  * list of pages associated with vnodes are protected by arrays of mutexes.
  63  *
  64  * Unless the hashes are changed radically, the table sizes must be
  65  * a power of two.  Also, we typically need more mutexes for the
  66  * vnodes since these locks are occasionally held for long periods.
  67  * And since there seem to be two special vnodes (kvp and swapvp),
  68  * we make room for private mutexes for them.
  69  *
  70  * The pse_mutex[] array holds the mutexes to protect the p_selock
  71  * fields of all page_t structures.
  72  *
  73  * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
  74  * when given a pointer to a page_t.
  75  *
  76  * PIO_TABLE_SIZE must be a power of two.  One could argue that we
  77  * should go to the trouble of setting it up at run time and base it
  78  * on memory size rather than the number of compile time CPUs.
  79  *
  80  * XX64 We should be using physmem size to calculate PIO_SHIFT.
  81  *
  82  *      These might break in 64 bit world.
  83  */
  84 #define PIO_SHIFT       7       /* log2(sizeof(page_t)) */
  85 #define PIO_TABLE_SIZE  128     /* number of io mutexes to have */
  86
  87 kmutex_t        pio_mutex[PIO_TABLE_SIZE];
  88
  89 #define PAGE_IO_MUTEX(pp) \
  90             &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
  91
  92 /*
  93  * The pse_mutex[] array is allocated in the platform startup code
  94  * based on the size of the machine at startup.
  95  */
  96 extern pad_mutex_t *pse_mutex;          /* Locks protecting pp->p_selock */
  97 extern size_t pse_table_size;           /* Number of mutexes in pse_mutex[] */
  98 extern int pse_shift;                   /* log2(pse_table_size) */
  99 #define PAGE_SE_MUTEX(pp)       &pse_mutex[                             \
 100         ((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) &   \
 101         (pse_table_size - 1)].pad_mutex
 102
 103 #define PSZC_MTX_TABLE_SIZE     128
 104 #define PSZC_MTX_TABLE_SHIFT    7
 105
 106 static pad_mutex_t      pszc_mutex[PSZC_MTX_TABLE_SIZE];
 107
 108 #define PAGE_SZC_MUTEX(_pp) \
 109             &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
 110                 ((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
 111                 ((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
 112                 (PSZC_MTX_TABLE_SIZE - 1))].pad_mutex
 113
 114 /*
 115  * Initialize the locks used by the Virtual Memory Management system.
 116  */
 117 void
 118 page_lock_init()
 119 {
 120 }
 121
 122 /*
 123  * Return a value for pse_shift based on npg (the number of physical pages)
 124  * and ncpu (the maximum number of CPUs).  This is called by platform startup
 125  * code.
 126  *
 127  * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
 128  * locks grew approximately as the square of the number of threads executing.
 129  * So the primary scaling factor used is NCPU^2.  The size of the machine in
 130  * megabytes is used as an upper bound, particularly for sun4v machines which
 131  * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
 132  * (128) is used as a minimum.  Since the size of the table has to be a power
 133  * of two, the calculated size is rounded up to the next power of two.
 134  */
 135 /*ARGSUSED*/
 136 int
 137 size_pse_array(pgcnt_t npg, int ncpu)
 138 {
 139         size_t size;
 140         pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;
 141
 142         size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
 143         size += (1 << (highbit(size) - 1)) - 1;
 144         return (highbit(size) - 1);
 145 }
 146
 147 /*
 148  * At present we only use page ownership to aid debugging, so it's
 149  * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 150  * can map to the same owner because we just 'or' in 0x80000000 and
 151  * then clear the second highest bit, so that (for example) 0x2faced00
 152  * and 0xafaced00 both map to 0xafaced00.
 153  * In the 64-bit world, p_selock may not be large enough to hold a full
 154  * thread pointer.  If we ever need precise ownership (e.g. if we implement
 155  * priority inheritance for page locks) then p_selock should become a
 156  * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 157  */
/*
 * Value stored in p_selock by an exclusive holder: the current thread
 * pointer truncated to selock_t, with the sign bit (INT_MIN) forced on and
 * the SE_EWANTED bit forced off.
 */
#define SE_WRITER       (((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
/* Amount added to p_selock for each shared holder. */
#define SE_READER       1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function. The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
/* Sign bit plus bit 0; negative like a writer value. */
#define SE_DELETED      (1 | INT_MIN)
 168
#ifdef VM_STATS
/*
 * Event counters, only compiled in when VM_STATS is defined.  The locking
 * routines in this file update them through VM_STAT_ADD() and
 * VM_STAT_COND_ADD().
 */

/* NOTE(review): the vph_* counters are not updated in this part of the file. */
uint_t  vph_kvp_count;
uint_t  vph_swapfsvp_count;
uint_t  vph_other;

/* page_lock_es(): calls, misses, reclaim results, upgrades, refusals */
uint_t  page_lock_count;
uint_t  page_lock_miss;
uint_t  page_lock_miss_lock;
uint_t  page_lock_reclaim;
uint_t  page_lock_bad_reclaim;
uint_t  page_lock_same_page;
uint_t  page_lock_upgrade;
uint_t  page_lock_retired;
uint_t  page_lock_upgrade_failed;
uint_t  page_lock_deleted;

/* page_trylock_failed is bumped by page_trylock()/page_try_reclaim_lock() */
uint_t  page_trylock_locked;
uint_t  page_trylock_failed;
uint_t  page_trylock_missed;

/* shared request on a free page in page_try_reclaim_lock() */
uint_t  page_try_reclaim_upgrade;
#endif /* VM_STATS */
 191
 192 /*
 193  * Acquire the "shared/exclusive" lock on a page.
 194  *
 195  * Returns 1 on success and locks the page appropriately.
 196  *         0 on failure and does not lock the page.
 197  *
 198  * If `lock' is non-NULL, it will be dropped and reacquired in the
 199  * failure case.  This routine can block, and if it does
 200  * it will always return a failure since the page identity [vp, off]
 201  * or state may have changed.
 202  */
 203
 204 int
 205 page_lock(page_t *pp, se_t se, vnode_t *vnode, reclaim_t reclaim)
 206 {
 207         return (page_lock_es(pp, se, vnode, reclaim, 0));
 208 }
 209
 210 /*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may avoid being starved by
 * shared-lock (reader) holders by setting the es parameter to SE_EXCL_WANTED.
 214  * In this case, when an exclusive lock cannot be acquired, p_selock's
 215  * SE_EWANTED bit is set. Shared-lock (reader) requests are also denied
 216  * if the page is slated for retirement.
 217  *
 218  * The se and es parameters determine if the lock should be granted
 219  * based on the following decision table:
 220  *
 221  * Lock wanted   es flags     p_selock/SE_EWANTED  Action
 222  * ----------- -------------- -------------------  ---------
 223  * SE_EXCL        any [1][2]   unlocked/any        grant lock, clear SE_EWANTED
 224  * SE_EXCL        SE_EWANTED   any lock/any        deny, set SE_EWANTED
 225  * SE_EXCL        none         any lock/any        deny
 226  * SE_SHARED      n/a [2]        shared/0          grant
 227  * SE_SHARED      n/a [2]      unlocked/0          grant
 228  * SE_SHARED      n/a            shared/1          deny
 229  * SE_SHARED      n/a          unlocked/1          deny
 230  * SE_SHARED      n/a              excl/any        deny
 231  *
 232  * Notes:
 233  * [1] The code grants an exclusive lock to the caller and clears the bit
 234  *   SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 235  *   bit's value.  This was deemed acceptable as we are not concerned about
 236  *   exclusive-lock starvation. If this ever becomes an issue, a priority or
 237  *   fifo mechanism should also be implemented. Meantime, the thread that
 238  *   set SE_EWANTED should be prepared to catch this condition and reset it
 239  *
 * [2] Retired pages may not be locked at any time, regardless of the
 *   disposition of se, unless the es parameter has SE_RETIRED flag set.
 242  *
 243  * Notes on values of "es":
 244  *
 245  *   es & 1: page_lookup_create will attempt page relocation
 246  *   es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete
 247  *       memory thread); this prevents reader-starvation of waiting
 248  *       writer thread(s) by giving priority to writers over readers.
 249  *   es & SE_RETIRED: caller wants to lock pages even if they are
 250  *       retired.  Default is to deny the lock if the page is retired.
 251  *
 252  * And yes, we know, the semantics of this function are too complicated.
 253  * It's on the list to be cleaned up.
 254  */
int
page_lock_es(page_t *pp, se_t se, vnode_t *vnode, reclaim_t reclaim, int es)
{
        int             retval;
        kmutex_t        *pse = PAGE_SE_MUTEX(pp);
        int             upgraded;
        int             reclaim_it;

        /* A caller that passes a vnode must already hold its page mutex. */
        ASSERT(vnode != NULL ? MUTEX_HELD(page_vnode_mutex(vnode)) : 1);

        VM_STAT_ADD(page_lock_count);

        upgraded = 0;
        reclaim_it = 0;

        mutex_enter(pse);

        /* SE_EXCL_WANTED may only accompany an SE_EXCL request. */
        ASSERT(((es & SE_EXCL_WANTED) == 0) ||
            ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

        /* Retired pages are refused unless the caller passed SE_RETIRED. */
        if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
                mutex_exit(pse);
                VM_STAT_ADD(page_lock_retired);
                return (0);
        }

        /*
         * A shared request made with es == 1 on a page whose p_selock is
         * completely clear (no holders, SE_EWANTED not set) is turned
         * into an exclusive request.
         */
        if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
                se = SE_EXCL;
        }

        if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

                reclaim_it = 1;
                if (se == SE_SHARED) {
                        /*
                         * This is an interesting situation.
                         *
                         * Remember that p_free can only change if
                         * p_selock < 0.
                         * p_free does not depend on our holding `pse'.
                         * And, since we hold `pse', p_selock can not change.
                         * So, if p_free changes on us, the page is already
                         * exclusively held, and we would fail to get p_selock
                         * regardless.
                         *
                         * We want to avoid getting the share
                         * lock on a free page that needs to be reclaimed.
                         * It is possible that some other thread has the share
                         * lock and has left the free page on the cache list.
                         * pvn_vplist_dirty() does this for brief periods.
                         * If the se_share is currently SE_EXCL, we will fail
                         * to acquire p_selock anyway.  Blocking is the
                         * right thing to do.
                         * If we need to reclaim this page, we must get
                         * exclusive access to it, force the upgrade now.
                         * Again, we will fail to acquire p_selock if the
                         * page is not free and block.
                         */
                        upgraded = 1;
                        se = SE_EXCL;
                        VM_STAT_ADD(page_lock_upgrade);
                }
        }

        if (se == SE_EXCL) {
                if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
                        /*
                         * if the caller wants a writer lock (but did not
                         * specify exclusive access), and there is a pending
                         * writer that wants exclusive access, return failure
                         */
                        retval = 0;
                } else if ((pp->p_selock & ~SE_EWANTED) == 0) {
                        /* no reader/writer lock held */
                        THREAD_KPRI_REQUEST();
                        /* this clears our setting of the SE_EWANTED bit */
                        pp->p_selock = SE_WRITER;
                        retval = 1;
                } else {
                        /* page is locked */
                        if (es & SE_EXCL_WANTED) {
                                /* set the SE_EWANTED bit */
                                pp->p_selock |= SE_EWANTED;
                        }
                        retval = 0;
                }
        } else {
                /*
                 * Shared request: granted only when there is no exclusive
                 * holder (p_selock >= 0) and no writer has announced
                 * itself through SE_EWANTED.
                 */
                retval = 0;
                if (pp->p_selock >= 0) {
                        if ((pp->p_selock & SE_EWANTED) == 0) {
                                pp->p_selock += SE_READER;
                                retval = 1;
                        }
                }
        }

        if (retval == 0) {
                /*
                 * The lock was not granted.  For a page marked SE_DELETED
                 * fail right away: no sleep, and the vnode page mutex is
                 * left held.
                 */
                if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
                        VM_STAT_ADD(page_lock_deleted);
                        mutex_exit(pse);
                        return (retval);
                }

                VM_STAT_ADD(page_lock_miss);
                VM_STAT_COND_ADD(upgraded, page_lock_upgrade_failed);

                /* Do not sleep while holding the vnode page mutex. */
                if (vnode != NULL) {
                        VM_STAT_ADD(page_lock_miss_lock);
                        mutex_exit(page_vnode_mutex(vnode));
                }

                /*
                 * Now, wait for the page to be unlocked and
                 * release the lock protecting p_cv and p_selock.
                 */
                cv_wait(&pp->p_cv, pse);
                mutex_exit(pse);

                /*
                 * The page identity may have changed while we were
                 * blocked.  If we are willing to depend on "pp"
                 * still pointing to a valid page structure (i.e.,
                 * assuming page structures are not dynamically allocated
                 * or freed), we could try to lock the page if its
                 * identity hasn't changed.
                 *
                 * This needs to be measured, since we come back from
                 * cv_wait holding pse (the expensive part of this
                 * operation) we might as well try the cheap part.
                 * Though we would also have to confirm that dropping
                 * vnode page lock did not cause any grief to the callers.
                 */
                /*
                 * Reacquire the vnode page mutex for the caller; retval
                 * is still 0, so having slept is reported as a failure.
                 */
                if (vnode != NULL)
                        mutex_enter(page_vnode_mutex(vnode));
        } else {
                /*
                 * We have the page lock.
                 * If we needed to reclaim the page, and the page
                 * needed reclaiming (ie, it was free), then we
                 * have the page exclusively locked.  We may need
                 * to downgrade the page.
                 */
                ASSERT((upgraded) ?
                    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
                mutex_exit(pse);

                /*
                 * We now hold this page's lock, either shared or
                 * exclusive.  This will prevent its identity from changing.
                 * The page, however, may or may not be free.  If the caller
                 * requested, and it is free, go reclaim it from the
                 * free list.  If the page can't be reclaimed, return failure
                 * so that the caller can start all over again.
                 *
                 * NOTE:page_reclaim() releases the page lock (p_selock)
                 *      if it can't be reclaimed.
                 */
                if (reclaim_it) {
                        if (!page_reclaim(pp, vnode)) {
                                VM_STAT_ADD(page_lock_bad_reclaim);
                                retval = 0;
                        } else {
                                VM_STAT_ADD(page_lock_reclaim);
                                /*
                                 * The caller asked for a shared lock;
                                 * give back the forced exclusive one.
                                 */
                                if (upgraded) {
                                        page_downgrade(pp);
                                }
                        }
                }
        }
        return (retval);
}
 426
 427 /*
 428  * Clear the SE_EWANTED bit from p_selock.  This function allows
 429  * callers of page_lock_es and page_try_reclaim_lock to clear
 430  * their setting of this bit if they decide they no longer wish
 431  * to gain exclusive access to the page.  Currently only
 432  * delete_memory_thread uses this when the delete memory
 433  * operation is cancelled.
 434  */
 435 void
 436 page_lock_clr_exclwanted(page_t *pp)
 437 {
 438         kmutex_t *pse = PAGE_SE_MUTEX(pp);
 439
 440         mutex_enter(pse);
 441         pp->p_selock &= ~SE_EWANTED;
 442         if (CV_HAS_WAITERS(&pp->p_cv))
 443                 cv_broadcast(&pp->p_cv);
 444         mutex_exit(pse);
 445 }
 446
 447 /*
 448  * Read the comments inside of page_lock_es() carefully.
 449  *
 450  * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 451  * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 452  * This is used by threads subject to reader-starvation (eg. memory delete).
 453  *
 454  * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 455  * it is expected that it will retry at a later time.  Threads that will
 456  * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 457  * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 458  * the bit is cleared.)
 459  */
 460 int
 461 page_try_reclaim_lock(page_t *pp, se_t se, int es)
 462 {
 463         kmutex_t *pse = PAGE_SE_MUTEX(pp);
 464         selock_t old;
 465
 466         mutex_enter(pse);
 467
 468         old = pp->p_selock;
 469
 470         ASSERT(((es & SE_EXCL_WANTED) == 0) ||
 471             ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
 472
 473         if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
 474                 mutex_exit(pse);
 475                 VM_STAT_ADD(page_trylock_failed);
 476                 return (0);
 477         }
 478
 479         if (se == SE_SHARED && es == 1 && old == 0) {
 480                 se = SE_EXCL;
 481         }
 482
 483         if (se == SE_SHARED) {
 484                 if (!PP_ISFREE(pp)) {
 485                         if (old >= 0) {
 486                                 /*
 487                                  * Readers are not allowed when excl wanted
 488                                  */
 489                                 if ((old & SE_EWANTED) == 0) {
 490                                         pp->p_selock = old + SE_READER;
 491                                         mutex_exit(pse);
 492                                         return (1);
 493                                 }
 494                         }
 495                         mutex_exit(pse);
 496                         return (0);
 497                 }
 498                 /*
 499                  * The page is free, so we really want SE_EXCL (below)
 500                  */
 501                 VM_STAT_ADD(page_try_reclaim_upgrade);
 502         }
 503
 504         /*
 505          * The caller wants a writer lock.  We try for it only if
 506          * SE_EWANTED is not set, or if the caller specified
 507          * SE_EXCL_WANTED.
 508          */
 509         if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
 510                 if ((old & ~SE_EWANTED) == 0) {
 511                         /* no reader/writer lock held */
 512                         THREAD_KPRI_REQUEST();
 513                         /* this clears out our setting of the SE_EWANTED bit */
 514                         pp->p_selock = SE_WRITER;
 515                         mutex_exit(pse);
 516                         return (1);
 517                 }
 518         }
 519         if (es & SE_EXCL_WANTED) {
 520                 /* page is locked, set the SE_EWANTED bit */
 521                 pp->p_selock |= SE_EWANTED;
 522         }
 523         mutex_exit(pse);
 524         return (0);
 525 }
 526
 527 /*
 528  * Acquire a page's "shared/exclusive" lock, but never block.
 529  * Returns 1 on success, 0 on failure.
 530  */
 531 int
 532 page_trylock(page_t *pp, se_t se)
 533 {
 534         kmutex_t *pse = PAGE_SE_MUTEX(pp);
 535
 536         mutex_enter(pse);
 537         if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
 538             (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
 539                 /*
 540                  * Fail if a thread wants exclusive access and page is
 541                  * retired, if the page is slated for retirement, or a
 542                  * share lock is requested.
 543                  */
 544                 mutex_exit(pse);
 545                 VM_STAT_ADD(page_trylock_failed);
 546                 return (0);
 547         }
 548
 549         if (se == SE_EXCL) {
 550                 if (pp->p_selock == 0) {
 551                         THREAD_KPRI_REQUEST();
 552                         pp->p_selock = SE_WRITER;
 553                         mutex_exit(pse);
 554                         return (1);
 555                 }
 556         } else {
 557                 if (pp->p_selock >= 0) {
 558                         pp->p_selock += SE_READER;
 559                         mutex_exit(pse);
 560                         return (1);
 561                 }
 562         }
 563         mutex_exit(pse);
 564         return (0);
 565 }
 566
 567 /*
 568  * Variant of page_unlock() specifically for the page freelist
 569  * code. The mere existence of this code is a vile hack that
 570  * has resulted due to the backwards locking order of the page
 571  * freelist manager; please don't call it.
 572  */
 573 void
 574 page_unlock_nocapture(page_t *pp)
 575 {
 576         kmutex_t *pse = PAGE_SE_MUTEX(pp);
 577         selock_t old;
 578
 579         mutex_enter(pse);
 580
 581         old = pp->p_selock;
 582         if ((old & ~SE_EWANTED) == SE_READER) {
 583                 pp->p_selock = old & ~SE_READER;
 584                 if (CV_HAS_WAITERS(&pp->p_cv))
 585                         cv_broadcast(&pp->p_cv);
 586         } else if ((old & ~SE_EWANTED) == SE_DELETED) {
 587                 panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
 588         } else if (old < 0) {
 589                 THREAD_KPRI_RELEASE();
 590                 pp->p_selock &= SE_EWANTED;
 591                 if (CV_HAS_WAITERS(&pp->p_cv))
 592                         cv_broadcast(&pp->p_cv);
 593         } else if ((old & ~SE_EWANTED) > SE_READER) {
 594                 pp->p_selock = old - SE_READER;
 595         } else {
 596                 panic("page_unlock_nocapture: page %p is not locked",
 597                     (void *)pp);
 598         }
 599
 600         mutex_exit(pse);
 601 }
 602
 603 /*
 604  * Release the page's "shared/exclusive" lock and wake up anyone
 605  * who might be waiting for it.
 606  */
 607 void
 608 page_unlock(page_t *pp)
 609 {
 610         kmutex_t *pse = PAGE_SE_MUTEX(pp);
 611         selock_t old;
 612
 613         mutex_enter(pse);
 614
 615         old = pp->p_selock;
 616         if ((old & ~SE_EWANTED) == SE_READER) {
 617                 pp->p_selock = old & ~SE_READER;
 618                 if (CV_HAS_WAITERS(&pp->p_cv))
 619                         cv_broadcast(&pp->p_cv);
 620         } else if ((old & ~SE_EWANTED) == SE_DELETED) {
 621                 panic("page_unlock: page %p is deleted", (void *)pp);
 622         } else if (old < 0) {
 623                 THREAD_KPRI_RELEASE();
 624                 pp->p_selock &= SE_EWANTED;
 625                 if (CV_HAS_WAITERS(&pp->p_cv))
 626                         cv_broadcast(&pp->p_cv);
 627         } else if ((old & ~SE_EWANTED) > SE_READER) {
 628                 pp->p_selock = old - SE_READER;
 629         } else {
 630                 panic("page_unlock: page %p is not locked", (void *)pp);
 631         }
 632
 633         if (pp->p_selock == 0) {
 634                 /*
 635                  * If the T_CAPTURING bit is set, that means that we should
 636                  * not try and capture the page again as we could recurse
 637                  * which could lead to a stack overflow panic or spending a
 638                  * relatively long time in the kernel making no progress.
 639                  */
 640                 if ((pp->p_toxic & PR_CAPTURE) &&
 641                     !(curthread->t_flag & T_CAPTURING) &&
 642                     !PP_RETIRED(pp)) {
 643                         THREAD_KPRI_REQUEST();
 644                         pp->p_selock = SE_WRITER;
 645                         mutex_exit(pse);
 646                         page_unlock_capture(pp);
 647                 } else {
 648                         mutex_exit(pse);
 649                 }
 650         } else {
 651                 mutex_exit(pse);
 652         }
 653 }
 654
 655 /*
 656  * Try to upgrade the lock on the page from a "shared" to an
 657  * "exclusive" lock.  Since this upgrade operation is done while
 658  * holding the mutex protecting this page, no one else can acquire this page's
 659  * lock and change the page. Thus, it is safe to drop the "shared"
 660  * lock and attempt to acquire the "exclusive" lock.
 661  *
 662  * Returns 1 on success, 0 on failure.
 663  */
 664 int
 665 page_tryupgrade(page_t *pp)
 666 {
 667         kmutex_t *pse = PAGE_SE_MUTEX(pp);
 668
 669         mutex_enter(pse);
 670         if (!(pp->p_selock & SE_EWANTED)) {
 671                 /* no threads want exclusive access, try upgrade */
 672                 if (pp->p_selock == SE_READER) {
 673                         THREAD_KPRI_REQUEST();
 674                         /* convert to exclusive lock */
 675                         pp->p_selock = SE_WRITER;
 676                         mutex_exit(pse);
 677                         return (1);
 678                 }
 679         }
 680         mutex_exit(pse);
 681         return (0);
 682 }
 683
 684 /*
 685  * Downgrade the "exclusive" lock on the page to a "shared" lock
 686  * while holding the mutex protecting this page's p_selock field.
 687  */
 688 void
 689 page_downgrade(page_t *pp)
 690 {
 691         kmutex_t *pse = PAGE_SE_MUTEX(pp);
 692         int excl_waiting;
 693
 694         ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
 695         ASSERT(PAGE_EXCL(pp));
 696
 697         mutex_enter(pse);
 698         excl_waiting =  pp->p_selock & SE_EWANTED;
 699         THREAD_KPRI_RELEASE();
 700         pp->p_selock = SE_READER | excl_waiting;
 701         if (CV_HAS_WAITERS(&pp->p_cv))
 702                 cv_broadcast(&pp->p_cv);
 703         mutex_exit(pse);
 704 }
 705
 706 void
 707 page_lock_delete(page_t *pp)
 708 {
 709         kmutex_t *pse = PAGE_SE_MUTEX(pp);
 710
 711         ASSERT(PAGE_EXCL(pp));
 712         ASSERT(pp->p_vnode == NULL);
 713         ASSERT(pp->p_offset == (uoff_t)-1);
 714         ASSERT(!PP_ISFREE(pp));
 715
 716         mutex_enter(pse);
 717         THREAD_KPRI_RELEASE();
 718         pp->p_selock = SE_DELETED;
 719         if (CV_HAS_WAITERS(&pp->p_cv))
 720                 cv_broadcast(&pp->p_cv);
 721         mutex_exit(pse);
 722 }
 723
 724 int
 725 page_deleted(page_t *pp)
 726 {
 727         return (pp->p_selock == SE_DELETED);
 728 }
 729
 730 /*
 731  * Implement the io lock for pages
 732  */
 733 void
 734 page_iolock_init(page_t *pp)
 735 {
 736         pp->p_iolock_state = 0;
 737         cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
 738 }
 739
 740 /*
 741  * Acquire the i/o lock on a page.
 742  */
 743 void
 744 page_io_lock(page_t *pp)
 745 {
 746         kmutex_t *pio;
 747
 748         pio = PAGE_IO_MUTEX(pp);
 749         mutex_enter(pio);
 750         while (pp->p_iolock_state & PAGE_IO_INUSE) {
 751                 cv_wait(&(pp->p_io_cv), pio);
 752         }
 753         pp->p_iolock_state |= PAGE_IO_INUSE;
 754         mutex_exit(pio);
 755 }
 756
 757 /*
 758  * Release the i/o lock on a page.
 759  */
 760 void
 761 page_io_unlock(page_t *pp)
 762 {
 763         kmutex_t *pio;
 764
 765         pio = PAGE_IO_MUTEX(pp);
 766         mutex_enter(pio);
 767         cv_broadcast(&pp->p_io_cv);
 768         pp->p_iolock_state &= ~PAGE_IO_INUSE;
 769         mutex_exit(pio);
 770 }
 771
 772 /*
 773  * Try to acquire the i/o lock on a page without blocking.
 774  * Returns 1 on success, 0 on failure.
 775  */
 776 int
 777 page_io_trylock(page_t *pp)
 778 {
 779         kmutex_t *pio;
 780
 781         if (pp->p_iolock_state & PAGE_IO_INUSE)
 782                 return (0);
 783
 784         pio = PAGE_IO_MUTEX(pp);
 785         mutex_enter(pio);
 786
 787         if (pp->p_iolock_state & PAGE_IO_INUSE) {
 788                 mutex_exit(pio);
 789                 return (0);
 790         }
 791         pp->p_iolock_state |= PAGE_IO_INUSE;
 792         mutex_exit(pio);
 793
 794         return (1);
 795 }
 796
 797 /*
 798  * Wait until the i/o lock is not held.
 799  */
 800 void
 801 page_io_wait(page_t *pp)
 802 {
 803         kmutex_t *pio;
 804
 805         pio = PAGE_IO_MUTEX(pp);
 806         mutex_enter(pio);
 807         while (pp->p_iolock_state & PAGE_IO_INUSE) {
 808                 cv_wait(&(pp->p_io_cv), pio);
 809         }
 810         mutex_exit(pio);
 811 }
 812
 813 /*
 814  * Returns 1 on success, 0 on failure.
 815  */
 816 int
 817 page_io_locked(page_t *pp)
 818 {
 819         return (pp->p_iolock_state & PAGE_IO_INUSE);
 820 }
 821
/*
 * Predicate form of "the i/o lock on this page is held".
 *
 * Despite the name, nothing is asserted here; the routine simply
 * forwards to page_io_locked() and hands back its result.
 * NOTE(review): presumably meant to be wrapped in ASSERT() by callers --
 * confirm against call sites.
 *
 * Returns non-zero if the i/o lock is held, 0 otherwise.
 */
int
page_iolock_assert(page_t *pp)
{
        return (page_io_locked(pp));
}
 831
/*
 * Return a pointer to the v_pagecache_lock mutex embedded in the
 * given vnode.  The mutex is not acquired.
 */
kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
        return (&vp->v_pagecache_lock);
}
 837
/*
 * Function wrapper around the PAGE_SE_MUTEX() macro: return the mutex
 * that macro selects for the given page.  The mutex is not acquired.
 */
kmutex_t *
page_se_mutex(page_t *pp)
{
        return (PAGE_SE_MUTEX(pp));
}
 843
 844 #ifdef VM_STATS
 845 uint_t pszclck_stat[4];
 846 #endif
 847 /*
 848  * Find, take and return a mutex held by hat_page_demote().
 849  * Called by page_demote_vp_pages() before hat_page_demote() call and by
 850  * routines that want to block hat_page_demote() but can't do it
 851  * via locking all constituent pages.
 852  *
 853  * Return NULL if p_szc is 0.
 854  *
 855  * It should only be used for pages that can be demoted by hat_page_demote()
 856  * i.e. non swapfs file system pages.  The logic here is lifted from
 857  * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 858  * since the page is locked and not free.
 859  *
 860  * Hash of the root page is used to find the lock.
 861  * To find the root in the presense of hat_page_demote() chageing the location
 862  * of the root this routine relies on the fact that hat_page_demote() changes
 863  * root last.
 864  *
 865  * If NULL is returned pp's p_szc is guaranteed to be 0. If non NULL is
 866  * returned pp's p_szc may be any value.
 867  */
 868 kmutex_t *
 869 page_szc_lock(page_t *pp)
 870 {
 871         kmutex_t        *mtx;
 872         page_t          *rootpp;
 873         uint_t          szc;
 874         uint_t          rszc;
 875         uint_t          pszc = pp->p_szc;
 876
 877         ASSERT(pp != NULL);
 878         ASSERT(PAGE_LOCKED(pp));
 879         ASSERT(!PP_ISFREE(pp));
 880         ASSERT(pp->p_vnode != NULL);
 881         ASSERT(!IS_SWAPFSVP(pp->p_vnode));
 882         ASSERT(!PP_ISKAS(pp));
 883
 884 again:
 885         if (pszc == 0) {
 886                 VM_STAT_ADD(pszclck_stat[0]);
 887                 return (NULL);
 888         }
 889
 890         /* The lock lives in the root page */
 891
 892         rootpp = PP_GROUPLEADER(pp, pszc);
 893         mtx = PAGE_SZC_MUTEX(rootpp);
 894         mutex_enter(mtx);
 895
 896         /*
 897          * since p_szc can only decrease if pp == rootpp
 898          * rootpp will be always the same i.e we have the right root
 899          * regardless of rootpp->p_szc.
 900          * If location of pp's root didn't change after we took
 901          * the lock we have the right root. return mutex hashed off it.
 902          */
 903         if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
 904                 VM_STAT_ADD(pszclck_stat[1]);
 905                 return (mtx);
 906         }
 907
 908         /*
 909          * root location changed because page got demoted.
 910          * locate the new root.
 911          */
 912         if (rszc < pszc) {
 913                 szc = pp->p_szc;
 914                 ASSERT(szc < pszc);
 915                 mutex_exit(mtx);
 916                 pszc = szc;
 917                 VM_STAT_ADD(pszclck_stat[2]);
 918                 goto again;
 919         }
 920
 921         VM_STAT_ADD(pszclck_stat[3]);
 922         /*
 923          * current hat_page_demote not done yet.
 924          * wait for it to finish.
 925          */
 926         mutex_exit(mtx);
 927         rootpp = PP_GROUPLEADER(rootpp, rszc);
 928         mtx = PAGE_SZC_MUTEX(rootpp);
 929         mutex_enter(mtx);
 930         mutex_exit(mtx);
 931         ASSERT(rootpp->p_szc < rszc);
 932         goto again;
 933 }
 934
 935 int
 936 page_szc_lock_assert(page_t *pp)
 937 {
 938         page_t *rootpp = PP_PAGEROOT(pp);
 939         kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);
 940
 941         return (MUTEX_HELD(mtx));
 942 }
 943
/*
 * memseg locking
 *
 * Reader/writer lock entered by memsegs_lock() and memsegs_trylock()
 * and dropped by memsegs_unlock().
 */
static krwlock_t memsegslock;

/*
 * memlist (phys_install, phys_avail) locking.
 *
 * Reader/writer lock entered by memlist_read_lock() and
 * memlist_write_lock() and dropped by memlist_read_unlock() and
 * memlist_write_unlock().
 */
static krwlock_t memlists_lock;
 953
 954 int
 955 memsegs_trylock(int writer)
 956 {
 957         return (rw_tryenter(&memsegslock, writer ? RW_WRITER : RW_READER));
 958 }
 959
 960 void
 961 memsegs_lock(int writer)
 962 {
 963         rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
 964 }
 965
/*
 * Drop memsegslock.
 *
 * writer: ignored; the same rw_exit() call is made regardless of how
 * the lock was entered.  The parameter mirrors memsegs_lock().
 */
/*ARGSUSED*/
void
memsegs_unlock(int writer)
{
        rw_exit(&memsegslock);
}
 972
/*
 * Report whether memsegslock is held, as determined by RW_LOCK_HELD().
 * NOTE(review): presumably true for either a reader or a writer hold --
 * confirm against the RW_LOCK_HELD() definition.
 */
int
memsegs_lock_held(void)
{
        return (RW_LOCK_HELD(&memsegslock));
}
 978
/*
 * Enter memlists_lock as a reader (RW_READER).
 */
void
memlist_read_lock(void)
{
        rw_enter(&memlists_lock, RW_READER);
}
 984
/*
 * Drop memlists_lock; counterpart of memlist_read_lock().
 */
void
memlist_read_unlock(void)
{
        rw_exit(&memlists_lock);
}
 990
/*
 * Enter memlists_lock as a writer (RW_WRITER).
 */
void
memlist_write_lock(void)
{
        rw_enter(&memlists_lock, RW_WRITER);
}
 996
/*
 * Drop memlists_lock; counterpart of memlist_write_lock().
 */
void
memlist_write_unlock(void)
{
        rw_exit(&memlists_lock);
}