kernel/vm/vm_page.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2015, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
  24  * Copyright (c) 2015, 2016 by Delphix. All rights reserved.
  25  */
  26
  27 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989  AT&T    */
  28 /*        All Rights Reserved   */
  29
  30 /*
  31  * University Copyright- Copyright (c) 1982, 1986, 1988
  32  * The Regents of the University of California
  33  * All Rights Reserved
  34  *
  35  * University Acknowledgment- Portions of this document are derived from
  36  * software developed by the University of California, Berkeley, and its
  37  * contributors.
  38  */
  39
  40 /*
  41  * VM - physical page management.
  42  */
  43
  44 #include <sys/types.h>
  45 #include <sys/t_lock.h>
  46 #include <sys/param.h>
  47 #include <sys/systm.h>
  48 #include <sys/errno.h>
  49 #include <sys/time.h>
  50 #include <sys/vnode.h>
  51 #include <sys/vm.h>
  52 #include <sys/vtrace.h>
  53 #include <sys/swap.h>
  54 #include <sys/cmn_err.h>
  55 #include <sys/tuneable.h>
  56 #include <sys/sysmacros.h>
  57 #include <sys/cpuvar.h>
  58 #include <sys/callb.h>
  59 #include <sys/debug.h>
  60 #include <sys/tnf_probe.h>
  61 #include <sys/condvar_impl.h>
  62 #include <sys/mem_config.h>
  63 #include <sys/kmem.h>
  64 #include <sys/atomic.h>
  65 #include <sys/strlog.h>
  66 #include <sys/mman.h>
  67 #include <sys/ontrap.h>
  68 #include <sys/lgrp.h>
  69 #include <sys/vfs.h>
  70
  71 #include <vm/hat.h>
  72 #include <vm/anon.h>
  73 #include <vm/page.h>
  74 #include <vm/seg.h>
  75 #include <vm/pvn.h>
  76 #include <vm/seg_kmem.h>
  77 #include <vm/vm_dep.h>
  78 #include <sys/vm_usage.h>
  79 #include <sys/fs_subr.h>
  80 #include <sys/ddi.h>
  81 #include <sys/modctl.h>
  82
  83 static pgcnt_t max_page_get;    /* max page_get request size in pages */
  84 pgcnt_t total_pages = 0;        /* total number of pages (used by /proc) */
  85
  86 /*
  87  * freemem_lock protects all freemem variables:
  88  * availrmem. Also this lock protects the globals which track the
  89  * availrmem changes for accurate kernel footprint calculation.
  90  * See below for an explanation of these
  91  * globals.
  92  */
  93 kmutex_t freemem_lock;
  94 pgcnt_t availrmem;
  95 pgcnt_t availrmem_initial;
  96
  97 /*
  98  * These globals track availrmem changes to get a more accurate
  99  * estimate of tke kernel size. Historically pp_kernel is used for
 100  * kernel size and is based on availrmem. But availrmem is adjusted for
 101  * locked pages in the system not just for kernel locked pages.
 102  * These new counters will track the pages locked through segvn and
 103  * by explicit user locking.
 104  *
 105  * pages_locked : How many pages are locked because of user specified
 106  * locking through mlock or plock.
 107  *
 108  * pages_useclaim,pages_claimed : These two variables track the
 109  * claim adjustments because of the protection changes on a segvn segment.
 110  *
 111  * All these globals are protected by the same lock which protects availrmem.
 112  */
 113 pgcnt_t pages_locked = 0;
 114 pgcnt_t pages_useclaim = 0;
 115 pgcnt_t pages_claimed = 0;
 116
 117
 118 /*
 119  * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
 120  */
 121 static kmutex_t new_freemem_lock;
 122 static uint_t   freemem_wait;   /* someone waiting for freemem */
 123 static kcondvar_t freemem_cv;
 124
 125 /*
 126  * The logical page free list is maintained as two lists, the 'free'
 127  * and the 'cache' lists.
 128  * The free list contains those pages that should be reused first.
 129  *
 130  * The implementation of the lists is machine dependent.
 131  * page_get_freelist(), page_get_cachelist(),
 132  * page_list_sub(), and page_list_add()
 133  * form the interface to the machine dependent implementation.
 134  *
 135  * Pages with p_free set are on the cache list.
 136  * Pages with p_free and p_age set are on the free list,
 137  *
 138  * A page may be locked while on either list.
 139  */
 140
 141 /*
 142  * free list accounting stuff.
 143  *
 144  *
 145  * Spread out the value for the number of pages on the
 146  * page free and page cache lists.  If there is just one
 147  * value, then it must be under just one lock.
 148  * The lock contention and cache traffic are a real bother.
 149  *
 150  * When we acquire and then drop a single pcf lock
 151  * we can start in the middle of the array of pcf structures.
 152  * If we acquire more than one pcf lock at a time, we need to
 153  * start at the front to avoid deadlocking.
 154  *
 155  * pcf_count holds the number of pages in each pool.
 156  *
 157  * pcf_block is set when page_create_get_something() has asked the
 158  * PSM page freelist and page cachelist routines without specifying
 159  * a color and nothing came back.  This is used to block anything
 160  * else from moving pages from one list to the other while the
 161  * lists are searched again.  If a page is freeed while pcf_block is
 162  * set, then pcf_reserve is incremented.  pcgs_unblock() takes care
 163  * of clearning pcf_block, doing the wakeups, etc.
 164  */
 165
 166 #define MAX_PCF_FANOUT NCPU
 167 static uint_t pcf_fanout = 1; /* Will get changed at boot time */
 168 static uint_t pcf_fanout_mask = 0;
 169
 170 struct pcf {
 171         kmutex_t        pcf_lock;       /* protects the structure */
 172         uint_t          pcf_count;      /* page count */
 173         uint_t          pcf_wait;       /* number of waiters */
 174         uint_t          pcf_block;      /* pcgs flag to page_free() */
 175         uint_t          pcf_reserve;    /* pages freed after pcf_block set */
 176         uint_t          pcf_fill[10];   /* to line up on the caches */
 177 };
 178
 179 /*
 180  * PCF_INDEX hash needs to be dynamic (every so often the hash changes where
 181  * it will hash the cpu to).  This is done to prevent a drain condition
 182  * from happening.  This drain condition will occur when pcf_count decrement
 183  * occurs on cpu A and the increment of pcf_count always occurs on cpu B.  An
 184  * example of this shows up with device interrupts.  The dma buffer is allocated
 185  * by the cpu requesting the IO thus the pcf_count is decremented based on that.
 186  * When the memory is returned by the interrupt thread, the pcf_count will be
 187  * incremented based on the cpu servicing the interrupt.
 188  */
 189 static struct pcf pcf[MAX_PCF_FANOUT];
 190 #define PCF_INDEX() ((int)(((long)CPU->cpu_seqid) + \
 191         (randtick() >> 24)) & (pcf_fanout_mask))
 192
 193 static int pcf_decrement_bucket(pgcnt_t);
 194 static int pcf_decrement_multiple(pgcnt_t *, pgcnt_t, int);
 195
 196 kmutex_t        pcgs_lock;              /* serializes page_create_get_ */
 197 kmutex_t        pcgs_throttle;          /* serializes NOSLEEP NORELOC allocs */
 198 kmutex_t        pcgs_wait_lock;         /* used for delay in pcgs */
 199 static kcondvar_t       pcgs_cv;        /* cv for delay in pcgs */
 200
 201 #ifdef VM_STATS
 202
 203 /*
 204  * No locks, but so what, they are only statistics.
 205  */
 206
 207 static struct page_tcnt {
 208         int     pc_free_cache;          /* free's into cache list */
 209         int     pc_free_dontneed;       /* free's with dontneed */
 210         int     pc_free_pageout;        /* free's from pageout */
 211         int     pc_free_free;           /* free's into free list */
 212         int     pc_free_pages;          /* free's into large page free list */
 213         int     pc_destroy_pages;       /* large page destroy's */
 214         int     pc_get_cache;           /* get's from cache list */
 215         int     pc_get_free;            /* get's from free list */
 216         int     pc_reclaim;             /* reclaim's */
 217         int     pc_abortfree;           /* abort's of free pages */
 218         int     pc_find_hit;            /* find's that find page */
 219         int     pc_find_miss;           /* find's that don't find page */
 220         int     pc_destroy_free;        /* # of free pages destroyed */
 221         int     pc_addclaim_pages;
 222         int     pc_subclaim_pages;
 223         int     pc_free_replacement_page[2];
 224         int     pc_try_demote_pages[6];
 225         int     pc_demote_pages[2];
 226 } pagecnt;
 227
 228 uint_t  hashin_count;
 229 uint_t  hashin_not_held;
 230 uint_t  hashin_already;
 231
 232 uint_t  hashout_count;
 233 uint_t  hashout_not_held;
 234
 235 uint_t  page_create_count;
 236 uint_t  page_create_not_enough;
 237 uint_t  page_create_not_enough_again;
 238 uint_t  page_create_zero;
 239 uint_t  page_create_hashout;
 240 uint_t  page_create_page_lock_failed;
 241 uint_t  page_create_trylock_failed;
 242 uint_t  page_create_found_one;
 243 uint_t  page_create_hashin_failed;
 244 uint_t  page_create_dropped_phm;
 245
 246 uint_t  page_create_new;
 247 uint_t  page_create_exists;
 248 uint_t  page_create_putbacks;
 249 uint_t  page_create_overshoot;
 250
 251 uint_t  page_reclaim_zero;
 252 uint_t  page_reclaim_zero_locked;
 253
 254 uint_t  page_rename_exists;
 255 uint_t  page_rename_count;
 256
 257 uint_t  page_lookup_cnt[20];
 258 uint_t  page_lookup_nowait_cnt[10];
 259 uint_t  page_find_cnt;
 260 uint_t  page_exists_cnt;
 261 uint_t  page_exists_forreal_cnt;
 262 uint_t  page_lookup_dev_cnt;
 263 uint_t  get_cachelist_cnt;
 264 uint_t  page_create_cnt[10];
 265 uint_t  alloc_pages[9];
 266 uint_t  page_exphcontg[19];
 267 uint_t  page_create_large_cnt[10];
 268
 269 #endif
 270
 271 static inline struct page *
 272 find_page(struct vmobject *obj, uoff_t off)
 273 {
 274         struct page key = {
 275                 .p_offset = off,
 276         };
 277         struct page *page;
 278
 279         page = avl_find(&obj->tree, &key, NULL);
 280
 281 #ifdef  VM_STATS
 282         if (page != NULL)
 283                 pagecnt.pc_find_hit++;
 284         else
 285                 pagecnt.pc_find_miss++;
 286 #endif
 287
 288         return (page);
 289 }
 290
 291
 292 #ifdef DEBUG
 293 #define MEMSEG_SEARCH_STATS
 294 #endif
 295
 296 #ifdef MEMSEG_SEARCH_STATS
 297 struct memseg_stats {
 298     uint_t nsearch;
 299     uint_t nlastwon;
 300     uint_t nhashwon;
 301     uint_t nnotfound;
 302 } memseg_stats;
 303
 304 #define MEMSEG_STAT_INCR(v) \
 305         atomic_inc_32(&memseg_stats.v)
 306 #else
 307 #define MEMSEG_STAT_INCR(x)
 308 #endif
 309
 310 struct memseg *memsegs;         /* list of memory segments */
 311
 312 /*
 313  * /etc/system tunable to control large page allocation hueristic.
 314  *
 315  * Setting to LPAP_LOCAL will heavily prefer the local lgroup over remote lgroup
 316  * for large page allocation requests.  If a large page is not readily
 317  * avaliable on the local freelists we will go through additional effort
 318  * to create a large page, potentially moving smaller pages around to coalesce
 319  * larger pages in the local lgroup.
 320  * Default value of LPAP_DEFAULT will go to remote freelists if large pages
 321  * are not readily available in the local lgroup.
 322  */
 323 enum lpap {
 324         LPAP_DEFAULT,   /* default large page allocation policy */
 325         LPAP_LOCAL      /* local large page allocation policy */
 326 };
 327
 328 enum lpap lpg_alloc_prefer = LPAP_DEFAULT;
 329
 330 static void page_init_mem_config(void);
 331 static int page_do_hashin(struct page *, struct vmobject *, uoff_t);
 332 static void page_do_hashout(page_t *);
 333 static void page_capture_init();
 334 int page_capture_take_action(page_t *, uint_t, void *);
 335
 336 static void page_demote_vp_pages(page_t *);
 337
 338
 339 void
 340 pcf_init(void)
 341 {
 342         if (boot_ncpus != -1) {
 343                 pcf_fanout = boot_ncpus;
 344         } else {
 345                 pcf_fanout = max_ncpus;
 346         }
 347 #ifdef sun4v
 348         /*
 349          * Force at least 4 buckets if possible for sun4v.
 350          */
 351         pcf_fanout = MAX(pcf_fanout, 4);
 352 #endif /* sun4v */
 353
 354         /*
 355          * Round up to the nearest power of 2.
 356          */
 357         pcf_fanout = MIN(pcf_fanout, MAX_PCF_FANOUT);
 358         if (!ISP2(pcf_fanout)) {
 359                 pcf_fanout = 1 << highbit(pcf_fanout);
 360
 361                 if (pcf_fanout > MAX_PCF_FANOUT) {
 362                         pcf_fanout = 1 << (highbit(MAX_PCF_FANOUT) - 1);
 363                 }
 364         }
 365         pcf_fanout_mask = pcf_fanout - 1;
 366 }
 367
 368 /*
 369  * vm subsystem related initialization
 370  */
 371 void
 372 vm_init(void)
 373 {
 374         boolean_t callb_vm_cpr(void *, int);
 375
 376         (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
 377         page_init_mem_config();
 378         page_retire_init();
 379         vm_usage_init();
 380         page_capture_init();
 381 }
 382
 383 /*
 384  * This function is called at startup and when memory is added or deleted.
 385  */
 386 void
 387 init_pages_pp_maximum()
 388 {
 389         static pgcnt_t p_min;
 390         static pgcnt_t pages_pp_maximum_startup;
 391         static pgcnt_t avrmem_delta;
 392         static int init_done;
 393         static int user_set;    /* true if set in /etc/system */
 394
 395         if (init_done == 0) {
 396
 397                 /* If the user specified a value, save it */
 398                 if (pages_pp_maximum != 0) {
 399                         user_set = 1;
 400                         pages_pp_maximum_startup = pages_pp_maximum;
 401                 }
 402
 403                 /*
 404                  * Setting of pages_pp_maximum is based first time
 405                  * on the value of availrmem just after the start-up
 406                  * allocations. To preserve this relationship at run
 407                  * time, use a delta from availrmem_initial.
 408                  */
 409                 ASSERT(availrmem_initial >= availrmem);
 410                 avrmem_delta = availrmem_initial - availrmem;
 411
 412                 /* The allowable floor of pages_pp_maximum */
 413                 p_min = tune.t_minarmem + 100;
 414
 415                 /* Make sure we don't come through here again. */
 416                 init_done = 1;
 417         }
 418         /*
 419          * Determine pages_pp_maximum, the number of currently available
 420          * pages (availrmem) that can't be `locked'. If not set by
 421          * the user, we set it to 4% of the currently available memory
 422          * plus 4MB.
 423          * But we also insist that it be greater than tune.t_minarmem;
 424          * otherwise a process could lock down a lot of memory, get swapped
 425          * out, and never have enough to get swapped back in.
 426          */
 427         if (user_set)
 428                 pages_pp_maximum = pages_pp_maximum_startup;
 429         else
 430                 pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25)
 431                     + btop(4 * 1024 * 1024);
 432
 433         if (pages_pp_maximum <= p_min) {
 434                 pages_pp_maximum = p_min;
 435         }
 436 }
 437
 438 void
 439 set_max_page_get(pgcnt_t target_total_pages)
 440 {
 441         max_page_get = target_total_pages / 2;
 442 }
 443
 444 static pgcnt_t pending_delete;
 445
 446 /*ARGSUSED*/
 447 static void
 448 page_mem_config_post_add(
 449         void *arg,
 450         pgcnt_t delta_pages)
 451 {
 452         set_max_page_get(total_pages - pending_delete);
 453         init_pages_pp_maximum();
 454 }
 455
 456 /*ARGSUSED*/
 457 static int
 458 page_mem_config_pre_del(
 459         void *arg,
 460         pgcnt_t delta_pages)
 461 {
 462         pgcnt_t nv;
 463
 464         nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages);
 465         set_max_page_get(total_pages - nv);
 466         return (0);
 467 }
 468
 469 /*ARGSUSED*/
 470 static void
 471 page_mem_config_post_del(
 472         void *arg,
 473         pgcnt_t delta_pages,
 474         int cancelled)
 475 {
 476         pgcnt_t nv;
 477
 478         nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages);
 479         set_max_page_get(total_pages - nv);
 480         if (!cancelled)
 481                 init_pages_pp_maximum();
 482 }
 483
 484 static kphysm_setup_vector_t page_mem_config_vec = {
 485         KPHYSM_SETUP_VECTOR_VERSION,
 486         page_mem_config_post_add,
 487         page_mem_config_pre_del,
 488         page_mem_config_post_del,
 489 };
 490
 491 static void
 492 page_init_mem_config(void)
 493 {
 494         int ret;
 495
 496         ret = kphysm_setup_func_register(&page_mem_config_vec, NULL);
 497         ASSERT(ret == 0);
 498 }
 499
 500 /*
 501  * Evenly spread out the PCF counters for large free pages
 502  */
 503 static void
 504 page_free_large_ctr(pgcnt_t npages)
 505 {
 506         static struct pcf       *p = pcf;
 507         pgcnt_t                 lump;
 508
 509         freemem += npages;
 510
 511         lump = roundup(npages, pcf_fanout) / pcf_fanout;
 512
 513         while (npages > 0) {
 514
 515                 ASSERT(!p->pcf_block);
 516
 517                 if (lump < npages) {
 518                         p->pcf_count += (uint_t)lump;
 519                         npages -= lump;
 520                 } else {
 521                         p->pcf_count += (uint_t)npages;
 522                         npages = 0;
 523                 }
 524
 525                 ASSERT(!p->pcf_wait);
 526
 527                 if (++p > &pcf[pcf_fanout - 1])
 528                         p = pcf;
 529         }
 530
 531         ASSERT(npages == 0);
 532 }
 533
 534 /*
 535  * Add a physical chunk of memory to the system free lists during startup.
 536  * Platform specific startup() allocates the memory for the page structs.
 537  *
 538  * num  - number of page structures
 539  * base - page number (pfn) to be associated with the first page.
 540  *
 541  * Since we are doing this during startup (ie. single threaded), we will
 542  * use shortcut routines to avoid any locking overhead while putting all
 543  * these pages on the freelists.
 544  *
 545  * NOTE: Any changes performed to page_free(), must also be performed to
 546  *       add_physmem() since this is how we initialize all page_t's at
 547  *       boot time.
 548  */
 549 void
 550 add_physmem(
 551         page_t  *pp,
 552         pgcnt_t num,
 553         pfn_t   pnum)
 554 {
 555         page_t  *root = NULL;
 556         uint_t  szc = page_num_pagesizes() - 1;
 557         pgcnt_t large = page_get_pagecnt(szc);
 558         pgcnt_t cnt = 0;
 559
 560         /*
 561          * Arbitrarily limit the max page_get request
 562          * to 1/2 of the page structs we have.
 563          */
 564         total_pages += num;
 565         set_max_page_get(total_pages);
 566
 567         PLCNT_MODIFY_MAX(pnum, (long)num);
 568
 569         /*
 570          * The physical space for the pages array
 571          * representing ram pages has already been
 572          * allocated.  Here we initialize each lock
 573          * in the page structure, and put each on
 574          * the free list
 575          */
 576         for (; num; pp++, pnum++, num--) {
 577
 578                 /*
 579                  * this needs to fill in the page number
 580                  * and do any other arch specific initialization
 581                  */
 582                 add_physmem_cb(pp, pnum);
 583
 584                 pp->p_lckcnt = 0;
 585                 pp->p_cowcnt = 0;
 586                 pp->p_slckcnt = 0;
 587
 588                 /*
 589                  * Initialize the page lock as unlocked, since nobody
 590                  * can see or access this page yet.
 591                  */
 592                 pp->p_selock = 0;
 593
 594                 /*
 595                  * Initialize IO lock
 596                  */
 597                 page_iolock_init(pp);
 598
 599                 /*
 600                  * initialize other fields in the page_t
 601                  */
 602                 PP_SETFREE(pp);
 603                 page_clr_all_props(pp);
 604                 PP_SETAGED(pp);
 605                 pp->p_offset = (uoff_t)-1;
 606                 pp->p_next = pp;
 607                 pp->p_prev = pp;
 608
 609                 /*
 610                  * Simple case: System doesn't support large pages.
 611                  */
 612                 if (szc == 0) {
 613                         pp->p_szc = 0;
 614                         page_free_at_startup(pp);
 615                         continue;
 616                 }
 617
 618                 /*
 619                  * Handle unaligned pages, we collect them up onto
 620                  * the root page until we have a full large page.
 621                  */
 622                 if (!IS_P2ALIGNED(pnum, large)) {
 623
 624                         /*
 625                          * If not in a large page,
 626                          * just free as small page.
 627                          */
 628                         if (root == NULL) {
 629                                 pp->p_szc = 0;
 630                                 page_free_at_startup(pp);
 631                                 continue;
 632                         }
 633
 634                         /*
 635                          * Link a constituent page into the large page.
 636                          */
 637                         pp->p_szc = szc;
 638                         page_list_concat(&root, &pp);
 639
 640                         /*
 641                          * When large page is fully formed, free it.
 642                          */
 643                         if (++cnt == large) {
 644                                 page_free_large_ctr(cnt);
 645                                 page_list_add_pages(root, PG_LIST_ISINIT);
 646                                 root = NULL;
 647                                 cnt = 0;
 648                         }
 649                         continue;
 650                 }
 651
 652                 /*
 653                  * At this point we have a page number which
 654                  * is aligned. We assert that we aren't already
 655                  * in a different large page.
 656                  */
 657                 ASSERT(IS_P2ALIGNED(pnum, large));
 658                 ASSERT(root == NULL && cnt == 0);
 659
 660                 /*
 661                  * If insufficient number of pages left to form
 662                  * a large page, just free the small page.
 663                  */
 664                 if (num < large) {
 665                         pp->p_szc = 0;
 666                         page_free_at_startup(pp);
 667                         continue;
 668                 }
 669
 670                 /*
 671                  * Otherwise start a new large page.
 672                  */
 673                 pp->p_szc = szc;
 674                 cnt++;
 675                 root = pp;
 676         }
 677         ASSERT(root == NULL && cnt == 0);
 678 }
 679
 680 /*
 681  * Find a page representing the specified [vp, offset].
 682  * If we find the page but it is intransit coming in,
 683  * it will have an "exclusive" lock and we wait for
 684  * the i/o to complete.  A page found on the free list
 685  * is always reclaimed and then locked.  On success, the page
 686  * is locked, its data is valid and it isn't on the free
 687  * list, while a NULL is returned if the page doesn't exist.
 688  */
 689 struct page *
 690 page_lookup(struct vmobject *obj, uoff_t off, se_t se)
 691 {
 692         return (page_lookup_create(obj, off, se, NULL, NULL, 0));
 693 }
 694
 695 /*
 696  * Find a page representing the specified [vp, offset].
 697  * We either return the one we found or, if passed in,
 698  * create one with identity of [vp, offset] of the
 699  * pre-allocated page. If we find existing page but it is
 700  * intransit coming in, it will have an "exclusive" lock
 701  * and we wait for the i/o to complete.  A page found on
 702  * the free list is always reclaimed and then locked.
 703  * On success, the page is locked, its data is valid and
 704  * it isn't on the free list, while a NULL is returned
 705  * if the page doesn't exist and newpp is NULL;
 706  */
 707 struct page *
 708 page_lookup_create(
 709         struct vmobject *obj,
 710         uoff_t off,
 711         se_t se,
 712         struct page *newpp,
 713         spgcnt_t *nrelocp,
 714         int flags)
 715 {
 716         page_t          *pp;
 717         kmutex_t        *phm;
 718         ulong_t         index;
 719         uint_t          es;
 720
 721         ASSERT(!VMOBJECT_LOCKED(obj));
 722         VM_STAT_ADD(page_lookup_cnt[0]);
 723         ASSERT(newpp ? PAGE_EXCL(newpp) : 1);
 724
 725         vmobject_lock(obj);
 726 top:
 727         pp = find_page(obj, off);
 728
 729         if (pp != NULL) {
 730                 VM_STAT_ADD(page_lookup_cnt[1]);
 731                 es = (newpp != NULL) ? 1 : 0;
 732                 es |= flags;
 733
 734                 VM_STAT_ADD(page_lookup_cnt[4]);
 735                 if (!page_lock_es(pp, se, obj, P_RECLAIM, es)) {
 736                         VM_STAT_ADD(page_lookup_cnt[5]);
 737                         goto top;
 738                 }
 739
 740                 VM_STAT_ADD(page_lookup_cnt[6]);
 741
 742                 vmobject_unlock(obj);
 743
 744                 if (newpp != NULL && pp->p_szc < newpp->p_szc &&
 745                     PAGE_EXCL(pp) && nrelocp != NULL) {
 746                         ASSERT(nrelocp != NULL);
 747                         (void) page_relocate(&pp, &newpp, 1, 1, nrelocp,
 748                             NULL);
 749                         if (*nrelocp > 0) {
 750                                 VM_STAT_COND_ADD(*nrelocp == 1,
 751                                     page_lookup_cnt[11]);
 752                                 VM_STAT_COND_ADD(*nrelocp > 1,
 753                                     page_lookup_cnt[12]);
 754                                 pp = newpp;
 755                                 se = SE_EXCL;
 756                         } else {
 757                                 if (se == SE_SHARED) {
 758                                         page_downgrade(pp);
 759                                 }
 760                                 VM_STAT_ADD(page_lookup_cnt[13]);
 761                         }
 762                 } else if (newpp != NULL && nrelocp != NULL) {
 763                         if (PAGE_EXCL(pp) && se == SE_SHARED) {
 764                                 page_downgrade(pp);
 765                         }
 766                         VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc,
 767                             page_lookup_cnt[14]);
 768                         VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc,
 769                             page_lookup_cnt[15]);
 770                         VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc,
 771                             page_lookup_cnt[16]);
 772                 } else if (newpp != NULL && PAGE_EXCL(pp)) {
 773                         se = SE_EXCL;
 774                 }
 775         } else if (newpp != NULL) {
 776                 /*
 777                  * If we have a preallocated page then
 778                  * insert it now and basically behave like
 779                  * page_create.
 780                  */
 781                 VM_STAT_ADD(page_lookup_cnt[18]);
 782                 /*
 783                  * Since we hold the page hash mutex and
 784                  * just searched for this page, page_hashin
 785                  * had better not fail.  If it does, that
 786                  * means some thread did not follow the
 787                  * page hash mutex rules.  Panic now and
 788                  * get it over with.  As usual, go down
 789                  * holding all the locks.
 790                  */
 791                 if (!page_hashin(newpp, obj, off, true)) {
 792                         ASSERT(VMOBJECT_LOCKED(obj));
 793                         panic("page_lookup_create: hashin failed %p %p %llx",
 794                             newpp, obj, off);
 795                         /*NOTREACHED*/
 796                 }
 797                 ASSERT(VMOBJECT_LOCKED(obj));
 798                 vmobject_unlock(obj);
 799                 page_set_props(newpp, P_REF);
 800                 page_io_lock(newpp);
 801                 pp = newpp;
 802                 se = SE_EXCL;
 803         } else {
 804                 VM_STAT_ADD(page_lookup_cnt[19]);
 805                 vmobject_unlock(obj);
 806         }
 807
 808         ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
 809
 810         ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1);
 811
 812         return (pp);
 813 }
 814
 815 /*
 816  * Search the hash list for the page representing the
 817  * specified [vp, offset] and return it locked.  Skip
 818  * free pages and pages that cannot be locked as requested.
 819  * Used while attempting to kluster pages.
 820  */
 821 struct page *
 822 page_lookup_nowait(struct vmobject *obj, uoff_t off, se_t se)
 823 {
 824         page_t          *pp;
 825
 826         ASSERT(!VMOBJECT_LOCKED(obj));
 827         VM_STAT_ADD(page_lookup_nowait_cnt[0]);
 828
 829         vmobject_lock(obj);
 830         pp = find_page(obj, off);
 831
 832         if (pp == NULL || PP_ISFREE(pp)) {
 833                 VM_STAT_ADD(page_lookup_nowait_cnt[2]);
 834                 pp = NULL;
 835         } else {
 836                 if (!page_trylock(pp, se)) {
 837                         VM_STAT_ADD(page_lookup_nowait_cnt[3]);
 838                         pp = NULL;
 839                 } else {
 840                         VM_STAT_ADD(page_lookup_nowait_cnt[4]);
 841                         if (PP_ISFREE(pp)) {
 842                                 VM_STAT_ADD(page_lookup_nowait_cnt[6]);
 843                                 page_unlock(pp);
 844                                 pp = NULL;
 845                         }
 846                 }
 847         }
 848
 849         vmobject_unlock(obj);
 850
 851         ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
 852
 853         return (pp);
 854 }
 855
 856 /*
 857  * Search the hash list for a page with the specified [vp, off]
 858  * that is known to exist and is already locked.  This routine
 859  * is typically used by segment SOFTUNLOCK routines.
 860  */
 861 struct page *
 862 page_find(struct vmobject *obj, uoff_t off)
 863 {
 864         struct page *page;
 865
 866         ASSERT(!VMOBJECT_LOCKED(obj));
 867         VM_STAT_ADD(page_find_cnt);
 868
 869         vmobject_lock(obj);
 870         page = find_page(obj, off);
 871         vmobject_unlock(obj);
 872
 873         ASSERT(page == NULL || PAGE_LOCKED(page) || panicstr);
 874         return (page);
 875 }
 876
 877 /*
 878  * Determine whether a page with the specified [vp, off]
 879  * currently exists in the system.  Obviously this should
 880  * only be considered as a hint since nothing prevents the
 881  * page from disappearing or appearing immediately after
 882  * the return from this routine.
 883  *
 884  * Note: This is virtually identical to page_find.  Can we combine them?
 885  */
 886 struct page *
 887 page_exists(struct vmobject *obj, uoff_t off)
 888 {
 889         struct page *page;
 890
 891         ASSERT(!VMOBJECT_LOCKED(obj));
 892         VM_STAT_ADD(page_exists_cnt);
 893
 894         vmobject_lock(obj);
 895         page = find_page(obj, off);
 896         vmobject_unlock(obj);
 897
 898         return (page);
 899 }
 900
 901 /*
 902  * Determine if physically contiguous pages exist for [vp, off] - [vp, off +
 903  * page_size(szc)) range.  if they exist and ppa is not NULL fill ppa array
 904  * with these pages locked SHARED. If necessary reclaim pages from
 905  * freelist. Return 1 if contiguous pages exist and 0 otherwise.
 906  *
 907  * If we fail to lock pages still return 1 if pages exist and contiguous.
 908  * But in this case return value is just a hint. ppa array won't be filled.
 909  * Caller should initialize ppa[0] as NULL to distinguish return value.
 910  *
 911  * Returns 0 if pages don't exist or not physically contiguous.
 912  *
 913  * This routine doesn't work for anonymous(swapfs) pages.
 914  */
 915 int
 916 page_exists_physcontig(struct vmobject *obj, uoff_t off, uint_t szc,
 917     struct page **ppa)
 918 {
 919         pgcnt_t pages;
 920         pfn_t pfn;
 921         page_t *rootpp;
 922         pgcnt_t i;
 923         pgcnt_t j;
 924         uoff_t save_off = off;
 925         page_t *pp;
 926         uint_t pszc;
 927         int loopcnt = 0;
 928
 929         ASSERT(szc != 0);
 930         ASSERT(obj != NULL);
 931         ASSERT(!IS_SWAPFSVP(obj->vnode));
 932         ASSERT(!VN_ISKAS(obj->vnode));
 933
 934 again:
 935         if (++loopcnt > 3) {
 936                 VM_STAT_ADD(page_exphcontg[0]);
 937                 return (0);
 938         }
 939
 940         vmobject_lock(obj);
 941         pp = find_page(obj, off);
 942         vmobject_unlock(obj);
 943
 944         VM_STAT_ADD(page_exphcontg[1]);
 945
 946         if (pp == NULL) {
 947                 VM_STAT_ADD(page_exphcontg[2]);
 948                 return (0);
 949         }
 950
 951         pages = page_get_pagecnt(szc);
 952         rootpp = pp;
 953         pfn = rootpp->p_pagenum;
 954
 955         if ((pszc = pp->p_szc) >= szc && ppa != NULL) {
 956                 VM_STAT_ADD(page_exphcontg[3]);
 957                 if (!page_trylock(pp, SE_SHARED)) {
 958                         VM_STAT_ADD(page_exphcontg[4]);
 959                         return (1);
 960                 }
 961                 /*
 962                  * Also check whether p_pagenum was modified by DR.
 963                  */
 964                 if (pp->p_szc != pszc || pp->p_vnode != obj->vnode ||
 965                     pp->p_offset != off || pp->p_pagenum != pfn) {
 966                         VM_STAT_ADD(page_exphcontg[5]);
 967                         page_unlock(pp);
 968                         off = save_off;
 969                         goto again;
 970                 }
 971                 /*
 972                  * szc was non zero and vnode and offset matched after we
 973                  * locked the page it means it can't become free on us.
 974                  */
 975                 ASSERT(!PP_ISFREE(pp));
 976                 if (!IS_P2ALIGNED(pfn, pages)) {
 977                         page_unlock(pp);
 978                         return (0);
 979                 }
 980                 ppa[0] = pp;
 981                 pp++;
 982                 off += PAGESIZE;
 983                 pfn++;
 984                 for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
 985                         if (!page_trylock(pp, SE_SHARED)) {
 986                                 VM_STAT_ADD(page_exphcontg[6]);
 987                                 pp--;
 988                                 while (i-- > 0) {
 989                                         page_unlock(pp);
 990                                         pp--;
 991                                 }
 992                                 ppa[0] = NULL;
 993                                 return (1);
 994                         }
 995                         if (pp->p_szc != pszc) {
 996                                 VM_STAT_ADD(page_exphcontg[7]);
 997                                 page_unlock(pp);
 998                                 pp--;
 999                                 while (i-- > 0) {
1000                                         page_unlock(pp);
1001                                         pp--;
1002                                 }
1003                                 ppa[0] = NULL;
1004                                 off = save_off;
1005                                 goto again;
1006                         }
1007                         /*
1008                          * szc the same as for previous already locked pages
1009                          * with right identity. Since this page had correct
1010                          * szc after we locked it can't get freed or destroyed
1011                          * and therefore must have the expected identity.
1012                          */
1013                         ASSERT(!PP_ISFREE(pp));
1014                         if (pp->p_vnode != obj->vnode ||
1015                             pp->p_offset != off) {
1016                                 panic("page_exists_physcontig: "
1017                                     "large page identity doesn't match");
1018                         }
1019                         ppa[i] = pp;
1020                         ASSERT(pp->p_pagenum == pfn);
1021                 }
1022                 VM_STAT_ADD(page_exphcontg[8]);
1023                 ppa[pages] = NULL;
1024                 return (1);
1025         } else if (pszc >= szc) {
1026                 VM_STAT_ADD(page_exphcontg[9]);
1027                 if (!IS_P2ALIGNED(pfn, pages)) {
1028                         return (0);
1029                 }
1030                 return (1);
1031         }
1032
1033         if (!IS_P2ALIGNED(pfn, pages)) {
1034                 VM_STAT_ADD(page_exphcontg[10]);
1035                 return (0);
1036         }
1037
1038         if (page_numtomemseg_nolock(pfn) !=
1039             page_numtomemseg_nolock(pfn + pages - 1)) {
1040                 VM_STAT_ADD(page_exphcontg[11]);
1041                 return (0);
1042         }
1043
1044         /*
1045          * We loop up 4 times across pages to promote page size.
1046          * We're extra cautious to promote page size atomically with respect
1047          * to everybody else.  But we can probably optimize into 1 loop if
1048          * this becomes an issue.
1049          */
1050
1051         for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1052                 if (!page_trylock(pp, SE_EXCL)) {
1053                         VM_STAT_ADD(page_exphcontg[12]);
1054                         break;
1055                 }
1056                 /*
1057                  * Check whether p_pagenum was modified by DR.
1058                  */
1059                 if (pp->p_pagenum != pfn) {
1060                         page_unlock(pp);
1061                         break;
1062                 }
1063                 if (pp->p_vnode != obj->vnode ||
1064                     pp->p_offset != off) {
1065                         VM_STAT_ADD(page_exphcontg[13]);
1066                         page_unlock(pp);
1067                         break;
1068                 }
1069                 if (pp->p_szc >= szc) {
1070                         ASSERT(i == 0);
1071                         page_unlock(pp);
1072                         off = save_off;
1073                         goto again;
1074                 }
1075         }
1076
1077         if (i != pages) {
1078                 VM_STAT_ADD(page_exphcontg[14]);
1079                 --pp;
1080                 while (i-- > 0) {
1081                         page_unlock(pp);
1082                         --pp;
1083                 }
1084                 return (0);
1085         }
1086
1087         pp = rootpp;
1088         for (i = 0; i < pages; i++, pp++) {
1089                 if (PP_ISFREE(pp)) {
1090                         VM_STAT_ADD(page_exphcontg[15]);
1091                         ASSERT(!PP_ISAGED(pp));
1092                         ASSERT(pp->p_szc == 0);
1093                         if (!page_reclaim(pp, NULL)) {
1094                                 break;
1095                         }
1096                 } else {
1097                         ASSERT(pp->p_szc < szc);
1098                         VM_STAT_ADD(page_exphcontg[16]);
1099                         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1100                 }
1101         }
1102         if (i < pages) {
1103                 VM_STAT_ADD(page_exphcontg[17]);
1104                 /*
1105                  * page_reclaim failed because we were out of memory.
1106                  * drop the rest of the locks and return because this page
1107                  * must be already reallocated anyway.
1108                  */
1109                 pp = rootpp;
1110                 for (j = 0; j < pages; j++, pp++) {
1111                         if (j != i) {
1112                                 page_unlock(pp);
1113                         }
1114                 }
1115                 return (0);
1116         }
1117
1118         off = save_off;
1119         pp = rootpp;
1120         for (i = 0; i < pages; i++, pp++, off += PAGESIZE) {
1121                 ASSERT(PAGE_EXCL(pp));
1122                 ASSERT(!PP_ISFREE(pp));
1123                 ASSERT(!hat_page_is_mapped(pp));
1124                 VERIFY(pp->p_object == obj);
1125                 ASSERT(pp->p_vnode == obj->vnode);
1126                 ASSERT(pp->p_offset == off);
1127                 pp->p_szc = szc;
1128         }
1129         pp = rootpp;
1130         for (i = 0; i < pages; i++, pp++) {
1131                 if (ppa == NULL) {
1132                         page_unlock(pp);
1133                 } else {
1134                         ppa[i] = pp;
1135                         page_downgrade(ppa[i]);
1136                 }
1137         }
1138         if (ppa != NULL) {
1139                 ppa[pages] = NULL;
1140         }
1141         VM_STAT_ADD(page_exphcontg[18]);
1142         ASSERT(vn_has_cached_data(obj->vnode));
1143         return (1);
1144 }
1145
1146 /*
1147  * Determine whether a page with the specified [vp, off]
1148  * currently exists in the system and if so return its
1149  * size code. Obviously this should only be considered as
1150  * a hint since nothing prevents the page from disappearing
1151  * or appearing immediately after the return from this routine.
1152  */
1153 int
1154 page_exists_forreal(struct vmobject *obj, uoff_t off, uint_t *szc)
1155 {
1156         page_t          *pp;
1157         int             rc = 0;
1158
1159         ASSERT(!VMOBJECT_LOCKED(obj));
1160         ASSERT(szc != NULL);
1161         VM_STAT_ADD(page_exists_forreal_cnt);
1162
1163         vmobject_lock(obj);
1164         pp = find_page(obj, off);
1165         if (pp != NULL) {
1166                 *szc = pp->p_szc;
1167                 rc = 1;
1168         }
1169         vmobject_unlock(obj);
1170         return (rc);
1171 }
1172
1173 /* wakeup threads waiting for pages in page_create_get_something() */
1174 void
1175 wakeup_pcgs(void)
1176 {
1177         if (!CV_HAS_WAITERS(&pcgs_cv))
1178                 return;
1179         cv_broadcast(&pcgs_cv);
1180 }
1181
1182 /*
1183  * 'freemem' is used all over the kernel as an indication of how many
1184  * pages are free (either on the cache list or on the free page list)
1185  * in the system.  In very few places is a really accurate 'freemem'
1186  * needed.  To avoid contention of the lock protecting a the
1187  * single freemem, it was spread out into NCPU buckets.  Set_freemem
1188  * sets freemem to the total of all NCPU buckets.  It is called from
1189  * clock() on each TICK.
1190  */
1191 void
1192 set_freemem()
1193 {
1194         struct pcf      *p;
1195         ulong_t         t;
1196         uint_t          i;
1197
1198         t = 0;
1199         p = pcf;
1200         for (i = 0;  i < pcf_fanout; i++) {
1201                 t += p->pcf_count;
1202                 p++;
1203         }
1204         freemem = t;
1205
1206         /*
1207          * Don't worry about grabbing mutex.  It's not that
1208          * critical if we miss a tick or two.  This is
1209          * where we wakeup possible delayers in
1210          * page_create_get_something().
1211          */
1212         wakeup_pcgs();
1213 }
1214
1215 ulong_t
1216 get_freemem()
1217 {
1218         struct pcf      *p;
1219         ulong_t         t;
1220         uint_t          i;
1221
1222         t = 0;
1223         p = pcf;
1224         for (i = 0; i < pcf_fanout; i++) {
1225                 t += p->pcf_count;
1226                 p++;
1227         }
1228         /*
1229          * We just calculated it, might as well set it.
1230          */
1231         freemem = t;
1232         return (t);
1233 }
1234
1235 /*
1236  * Acquire all of the page cache & free (pcf) locks.
1237  */
1238 void
1239 pcf_acquire_all()
1240 {
1241         struct pcf      *p;
1242         uint_t          i;
1243
1244         p = pcf;
1245         for (i = 0; i < pcf_fanout; i++) {
1246                 mutex_enter(&p->pcf_lock);
1247                 p++;
1248         }
1249 }
1250
1251 /*
1252  * Release all the pcf_locks.
1253  */
1254 void
1255 pcf_release_all()
1256 {
1257         struct pcf      *p;
1258         uint_t          i;
1259
1260         p = pcf;
1261         for (i = 0; i < pcf_fanout; i++) {
1262                 mutex_exit(&p->pcf_lock);
1263                 p++;
1264         }
1265 }
1266
1267 /*
1268  * Inform the VM system that we need some pages freed up.
1269  * Calls must be symmetric, e.g.:
1270  *
1271  *      page_needfree(100);
1272  *      wait a bit;
1273  *      page_needfree(-100);
1274  */
1275 void
1276 page_needfree(spgcnt_t npages)
1277 {
1278         mutex_enter(&new_freemem_lock);
1279         needfree += npages;
1280         mutex_exit(&new_freemem_lock);
1281 }
1282
1283 /*
1284  * Throttle for page_create(): try to prevent freemem from dropping
1285  * below throttlefree.  We can't provide a 100% guarantee because
1286  * KM_NOSLEEP allocations, page_reclaim(), and various other things
1287  * nibble away at the freelist.  However, we can block all PG_WAIT
1288  * allocations until memory becomes available.  The motivation is
1289  * that several things can fall apart when there's no free memory:
1290  *
1291  * (1) If pageout() needs memory to push a page, the system deadlocks.
1292  *
1293  * (2) By (broken) specification, timeout(9F) can neither fail nor
1294  *     block, so it has no choice but to panic the system if it
1295  *     cannot allocate a callout structure.
1296  *
1297  * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block;
1298  *     it panics if it cannot allocate a callback structure.
1299  *
1300  * (4) Untold numbers of third-party drivers have not yet been hardened
1301  *     against KM_NOSLEEP and/or allocb() failures; they simply assume
1302  *     success and panic the system with a data fault on failure.
1303  *     (The long-term solution to this particular problem is to ship
1304  *     hostile fault-injecting DEBUG kernels with the DDK.)
1305  *
1306  * It is theoretically impossible to guarantee success of non-blocking
1307  * allocations, but in practice, this throttle is very hard to break.
1308  */
1309 static int
1310 page_create_throttle(pgcnt_t npages, int flags)
1311 {
1312         ulong_t fm;
1313         uint_t  i;
1314         pgcnt_t tf;     /* effective value of throttlefree */
1315
1316         /*
1317          * Normal priority allocations.
1318          */
1319         if ((flags & (PG_WAIT | PG_NORMALPRI)) == PG_NORMALPRI) {
1320                 ASSERT(!(flags & (PG_PANIC | PG_PUSHPAGE)));
1321                 return (freemem >= npages + throttlefree);
1322         }
1323
1324         /*
1325          * Never deny pages when:
1326          * - it's a thread that cannot block [NOMEMWAIT()]
1327          * - the allocation cannot block and must not fail
1328          * - the allocation cannot block and is pageout dispensated
1329          */
1330         if (NOMEMWAIT() ||
1331             ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) ||
1332             ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE))
1333                 return (1);
1334
1335         /*
1336          * If the allocation can't block, we look favorably upon it
1337          * unless we're below pageout_reserve.  In that case we fail
1338          * the allocation because we want to make sure there are a few
1339          * pages available for pageout.
1340          */
1341         if ((flags & PG_WAIT) == 0)
1342                 return (freemem >= npages + pageout_reserve);
1343
1344         /* Calculate the effective throttlefree value */
1345         tf = throttlefree -
1346             ((flags & PG_PUSHPAGE) ? pageout_reserve : 0);
1347
1348         cv_signal(&proc_pageout->p_cv);
1349
1350         for (;;) {
1351                 fm = 0;
1352                 pcf_acquire_all();
1353                 mutex_enter(&new_freemem_lock);
1354                 for (i = 0; i < pcf_fanout; i++) {
1355                         fm += pcf[i].pcf_count;
1356                         pcf[i].pcf_wait++;
1357                         mutex_exit(&pcf[i].pcf_lock);
1358                 }
1359                 freemem = fm;
1360                 if (freemem >= npages + tf) {
1361                         mutex_exit(&new_freemem_lock);
1362                         break;
1363                 }
1364                 needfree += npages;
1365                 freemem_wait++;
1366                 cv_wait(&freemem_cv, &new_freemem_lock);
1367                 freemem_wait--;
1368                 needfree -= npages;
1369                 mutex_exit(&new_freemem_lock);
1370         }
1371         return (1);
1372 }
1373
1374 /*
1375  * page_create_wait() is called to either coalesce pages from the
1376  * different pcf buckets or to wait because there simply are not
1377  * enough pages to satisfy the caller's request.
1378  *
1379  * Sadly, this is called from platform/vm/vm_machdep.c
1380  */
1381 int
1382 page_create_wait(pgcnt_t npages, uint_t flags)
1383 {
1384         pgcnt_t         total;
1385         uint_t          i;
1386         struct pcf      *p;
1387
1388         /*
1389          * Wait until there are enough free pages to satisfy our
1390          * entire request.
1391          * We set needfree += npages before prodding pageout, to make sure
1392          * it does real work when npages > lotsfree > freemem.
1393          */
1394         VM_STAT_ADD(page_create_not_enough);
1395
1396         ASSERT(!(flags & PG_NORELOC));
1397 checkagain:
1398         if (freemem < npages + throttlefree)
1399                 if (!page_create_throttle(npages, flags))
1400                         return (0);
1401
1402         if (pcf_decrement_bucket(npages) ||
1403             pcf_decrement_multiple(&total, npages, 0))
1404                 return (1);
1405
1406         /*
1407          * All of the pcf locks are held, there are not enough pages
1408          * to satisfy the request (npages < total).
1409          * Be sure to acquire the new_freemem_lock before dropping
1410          * the pcf locks.  This prevents dropping wakeups in page_free().
1411          * The order is always pcf_lock then new_freemem_lock.
1412          *
1413          * Since we hold all the pcf locks, it is a good time to set freemem.
1414          *
1415          * If the caller does not want to wait, return now.
1416          * Else turn the pageout daemon loose to find something
1417          * and wait till it does.
1418          *
1419          */
1420         freemem = total;
1421
1422         if ((flags & PG_WAIT) == 0) {
1423                 pcf_release_all();
1424
1425                 return (0);
1426         }
1427
1428         ASSERT(proc_pageout != NULL);
1429         cv_signal(&proc_pageout->p_cv);
1430
1431         /*
1432          * We are going to wait.
1433          * We currently hold all of the pcf_locks,
1434          * get the new_freemem_lock (it protects freemem_wait),
1435          * before dropping the pcf_locks.
1436          */
1437         mutex_enter(&new_freemem_lock);
1438
1439         p = pcf;
1440         for (i = 0; i < pcf_fanout; i++) {
1441                 p->pcf_wait++;
1442                 mutex_exit(&p->pcf_lock);
1443                 p++;
1444         }
1445
1446         needfree += npages;
1447         freemem_wait++;
1448
1449         cv_wait(&freemem_cv, &new_freemem_lock);
1450
1451         freemem_wait--;
1452         needfree -= npages;
1453
1454         mutex_exit(&new_freemem_lock);
1455
1456         VM_STAT_ADD(page_create_not_enough_again);
1457         goto checkagain;
1458 }
1459 /*
1460  * A routine to do the opposite of page_create_wait().
1461  */
1462 void
1463 page_create_putback(spgcnt_t npages)
1464 {
1465         struct pcf      *p;
1466         pgcnt_t         lump;
1467         uint_t          *which;
1468
1469         /*
1470          * When a contiguous lump is broken up, we have to
1471          * deal with lots of pages (min 64) so lets spread
1472          * the wealth around.
1473          */
1474         lump = roundup(npages, pcf_fanout) / pcf_fanout;
1475         freemem += npages;
1476
1477         for (p = pcf; (npages > 0) && (p < &pcf[pcf_fanout]); p++) {
1478                 which = &p->pcf_count;
1479
1480                 mutex_enter(&p->pcf_lock);
1481
1482                 if (p->pcf_block) {
1483                         which = &p->pcf_reserve;
1484                 }
1485
1486                 if (lump < npages) {
1487                         *which += (uint_t)lump;
1488                         npages -= lump;
1489                 } else {
1490                         *which += (uint_t)npages;
1491                         npages = 0;
1492                 }
1493
1494                 if (p->pcf_wait) {
1495                         mutex_enter(&new_freemem_lock);
1496                         /*
1497                          * Check to see if some other thread
1498                          * is actually waiting.  Another bucket
1499                          * may have woken it up by now.  If there
1500                          * are no waiters, then set our pcf_wait
1501                          * count to zero to avoid coming in here
1502                          * next time.
1503                          */
1504                         if (freemem_wait) {
1505                                 if (npages > 1) {
1506                                         cv_broadcast(&freemem_cv);
1507                                 } else {
1508                                         cv_signal(&freemem_cv);
1509                                 }
1510                                 p->pcf_wait--;
1511                         } else {
1512                                 p->pcf_wait = 0;
1513                         }
1514                         mutex_exit(&new_freemem_lock);
1515                 }
1516                 mutex_exit(&p->pcf_lock);
1517         }
1518         ASSERT(npages == 0);
1519 }
1520
1521 /*
1522  * A helper routine for page_create_get_something.
1523  * The indenting got to deep down there.
1524  * Unblock the pcf counters.  Any pages freed after
1525  * pcf_block got set are moved to pcf_count and
1526  * wakeups (cv_broadcast() or cv_signal()) are done as needed.
1527  */
1528 static void
1529 pcgs_unblock(void)
1530 {
1531         int             i;
1532         struct pcf      *p;
1533
1534         /* Update freemem while we're here. */
1535         freemem = 0;
1536         p = pcf;
1537         for (i = 0; i < pcf_fanout; i++) {
1538                 mutex_enter(&p->pcf_lock);
1539                 ASSERT(p->pcf_count == 0);
1540                 p->pcf_count = p->pcf_reserve;
1541                 p->pcf_block = 0;
1542                 freemem += p->pcf_count;
1543                 if (p->pcf_wait) {
1544                         mutex_enter(&new_freemem_lock);
1545                         if (freemem_wait) {
1546                                 if (p->pcf_reserve > 1) {
1547                                         cv_broadcast(&freemem_cv);
1548                                         p->pcf_wait = 0;
1549                                 } else {
1550                                         cv_signal(&freemem_cv);
1551                                         p->pcf_wait--;
1552                                 }
1553                         } else {
1554                                 p->pcf_wait = 0;
1555                         }
1556                         mutex_exit(&new_freemem_lock);
1557                 }
1558                 p->pcf_reserve = 0;
1559                 mutex_exit(&p->pcf_lock);
1560                 p++;
1561         }
1562 }
1563
1564 /*
1565  * Called from page_create_va() when both the cache and free lists
1566  * have been checked once.
1567  *
1568  * Either returns a page or panics since the accounting was done
1569  * way before we got here.
1570  *
1571  * We don't come here often, so leave the accounting on permanently.
1572  */
1573
1574 #define MAX_PCGS        100
1575
1576 #ifdef  DEBUG
1577 #define PCGS_TRIES      100
1578 #else   /* DEBUG */
1579 #define PCGS_TRIES      10
1580 #endif  /* DEBUG */
1581
1582 #ifdef  VM_STATS
1583 uint_t  pcgs_counts[PCGS_TRIES];
1584 uint_t  pcgs_too_many;
1585 uint_t  pcgs_entered;
1586 uint_t  pcgs_entered_noreloc;
1587 uint_t  pcgs_locked;
1588 uint_t  pcgs_throttled;
1589 #endif  /* VM_STATS */
1590
1591 static bool
1592 page_create_get_something_throttle(void)
1593 {
1594         /*
1595          * We can't throttle the panic thread.
1596          */
1597         if (panicstr)
1598                 return (false);
1599
1600         /*
1601          * Don't throttle threads which are critical for proper
1602          * vm management if freemem is very low.
1603          */
1604         if (NOMEMWAIT() && (freemem < minfree))
1605                 return (false);
1606
1607         return (true);
1608 }
1609
1610 static struct page *
1611 page_create_get_something(struct vmobject *obj, uoff_t off, struct seg *seg,
1612     caddr_t vaddr, uint_t flags)
1613 {
1614         uint_t          count;
1615         page_t          *pp;
1616         uint_t          locked, i;
1617         struct  pcf     *p;
1618         lgrp_t          *lgrp;
1619         int             throttled = 0;
1620
1621         VM_STAT_ADD(pcgs_entered);
1622
1623         /*
1624          * Tap any reserve freelists: if we fail now, we'll die
1625          * since the page(s) we're looking for have already been
1626          * accounted for.
1627          */
1628         flags |= PG_PANIC;
1629
1630         if ((flags & PG_NORELOC) != 0) {
1631                 VM_STAT_ADD(pcgs_entered_noreloc);
1632                 /*
1633                  * Requests for free pages from critical threads such as
1634                  * pageout still won't throttle here.  Since we already
1635                  * accounted for the pages, we had better get them this
1636                  * time.
1637                  *
1638                  * N.B. All non-critical threads acquire the pcgs_throttle
1639                  * to serialize access to the freelists. This implements a
1640                  * turnstile-type synchornization to avoid starvation of
1641                  * critical requests for PG_NORELOC memory by non-critical
1642                  * threads: all non-critical threads must acquire a 'ticket'
1643                  * before passing through, which entails making sure
1644                  * freemem won't fall below minfree prior to grabbing pages
1645                  * from the freelists.
1646                  */
1647                 if (page_create_get_something_throttle()) {
1648                         mutex_enter(&pcgs_throttle);
1649                         throttled = 1;
1650                         VM_STAT_ADD(pcgs_throttled);
1651                 }
1652         }
1653
1654         /*
1655          * Time to get serious.
1656          * We failed to get a `correctly colored' page from both the
1657          * free and cache lists.
1658          * We escalate in stage.
1659          *
1660          * First try both lists without worring about color.
1661          *
1662          * Then, grab all page accounting locks (ie. pcf[]) and
1663          * steal any pages that they have and set the pcf_block flag to
1664          * stop deletions from the lists.  This will help because
1665          * a page can get added to the free list while we are looking
1666          * at the cache list, then another page could be added to the cache
1667          * list allowing the page on the free list to be removed as we
1668          * move from looking at the cache list to the free list. This
1669          * could happen over and over. We would never find the page
1670          * we have accounted for.
1671          *
1672          * Noreloc pages are a subset of the global (relocatable) page pool.
1673          * They are not tracked separately in the pcf bins, so it is
1674          * impossible to know when doing pcf accounting if the available
1675          * page(s) are noreloc pages or not. When looking for a noreloc page
1676          * it is quite easy to end up here even if the global (relocatable)
1677          * page pool has plenty of free pages but the noreloc pool is empty.
1678          *
1679          * When the noreloc pool is empty (or low), additional noreloc pages
1680          * are created by converting pages from the global page pool. This
1681          * process will stall during pcf accounting if the pcf bins are
1682          * already locked. Such is the case when a noreloc allocation is
1683          * looping here in page_create_get_something waiting for more noreloc
1684          * pages to appear.
1685          *
1686          * Short of adding a new field to the pcf bins to accurately track
1687          * the number of free noreloc pages, we instead do not grab the
1688          * pcgs_lock, do not set the pcf blocks and do not timeout when
1689          * allocating a noreloc page. This allows noreloc allocations to
1690          * loop without blocking global page pool allocations.
1691          *
1692          * NOTE: the behaviour of page_create_get_something has not changed
1693          * for the case of global page pool allocations.
1694          */
1695
1696         flags &= ~PG_MATCH_COLOR;
1697         locked = 0;
1698 #if defined(__i386) || defined(__amd64)
1699         flags = page_create_update_flags_x86(flags);
1700 #endif
1701
1702         lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
1703
1704         for (count = 0; count < MAX_PCGS; count++) {
1705                 pp = page_get_freelist(obj, off, seg, vaddr, PAGESIZE, flags,
1706                                        lgrp);
1707                 if (pp == NULL) {
1708                         pp = page_get_cachelist(obj, off, seg, vaddr, flags,
1709                                                 lgrp);
1710                 }
1711                 if (pp == NULL) {
1712                         /*
1713                          * Serialize.  Don't fight with other pcgs().
1714                          */
1715                         if (!locked) {
1716                                 mutex_enter(&pcgs_lock);
1717                                 VM_STAT_ADD(pcgs_locked);
1718                                 locked = 1;
1719                                 p = pcf;
1720                                 for (i = 0; i < pcf_fanout; i++) {
1721                                         mutex_enter(&p->pcf_lock);
1722                                         ASSERT(p->pcf_block == 0);
1723                                         p->pcf_block = 1;
1724                                         p->pcf_reserve = p->pcf_count;
1725                                         p->pcf_count = 0;
1726                                         mutex_exit(&p->pcf_lock);
1727                                         p++;
1728                                 }
1729                                 freemem = 0;
1730                         }
1731
1732                         if (count) {
1733                                 /*
1734                                  * Since page_free() puts pages on
1735                                  * a list then accounts for it, we
1736                                  * just have to wait for page_free()
1737                                  * to unlock any page it was working
1738                                  * with. The page_lock()-page_reclaim()
1739                                  * path falls in the same boat.
1740                                  *
1741                                  * We don't need to check on the
1742                                  * PG_WAIT flag, we have already
1743                                  * accounted for the page we are
1744                                  * looking for in page_create_va().
1745                                  *
1746                                  * We just wait a moment to let any
1747                                  * locked pages on the lists free up,
1748                                  * then continue around and try again.
1749                                  *
1750                                  * Will be awakened by set_freemem().
1751                                  */
1752                                 mutex_enter(&pcgs_wait_lock);
1753                                 cv_wait(&pcgs_cv, &pcgs_wait_lock);
1754                                 mutex_exit(&pcgs_wait_lock);
1755                         }
1756                 } else {
1757 #ifdef VM_STATS
1758                         if (count >= PCGS_TRIES) {
1759                                 VM_STAT_ADD(pcgs_too_many);
1760                         } else {
1761                                 VM_STAT_ADD(pcgs_counts[count]);
1762                         }
1763 #endif
1764                         if (locked) {
1765                                 pcgs_unblock();
1766                                 mutex_exit(&pcgs_lock);
1767                         }
1768                         if (throttled)
1769                                 mutex_exit(&pcgs_throttle);
1770                         return (pp);
1771                 }
1772         }
1773         /*
1774          * we go down holding the pcf locks.
1775          */
1776         panic("no %spage found %d",
1777             ((flags & PG_NORELOC) ? "non-reloc " : ""), count);
1778         /*NOTREACHED*/
1779 }
1780
1781 #ifdef DEBUG
1782 uint32_t pg_alloc_pgs_mtbf = 0;
1783 #endif
1784
1785 /*
1786  * Used for large page support. It will attempt to allocate
1787  * a large page(s) off the freelist.
1788  *
1789  * Returns non zero on failure.
1790  */
1791 int
1792 page_alloc_pages(struct vmobject *obj, struct seg *seg, caddr_t addr,
1793     struct page **basepp, struct page **ppa, uint_t szc, int anypgsz,
1794     int pgflags)
1795 {
1796         pgcnt_t         npgs, curnpgs, totpgs;
1797         size_t          pgsz;
1798         page_t          *pplist = NULL, *pp;
1799         int             err = 0;
1800         lgrp_t          *lgrp;
1801
1802         ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1));
1803         ASSERT(pgflags == 0 || pgflags == PG_LOCAL);
1804
1805         /*
1806          * Check if system heavily prefers local large pages over remote
1807          * on systems with multiple lgroups.
1808          */
1809         if (lpg_alloc_prefer == LPAP_LOCAL && nlgrps > 1) {
1810                 pgflags = PG_LOCAL;
1811         }
1812
1813         VM_STAT_ADD(alloc_pages[0]);
1814
1815 #ifdef DEBUG
1816         if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) {
1817                 return (ENOMEM);
1818         }
1819 #endif
1820
1821         /*
1822          * One must be NULL but not both.
1823          * And one must be non NULL but not both.
1824          */
1825         ASSERT(basepp != NULL || ppa != NULL);
1826         ASSERT(basepp == NULL || ppa == NULL);
1827
1828 #if defined(__i386) || defined(__amd64)
1829         while (page_chk_freelist(szc) == 0) {
1830                 VM_STAT_ADD(alloc_pages[8]);
1831                 if (anypgsz == 0 || --szc == 0)
1832                         return (ENOMEM);
1833         }
1834 #endif
1835
1836         pgsz = page_get_pagesize(szc);
1837         totpgs = curnpgs = npgs = pgsz >> PAGESHIFT;
1838
1839         ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0);
1840
1841         (void) page_create_wait(npgs, PG_WAIT);
1842
1843         while (npgs && szc) {
1844                 lgrp = lgrp_mem_choose(seg, addr, pgsz);
1845                 if (pgflags == PG_LOCAL) {
1846                         pp = page_get_freelist(obj, 0, seg, addr, pgsz, pgflags,
1847                                                lgrp);
1848                         if (pp == NULL) {
1849                                 pp = page_get_freelist(obj, 0, seg, addr, pgsz,
1850                                                        0, lgrp);
1851                         }
1852                 } else {
1853                         pp = page_get_freelist(obj, 0, seg, addr, pgsz, 0, lgrp);
1854                 }
1855                 if (pp != NULL) {
1856                         VM_STAT_ADD(alloc_pages[1]);
1857                         page_list_concat(&pplist, &pp);
1858                         ASSERT(npgs >= curnpgs);
1859                         npgs -= curnpgs;
1860                 } else if (anypgsz) {
1861                         VM_STAT_ADD(alloc_pages[2]);
1862                         szc--;
1863                         pgsz = page_get_pagesize(szc);
1864                         curnpgs = pgsz >> PAGESHIFT;
1865                 } else {
1866                         VM_STAT_ADD(alloc_pages[3]);
1867                         ASSERT(npgs == totpgs);
1868                         page_create_putback(npgs);
1869                         return (ENOMEM);
1870                 }
1871         }
1872         if (szc == 0) {
1873                 VM_STAT_ADD(alloc_pages[4]);
1874                 ASSERT(npgs != 0);
1875                 page_create_putback(npgs);
1876                 err = ENOMEM;
1877         } else if (basepp != NULL) {
1878                 ASSERT(npgs == 0);
1879                 ASSERT(ppa == NULL);
1880                 *basepp = pplist;
1881         }
1882
1883         npgs = totpgs - npgs;
1884         pp = pplist;
1885
1886         /*
1887          * Clear the free and age bits. Also if we were passed in a ppa then
1888          * fill it in with all the constituent pages from the large page. But
1889          * if we failed to allocate all the pages just free what we got.
1890          */
1891         while (npgs != 0) {
1892                 ASSERT(PP_ISFREE(pp));
1893                 ASSERT(PP_ISAGED(pp));
1894                 if (ppa != NULL || err != 0) {
1895                         if (err == 0) {
1896                                 VM_STAT_ADD(alloc_pages[5]);
1897                                 PP_CLRFREE(pp);
1898                                 PP_CLRAGED(pp);
1899                                 page_sub(&pplist, pp);
1900                                 *ppa++ = pp;
1901                                 npgs--;
1902                         } else {
1903                                 VM_STAT_ADD(alloc_pages[6]);
1904                                 ASSERT(pp->p_szc != 0);
1905                                 curnpgs = page_get_pagecnt(pp->p_szc);
1906                                 page_list_break(&pp, &pplist, curnpgs);
1907                                 page_list_add_pages(pp, 0);
1908                                 page_create_putback(curnpgs);
1909                                 ASSERT(npgs >= curnpgs);
1910                                 npgs -= curnpgs;
1911                         }
1912                         pp = pplist;
1913                 } else {
1914                         VM_STAT_ADD(alloc_pages[7]);
1915                         PP_CLRFREE(pp);
1916                         PP_CLRAGED(pp);
1917                         pp = pp->p_next;
1918                         npgs--;
1919                 }
1920         }
1921         return (err);
1922 }
1923
1924 /*
1925  * Get a single large page off of the freelists, and set it up for use.
1926  * Number of bytes requested must be a supported page size.
1927  *
1928  * Note that this call may fail even if there is sufficient
1929  * memory available or PG_WAIT is set, so the caller must
1930  * be willing to fallback on page_create_va(), block and retry,
1931  * or fail the requester.
1932  */
1933 struct page *
1934 page_create_va_large(struct vmobject *obj, uoff_t off, size_t bytes,
1935                      uint_t flags, struct seg *seg, caddr_t vaddr, void *arg)
1936 {
1937         pgcnt_t         npages;
1938         page_t          *pp;
1939         page_t          *rootpp;
1940         lgrp_t          *lgrp;
1941         lgrp_id_t       *lgrpid = (lgrp_id_t *)arg;
1942
1943         ASSERT(obj != NULL);
1944
1945         ASSERT((flags & ~(PG_EXCL | PG_WAIT |
1946             PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
1947         /* but no others */
1948
1949         ASSERT((flags & PG_EXCL) == PG_EXCL);
1950
1951         npages = btop(bytes);
1952
1953         flags &= ~PG_NORELOC;
1954
1955         /*
1956          * Make sure there's adequate physical memory available.
1957          * Note: PG_WAIT is ignored here.
1958          */
1959         if (freemem <= throttlefree + npages) {
1960                 VM_STAT_ADD(page_create_large_cnt[1]);
1961                 return (NULL);
1962         }
1963
1964         if (!pcf_decrement_bucket(npages) &&
1965             !pcf_decrement_multiple(NULL, npages, 1)) {
1966                 VM_STAT_ADD(page_create_large_cnt[4]);
1967                 return (NULL);
1968         }
1969
1970         /*
1971          * This is where this function behaves fundamentally differently
1972          * than page_create_va(); since we're intending to map the page
1973          * with a single TTE, we have to get it as a physically contiguous
1974          * hardware pagesize chunk.  If we can't, we fail.
1975          */
1976         if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max &&
1977             LGRP_EXISTS(lgrp_table[*lgrpid]))
1978                 lgrp = lgrp_table[*lgrpid];
1979         else
1980                 lgrp = lgrp_mem_choose(seg, vaddr, bytes);
1981
1982         if ((rootpp = page_get_freelist(&kvp.v_object, off, seg, vaddr,
1983             bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) {
1984                 page_create_putback(npages);
1985                 VM_STAT_ADD(page_create_large_cnt[5]);
1986                 return (NULL);
1987         }
1988
1989         /*
1990          * If satisfying this request has left us with too little
1991          * memory, start the wheels turning to get some back.  The
1992          * first clause of the test prevents waking up the pageout
1993          * daemon in situations where it would decide that there's
1994          * nothing to do.
1995          */
1996         if (nscan < desscan && freemem < minfree) {
1997                 cv_signal(&proc_pageout->p_cv);
1998         }
1999
2000         pp = rootpp;
2001         while (npages--) {
2002                 ASSERT(PAGE_EXCL(pp));
2003                 VERIFY(pp->p_object == NULL);
2004                 ASSERT(pp->p_vnode == NULL);
2005                 ASSERT(!hat_page_is_mapped(pp));
2006                 PP_CLRFREE(pp);
2007                 PP_CLRAGED(pp);
2008                 if (!page_hashin(pp, obj, off, false))
2009                         panic("page_create_large: hashin failed: page %p",
2010                             (void *)pp);
2011                 page_io_lock(pp);
2012                 off += PAGESIZE;
2013                 pp = pp->p_next;
2014         }
2015
2016         VM_STAT_ADD(page_create_large_cnt[0]);
2017         return (rootpp);
2018 }
2019
2020
2021 /*
2022  * Create enough pages for "bytes" worth of data starting at
2023  * "off" in "obj".
2024  *
2025  *      Where flag must be one of:
2026  *
2027  *              PG_EXCL:        Exclusive create (fail if any page already
2028  *                              exists in the page cache) which does not
2029  *                              wait for memory to become available.
2030  *
2031  *              PG_WAIT:        Non-exclusive create which can wait for
2032  *                              memory to become available.
2033  *
2034  *              PG_PHYSCONTIG:  Allocate physically contiguous pages.
2035  *                              (Not Supported)
2036  *
2037  * A doubly linked list of pages is returned to the caller.  Each page
2038  * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock)
2039  * lock.
2040  *
2041  * Unable to change the parameters to page_create() in a minor release,
2042  * we renamed page_create() to page_create_va(), and changed all known calls
2043  * from page_create() to page_create_va().
2044  *
2045  * We should consider ditch this renaming by replacing all the strings
2046  * "page_create_va", with "page_create".
2047  *
2048  * NOTE: There is a copy of this interface as page_create_io() in
2049  *       i86/vm/vm_machdep.c. Any bugs fixed here should be applied
2050  *       there.
2051  */
2052 struct page *
2053 page_create_va(struct vmobject *obj, uoff_t off, size_t bytes, uint_t flags,
2054     struct seg *seg, caddr_t vaddr)
2055 {
2056         page_t          *plist = NULL;
2057         pgcnt_t         npages;
2058         pgcnt_t         found_on_free = 0;
2059         pgcnt_t         pages_req;
2060         page_t          *npp = NULL;
2061         struct pcf      *p;
2062         lgrp_t          *lgrp;
2063
2064         ASSERT(bytes != 0 && obj != NULL);
2065
2066         if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) {
2067                 panic("page_create: invalid flags");
2068                 /*NOTREACHED*/
2069         }
2070         ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2071             PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2072             /* but no others */
2073
2074         pages_req = npages = btopr(bytes);
2075         /*
2076          * Try to see whether request is too large to *ever* be
2077          * satisfied, in order to prevent deadlock.  We arbitrarily
2078          * decide to limit maximum size requests to max_page_get.
2079          */
2080         if (npages >= max_page_get) {
2081                 if ((flags & PG_WAIT) == 0) {
2082                         return (NULL);
2083                 } else {
2084                         cmn_err(CE_WARN,
2085                             "Request for too much kernel memory "
2086                             "(%lu bytes), will hang forever", bytes);
2087                         for (;;)
2088                                 delay(1000000000);
2089                 }
2090         }
2091
2092         flags &= ~PG_NORELOC;
2093
2094         if (freemem <= throttlefree + npages)
2095                 if (!page_create_throttle(npages, flags))
2096                         return (NULL);
2097
2098         VM_STAT_ADD(page_create_cnt[0]);
2099
2100         if (!pcf_decrement_bucket(npages)) {
2101                 /*
2102                  * Have to look harder.  If npages is greater than
2103                  * one, then we might have to coalesce the counters.
2104                  *
2105                  * Go wait.  We come back having accounted
2106                  * for the memory.
2107                  */
2108                 VM_STAT_ADD(page_create_cnt[1]);
2109                 if (!page_create_wait(npages, flags)) {
2110                         VM_STAT_ADD(page_create_cnt[2]);
2111                         return (NULL);
2112                 }
2113         }
2114
2115         /*
2116          * If satisfying this request has left us with too little
2117          * memory, start the wheels turning to get some back.  The
2118          * first clause of the test prevents waking up the pageout
2119          * daemon in situations where it would decide that there's
2120          * nothing to do.
2121          */
2122         if (nscan < desscan && freemem < minfree) {
2123                 cv_signal(&proc_pageout->p_cv);
2124         }
2125
2126         /*
2127          * Loop around collecting the requested number of pages.
2128          * Most of the time, we have to `create' a new page. With
2129          * this in mind, pull the page off the free list before
2130          * getting the hash lock.  This will minimize the hash
2131          * lock hold time, nesting, and the like.  If it turns
2132          * out we don't need the page, we put it back at the end.
2133          */
2134         while (npages--) {
2135                 page_t *pp;
2136
2137 top:
2138                 ASSERT(!VMOBJECT_LOCKED(obj));
2139
2140                 if (npp == NULL) {
2141                         /*
2142                          * Try to get a page from the freelist (ie,
2143                          * a page with no [obj, off] tag).  If that
2144                          * fails, use the cachelist.
2145                          *
2146                          * During the first attempt at both the free
2147                          * and cache lists we try for the correct color.
2148                          */
2149                         /*
2150                          * XXXX-how do we deal with virtual indexed
2151                          * caches and and colors?
2152                          */
2153                         VM_STAT_ADD(page_create_cnt[4]);
2154                         /*
2155                          * Get lgroup to allocate next page of shared memory
2156                          * from and use it to specify where to allocate
2157                          * the physical memory
2158                          */
2159                         lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
2160                         npp = page_get_freelist(obj, off, seg, vaddr, PAGESIZE,
2161                                                 flags | PG_MATCH_COLOR, lgrp);
2162                         if (npp == NULL) {
2163                                 npp = page_get_cachelist(obj, off, seg, vaddr,
2164                                                          flags | PG_MATCH_COLOR,
2165                                                          lgrp);
2166                                 if (npp == NULL) {
2167                                         npp = page_create_get_something(
2168                                             obj, off, seg, vaddr,
2169                                             flags & ~PG_MATCH_COLOR);
2170                                 }
2171
2172                                 if (PP_ISAGED(npp) == 0) {
2173                                         /*
2174                                          * Since this page came from the
2175                                          * cachelist, we must destroy the
2176                                          * old vnode association.
2177                                          */
2178                                         page_hashout(npp, false);
2179                                 }
2180                         }
2181                 }
2182
2183                 /*
2184                  * We own this page!
2185                  */
2186                 ASSERT(PAGE_EXCL(npp));
2187                 VERIFY(npp->p_object == NULL);
2188                 ASSERT(npp->p_vnode == NULL);
2189                 ASSERT(!hat_page_is_mapped(npp));
2190                 PP_CLRFREE(npp);
2191                 PP_CLRAGED(npp);
2192
2193                 /*
2194                  * Here we have a page in our hot little mits and are
2195                  * just waiting to stuff it on the appropriate lists.
2196                  * Get the mutex and check to see if it really does
2197                  * not exist.
2198                  */
2199                 vmobject_lock(obj);
2200                 pp = find_page(obj, off);
2201                 if (pp == NULL) {
2202                         VM_STAT_ADD(page_create_new);
2203                         pp = npp;
2204                         npp = NULL;
2205                         if (!page_hashin(pp, obj, off, true)) {
2206                                 /*
2207                                  * Since we hold the page vnode page cache
2208                                  * mutex and just searched for this page,
2209                                  * page_hashin had better not fail.  If it
2210                                  * does, that means some thread did not
2211                                  * follow the page hash mutex rules.  Panic
2212                                  * now and get it over with.  As usual, go
2213                                  * down holding all the locks.
2214                                  */
2215                                 ASSERT(VMOBJECT_LOCKED(obj));
2216                                 panic("page_create: "
2217                                     "hashin failed %p %p %llx", pp, obj, off);
2218                                 /*NOTREACHED*/
2219                         }
2220                         ASSERT(VMOBJECT_LOCKED(obj));
2221                         vmobject_unlock(obj);
2222
2223                         /*
2224                          * Hat layer locking need not be done to set
2225                          * the following bits since the page is not hashed
2226                          * and was on the free list (i.e., had no mappings).
2227                          *
2228                          * Set the reference bit to protect
2229                          * against immediate pageout
2230                          *
2231                          * XXXmh modify freelist code to set reference
2232                          * bit so we don't have to do it here.
2233                          */
2234                         page_set_props(pp, P_REF);
2235                         found_on_free++;
2236                 } else {
2237                         VM_STAT_ADD(page_create_exists);
2238                         if (flags & PG_EXCL) {
2239                                 /*
2240                                  * Found an existing page, and the caller
2241                                  * wanted all new pages.  Undo all of the work
2242                                  * we have done.
2243                                  */
2244                                 vmobject_unlock(obj);
2245                                 while (plist != NULL) {
2246                                         pp = plist;
2247                                         page_sub(&plist, pp);
2248                                         page_io_unlock(pp);
2249                                         /* large pages should not end up here */
2250                                         ASSERT(pp->p_szc == 0);
2251
2252                                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
2253                                 }
2254                                 VM_STAT_ADD(page_create_found_one);
2255                                 goto fail;
2256                         }
2257                         ASSERT(flags & PG_WAIT);
2258                         if (!page_lock(pp, SE_EXCL, obj, P_NO_RECLAIM)) {
2259                                 /*
2260                                  * Start all over again if we blocked trying
2261                                  * to lock the page.
2262                                  */
2263                                 vmobject_unlock(obj);
2264                                 VM_STAT_ADD(page_create_page_lock_failed);
2265                                 goto top;
2266                         }
2267                         vmobject_unlock(obj);
2268
2269                         if (PP_ISFREE(pp)) {
2270                                 ASSERT(PP_ISAGED(pp) == 0);
2271                                 VM_STAT_ADD(pagecnt.pc_get_cache);
2272                                 page_list_sub(pp, PG_CACHE_LIST);
2273                                 PP_CLRFREE(pp);
2274                                 found_on_free++;
2275                         }
2276                 }
2277
2278                 /*
2279                  * Got a page!  It is locked.  Acquire the i/o
2280                  * lock since we are going to use the p_next and
2281                  * p_prev fields to link the requested pages together.
2282                  */
2283                 page_io_lock(pp);
2284                 page_add(&plist, pp);
2285                 plist = plist->p_next;
2286                 off += PAGESIZE;
2287                 vaddr += PAGESIZE;
2288         }
2289
2290         ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1);
2291 fail:
2292         if (npp != NULL) {
2293                 /*
2294                  * Did not need this page after all.
2295                  * Put it back on the free list.
2296                  */
2297                 VM_STAT_ADD(page_create_putbacks);
2298                 PP_SETFREE(npp);
2299                 PP_SETAGED(npp);
2300                 npp->p_offset = (uoff_t)-1;
2301                 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
2302                 page_unlock(npp);
2303         }
2304
2305         ASSERT(pages_req >= found_on_free);
2306
2307         {
2308                 uint_t overshoot = (uint_t)(pages_req - found_on_free);
2309
2310                 if (overshoot) {
2311                         VM_STAT_ADD(page_create_overshoot);
2312                         p = &pcf[PCF_INDEX()];
2313                         mutex_enter(&p->pcf_lock);
2314                         if (p->pcf_block) {
2315                                 p->pcf_reserve += overshoot;
2316                         } else {
2317                                 p->pcf_count += overshoot;
2318                                 if (p->pcf_wait) {
2319                                         mutex_enter(&new_freemem_lock);
2320                                         if (freemem_wait) {
2321                                                 cv_signal(&freemem_cv);
2322                                                 p->pcf_wait--;
2323                                         } else {
2324                                                 p->pcf_wait = 0;
2325                                         }
2326                                         mutex_exit(&new_freemem_lock);
2327                                 }
2328                         }
2329                         mutex_exit(&p->pcf_lock);
2330                         /* freemem is approximate, so this test OK */
2331                         if (!p->pcf_block)
2332                                 freemem += overshoot;
2333                 }
2334         }
2335
2336         return (plist);
2337 }
2338
2339 /*
2340  * One or more constituent pages of this large page has been marked
2341  * toxic. Simply demote the large page to PAGESIZE pages and let
2342  * page_free() handle it. This routine should only be called by
2343  * large page free routines (page_free_pages() and page_destroy_pages().
2344  * All pages are locked SE_EXCL and have already been marked free.
2345  */
2346 static void
2347 page_free_toxic_pages(page_t *rootpp)
2348 {
2349         page_t  *tpp;
2350         pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc);
2351         uint_t  szc = rootpp->p_szc;
2352
2353         for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) {
2354                 ASSERT(tpp->p_szc == szc);
2355                 ASSERT((PAGE_EXCL(tpp) &&
2356                     !page_iolock_assert(tpp)) || panicstr);
2357                 tpp->p_szc = 0;
2358         }
2359
2360         while (rootpp != NULL) {
2361                 tpp = rootpp;
2362                 page_sub(&rootpp, tpp);
2363                 ASSERT(PP_ISFREE(tpp));
2364                 PP_CLRFREE(tpp);
2365                 page_free(tpp, 1);
2366         }
2367 }
2368
2369 /*
2370  * Put page on the "free" list.
2371  * The free list is really two lists maintained by
2372  * the PSM of whatever machine we happen to be on.
2373  */
2374 void
2375 page_free(page_t *pp, int dontneed)
2376 {
2377         struct pcf      *p;
2378         uint_t          pcf_index;
2379
2380         ASSERT((PAGE_EXCL(pp) &&
2381             !page_iolock_assert(pp)) || panicstr);
2382
2383         if (PP_ISFREE(pp)) {
2384                 panic("page_free: page %p is free", (void *)pp);
2385         }
2386
2387         if (pp->p_szc != 0) {
2388                 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
2389                     PP_ISKAS(pp)) {
2390                         panic("page_free: anon or kernel "
2391                             "or no vnode large page %p", (void *)pp);
2392                 }
2393                 page_demote_vp_pages(pp);
2394                 ASSERT(pp->p_szc == 0);
2395         }
2396
2397         /*
2398          * The page_struct_lock need not be acquired to examine these
2399          * fields since the page has an "exclusive" lock.
2400          */
2401         if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
2402             pp->p_slckcnt != 0) {
2403                 panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d "
2404                     "slckcnt = %d", (void *)pp, page_pptonum(pp), pp->p_lckcnt,
2405                     pp->p_cowcnt, pp->p_slckcnt);
2406                 /*NOTREACHED*/
2407         }
2408
2409         ASSERT(!hat_page_getshare(pp));
2410
2411         PP_SETFREE(pp);
2412         ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) ||
2413             !hat_ismod(pp));
2414         page_clr_all_props(pp);
2415         ASSERT(!hat_page_getshare(pp));
2416
2417         /*
2418          * Now we add the page to the head of the free list.
2419          * But if this page is associated with a paged vnode
2420          * then we adjust the head forward so that the page is
2421          * effectively at the end of the list.
2422          */
2423         if (pp->p_vnode == NULL) {
2424                 /*
2425                  * Page has no identity, put it on the free list.
2426                  */
2427                 PP_SETAGED(pp);
2428                 pp->p_offset = (uoff_t)-1;
2429                 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
2430                 VM_STAT_ADD(pagecnt.pc_free_free);
2431         } else {
2432                 PP_CLRAGED(pp);
2433
2434                 if (!dontneed) {
2435                         /* move it to the tail of the list */
2436                         page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL);
2437
2438                         VM_STAT_ADD(pagecnt.pc_free_cache);
2439                 } else {
2440                         page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD);
2441
2442                         VM_STAT_ADD(pagecnt.pc_free_dontneed);
2443                 }
2444         }
2445         page_unlock(pp);
2446
2447         /*
2448          * Now do the `freemem' accounting.
2449          */
2450         pcf_index = PCF_INDEX();
2451         p = &pcf[pcf_index];
2452
2453         mutex_enter(&p->pcf_lock);
2454         if (p->pcf_block) {
2455                 p->pcf_reserve += 1;
2456         } else {
2457                 p->pcf_count += 1;
2458                 if (p->pcf_wait) {
2459                         mutex_enter(&new_freemem_lock);
2460                         /*
2461                          * Check to see if some other thread
2462                          * is actually waiting.  Another bucket
2463                          * may have woken it up by now.  If there
2464                          * are no waiters, then set our pcf_wait
2465                          * count to zero to avoid coming in here
2466                          * next time.  Also, since only one page
2467                          * was put on the free list, just wake
2468                          * up one waiter.
2469                          */
2470                         if (freemem_wait) {
2471                                 cv_signal(&freemem_cv);
2472                                 p->pcf_wait--;
2473                         } else {
2474                                 p->pcf_wait = 0;
2475                         }
2476                         mutex_exit(&new_freemem_lock);
2477                 }
2478         }
2479         mutex_exit(&p->pcf_lock);
2480
2481         /* freemem is approximate, so this test OK */
2482         if (!p->pcf_block)
2483                 freemem += 1;
2484 }
2485
2486 /*
2487  * Put page on the "free" list during intial startup.
2488  * This happens during initial single threaded execution.
2489  */
2490 void
2491 page_free_at_startup(page_t *pp)
2492 {
2493         struct pcf      *p;
2494         uint_t          pcf_index;
2495
2496         page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT);
2497         VM_STAT_ADD(pagecnt.pc_free_free);
2498
2499         /*
2500          * Now do the `freemem' accounting.
2501          */
2502         pcf_index = PCF_INDEX();
2503         p = &pcf[pcf_index];
2504
2505         ASSERT(p->pcf_block == 0);
2506         ASSERT(p->pcf_wait == 0);
2507         p->pcf_count += 1;
2508
2509         /* freemem is approximate, so this is OK */
2510         freemem += 1;
2511 }
2512
2513 void
2514 page_free_pages(page_t *pp)
2515 {
2516         page_t  *tpp, *rootpp = NULL;
2517         pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
2518         pgcnt_t i;
2519         uint_t  szc = pp->p_szc;
2520
2521         VM_STAT_ADD(pagecnt.pc_free_pages);
2522
2523         ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
2524         if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
2525                 panic("page_free_pages: not root page %p", (void *)pp);
2526                 /*NOTREACHED*/
2527         }
2528
2529         for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
2530                 ASSERT((PAGE_EXCL(tpp) &&
2531                     !page_iolock_assert(tpp)) || panicstr);
2532                 if (PP_ISFREE(tpp)) {
2533                         panic("page_free_pages: page %p is free", (void *)tpp);
2534                         /*NOTREACHED*/
2535                 }
2536                 if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 ||
2537                     tpp->p_cowcnt != 0 || tpp->p_slckcnt != 0) {
2538                         panic("page_free_pages %p", (void *)tpp);
2539                         /*NOTREACHED*/
2540                 }
2541
2542                 ASSERT(!hat_page_getshare(tpp));
2543                 VERIFY(tpp->p_object == NULL);
2544                 ASSERT(tpp->p_vnode == NULL);
2545                 ASSERT(tpp->p_szc == szc);
2546
2547                 PP_SETFREE(tpp);
2548                 page_clr_all_props(tpp);
2549                 PP_SETAGED(tpp);
2550                 tpp->p_offset = (uoff_t)-1;
2551                 ASSERT(tpp->p_next == tpp);
2552                 ASSERT(tpp->p_prev == tpp);
2553                 page_list_concat(&rootpp, &tpp);
2554         }
2555         ASSERT(rootpp == pp);
2556
2557         page_list_add_pages(rootpp, 0);
2558         page_create_putback(pgcnt);
2559 }
2560
2561 int free_pages = 1;
2562
2563 /*
2564  * This routine attempts to return pages to the cachelist via page_release().
2565  * It does not *have* to be successful in all cases, since the pageout scanner
2566  * will catch any pages it misses.  It does need to be fast and not introduce
2567  * too much overhead.
2568  *
2569  * If a page isn't found on the unlocked sweep of the page_hash bucket, we
2570  * don't lock and retry.  This is ok, since the page scanner will eventually
2571  * find any page we miss in free_vp_pages().
2572  */
2573 void
2574 free_vp_pages(struct vmobject *obj, uoff_t off, size_t len)
2575 {
2576         page_t *pp;
2577         uoff_t eoff;
2578         extern int swap_in_range(vnode_t *, uoff_t, size_t);
2579
2580         eoff = off + len;
2581
2582         if (free_pages == 0)
2583                 return;
2584         if (swap_in_range(obj->vnode, off, len))
2585                 return;
2586
2587         for (; off < eoff; off += PAGESIZE) {
2588
2589                 /*
2590                  * find the page using a fast, but inexact search. It'll be OK
2591                  * if a few pages slip through the cracks here.
2592                  */
2593                 pp = page_exists(obj, off);
2594
2595                 /*
2596                  * If we didn't find the page (it may not exist), the page
2597                  * is free, looks still in use (shared), or we can't lock it,
2598                  * just give up.
2599                  */
2600                 if (pp == NULL ||
2601                     PP_ISFREE(pp) ||
2602                     page_share_cnt(pp) > 0 ||
2603                     !page_trylock(pp, SE_EXCL))
2604                         continue;
2605
2606                 /*
2607                  * Once we have locked pp, verify that it's still the
2608                  * correct page and not already free
2609                  */
2610                 ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL));
2611                 if (pp->p_vnode != obj->vnode || pp->p_offset != off ||
2612                     PP_ISFREE(pp)) {
2613                         page_unlock(pp);
2614                         continue;
2615                 }
2616
2617                 /*
2618                  * try to release the page...
2619                  */
2620                 (void) page_release(pp, 1);
2621         }
2622 }
2623
2624 /*
2625  * Reclaim the given page from the free list.
2626  * If pp is part of a large pages, only the given constituent page is reclaimed
2627  * and the large page it belonged to will be demoted.  This can only happen
2628  * if the page is not on the cachelist.
2629  *
2630  * Returns 1 on success or 0 on failure.
2631  *
2632  * The page is unlocked if it can't be reclaimed (when freemem == 0).
2633  * If `lock' is non-null, it will be dropped and re-acquired if
2634  * the routine must wait while freemem is 0.
2635  *
2636  * As it turns out, boot_getpages() does this.  It picks a page,
2637  * based on where OBP mapped in some address, gets its pfn, searches
2638  * the memsegs, locks the page, then pulls it off the free list!
2639  */
2640 int
2641 page_reclaim(struct page *pp, struct vmobject *obj)
2642 {
2643         struct pcf      *p;
2644         struct cpu      *cpup;
2645         int             enough;
2646         uint_t          i;
2647
2648         ASSERT(obj != NULL ? VMOBJECT_LOCKED(obj) : 1);
2649         ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp));
2650
2651         /*
2652          * If `freemem' is 0, we cannot reclaim this page from the
2653          * freelist, so release every lock we might hold: the page,
2654          * and the vnode page lock before blocking.
2655          *
2656          * The only way `freemem' can become 0 while there are pages
2657          * marked free (have their p->p_free bit set) is when the
2658          * system is low on memory and doing a page_create().  In
2659          * order to guarantee that once page_create() starts acquiring
2660          * pages it will be able to get all that it needs since `freemem'
2661          * was decreased by the requested amount.  So, we need to release
2662          * this page, and let page_create() have it.
2663          *
2664          * Since `freemem' being zero is not supposed to happen, just
2665          * use the usual hash stuff as a starting point.  If that bucket
2666          * is empty, then assume the worst, and start at the beginning
2667          * of the pcf array.  If we always start at the beginning
2668          * when acquiring more than one pcf lock, there won't be any
2669          * deadlock problems.
2670          */
2671
2672         if (freemem <= throttlefree && !page_create_throttle(1l, 0)) {
2673                 pcf_acquire_all();
2674                 goto page_reclaim_nomem;
2675         }
2676
2677         enough = pcf_decrement_bucket(1);
2678
2679         if (!enough) {
2680                 VM_STAT_ADD(page_reclaim_zero);
2681                 /*
2682                  * Check again. Its possible that some other thread
2683                  * could have been right behind us, and added one
2684                  * to a list somewhere.  Acquire each of the pcf locks
2685                  * until we find a page.
2686                  */
2687                 p = pcf;
2688                 for (i = 0; i < pcf_fanout; i++) {
2689                         mutex_enter(&p->pcf_lock);
2690                         if (p->pcf_count >= 1) {
2691                                 p->pcf_count -= 1;
2692                                 /*
2693                                  * freemem is not protected by any lock. Thus,
2694                                  * we cannot have any assertion containing
2695                                  * freemem here.
2696                                  */
2697                                 freemem -= 1;
2698                                 enough = 1;
2699                                 break;
2700                         }
2701                         p++;
2702                 }
2703
2704                 if (!enough) {
2705 page_reclaim_nomem:
2706                         /*
2707                          * We really can't have page `pp'.
2708                          * Time for the no-memory dance with
2709                          * page_free().  This is just like
2710                          * page_create_wait().  Plus the added
2711                          * attraction of releasing the vnode page lock.
2712                          * Page_unlock() will wakeup any thread
2713                          * waiting around for this page.
2714                          */
2715                         if (obj != NULL) {
2716                                 VM_STAT_ADD(page_reclaim_zero_locked);
2717                                 vmobject_unlock(obj);
2718                         }
2719                         page_unlock(pp);
2720
2721                         /*
2722                          * get this before we drop all the pcf locks.
2723                          */
2724                         mutex_enter(&new_freemem_lock);
2725
2726                         p = pcf;
2727                         for (i = 0; i < pcf_fanout; i++) {
2728                                 p->pcf_wait++;
2729                                 mutex_exit(&p->pcf_lock);
2730                                 p++;
2731                         }
2732
2733                         freemem_wait++;
2734                         cv_wait(&freemem_cv, &new_freemem_lock);
2735                         freemem_wait--;
2736
2737                         mutex_exit(&new_freemem_lock);
2738
2739                         if (obj != NULL)
2740                                 vmobject_lock(obj);
2741
2742                         return (0);
2743                 }
2744
2745                 /*
2746                  * The pcf accounting has been done,
2747                  * though none of the pcf_wait flags have been set,
2748                  * drop the locks and continue on.
2749                  */
2750                 while (p >= pcf) {
2751                         mutex_exit(&p->pcf_lock);
2752                         p--;
2753                 }
2754         }
2755
2756
2757         VM_STAT_ADD(pagecnt.pc_reclaim);
2758
2759         /*
2760          * page_list_sub will handle the case where pp is a large page.
2761          * It's possible that the page was promoted while on the freelist
2762          */
2763         if (PP_ISAGED(pp)) {
2764                 page_list_sub(pp, PG_FREE_LIST);
2765         } else {
2766                 page_list_sub(pp, PG_CACHE_LIST);
2767         }
2768
2769         /*
2770          * clear the p_free & p_age bits since this page is no longer
2771          * on the free list.  Notice that there was a brief time where
2772          * a page is marked as free, but is not on the list.
2773          *
2774          * Set the reference bit to protect against immediate pageout.
2775          */
2776         PP_CLRFREE(pp);
2777         PP_CLRAGED(pp);
2778         page_set_props(pp, P_REF);
2779
2780         CPU_STATS_ENTER_K();
2781         cpup = CPU;     /* get cpup now that CPU cannot change */
2782         CPU_STATS_ADDQ(cpup, vm, pgrec, 1);
2783         CPU_STATS_ADDQ(cpup, vm, pgfrec, 1);
2784         CPU_STATS_EXIT_K();
2785         ASSERT(pp->p_szc == 0);
2786
2787         return (1);
2788 }
2789
2790 /*
2791  * Destroy identity of the page and put it back on
2792  * the page free list.  Assumes that the caller has
2793  * acquired the "exclusive" lock on the page.
2794  */
2795 void
2796 page_destroy(page_t *pp, int dontfree)
2797 {
2798         ASSERT((PAGE_EXCL(pp) &&
2799             !page_iolock_assert(pp)) || panicstr);
2800         ASSERT(pp->p_slckcnt == 0 || panicstr);
2801
2802         if (pp->p_szc != 0) {
2803                 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
2804                     PP_ISKAS(pp)) {
2805                         panic("page_destroy: anon or kernel or no vnode "
2806                             "large page %p", (void *)pp);
2807                 }
2808                 page_demote_vp_pages(pp);
2809                 ASSERT(pp->p_szc == 0);
2810         }
2811
2812         /*
2813          * Unload translations, if any, then hash out the
2814          * page to erase its identity.
2815          */
2816         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2817         page_hashout(pp, false);
2818
2819         if (!dontfree) {
2820                 /*
2821                  * Acquire the "freemem_lock" for availrmem.
2822                  * The page_struct_lock need not be acquired for lckcnt
2823                  * and cowcnt since the page has an "exclusive" lock.
2824                  * We are doing a modified version of page_pp_unlock here.
2825                  */
2826                 if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) {
2827                         mutex_enter(&freemem_lock);
2828                         if (pp->p_lckcnt != 0) {
2829                                 availrmem++;
2830                                 pages_locked--;
2831                                 pp->p_lckcnt = 0;
2832                         }
2833                         if (pp->p_cowcnt != 0) {
2834                                 availrmem += pp->p_cowcnt;
2835                                 pages_locked -= pp->p_cowcnt;
2836                                 pp->p_cowcnt = 0;
2837                         }
2838                         mutex_exit(&freemem_lock);
2839                 }
2840                 /*
2841                  * Put the page on the "free" list.
2842                  */
2843                 page_free(pp, 0);
2844         }
2845 }
2846
2847 void
2848 page_destroy_pages(page_t *pp)
2849 {
2850
2851         page_t  *tpp, *rootpp = NULL;
2852         pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
2853         pgcnt_t i, pglcks = 0;
2854         uint_t  szc = pp->p_szc;
2855
2856         ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
2857
2858         VM_STAT_ADD(pagecnt.pc_destroy_pages);
2859
2860         if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
2861                 panic("page_destroy_pages: not root page %p", (void *)pp);
2862                 /*NOTREACHED*/
2863         }
2864
2865         for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
2866                 ASSERT((PAGE_EXCL(tpp) &&
2867                     !page_iolock_assert(tpp)) || panicstr);
2868                 ASSERT(tpp->p_slckcnt == 0 || panicstr);
2869                 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
2870                 page_hashout(tpp, false);
2871                 ASSERT(tpp->p_offset == (uoff_t)-1);
2872                 if (tpp->p_lckcnt != 0) {
2873                         pglcks++;
2874                         tpp->p_lckcnt = 0;
2875                 } else if (tpp->p_cowcnt != 0) {
2876                         pglcks += tpp->p_cowcnt;
2877                         tpp->p_cowcnt = 0;
2878                 }
2879                 ASSERT(!hat_page_getshare(tpp));
2880                 VERIFY(tpp->p_object == NULL);
2881                 ASSERT(tpp->p_vnode == NULL);
2882                 ASSERT(tpp->p_szc == szc);
2883
2884                 PP_SETFREE(tpp);
2885                 page_clr_all_props(tpp);
2886                 PP_SETAGED(tpp);
2887                 ASSERT(tpp->p_next == tpp);
2888                 ASSERT(tpp->p_prev == tpp);
2889                 page_list_concat(&rootpp, &tpp);
2890         }
2891
2892         ASSERT(rootpp == pp);
2893         if (pglcks != 0) {
2894                 mutex_enter(&freemem_lock);
2895                 availrmem += pglcks;
2896                 mutex_exit(&freemem_lock);
2897         }
2898
2899         page_list_add_pages(rootpp, 0);
2900         page_create_putback(pgcnt);
2901 }
2902
2903 /*
2904  * Similar to page_destroy(), but destroys pages which are
2905  * locked and known to be on the page free list.  Since
2906  * the page is known to be free and locked, no one can access
2907  * it.
2908  *
2909  * Also, the number of free pages does not change.
2910  */
2911 void
2912 page_destroy_free(page_t *pp)
2913 {
2914         ASSERT(PAGE_EXCL(pp));
2915         ASSERT(PP_ISFREE(pp));
2916         ASSERT(pp->p_vnode);
2917         ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0);
2918         ASSERT(!hat_page_is_mapped(pp));
2919         ASSERT(PP_ISAGED(pp) == 0);
2920         ASSERT(pp->p_szc == 0);
2921
2922         VM_STAT_ADD(pagecnt.pc_destroy_free);
2923         page_list_sub(pp, PG_CACHE_LIST);
2924
2925         page_hashout(pp, false);
2926         VERIFY(pp->p_object == NULL);
2927         ASSERT(pp->p_vnode == NULL);
2928         ASSERT(pp->p_offset == (uoff_t)-1);
2929
2930         PP_SETAGED(pp);
2931         page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
2932         page_unlock(pp);
2933
2934         mutex_enter(&new_freemem_lock);
2935         if (freemem_wait) {
2936                 cv_signal(&freemem_cv);
2937         }
2938         mutex_exit(&new_freemem_lock);
2939 }
2940
2941 /*
2942  * Rename the page "opp" to have an identity specified
2943  * by [vp, off].  If a page already exists with this name
2944  * it is locked and destroyed.  Note that the page's
2945  * translations are not unloaded during the rename.
2946  *
2947  * This routine is used by the anon layer to "steal" the
2948  * original page and is not unlike destroying a page and
2949  * creating a new page using the same page frame.
2950  *
2951  * XXX -- Could deadlock if caller 1 tries to rename A to B while
2952  * caller 2 tries to rename B to A.
2953  */
2954 void
2955 page_rename(struct page *opp, struct vmobject *obj, uoff_t off)
2956 {
2957         page_t          *pp;
2958         int             olckcnt = 0;
2959         int             ocowcnt = 0;
2960
2961         ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp));
2962         ASSERT(!VMOBJECT_LOCKED(obj));
2963         ASSERT(PP_ISFREE(opp) == 0);
2964
2965         VM_STAT_ADD(page_rename_count);
2966
2967         /*
2968          * CacheFS may call page_rename for a large NFS page
2969          * when both CacheFS and NFS mount points are used
2970          * by applications. Demote this large page before
2971          * renaming it, to ensure that there are no "partial"
2972          * large pages left lying around.
2973          */
2974         if (opp->p_szc != 0) {
2975                 vnode_t *ovp = opp->p_vnode;
2976                 ASSERT(ovp != NULL);
2977                 ASSERT(!IS_SWAPFSVP(ovp));
2978                 ASSERT(!VN_ISKAS(ovp));
2979                 page_demote_vp_pages(opp);
2980                 ASSERT(opp->p_szc == 0);
2981         }
2982
2983         page_hashout(opp, false);
2984         PP_CLRAGED(opp);
2985
2986         vmobject_lock(obj);
2987 top:
2988         /*
2989          * Look for an existing page with this name and destroy it if found.
2990          * By holding the page hash lock all the way to the page_hashin()
2991          * call, we are assured that no page can be created with this
2992          * identity.  In the case when the phm lock is dropped to undo any
2993          * hat layer mappings, the existing page is held with an "exclusive"
2994          * lock, again preventing another page from being created with
2995          * this identity.
2996          */
2997         pp = find_page(obj, off);
2998         if (pp != NULL) {
2999                 VM_STAT_ADD(page_rename_exists);
3000
3001                 /*
3002                  * As it turns out, this is one of only two places where
3003                  * page_lock() needs to hold the passed in lock in the
3004                  * successful case.  In all of the others, the lock could
3005                  * be dropped as soon as the attempt is made to lock
3006                  * the page.  It is tempting to add yet another arguement,
3007                  * PL_KEEP or PL_DROP, to let page_lock know what to do.
3008                  */
3009                 if (!page_lock(pp, SE_EXCL, obj, P_RECLAIM)) {
3010                         /*
3011                          * Went to sleep because the page could not
3012                          * be locked.  We were woken up when the page
3013                          * was unlocked, or when the page was destroyed.
3014                          * In either case, `phm' was dropped while we
3015                          * slept.  Hence we should not just roar through
3016                          * this loop.
3017                          */
3018                         goto top;
3019                 }
3020
3021                 /*
3022                  * If an existing page is a large page, then demote
3023                  * it to ensure that no "partial" large pages are
3024                  * "created" after page_rename. An existing page
3025                  * can be a CacheFS page, and can't belong to swapfs.
3026                  */
3027                 if (hat_page_is_mapped(pp)) {
3028                         /*
3029                          * Unload translations.  Since we hold the
3030                          * exclusive lock on this page, the page
3031                          * can not be changed while we drop phm.
3032                          * This is also not a lock protocol violation,
3033                          * but rather the proper way to do things.
3034                          */
3035                         vmobject_unlock(obj);
3036                         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3037                         if (pp->p_szc != 0) {
3038                                 ASSERT(!IS_SWAPFSVP(obj->vnode));
3039                                 ASSERT(!VN_ISKAS(obj->vnode));
3040                                 page_demote_vp_pages(pp);
3041                                 ASSERT(pp->p_szc == 0);
3042                         }
3043                         vmobject_lock(obj);
3044                 } else if (pp->p_szc != 0) {
3045                         ASSERT(!IS_SWAPFSVP(obj->vnode));
3046                         ASSERT(!VN_ISKAS(obj->vnode));
3047                         vmobject_unlock(obj);
3048                         page_demote_vp_pages(pp);
3049                         ASSERT(pp->p_szc == 0);
3050                         vmobject_lock(obj);
3051                 }
3052                 page_hashout(pp, true);
3053         }
3054         /*
3055          * Hash in the page with the new identity.
3056          */
3057         if (!page_hashin(opp, obj, off, true)) {
3058                 /*
3059                  * We were holding phm while we searched for [vp, off]
3060                  * and only dropped phm if we found and locked a page.
3061                  * If we can't create this page now, then some thing
3062                  * is really broken.
3063                  */
3064                 panic("page_rename: Can't hash in page: %p", (void *)pp);
3065                 /*NOTREACHED*/
3066         }
3067
3068         ASSERT(VMOBJECT_LOCKED(obj));
3069         vmobject_unlock(obj);
3070
3071         /*
3072          * Now that we have dropped phm, lets get around to finishing up
3073          * with pp.
3074          */
3075         if (pp != NULL) {
3076                 ASSERT(!hat_page_is_mapped(pp));
3077                 /* for now large pages should not end up here */
3078                 ASSERT(pp->p_szc == 0);
3079                 /*
3080                  * Save the locks for transfer to the new page and then
3081                  * clear them so page_free doesn't think they're important.
3082                  * The page_struct_lock need not be acquired for lckcnt and
3083                  * cowcnt since the page has an "exclusive" lock.
3084                  */
3085                 olckcnt = pp->p_lckcnt;
3086                 ocowcnt = pp->p_cowcnt;
3087                 pp->p_lckcnt = pp->p_cowcnt = 0;
3088
3089                 /*
3090                  * Put the page on the "free" list after we drop
3091                  * the lock.  The less work under the lock the better.
3092                  */
3093                 VN_DISPOSE(pp, B_FREE, 0, kcred);
3094         }
3095
3096         /*
3097          * Transfer the lock count from the old page (if any).
3098          * The page_struct_lock need not be acquired for lckcnt and
3099          * cowcnt since the page has an "exclusive" lock.
3100          */
3101         opp->p_lckcnt += olckcnt;
3102         opp->p_cowcnt += ocowcnt;
3103 }
3104
3105 /*
3106  * low level routine to add page `page' to the AVL tree and vnode chains for
3107  * [vp, offset]
3108  *
3109  * Pages are normally inserted at the start of a vnode's v_object list.
3110  * If the vnode is VMODSORT and the page is modified, it goes at the end.
3111  * This can happen when a modified page is relocated for DR.
3112  *
3113  * Returns 1 on success and 0 on failure.
3114  */
3115 static int
3116 page_do_hashin(struct page *page, struct vmobject *obj, uoff_t offset)
3117 {
3118         avl_index_t where;
3119         page_t **listp;
3120
3121         ASSERT(PAGE_EXCL(page));
3122         ASSERT(obj != NULL);
3123         ASSERT(obj->vnode != NULL);
3124         ASSERT(VMOBJECT_LOCKED(obj));
3125
3126         /*
3127          * Be sure to set these up before the page is inserted into the AVL
3128          * tree.  As soon as the page is placed on the list some other
3129          * thread might get confused and wonder how this page could
3130          * possibly hash to this list.
3131          */
3132         page->p_object = obj;
3133         page->p_vnode = obj->vnode;
3134         page->p_offset = offset;
3135
3136         /*
3137          * record if this page is on a swap vnode
3138          */
3139         if ((obj->vnode->v_flag & VISSWAP) != 0)
3140                 PP_SETSWAP(page);
3141
3142         /*
3143          * Duplicates are not allowed - fail to insert if we already have a
3144          * page with this identity.
3145          */
3146         if (avl_find(&obj->tree, page, &where) != NULL) {
3147                 page->p_object = NULL;
3148                 page->p_vnode = NULL;
3149                 page->p_offset = (uoff_t)(-1);
3150                 return (0);
3151         }
3152
3153         avl_insert(&obj->tree, page, where);
3154
3155         /*
3156          * Add the page to the vnode's list of pages
3157          */
3158         if (IS_VMODSORT(obj->vnode) && hat_ismod(page))
3159                 vmobject_add_page_tail(obj, page);
3160         else
3161                 vmobject_add_page_head(obj, page);
3162
3163         return (1);
3164 }
3165
3166 /*
3167  * Add page `pp' to both the hash and vp chains for [vp, offset].
3168  *
3169  * Returns 1 on success and 0 on failure.
3170  * If `locked` is true, we do *not* attempt to lock the vnode's page mutex.
3171  */
3172 int
3173 page_hashin(struct page *pp, struct vmobject *obj, uoff_t offset, bool locked)
3174 {
3175         int rc;
3176
3177         ASSERT(pp->p_fsdata == 0 || panicstr);
3178
3179         VM_STAT_ADD(hashin_count);
3180
3181         if (!locked) {
3182                 VM_STAT_ADD(hashin_not_held);
3183                 vmobject_lock(obj);
3184         }
3185
3186         rc = page_do_hashin(pp, obj, offset);
3187
3188         if (!locked)
3189                 vmobject_unlock(obj);
3190
3191         if (rc == 0)
3192                 VM_STAT_ADD(hashin_already);
3193
3194         return (rc);
3195 }
3196
3197 /*
3198  * Remove page `page' from the AVL tree and vnode chains and remove its
3199  * vnode association.  All mutexes must be held
3200  */
3201 static void
3202 page_do_hashout(page_t *page)
3203 {
3204         page_t  **hpp;
3205         page_t  *hp;
3206         vnode_t *vnode = page->p_vnode;
3207
3208         ASSERT(vnode != NULL);
3209         ASSERT(VMOBJECT_LOCKED(&vnode->v_object));
3210
3211         avl_remove(&vnode->v_object.tree, page);
3212
3213         vmobject_remove_page(&vnode->v_object, page);
3214
3215         page_clr_all_props(page);
3216         PP_CLRSWAP(page);
3217         page->p_object = NULL;
3218         page->p_vnode = NULL;
3219         page->p_offset = (uoff_t)-1;
3220         page->p_fsdata = 0;
3221 }
3222
3223 /*
3224  * Remove page `page' from the AVL tree and vnode chains and remove vnode
3225  * association.
3226  *
3227  * When `locked` is true, we do *not* attempt to lock the vnode's page
3228  * mutex.
3229  */
3230 void
3231 page_hashout(page_t *pp, bool locked)
3232 {
3233         struct vmobject *obj;
3234         ulong_t         index;
3235         kmutex_t        *sep;
3236
3237         ASSERT(hold != NULL ? MUTEX_HELD(hold) : 1);
3238         ASSERT(pp->p_vnode != NULL);
3239         ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
3240
3241         obj = &pp->p_vnode->v_object;
3242
3243         if (!locked) {
3244                 VM_STAT_ADD(hashout_not_held);
3245                 vmobject_lock(obj);
3246         }
3247
3248         page_do_hashout(pp);
3249
3250         if (!locked)
3251                 vmobject_unlock(obj);
3252
3253         /*
3254          * Wake up processes waiting for this page.  The page's
3255          * identity has been changed, and is probably not the
3256          * desired page any longer.
3257          */
3258         sep = page_se_mutex(pp);
3259         mutex_enter(sep);
3260         pp->p_selock &= ~SE_EWANTED;
3261         if (CV_HAS_WAITERS(&pp->p_cv))
3262                 cv_broadcast(&pp->p_cv);
3263         mutex_exit(sep);
3264 }
3265
3266 /*
3267  * Add the page to the front of a linked list of pages
3268  * using the p_next & p_prev pointers for the list.
3269  * The caller is responsible for protecting the list pointers.
3270  */
3271 void
3272 page_add(page_t **ppp, page_t *pp)
3273 {
3274         ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3275
3276         page_add_common(ppp, pp);
3277 }
3278
3279
3280
3281 /*
3282  *  Common code for page_add() and mach_page_add()
3283  */
3284 void
3285 page_add_common(page_t **ppp, page_t *pp)
3286 {
3287         if (*ppp == NULL) {
3288                 pp->p_next = pp->p_prev = pp;
3289         } else {
3290                 pp->p_next = *ppp;
3291                 pp->p_prev = (*ppp)->p_prev;
3292                 (*ppp)->p_prev = pp;
3293                 pp->p_prev->p_next = pp;
3294         }
3295         *ppp = pp;
3296 }
3297
3298
3299 /*
3300  * Remove this page from a linked list of pages
3301  * using the p_next & p_prev pointers for the list.
3302  *
3303  * The caller is responsible for protecting the list pointers.
3304  */
3305 void
3306 page_sub(page_t **ppp, page_t *pp)
3307 {
3308         ASSERT((PP_ISFREE(pp)) ? 1 :
3309             (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3310
3311         if (*ppp == NULL || pp == NULL) {
3312                 panic("page_sub: bad arg(s): pp %p, *ppp %p",
3313                     (void *)pp, (void *)(*ppp));
3314                 /*NOTREACHED*/
3315         }
3316
3317         page_sub_common(ppp, pp);
3318 }
3319
3320
3321 /*
3322  *  Common code for page_sub() and mach_page_sub()
3323  */
3324 void
3325 page_sub_common(page_t **ppp, page_t *pp)
3326 {
3327         if (*ppp == pp)
3328                 *ppp = pp->p_next;              /* go to next page */
3329
3330         if (*ppp == pp)
3331                 *ppp = NULL;                    /* page list is gone */
3332         else {
3333                 pp->p_prev->p_next = pp->p_next;
3334                 pp->p_next->p_prev = pp->p_prev;
3335         }
3336         pp->p_prev = pp->p_next = pp;           /* make pp a list of one */
3337 }
3338
3339
3340 /*
3341  * Break page list cppp into two lists with npages in the first list.
3342  * The tail is returned in nppp.
3343  */
3344 void
3345 page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages)
3346 {
3347         page_t *s1pp = *oppp;
3348         page_t *s2pp;
3349         page_t *e1pp, *e2pp;
3350         long n = 0;
3351
3352         if (s1pp == NULL) {
3353                 *nppp = NULL;
3354                 return;
3355         }
3356         if (npages == 0) {
3357                 *nppp = s1pp;
3358                 *oppp = NULL;
3359                 return;
3360         }
3361         for (n = 0, s2pp = *oppp; n < npages; n++) {
3362                 s2pp = s2pp->p_next;
3363         }
3364         /* Fix head and tail of new lists */
3365         e1pp = s2pp->p_prev;
3366         e2pp = s1pp->p_prev;
3367         s1pp->p_prev = e1pp;
3368         e1pp->p_next = s1pp;
3369         s2pp->p_prev = e2pp;
3370         e2pp->p_next = s2pp;
3371
3372         /* second list empty */
3373         if (s2pp == s1pp) {
3374                 *oppp = s1pp;
3375                 *nppp = NULL;
3376         } else {
3377                 *oppp = s1pp;
3378                 *nppp = s2pp;
3379         }
3380 }
3381
3382 /*
3383  * Concatenate page list nppp onto the end of list ppp.
3384  */
3385 void
3386 page_list_concat(page_t **ppp, page_t **nppp)
3387 {
3388         page_t *s1pp, *s2pp, *e1pp, *e2pp;
3389
3390         if (*nppp == NULL) {
3391                 return;
3392         }
3393         if (*ppp == NULL) {
3394                 *ppp = *nppp;
3395                 return;
3396         }
3397         s1pp = *ppp;
3398         e1pp =  s1pp->p_prev;
3399         s2pp = *nppp;
3400         e2pp = s2pp->p_prev;
3401         s1pp->p_prev = e2pp;
3402         e2pp->p_next = s1pp;
3403         e1pp->p_next = s2pp;
3404         s2pp->p_prev = e1pp;
3405 }
3406
3407 /*
3408  * return the next page in the page list
3409  */
3410 page_t *
3411 page_list_next(page_t *pp)
3412 {
3413         return (pp->p_next);
3414 }
3415
3416
3417 /*
3418  * Add the page to the front of the linked list of pages
3419  * using p_list.vnode for the list.
3420  *
3421  * The caller is responsible for protecting the lists.
3422  */
3423 void
3424 page_vpadd(page_t **ppp, page_t *pp)
3425 {
3426         panic("%s should not be used", __func__);
3427 }
3428
3429 void
3430 page_lpadd(page_t **ppp, page_t *pp)
3431 {
3432         if (*ppp == NULL) {
3433                 pp->p_list.largepg.next = pp->p_list.largepg.prev = pp;
3434         } else {
3435                 pp->p_list.largepg.next = *ppp;
3436                 pp->p_list.largepg.prev = (*ppp)->p_list.largepg.prev;
3437                 (*ppp)->p_list.largepg.prev = pp;
3438                 pp->p_list.largepg.prev->p_list.largepg.next = pp;
3439         }
3440         *ppp = pp;
3441 }
3442
3443 /*
3444  * Remove this page from the linked list of pages
3445  * using p_list.vnode for the list.
3446  *
3447  * The caller is responsible for protecting the lists.
3448  */
3449 void
3450 page_vpsub(page_t **ppp, page_t *pp)
3451 {
3452         panic("%s should not be used", __func__);
3453 }
3454
3455 void
3456 page_lpsub(page_t **ppp, page_t *pp)
3457 {
3458         if (*ppp == NULL || pp == NULL) {
3459                 panic("page_vpsub: bad arg(s): pp %p, *ppp %p",
3460                     (void *)pp, (void *)(*ppp));
3461                 /*NOTREACHED*/
3462         }
3463
3464         if (*ppp == pp)
3465                 *ppp = pp->p_list.largepg.next;         /* go to next page */
3466
3467         if (*ppp == pp)
3468                 *ppp = NULL;                    /* page list is gone */
3469         else {
3470                 pp->p_list.largepg.prev->p_list.largepg.next = pp->p_list.largepg.next;
3471                 pp->p_list.largepg.next->p_list.largepg.prev = pp->p_list.largepg.prev;
3472         }
3473         pp->p_list.largepg.prev = pp->p_list.largepg.next = pp; /* make pp a list of one */
3474 }
3475
3476 /*
3477  * Lock a physical page into memory "long term".  Used to support "lock
3478  * in memory" functions.  Accepts the page to be locked, and a cow variable
3479  * to indicate whether a the lock will travel to the new page during
3480  * a potential copy-on-write.
3481  */
3482 int
3483 page_pp_lock(
3484         page_t *pp,                     /* page to be locked */
3485         int cow,                        /* cow lock */
3486         int kernel)                     /* must succeed -- ignore checking */
3487 {
3488         int r = 0;                      /* result -- assume failure */
3489
3490         ASSERT(PAGE_LOCKED(pp));
3491
3492         page_struct_lock(pp);
3493         /*
3494          * Acquire the "freemem_lock" for availrmem.
3495          */
3496         if (cow) {
3497                 mutex_enter(&freemem_lock);
3498                 if ((availrmem > pages_pp_maximum) &&
3499                     (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
3500                         availrmem--;
3501                         pages_locked++;
3502                         mutex_exit(&freemem_lock);
3503                         r = 1;
3504                         if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
3505                                 cmn_err(CE_WARN,
3506                                     "COW lock limit reached on pfn 0x%lx",
3507                                     page_pptonum(pp));
3508                         }
3509                 } else
3510                         mutex_exit(&freemem_lock);
3511         } else {
3512                 if (pp->p_lckcnt) {
3513                         if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
3514                                 r = 1;
3515                                 if (++pp->p_lckcnt ==
3516                                     (ushort_t)PAGE_LOCK_MAXIMUM) {
3517                                         cmn_err(CE_WARN, "Page lock limit "
3518                                             "reached on pfn 0x%lx",
3519                                             page_pptonum(pp));
3520                                 }
3521                         }
3522                 } else {
3523                         if (kernel) {
3524                                 /* availrmem accounting done by caller */
3525                                 ++pp->p_lckcnt;
3526                                 r = 1;
3527                         } else {
3528                                 mutex_enter(&freemem_lock);
3529                                 if (availrmem > pages_pp_maximum) {
3530                                         availrmem--;
3531                                         pages_locked++;
3532                                         ++pp->p_lckcnt;
3533                                         r = 1;
3534                                 }
3535                                 mutex_exit(&freemem_lock);
3536                         }
3537                 }
3538         }
3539         page_struct_unlock(pp);
3540         return (r);
3541 }
3542
3543 /*
3544  * Decommit a lock on a physical page frame.  Account for cow locks if
3545  * appropriate.
3546  */
3547 void
3548 page_pp_unlock(
3549         page_t *pp,                     /* page to be unlocked */
3550         int cow,                        /* expect cow lock */
3551         int kernel)                     /* this was a kernel lock */
3552 {
3553         ASSERT(PAGE_LOCKED(pp));
3554
3555         page_struct_lock(pp);
3556         /*
3557          * Acquire the "freemem_lock" for availrmem.
3558          * If cowcnt or lcknt is already 0 do nothing; i.e., we
3559          * could be called to unlock even if nothing is locked. This could
3560          * happen if locked file pages were truncated (removing the lock)
3561          * and the file was grown again and new pages faulted in; the new
3562          * pages are unlocked but the segment still thinks they're locked.
3563          */
3564         if (cow) {
3565                 if (pp->p_cowcnt) {
3566                         mutex_enter(&freemem_lock);
3567                         pp->p_cowcnt--;
3568                         availrmem++;
3569                         pages_locked--;
3570                         mutex_exit(&freemem_lock);
3571                 }
3572         } else {
3573                 if (pp->p_lckcnt && --pp->p_lckcnt == 0) {
3574                         if (!kernel) {
3575                                 mutex_enter(&freemem_lock);
3576                                 availrmem++;
3577                                 pages_locked--;
3578                                 mutex_exit(&freemem_lock);
3579                         }
3580                 }
3581         }
3582         page_struct_unlock(pp);
3583 }
3584
3585 /*
3586  * This routine reserves availrmem for npages;
3587  *      flags: KM_NOSLEEP or KM_SLEEP
3588  *      returns 1 on success or 0 on failure
3589  */
3590 int
3591 page_resv(pgcnt_t npages, uint_t flags)
3592 {
3593         mutex_enter(&freemem_lock);
3594         while (availrmem < tune.t_minarmem + npages) {
3595                 if (flags & KM_NOSLEEP) {
3596                         mutex_exit(&freemem_lock);
3597                         return (0);
3598                 }
3599                 mutex_exit(&freemem_lock);
3600                 page_needfree(npages);
3601                 kmem_reap();
3602                 ddi_msleep(250);
3603                 page_needfree(-(spgcnt_t)npages);
3604                 mutex_enter(&freemem_lock);
3605         }
3606         availrmem -= npages;
3607         mutex_exit(&freemem_lock);
3608         return (1);
3609 }
3610
3611 /*
3612  * This routine unreserves availrmem for npages;
3613  */
3614 void
3615 page_unresv(pgcnt_t npages)
3616 {
3617         mutex_enter(&freemem_lock);
3618         availrmem += npages;
3619         mutex_exit(&freemem_lock);
3620 }
3621
3622 /*
3623  * See Statement at the beginning of segvn_lockop() regarding
3624  * the way we handle cowcnts and lckcnts.
3625  *
3626  * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage
3627  * that breaks COW has PROT_WRITE.
3628  *
3629  * Note that, we may also break COW in case we are softlocking
3630  * on read access during physio;
3631  * in this softlock case, the vpage may not have PROT_WRITE.
3632  * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp'
3633  * if the vpage doesn't have PROT_WRITE.
3634  *
3635  * This routine is never called if we are stealing a page
3636  * in anon_private.
3637  *
3638  * The caller subtracted from availrmem for read only mapping.
3639  * if lckcnt is 1 increment availrmem.
3640  */
3641 void
3642 page_pp_useclaim(
3643         page_t *opp,            /* original page frame losing lock */
3644         page_t *npp,            /* new page frame gaining lock */
3645         uint_t  write_perm)     /* set if vpage has PROT_WRITE */
3646 {
3647         int payback = 0;
3648         int nidx, oidx;
3649
3650         ASSERT(PAGE_LOCKED(opp));
3651         ASSERT(PAGE_LOCKED(npp));
3652
3653         /*
3654          * Since we have two pages we probably have two locks.  We need to take
3655          * them in a defined order to avoid deadlocks.  It's also possible they
3656          * both hash to the same lock in which case this is a non-issue.
3657          */
3658         nidx = PAGE_LLOCK_HASH(PP_PAGEROOT(npp));
3659         oidx = PAGE_LLOCK_HASH(PP_PAGEROOT(opp));
3660         if (nidx < oidx) {
3661                 page_struct_lock(npp);
3662                 page_struct_lock(opp);
3663         } else if (oidx < nidx) {
3664                 page_struct_lock(opp);
3665                 page_struct_lock(npp);
3666         } else {        /* The pages hash to the same lock */
3667                 page_struct_lock(npp);
3668         }
3669
3670         ASSERT(npp->p_cowcnt == 0);
3671         ASSERT(npp->p_lckcnt == 0);
3672
3673         /* Don't use claim if nothing is locked (see page_pp_unlock above) */
3674         if ((write_perm && opp->p_cowcnt != 0) ||
3675             (!write_perm && opp->p_lckcnt != 0)) {
3676
3677                 if (write_perm) {
3678                         npp->p_cowcnt++;
3679                         ASSERT(opp->p_cowcnt != 0);
3680                         opp->p_cowcnt--;
3681                 } else {
3682
3683                         ASSERT(opp->p_lckcnt != 0);
3684
3685                         /*
3686                          * We didn't need availrmem decremented if p_lckcnt on
3687                          * original page is 1. Here, we are unlocking
3688                          * read-only copy belonging to original page and
3689                          * are locking a copy belonging to new page.
3690                          */
3691                         if (opp->p_lckcnt == 1)
3692                                 payback = 1;
3693
3694                         npp->p_lckcnt++;
3695                         opp->p_lckcnt--;
3696                 }
3697         }
3698         if (payback) {
3699                 mutex_enter(&freemem_lock);
3700                 availrmem++;
3701                 pages_useclaim--;
3702                 mutex_exit(&freemem_lock);
3703         }
3704
3705         if (nidx < oidx) {
3706                 page_struct_unlock(opp);
3707                 page_struct_unlock(npp);
3708         } else if (oidx < nidx) {
3709                 page_struct_unlock(npp);
3710                 page_struct_unlock(opp);
3711         } else {        /* The pages hash to the same lock */
3712                 page_struct_unlock(npp);
3713         }
3714 }
3715
3716 /*
3717  * Simple claim adjust functions -- used to support changes in
3718  * claims due to changes in access permissions.  Used by segvn_setprot().
3719  */
3720 int
3721 page_addclaim(page_t *pp)
3722 {
3723         int r = 0;                      /* result */
3724
3725         ASSERT(PAGE_LOCKED(pp));
3726
3727         page_struct_lock(pp);
3728         ASSERT(pp->p_lckcnt != 0);
3729
3730         if (pp->p_lckcnt == 1) {
3731                 if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
3732                         --pp->p_lckcnt;
3733                         r = 1;
3734                         if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
3735                                 cmn_err(CE_WARN,
3736                                     "COW lock limit reached on pfn 0x%lx",
3737                                     page_pptonum(pp));
3738                         }
3739                 }
3740         } else {
3741                 mutex_enter(&freemem_lock);
3742                 if ((availrmem > pages_pp_maximum) &&
3743                     (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
3744                         --availrmem;
3745                         ++pages_claimed;
3746                         mutex_exit(&freemem_lock);
3747                         --pp->p_lckcnt;
3748                         r = 1;
3749                         if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
3750                                 cmn_err(CE_WARN,
3751                                     "COW lock limit reached on pfn 0x%lx",
3752                                     page_pptonum(pp));
3753                         }
3754                 } else
3755                         mutex_exit(&freemem_lock);
3756         }
3757         page_struct_unlock(pp);
3758         return (r);
3759 }
3760
3761 int
3762 page_subclaim(page_t *pp)
3763 {
3764         int r = 0;
3765
3766         ASSERT(PAGE_LOCKED(pp));
3767
3768         page_struct_lock(pp);
3769         ASSERT(pp->p_cowcnt != 0);
3770
3771         if (pp->p_lckcnt) {
3772                 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
3773                         r = 1;
3774                         /*
3775                          * for availrmem
3776                          */
3777                         mutex_enter(&freemem_lock);
3778                         availrmem++;
3779                         pages_claimed--;
3780                         mutex_exit(&freemem_lock);
3781
3782                         pp->p_cowcnt--;
3783
3784                         if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
3785                                 cmn_err(CE_WARN,
3786                                     "Page lock limit reached on pfn 0x%lx",
3787                                     page_pptonum(pp));
3788                         }
3789                 }
3790         } else {
3791                 r = 1;
3792                 pp->p_cowcnt--;
3793                 pp->p_lckcnt++;
3794         }
3795         page_struct_unlock(pp);
3796         return (r);
3797 }
3798
3799 /*
3800  * Variant of page_addclaim(), where ppa[] contains the pages of a single large
3801  * page.
3802  */
3803 int
3804 page_addclaim_pages(page_t  **ppa)
3805 {
3806         pgcnt_t lckpgs = 0, pg_idx;
3807
3808         VM_STAT_ADD(pagecnt.pc_addclaim_pages);
3809
3810         /*
3811          * Only need to take the page struct lock on the large page root.
3812          */
3813         page_struct_lock(ppa[0]);
3814         for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
3815
3816                 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
3817                 ASSERT(ppa[pg_idx]->p_lckcnt != 0);
3818                 if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
3819                         page_struct_unlock(ppa[0]);
3820                         return (0);
3821                 }
3822                 if (ppa[pg_idx]->p_lckcnt > 1)
3823                         lckpgs++;
3824         }
3825
3826         if (lckpgs != 0) {
3827                 mutex_enter(&freemem_lock);
3828                 if (availrmem >= pages_pp_maximum + lckpgs) {
3829                         availrmem -= lckpgs;
3830                         pages_claimed += lckpgs;
3831                 } else {
3832                         mutex_exit(&freemem_lock);
3833                         page_struct_unlock(ppa[0]);
3834                         return (0);
3835                 }
3836                 mutex_exit(&freemem_lock);
3837         }
3838
3839         for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
3840                 ppa[pg_idx]->p_lckcnt--;
3841                 ppa[pg_idx]->p_cowcnt++;
3842         }
3843         page_struct_unlock(ppa[0]);
3844         return (1);
3845 }
3846
3847 /*
3848  * Variant of page_subclaim(), where ppa[] contains the pages of a single large
3849  * page.
3850  */
3851 int
3852 page_subclaim_pages(page_t  **ppa)
3853 {
3854         pgcnt_t ulckpgs = 0, pg_idx;
3855
3856         VM_STAT_ADD(pagecnt.pc_subclaim_pages);
3857
3858         /*
3859          * Only need to take the page struct lock on the large page root.
3860          */
3861         page_struct_lock(ppa[0]);
3862         for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
3863
3864                 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
3865                 ASSERT(ppa[pg_idx]->p_cowcnt != 0);
3866                 if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
3867                         page_struct_unlock(ppa[0]);
3868                         return (0);
3869                 }
3870                 if (ppa[pg_idx]->p_lckcnt != 0)
3871                         ulckpgs++;
3872         }
3873
3874         if (ulckpgs != 0) {
3875                 mutex_enter(&freemem_lock);
3876                 availrmem += ulckpgs;
3877                 pages_claimed -= ulckpgs;
3878                 mutex_exit(&freemem_lock);
3879         }
3880
3881         for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
3882                 ppa[pg_idx]->p_cowcnt--;
3883                 ppa[pg_idx]->p_lckcnt++;
3884
3885         }
3886         page_struct_unlock(ppa[0]);
3887         return (1);
3888 }
3889
3890 page_t *
3891 page_numtopp(pfn_t pfnum, se_t se)
3892 {
3893         page_t *pp;
3894
3895 retry:
3896         pp = page_numtopp_nolock(pfnum);
3897         if (pp == NULL) {
3898                 return (NULL);
3899         }
3900
3901         /*
3902          * Acquire the appropriate lock on the page.
3903          */
3904         while (!page_lock(pp, se, NULL, P_RECLAIM)) {
3905                 if (page_pptonum(pp) != pfnum)
3906                         goto retry;
3907                 continue;
3908         }
3909
3910         if (page_pptonum(pp) != pfnum) {
3911                 page_unlock(pp);
3912                 goto retry;
3913         }
3914
3915         return (pp);
3916 }
3917
3918 page_t *
3919 page_numtopp_noreclaim(pfn_t pfnum, se_t se)
3920 {
3921         page_t *pp;
3922
3923 retry:
3924         pp = page_numtopp_nolock(pfnum);
3925         if (pp == NULL) {
3926                 return (NULL);
3927         }
3928
3929         /*
3930          * Acquire the appropriate lock on the page.
3931          */
3932         while (!page_lock(pp, se, NULL, P_NO_RECLAIM)) {
3933                 if (page_pptonum(pp) != pfnum)
3934                         goto retry;
3935                 continue;
3936         }
3937
3938         if (page_pptonum(pp) != pfnum) {
3939                 page_unlock(pp);
3940                 goto retry;
3941         }
3942
3943         return (pp);
3944 }
3945
3946 /*
3947  * This routine is like page_numtopp, but will only return page structs
3948  * for pages which are ok for loading into hardware using the page struct.
3949  */
3950 page_t *
3951 page_numtopp_nowait(pfn_t pfnum, se_t se)
3952 {
3953         page_t *pp;
3954
3955 retry:
3956         pp = page_numtopp_nolock(pfnum);
3957         if (pp == NULL) {
3958                 return (NULL);
3959         }
3960
3961         /*
3962          * Try to acquire the appropriate lock on the page.
3963          */
3964         if (PP_ISFREE(pp))
3965                 pp = NULL;
3966         else {
3967                 if (!page_trylock(pp, se))
3968                         pp = NULL;
3969                 else {
3970                         if (page_pptonum(pp) != pfnum) {
3971                                 page_unlock(pp);
3972                                 goto retry;
3973                         }
3974                         if (PP_ISFREE(pp)) {
3975                                 page_unlock(pp);
3976                                 pp = NULL;
3977                         }
3978                 }
3979         }
3980         return (pp);
3981 }
3982
3983 /*
3984  * Returns a count of dirty pages that are in the process
3985  * of being written out.  If 'cleanit' is set, try to push the page.
3986  */
3987 pgcnt_t
3988 page_busy(int cleanit)
3989 {
3990         page_t *page0 = page_first();
3991         page_t *pp = page0;
3992         pgcnt_t nppbusy = 0;
3993         uoff_t off;
3994
3995         do {
3996                 vnode_t *vp = pp->p_vnode;
3997                 /*
3998                  * A page is a candidate for syncing if it is:
3999                  *
4000                  * (a)  On neither the freelist nor the cachelist
4001                  * (b)  Hashed onto a vnode
4002                  * (c)  Not a kernel page
4003                  * (d)  Dirty
4004                  * (e)  Not part of a swapfile
4005                  * (f)  a page which belongs to a real vnode; eg has a non-null
4006                  *      v_vfsp pointer.
4007                  * (g)  Backed by a filesystem which doesn't have a
4008                  *      stubbed-out sync operation
4009                  */
4010                 if (!PP_ISFREE(pp) && vp != NULL && !VN_ISKAS(vp) &&
4011                     hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL &&
4012                     vfs_can_sync(vp->v_vfsp)) {
4013                         nppbusy++;
4014
4015                         if (!cleanit)
4016                                 continue;
4017                         if (!page_trylock(pp, SE_EXCL))
4018                                 continue;
4019
4020                         if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) ||
4021                             pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
4022                             !(hat_pagesync(pp,
4023                             HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) {
4024                                 page_unlock(pp);
4025                                 continue;
4026                         }
4027                         off = pp->p_offset;
4028                         VN_HOLD(vp);
4029                         page_unlock(pp);
4030                         (void) fop_putpage(vp, off, PAGESIZE,
4031                             B_ASYNC | B_FREE, kcred, NULL);
4032                         VN_RELE(vp);
4033                 }
4034         } while ((pp = page_next(pp)) != page0);
4035
4036         return (nppbusy);
4037 }
4038
4039 void page_invalidate_pages(void);
4040
4041 /*
4042  * callback handler to vm sub-system
4043  *
4044  * callers make sure no recursive entries to this func.
4045  */
4046 /*ARGSUSED*/
4047 boolean_t
4048 callb_vm_cpr(void *arg, int code)
4049 {
4050         if (code == CB_CODE_CPR_CHKPT)
4051                 page_invalidate_pages();
4052         return (B_TRUE);
4053 }
4054
4055 /*
4056  * Invalidate all pages of the system.
4057  * It shouldn't be called until all user page activities are all stopped.
4058  */
4059 void
4060 page_invalidate_pages()
4061 {
4062         page_t *pp;
4063         page_t *page0;
4064         pgcnt_t nbusypages;
4065         int retry = 0;
4066         const int MAXRETRIES = 4;
4067 top:
4068         /*
4069          * Flush dirty pages and destroy the clean ones.
4070          */
4071         nbusypages = 0;
4072
4073         pp = page0 = page_first();
4074         do {
4075                 struct vnode    *vp;
4076                 uoff_t  offset;
4077                 int             mod;
4078
4079                 /*
4080                  * skip the page if it has no vnode or the page associated
4081                  * with the kernel vnode or prom allocated kernel mem.
4082                  */
4083                 if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp))
4084                         continue;
4085
4086                 /*
4087                  * skip the page which is already free invalidated.
4088                  */
4089                 if (PP_ISFREE(pp) && PP_ISAGED(pp))
4090                         continue;
4091
4092                 /*
4093                  * skip pages that are already locked or can't be "exclusively"
4094                  * locked or are already free.  After we lock the page, check
4095                  * the free and age bits again to be sure it's not destroyed
4096                  * yet.
4097                  * To achieve max. parallelization, we use page_trylock instead
4098                  * of page_lock so that we don't get block on individual pages
4099                  * while we have thousands of other pages to process.
4100                  */
4101                 if (!page_trylock(pp, SE_EXCL)) {
4102                         nbusypages++;
4103                         continue;
4104                 } else if (PP_ISFREE(pp)) {
4105                         if (!PP_ISAGED(pp)) {
4106                                 page_destroy_free(pp);
4107                         } else {
4108                                 page_unlock(pp);
4109                         }
4110                         continue;
4111                 }
4112                 /*
4113                  * Is this page involved in some I/O? shared?
4114                  *
4115                  * The page_struct_lock need not be acquired to
4116                  * examine these fields since the page has an
4117                  * "exclusive" lock.
4118                  */
4119                 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
4120                         page_unlock(pp);
4121                         continue;
4122                 }
4123
4124                 if (vp->v_type == VCHR) {
4125                         panic("vp->v_type == VCHR");
4126                         /*NOTREACHED*/
4127                 }
4128
4129                 if (!page_try_demote_pages(pp)) {
4130                         page_unlock(pp);
4131                         continue;
4132                 }
4133
4134                 /*
4135                  * Check the modified bit. Leave the bits alone in hardware
4136                  * (they will be modified if we do the putpage).
4137                  */
4138                 mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD)
4139                     & P_MOD);
4140                 if (mod) {
4141                         offset = pp->p_offset;
4142                         /*
4143                          * Hold the vnode before releasing the page lock
4144                          * to prevent it from being freed and re-used by
4145                          * some other thread.
4146                          */
4147                         VN_HOLD(vp);
4148                         page_unlock(pp);
4149                         /*
4150                          * No error return is checked here. Callers such as
4151                          * cpr deals with the dirty pages at the dump time
4152                          * if this putpage fails.
4153                          */
4154                         (void) fop_putpage(vp, offset, PAGESIZE, B_INVAL,
4155                             kcred, NULL);
4156                         VN_RELE(vp);
4157                 } else {
4158                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
4159                 }
4160         } while ((pp = page_next(pp)) != page0);
4161         if (nbusypages && retry++ < MAXRETRIES) {
4162                 delay(1);
4163                 goto top;
4164         }
4165 }
4166
4167 /*
4168  * Replace the page "old" with the page "new" on the page hash and vnode lists
4169  *
4170  * the replacement must be done in place, ie the equivalent sequence:
4171  *
4172  *      vp = old->p_vnode;
4173  *      off = old->p_offset;
4174  *      page_do_hashout(old)
4175  *      page_do_hashin(new, obj, off)
4176  *
4177  * doesn't work, since
4178  *  1) if old is the only page on the vnode, the v_object list has a window
4179  *     where it looks empty. This will break file system assumptions.
4180  * and
4181  *  2) pvn_vplist_dirty() can't deal with pages moving on the v_object list.
4182  */
4183 static void
4184 page_do_relocate_hash(page_t *new, page_t *old)
4185 {
4186         page_t  **hash_list;
4187         vnode_t *vp = old->p_vnode;
4188         kmutex_t *sep;
4189
4190         ASSERT(PAGE_EXCL(old));
4191         ASSERT(PAGE_EXCL(new));
4192         ASSERT(vp != NULL);
4193         ASSERT(VMOBJECT_LOCKED(&vp->v_object));
4194
4195         /*
4196          * update new and replace old with new on the page hash list
4197          */
4198         new->p_object = old->p_object;
4199         new->p_vnode = old->p_vnode;
4200         new->p_offset = old->p_offset;
4201
4202         avl_remove(&vp->v_object.tree, old);
4203         avl_add(&vp->v_object.tree, new);
4204
4205         if ((new->p_vnode->v_flag & VISSWAP) != 0)
4206                 PP_SETSWAP(new);
4207
4208         /*
4209          * replace old with new on the vnode's page list
4210          */
4211         list_insert_before(&vp->v_object.list, old, new);
4212         list_remove(&vp->v_object.list, old);
4213
4214         /*
4215          * clear out the old page
4216          */
4217         old->p_object = NULL;
4218         old->p_vnode = NULL;
4219         PP_CLRSWAP(old);
4220         old->p_offset = (uoff_t)-1;
4221         page_clr_all_props(old);
4222
4223         /*
4224          * Wake up processes waiting for this page.  The page's
4225          * identity has been changed, and is probably not the
4226          * desired page any longer.
4227          */
4228         sep = page_se_mutex(old);
4229         mutex_enter(sep);
4230         old->p_selock &= ~SE_EWANTED;
4231         if (CV_HAS_WAITERS(&old->p_cv))
4232                 cv_broadcast(&old->p_cv);
4233         mutex_exit(sep);
4234 }
4235
4236 /*
4237  * This function moves the identity of page "pp_old" to page "pp_new".
4238  * Both pages must be locked on entry.  "pp_new" is free, has no identity,
4239  * and need not be hashed out from anywhere.
4240  */
4241 void
4242 page_relocate_hash(page_t *pp_new, page_t *pp_old)
4243 {
4244         vnode_t *vp = pp_old->p_vnode;
4245         uoff_t off = pp_old->p_offset;
4246
4247         /*
4248          * Rehash two pages
4249          */
4250         ASSERT(PAGE_EXCL(pp_old));
4251         ASSERT(PAGE_EXCL(pp_new));
4252         ASSERT(vp != NULL);
4253         VERIFY(pp_new->p_object == NULL);
4254         ASSERT(pp_new->p_vnode == NULL);
4255
4256         vmobject_lock(&vp->v_object);
4257
4258         page_do_relocate_hash(pp_new, pp_old);
4259         pp_new->p_fsdata = pp_old->p_fsdata;
4260         pp_old->p_fsdata = 0;
4261
4262         vmobject_unlock(&vp->v_object);
4263
4264         /*
4265          * The page_struct_lock need not be acquired for lckcnt and
4266          * cowcnt since the page has an "exclusive" lock.
4267          */
4268         ASSERT(pp_new->p_lckcnt == 0);
4269         ASSERT(pp_new->p_cowcnt == 0);
4270         pp_new->p_lckcnt = pp_old->p_lckcnt;
4271         pp_new->p_cowcnt = pp_old->p_cowcnt;
4272         pp_old->p_lckcnt = pp_old->p_cowcnt = 0;
4273 }
4274
4275 /*
4276  * Helper routine used to lock all remaining members of a
4277  * large page. The caller is responsible for passing in a locked
4278  * pp. If pp is a large page, then it succeeds in locking all the
4279  * remaining constituent pages or it returns with only the
4280  * original page locked.
4281  *
4282  * Returns 1 on success, 0 on failure.
4283  *
4284  * If success is returned this routine guarantees p_szc for all constituent
4285  * pages of a large page pp belongs to can't change. To achieve this we
4286  * recheck szc of pp after locking all constituent pages and retry if szc
4287  * changed (it could only decrease). Since hat_page_demote() needs an EXCL
4288  * lock on one of constituent pages it can't be running after all constituent
4289  * pages are locked.  hat_page_demote() with a lock on a constituent page
4290  * outside of this large page (i.e. pp belonged to a larger large page) is
4291  * already done with all constituent pages of pp since the root's p_szc is
4292  * changed last. Therefore no need to synchronize with hat_page_demote() that
4293  * locked a constituent page outside of pp's current large page.
4294  */
4295 #ifdef DEBUG
4296 uint32_t gpg_trylock_mtbf = 0;
4297 #endif
4298
4299 int
4300 group_page_trylock(page_t *pp, se_t se)
4301 {
4302         page_t  *tpp;
4303         pgcnt_t npgs, i, j;
4304         uint_t pszc = pp->p_szc;
4305
4306 #ifdef DEBUG
4307         if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) {
4308                 return (0);
4309         }
4310 #endif
4311
4312         if (pp != PP_GROUPLEADER(pp, pszc)) {
4313                 return (0);
4314         }
4315
4316 retry:
4317         ASSERT(PAGE_LOCKED_SE(pp, se));
4318         ASSERT(!PP_ISFREE(pp));
4319         if (pszc == 0) {
4320                 return (1);
4321         }
4322         npgs = page_get_pagecnt(pszc);
4323         tpp = pp + 1;
4324         for (i = 1; i < npgs; i++, tpp++) {
4325                 if (!page_trylock(tpp, se)) {
4326                         tpp = pp + 1;
4327                         for (j = 1; j < i; j++, tpp++) {
4328                                 page_unlock(tpp);
4329                         }
4330                         return (0);
4331                 }
4332         }
4333         if (pp->p_szc != pszc) {
4334                 ASSERT(pp->p_szc < pszc);
4335                 ASSERT(pp->p_vnode != NULL && !PP_ISKAS(pp) &&
4336                     !IS_SWAPFSVP(pp->p_vnode));
4337                 tpp = pp + 1;
4338                 for (i = 1; i < npgs; i++, tpp++) {
4339                         page_unlock(tpp);
4340                 }
4341                 pszc = pp->p_szc;
4342                 goto retry;
4343         }
4344         return (1);
4345 }
4346
4347 void
4348 group_page_unlock(page_t *pp)
4349 {
4350         page_t *tpp;
4351         pgcnt_t npgs, i;
4352
4353         ASSERT(PAGE_LOCKED(pp));
4354         ASSERT(!PP_ISFREE(pp));
4355         ASSERT(pp == PP_PAGEROOT(pp));
4356         npgs = page_get_pagecnt(pp->p_szc);
4357         for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) {
4358                 page_unlock(tpp);
4359         }
4360 }
4361
4362 /*
4363  * returns
4364  * 0            : on success and *nrelocp is number of relocated PAGESIZE pages
4365  * ERANGE       : this is not a base page
4366  * EBUSY        : failure to get locks on the page/pages
4367  * ENOMEM       : failure to obtain replacement pages
4368  * EAGAIN       : OBP has not yet completed its boot-time handoff to the kernel
4369  * EIO          : An error occurred while trying to copy the page data
4370  *
4371  * Return with all constituent members of target and replacement
4372  * SE_EXCL locked. It is the callers responsibility to drop the
4373  * locks.
4374  */
4375 int
4376 do_page_relocate(
4377         page_t **target,
4378         page_t **replacement,
4379         int grouplock,
4380         spgcnt_t *nrelocp,
4381         lgrp_t *lgrp)
4382 {
4383         page_t *first_repl;
4384         page_t *repl;
4385         page_t *targ;
4386         page_t *pl = NULL;
4387         uint_t ppattr;
4388         pfn_t   pfn, repl_pfn;
4389         uint_t  szc;
4390         spgcnt_t npgs, i;
4391         int repl_contig = 0;
4392         uint_t flags = 0;
4393         spgcnt_t dofree = 0;
4394
4395         *nrelocp = 0;
4396
4397
4398         /*
4399          * If this is not a base page,
4400          * just return with 0x0 pages relocated.
4401          */
4402         targ = *target;
4403         ASSERT(PAGE_EXCL(targ));
4404         ASSERT(!PP_ISFREE(targ));
4405         szc = targ->p_szc;
4406         ASSERT(szc < mmu_page_sizes);
4407         VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4408         pfn = targ->p_pagenum;
4409         if (pfn != PFN_BASE(pfn, szc)) {
4410                 VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]);
4411                 return (ERANGE);
4412         }
4413
4414         if ((repl = *replacement) != NULL && repl->p_szc >= szc) {
4415                 repl_pfn = repl->p_pagenum;
4416                 if (repl_pfn != PFN_BASE(repl_pfn, szc)) {
4417                         VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]);
4418                         return (ERANGE);
4419                 }
4420                 repl_contig = 1;
4421         }
4422
4423         /*
4424          * We must lock all members of this large page or we cannot
4425          * relocate any part of it.
4426          */
4427         if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) {
4428                 VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]);
4429                 return (EBUSY);
4430         }
4431
4432         /*
4433          * reread szc it could have been decreased before
4434          * group_page_trylock() was done.
4435          */
4436         szc = targ->p_szc;
4437         ASSERT(szc < mmu_page_sizes);
4438         VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4439         ASSERT(pfn == PFN_BASE(pfn, szc));
4440
4441         npgs = page_get_pagecnt(targ->p_szc);
4442
4443         if (repl == NULL) {
4444                 dofree = npgs;          /* Size of target page in MMU pages */
4445                 if (!page_create_wait(dofree, 0)) {
4446                         if (grouplock != 0) {
4447                                 group_page_unlock(targ);
4448                         }
4449                         VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4450                         return (ENOMEM);
4451                 }
4452
4453                 /*
4454                  * seg kmem pages require that the target and replacement
4455                  * page be the same pagesize.
4456                  */
4457                 flags = (VN_ISKAS(targ->p_vnode)) ? PGR_SAMESZC : 0;
4458                 repl = page_get_replacement_page(targ, lgrp, flags);
4459                 if (repl == NULL) {
4460                         if (grouplock != 0) {
4461                                 group_page_unlock(targ);
4462                         }
4463                         page_create_putback(dofree);
4464                         VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4465                         return (ENOMEM);
4466                 }
4467         }
4468 #ifdef DEBUG
4469         else {
4470                 ASSERT(PAGE_LOCKED(repl));
4471         }
4472 #endif /* DEBUG */
4473
4474
4475         first_repl = repl;
4476
4477         for (i = 0; i < npgs; i++) {
4478                 ASSERT(PAGE_EXCL(targ));
4479                 ASSERT(targ->p_slckcnt == 0);
4480                 ASSERT(repl->p_slckcnt == 0);
4481
4482                 (void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD);
4483
4484                 ASSERT(hat_page_getshare(targ) == 0);
4485                 ASSERT(!PP_ISFREE(targ));
4486                 ASSERT(targ->p_pagenum == (pfn + i));
4487                 ASSERT(repl_contig == 0 ||
4488                     repl->p_pagenum == (repl_pfn + i));
4489
4490                 /*
4491                  * Copy the page contents and attributes then
4492                  * relocate the page in the page hash.
4493                  */
4494                 if (ppcopy(targ, repl) == 0) {
4495                         targ = *target;
4496                         repl = first_repl;
4497                         VM_STAT_ADD(vmm_vmstats.ppr_copyfail);
4498                         if (grouplock != 0) {
4499                                 group_page_unlock(targ);
4500                         }
4501                         if (dofree) {
4502                                 *replacement = NULL;
4503                                 page_free_replacement_page(repl);
4504                                 page_create_putback(dofree);
4505                         }
4506                         return (EIO);
4507                 }
4508
4509                 targ++;
4510                 if (repl_contig != 0) {
4511                         repl++;
4512                 } else {
4513                         repl = repl->p_next;
4514                 }
4515         }
4516
4517         repl = first_repl;
4518         targ = *target;
4519
4520         for (i = 0; i < npgs; i++) {
4521                 ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO));
4522                 page_clr_all_props(repl);
4523                 page_set_props(repl, ppattr);
4524                 page_relocate_hash(repl, targ);
4525
4526                 ASSERT(hat_page_getshare(targ) == 0);
4527                 ASSERT(hat_page_getshare(repl) == 0);
4528                 /*
4529                  * Now clear the props on targ, after the
4530                  * page_relocate_hash(), they no longer
4531                  * have any meaning.
4532                  */
4533                 page_clr_all_props(targ);
4534                 ASSERT(targ->p_next == targ);
4535                 ASSERT(targ->p_prev == targ);
4536                 page_list_concat(&pl, &targ);
4537
4538                 targ++;
4539                 if (repl_contig != 0) {
4540                         repl++;
4541                 } else {
4542                         repl = repl->p_next;
4543                 }
4544         }
4545         /* assert that we have come full circle with repl */
4546         ASSERT(repl_contig == 1 || first_repl == repl);
4547
4548         *target = pl;
4549         if (*replacement == NULL) {
4550                 ASSERT(first_repl == repl);
4551                 *replacement = repl;
4552         }
4553         VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4554         *nrelocp = npgs;
4555         return (0);
4556 }
4557 /*
4558  * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated.
4559  */
4560 int
4561 page_relocate(
4562         page_t **target,
4563         page_t **replacement,
4564         int grouplock,
4565         int freetarget,
4566         spgcnt_t *nrelocp,
4567         lgrp_t *lgrp)
4568 {
4569         spgcnt_t ret;
4570
4571         /* do_page_relocate returns 0 on success or errno value */
4572         ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp);
4573
4574         if (ret != 0 || freetarget == 0) {
4575                 return (ret);
4576         }
4577         if (*nrelocp == 1) {
4578                 ASSERT(*target != NULL);
4579                 page_free(*target, 1);
4580         } else {
4581                 page_t *tpp = *target;
4582                 uint_t szc = tpp->p_szc;
4583                 pgcnt_t npgs = page_get_pagecnt(szc);
4584                 ASSERT(npgs > 1);
4585                 ASSERT(szc != 0);
4586                 do {
4587                         ASSERT(PAGE_EXCL(tpp));
4588                         ASSERT(!hat_page_is_mapped(tpp));
4589                         ASSERT(tpp->p_szc == szc);
4590                         PP_SETFREE(tpp);
4591                         PP_SETAGED(tpp);
4592                         npgs--;
4593                 } while ((tpp = tpp->p_next) != *target);
4594                 ASSERT(npgs == 0);
4595                 page_list_add_pages(*target, 0);
4596                 npgs = page_get_pagecnt(szc);
4597                 page_create_putback(npgs);
4598         }
4599         return (ret);
4600 }
4601
4602 /*
4603  * it is up to the caller to deal with pcf accounting.
4604  */
4605 void
4606 page_free_replacement_page(page_t *pplist)
4607 {
4608         page_t *pp;
4609
4610         while (pplist != NULL) {
4611                 /*
4612                  * pp_targ is a linked list.
4613                  */
4614                 pp = pplist;
4615                 if (pp->p_szc == 0) {
4616                         page_sub(&pplist, pp);
4617                         page_clr_all_props(pp);
4618                         PP_SETFREE(pp);
4619                         PP_SETAGED(pp);
4620                         page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
4621                         page_unlock(pp);
4622                         VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]);
4623                 } else {
4624                         spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc);
4625                         page_t *tpp;
4626                         page_list_break(&pp, &pplist, curnpgs);
4627                         tpp = pp;
4628                         do {
4629                                 ASSERT(PAGE_EXCL(tpp));
4630                                 ASSERT(!hat_page_is_mapped(tpp));
4631                                 page_clr_all_props(tpp);
4632                                 PP_SETFREE(tpp);
4633                                 PP_SETAGED(tpp);
4634                         } while ((tpp = tpp->p_next) != pp);
4635                         page_list_add_pages(pp, 0);
4636                         VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]);
4637                 }
4638         }
4639 }
4640
4641 /*
4642  * Release the page lock on a page, place on cachelist
4643  * tail if no longer mapped. Caller can let us know if
4644  * the page is known to be clean.
4645  */
4646 int
4647 page_release(page_t *pp, int checkmod)
4648 {
4649         int status;
4650
4651         ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) &&
4652             (pp->p_vnode != NULL));
4653
4654         if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) &&
4655             ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) &&
4656             pp->p_lckcnt == 0 && pp->p_cowcnt == 0 &&
4657             !hat_page_is_mapped(pp)) {
4658
4659                 /*
4660                  * If page is modified, unlock it
4661                  *
4662                  * (p_nrm & P_MOD) bit has the latest stuff because:
4663                  * (1) We found that this page doesn't have any mappings
4664                  *      _after_ holding SE_EXCL and
4665                  * (2) We didn't drop SE_EXCL lock after the check in (1)
4666                  */
4667                 if (checkmod && hat_ismod(pp)) {
4668                         page_unlock(pp);
4669                         status = PGREL_MOD;
4670                 } else {
4671                         VN_DISPOSE(pp, B_FREE, 0, kcred);
4672                         status = PGREL_CLEAN;
4673                 }
4674         } else {
4675                 page_unlock(pp);
4676                 status = PGREL_NOTREL;
4677         }
4678         return (status);
4679 }
4680
4681 /*
4682  * Given a constituent page, try to demote the large page on the freelist.
4683  *
4684  * Returns nonzero if the page could be demoted successfully. Returns with
4685  * the constituent page still locked.
4686  */
4687 int
4688 page_try_demote_free_pages(page_t *pp)
4689 {
4690         page_t *rootpp = pp;
4691         pfn_t   pfn = page_pptonum(pp);
4692         spgcnt_t npgs;
4693         uint_t  szc = pp->p_szc;
4694
4695         ASSERT(PP_ISFREE(pp));
4696         ASSERT(PAGE_EXCL(pp));
4697
4698         /*
4699          * Adjust rootpp and lock it, if `pp' is not the base
4700          * constituent page.
4701          */
4702         npgs = page_get_pagecnt(pp->p_szc);
4703         if (npgs == 1) {
4704                 return (0);
4705         }
4706
4707         if (!IS_P2ALIGNED(pfn, npgs)) {
4708                 pfn = P2ALIGN(pfn, npgs);
4709                 rootpp = page_numtopp_nolock(pfn);
4710         }
4711
4712         if (pp != rootpp && !page_trylock(rootpp, SE_EXCL)) {
4713                 return (0);
4714         }
4715
4716         if (rootpp->p_szc != szc) {
4717                 if (pp != rootpp)
4718                         page_unlock(rootpp);
4719                 return (0);
4720         }
4721
4722         page_demote_free_pages(rootpp);
4723
4724         if (pp != rootpp)
4725                 page_unlock(rootpp);
4726
4727         ASSERT(PP_ISFREE(pp));
4728         ASSERT(PAGE_EXCL(pp));
4729         return (1);
4730 }
4731
4732 /*
4733  * Given a constituent page, try to demote the large page.
4734  *
4735  * Returns nonzero if the page could be demoted successfully. Returns with
4736  * the constituent page still locked.
4737  */
4738 int
4739 page_try_demote_pages(page_t *pp)
4740 {
4741         page_t *tpp, *rootpp = pp;
4742         pfn_t   pfn = page_pptonum(pp);
4743         spgcnt_t i, npgs;
4744         uint_t  szc = pp->p_szc;
4745         vnode_t *vp = pp->p_vnode;
4746
4747         ASSERT(PAGE_EXCL(pp));
4748
4749         VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]);
4750
4751         if (pp->p_szc == 0) {
4752                 VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]);
4753                 return (1);
4754         }
4755
4756         if (vp != NULL && !IS_SWAPFSVP(vp) && !VN_ISKAS(vp)) {
4757                 VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]);
4758                 page_demote_vp_pages(pp);
4759                 ASSERT(pp->p_szc == 0);
4760                 return (1);
4761         }
4762
4763         /*
4764          * Adjust rootpp if passed in is not the base
4765          * constituent page.
4766          */
4767         npgs = page_get_pagecnt(pp->p_szc);
4768         ASSERT(npgs > 1);
4769         if (!IS_P2ALIGNED(pfn, npgs)) {
4770                 pfn = P2ALIGN(pfn, npgs);
4771                 rootpp = page_numtopp_nolock(pfn);
4772                 VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]);
4773                 ASSERT(rootpp->p_vnode != NULL);
4774                 ASSERT(rootpp->p_szc == szc);
4775         }
4776
4777         /*
4778          * We can't demote kernel pages since we can't hat_unload()
4779          * the mappings.
4780          */
4781         if (VN_ISKAS(rootpp->p_vnode))
4782                 return (0);
4783
4784         /*
4785          * Attempt to lock all constituent pages except the page passed
4786          * in since it's already locked.
4787          */
4788         for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
4789                 ASSERT(!PP_ISFREE(tpp));
4790                 ASSERT(tpp->p_vnode != NULL);
4791
4792                 if (tpp != pp && !page_trylock(tpp, SE_EXCL))
4793                         break;
4794                 ASSERT(tpp->p_szc == rootpp->p_szc);
4795                 ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i);
4796         }
4797
4798         /*
4799          * If we failed to lock them all then unlock what we have
4800          * locked so far and bail.
4801          */
4802         if (i < npgs) {
4803                 tpp = rootpp;
4804                 while (i-- > 0) {
4805                         if (tpp != pp)
4806                                 page_unlock(tpp);
4807                         tpp++;
4808                 }
4809                 VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]);
4810                 return (0);
4811         }
4812
4813         for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
4814                 ASSERT(PAGE_EXCL(tpp));
4815                 ASSERT(tpp->p_slckcnt == 0);
4816                 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
4817                 tpp->p_szc = 0;
4818         }
4819
4820         /*
4821          * Unlock all pages except the page passed in.
4822          */
4823         for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
4824                 ASSERT(!hat_page_is_mapped(tpp));
4825                 if (tpp != pp)
4826                         page_unlock(tpp);
4827         }
4828
4829         VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]);
4830         return (1);
4831 }
4832
4833 /*
4834  * Called by page_free() and page_destroy() to demote the page size code
4835  * (p_szc) to 0 (since we can't just put a single PAGESIZE page with non zero
4836  * p_szc on free list, neither can we just clear p_szc of a single page_t
4837  * within a large page since it will break other code that relies on p_szc
4838  * being the same for all page_t's of a large page). Anonymous pages should
4839  * never end up here because anon_map_getpages() cannot deal with p_szc
4840  * changes after a single constituent page is locked.  While anonymous or
4841  * kernel large pages are demoted or freed the entire large page at a time
4842  * with all constituent pages locked EXCL for the file system pages we
4843  * have to be able to demote a large page (i.e. decrease all constituent pages
4844  * p_szc) with only just an EXCL lock on one of constituent pages. The reason
4845  * we can easily deal with anonymous page demotion the entire large page at a
4846  * time is that those operation originate at address space level and concern
4847  * the entire large page region with actual demotion only done when pages are
4848  * not shared with any other processes (therefore we can always get EXCL lock
4849  * on all anonymous constituent pages after clearing segment page
4850  * cache). However file system pages can be truncated or invalidated at a
4851  * PAGESIZE level from the file system side and end up in page_free() or
4852  * page_destroy() (we also allow only part of the large page to be SOFTLOCKed
4853  * and therefore pageout should be able to demote a large page by EXCL locking
4854  * any constituent page that is not under SOFTLOCK). In those cases we cannot
4855  * rely on being able to lock EXCL all constituent pages.
4856  *
4857  * To prevent szc changes on file system pages one has to lock all constituent
4858  * pages at least SHARED (or call page_szc_lock()). The only subsystem that
4859  * doesn't rely on locking all constituent pages (or using page_szc_lock()) to
4860  * prevent szc changes is hat layer that uses its own page level mlist
4861  * locks. hat assumes that szc doesn't change after mlist lock for a page is
4862  * taken. Therefore we need to change szc under hat level locks if we only
4863  * have an EXCL lock on a single constituent page and hat still references any
4864  * of constituent pages.  (Note we can't "ignore" hat layer by simply
4865  * hat_pageunload() all constituent pages without having EXCL locks on all of
4866  * constituent pages). We use hat_page_demote() call to safely demote szc of
4867  * all constituent pages under hat locks when we only have an EXCL lock on one
4868  * of constituent pages.
4869  *
4870  * This routine calls page_szc_lock() before calling hat_page_demote() to
4871  * allow segvn in one special case not to lock all constituent pages SHARED
4872  * before calling hat_memload_array() that relies on p_szc not changing even
4873  * before hat level mlist lock is taken.  In that case segvn uses
4874  * page_szc_lock() to prevent hat_page_demote() changing p_szc values.
4875  *
4876  * Anonymous or kernel page demotion still has to lock all pages exclusively
4877  * and do hat_pageunload() on all constituent pages before demoting the page
4878  * therefore there's no need for anonymous or kernel page demotion to use
4879  * hat_page_demote() mechanism.
4880  *
4881  * hat_page_demote() removes all large mappings that map pp and then decreases
4882  * p_szc starting from the last constituent page of the large page. By working
4883  * from the tail of a large page in pfn decreasing order allows one looking at
4884  * the root page to know that hat_page_demote() is done for root's szc area.
4885  * e.g. if a root page has szc 1 one knows it only has to lock all constituent
4886  * pages within szc 1 area to prevent szc changes because hat_page_demote()
4887  * that started on this page when it had szc > 1 is done for this szc 1 area.
4888  *
4889  * We are guaranteed that all constituent pages of pp's large page belong to
4890  * the same vnode with the consecutive offsets increasing in the direction of
4891  * the pfn i.e. the identity of constituent pages can't change until their
4892  * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove
4893  * large mappings to pp even though we don't lock any constituent page except
4894  * pp (i.e. we won't unload e.g. kernel locked page).
4895  */
4896 static void
4897 page_demote_vp_pages(page_t *pp)
4898 {
4899         kmutex_t *mtx;
4900
4901         ASSERT(PAGE_EXCL(pp));
4902         ASSERT(!PP_ISFREE(pp));
4903         ASSERT(pp->p_vnode != NULL);
4904         ASSERT(!IS_SWAPFSVP(pp->p_vnode));
4905         ASSERT(!PP_ISKAS(pp));
4906
4907         VM_STAT_ADD(pagecnt.pc_demote_pages[0]);
4908
4909         mtx = page_szc_lock(pp);
4910         if (mtx != NULL) {
4911                 hat_page_demote(pp);
4912                 mutex_exit(mtx);
4913         }
4914         ASSERT(pp->p_szc == 0);
4915 }
4916
4917 /*
4918  * Mark any existing pages for migration in the given range
4919  */
4920 void
4921 page_mark_migrate(struct seg *seg, caddr_t addr, size_t len,
4922     struct anon_map *amp, ulong_t anon_index, struct vmobject *obj,
4923     uoff_t objoff, int rflag)
4924 {
4925         struct anon     *ap;
4926         struct vmobject *curobj;
4927         lgrp_t          *from;
4928         pgcnt_t         nlocked;
4929         uoff_t  off;
4930         pfn_t           pfn;
4931         size_t          pgsz;
4932         size_t          segpgsz;
4933         pgcnt_t         pages;
4934         uint_t          pszc;
4935         page_t          *pp0, *pp;
4936         caddr_t         va;
4937         ulong_t         an_idx;
4938         anon_sync_obj_t cookie;
4939
4940         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
4941
4942         /*
4943          * Don't do anything if don't need to do lgroup optimizations
4944          * on this system
4945          */
4946         if (!lgrp_optimizations())
4947                 return;
4948
4949         /*
4950          * Align address and length to (potentially large) page boundary
4951          */
4952         segpgsz = page_get_pagesize(seg->s_szc);
4953         addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz);
4954         if (rflag)
4955                 len = P2ROUNDUP(len, segpgsz);
4956
4957         /*
4958          * Do one (large) page at a time
4959          */
4960         va = addr;
4961         while (va < addr + len) {
4962                 /*
4963                  * Lookup (root) page for vnode and offset corresponding to
4964                  * this virtual address
4965                  * Try anonmap first since there may be copy-on-write
4966                  * pages, but initialize object pointer and offset using
4967                  * arguments just in case there isn't an amp.
4968                  */
4969                 curobj = obj;
4970                 off = objoff + va - seg->s_base;
4971                 if (amp) {
4972                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
4973                         an_idx = anon_index + seg_page(seg, va);
4974                         anon_array_enter(amp, an_idx, &cookie);
4975                         ap = anon_get_ptr(amp->ahp, an_idx);
4976                         if (ap) {
4977                                 struct vnode *vn;
4978
4979                                 swap_xlate(ap, &vn, &off);
4980
4981                                 curobj = (vn != NULL) ? &vn->v_object : NULL;
4982                         }
4983                         anon_array_exit(&cookie);
4984                         ANON_LOCK_EXIT(&amp->a_rwlock);
4985                 }
4986
4987                 pp = NULL;
4988                 if (curobj)
4989                         pp = page_lookup(curobj, off, SE_SHARED);
4990
4991                 /*
4992                  * If there isn't a page at this virtual address,
4993                  * skip to next page
4994                  */
4995                 if (pp == NULL) {
4996                         va += PAGESIZE;
4997                         continue;
4998                 }
4999
5000                 /*
5001                  * Figure out which lgroup this page is in for kstats
5002                  */
5003                 pfn = page_pptonum(pp);
5004                 from = lgrp_pfn_to_lgrp(pfn);
5005
5006                 /*
5007                  * Get page size, and round up and skip to next page boundary
5008                  * if unaligned address
5009                  */
5010                 pszc = pp->p_szc;
5011                 pgsz = page_get_pagesize(pszc);
5012                 pages = btop(pgsz);
5013                 if (!IS_P2ALIGNED(va, pgsz) ||
5014                     !IS_P2ALIGNED(pfn, pages) ||
5015                     pgsz > segpgsz) {
5016                         pgsz = MIN(pgsz, segpgsz);
5017                         page_unlock(pp);
5018                         pages = btop(P2END((uintptr_t)va, pgsz) -
5019                             (uintptr_t)va);
5020                         va = (caddr_t)P2END((uintptr_t)va, pgsz);
5021                         lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, pages);
5022                         continue;
5023                 }
5024
5025                 /*
5026                  * Upgrade to exclusive lock on page
5027                  */
5028                 if (!page_tryupgrade(pp)) {
5029                         page_unlock(pp);
5030                         va += pgsz;
5031                         lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5032                             btop(pgsz));
5033                         continue;
5034                 }
5035
5036                 pp0 = pp++;
5037                 nlocked = 1;
5038
5039                 /*
5040                  * Lock constituent pages if this is large page
5041                  */
5042                 if (pages > 1) {
5043                         /*
5044                          * Lock all constituents except root page, since it
5045                          * should be locked already.
5046                          */
5047                         for (; nlocked < pages; nlocked++) {
5048                                 if (!page_trylock(pp, SE_EXCL)) {
5049                                         break;
5050                                 }
5051                                 if (PP_ISFREE(pp) ||
5052                                     pp->p_szc != pszc) {
5053                                         /*
5054                                          * hat_page_demote() raced in with us.
5055                                          */
5056                                         ASSERT(!IS_SWAPFSVP(curobj->vnode));
5057                                         page_unlock(pp);
5058                                         break;
5059                                 }
5060                                 pp++;
5061                         }
5062                 }
5063
5064                 /*
5065                  * If all constituent pages couldn't be locked,
5066                  * unlock pages locked so far and skip to next page.
5067                  */
5068                 if (nlocked < pages) {
5069                         while (pp0 < pp) {
5070                                 page_unlock(pp0++);
5071                         }
5072                         va += pgsz;
5073                         lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5074                             btop(pgsz));
5075                         continue;
5076                 }
5077
5078                 /*
5079                  * hat_page_demote() can no longer happen
5080                  * since last cons page had the right p_szc after
5081                  * all cons pages were locked. all cons pages
5082                  * should now have the same p_szc.
5083                  */
5084
5085                 /*
5086                  * All constituent pages locked successfully, so mark
5087                  * large page for migration and unload the mappings of
5088                  * constituent pages, so a fault will occur on any part of the
5089                  * large page
5090                  */
5091                 PP_SETMIGRATE(pp0);
5092                 while (pp0 < pp) {
5093                         (void) hat_pageunload(pp0, HAT_FORCE_PGUNLOAD);
5094                         ASSERT(hat_page_getshare(pp0) == 0);
5095                         page_unlock(pp0++);
5096                 }
5097                 lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked);
5098
5099                 va += pgsz;
5100         }
5101 }
5102
5103 /*
5104  * Migrate any pages that have been marked for migration in the given range
5105  */
5106 void
5107 page_migrate(
5108         struct seg      *seg,
5109         caddr_t         addr,
5110         page_t          **ppa,
5111         pgcnt_t         npages)
5112 {
5113         lgrp_t          *from;
5114         lgrp_t          *to;
5115         page_t          *newpp;
5116         page_t          *pp;
5117         pfn_t           pfn;
5118         size_t          pgsz;
5119         spgcnt_t        page_cnt;
5120         spgcnt_t        i;
5121         uint_t          pszc;
5122
5123         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
5124
5125         while (npages > 0) {
5126                 pp = *ppa;
5127                 pszc = pp->p_szc;
5128                 pgsz = page_get_pagesize(pszc);
5129                 page_cnt = btop(pgsz);
5130
5131                 /*
5132                  * Check to see whether this page is marked for migration
5133                  *
5134                  * Assume that root page of large page is marked for
5135                  * migration and none of the other constituent pages
5136                  * are marked.  This really simplifies clearing the
5137                  * migrate bit by not having to clear it from each
5138                  * constituent page.
5139                  *
5140                  * note we don't want to relocate an entire large page if
5141                  * someone is only using one subpage.
5142                  */
5143                 if (npages < page_cnt)
5144                         break;
5145
5146                 /*
5147                  * Is it marked for migration?
5148                  */
5149                 if (!PP_ISMIGRATE(pp))
5150                         goto next;
5151
5152                 /*
5153                  * Determine lgroups that page is being migrated between
5154                  */
5155                 pfn = page_pptonum(pp);
5156                 if (!IS_P2ALIGNED(pfn, page_cnt)) {
5157                         break;
5158                 }
5159                 from = lgrp_pfn_to_lgrp(pfn);
5160                 to = lgrp_mem_choose(seg, addr, pgsz);
5161
5162                 /*
5163                  * Need to get exclusive lock's to migrate
5164                  */
5165                 for (i = 0; i < page_cnt; i++) {
5166                         ASSERT(PAGE_LOCKED(ppa[i]));
5167                         if (page_pptonum(ppa[i]) != pfn + i ||
5168                             ppa[i]->p_szc != pszc) {
5169                                 break;
5170                         }
5171                         if (!page_tryupgrade(ppa[i])) {
5172                                 lgrp_stat_add(from->lgrp_id,
5173                                     LGRP_PM_FAIL_LOCK_PGS,
5174                                     page_cnt);
5175                                 break;
5176                         }
5177
5178                         /*
5179                          * Check to see whether we are trying to migrate
5180                          * page to lgroup where it is allocated already.
5181                          * If so, clear the migrate bit and skip to next
5182                          * page.
5183                          */
5184                         if (i == 0 && to == from) {
5185                                 PP_CLRMIGRATE(ppa[0]);
5186                                 page_downgrade(ppa[0]);
5187                                 goto next;
5188                         }
5189                 }
5190
5191                 /*
5192                  * If all constituent pages couldn't be locked,
5193                  * unlock pages locked so far and skip to next page.
5194                  */
5195                 if (i != page_cnt) {
5196                         while (--i != -1) {
5197                                 page_downgrade(ppa[i]);
5198                         }
5199                         goto next;
5200                 }
5201
5202                 (void) page_create_wait(page_cnt, PG_WAIT);
5203                 newpp = page_get_replacement_page(pp, to, PGR_SAMESZC);
5204                 if (newpp == NULL) {
5205                         page_create_putback(page_cnt);
5206                         for (i = 0; i < page_cnt; i++) {
5207                                 page_downgrade(ppa[i]);
5208                         }
5209                         lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS,
5210                             page_cnt);
5211                         goto next;
5212                 }
5213                 ASSERT(newpp->p_szc == pszc);
5214                 /*
5215                  * Clear migrate bit and relocate page
5216                  */
5217                 PP_CLRMIGRATE(pp);
5218                 if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) {
5219                         panic("page_migrate: page_relocate failed");
5220                 }
5221                 ASSERT(page_cnt * PAGESIZE == pgsz);
5222
5223                 /*
5224                  * Keep stats for number of pages migrated from and to
5225                  * each lgroup
5226                  */
5227                 lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt);
5228                 lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt);
5229                 /*
5230                  * update the page_t array we were passed in and
5231                  * unlink constituent pages of a large page.
5232                  */
5233                 for (i = 0; i < page_cnt; ++i, ++pp) {
5234                         ASSERT(PAGE_EXCL(newpp));
5235                         ASSERT(newpp->p_szc == pszc);
5236                         ppa[i] = newpp;
5237                         pp = newpp;
5238                         page_sub(&newpp, pp);
5239                         page_downgrade(pp);
5240                 }
5241                 ASSERT(newpp == NULL);
5242 next:
5243                 addr += pgsz;
5244                 ppa += page_cnt;
5245                 npages -= page_cnt;
5246         }
5247 }
5248
5249 uint_t page_reclaim_maxcnt = 60; /* max total iterations */
5250 uint_t page_reclaim_nofree_maxcnt = 3; /* max iterations without progress */
5251 /*
5252  * Reclaim/reserve availrmem for npages.
5253  * If there is not enough memory start reaping seg, kmem caches.
5254  * Start pageout scanner (via page_needfree()).
5255  * Exit after ~ MAX_CNT s regardless of how much memory has been released.
5256  * Note: There is no guarantee that any availrmem will be freed as
5257  * this memory typically is locked (kernel heap) or reserved for swap.
5258  * Also due to memory fragmentation kmem allocator may not be able
5259  * to free any memory (single user allocated buffer will prevent
5260  * freeing slab or a page).
5261  */
5262 int
5263 page_reclaim_mem(pgcnt_t npages, pgcnt_t epages, int adjust)
5264 {
5265         int     i = 0;
5266         int     i_nofree = 0;
5267         int     ret = 0;
5268         pgcnt_t deficit;
5269         pgcnt_t old_availrmem = 0;
5270
5271         mutex_enter(&freemem_lock);
5272         while (availrmem < tune.t_minarmem + npages + epages &&
5273             i++ < page_reclaim_maxcnt) {
5274                 /* ensure we made some progress in the last few iterations */
5275                 if (old_availrmem < availrmem) {
5276                         old_availrmem = availrmem;
5277                         i_nofree = 0;
5278                 } else if (i_nofree++ >= page_reclaim_nofree_maxcnt) {
5279                         break;
5280                 }
5281
5282                 deficit = tune.t_minarmem + npages + epages - availrmem;
5283                 mutex_exit(&freemem_lock);
5284                 page_needfree(deficit);
5285                 kmem_reap();
5286                 ddi_sleep(1);
5287                 page_needfree(-(spgcnt_t)deficit);
5288                 mutex_enter(&freemem_lock);
5289         }
5290
5291         if (adjust && (availrmem >= tune.t_minarmem + npages + epages)) {
5292                 availrmem -= npages;
5293                 ret = 1;
5294         }
5295
5296         mutex_exit(&freemem_lock);
5297
5298         return (ret);
5299 }
5300
5301 /*
5302  * Search the memory segments to locate the desired page.  Within a
5303  * segment, pages increase linearly with one page structure per
5304  * physical page frame (size PAGESIZE).  The search begins
5305  * with the segment that was accessed last, to take advantage of locality.
5306  * If the hint misses, we start from the beginning of the sorted memseg list
5307  */
5308
5309
5310 /*
5311  * Some data structures for pfn to pp lookup.
5312  */
5313 ulong_t mhash_per_slot;
5314 struct memseg *memseg_hash[N_MEM_SLOTS];
5315
5316 page_t *
5317 page_numtopp_nolock(pfn_t pfnum)
5318 {
5319         struct memseg *seg;
5320         page_t *pp;
5321         vm_cpu_data_t *vc;
5322
5323         /*
5324          * We need to disable kernel preemption while referencing the
5325          * cpu_vm_data field in order to prevent us from being switched to
5326          * another cpu and trying to reference it after it has been freed.
5327          * This will keep us on cpu and prevent it from being removed while
5328          * we are still on it.
5329          *
5330          * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5331          * which is being resued by DR who will flush those references
5332          * before modifying the reused memseg.  See memseg_cpu_vm_flush().
5333          */
5334         kpreempt_disable();
5335         vc = CPU->cpu_vm_data;
5336         ASSERT(vc != NULL);
5337
5338         MEMSEG_STAT_INCR(nsearch);
5339
5340         /* Try last winner first */
5341         if (((seg = vc->vc_pnum_memseg) != NULL) &&
5342             (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5343                 MEMSEG_STAT_INCR(nlastwon);
5344                 pp = seg->pages + (pfnum - seg->pages_base);
5345                 if (pp->p_pagenum == pfnum) {
5346                         kpreempt_enable();
5347                         return ((page_t *)pp);
5348                 }
5349         }
5350
5351         /* Else Try hash */
5352         if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5353             (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5354                 MEMSEG_STAT_INCR(nhashwon);
5355                 vc->vc_pnum_memseg = seg;
5356                 pp = seg->pages + (pfnum - seg->pages_base);
5357                 if (pp->p_pagenum == pfnum) {
5358                         kpreempt_enable();
5359                         return ((page_t *)pp);
5360                 }
5361         }
5362
5363         /* Else Brute force */
5364         for (seg = memsegs; seg != NULL; seg = seg->next) {
5365                 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5366                         vc->vc_pnum_memseg = seg;
5367                         pp = seg->pages + (pfnum - seg->pages_base);
5368                         if (pp->p_pagenum == pfnum) {
5369                                 kpreempt_enable();
5370                                 return ((page_t *)pp);
5371                         }
5372                 }
5373         }
5374         vc->vc_pnum_memseg = NULL;
5375         kpreempt_enable();
5376         MEMSEG_STAT_INCR(nnotfound);
5377         return (NULL);
5378
5379 }
5380
5381 struct memseg *
5382 page_numtomemseg_nolock(pfn_t pfnum)
5383 {
5384         struct memseg *seg;
5385         page_t *pp;
5386
5387         /*
5388          * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5389          * which is being resued by DR who will flush those references
5390          * before modifying the reused memseg.  See memseg_cpu_vm_flush().
5391          */
5392         kpreempt_disable();
5393         /* Try hash */
5394         if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5395             (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5396                 pp = seg->pages + (pfnum - seg->pages_base);
5397                 if (pp->p_pagenum == pfnum) {
5398                         kpreempt_enable();
5399                         return (seg);
5400                 }
5401         }
5402
5403         /* Else Brute force */
5404         for (seg = memsegs; seg != NULL; seg = seg->next) {
5405                 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5406                         pp = seg->pages + (pfnum - seg->pages_base);
5407                         if (pp->p_pagenum == pfnum) {
5408                                 kpreempt_enable();
5409                                 return (seg);
5410                         }
5411                 }
5412         }
5413         kpreempt_enable();
5414         return (NULL);
5415 }
5416
5417 /*
5418  * Given a page and a count return the page struct that is
5419  * n structs away from the current one in the global page
5420  * list.
5421  *
5422  * This function wraps to the first page upon
5423  * reaching the end of the memseg list.
5424  */
5425 page_t *
5426 page_nextn(page_t *pp, ulong_t n)
5427 {
5428         struct memseg *seg;
5429         page_t *ppn;
5430         vm_cpu_data_t *vc;
5431
5432         /*
5433          * We need to disable kernel preemption while referencing the
5434          * cpu_vm_data field in order to prevent us from being switched to
5435          * another cpu and trying to reference it after it has been freed.
5436          * This will keep us on cpu and prevent it from being removed while
5437          * we are still on it.
5438          *
5439          * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5440          * which is being resued by DR who will flush those references
5441          * before modifying the reused memseg.  See memseg_cpu_vm_flush().
5442          */
5443         kpreempt_disable();
5444         vc = (vm_cpu_data_t *)CPU->cpu_vm_data;
5445
5446         ASSERT(vc != NULL);
5447
5448         if (((seg = vc->vc_pnext_memseg) == NULL) ||
5449             (seg->pages_base == seg->pages_end) ||
5450             !(pp >= seg->pages && pp < seg->epages)) {
5451
5452                 for (seg = memsegs; seg; seg = seg->next) {
5453                         if (pp >= seg->pages && pp < seg->epages)
5454                                 break;
5455                 }
5456
5457                 if (seg == NULL) {
5458                         /* Memory delete got in, return something valid. */
5459                         /* TODO: fix me. */
5460                         seg = memsegs;
5461                         pp = seg->pages;
5462                 }
5463         }
5464
5465         /* check for wraparound - possible if n is large */
5466         while ((ppn = (pp + n)) >= seg->epages || ppn < pp) {
5467                 n -= seg->epages - pp;
5468                 seg = seg->next;
5469                 if (seg == NULL)
5470                         seg = memsegs;
5471                 pp = seg->pages;
5472         }
5473         vc->vc_pnext_memseg = seg;
5474         kpreempt_enable();
5475         return (ppn);
5476 }
5477
5478 /*
5479  * Initialize for a loop using page_next_scan_large().
5480  */
5481 page_t *
5482 page_next_scan_init(void **cookie)
5483 {
5484         ASSERT(cookie != NULL);
5485         *cookie = (void *)memsegs;
5486         return ((page_t *)memsegs->pages);
5487 }
5488
5489 /*
5490  * Return the next page in a scan of page_t's, assuming we want
5491  * to skip over sub-pages within larger page sizes.
5492  *
5493  * The cookie is used to keep track of the current memseg.
5494  */
5495 page_t *
5496 page_next_scan_large(
5497         page_t          *pp,
5498         ulong_t         *n,
5499         void            **cookie)
5500 {
5501         struct memseg   *seg = (struct memseg *)*cookie;
5502         page_t          *new_pp;
5503         ulong_t         cnt;
5504         pfn_t           pfn;
5505
5506
5507         /*
5508          * get the count of page_t's to skip based on the page size
5509          */
5510         ASSERT(pp != NULL);
5511         if (pp->p_szc == 0) {
5512                 cnt = 1;
5513         } else {
5514                 pfn = page_pptonum(pp);
5515                 cnt = page_get_pagecnt(pp->p_szc);
5516                 cnt -= pfn & (cnt - 1);
5517         }
5518         *n += cnt;
5519         new_pp = pp + cnt;
5520
5521         /*
5522          * Catch if we went past the end of the current memory segment. If so,
5523          * just move to the next segment with pages.
5524          */
5525         if (new_pp >= seg->epages || seg->pages_base == seg->pages_end) {
5526                 do {
5527                         seg = seg->next;
5528                         if (seg == NULL)
5529                                 seg = memsegs;
5530                 } while (seg->pages_base == seg->pages_end);
5531                 new_pp = seg->pages;
5532                 *cookie = (void *)seg;
5533         }
5534
5535         return (new_pp);
5536 }
5537
5538
5539 /*
5540  * Returns next page in list. Note: this function wraps
5541  * to the first page in the list upon reaching the end
5542  * of the list. Callers should be aware of this fact.
5543  */
5544
5545 /* We should change this be a #define */
5546
5547 page_t *
5548 page_next(page_t *pp)
5549 {
5550         return (page_nextn(pp, 1));
5551 }
5552
5553 page_t *
5554 page_first()
5555 {
5556         return ((page_t *)memsegs->pages);
5557 }
5558
5559
5560 /*
5561  * This routine is called at boot with the initial memory configuration
5562  * and when memory is added or removed.
5563  */
5564 void
5565 build_pfn_hash()
5566 {
5567         pfn_t cur;
5568         pgcnt_t index;
5569         struct memseg *pseg;
5570         int     i;
5571
5572         /*
5573          * Clear memseg_hash array.
5574          * Since memory add/delete is designed to operate concurrently
5575          * with normal operation, the hash rebuild must be able to run
5576          * concurrently with page_numtopp_nolock(). To support this
5577          * functionality, assignments to memseg_hash array members must
5578          * be done atomically.
5579          *
5580          * NOTE: bzero() does not currently guarantee this for kernel
5581          * threads, and cannot be used here.
5582          */
5583         for (i = 0; i < N_MEM_SLOTS; i++)
5584                 memseg_hash[i] = NULL;
5585
5586         hat_kpm_mseghash_clear(N_MEM_SLOTS);
5587
5588         /*
5589          * Physmax is the last valid pfn.
5590          */
5591         mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT;
5592         for (pseg = memsegs; pseg != NULL; pseg = pseg->next) {
5593                 index = MEMSEG_PFN_HASH(pseg->pages_base);
5594                 cur = pseg->pages_base;
5595                 do {
5596                         if (index >= N_MEM_SLOTS)
5597                                 index = MEMSEG_PFN_HASH(cur);
5598
5599                         if (memseg_hash[index] == NULL ||
5600                             memseg_hash[index]->pages_base > pseg->pages_base) {
5601                                 memseg_hash[index] = pseg;
5602                                 hat_kpm_mseghash_update(index, pseg);
5603                         }
5604                         cur += mhash_per_slot;
5605                         index++;
5606                 } while (cur < pseg->pages_end);
5607         }
5608 }
5609
5610 /*
5611  * Return the pagenum for the pp
5612  */
5613 pfn_t
5614 page_pptonum(page_t *pp)
5615 {
5616         return (pp->p_pagenum);
5617 }
5618
5619 /*
5620  * interface to the referenced and modified etc bits
5621  * in the PSM part of the page struct
5622  * when no locking is desired.
5623  */
5624 void
5625 page_set_props(page_t *pp, uint_t flags)
5626 {
5627         ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0);
5628         pp->p_nrm |= (uchar_t)flags;
5629 }
5630
5631 void
5632 page_clr_all_props(page_t *pp)
5633 {
5634         pp->p_nrm = 0;
5635 }
5636
5637 /*
5638  * Clear p_lckcnt and p_cowcnt, adjusting freemem if required.
5639  */
5640 int
5641 page_clear_lck_cow(page_t *pp, int adjust)
5642 {
5643         int     f_amount;
5644
5645         ASSERT(PAGE_EXCL(pp));
5646
5647         /*
5648          * The page_struct_lock need not be acquired here since
5649          * we require the caller hold the page exclusively locked.
5650          */
5651         f_amount = 0;
5652         if (pp->p_lckcnt) {
5653                 f_amount = 1;
5654                 pp->p_lckcnt = 0;
5655         }
5656         if (pp->p_cowcnt) {
5657                 f_amount += pp->p_cowcnt;
5658                 pp->p_cowcnt = 0;
5659         }
5660
5661         if (adjust && f_amount) {
5662                 mutex_enter(&freemem_lock);
5663                 availrmem += f_amount;
5664                 mutex_exit(&freemem_lock);
5665         }
5666
5667         return (f_amount);
5668 }
5669
5670 /*
5671  * The following functions is called from free_vp_pages()
5672  * for an inexact estimate of a newly free'd page...
5673  */
5674 ulong_t
5675 page_share_cnt(page_t *pp)
5676 {
5677         return (hat_page_getshare(pp));
5678 }
5679
5680 int
5681 page_isshared(page_t *pp)
5682 {
5683         return (hat_page_checkshare(pp, 1));
5684 }
5685
5686 int
5687 page_isfree(page_t *pp)
5688 {
5689         return (PP_ISFREE(pp));
5690 }
5691
5692 int
5693 page_isref(page_t *pp)
5694 {
5695         return (hat_page_getattr(pp, P_REF));
5696 }
5697
5698 int
5699 page_ismod(page_t *pp)
5700 {
5701         return (hat_page_getattr(pp, P_MOD));
5702 }
5703
5704 /*
5705  * The following code all currently relates to the page capture logic:
5706  *
5707  * This logic is used for cases where there is a desire to claim a certain
5708  * physical page in the system for the caller.  As it may not be possible
5709  * to capture the page immediately, the p_toxic bits are used in the page
5710  * structure to indicate that someone wants to capture this page.  When the
5711  * page gets unlocked, the toxic flag will be noted and an attempt to capture
5712  * the page will be made.  If it is successful, the original callers callback
5713  * will be called with the page to do with it what they please.
5714  *
5715  * There is also an async thread which wakes up to attempt to capture
5716  * pages occasionally which have the capture bit set.  All of the pages which
5717  * need to be captured asynchronously have been inserted into the
5718  * page_capture_hash and thus this thread walks that hash list.  Items in the
5719  * hash have an expiration time so this thread handles that as well by removing
5720  * the item from the hash if it has expired.
5721  *
5722  * Some important things to note are:
5723  * - if the PR_CAPTURE bit is set on a page, then the page is in the
5724  *   page_capture_hash.  The page_capture_hash_head.pchh_mutex is needed
5725  *   to set and clear this bit, and while the lock is held is the only time
5726  *   you can add or remove an entry from the hash.
5727  * - the PR_CAPTURE bit can only be set and cleared while holding the
5728  *   page_capture_hash_head.pchh_mutex
5729  * - the t_flag field of the thread struct is used with the T_CAPTURING
5730  *   flag to prevent recursion while dealing with large pages.
5731  * - pages which need to be retired never expire on the page_capture_hash.
5732  */
5733
5734 static void page_capture_thread(void);
5735 static kthread_t *pc_thread_id;
5736 kcondvar_t pc_cv;
5737 static kmutex_t pc_thread_mutex;
5738 static clock_t pc_thread_shortwait;
5739 static clock_t pc_thread_longwait;
5740 static int pc_thread_retry;
5741
5742 struct page_capture_callback pc_cb[PC_NUM_CALLBACKS];
5743
5744 /* Note that this is a circular linked list */
5745 typedef struct page_capture_hash_bucket {
5746         page_t *pp;
5747         uchar_t szc;
5748         uchar_t pri;
5749         uint_t flags;
5750         clock_t expires;        /* lbolt at which this request expires. */
5751         void *datap;            /* Cached data passed in for callback */
5752         struct page_capture_hash_bucket *next;
5753         struct page_capture_hash_bucket *prev;
5754 } page_capture_hash_bucket_t;
5755
5756 #define PC_PRI_HI       0       /* capture now */
5757 #define PC_PRI_LO       1       /* capture later */
5758 #define PC_NUM_PRI      2
5759
5760 #define PAGE_CAPTURE_PRIO(pp) (PP_ISRAF(pp) ? PC_PRI_LO : PC_PRI_HI)
5761
5762
5763 /*
5764  * Each hash bucket will have it's own mutex and two lists which are:
5765  * active (0):  represents requests which have not been processed by
5766  *              the page_capture async thread yet.
5767  * walked (1):  represents requests which have been processed by the
5768  *              page_capture async thread within it's given walk of this bucket.
5769  *
5770  * These are all needed so that we can synchronize all async page_capture
5771  * events.  When the async thread moves to a new bucket, it will append the
5772  * walked list to the active list and walk each item one at a time, moving it
5773  * from the active list to the walked list.  Thus if there is an async request
5774  * outstanding for a given page, it will always be in one of the two lists.
5775  * New requests will always be added to the active list.
5776  * If we were not able to capture a page before the request expired, we'd free
5777  * up the request structure which would indicate to page_capture that there is
5778  * no longer a need for the given page, and clear the PR_CAPTURE flag if
5779  * possible.
5780  */
5781 typedef struct page_capture_hash_head {
5782         kmutex_t pchh_mutex;
5783         uint_t num_pages[PC_NUM_PRI];
5784         page_capture_hash_bucket_t lists[2]; /* sentinel nodes */
5785 } page_capture_hash_head_t;
5786
5787 #ifdef DEBUG
5788 #define NUM_PAGE_CAPTURE_BUCKETS 4
5789 #else
5790 #define NUM_PAGE_CAPTURE_BUCKETS 64
5791 #endif
5792
5793 page_capture_hash_head_t page_capture_hash[NUM_PAGE_CAPTURE_BUCKETS];
5794
5795 /* for now use a very simple hash based upon the size of a page struct */
5796 #define PAGE_CAPTURE_HASH(pp)   \
5797         ((int)(((uintptr_t)pp >> 7) & (NUM_PAGE_CAPTURE_BUCKETS - 1)))
5798
5799 extern pgcnt_t swapfs_minfree;
5800
5801 int page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap);
5802
5803 /*
5804  * a callback function is required for page capture requests.
5805  */
5806 void
5807 page_capture_register_callback(uint_t index, clock_t duration,
5808     int (*cb_func)(page_t *, void *, uint_t))
5809 {
5810         ASSERT(pc_cb[index].cb_active == 0);
5811         ASSERT(cb_func != NULL);
5812         rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
5813         pc_cb[index].duration = duration;
5814         pc_cb[index].cb_func = cb_func;
5815         pc_cb[index].cb_active = 1;
5816         rw_exit(&pc_cb[index].cb_rwlock);
5817 }
5818
5819 void
5820 page_capture_unregister_callback(uint_t index)
5821 {
5822         int i, j;
5823         struct page_capture_hash_bucket *bp1;
5824         struct page_capture_hash_bucket *bp2;
5825         struct page_capture_hash_bucket *head = NULL;
5826         uint_t flags = (1 << index);
5827
5828         rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
5829         ASSERT(pc_cb[index].cb_active == 1);
5830         pc_cb[index].duration = 0;      /* Paranoia */
5831         pc_cb[index].cb_func = NULL;    /* Paranoia */
5832         pc_cb[index].cb_active = 0;
5833         rw_exit(&pc_cb[index].cb_rwlock);
5834
5835         /*
5836          * Just move all the entries to a private list which we can walk
5837          * through without the need to hold any locks.
5838          * No more requests can get added to the hash lists for this consumer
5839          * as the cb_active field for the callback has been cleared.
5840          */
5841         for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
5842                 mutex_enter(&page_capture_hash[i].pchh_mutex);
5843                 for (j = 0; j < 2; j++) {
5844                         bp1 = page_capture_hash[i].lists[j].next;
5845                         /* walk through all but first (sentinel) element */
5846                         while (bp1 != &page_capture_hash[i].lists[j]) {
5847                                 bp2 = bp1;
5848                                 if (bp2->flags & flags) {
5849                                         bp1 = bp2->next;
5850                                         bp1->prev = bp2->prev;
5851                                         bp2->prev->next = bp1;
5852                                         bp2->next = head;
5853                                         head = bp2;
5854                                         /*
5855                                          * Clear the PR_CAPTURE bit as we
5856                                          * hold appropriate locks here.
5857                                          */
5858                                         page_clrtoxic(head->pp, PR_CAPTURE);
5859                                         page_capture_hash[i].
5860                                             num_pages[bp2->pri]--;
5861                                         continue;
5862                                 }
5863                                 bp1 = bp1->next;
5864                         }
5865                 }
5866                 mutex_exit(&page_capture_hash[i].pchh_mutex);
5867         }
5868
5869         while (head != NULL) {
5870                 bp1 = head;
5871                 head = head->next;
5872                 kmem_free(bp1, sizeof (*bp1));
5873         }
5874 }
5875
5876
5877 /*
5878  * Find pp in the active list and move it to the walked list if it
5879  * exists.
5880  * Note that most often pp should be at the front of the active list
5881  * as it is currently used and thus there is no other sort of optimization
5882  * being done here as this is a linked list data structure.
5883  * Returns 1 on successful move or 0 if page could not be found.
5884  */
5885 static int
5886 page_capture_move_to_walked(page_t *pp)
5887 {
5888         page_capture_hash_bucket_t *bp;
5889         int index;
5890
5891         index = PAGE_CAPTURE_HASH(pp);
5892
5893         mutex_enter(&page_capture_hash[index].pchh_mutex);
5894         bp = page_capture_hash[index].lists[0].next;
5895         while (bp != &page_capture_hash[index].lists[0]) {
5896                 if (bp->pp == pp) {
5897                         /* Remove from old list */
5898                         bp->next->prev = bp->prev;
5899                         bp->prev->next = bp->next;
5900
5901                         /* Add to new list */
5902                         bp->next = page_capture_hash[index].lists[1].next;
5903                         bp->prev = &page_capture_hash[index].lists[1];
5904                         page_capture_hash[index].lists[1].next = bp;
5905                         bp->next->prev = bp;
5906
5907                         /*
5908                          * There is a small probability of page on a free
5909                          * list being retired while being allocated
5910                          * and before P_RAF is set on it. The page may
5911                          * end up marked as high priority request instead
5912                          * of low priority request.
5913                          * If P_RAF page is not marked as low priority request
5914                          * change it to low priority request.
5915                          */
5916                         page_capture_hash[index].num_pages[bp->pri]--;
5917                         bp->pri = PAGE_CAPTURE_PRIO(pp);
5918                         page_capture_hash[index].num_pages[bp->pri]++;
5919                         mutex_exit(&page_capture_hash[index].pchh_mutex);
5920                         return (1);
5921                 }
5922                 bp = bp->next;
5923         }
5924         mutex_exit(&page_capture_hash[index].pchh_mutex);
5925         return (0);
5926 }
5927
5928 /*
5929  * Add a new entry to the page capture hash.  The only case where a new
5930  * entry is not added is when the page capture consumer is no longer registered.
5931  * In this case, we'll silently not add the page to the hash.  We know that
5932  * page retire will always be registered for the case where we are currently
5933  * unretiring a page and thus there are no conflicts.
5934  */
5935 static void
5936 page_capture_add_hash(page_t *pp, uint_t szc, uint_t flags, void *datap)
5937 {
5938         page_capture_hash_bucket_t *bp1;
5939         page_capture_hash_bucket_t *bp2;
5940         int index;
5941         int cb_index;
5942         int i;
5943         uchar_t pri;
5944 #ifdef DEBUG
5945         page_capture_hash_bucket_t *tp1;
5946         int l;
5947 #endif
5948
5949         ASSERT(!(flags & CAPTURE_ASYNC));
5950
5951         bp1 = kmem_alloc(sizeof (struct page_capture_hash_bucket), KM_SLEEP);
5952
5953         bp1->pp = pp;
5954         bp1->szc = szc;
5955         bp1->flags = flags;
5956         bp1->datap = datap;
5957
5958         for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
5959                 if ((flags >> cb_index) & 1) {
5960                         break;
5961                 }
5962         }
5963
5964         ASSERT(cb_index != PC_NUM_CALLBACKS);
5965
5966         rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
5967         if (pc_cb[cb_index].cb_active) {
5968                 if (pc_cb[cb_index].duration == -1) {
5969                         bp1->expires = (clock_t)-1;
5970                 } else {
5971                         bp1->expires = ddi_get_lbolt() +
5972                             pc_cb[cb_index].duration;
5973                 }
5974         } else {
5975                 /* There's no callback registered so don't add to the hash */
5976                 rw_exit(&pc_cb[cb_index].cb_rwlock);
5977                 kmem_free(bp1, sizeof (*bp1));
5978                 return;
5979         }
5980
5981         index = PAGE_CAPTURE_HASH(pp);
5982
5983         /*
5984          * Only allow capture flag to be modified under this mutex.
5985          * Prevents multiple entries for same page getting added.
5986          */
5987         mutex_enter(&page_capture_hash[index].pchh_mutex);
5988
5989         /*
5990          * if not already on the hash, set capture bit and add to the hash
5991          */
5992         if (!(pp->p_toxic & PR_CAPTURE)) {
5993 #ifdef DEBUG
5994                 /* Check for duplicate entries */
5995                 for (l = 0; l < 2; l++) {
5996                         tp1 = page_capture_hash[index].lists[l].next;
5997                         while (tp1 != &page_capture_hash[index].lists[l]) {
5998                                 if (tp1->pp == pp) {
5999                                         panic("page pp 0x%p already on hash "
6000                                             "at 0x%p\n",
6001                                             (void *)pp, (void *)tp1);
6002                                 }
6003                                 tp1 = tp1->next;
6004                         }
6005                 }
6006
6007 #endif
6008                 page_settoxic(pp, PR_CAPTURE);
6009                 pri = PAGE_CAPTURE_PRIO(pp);
6010                 bp1->pri = pri;
6011                 bp1->next = page_capture_hash[index].lists[0].next;
6012                 bp1->prev = &page_capture_hash[index].lists[0];
6013                 bp1->next->prev = bp1;
6014                 page_capture_hash[index].lists[0].next = bp1;
6015                 page_capture_hash[index].num_pages[pri]++;
6016                 if (flags & CAPTURE_RETIRE) {
6017                         page_retire_incr_pend_count(datap);
6018                 }
6019                 mutex_exit(&page_capture_hash[index].pchh_mutex);
6020                 rw_exit(&pc_cb[cb_index].cb_rwlock);
6021                 cv_signal(&pc_cv);
6022                 return;
6023         }
6024
6025         /*
6026          * A page retire request will replace any other request.
6027          * A second physmem request which is for a different process than
6028          * the currently registered one will be dropped as there is
6029          * no way to hold the private data for both calls.
6030          * In the future, once there are more callers, this will have to
6031          * be worked out better as there needs to be private storage for
6032          * at least each type of caller (maybe have datap be an array of
6033          * *void's so that we can index based upon callers index).
6034          */
6035
6036         /* walk hash list to update expire time */
6037         for (i = 0; i < 2; i++) {
6038                 bp2 = page_capture_hash[index].lists[i].next;
6039                 while (bp2 != &page_capture_hash[index].lists[i]) {
6040                         if (bp2->pp == pp) {
6041                                 if (flags & CAPTURE_RETIRE) {
6042                                         if (!(bp2->flags & CAPTURE_RETIRE)) {
6043                                                 page_retire_incr_pend_count(
6044                                                     datap);
6045                                                 bp2->flags = flags;
6046                                                 bp2->expires = bp1->expires;
6047                                                 bp2->datap = datap;
6048                                         }
6049                                 } else {
6050                                         ASSERT(flags & CAPTURE_PHYSMEM);
6051                                         if (!(bp2->flags & CAPTURE_RETIRE) &&
6052                                             (datap == bp2->datap)) {
6053                                                 bp2->expires = bp1->expires;
6054                                         }
6055                                 }
6056                                 mutex_exit(&page_capture_hash[index].
6057                                     pchh_mutex);
6058                                 rw_exit(&pc_cb[cb_index].cb_rwlock);
6059                                 kmem_free(bp1, sizeof (*bp1));
6060                                 return;
6061                         }
6062                         bp2 = bp2->next;
6063                 }
6064         }
6065
6066         /*
6067          * the PR_CAPTURE flag is protected by the page_capture_hash mutexes
6068          * and thus it either has to be set or not set and can't change
6069          * while holding the mutex above.
6070          */
6071         panic("page_capture_add_hash, PR_CAPTURE flag set on pp %p\n",
6072             (void *)pp);
6073 }
6074
6075 /*
6076  * We have a page in our hands, lets try and make it ours by turning
6077  * it into a clean page like it had just come off the freelists.
6078  *
6079  * Returns 0 on success, with the page still EXCL locked.
6080  * On failure, the page will be unlocked, and returns EAGAIN
6081  */
6082 static int
6083 page_capture_clean_page(page_t *pp)
6084 {
6085         page_t *newpp;
6086         int skip_unlock = 0;
6087         spgcnt_t count;
6088         page_t *tpp;
6089         int ret = 0;
6090         int extra;
6091
6092         ASSERT(PAGE_EXCL(pp));
6093         ASSERT(!PP_RETIRED(pp));
6094         ASSERT(curthread->t_flag & T_CAPTURING);
6095
6096         if (PP_ISFREE(pp)) {
6097                 if (!page_reclaim(pp, NULL)) {
6098                         skip_unlock = 1;
6099                         ret = EAGAIN;
6100                         goto cleanup;
6101                 }
6102                 ASSERT(pp->p_szc == 0);
6103                 if (pp->p_vnode != NULL) {
6104                         /*
6105                          * Since this page came from the
6106                          * cachelist, we must destroy the
6107                          * old vnode association.
6108                          */
6109                         page_hashout(pp, false);
6110                 }
6111                 goto cleanup;
6112         }
6113
6114         /*
6115          * If we know page_relocate will fail, skip it
6116          * It could still fail due to a UE on another page but we
6117          * can't do anything about that.
6118          */
6119         if (pp->p_toxic & PR_UE) {
6120                 goto skip_relocate;
6121         }
6122
6123         /*
6124          * It's possible that pages can not have a vnode as fsflush comes
6125          * through and cleans up these pages.  It's ugly but that's how it is.
6126          */
6127         if (pp->p_vnode == NULL) {
6128                 goto skip_relocate;
6129         }
6130
6131         /*
6132          * Page was not free, so lets try to relocate it.
6133          * page_relocate only works with root pages, so if this is not a root
6134          * page, we need to demote it to try and relocate it.
6135          * Unfortunately this is the best we can do right now.
6136          */
6137         newpp = NULL;
6138         if ((pp->p_szc > 0) && (pp != PP_PAGEROOT(pp))) {
6139                 if (page_try_demote_pages(pp) == 0) {
6140                         ret = EAGAIN;
6141                         goto cleanup;
6142                 }
6143         }
6144         ret = page_relocate(&pp, &newpp, 1, 0, &count, NULL);
6145         if (ret == 0) {
6146                 page_t *npp;
6147                 /* unlock the new page(s) */
6148                 while (count-- > 0) {
6149                         ASSERT(newpp != NULL);
6150                         npp = newpp;
6151                         page_sub(&newpp, npp);
6152                         page_unlock(npp);
6153                 }
6154                 ASSERT(newpp == NULL);
6155                 /*
6156                  * Check to see if the page we have is too large.
6157                  * If so, demote it freeing up the extra pages.
6158                  */
6159                 if (pp->p_szc > 0) {
6160                         /* For now demote extra pages to szc == 0 */
6161                         extra = page_get_pagecnt(pp->p_szc) - 1;
6162                         while (extra > 0) {
6163                                 tpp = pp->p_next;
6164                                 page_sub(&pp, tpp);
6165                                 tpp->p_szc = 0;
6166                                 page_free(tpp, 1);
6167                                 extra--;
6168                         }
6169                         /* Make sure to set our page to szc 0 as well */
6170                         ASSERT(pp->p_next == pp && pp->p_prev == pp);
6171                         pp->p_szc = 0;
6172                 }
6173                 goto cleanup;
6174         } else if (ret == EIO) {
6175                 ret = EAGAIN;
6176                 goto cleanup;
6177         } else {
6178                 /*
6179                  * Need to reset return type as we failed to relocate the page
6180                  * but that does not mean that some of the next steps will not
6181                  * work.
6182                  */
6183                 ret = 0;
6184         }
6185
6186 skip_relocate:
6187
6188         if (pp->p_szc > 0) {
6189                 if (page_try_demote_pages(pp) == 0) {
6190                         ret = EAGAIN;
6191                         goto cleanup;
6192                 }
6193         }
6194
6195         ASSERT(pp->p_szc == 0);
6196
6197         if (hat_ismod(pp)) {
6198                 ret = EAGAIN;
6199                 goto cleanup;
6200         }
6201         if (PP_ISKAS(pp)) {
6202                 ret = EAGAIN;
6203                 goto cleanup;
6204         }
6205         if (pp->p_lckcnt || pp->p_cowcnt) {
6206                 ret = EAGAIN;
6207                 goto cleanup;
6208         }
6209
6210         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
6211         ASSERT(!hat_page_is_mapped(pp));
6212
6213         if (hat_ismod(pp)) {
6214                 /*
6215                  * This is a semi-odd case as the page is now modified but not
6216                  * mapped as we just unloaded the mappings above.
6217                  */
6218                 ret = EAGAIN;
6219                 goto cleanup;
6220         }
6221         if (pp->p_vnode != NULL) {
6222                 page_hashout(pp, false);
6223         }
6224
6225         /*
6226          * At this point, the page should be in a clean state and
6227          * we can do whatever we want with it.
6228          */
6229
6230 cleanup:
6231         if (ret != 0) {
6232                 if (!skip_unlock) {
6233                         page_unlock(pp);
6234                 }
6235         } else {
6236                 ASSERT(pp->p_szc == 0);
6237                 ASSERT(PAGE_EXCL(pp));
6238
6239                 pp->p_next = pp;
6240                 pp->p_prev = pp;
6241         }
6242         return (ret);
6243 }
6244
6245 /*
6246  * Various callers of page_trycapture() can have different restrictions upon
6247  * what memory they have access to.
6248  * Returns 0 on success, with the following error codes on failure:
6249  *      EPERM - The requested page is long term locked, and thus repeated
6250  *              requests to capture this page will likely fail.
6251  *      ENOMEM - There was not enough free memory in the system to safely
6252  *              map the requested page.
6253  *      ENOENT - The requested page was inside the kernel cage, and the
6254  *              PHYSMEM_CAGE flag was not set.
6255  */
6256 int
6257 page_capture_pre_checks(page_t *pp, uint_t flags)
6258 {
6259         ASSERT(pp != NULL);
6260
6261         if (PP_ISKAS(pp)) {
6262                 return (EPERM);
6263         }
6264
6265         /* only physmem currently has the restrictions checked below */
6266         if (!(flags & CAPTURE_PHYSMEM)) {
6267                 return (0);
6268         }
6269
6270         if (availrmem < swapfs_minfree) {
6271                 /*
6272                  * We won't try to capture this page as we are
6273                  * running low on memory.
6274                  */
6275                 return (ENOMEM);
6276         }
6277         return (0);
6278 }
6279
6280 /*
6281  * Once we have a page in our mits, go ahead and complete the capture
6282  * operation.
6283  * Returns 1 on failure where page is no longer needed
6284  * Returns 0 on success
6285  * Returns -1 if there was a transient failure.
6286  * Failure cases must release the SE_EXCL lock on pp (usually via page_free).
6287  */
6288 int
6289 page_capture_take_action(page_t *pp, uint_t flags, void *datap)
6290 {
6291         int cb_index;
6292         int ret = 0;
6293         page_capture_hash_bucket_t *bp1;
6294         page_capture_hash_bucket_t *bp2;
6295         int index;
6296         int found = 0;
6297         int i;
6298
6299         ASSERT(PAGE_EXCL(pp));
6300         ASSERT(curthread->t_flag & T_CAPTURING);
6301
6302         for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6303                 if ((flags >> cb_index) & 1) {
6304                         break;
6305                 }
6306         }
6307         ASSERT(cb_index < PC_NUM_CALLBACKS);
6308
6309         /*
6310          * Remove the entry from the page_capture hash, but don't free it yet
6311          * as we may need to put it back.
6312          * Since we own the page at this point in time, we should find it
6313          * in the hash if this is an ASYNC call.  If we don't it's likely
6314          * that the page_capture_async() thread decided that this request
6315          * had expired, in which case we just continue on.
6316          */
6317         if (flags & CAPTURE_ASYNC) {
6318
6319                 index = PAGE_CAPTURE_HASH(pp);
6320
6321                 mutex_enter(&page_capture_hash[index].pchh_mutex);
6322                 for (i = 0; i < 2 && !found; i++) {
6323                         bp1 = page_capture_hash[index].lists[i].next;
6324                         while (bp1 != &page_capture_hash[index].lists[i]) {
6325                                 if (bp1->pp == pp) {
6326                                         bp1->next->prev = bp1->prev;
6327                                         bp1->prev->next = bp1->next;
6328                                         page_capture_hash[index].
6329                                             num_pages[bp1->pri]--;
6330                                         page_clrtoxic(pp, PR_CAPTURE);
6331                                         found = 1;
6332                                         break;
6333                                 }
6334                                 bp1 = bp1->next;
6335                         }
6336                 }
6337                 mutex_exit(&page_capture_hash[index].pchh_mutex);
6338         }
6339
6340         /* Synchronize with the unregister func. */
6341         rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6342         if (!pc_cb[cb_index].cb_active) {
6343                 page_free(pp, 1);
6344                 rw_exit(&pc_cb[cb_index].cb_rwlock);
6345                 if (found) {
6346                         kmem_free(bp1, sizeof (*bp1));
6347                 }
6348                 return (1);
6349         }
6350
6351         /*
6352          * We need to remove the entry from the page capture hash and turn off
6353          * the PR_CAPTURE bit before calling the callback.  We'll need to cache
6354          * the entry here, and then based upon the return value, cleanup
6355          * appropriately or re-add it to the hash, making sure that someone else
6356          * hasn't already done so.
6357          * It should be rare for the callback to fail and thus it's ok for
6358          * the failure path to be a bit complicated as the success path is
6359          * cleaner and the locking rules are easier to follow.
6360          */
6361
6362         ret = pc_cb[cb_index].cb_func(pp, datap, flags);
6363
6364         rw_exit(&pc_cb[cb_index].cb_rwlock);
6365
6366         /*
6367          * If this was an ASYNC request, we need to cleanup the hash if the
6368          * callback was successful or if the request was no longer valid.
6369          * For non-ASYNC requests, we return failure to map and the caller
6370          * will take care of adding the request to the hash.
6371          * Note also that the callback itself is responsible for the page
6372          * at this point in time in terms of locking ...  The most common
6373          * case for the failure path should just be a page_free.
6374          */
6375         if (ret >= 0) {
6376                 if (found) {
6377                         if (bp1->flags & CAPTURE_RETIRE) {
6378                                 page_retire_decr_pend_count(datap);
6379                         }
6380                         kmem_free(bp1, sizeof (*bp1));
6381                 }
6382                 return (ret);
6383         }
6384         if (!found) {
6385                 return (ret);
6386         }
6387
6388         ASSERT(flags & CAPTURE_ASYNC);
6389
6390         /*
6391          * Check for expiration time first as we can just free it up if it's
6392          * expired.
6393          */
6394         if (ddi_get_lbolt() > bp1->expires && bp1->expires != -1) {
6395                 kmem_free(bp1, sizeof (*bp1));
6396                 return (ret);
6397         }
6398
6399         /*
6400          * The callback failed and there used to be an entry in the hash for
6401          * this page, so we need to add it back to the hash.
6402          */
6403         mutex_enter(&page_capture_hash[index].pchh_mutex);
6404         if (!(pp->p_toxic & PR_CAPTURE)) {
6405                 /* just add bp1 back to head of walked list */
6406                 page_settoxic(pp, PR_CAPTURE);
6407                 bp1->next = page_capture_hash[index].lists[1].next;
6408                 bp1->prev = &page_capture_hash[index].lists[1];
6409                 bp1->next->prev = bp1;
6410                 bp1->pri = PAGE_CAPTURE_PRIO(pp);
6411                 page_capture_hash[index].lists[1].next = bp1;
6412                 page_capture_hash[index].num_pages[bp1->pri]++;
6413                 mutex_exit(&page_capture_hash[index].pchh_mutex);
6414                 return (ret);
6415         }
6416
6417         /*
6418          * Otherwise there was a new capture request added to list
6419          * Need to make sure that our original data is represented if
6420          * appropriate.
6421          */
6422         for (i = 0; i < 2; i++) {
6423                 bp2 = page_capture_hash[index].lists[i].next;
6424                 while (bp2 != &page_capture_hash[index].lists[i]) {
6425                         if (bp2->pp == pp) {
6426                                 if (bp1->flags & CAPTURE_RETIRE) {
6427                                         if (!(bp2->flags & CAPTURE_RETIRE)) {
6428                                                 bp2->szc = bp1->szc;
6429                                                 bp2->flags = bp1->flags;
6430                                                 bp2->expires = bp1->expires;
6431                                                 bp2->datap = bp1->datap;
6432                                         }
6433                                 } else {
6434                                         ASSERT(bp1->flags & CAPTURE_PHYSMEM);
6435                                         if (!(bp2->flags & CAPTURE_RETIRE)) {
6436                                                 bp2->szc = bp1->szc;
6437                                                 bp2->flags = bp1->flags;
6438                                                 bp2->expires = bp1->expires;
6439                                                 bp2->datap = bp1->datap;
6440                                         }
6441                                 }
6442                                 page_capture_hash[index].num_pages[bp2->pri]--;
6443                                 bp2->pri = PAGE_CAPTURE_PRIO(pp);
6444                                 page_capture_hash[index].num_pages[bp2->pri]++;
6445                                 mutex_exit(&page_capture_hash[index].
6446                                     pchh_mutex);
6447                                 kmem_free(bp1, sizeof (*bp1));
6448                                 return (ret);
6449                         }
6450                         bp2 = bp2->next;
6451                 }
6452         }
6453         panic("PR_CAPTURE set but not on hash for pp 0x%p\n", (void *)pp);
6454         /*NOTREACHED*/
6455 }
6456
6457 /*
6458  * Try to capture the given page for the caller specified in the flags
6459  * parameter.  The page will either be captured and handed over to the
6460  * appropriate callback, or will be queued up in the page capture hash
6461  * to be captured asynchronously.
6462  * If the current request is due to an async capture, the page must be
6463  * exclusively locked before calling this function.
6464  * Currently szc must be 0 but in the future this should be expandable to
6465  * other page sizes.
6466  * Returns 0 on success, with the following error codes on failure:
6467  *      EPERM - The requested page is long term locked, and thus repeated
6468  *              requests to capture this page will likely fail.
6469  *      ENOMEM - There was not enough free memory in the system to safely
6470  *              map the requested page.
6471  *      ENOENT - The requested page was inside the kernel cage, and the
6472  *              CAPTURE_GET_CAGE flag was not set.
6473  *      EAGAIN - The requested page could not be capturead at this point in
6474  *              time but future requests will likely work.
6475  *      EBUSY - The requested page is retired and the CAPTURE_GET_RETIRED flag
6476  *              was not set.
6477  */
6478 int
6479 page_itrycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
6480 {
6481         int ret;
6482         int cb_index;
6483
6484         if (flags & CAPTURE_ASYNC) {
6485                 ASSERT(PAGE_EXCL(pp));
6486                 goto async;
6487         }
6488
6489         /* Make sure there's enough availrmem ... */
6490         ret = page_capture_pre_checks(pp, flags);
6491         if (ret != 0) {
6492                 return (ret);
6493         }
6494
6495         if (!page_trylock(pp, SE_EXCL)) {
6496                 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6497                         if ((flags >> cb_index) & 1) {
6498                                 break;
6499                         }
6500                 }
6501                 ASSERT(cb_index < PC_NUM_CALLBACKS);
6502                 ret = EAGAIN;
6503                 /* Special case for retired pages */
6504                 if (PP_RETIRED(pp)) {
6505                         if (flags & CAPTURE_GET_RETIRED) {
6506                                 if (!page_unretire_pp(pp, PR_UNR_TEMP)) {
6507                                         /*
6508                                          * Need to set capture bit and add to
6509                                          * hash so that the page will be
6510                                          * retired when freed.
6511                                          */
6512                                         page_capture_add_hash(pp, szc,
6513                                             CAPTURE_RETIRE, NULL);
6514                                         ret = 0;
6515                                         goto own_page;
6516                                 }
6517                         } else {
6518                                 return (EBUSY);
6519                         }
6520                 }
6521                 page_capture_add_hash(pp, szc, flags, datap);
6522                 return (ret);
6523         }
6524
6525 async:
6526         ASSERT(PAGE_EXCL(pp));
6527
6528         /* Need to check for physmem async requests that availrmem is sane */
6529         if ((flags & (CAPTURE_ASYNC | CAPTURE_PHYSMEM)) ==
6530             (CAPTURE_ASYNC | CAPTURE_PHYSMEM) &&
6531             (availrmem < swapfs_minfree)) {
6532                 page_unlock(pp);
6533                 return (ENOMEM);
6534         }
6535
6536         ret = page_capture_clean_page(pp);
6537
6538         if (ret != 0) {
6539                 /* We failed to get the page, so lets add it to the hash */
6540                 if (!(flags & CAPTURE_ASYNC)) {
6541                         page_capture_add_hash(pp, szc, flags, datap);
6542                 }
6543                 return (ret);
6544         }
6545
6546 own_page:
6547         ASSERT(PAGE_EXCL(pp));
6548         ASSERT(pp->p_szc == 0);
6549
6550         /* Call the callback */
6551         ret = page_capture_take_action(pp, flags, datap);
6552
6553         if (ret == 0) {
6554                 return (0);
6555         }
6556
6557         /*
6558          * Note that in the failure cases from page_capture_take_action, the
6559          * EXCL lock will have already been dropped.
6560          */
6561         if ((ret == -1) && (!(flags & CAPTURE_ASYNC))) {
6562                 page_capture_add_hash(pp, szc, flags, datap);
6563         }
6564         return (EAGAIN);
6565 }
6566
6567 int
6568 page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
6569 {
6570         int ret;
6571
6572         curthread->t_flag |= T_CAPTURING;
6573         ret = page_itrycapture(pp, szc, flags, datap);
6574         curthread->t_flag &= ~T_CAPTURING; /* xor works as we know its set */
6575         return (ret);
6576 }
6577
6578 /*
6579  * When unlocking a page which has the PR_CAPTURE bit set, this routine
6580  * gets called to try and capture the page.
6581  */
6582 void
6583 page_unlock_capture(page_t *pp)
6584 {
6585         page_capture_hash_bucket_t *bp;
6586         int index;
6587         int i;
6588         uint_t szc;
6589         uint_t flags = 0;
6590         void *datap;
6591         kmutex_t *mp;
6592         extern vnode_t retired_pages;
6593
6594         /*
6595          * We need to protect against a possible deadlock here where we own
6596          * the vnode page hash mutex and want to acquire it again as there
6597          * are locations in the code, where we unlock a page while holding
6598          * the mutex which can lead to the page being captured and eventually
6599          * end up here.  As we may be hashing out the old page and hashing into
6600          * the retire vnode, we need to make sure we don't own them.
6601          * Other callbacks who do hash operations also need to make sure that
6602          * before they hashin to a vnode that they do not currently own the
6603          * vphm mutex otherwise there will be a panic.
6604          */
6605         if (VMOBJECT_LOCKED(&retired_pages.v_object)) {
6606                 page_unlock_nocapture(pp);
6607                 return;
6608         }
6609         if (pp->p_vnode != NULL && VMOBJECT_LOCKED(&pp->p_vnode->v_object)) {
6610                 page_unlock_nocapture(pp);
6611                 return;
6612         }
6613
6614         index = PAGE_CAPTURE_HASH(pp);
6615
6616         mp = &page_capture_hash[index].pchh_mutex;
6617         mutex_enter(mp);
6618         for (i = 0; i < 2; i++) {
6619                 bp = page_capture_hash[index].lists[i].next;
6620                 while (bp != &page_capture_hash[index].lists[i]) {
6621                         if (bp->pp == pp) {
6622                                 szc = bp->szc;
6623                                 flags = bp->flags | CAPTURE_ASYNC;
6624                                 datap = bp->datap;
6625                                 mutex_exit(mp);
6626                                 (void) page_trycapture(pp, szc, flags, datap);
6627                                 return;
6628                         }
6629                         bp = bp->next;
6630                 }
6631         }
6632
6633         /* Failed to find page in hash so clear flags and unlock it. */
6634         page_clrtoxic(pp, PR_CAPTURE);
6635         page_unlock(pp);
6636
6637         mutex_exit(mp);
6638 }
6639
6640 void
6641 page_capture_init()
6642 {
6643         int i;
6644         for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
6645                 page_capture_hash[i].lists[0].next =
6646                     &page_capture_hash[i].lists[0];
6647                 page_capture_hash[i].lists[0].prev =
6648                     &page_capture_hash[i].lists[0];
6649                 page_capture_hash[i].lists[1].next =
6650                     &page_capture_hash[i].lists[1];
6651                 page_capture_hash[i].lists[1].prev =
6652                     &page_capture_hash[i].lists[1];
6653         }
6654
6655         pc_thread_shortwait = 23 * hz;
6656         pc_thread_longwait = 1201 * hz;
6657         pc_thread_retry = 3;
6658         mutex_init(&pc_thread_mutex, NULL, MUTEX_DEFAULT, NULL);
6659         cv_init(&pc_cv, NULL, CV_DEFAULT, NULL);
6660         pc_thread_id = thread_create(NULL, 0, page_capture_thread, NULL, 0, &p0,
6661             TS_RUN, minclsyspri);
6662 }
6663
6664 /*
6665  * It is necessary to scrub any failing pages prior to reboot in order to
6666  * prevent a latent error trap from occurring on the next boot.
6667  */
6668 void
6669 page_retire_mdboot()
6670 {
6671         page_t *pp;
6672         int i, j;
6673         page_capture_hash_bucket_t *bp;
6674         uchar_t pri;
6675
6676         /* walk lists looking for pages to scrub */
6677         for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
6678                 for (pri = 0; pri < PC_NUM_PRI; pri++) {
6679                         if (page_capture_hash[i].num_pages[pri] != 0) {
6680                                 break;
6681                         }
6682                 }
6683                 if (pri == PC_NUM_PRI)
6684                         continue;
6685
6686                 mutex_enter(&page_capture_hash[i].pchh_mutex);
6687
6688                 for (j = 0; j < 2; j++) {
6689                         bp = page_capture_hash[i].lists[j].next;
6690                         while (bp != &page_capture_hash[i].lists[j]) {
6691                                 pp = bp->pp;
6692                                 if (PP_TOXIC(pp)) {
6693                                         if (page_trylock(pp, SE_EXCL)) {
6694                                                 PP_CLRFREE(pp);
6695                                                 pagescrub(pp, 0, PAGESIZE);
6696                                                 page_unlock(pp);
6697                                         }
6698                                 }
6699                                 bp = bp->next;
6700                         }
6701                 }
6702                 mutex_exit(&page_capture_hash[i].pchh_mutex);
6703         }
6704 }
6705
6706 /*
6707  * Walk the page_capture_hash trying to capture pages and also cleanup old
6708  * entries which have expired.
6709  */
6710 void
6711 page_capture_async()
6712 {
6713         page_t *pp;
6714         int i;
6715         int ret;
6716         page_capture_hash_bucket_t *bp1, *bp2;
6717         uint_t szc;
6718         uint_t flags;
6719         void *datap;
6720         uchar_t pri;
6721
6722         /* If there are outstanding pages to be captured, get to work */
6723         for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
6724                 for (pri = 0; pri < PC_NUM_PRI; pri++) {
6725                         if (page_capture_hash[i].num_pages[pri] != 0)
6726                                 break;
6727                 }
6728                 if (pri == PC_NUM_PRI)
6729                         continue;
6730
6731                 /* Append list 1 to list 0 and then walk through list 0 */
6732                 mutex_enter(&page_capture_hash[i].pchh_mutex);
6733                 bp1 = &page_capture_hash[i].lists[1];
6734                 bp2 = bp1->next;
6735                 if (bp1 != bp2) {
6736                         bp1->prev->next = page_capture_hash[i].lists[0].next;
6737                         bp2->prev = &page_capture_hash[i].lists[0];
6738                         page_capture_hash[i].lists[0].next->prev = bp1->prev;
6739                         page_capture_hash[i].lists[0].next = bp2;
6740                         bp1->next = bp1;
6741                         bp1->prev = bp1;
6742                 }
6743
6744                 /* list[1] will be empty now */
6745
6746                 bp1 = page_capture_hash[i].lists[0].next;
6747                 while (bp1 != &page_capture_hash[i].lists[0]) {
6748                         /* Check expiration time */
6749                         if ((ddi_get_lbolt() > bp1->expires &&
6750                             bp1->expires != -1) ||
6751                             page_deleted(bp1->pp)) {
6752                                 page_capture_hash[i].lists[0].next = bp1->next;
6753                                 bp1->next->prev =
6754                                     &page_capture_hash[i].lists[0];
6755                                 page_capture_hash[i].num_pages[bp1->pri]--;
6756
6757                                 /*
6758                                  * We can safely remove the PR_CAPTURE bit
6759                                  * without holding the EXCL lock on the page
6760                                  * as the PR_CAPTURE bit requres that the
6761                                  * page_capture_hash[].pchh_mutex be held
6762                                  * to modify it.
6763                                  */
6764                                 page_clrtoxic(bp1->pp, PR_CAPTURE);
6765                                 mutex_exit(&page_capture_hash[i].pchh_mutex);
6766                                 kmem_free(bp1, sizeof (*bp1));
6767                                 mutex_enter(&page_capture_hash[i].pchh_mutex);
6768                                 bp1 = page_capture_hash[i].lists[0].next;
6769                                 continue;
6770                         }
6771                         pp = bp1->pp;
6772                         szc = bp1->szc;
6773                         flags = bp1->flags;
6774                         datap = bp1->datap;
6775                         mutex_exit(&page_capture_hash[i].pchh_mutex);
6776                         if (page_trylock(pp, SE_EXCL)) {
6777                                 ret = page_trycapture(pp, szc,
6778                                     flags | CAPTURE_ASYNC, datap);
6779                         } else {
6780                                 ret = 1;        /* move to walked hash */
6781                         }
6782
6783                         if (ret != 0) {
6784                                 /* Move to walked hash */
6785                                 (void) page_capture_move_to_walked(pp);
6786                         }
6787                         mutex_enter(&page_capture_hash[i].pchh_mutex);
6788                         bp1 = page_capture_hash[i].lists[0].next;
6789                 }
6790
6791                 mutex_exit(&page_capture_hash[i].pchh_mutex);
6792         }
6793 }
6794
6795 /*
6796  * This function is called by the page_capture_thread, and is needed in
6797  * in order to initiate aio cleanup, so that pages used in aio
6798  * will be unlocked and subsequently retired by page_capture_thread.
6799  */
6800 static int
6801 do_aio_cleanup(void)
6802 {
6803         proc_t *procp;
6804         int (*aio_cleanup_dr_delete_memory)(proc_t *);
6805         int cleaned = 0;
6806
6807         if (modload("sys", "kaio") == -1) {
6808                 cmn_err(CE_WARN, "do_aio_cleanup: cannot load kaio");
6809                 return (0);
6810         }
6811         /*
6812          * We use the aio_cleanup_dr_delete_memory function to
6813          * initiate the actual clean up; this function will wake
6814          * up the per-process aio_cleanup_thread.
6815          */
6816         aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
6817             modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
6818         if (aio_cleanup_dr_delete_memory == NULL) {
6819                 cmn_err(CE_WARN,
6820             "aio_cleanup_dr_delete_memory not found in kaio");
6821                 return (0);
6822         }
6823         mutex_enter(&pidlock);
6824         for (procp = practive; (procp != NULL); procp = procp->p_next) {
6825                 mutex_enter(&procp->p_lock);
6826                 if (procp->p_aio != NULL) {
6827                         /* cleanup proc's outstanding kaio */
6828                         cleaned += (*aio_cleanup_dr_delete_memory)(procp);
6829                 }
6830                 mutex_exit(&procp->p_lock);
6831         }
6832         mutex_exit(&pidlock);
6833         return (cleaned);
6834 }
6835
6836 /*
6837  * helper function for page_capture_thread
6838  */
6839 static void
6840 page_capture_handle_outstanding(void)
6841 {
6842         int ntry;
6843
6844         /* Reap pages before attempting capture pages */
6845         kmem_reap();
6846
6847         if ((page_retire_pend_count() > page_retire_pend_kas_count()) &&
6848             hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL)) {
6849                 /*
6850                  * Note: Purging only for platforms that support
6851                  * ISM hat_pageunload() - mainly SPARC. On x86/x64
6852                  * platforms ISM pages SE_SHARED locked until destroyed.
6853                  */
6854
6855                 /* disable and purge seg_pcache */
6856                 (void) seg_p_disable();
6857                 for (ntry = 0; ntry < pc_thread_retry; ntry++) {
6858                         if (!page_retire_pend_count())
6859                                 break;
6860                         if (do_aio_cleanup()) {
6861                                 /*
6862                                  * allow the apps cleanup threads
6863                                  * to run
6864                                  */
6865                                 delay(pc_thread_shortwait);
6866                         }
6867                         page_capture_async();
6868                 }
6869                 /* reenable seg_pcache */
6870                 seg_p_enable();
6871
6872                 /* completed what can be done.  break out */
6873                 return;
6874         }
6875
6876         /*
6877          * For kernel pages and/or unsupported HAT_DYNAMIC_ISM_UNMAP, reap
6878          * and then attempt to capture.
6879          */
6880         seg_preap();
6881         page_capture_async();
6882 }
6883
6884 /*
6885  * The page_capture_thread loops forever, looking to see if there are
6886  * pages still waiting to be captured.
6887  */
6888 static void
6889 page_capture_thread(void)
6890 {
6891         callb_cpr_t c;
6892         int i;
6893         int high_pri_pages;
6894         int low_pri_pages;
6895         clock_t timeout;
6896
6897         CALLB_CPR_INIT(&c, &pc_thread_mutex, callb_generic_cpr, "page_capture");
6898
6899         mutex_enter(&pc_thread_mutex);
6900         for (;;) {
6901                 high_pri_pages = 0;
6902                 low_pri_pages = 0;
6903                 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
6904                         high_pri_pages +=
6905                             page_capture_hash[i].num_pages[PC_PRI_HI];
6906                         low_pri_pages +=
6907                             page_capture_hash[i].num_pages[PC_PRI_LO];
6908                 }
6909
6910                 timeout = pc_thread_longwait;
6911                 if (high_pri_pages != 0) {
6912                         timeout = pc_thread_shortwait;
6913                         page_capture_handle_outstanding();
6914                 } else if (low_pri_pages != 0) {
6915                         page_capture_async();
6916                 }
6917                 CALLB_CPR_SAFE_BEGIN(&c);
6918                 (void) cv_reltimedwait(&pc_cv, &pc_thread_mutex,
6919                     timeout, TR_CLOCK_TICK);
6920                 CALLB_CPR_SAFE_END(&c, &pc_thread_mutex);
6921         }
6922         /*NOTREACHED*/
6923 }
6924 /*
6925  * Attempt to locate a bucket that has enough pages to satisfy the request.
6926  * The initial check is done without the lock to avoid unneeded contention.
6927  * The function returns 1 if enough pages were found, else 0 if it could not
6928  * find enough pages in a bucket.
6929  */
6930 static int
6931 pcf_decrement_bucket(pgcnt_t npages)
6932 {
6933         struct pcf      *p;
6934         struct pcf      *q;
6935         int i;
6936
6937         p = &pcf[PCF_INDEX()];
6938         q = &pcf[pcf_fanout];
6939         for (i = 0; i < pcf_fanout; i++) {
6940                 if (p->pcf_count > npages) {
6941                         /*
6942                          * a good one to try.
6943                          */
6944                         mutex_enter(&p->pcf_lock);
6945                         if (p->pcf_count > npages) {
6946                                 p->pcf_count -= (uint_t)npages;
6947                                 /*
6948                                  * freemem is not protected by any lock.
6949                                  * Thus, we cannot have any assertion
6950                                  * containing freemem here.
6951                                  */
6952                                 freemem -= npages;
6953                                 mutex_exit(&p->pcf_lock);
6954                                 return (1);
6955                         }
6956                         mutex_exit(&p->pcf_lock);
6957                 }
6958                 p++;
6959                 if (p >= q) {
6960                         p = pcf;
6961                 }
6962         }
6963         return (0);
6964 }
6965
6966 /*
6967  * Arguments:
6968  *      pcftotal_ret:   If the value is not NULL and we have walked all the
6969  *                      buckets but did not find enough pages then it will
6970  *                      be set to the total number of pages in all the pcf
6971  *                      buckets.
6972  *      npages:         Is the number of pages we have been requested to
6973  *                      find.
6974  *      unlock:         If set to 0 we will leave the buckets locked if the
6975  *                      requested number of pages are not found.
6976  *
6977  * Go and try to satisfy the page request  from any number of buckets.
6978  * This can be a very expensive operation as we have to lock the buckets
6979  * we are checking (and keep them locked), starting at bucket 0.
6980  *
6981  * The function returns 1 if enough pages were found, else 0 if it could not
6982  * find enough pages in the buckets.
6983  *
6984  */
6985 static int
6986 pcf_decrement_multiple(pgcnt_t *pcftotal_ret, pgcnt_t npages, int unlock)
6987 {
6988         struct pcf      *p;
6989         pgcnt_t pcftotal;
6990         int i;
6991
6992         p = pcf;
6993         /* try to collect pages from several pcf bins */
6994         for (pcftotal = 0, i = 0; i < pcf_fanout; i++) {
6995                 mutex_enter(&p->pcf_lock);
6996                 pcftotal += p->pcf_count;
6997                 if (pcftotal >= npages) {
6998                         /*
6999                          * Wow!  There are enough pages laying around
7000                          * to satisfy the request.  Do the accounting,
7001                          * drop the locks we acquired, and go back.
7002                          *
7003                          * freemem is not protected by any lock. So,
7004                          * we cannot have any assertion containing
7005                          * freemem.
7006                          */
7007                         freemem -= npages;
7008                         while (p >= pcf) {
7009                                 if (p->pcf_count <= npages) {
7010                                         npages -= p->pcf_count;
7011                                         p->pcf_count = 0;
7012                                 } else {
7013                                         p->pcf_count -= (uint_t)npages;
7014                                         npages = 0;
7015                                 }
7016                                 mutex_exit(&p->pcf_lock);
7017                                 p--;
7018                         }
7019                         ASSERT(npages == 0);
7020                         return (1);
7021                 }
7022                 p++;
7023         }
7024         if (unlock) {
7025                 /* failed to collect pages - release the locks */
7026                 while (--p >= pcf) {
7027                         mutex_exit(&p->pcf_lock);
7028                 }
7029         }
7030         if (pcftotal_ret != NULL)
7031                 *pcftotal_ret = pcftotal;
7032         return (0);
7033 }
7034
7035 static int
7036 vmobject_cmp(const void *va, const void *vb)
7037 {
7038         const page_t *a = va;
7039         const page_t *b = vb;
7040
7041         if (a->p_offset > b->p_offset)
7042                 return (1);
7043         if (a->p_offset < b->p_offset)
7044                 return (-1);
7045         return (0);
7046 }
7047
7048 void
7049 vmobject_init(struct vmobject *obj, struct vnode *vnode)
7050 {
7051         avl_create(&obj->tree, vmobject_cmp, sizeof (struct page),
7052             offsetof(struct page, p_object_node));
7053         list_create(&obj->list, sizeof (struct page),
7054             offsetof(struct page, p_list.vnode));
7055         mutex_init(&obj->lock, NULL, MUTEX_DEFAULT, NULL);
7056
7057         obj->vnode = vnode;
7058 }
7059
7060 void
7061 vmobject_fini(struct vmobject *obj)
7062 {
7063         mutex_destroy(&obj->lock);
7064         list_destroy(&obj->list);
7065         avl_destroy(&obj->tree);
7066 }