kernel/vm/vm_pagelist.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24
  25 /*
  26  * Copyright 2012 Joyent, Inc.  All rights reserved.
  27  */
  28
  29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  30 /*      All Rights Reserved   */
  31
  32 /*
  33  * Portions of this source code were derived from Berkeley 4.3 BSD
  34  * under license from the Regents of the University of California.
  35  */
  36
  37
  38 /*
  39  * This file contains common functions to access and manage the page lists.
  40  * Many of these routines originated from platform dependent modules
  41  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
  42  * a platform independent manner.
  43  *
  44  * vm/vm_dep.h provides for platform specific support.
  45  */
  46
  47 #include <sys/types.h>
  48 #include <sys/debug.h>
  49 #include <sys/cmn_err.h>
  50 #include <sys/systm.h>
  51 #include <sys/atomic.h>
  52 #include <sys/sysmacros.h>
  53 #include <vm/as.h>
  54 #include <vm/page.h>
  55 #include <vm/seg_kmem.h>
  56 #include <vm/seg_vn.h>
  57 #include <sys/vmsystm.h>
  58 #include <sys/memnode.h>
  59 #include <vm/vm_dep.h>
  60 #include <sys/lgrp.h>
  61 #include <sys/mem_config.h>
  62 #include <sys/callb.h>
  63 #include <sys/sdt.h>
  64 #include <sys/dumphdr.h>
  65 #include <sys/swap.h>
  66
  67 extern uint_t   vac_colors;
  68
  69 #define MAX_PRAGMA_ALIGN        128
  70
  71 /* vm_cpu_data0 for the boot cpu before kmem is initialized */
  72
  73 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
  74 #pragma align   L2CACHE_ALIGN_MAX(vm_cpu_data0)
  75 #else
  76 #pragma align   MAX_PRAGMA_ALIGN(vm_cpu_data0)
  77 #endif
  78 char            vm_cpu_data0[VM_CPU_DATA_PADSIZE];
  79
  80 /*
  81  * number of page colors equivalent to reqested color in page_get routines.
  82  * If set, keeps large pages intact longer and keeps MPO allocation
  83  * from the local mnode in favor of acquiring the 'correct' page color from
  84  * a demoted large page or from a remote mnode.
  85  */
  86 uint_t  colorequiv;
  87
  88 /*
  89  * color equivalency mask for each page size.
  90  * Mask is computed based on cpu L2$ way sizes and colorequiv global.
  91  * High 4 bits determine the number of high order bits of the color to ignore.
  92  * Low 4 bits determines number of low order bits of color to ignore (it's only
  93  * relevant for hashed index based page coloring).
  94  */
  95 uchar_t colorequivszc[MMU_PAGE_SIZES];
  96
  97 /*
  98  * if set, specifies the percentage of large pages that are free from within
  99  * a large page region before attempting to lock those pages for
 100  * page_get_contig_pages processing.
 101  *
 102  * Should be turned on when kpr is available when page_trylock_contig_pages
 103  * can be more selective.
 104  */
 105
 106 int     ptcpthreshold;
 107
 108 /*
 109  * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
 110  * Enabled by default via pgcplimitsearch.
 111  *
 112  * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
 113  * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
 114  * bound. This upper bound range guarantees:
 115  *    - all large page 'slots' will be searched over time
 116  *    - the minimum (1) large page candidates considered on each pgcp call
 117  *    - count doesn't wrap around to 0
 118  */
 119 pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES];
 120 int     pgcplimitsearch = 1;
 121
 122 #define PGCPFAILMAX             (1 << (highbit(physinstalled) - 1))
 123 #define SETPGCPFAILCNT(szc)                                             \
 124         if (++pgcpfailcnt[szc] >= PGCPFAILMAX)                          \
 125                 pgcpfailcnt[szc] = PGCPFAILMAX / 2;
 126
 127 #ifdef VM_STATS
 128 struct vmm_vmstats_str  vmm_vmstats;
 129
 130 #endif /* VM_STATS */
 131
 132 /* enable page_get_contig_pages */
 133 #define LPGCREATE       1
 134
 135 int pg_contig_disable;
 136 int pg_lpgcreate_nocage = LPGCREATE;
 137
 138 /*
 139  * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
 140  */
 141 #define PFNNULL         0
 142
 143 /* Flags involved in promotion and demotion routines */
 144 #define PC_FREE         0x1     /* put page on freelist */
 145 #define PC_ALLOC        0x2     /* return page for allocation */
 146
 147 /*
 148  * Flag for page_demote to be used with PC_FREE to denote that we don't care
 149  * what the color is as the color parameter to the function is ignored.
 150  */
 151 #define PC_NO_COLOR     (-1)
 152
 153 /* mtype value for page_promote to use when mtype does not matter */
 154 #define PC_MTYPE_ANY    (-1)
 155
 156 /*
 157  * page counters candidates info
 158  * See page_ctrs_cands comment below for more details.
 159  * fields are as follows:
 160  *      pcc_pages_free:         # pages which freelist coalesce can create
 161  *      pcc_color_free:         pointer to page free counts per color
 162  */
 163 typedef struct pcc_info {
 164         pgcnt_t pcc_pages_free;
 165         pgcnt_t *pcc_color_free;
 166         uint_t  pad[12];
 167 } pcc_info_t;
 168
 169 /*
 170  * On big machines it can take a long time to check page_counters
 171  * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 172  * updated sum of all elements of the corresponding page_counters arrays.
 173  * page_freelist_coalesce() searches page_counters only if an appropriate
 174  * element of page_ctrs_cands array is greater than 0.
 175  *
 176  * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
 177  */
 178 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
 179
 180 /*
 181  * Return in val the total number of free pages which can be created
 182  * for the given mnode (m), mrange (g), and region size (r)
 183  */
 184 #define PGCTRS_CANDS_GETVALUE(m, g, r, val) {                           \
 185         int i;                                                          \
 186         val = 0;                                                        \
 187         for (i = 0; i < NPC_MUTEX; i++) {                               \
 188             val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;    \
 189         }                                                               \
 190 }
 191
 192 /*
 193  * Return in val the total number of free pages which can be created
 194  * for the given mnode (m), mrange (g), region size (r), and color (c)
 195  */
 196 #define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {                   \
 197         int i;                                                          \
 198         val = 0;                                                        \
 199         ASSERT((c) < PAGE_GET_PAGECOLORS(r));                           \
 200         for (i = 0; i < NPC_MUTEX; i++) {                               \
 201             val +=                                                      \
 202                 page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];  \
 203         }                                                               \
 204 }
 205
 206 /*
 207  * We can only allow a single thread to update a counter within the physical
 208  * range of the largest supported page size. That is the finest granularity
 209  * possible since the counter values are dependent on each other
 210  * as you move accross region sizes. PP_CTR_LOCK_INDX is used to determine the
 211  * ctr_mutex lock index for a particular physical range.
 212  */
 213 static kmutex_t *ctr_mutex[NPC_MUTEX];
 214
 215 #define PP_CTR_LOCK_INDX(pp)                                            \
 216         (((pp)->p_pagenum >>                                            \
 217             (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
 218
 219 #define INVALID_COLOR 0xffffffff
 220 #define INVALID_MASK  0xffffffff
 221
 222 /*
 223  * Local functions prototypes.
 224  */
 225
 226 void page_ctr_add(int, int, page_t *, int);
 227 void page_ctr_add_internal(int, int, page_t *, int);
 228 void page_ctr_sub(int, int, page_t *, int);
 229 void page_ctr_sub_internal(int, int, page_t *, int);
 230 void page_freelist_lock(int);
 231 void page_freelist_unlock(int);
 232 page_t *page_promote(int, pfn_t, uchar_t, int, int);
 233 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
 234 page_t *page_freelist_split(uchar_t,
 235     uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
 236 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
 237 static int page_trylock_cons(page_t *pp, se_t se);
 238
 239 /*
 240  * The page_counters array below is used to keep track of free contiguous
 241  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
 242  * This contains an array of counters, the size of the array, a shift value
 243  * used to convert a pagenum into a counter array index or vice versa, as
 244  * well as a cache of the last successful index to be promoted to a larger
 245  * page size.  As an optimization, we keep track of the last successful index
 246  * to be promoted per page color for the given size region, and this is
 247  * allocated dynamically based upon the number of colors for a given
 248  * region size.
 249  *
 250  * Conceptually, the page counters are represented as:
 251  *
 252  *      page_counters[region_size][mnode]
 253  *
 254  *      region_size:    size code of a candidate larger page made up
 255  *                      of contiguous free smaller pages.
 256  *
 257  *      page_counters[region_size][mnode].hpm_counters[index]:
 258  *              represents how many (region_size - 1) pages either
 259  *              exist or can be created within the given index range.
 260  *
 261  * Let's look at a sparc example:
 262  *      If we want to create a free 512k page, we look at region_size 2
 263  *      for the mnode we want.  We calculate the index and look at a specific
 264  *      hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
 265  *      this location, it means that 8 64k pages either exist or can be created
 266  *      from 8K pages in order to make a single free 512k page at the given
 267  *      index.  Note that when a region is full, it will contribute to the
 268  *      counts in the region above it.  Thus we will not know what page
 269  *      size the free pages will be which can be promoted to this new free
 270  *      page unless we look at all regions below the current region.
 271  */
 272
 273 /*
 274  * Note: hpmctr_t is defined in platform vm_dep.h
 275  * hw_page_map_t contains all the information needed for the page_counters
 276  * logic. The fields are as follows:
 277  *
 278  *      hpm_counters:   dynamically allocated array to hold counter data
 279  *      hpm_entries:    entries in hpm_counters
 280  *      hpm_shift:      shift for pnum/array index conv
 281  *      hpm_base:       PFN mapped to counter index 0
 282  *      hpm_color_current:      last index in counter array for this color at
 283  *                              which we successfully created a large page
 284  */
 285 typedef struct hw_page_map {
 286         hpmctr_t        *hpm_counters;
 287         size_t          hpm_entries;
 288         int             hpm_shift;
 289         pfn_t           hpm_base;
 290         size_t          *hpm_color_current[MAX_MNODE_MRANGES];
 291 } hw_page_map_t;
 292
 293 /*
 294  * Element zero is not used, but is allocated for convenience.
 295  */
 296 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
 297
 298 /*
 299  * Cached value of MNODE_RANGE_CNT(mnode).
 300  * This is a function call in x86.
 301  */
 302 static int mnode_nranges[MAX_MEM_NODES];
 303 static int mnode_maxmrange[MAX_MEM_NODES];
 304
 305 /*
 306  * The following macros are convenient ways to get access to the individual
 307  * elements of the page_counters arrays.  They can be used on both
 308  * the left side and right side of equations.
 309  */
 310 #define PAGE_COUNTERS(mnode, rg_szc, idx)                       \
 311         (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
 312
 313 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc)                   \
 314         (page_counters[(rg_szc)][(mnode)].hpm_counters)
 315
 316 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc)                      \
 317         (page_counters[(rg_szc)][(mnode)].hpm_shift)
 318
 319 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc)                    \
 320         (page_counters[(rg_szc)][(mnode)].hpm_entries)
 321
 322 #define PAGE_COUNTERS_BASE(mnode, rg_szc)                       \
 323         (page_counters[(rg_szc)][(mnode)].hpm_base)
 324
 325 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)             \
 326         (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
 327
 328 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)       \
 329         (page_counters[(rg_szc)][(mnode)].                              \
 330         hpm_color_current[(mrange)][(color)])
 331
 332 #define PNUM_TO_IDX(mnode, rg_szc, pnum)                        \
 333         (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>    \
 334                 PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
 335
 336 #define IDX_TO_PNUM(mnode, rg_szc, index)                       \
 337         (PAGE_COUNTERS_BASE((mnode), (rg_szc)) +                \
 338                 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
 339
 340 /*
 341  * Protects the hpm_counters and hpm_color_current memory from changing while
 342  * looking at page counters information.
 343  * Grab the write lock to modify what these fields point at.
 344  * Grab the read lock to prevent any pointers from changing.
 345  * The write lock can not be held during memory allocation due to a possible
 346  * recursion deadlock with trying to grab the read lock while the
 347  * write lock is already held.
 348  */
 349 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
 350
 351
 352 /*
 353  * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
 354  */
 355 void
 356 cpu_vm_data_init(struct cpu *cp)
 357 {
 358         if (cp == CPU0) {
 359                 cp->cpu_vm_data = (void *)&vm_cpu_data0;
 360         } else {
 361                 void    *kmptr;
 362                 int     align;
 363                 size_t  sz;
 364
 365                 align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
 366                 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
 367                 kmptr = kmem_zalloc(sz, KM_SLEEP);
 368                 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
 369                 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
 370                 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
 371         }
 372 }
 373
 374 /*
 375  * free cpu_vm_data
 376  */
 377 void
 378 cpu_vm_data_destroy(struct cpu *cp)
 379 {
 380         if (cp->cpu_seqid && cp->cpu_vm_data) {
 381                 ASSERT(cp != CPU0);
 382                 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
 383                     ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
 384         }
 385         cp->cpu_vm_data = NULL;
 386 }
 387
 388
 389 /*
 390  * page size to page size code
 391  */
 392 int
 393 page_szc(size_t pagesize)
 394 {
 395         int     i = 0;
 396
 397         while (hw_page_array[i].hp_size) {
 398                 if (pagesize == hw_page_array[i].hp_size)
 399                         return (i);
 400                 i++;
 401         }
 402         return (-1);
 403 }
 404
 405 /*
 406  * page size to page size code with the restriction that it be a supported
 407  * user page size.  If it's not a supported user page size, -1 will be returned.
 408  */
 409 int
 410 page_szc_user_filtered(size_t pagesize)
 411 {
 412         int szc = page_szc(pagesize);
 413         if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
 414                 return (szc);
 415         }
 416         return (-1);
 417 }
 418
 419 /*
 420  * Return how many page sizes are available for the user to use.  This is
 421  * what the hardware supports and not based upon how the OS implements the
 422  * support of different page sizes.
 423  *
 424  * If legacy is non-zero, return the number of pagesizes available to legacy
 425  * applications. The number of legacy page sizes might be less than the
 426  * exported user page sizes. This is to prevent legacy applications that
 427  * use the largest page size returned from getpagesizes(3c) from inadvertantly
 428  * using the 'new' large pagesizes.
 429  */
 430 uint_t
 431 page_num_user_pagesizes(int legacy)
 432 {
 433         if (legacy)
 434                 return (mmu_legacy_page_sizes);
 435         return (mmu_exported_page_sizes);
 436 }
 437
 438 uint_t
 439 page_num_pagesizes(void)
 440 {
 441         return (mmu_page_sizes);
 442 }
 443
 444 /*
 445  * returns the count of the number of base pagesize pages associated with szc
 446  */
 447 pgcnt_t
 448 page_get_pagecnt(uint_t szc)
 449 {
 450         if (szc >= mmu_page_sizes)
 451                 panic("page_get_pagecnt: out of range %d", szc);
 452         return (hw_page_array[szc].hp_pgcnt);
 453 }
 454
 455 size_t
 456 page_get_pagesize(uint_t szc)
 457 {
 458         if (szc >= mmu_page_sizes)
 459                 panic("page_get_pagesize: out of range %d", szc);
 460         return (hw_page_array[szc].hp_size);
 461 }
 462
 463 /*
 464  * Return the size of a page based upon the index passed in.  An index of
 465  * zero refers to the smallest page size in the system, and as index increases
 466  * it refers to the next larger supported page size in the system.
 467  * Note that szc and userszc may not be the same due to unsupported szc's on
 468  * some systems.
 469  */
 470 size_t
 471 page_get_user_pagesize(uint_t userszc)
 472 {
 473         uint_t szc = USERSZC_2_SZC(userszc);
 474
 475         if (szc >= mmu_page_sizes)
 476                 panic("page_get_user_pagesize: out of range %d", szc);
 477         return (hw_page_array[szc].hp_size);
 478 }
 479
 480 uint_t
 481 page_get_shift(uint_t szc)
 482 {
 483         if (szc >= mmu_page_sizes)
 484                 panic("page_get_shift: out of range %d", szc);
 485         return (PAGE_GET_SHIFT(szc));
 486 }
 487
 488 uint_t
 489 page_get_pagecolors(uint_t szc)
 490 {
 491         if (szc >= mmu_page_sizes)
 492                 panic("page_get_pagecolors: out of range %d", szc);
 493         return (PAGE_GET_PAGECOLORS(szc));
 494 }
 495
 496 /*
 497  * this assigns the desired equivalent color after a split
 498  */
 499 uint_t
 500 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
 501     uint_t ncolor, uint_t ceq_mask)
 502 {
 503         ASSERT(nszc > szc);
 504         ASSERT(szc < mmu_page_sizes);
 505         ASSERT(color < PAGE_GET_PAGECOLORS(szc));
 506         ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
 507
 508         color &= ceq_mask;
 509         ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
 510         return (color | (ncolor & ~ceq_mask));
 511 }
 512
 513 /*
 514  * The interleaved_mnodes flag is set when mnodes overlap in
 515  * the physbase..physmax range, but have disjoint slices.
 516  * In this case hpm_counters is shared by all mnodes.
 517  * This flag is set dynamically by the platform.
 518  */
 519 int interleaved_mnodes = 0;
 520
 521 /*
 522  * Called by startup().
 523  * Size up the per page size free list counters based on physmax
 524  * of each node and max_mem_nodes.
 525  *
 526  * If interleaved_mnodes is set we need to find the first mnode that
 527  * exists. hpm_counters for the first mnode will then be shared by
 528  * all other mnodes. If interleaved_mnodes is not set, just set
 529  * first=mnode each time. That means there will be no sharing.
 530  */
 531 size_t
 532 page_ctrs_sz(void)
 533 {
 534         int     r;              /* region size */
 535         int     mnode;
 536         int     firstmn;        /* first mnode that exists */
 537         int     nranges;
 538         pfn_t   physbase;
 539         pfn_t   physmax;
 540         uint_t  ctrs_sz = 0;
 541         int     i;
 542         pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
 543
 544         /*
 545          * We need to determine how many page colors there are for each
 546          * page size in order to allocate memory for any color specific
 547          * arrays.
 548          */
 549         for (i = 0; i < mmu_page_sizes; i++) {
 550                 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
 551         }
 552
 553         for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
 554
 555                 pgcnt_t r_pgcnt;
 556                 pfn_t   r_base;
 557                 pgcnt_t r_align;
 558
 559                 if (mem_node_config[mnode].exists == 0)
 560                         continue;
 561
 562                 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
 563                 nranges = MNODE_RANGE_CNT(mnode);
 564                 mnode_nranges[mnode] = nranges;
 565                 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
 566
 567                 /*
 568                  * determine size needed for page counter arrays with
 569                  * base aligned to large page size.
 570                  */
 571                 for (r = 1; r < mmu_page_sizes; r++) {
 572                         /* add in space for hpm_color_current */
 573                         ctrs_sz += sizeof (size_t) *
 574                             colors_per_szc[r] * nranges;
 575
 576                         if (firstmn != mnode)
 577                                 continue;
 578
 579                         /* add in space for hpm_counters */
 580                         r_align = page_get_pagecnt(r);
 581                         r_base = physbase;
 582                         r_base &= ~(r_align - 1);
 583                         r_pgcnt = howmany(physmax - r_base + 1, r_align);
 584
 585                         /*
 586                          * Round up to always allocate on pointer sized
 587                          * boundaries.
 588                          */
 589                         ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
 590                             sizeof (hpmctr_t *));
 591                 }
 592         }
 593
 594         for (r = 1; r < mmu_page_sizes; r++) {
 595                 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
 596         }
 597
 598         /* add in space for page_ctrs_cands and pcc_color_free */
 599         ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
 600             mmu_page_sizes * NPC_MUTEX;
 601
 602         for (mnode = 0; mnode < max_mem_nodes; mnode++) {
 603
 604                 if (mem_node_config[mnode].exists == 0)
 605                         continue;
 606
 607                 nranges = mnode_nranges[mnode];
 608                 ctrs_sz += sizeof (pcc_info_t) * nranges *
 609                     mmu_page_sizes * NPC_MUTEX;
 610                 for (r = 1; r < mmu_page_sizes; r++) {
 611                         ctrs_sz += sizeof (pgcnt_t) * nranges *
 612                             colors_per_szc[r] * NPC_MUTEX;
 613                 }
 614         }
 615
 616         /* ctr_mutex */
 617         ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
 618
 619         /* size for page list counts */
 620         PLCNT_SZ(ctrs_sz);
 621
 622         /*
 623          * add some slop for roundups. page_ctrs_alloc will roundup the start
 624          * address of the counters to ecache_alignsize boundary for every
 625          * memory node.
 626          */
 627         return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
 628 }
 629
 630 caddr_t
 631 page_ctrs_alloc(caddr_t alloc_base)
 632 {
 633         int     mnode;
 634         int     mrange, nranges;
 635         int     r;              /* region size */
 636         int     i;
 637         int     firstmn;        /* first mnode that exists */
 638         pfn_t   physbase;
 639         pfn_t   physmax;
 640         pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
 641
 642         /*
 643          * We need to determine how many page colors there are for each
 644          * page size in order to allocate memory for any color specific
 645          * arrays.
 646          */
 647         for (i = 0; i < mmu_page_sizes; i++) {
 648                 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
 649         }
 650
 651         for (r = 1; r < mmu_page_sizes; r++) {
 652                 page_counters[r] = (hw_page_map_t *)alloc_base;
 653                 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
 654         }
 655
 656         /* page_ctrs_cands and pcc_color_free array */
 657         for (i = 0; i < NPC_MUTEX; i++) {
 658                 for (r = 1; r < mmu_page_sizes; r++) {
 659
 660                         page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
 661                         alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
 662
 663                         for (mnode = 0; mnode < max_mem_nodes; mnode++) {
 664                                 pcc_info_t *pi;
 665
 666                                 if (mem_node_config[mnode].exists == 0)
 667                                         continue;
 668
 669                                 nranges = mnode_nranges[mnode];
 670
 671                                 pi = (pcc_info_t *)alloc_base;
 672                                 alloc_base += sizeof (pcc_info_t) * nranges;
 673                                 page_ctrs_cands[i][r][mnode] = pi;
 674
 675                                 for (mrange = 0; mrange < nranges; mrange++) {
 676                                         pi->pcc_color_free =
 677                                             (pgcnt_t *)alloc_base;
 678                                         alloc_base += sizeof (pgcnt_t) *
 679                                             colors_per_szc[r];
 680                                         pi++;
 681                                 }
 682                         }
 683                 }
 684         }
 685
 686         /* ctr_mutex */
 687         for (i = 0; i < NPC_MUTEX; i++) {
 688                 ctr_mutex[i] = (kmutex_t *)alloc_base;
 689                 alloc_base += (max_mem_nodes * sizeof (kmutex_t));
 690         }
 691
 692         /* initialize page list counts */
 693         PLCNT_INIT(alloc_base);
 694
 695         for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
 696
 697                 pgcnt_t r_pgcnt;
 698                 pfn_t   r_base;
 699                 pgcnt_t r_align;
 700                 int     r_shift;
 701                 int     nranges = mnode_nranges[mnode];
 702
 703                 if (mem_node_config[mnode].exists == 0)
 704                         continue;
 705
 706                 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
 707
 708                 for (r = 1; r < mmu_page_sizes; r++) {
 709                         /*
 710                          * the page_counters base has to be aligned to the
 711                          * page count of page size code r otherwise the counts
 712                          * will cross large page boundaries.
 713                          */
 714                         r_align = page_get_pagecnt(r);
 715                         r_base = physbase;
 716                         /* base needs to be aligned - lower to aligned value */
 717                         r_base &= ~(r_align - 1);
 718                         r_pgcnt = howmany(physmax - r_base + 1, r_align);
 719                         r_shift = PAGE_BSZS_SHIFT(r);
 720
 721                         PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
 722                         PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
 723                         PAGE_COUNTERS_BASE(mnode, r) = r_base;
 724                         for (mrange = 0; mrange < nranges; mrange++) {
 725                                 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
 726                                     r, mrange) = (size_t *)alloc_base;
 727                                 alloc_base += sizeof (size_t) *
 728                                     colors_per_szc[r];
 729                         }
 730                         for (i = 0; i < colors_per_szc[r]; i++) {
 731                                 uint_t color_mask = colors_per_szc[r] - 1;
 732                                 pfn_t  pfnum = r_base;
 733                                 size_t idx;
 734                                 int mrange;
 735                                 MEM_NODE_ITERATOR_DECL(it);
 736
 737                                 MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
 738                                 if (pfnum == (pfn_t)-1) {
 739                                         idx = 0;
 740                                 } else {
 741                                         PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
 742                                             color_mask, color_mask, &it);
 743                                         idx = PNUM_TO_IDX(mnode, r, pfnum);
 744                                         idx = (idx >= r_pgcnt) ? 0 : idx;
 745                                 }
 746                                 for (mrange = 0; mrange < nranges; mrange++) {
 747                                         PAGE_COUNTERS_CURRENT_COLOR(mnode,
 748                                             r, i, mrange) = idx;
 749                                 }
 750                         }
 751
 752                         /* hpm_counters may be shared by all mnodes */
 753                         if (firstmn == mnode) {
 754                                 PAGE_COUNTERS_COUNTERS(mnode, r) =
 755                                     (hpmctr_t *)alloc_base;
 756                                 alloc_base +=
 757                                     P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
 758                                     sizeof (hpmctr_t *));
 759                         } else {
 760                                 PAGE_COUNTERS_COUNTERS(mnode, r) =
 761                                     PAGE_COUNTERS_COUNTERS(firstmn, r);
 762                         }
 763
 764                         /*
 765                          * Verify that PNUM_TO_IDX and IDX_TO_PNUM
 766                          * satisfy the identity requirement.
 767                          * We should be able to go from one to the other
 768                          * and get consistent values.
 769                          */
 770                         ASSERT(PNUM_TO_IDX(mnode, r,
 771                             (IDX_TO_PNUM(mnode, r, 0))) == 0);
 772                         ASSERT(IDX_TO_PNUM(mnode, r,
 773                             (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
 774                 }
 775                 /*
 776                  * Roundup the start address of the page_counters to
 777                  * cache aligned boundary for every memory node.
 778                  * page_ctrs_sz() has added some slop for these roundups.
 779                  */
 780                 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
 781                     L2CACHE_ALIGN);
 782         }
 783
 784         /* Initialize other page counter specific data structures. */
 785         for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
 786                 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
 787         }
 788
 789         return (alloc_base);
 790 }
 791
 792 /*
 793  * Functions to adjust region counters for each size free list.
 794  * Caller is responsible to acquire the ctr_mutex lock if necessary and
 795  * thus can be called during startup without locks.
 796  */
 797 /* ARGSUSED */
 798 void
 799 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
 800 {
 801         ssize_t         r;      /* region size */
 802         ssize_t         idx;
 803         pfn_t           pfnum;
 804         int             lckidx;
 805
 806         ASSERT(mnode == PP_2_MEM_NODE(pp));
 807         ASSERT(mtype == PP_2_MTYPE(pp));
 808
 809         ASSERT(pp->p_szc < mmu_page_sizes);
 810
 811         PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
 812
 813         /* no counter update needed for largest page size */
 814         if (pp->p_szc >= mmu_page_sizes - 1) {
 815                 return;
 816         }
 817
 818         r = pp->p_szc + 1;
 819         pfnum = pp->p_pagenum;
 820         lckidx = PP_CTR_LOCK_INDX(pp);
 821
 822         /*
 823          * Increment the count of free pages for the current
 824          * region. Continue looping up in region size incrementing
 825          * count if the preceeding region is full.
 826          */
 827         while (r < mmu_page_sizes) {
 828                 idx = PNUM_TO_IDX(mnode, r, pfnum);
 829
 830                 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
 831                 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
 832
 833                 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
 834                         break;
 835                 } else {
 836                         int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
 837                         pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
 838                             [MTYPE_2_MRANGE(mnode, root_mtype)];
 839
 840                         cand->pcc_pages_free++;
 841                         cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
 842                 }
 843                 r++;
 844         }
 845 }
 846
 847 void
 848 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
 849 {
 850         int             lckidx = PP_CTR_LOCK_INDX(pp);
 851         kmutex_t        *lock = &ctr_mutex[lckidx][mnode];
 852
 853         mutex_enter(lock);
 854         page_ctr_add_internal(mnode, mtype, pp, flags);
 855         mutex_exit(lock);
 856 }
 857
 858 void
 859 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
 860 {
 861         int             lckidx;
 862         ssize_t         r;      /* region size */
 863         ssize_t         idx;
 864         pfn_t           pfnum;
 865
 866         ASSERT(mnode == PP_2_MEM_NODE(pp));
 867         ASSERT(mtype == PP_2_MTYPE(pp));
 868
 869         ASSERT(pp->p_szc < mmu_page_sizes);
 870
 871         PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
 872
 873         /* no counter update needed for largest page size */
 874         if (pp->p_szc >= mmu_page_sizes - 1) {
 875                 return;
 876         }
 877
 878         r = pp->p_szc + 1;
 879         pfnum = pp->p_pagenum;
 880         lckidx = PP_CTR_LOCK_INDX(pp);
 881
 882         /*
 883          * Decrement the count of free pages for the current
 884          * region. Continue looping up in region size decrementing
 885          * count if the preceeding region was full.
 886          */
 887         while (r < mmu_page_sizes) {
 888                 idx = PNUM_TO_IDX(mnode, r, pfnum);
 889
 890                 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
 891                 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
 892
 893                 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
 894                         break;
 895                 } else {
 896                         int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
 897                         pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
 898                             [MTYPE_2_MRANGE(mnode, root_mtype)];
 899
 900                         ASSERT(cand->pcc_pages_free != 0);
 901                         ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
 902
 903                         cand->pcc_pages_free--;
 904                         cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
 905                 }
 906                 r++;
 907         }
 908 }
 909
 910 void
 911 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
 912 {
 913         int             lckidx = PP_CTR_LOCK_INDX(pp);
 914         kmutex_t        *lock = &ctr_mutex[lckidx][mnode];
 915
 916         mutex_enter(lock);
 917         page_ctr_sub_internal(mnode, mtype, pp, flags);
 918         mutex_exit(lock);
 919 }
 920
 921 /*
 922  * Adjust page counters following a memory attach, since typically the
 923  * size of the array needs to change, and the PFN to counter index
 924  * mapping needs to change.
 925  *
 926  * It is possible this mnode did not exist at startup. In that case
 927  * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
 928  * to change (a theoretical possibility on x86), which means pcc_color_free
 929  * arrays must be extended.
 930  */
 931 uint_t
 932 page_ctrs_adjust(int mnode)
 933 {
 934         pgcnt_t npgs;
 935         int     r;              /* region size */
 936         int     i;
 937         size_t  pcsz, old_csz;
 938         hpmctr_t *new_ctr, *old_ctr;
 939         pfn_t   oldbase, newbase;
 940         pfn_t   physbase, physmax;
 941         size_t  old_npgs;
 942         hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
 943         size_t  size_cache[MMU_PAGE_SIZES];
 944         size_t  *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
 945         size_t  *old_color_array[MAX_MNODE_MRANGES];
 946         pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
 947         pcc_info_t **cands_cache;
 948         pcc_info_t *old_pi, *pi;
 949         pgcnt_t *pgcntp;
 950         int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
 951         int cands_cache_nranges;
 952         int old_maxmrange, new_maxmrange;
 953         int rc = 0;
 954         int oldmnode;
 955
 956         cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
 957             MMU_PAGE_SIZES, KM_NOSLEEP);
 958         if (cands_cache == NULL)
 959                 return (ENOMEM);
 960
 961         i = -1;
 962         HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
 963
 964         newbase = physbase & ~PC_BASE_ALIGN_MASK;
 965         npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
 966
 967         /* prepare to free non-null pointers on the way out */
 968         cands_cache_nranges = nranges;
 969         bzero(ctr_cache, sizeof (ctr_cache));
 970         bzero(color_cache, sizeof (color_cache));
 971
 972         /*
 973          * We need to determine how many page colors there are for each
 974          * page size in order to allocate memory for any color specific
 975          * arrays.
 976          */
 977         for (r = 0; r < mmu_page_sizes; r++) {
 978                 colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
 979         }
 980
 981         /*
 982          * Preallocate all of the new hpm_counters arrays as we can't
 983          * hold the page_ctrs_rwlock as a writer and allocate memory.
 984          * If we can't allocate all of the arrays, undo our work so far
 985          * and return failure.
 986          */
 987         for (r = 1; r < mmu_page_sizes; r++) {
 988                 pcsz = npgs >> PAGE_BSZS_SHIFT(r);
 989                 size_cache[r] = pcsz;
 990                 ctr_cache[r] = kmem_zalloc(pcsz *
 991                     sizeof (hpmctr_t), KM_NOSLEEP);
 992                 if (ctr_cache[r] == NULL) {
 993                         rc = ENOMEM;
 994                         goto cleanup;
 995                 }
 996         }
 997
 998         /*
 999          * Preallocate all of the new color current arrays as we can't
1000          * hold the page_ctrs_rwlock as a writer and allocate memory.
1001          * If we can't allocate all of the arrays, undo our work so far
1002          * and return failure.
1003          */
1004         for (r = 1; r < mmu_page_sizes; r++) {
1005                 for (mrange = 0; mrange < nranges; mrange++) {
1006                         color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
1007                             colors_per_szc[r], KM_NOSLEEP);
1008                         if (color_cache[r][mrange] == NULL) {
1009                                 rc = ENOMEM;
1010                                 goto cleanup;
1011                         }
1012                 }
1013         }
1014
1015         /*
1016          * Preallocate all of the new pcc_info_t arrays as we can't
1017          * hold the page_ctrs_rwlock as a writer and allocate memory.
1018          * If we can't allocate all of the arrays, undo our work so far
1019          * and return failure.
1020          */
1021         for (r = 1; r < mmu_page_sizes; r++) {
1022                 for (i = 0; i < NPC_MUTEX; i++) {
1023                         pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
1024                             KM_NOSLEEP);
1025                         if (pi == NULL) {
1026                                 rc = ENOMEM;
1027                                 goto cleanup;
1028                         }
1029                         cands_cache[i * MMU_PAGE_SIZES + r] = pi;
1030
1031                         for (mrange = 0; mrange < nranges; mrange++, pi++) {
1032                                 pgcntp = kmem_zalloc(colors_per_szc[r] *
1033                                     sizeof (pgcnt_t), KM_NOSLEEP);
1034                                 if (pgcntp == NULL) {
1035                                         rc = ENOMEM;
1036                                         goto cleanup;
1037                                 }
1038                                 pi->pcc_color_free = pgcntp;
1039                         }
1040                 }
1041         }
1042
1043         /*
1044          * Grab the write lock to prevent others from walking these arrays
1045          * while we are modifying them.
1046          */
1047         PAGE_CTRS_WRITE_LOCK(mnode);
1048
1049         /*
1050          * For interleaved mnodes, find the first mnode
1051          * with valid page counters since the current
1052          * mnode may have just been added and not have
1053          * valid page counters.
1054          */
1055         if (interleaved_mnodes) {
1056                 for (i = 0; i < max_mem_nodes; i++)
1057                         if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
1058                                 break;
1059                 ASSERT(i < max_mem_nodes);
1060                 oldmnode = i;
1061         } else
1062                 oldmnode = mnode;
1063
1064         old_nranges = mnode_nranges[mnode];
1065         cands_cache_nranges = old_nranges;
1066         mnode_nranges[mnode] = nranges;
1067         old_maxmrange = mnode_maxmrange[mnode];
1068         mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
1069         new_maxmrange = mnode_maxmrange[mnode];
1070
1071         for (r = 1; r < mmu_page_sizes; r++) {
1072                 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
1073                 old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
1074                 old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
1075                 oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
1076                 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
1077                 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1078                         old_color_array[mrange] =
1079                             PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
1080                             r, mrange);
1081                 }
1082
1083                 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
1084                 new_ctr = ctr_cache[r];
1085                 ctr_cache[r] = NULL;
1086                 if (old_ctr != NULL &&
1087                     (oldbase + old_npgs > newbase) &&
1088                     (newbase + npgs > oldbase)) {
1089                         /*
1090                          * Map the intersection of the old and new
1091                          * counters into the new array.
1092                          */
1093                         size_t offset;
1094                         if (newbase > oldbase) {
1095                                 offset = (newbase - oldbase) >>
1096                                     PAGE_COUNTERS_SHIFT(mnode, r);
1097                                 bcopy(old_ctr + offset, new_ctr,
1098                                     MIN(pcsz, (old_csz - offset)) *
1099                                     sizeof (hpmctr_t));
1100                         } else {
1101                                 offset = (oldbase - newbase) >>
1102                                     PAGE_COUNTERS_SHIFT(mnode, r);
1103                                 bcopy(old_ctr, new_ctr + offset,
1104                                     MIN(pcsz - offset, old_csz) *
1105                                     sizeof (hpmctr_t));
1106                         }
1107                 }
1108
1109                 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
1110                 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
1111                 PAGE_COUNTERS_BASE(mnode, r) = newbase;
1112
1113                 /* update shared hpm_counters in other mnodes */
1114                 if (interleaved_mnodes) {
1115                         for (i = 0; i < max_mem_nodes; i++) {
1116                                 if ((i == mnode) ||
1117                                     (mem_node_config[i].exists == 0))
1118                                         continue;
1119                                 ASSERT(
1120                                     PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
1121                                     PAGE_COUNTERS_COUNTERS(i, r) == NULL);
1122                                 PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
1123                                 PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
1124                                 PAGE_COUNTERS_BASE(i, r) = newbase;
1125                         }
1126                 }
1127
1128                 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1129                         PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
1130                             color_cache[r][mrange];
1131                         color_cache[r][mrange] = NULL;
1132                 }
1133                 /*
1134                  * for now, just reset on these events as it's probably
1135                  * not worthwhile to try and optimize this.
1136                  */
1137                 for (i = 0; i < colors_per_szc[r]; i++) {
1138                         uint_t color_mask = colors_per_szc[r] - 1;
1139                         int mlo = interleaved_mnodes ? 0 : mnode;
1140                         int mhi = interleaved_mnodes ? max_mem_nodes :
1141                             (mnode + 1);
1142                         int m;
1143                         pfn_t  pfnum;
1144                         size_t idx;
1145                         MEM_NODE_ITERATOR_DECL(it);
1146
1147                         for (m = mlo; m < mhi; m++) {
1148                                 if (mem_node_config[m].exists == 0)
1149                                         continue;
1150                                 pfnum = newbase;
1151                                 MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
1152                                 if (pfnum == (pfn_t)-1) {
1153                                         idx = 0;
1154                                 } else {
1155                                         PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
1156                                             color_mask, color_mask, &it);
1157                                         idx = PNUM_TO_IDX(m, r, pfnum);
1158                                         idx = (idx < pcsz) ? idx : 0;
1159                                 }
1160                                 for (mrange = 0; mrange < nranges; mrange++) {
1161                                         if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
1162                                             r, mrange) != NULL)
1163                                                 PAGE_COUNTERS_CURRENT_COLOR(m,
1164                                                     r, i, mrange) = idx;
1165                                 }
1166                         }
1167                 }
1168
1169                 /* cache info for freeing out of the critical path */
1170                 if ((caddr_t)old_ctr >= kernelheap &&
1171                     (caddr_t)old_ctr < ekernelheap) {
1172                         ctr_cache[r] = old_ctr;
1173                         size_cache[r] = old_csz;
1174                 }
1175                 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1176                         size_t *tmp = old_color_array[mrange];
1177                         if ((caddr_t)tmp >= kernelheap &&
1178                             (caddr_t)tmp < ekernelheap) {
1179                                 color_cache[r][mrange] = tmp;
1180                         }
1181                 }
1182                 /*
1183                  * Verify that PNUM_TO_IDX and IDX_TO_PNUM
1184                  * satisfy the identity requirement.
1185                  * We should be able to go from one to the other
1186                  * and get consistent values.
1187                  */
1188                 ASSERT(PNUM_TO_IDX(mnode, r,
1189                     (IDX_TO_PNUM(mnode, r, 0))) == 0);
1190                 ASSERT(IDX_TO_PNUM(mnode, r,
1191                     (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
1192
1193                 /* pcc_info_t and pcc_color_free */
1194                 for (i = 0; i < NPC_MUTEX; i++) {
1195                         pcc_info_t *epi;
1196                         pcc_info_t *eold_pi;
1197
1198                         pi = cands_cache[i * MMU_PAGE_SIZES + r];
1199                         old_pi = page_ctrs_cands[i][r][mnode];
1200                         page_ctrs_cands[i][r][mnode] = pi;
1201                         cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;
1202
1203                         /* preserve old pcc_color_free values, if any */
1204                         if (old_pi == NULL)
1205                                 continue;
1206
1207                         /*
1208                          * when/if x86 does DR, must account for
1209                          * possible change in range index when
1210                          * preserving pcc_info
1211                          */
1212                         epi = &pi[nranges];
1213                         eold_pi = &old_pi[old_nranges];
1214                         if (new_maxmrange > old_maxmrange) {
1215                                 pi += new_maxmrange - old_maxmrange;
1216                         } else if (new_maxmrange < old_maxmrange) {
1217                                 old_pi += old_maxmrange - new_maxmrange;
1218                         }
1219                         for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
1220                                 pcc_info_t tmp = *pi;
1221                                 *pi = *old_pi;
1222                                 *old_pi = tmp;
1223                         }
1224                 }
1225         }
1226         PAGE_CTRS_WRITE_UNLOCK(mnode);
1227
1228         /*
1229          * Now that we have dropped the write lock, it is safe to free all
1230          * of the memory we have cached above.
1231          * We come thru here to free memory when pre-alloc fails, and also to
1232          * free old pointers which were recorded while locked.
1233          */
1234 cleanup:
1235         for (r = 1; r < mmu_page_sizes; r++) {
1236                 if (ctr_cache[r] != NULL) {
1237                         kmem_free(ctr_cache[r],
1238                             size_cache[r] * sizeof (hpmctr_t));
1239                 }
1240                 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1241                         if (color_cache[r][mrange] != NULL) {
1242                                 kmem_free(color_cache[r][mrange],
1243                                     colors_per_szc[r] * sizeof (size_t));
1244                         }
1245                 }
1246                 for (i = 0; i < NPC_MUTEX; i++) {
1247                         pi = cands_cache[i * MMU_PAGE_SIZES + r];
1248                         if (pi == NULL)
1249                                 continue;
1250                         nr = cands_cache_nranges;
1251                         for (mrange = 0; mrange < nr; mrange++, pi++) {
1252                                 pgcntp = pi->pcc_color_free;
1253                                 if (pgcntp == NULL)
1254                                         continue;
1255                                 if ((caddr_t)pgcntp >= kernelheap &&
1256                                     (caddr_t)pgcntp < ekernelheap) {
1257                                         kmem_free(pgcntp,
1258                                             colors_per_szc[r] *
1259                                             sizeof (pgcnt_t));
1260                                 }
1261                         }
1262                         pi = cands_cache[i * MMU_PAGE_SIZES + r];
1263                         if ((caddr_t)pi >= kernelheap &&
1264                             (caddr_t)pi < ekernelheap) {
1265                                 kmem_free(pi, nr * sizeof (pcc_info_t));
1266                         }
1267                 }
1268         }
1269
1270         kmem_free(cands_cache,
1271             sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
1272         return (rc);
1273 }
1274
1275 /*
1276  * Cleanup the hpm_counters field in the page counters
1277  * array.
1278  */
1279 void
1280 page_ctrs_cleanup(void)
1281 {
1282         int r;  /* region size */
1283         int i;  /* mnode index */
1284
1285         /*
1286          * Get the page counters write lock while we are
1287          * setting the page hpm_counters field to NULL
1288          * for non-existent mnodes.
1289          */
1290         for (i = 0; i < max_mem_nodes; i++) {
1291                 PAGE_CTRS_WRITE_LOCK(i);
1292                 if (mem_node_config[i].exists) {
1293                         PAGE_CTRS_WRITE_UNLOCK(i);
1294                         continue;
1295                 }
1296                 for (r = 1; r < mmu_page_sizes; r++) {
1297                         PAGE_COUNTERS_COUNTERS(i, r) = NULL;
1298                 }
1299                 PAGE_CTRS_WRITE_UNLOCK(i);
1300         }
1301 }
1302
1303 #ifdef DEBUG
1304
1305 /*
1306  * confirm pp is a large page corresponding to szc
1307  */
1308 void
1309 chk_lpg(page_t *pp, uchar_t szc)
1310 {
1311         spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1312         uint_t noreloc;
1313
1314         if (npgs == 1) {
1315                 ASSERT(pp->p_szc == 0);
1316                 ASSERT(pp->p_next == pp);
1317                 ASSERT(pp->p_prev == pp);
1318                 return;
1319         }
1320
1321         ASSERT(pp->p_list.largepg.next == pp || pp->p_list.largepg.next == NULL);
1322         ASSERT(pp->p_list.largepg.prev == pp || pp->p_list.largepg.prev == NULL);
1323
1324         ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1325         ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1326         ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1327         ASSERT(pp->p_prev == (pp + (npgs - 1)));
1328
1329         /*
1330          * Check list of pages.
1331          */
1332         noreloc = PP_ISNORELOC(pp);
1333         while (npgs--) {
1334                 if (npgs != 0) {
1335                         ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1336                         ASSERT(pp->p_next == (pp + 1));
1337                 }
1338                 ASSERT(pp->p_szc == szc);
1339                 ASSERT(PP_ISFREE(pp));
1340                 ASSERT(PP_ISAGED(pp));
1341                 ASSERT(pp->p_list.largepg.next == pp || pp->p_list.largepg.next == NULL);
1342                 ASSERT(pp->p_list.largepg.prev == pp || pp->p_list.largepg.prev == NULL);
1343                 VERIFY(pp->p_object == NULL);
1344                 ASSERT(pp->p_vnode  == NULL);
1345                 ASSERT(PP_ISNORELOC(pp) == noreloc);
1346
1347                 pp = pp->p_next;
1348         }
1349 }
1350 #endif /* DEBUG */
1351
1352 void
1353 page_freelist_lock(int mnode)
1354 {
1355         int i;
1356         for (i = 0; i < NPC_MUTEX; i++) {
1357                 mutex_enter(FPC_MUTEX(mnode, i));
1358                 mutex_enter(CPC_MUTEX(mnode, i));
1359         }
1360 }
1361
1362 void
1363 page_freelist_unlock(int mnode)
1364 {
1365         int i;
1366         for (i = 0; i < NPC_MUTEX; i++) {
1367                 mutex_exit(FPC_MUTEX(mnode, i));
1368                 mutex_exit(CPC_MUTEX(mnode, i));
1369         }
1370 }
1371
1372 /*
1373  * add pp to the specified page list. Defaults to head of the page list
1374  * unless PG_LIST_TAIL is specified.
1375  */
1376 void
1377 page_list_add(page_t *pp, int flags)
1378 {
1379         page_t          **ppp;
1380         kmutex_t        *pcm;
1381         uint_t          bin, mtype;
1382         int             mnode;
1383
1384         ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1385         ASSERT(PP_ISFREE(pp));
1386         ASSERT(!hat_page_is_mapped(pp));
1387         ASSERT(hat_page_getshare(pp) == 0);
1388
1389         /*
1390          * Large pages should be freed via page_list_add_pages().
1391          */
1392         ASSERT(pp->p_szc == 0);
1393
1394         /*
1395          * Don't need to lock the freelist first here
1396          * because the page isn't on the freelist yet.
1397          * This means p_szc can't change on us.
1398          */
1399
1400         bin = PP_2_BIN(pp);
1401         mnode = PP_2_MEM_NODE(pp);
1402         mtype = PP_2_MTYPE(pp);
1403
1404         if (flags & PG_LIST_ISINIT) {
1405                 /*
1406                  * PG_LIST_ISINIT is set during system startup (ie. single
1407                  * threaded), add a page to the free list and add to the
1408                  * the free region counters w/o any locking
1409                  */
1410                 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1411
1412                 /* inline version of page_add() */
1413                 if (*ppp != NULL) {
1414                         pp->p_next = *ppp;
1415                         pp->p_prev = (*ppp)->p_prev;
1416                         (*ppp)->p_prev = pp;
1417                         pp->p_prev->p_next = pp;
1418                 } else
1419                         *ppp = pp;
1420
1421                 page_ctr_add_internal(mnode, mtype, pp, flags);
1422                 VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1423         } else {
1424                 pcm = PC_BIN_MUTEX(mnode, bin, flags);
1425
1426                 if (flags & PG_FREE_LIST) {
1427                         VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1428                         ASSERT(PP_ISAGED(pp));
1429                         ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1430
1431                 } else {
1432                         VM_STAT_ADD(vmm_vmstats.pladd_cache);
1433                         VERIFY(pp->p_object);
1434                         ASSERT(pp->p_vnode);
1435                         ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1436                         ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1437                 }
1438                 mutex_enter(pcm);
1439                 page_add(ppp, pp);
1440
1441                 if (flags & PG_LIST_TAIL)
1442                         *ppp = (*ppp)->p_next;
1443                 /*
1444                  * Add counters before releasing pcm mutex to avoid a race with
1445                  * page_freelist_coalesce and page_freelist_split.
1446                  */
1447                 page_ctr_add(mnode, mtype, pp, flags);
1448                 mutex_exit(pcm);
1449         }
1450
1451
1452         /*
1453          * It is up to the caller to unlock the page!
1454          */
1455         ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1456 }
1457
1458
1459
1460 /* ARGSUSED */
1461 void
1462 page_list_noreloc_startup(page_t *pp)
1463 {
1464         panic("page_list_noreloc_startup: should be here only for sparc");
1465 }
1466
1467 void
1468 page_list_add_pages(page_t *pp, int flags)
1469 {
1470         kmutex_t *pcm;
1471         pgcnt_t pgcnt;
1472         uint_t  bin, mtype, i;
1473         int     mnode;
1474
1475         /* default to freelist/head */
1476         ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1477
1478         CHK_LPG(pp, pp->p_szc);
1479         VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1480
1481         bin = PP_2_BIN(pp);
1482         mnode = PP_2_MEM_NODE(pp);
1483         mtype = PP_2_MTYPE(pp);
1484
1485         if (flags & PG_LIST_ISINIT) {
1486                 ASSERT(pp->p_szc == mmu_page_sizes - 1);
1487                 page_lpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1488                 ASSERT(!PP_ISNORELOC(pp));
1489                 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1490         } else {
1491
1492                 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1493
1494                 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1495
1496                 mutex_enter(pcm);
1497                 page_lpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1498                 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1499                 mutex_exit(pcm);
1500
1501                 pgcnt = page_get_pagecnt(pp->p_szc);
1502                 for (i = 0; i < pgcnt; i++, pp++)
1503                         page_unlock_nocapture(pp);
1504         }
1505 }
1506
1507 /*
1508  * During boot, need to demote a large page to base
1509  * pagesize pages for seg_kmem for use in boot_alloc()
1510  */
1511 void
1512 page_boot_demote(page_t *pp)
1513 {
1514         ASSERT(pp->p_szc != 0);
1515         ASSERT(PP_ISFREE(pp));
1516         ASSERT(PP_ISAGED(pp));
1517
1518         (void) page_demote(PP_2_MEM_NODE(pp),
1519             PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
1520             PC_FREE);
1521
1522         ASSERT(PP_ISFREE(pp));
1523         ASSERT(PP_ISAGED(pp));
1524         ASSERT(pp->p_szc == 0);
1525 }
1526
1527 /*
1528  * Take a particular page off of whatever freelist the page
1529  * is claimed to be on.
1530  *
1531  * NOTE: Only used for PAGESIZE pages.
1532  */
1533 void
1534 page_list_sub(page_t *pp, int flags)
1535 {
1536         int             bin;
1537         uint_t          mtype;
1538         int             mnode;
1539         kmutex_t        *pcm;
1540         page_t          **ppp;
1541
1542         ASSERT(PAGE_EXCL(pp));
1543         ASSERT(PP_ISFREE(pp));
1544
1545         /*
1546          * The p_szc field can only be changed by page_promote()
1547          * and page_demote(). Only free pages can be promoted and
1548          * demoted and the free list MUST be locked during these
1549          * operations. So to prevent a race in page_list_sub()
1550          * between computing which bin of the freelist lock to
1551          * grab and actually grabing the lock we check again that
1552          * the bin we locked is still the correct one. Notice that
1553          * the p_szc field could have actually changed on us but
1554          * if the bin happens to still be the same we are safe.
1555          */
1556 try_again:
1557         bin = PP_2_BIN(pp);
1558         mnode = PP_2_MEM_NODE(pp);
1559         pcm = PC_BIN_MUTEX(mnode, bin, flags);
1560         mutex_enter(pcm);
1561         if (PP_2_BIN(pp) != bin) {
1562                 mutex_exit(pcm);
1563                 goto try_again;
1564         }
1565         mtype = PP_2_MTYPE(pp);
1566
1567         if (flags & PG_FREE_LIST) {
1568                 VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1569                 ASSERT(PP_ISAGED(pp));
1570                 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1571         } else {
1572                 VM_STAT_ADD(vmm_vmstats.plsub_cache);
1573                 ASSERT(!PP_ISAGED(pp));
1574                 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1575         }
1576
1577         /*
1578          * Common PAGESIZE case.
1579          *
1580          * Note that we locked the freelist. This prevents
1581          * any page promotion/demotion operations. Therefore
1582          * the p_szc will not change until we drop pcm mutex.
1583          */
1584         if (pp->p_szc == 0) {
1585                 page_sub(ppp, pp);
1586                 /*
1587                  * Subtract counters before releasing pcm mutex
1588                  * to avoid race with page_freelist_coalesce.
1589                  */
1590                 page_ctr_sub(mnode, mtype, pp, flags);
1591                 mutex_exit(pcm);
1592
1593                 return;
1594         }
1595
1596         /*
1597          * Large pages on the cache list are not supported.
1598          */
1599         if (flags & PG_CACHE_LIST)
1600                 panic("page_list_sub: large page on cachelist");
1601
1602         /*
1603          * Slow but rare.
1604          *
1605          * Somebody wants this particular page which is part
1606          * of a large page. In this case we just demote the page
1607          * if it's on the freelist.
1608          *
1609          * We have to drop pcm before locking the entire freelist.
1610          * Once we have re-locked the freelist check to make sure
1611          * the page hasn't already been demoted or completely
1612          * freed.
1613          */
1614         mutex_exit(pcm);
1615         page_freelist_lock(mnode);
1616         if (pp->p_szc != 0) {
1617                 /*
1618                  * Large page is on freelist.
1619                  */
1620                 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1621                     0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1622         }
1623         ASSERT(PP_ISFREE(pp));
1624         ASSERT(PP_ISAGED(pp));
1625         ASSERT(pp->p_szc == 0);
1626
1627         /*
1628          * Subtract counters before releasing pcm mutex
1629          * to avoid race with page_freelist_coalesce.
1630          */
1631         bin = PP_2_BIN(pp);
1632         mtype = PP_2_MTYPE(pp);
1633         ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1634
1635         page_sub(ppp, pp);
1636         page_ctr_sub(mnode, mtype, pp, flags);
1637         page_freelist_unlock(mnode);
1638
1639 }
1640
1641 void
1642 page_list_sub_pages(page_t *pp, uint_t szc)
1643 {
1644         kmutex_t *pcm;
1645         uint_t  bin, mtype;
1646         int     mnode;
1647
1648         ASSERT(PAGE_EXCL(pp));
1649         ASSERT(PP_ISFREE(pp));
1650         ASSERT(PP_ISAGED(pp));
1651
1652         /*
1653          * See comment in page_list_sub().
1654          */
1655 try_again:
1656         bin = PP_2_BIN(pp);
1657         mnode = PP_2_MEM_NODE(pp);
1658         pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1659         mutex_enter(pcm);
1660         if (PP_2_BIN(pp) != bin) {
1661                 mutex_exit(pcm);
1662                 goto    try_again;
1663         }
1664
1665         /*
1666          * If we're called with a page larger than szc or it got
1667          * promoted above szc before we locked the freelist then
1668          * drop pcm and re-lock entire freelist. If page still larger
1669          * than szc then demote it.
1670          */
1671         if (pp->p_szc > szc) {
1672                 mutex_exit(pcm);
1673                 pcm = NULL;
1674                 page_freelist_lock(mnode);
1675                 if (pp->p_szc > szc) {
1676                         VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1677                         (void) page_demote(mnode,
1678                             PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
1679                             pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1680                 }
1681                 bin = PP_2_BIN(pp);
1682         }
1683         ASSERT(PP_ISFREE(pp));
1684         ASSERT(PP_ISAGED(pp));
1685         ASSERT(pp->p_szc <= szc);
1686         ASSERT(pp == PP_PAGEROOT(pp));
1687
1688         VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1689
1690         mtype = PP_2_MTYPE(pp);
1691         if (pp->p_szc != 0) {
1692                 page_lpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1693                 CHK_LPG(pp, pp->p_szc);
1694         } else {
1695                 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1696                 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1697         }
1698         page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1699
1700         if (pcm != NULL) {
1701                 mutex_exit(pcm);
1702         } else {
1703                 page_freelist_unlock(mnode);
1704         }
1705
1706 }
1707
1708 /*
1709  * Add the page to the front of a linked list of pages
1710  * using the p_next & p_prev pointers for the list.
1711  * The caller is responsible for protecting the list pointers.
1712  */
1713 void
1714 mach_page_add(page_t **ppp, page_t *pp)
1715 {
1716         if (*ppp == NULL) {
1717                 pp->p_next = pp->p_prev = pp;
1718         } else {
1719                 pp->p_next = *ppp;
1720                 pp->p_prev = (*ppp)->p_prev;
1721                 (*ppp)->p_prev = pp;
1722                 pp->p_prev->p_next = pp;
1723         }
1724         *ppp = pp;
1725 }
1726
1727 /*
1728  * Remove this page from a linked list of pages
1729  * using the p_next & p_prev pointers for the list.
1730  *
1731  * The caller is responsible for protecting the list pointers.
1732  */
1733 void
1734 mach_page_sub(page_t **ppp, page_t *pp)
1735 {
1736         ASSERT(PP_ISFREE(pp));
1737
1738         if (*ppp == NULL || pp == NULL)
1739                 panic("mach_page_sub");
1740
1741         if (*ppp == pp)
1742                 *ppp = pp->p_next;              /* go to next page */
1743
1744         if (*ppp == pp)
1745                 *ppp = NULL;                    /* page list is gone */
1746         else {
1747                 pp->p_prev->p_next = pp->p_next;
1748                 pp->p_next->p_prev = pp->p_prev;
1749         }
1750         pp->p_prev = pp->p_next = pp;           /* make pp a list of one */
1751 }
1752
1753 /*
1754  * Routine fsflush uses to gradually coalesce the free list into larger pages.
1755  */
1756 void
1757 page_promote_size(page_t *pp, uint_t cur_szc)
1758 {
1759         pfn_t pfn;
1760         int mnode;
1761         int idx;
1762         int new_szc = cur_szc + 1;
1763         int full = FULL_REGION_CNT(new_szc);
1764
1765         pfn = page_pptonum(pp);
1766         mnode = PFN_2_MEM_NODE(pfn);
1767
1768         page_freelist_lock(mnode);
1769
1770         idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1771         if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1772                 (void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1773
1774         page_freelist_unlock(mnode);
1775 }
1776
1777 static uint_t page_promote_err;
1778 static uint_t page_promote_noreloc_err;
1779
1780 /*
1781  * Create a single larger page (of szc new_szc) from smaller contiguous pages
1782  * for the given mnode starting at pfnum. Pages involved are on the freelist
1783  * before the call and may be returned to the caller if requested, otherwise
1784  * they will be placed back on the freelist.
1785  * If flags is PC_ALLOC, then the large page will be returned to the user in
1786  * a state which is consistent with a page being taken off the freelist.  If
1787  * we failed to lock the new large page, then we will return NULL to the
1788  * caller and put the large page on the freelist instead.
1789  * If flags is PC_FREE, then the large page will be placed on the freelist,
1790  * and NULL will be returned.
1791  * The caller is responsible for locking the freelist as well as any other
1792  * accounting which needs to be done for a returned page.
1793  *
1794  * RFE: For performance pass in pp instead of pfnum so
1795  *      we can avoid excessive calls to page_numtopp_nolock().
1796  *      This would depend on an assumption that all contiguous
1797  *      pages are in the same memseg so we can just add/dec
1798  *      our pp.
1799  *
1800  * Lock ordering:
1801  *
1802  *      There is a potential but rare deadlock situation
1803  *      for page promotion and demotion operations. The problem
1804  *      is there are two paths into the freelist manager and
1805  *      they have different lock orders:
1806  *
1807  *      page_create()
1808  *              lock freelist
1809  *              page_lock(EXCL)
1810  *              unlock freelist
1811  *              return
1812  *              caller drops page_lock
1813  *
1814  *      page_free() and page_reclaim()
1815  *              caller grabs page_lock(EXCL)
1816  *
1817  *              lock freelist
1818  *              unlock freelist
1819  *              drop page_lock
1820  *
1821  *      What prevents a thread in page_create() from deadlocking
1822  *      with a thread freeing or reclaiming the same page is the
1823  *      page_trylock() in page_get_freelist(). If the trylock fails
1824  *      it skips the page.
1825  *
1826  *      The lock ordering for promotion and demotion is the same as
1827  *      for page_create(). Since the same deadlock could occur during
1828  *      page promotion and freeing or reclaiming of a page on the
1829  *      cache list we might have to fail the operation and undo what
1830  *      have done so far. Again this is rare.
1831  */
1832 page_t *
1833 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
1834 {
1835         page_t          *pp, *pplist, *tpp, *start_pp;
1836         pgcnt_t         new_npgs, npgs;
1837         uint_t          bin;
1838         pgcnt_t         tmpnpgs, pages_left;
1839         uint_t          noreloc;
1840         int             which_list;
1841         ulong_t         index;
1842         kmutex_t        *phm;
1843
1844         /*
1845          * General algorithm:
1846          * Find the starting page
1847          * Walk each page struct removing it from the freelist,
1848          * and linking it to all the other pages removed.
1849          * Once all pages are off the freelist,
1850          * walk the list, modifying p_szc to new_szc and what
1851          * ever other info needs to be done to create a large free page.
1852          * According to the flags, either return the page or put it
1853          * on the freelist.
1854          */
1855
1856         start_pp = page_numtopp_nolock(pfnum);
1857         ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1858         new_npgs = page_get_pagecnt(new_szc);
1859         ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1860
1861         /* don't return page of the wrong mtype */
1862         if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
1863                         return (NULL);
1864
1865         /*
1866          * Loop through smaller pages to confirm that all pages
1867          * give the same result for PP_ISNORELOC().
1868          * We can check this reliably here as the protocol for setting
1869          * P_NORELOC requires pages to be taken off the free list first.
1870          */
1871         noreloc = PP_ISNORELOC(start_pp);
1872         for (pp = start_pp + new_npgs; --pp > start_pp; ) {
1873                 if (noreloc != PP_ISNORELOC(pp)) {
1874                         page_promote_noreloc_err++;
1875                         page_promote_err++;
1876                         return (NULL);
1877                 }
1878         }
1879
1880         pages_left = new_npgs;
1881         pplist = NULL;
1882         pp = start_pp;
1883
1884         /* Loop around coalescing the smaller pages into a big page. */
1885         while (pages_left) {
1886                 /*
1887                  * Remove from the freelist.
1888                  */
1889                 ASSERT(PP_ISFREE(pp));
1890                 bin = PP_2_BIN(pp);
1891                 ASSERT(mnode == PP_2_MEM_NODE(pp));
1892                 mtype = PP_2_MTYPE(pp);
1893                 if (PP_ISAGED(pp)) {
1894
1895                         /*
1896                          * PG_FREE_LIST
1897                          */
1898                         if (pp->p_szc) {
1899                                 page_lpsub(&PAGE_FREELISTS(mnode,
1900                                     pp->p_szc, bin, mtype), pp);
1901                         } else {
1902                                 mach_page_sub(&PAGE_FREELISTS(mnode, 0,
1903                                     bin, mtype), pp);
1904                         }
1905                         which_list = PG_FREE_LIST;
1906                 } else {
1907                         struct vmobject *obj;
1908
1909                         ASSERT(pp->p_szc == 0);
1910
1911                         /*
1912                          * PG_CACHE_LIST
1913                          *
1914                          * Since this page comes from the
1915                          * cachelist, we must destroy the
1916                          * vnode association.
1917                          */
1918                         if (!page_trylock(pp, SE_EXCL)) {
1919                                 goto fail_promote;
1920                         }
1921
1922                         obj = &pp->p_vnode->v_object;
1923
1924                         /*
1925                          * We need to be careful not to deadlock
1926                          * with another thread in page_lookup().
1927                          * The page_lookup() thread could be holding
1928                          * the same phm that we need if the two
1929                          * pages happen to hash to the same phm lock.
1930                          * At this point we have locked the entire
1931                          * freelist and page_lookup() could be trying
1932                          * to grab a freelist lock.
1933                          */
1934                         if (!vmobject_trylock(obj)) {
1935                                 page_unlock_nocapture(pp);
1936                                 goto fail_promote;
1937                         }
1938
1939                         mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
1940                         page_hashout(pp, true);
1941                         vmobject_unlock(obj);
1942                         PP_SETAGED(pp);
1943                         page_unlock_nocapture(pp);
1944                         which_list = PG_CACHE_LIST;
1945                 }
1946                 page_ctr_sub(mnode, mtype, pp, which_list);
1947
1948                 /*
1949                  * Concatenate the smaller page(s) onto
1950                  * the large page list.
1951                  */
1952                 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
1953                 pages_left -= npgs;
1954                 tpp = pp;
1955                 while (npgs--) {
1956                         tpp->p_szc = new_szc;
1957                         tpp = tpp->p_next;
1958                 }
1959                 page_list_concat(&pplist, &pp);
1960                 pp += tmpnpgs;
1961         }
1962         CHK_LPG(pplist, new_szc);
1963
1964         /*
1965          * return the page to the user if requested
1966          * in the properly locked state.
1967          */
1968         if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
1969                 return (pplist);
1970         }
1971
1972         /*
1973          * Otherwise place the new large page on the freelist
1974          */
1975         bin = PP_2_BIN(pplist);
1976         mnode = PP_2_MEM_NODE(pplist);
1977         mtype = PP_2_MTYPE(pplist);
1978         page_lpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
1979
1980         page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
1981         return (NULL);
1982
1983 fail_promote:
1984         /*
1985          * A thread must have still been freeing or
1986          * reclaiming the page on the cachelist.
1987          * To prevent a deadlock undo what we have
1988          * done sofar and return failure. This
1989          * situation can only happen while promoting
1990          * PAGESIZE pages.
1991          */
1992         page_promote_err++;
1993         while (pplist) {
1994                 pp = pplist;
1995                 mach_page_sub(&pplist, pp);
1996                 pp->p_szc = 0;
1997                 bin = PP_2_BIN(pp);
1998                 mtype = PP_2_MTYPE(pp);
1999                 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2000                 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2001         }
2002         return (NULL);
2003
2004 }
2005
2006 /*
2007  * Break up a large page into smaller size pages.
2008  * Pages involved are on the freelist before the call and may
2009  * be returned to the caller if requested, otherwise they will
2010  * be placed back on the freelist.
2011  * The caller is responsible for locking the freelist as well as any other
2012  * accounting which needs to be done for a returned page.
2013  * If flags is not PC_ALLOC, the color argument is ignored, and thus
2014  * technically, any value may be passed in but PC_NO_COLOR is the standard
2015  * which should be followed for clarity's sake.
2016  * Returns a page whose pfn is < pfnmax
2017  */
2018 page_t *
2019 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
2020     uchar_t new_szc, int color, int flags)
2021 {
2022         page_t  *pp, *pplist, *npplist;
2023         pgcnt_t npgs, n;
2024         uint_t  bin;
2025         uint_t  mtype;
2026         page_t  *ret_pp = NULL;
2027
2028         ASSERT(cur_szc != 0);
2029         ASSERT(new_szc < cur_szc);
2030
2031         pplist = page_numtopp_nolock(pfnum);
2032         ASSERT(pplist != NULL);
2033
2034         ASSERT(pplist->p_szc == cur_szc);
2035
2036         bin = PP_2_BIN(pplist);
2037         ASSERT(mnode == PP_2_MEM_NODE(pplist));
2038         mtype = PP_2_MTYPE(pplist);
2039         page_lpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2040
2041         CHK_LPG(pplist, cur_szc);
2042         page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2043
2044         /*
2045          * Number of PAGESIZE pages for smaller new_szc
2046          * page.
2047          */
2048         npgs = page_get_pagecnt(new_szc);
2049
2050         while (pplist) {
2051                 pp = pplist;
2052
2053                 ASSERT(pp->p_szc == cur_szc);
2054
2055                 /*
2056                  * We either break it up into PAGESIZE pages or larger.
2057                  */
2058                 if (npgs == 1) {        /* PAGESIZE case */
2059                         mach_page_sub(&pplist, pp);
2060                         ASSERT(pp->p_szc == cur_szc);
2061                         ASSERT(new_szc == 0);
2062                         ASSERT(mnode == PP_2_MEM_NODE(pp));
2063                         pp->p_szc = new_szc;
2064                         bin = PP_2_BIN(pp);
2065                         if ((bin == color) && (flags == PC_ALLOC) &&
2066                             (ret_pp == NULL) && (pfnmax == 0 ||
2067                             pp->p_pagenum < pfnmax) &&
2068                             page_trylock_cons(pp, SE_EXCL)) {
2069                                 ret_pp = pp;
2070                         } else {
2071                                 mtype = PP_2_MTYPE(pp);
2072                                 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2073                                     mtype), pp);
2074                                 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2075                         }
2076                 } else {
2077                         page_t *try_to_return_this_page = NULL;
2078                         int count = 0;
2079
2080                         /*
2081                          * Break down into smaller lists of pages.
2082                          */
2083                         page_list_break(&pplist, &npplist, npgs);
2084
2085                         pp = pplist;
2086                         n = npgs;
2087                         while (n--) {
2088                                 ASSERT(pp->p_szc == cur_szc);
2089                                 /*
2090                                  * Check whether all the pages in this list
2091                                  * fit the request criteria.
2092                                  */
2093                                 if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
2094                                         count++;
2095                                 }
2096                                 pp->p_szc = new_szc;
2097                                 pp = pp->p_next;
2098                         }
2099
2100                         if (count == npgs &&
2101                             (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
2102                                 try_to_return_this_page = pp;
2103                         }
2104
2105                         CHK_LPG(pplist, new_szc);
2106
2107                         bin = PP_2_BIN(pplist);
2108                         if (try_to_return_this_page)
2109                                 ASSERT(mnode ==
2110                                     PP_2_MEM_NODE(try_to_return_this_page));
2111                         if ((bin == color) && (flags == PC_ALLOC) &&
2112                             (ret_pp == NULL) && try_to_return_this_page &&
2113                             page_trylock_cons(try_to_return_this_page,
2114                             SE_EXCL)) {
2115                                 ret_pp = try_to_return_this_page;
2116                         } else {
2117                                 mtype = PP_2_MTYPE(pp);
2118                                 page_lpadd(&PAGE_FREELISTS(mnode, new_szc,
2119                                     bin, mtype), pplist);
2120
2121                                 page_ctr_add(mnode, mtype, pplist,
2122                                     PG_FREE_LIST);
2123                         }
2124                         pplist = npplist;
2125                 }
2126         }
2127         return (ret_pp);
2128 }
2129
2130 int mpss_coalesce_disable = 0;
2131
2132 /*
2133  * Coalesce free pages into a page of the given szc and color if possible.
2134  * Return the pointer to the page created, otherwise, return NULL.
2135  *
2136  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2137  */
2138 page_t *
2139 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
2140     int mtype, pfn_t pfnhi)
2141 {
2142         int     r = szc;                /* region size */
2143         int     mrange;
2144         uint_t  full, bin, color_mask, wrap = 0;
2145         pfn_t   pfnum, lo, hi;
2146         size_t  len, idx, idx0;
2147         pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc);
2148         page_t  *ret_pp;
2149         MEM_NODE_ITERATOR_DECL(it);
2150
2151         if (mpss_coalesce_disable) {
2152                 ASSERT(szc < MMU_PAGE_SIZES);
2153                 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
2154                 return (NULL);
2155         }
2156
2157         ASSERT(szc < mmu_page_sizes);
2158         color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
2159         ASSERT(ceq_mask <= color_mask);
2160         ASSERT(color <= color_mask);
2161         color &= ceq_mask;
2162
2163         /* Prevent page_counters dynamic memory from being freed */
2164         rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2165
2166         mrange = MTYPE_2_MRANGE(mnode, mtype);
2167         ASSERT(mrange < mnode_nranges[mnode]);
2168         VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
2169
2170         /* get pfn range for mtype */
2171         len = PAGE_COUNTERS_ENTRIES(mnode, r);
2172         MNODETYPE_2_PFN(mnode, mtype, lo, hi);
2173         hi++;
2174
2175         /* use lower limit if given */
2176         if (pfnhi != PFNNULL && pfnhi < hi)
2177                 hi = pfnhi;
2178
2179         /* round to szcpgcnt boundaries */
2180         lo = P2ROUNDUP(lo, szcpgcnt);
2181         MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
2182         if (lo == (pfn_t)-1) {
2183                 rw_exit(&page_ctrs_rwlock[mnode]);
2184                 return (NULL);
2185         }
2186         hi = hi & ~(szcpgcnt - 1);
2187
2188         /* set lo to the closest pfn of the right color */
2189         if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2190             (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2191                 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2192                     &it);
2193         }
2194
2195         if (hi <= lo) {
2196                 rw_exit(&page_ctrs_rwlock[mnode]);
2197                 return (NULL);
2198         }
2199
2200         full = FULL_REGION_CNT(r);
2201
2202         /* calculate the number of page candidates and initial search index */
2203         bin = color;
2204         idx0 = (size_t)(-1);
2205         do {
2206                 pgcnt_t acand;
2207
2208                 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
2209                 if (acand) {
2210                         idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
2211                             r, bin, mrange);
2212                         idx0 = MIN(idx0, idx);
2213                         cands += acand;
2214                 }
2215                 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
2216         } while (bin != color);
2217
2218         if (cands == 0) {
2219                 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
2220                 rw_exit(&page_ctrs_rwlock[mnode]);
2221                 return (NULL);
2222         }
2223
2224         pfnum = IDX_TO_PNUM(mnode, r, idx0);
2225         if (pfnum < lo || pfnum >= hi) {
2226                 pfnum = lo;
2227         } else {
2228                 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2229                 if (pfnum == (pfn_t)-1) {
2230                         pfnum = lo;
2231                         MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2232                         ASSERT(pfnum != (pfn_t)-1);
2233                 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2234                     (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2235                         /* invalid color, get the closest correct pfn */
2236                         PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2237                             color_mask, &it);
2238                         if (pfnum >= hi) {
2239                                 pfnum = lo;
2240                                 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2241                         }
2242                 }
2243         }
2244
2245         /* set starting index */
2246         idx0 = PNUM_TO_IDX(mnode, r, pfnum);
2247         ASSERT(idx0 < len);
2248
2249
2250         for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
2251
2252
2253                 if (PAGE_COUNTERS(mnode, r, idx) != full)
2254                         goto next;
2255
2256                 /*
2257                  * RFE: For performance maybe we can do something less
2258                  *      brutal than locking the entire freelist. So far
2259                  *      this doesn't seem to be a performance problem?
2260                  */
2261                 page_freelist_lock(mnode);
2262                 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2263                         ret_pp =
2264                             page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
2265                         if (ret_pp != NULL) {
2266                                 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
2267                                 PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2268                                     PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
2269                                 page_freelist_unlock(mnode);
2270                                 rw_exit(&page_ctrs_rwlock[mnode]);
2271                                 return (ret_pp);
2272                         }
2273                 } else {
2274                         VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
2275                 }
2276
2277                 page_freelist_unlock(mnode);
2278                 /*
2279                  * No point looking for another page if we've
2280                  * already tried all of the ones that
2281                  * page_ctr_cands indicated.  Stash off where we left
2282                  * off.
2283                  * Note: this is not exact since we don't hold the
2284                  * page_freelist_locks before we initially get the
2285                  * value of cands for performance reasons, but should
2286                  * be a decent approximation.
2287                  */
2288                 if (--cands == 0) {
2289                         PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
2290                             idx;
2291                         break;
2292                 }
2293 next:
2294                 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2295                     color_mask, &it);
2296                 idx = PNUM_TO_IDX(mnode, r, pfnum);
2297                 if (idx >= len || pfnum >= hi) {
2298 wrapit:
2299                         pfnum = lo;
2300                         MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2301                         idx = PNUM_TO_IDX(mnode, r, pfnum);
2302                         wrap++;
2303                 }
2304         }
2305
2306         rw_exit(&page_ctrs_rwlock[mnode]);
2307         VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
2308         return (NULL);
2309 }
2310
2311 /*
2312  * For the given mnode, promote as many small pages to large pages as possible.
2313  * mnode can be -1, which means do them all
2314  */
2315 void
2316 page_freelist_coalesce_all(int mnode)
2317 {
2318         int     r;              /* region size */
2319         int     idx, full;
2320         size_t  len;
2321         int doall = interleaved_mnodes || mnode < 0;
2322         int mlo = doall ? 0 : mnode;
2323         int mhi = doall ? max_mem_nodes : (mnode + 1);
2324
2325         VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2326
2327         if (mpss_coalesce_disable) {
2328                 return;
2329         }
2330
2331         /*
2332          * Lock the entire freelist and coalesce what we can.
2333          *
2334          * Always promote to the largest page possible
2335          * first to reduce the number of page promotions.
2336          */
2337         for (mnode = mlo; mnode < mhi; mnode++) {
2338                 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2339                 page_freelist_lock(mnode);
2340         }
2341         for (r = mmu_page_sizes - 1; r > 0; r--) {
2342                 for (mnode = mlo; mnode < mhi; mnode++) {
2343                         pgcnt_t cands = 0;
2344                         int mrange, nranges = mnode_nranges[mnode];
2345
2346                         for (mrange = 0; mrange < nranges; mrange++) {
2347                                 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2348                                 if (cands != 0)
2349                                         break;
2350                         }
2351                         if (cands == 0) {
2352                                 VM_STAT_ADD(vmm_vmstats.
2353                                     page_ctrs_cands_skip_all);
2354                                 continue;
2355                         }
2356
2357                         full = FULL_REGION_CNT(r);
2358                         len  = PAGE_COUNTERS_ENTRIES(mnode, r);
2359
2360                         for (idx = 0; idx < len; idx++) {
2361                                 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2362                                         pfn_t pfnum =
2363                                             IDX_TO_PNUM(mnode, r, idx);
2364                                         int tmnode = interleaved_mnodes ?
2365                                             PFN_2_MEM_NODE(pfnum) : mnode;
2366
2367                                         ASSERT(pfnum >=
2368                                             mem_node_config[tmnode].physbase &&
2369                                             pfnum <
2370                                             mem_node_config[tmnode].physmax);
2371
2372                                         (void) page_promote(tmnode,
2373                                             pfnum, r, PC_FREE, PC_MTYPE_ANY);
2374                                 }
2375                         }
2376                         /* shared hpm_counters covers all mnodes, so we quit */
2377                         if (interleaved_mnodes)
2378                                 break;
2379                 }
2380         }
2381         for (mnode = mlo; mnode < mhi; mnode++) {
2382                 page_freelist_unlock(mnode);
2383                 rw_exit(&page_ctrs_rwlock[mnode]);
2384         }
2385 }
2386
2387 /*
2388  * This is where all polices for moving pages around
2389  * to different page size free lists is implemented.
2390  * Returns 1 on success, 0 on failure.
2391  *
2392  * So far these are the priorities for this algorithm in descending
2393  * order:
2394  *
2395  *      1) When servicing a request try to do so with a free page
2396  *         from next size up. Helps defer fragmentation as long
2397  *         as possible.
2398  *
2399  *      2) Page coalesce on demand. Only when a freelist
2400  *         larger than PAGESIZE is empty and step 1
2401  *         will not work since all larger size lists are
2402  *         also empty.
2403  *
2404  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2405  */
2406
2407 page_t *
2408 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2409     pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
2410 {
2411         uchar_t nszc = szc + 1;
2412         uint_t  bin, sbin, bin_prev;
2413         page_t  *pp, *firstpp;
2414         page_t  *ret_pp = NULL;
2415         uint_t  color_mask;
2416
2417         if (nszc == mmu_page_sizes)
2418                 return (NULL);
2419
2420         ASSERT(nszc < mmu_page_sizes);
2421         color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
2422         bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
2423         bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2424             PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2425
2426         VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2427         /*
2428          * First try to break up a larger page to fill current size freelist.
2429          */
2430         while (plw->plw_bins[nszc] != 0) {
2431
2432                 ASSERT(nszc < mmu_page_sizes);
2433
2434                 /*
2435                  * If page found then demote it.
2436                  */
2437                 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2438                         page_freelist_lock(mnode);
2439                         firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2440
2441                         /*
2442                          * If pfnhi is not PFNNULL, look for large page below
2443                          * pfnhi. PFNNULL signifies no pfn requirement.
2444                          */
2445                         if (pp &&
2446                             ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
2447                             (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
2448                                 do {
2449                                         pp = pp->p_list.largepg.next;
2450                                         if (pp == firstpp) {
2451                                                 pp = NULL;
2452                                                 break;
2453                                         }
2454                                 } while ((pfnhi != PFNNULL &&
2455                                     pp->p_pagenum >= pfnhi) ||
2456                                     (pfnlo != PFNNULL &&
2457                                     pp->p_pagenum < pfnlo));
2458
2459                                 if (pfnhi != PFNNULL && pp != NULL)
2460                                         ASSERT(pp->p_pagenum < pfnhi);
2461
2462                                 if (pfnlo != PFNNULL && pp != NULL)
2463                                         ASSERT(pp->p_pagenum >= pfnlo);
2464                         }
2465                         if (pp) {
2466                                 uint_t ccolor = page_correct_color(szc, nszc,
2467                                     color, bin, plw->plw_ceq_mask[szc]);
2468
2469                                 ASSERT(pp->p_szc == nszc);
2470                                 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2471                                 ret_pp = page_demote(mnode, pp->p_pagenum,
2472                                     pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
2473                                 if (ret_pp) {
2474                                         page_freelist_unlock(mnode);
2475                                         return (ret_pp);
2476                                 }
2477                         }
2478                         page_freelist_unlock(mnode);
2479                 }
2480
2481                 /* loop through next size bins */
2482                 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
2483                 plw->plw_bins[nszc]--;
2484
2485                 if (bin == sbin) {
2486                         uchar_t nnszc = nszc + 1;
2487
2488                         /* we are done with this page size - check next */
2489                         if (plw->plw_bins[nnszc] == 0)
2490                                 /* we have already checked next size bins */
2491                                 break;
2492
2493                         bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2494                         if (bin_prev != INVALID_COLOR) {
2495                                 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2496                                 if (!((bin ^ bin_prev) &
2497                                     plw->plw_ceq_mask[nnszc]))
2498                                         break;
2499                         }
2500                         ASSERT(nnszc < mmu_page_sizes);
2501                         color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2502                         nszc = nnszc;
2503                         ASSERT(nszc < mmu_page_sizes);
2504                 }
2505         }
2506
2507         return (ret_pp);
2508 }
2509
2510 /*
2511  * Helper routine used only by the freelist code to lock
2512  * a page. If the page is a large page then it succeeds in
2513  * locking all the constituent pages or none at all.
2514  * Returns 1 on sucess, 0 on failure.
2515  */
2516 static int
2517 page_trylock_cons(page_t *pp, se_t se)
2518 {
2519         page_t  *tpp, *first_pp = pp;
2520
2521         /*
2522          * Fail if can't lock first or only page.
2523          */
2524         if (!page_trylock(pp, se)) {
2525                 return (0);
2526         }
2527
2528         /*
2529          * PAGESIZE: common case.
2530          */
2531         if (pp->p_szc == 0) {
2532                 return (1);
2533         }
2534
2535         /*
2536          * Large page case.
2537          */
2538         tpp = pp->p_next;
2539         while (tpp != pp) {
2540                 if (!page_trylock(tpp, se)) {
2541                         /*
2542                          * On failure unlock what we have locked so far.
2543                          * We want to avoid attempting to capture these
2544                          * pages as the pcm mutex may be held which could
2545                          * lead to a recursive mutex panic.
2546                          */
2547                         while (first_pp != tpp) {
2548                                 page_unlock_nocapture(first_pp);
2549                                 first_pp = first_pp->p_next;
2550                         }
2551                         return (0);
2552                 }
2553                 tpp = tpp->p_next;
2554         }
2555         return (1);
2556 }
2557
2558 /*
2559  * init context for walking page lists
2560  * Called when a page of the given szc in unavailable. Sets markers
2561  * for the beginning of the search to detect when search has
2562  * completed a full cycle. Sets flags for splitting larger pages
2563  * and coalescing smaller pages. Page walking procedes until a page
2564  * of the desired equivalent color is found.
2565  */
2566 void
2567 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2568     int use_ceq, page_list_walker_t *plw)
2569 {
2570         uint_t  nszc, ceq_mask, colors;
2571         uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2572
2573         ASSERT(szc < mmu_page_sizes);
2574         colors = PAGE_GET_PAGECOLORS(szc);
2575
2576         plw->plw_colors = colors;
2577         plw->plw_color_mask = colors - 1;
2578         plw->plw_bin_marker = plw->plw_bin0 = bin;
2579         plw->plw_bin_split_prev = bin;
2580         plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2581
2582         /*
2583          * if vac aliasing is possible make sure lower order color
2584          * bits are never ignored
2585          */
2586         if (vac_colors > 1)
2587                 ceq &= 0xf0;
2588
2589         /*
2590          * calculate the number of non-equivalent colors and
2591          * color equivalency mask
2592          */
2593         plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2594         ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2595         ASSERT(plw->plw_ceq_dif > 0);
2596         plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
2597
2598         if (flags & PG_MATCH_COLOR) {
2599                 if (cpu_page_colors <  0) {
2600                         /*
2601                          * this is a heterogeneous machine with different CPUs
2602                          * having different size e$ (not supported for ni2/rock
2603                          */
2604                         uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2605                         cpucolors = MAX(cpucolors, 1);
2606                         ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2607                         plw->plw_ceq_mask[szc] =
2608                             MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2609                 }
2610                 plw->plw_ceq_dif = 1;
2611         }
2612
2613         /* we can split pages in the freelist, but not the cachelist */
2614         if (can_split) {
2615                 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2616
2617                 /* set next szc color masks and number of free list bins */
2618                 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2619                         plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2620                             plw->plw_ceq_mask[szc]);
2621                         plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2622                 }
2623                 plw->plw_ceq_mask[nszc] = INVALID_MASK;
2624                 plw->plw_bins[nszc] = 0;
2625
2626         } else {
2627                 ASSERT(szc == 0);
2628                 plw->plw_do_split = 0;
2629                 plw->plw_bins[1] = 0;
2630                 plw->plw_ceq_mask[1] = INVALID_MASK;
2631         }
2632 }
2633
2634 /*
2635  * set mark to flag where next split should occur
2636  */
2637 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) {                    \
2638         uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin);                       \
2639         uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0);            \
2640         uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask;    \
2641         plw->plw_split_next =                                                \
2642                 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask);          \
2643         if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
2644                 plw->plw_split_next =                                        \
2645                 INC_MASKED(plw->plw_split_next,                              \
2646                     neq_mask, plw->plw_color_mask);                          \
2647         }                                                                    \
2648 }
2649
2650 uint_t
2651 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2652 {
2653         uint_t  neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2654         uint_t  bin0_nsz, nbin_nsz, nbin0, nbin;
2655         uchar_t nszc = szc + 1;
2656
2657         nbin = ADD_MASKED(bin,
2658             plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2659
2660         if (plw->plw_do_split) {
2661                 plw->plw_bin_split_prev = bin;
2662                 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2663                 plw->plw_do_split = 0;
2664         }
2665
2666         if (szc == 0) {
2667                 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
2668                         if (nbin == plw->plw_bin0 &&
2669                             (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2670                                 nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2671                                     neq_mask, plw->plw_color_mask);
2672                                 plw->plw_bin_split_prev = plw->plw_bin0;
2673                         }
2674
2675                         if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2676                                 plw->plw_bin_marker =
2677                                     nbin = INC_MASKED(nbin, neq_mask,
2678                                     plw->plw_color_mask);
2679                                 plw->plw_bin_split_prev = plw->plw_bin0;
2680                                 /*
2681                                  * large pages all have the same vac color
2682                                  * so by now we should be done with next
2683                                  * size page splitting process
2684                                  */
2685                                 ASSERT(plw->plw_bins[1] == 0);
2686                                 plw->plw_do_split = 0;
2687                                 return (nbin);
2688                         }
2689
2690                 } else {
2691                         uint_t bin_jump = (vac_colors == 1) ?
2692                             (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
2693
2694                         bin_jump &= ~(vac_colors - 1);
2695
2696                         nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
2697                             plw->plw_color_mask);
2698
2699                         if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
2700
2701                                 plw->plw_bin_marker = nbin = nbin0;
2702
2703                                 if (plw->plw_bins[nszc] != 0) {
2704                                         /*
2705                                          * check if next page size bin is the
2706                                          * same as the next page size bin for
2707                                          * bin0
2708                                          */
2709                                         nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
2710                                             nbin);
2711                                         bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
2712                                             plw->plw_bin0);
2713
2714                                         if ((bin0_nsz ^ nbin_nsz) &
2715                                             plw->plw_ceq_mask[nszc])
2716                                                 plw->plw_do_split = 1;
2717                                 }
2718                                 return (nbin);
2719                         }
2720                 }
2721         }
2722
2723         if (plw->plw_bins[nszc] != 0) {
2724                 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
2725                 if (!((plw->plw_split_next ^ nbin_nsz) &
2726                     plw->plw_ceq_mask[nszc]))
2727                         plw->plw_do_split = 1;
2728         }
2729
2730         return (nbin);
2731 }
2732
2733 page_t *
2734 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
2735     uint_t flags)
2736 {
2737         kmutex_t                *pcm;
2738         page_t                  *pp, *first_pp;
2739         uint_t                  sbin;
2740         int                     plw_initialized;
2741         page_list_walker_t      plw;
2742
2743         ASSERT(szc < mmu_page_sizes);
2744
2745         VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
2746
2747         MTYPE_START(mnode, mtype, flags);
2748         if (mtype < 0) {        /* mnode does not have memory in mtype range */
2749                 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
2750                 return (NULL);
2751         }
2752 try_again:
2753
2754         plw_initialized = 0;
2755         plw.plw_ceq_dif = 1;
2756
2757         /*
2758          * Only hold one freelist lock at a time, that way we
2759          * can start anywhere and not have to worry about lock
2760          * ordering.
2761          */
2762         for (plw.plw_count = 0;
2763             plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
2764                 sbin = bin;
2765                 do {
2766                         if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
2767                                 goto bin_empty_1;
2768
2769                         pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
2770                         mutex_enter(pcm);
2771                         pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
2772                         if (pp == NULL)
2773                                 goto bin_empty_0;
2774
2775                         /*
2776                          * These were set before the page
2777                          * was put on the free list,
2778                          * they must still be set.
2779                          */
2780                         ASSERT(PP_ISFREE(pp));
2781                         ASSERT(PP_ISAGED(pp));
2782                         VERIFY(pp->p_object == NULL);
2783                         ASSERT(pp->p_vnode == NULL);
2784                         ASSERT(pp->p_offset == (uoff_t)-1);
2785                         ASSERT(pp->p_szc == szc);
2786                         ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2787
2788                         /*
2789                          * Walk down the hash chain.  4k/8k pages are linked
2790                          * on p_next and p_prev fields. Large pages are a
2791                          * contiguous group of constituent pages linked
2792                          * together on their p_next and p_prev fields.  The
2793                          * large pages are linked together on the hash chain
2794                          * using p_list.largepg of the base constituent page
2795                          * of each large page.
2796                          */
2797                         first_pp = pp;
2798                         while (!page_trylock_cons(pp, SE_EXCL)) {
2799                                 if (szc == 0) {
2800                                         pp = pp->p_next;
2801                                 } else {
2802                                         pp = pp->p_list.largepg.next;
2803                                 }
2804
2805                                 ASSERT(PP_ISFREE(pp));
2806                                 ASSERT(PP_ISAGED(pp));
2807                                 VERIFY(pp->p_object == NULL);
2808                                 ASSERT(pp->p_vnode == NULL);
2809                                 ASSERT(pp->p_offset == (uoff_t)-1);
2810                                 ASSERT(pp->p_szc == szc);
2811                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2812
2813                                 if (pp == first_pp)
2814                                         goto bin_empty_0;
2815                         }
2816
2817                         ASSERT(pp != NULL);
2818                         ASSERT(mtype == PP_2_MTYPE(pp));
2819                         ASSERT(pp->p_szc == szc);
2820                         if (szc == 0) {
2821                                 page_sub(&PAGE_FREELISTS(mnode,
2822                                     szc, bin, mtype), pp);
2823                         } else {
2824                                 page_lpsub(&PAGE_FREELISTS(mnode,
2825                                     szc, bin, mtype), pp);
2826                                 CHK_LPG(pp, szc);
2827                         }
2828                         page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
2829
2830                         if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
2831                                 panic("free page is not. pp %p", (void *)pp);
2832                         mutex_exit(pcm);
2833
2834                         VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
2835                         return (pp);
2836
2837 bin_empty_0:
2838                         mutex_exit(pcm);
2839 bin_empty_1:
2840                         if (plw_initialized == 0) {
2841                                 page_list_walk_init(szc, flags, bin, 1, 1,
2842                                     &plw);
2843                                 plw_initialized = 1;
2844                                 ASSERT(plw.plw_colors <=
2845                                     PAGE_GET_PAGECOLORS(szc));
2846                                 ASSERT(plw.plw_colors > 0);
2847                                 ASSERT((plw.plw_colors &
2848                                     (plw.plw_colors - 1)) == 0);
2849                                 ASSERT(bin < plw.plw_colors);
2850                                 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
2851                         }
2852                         /* calculate the next bin with equivalent color */
2853                         bin = ADD_MASKED(bin, plw.plw_bin_step,
2854                             plw.plw_ceq_mask[szc], plw.plw_color_mask);
2855                 } while (sbin != bin);
2856
2857                 /*
2858                  * color bins are all empty if color match. Try and
2859                  * satisfy the request by breaking up or coalescing
2860                  * pages from a different size freelist of the correct
2861                  * color that satisfies the ORIGINAL color requested.
2862                  * If that fails then try pages of the same size but
2863                  * different colors assuming we are not called with
2864                  * PG_MATCH_COLOR.
2865                  */
2866                 if (plw.plw_do_split &&
2867                     (pp = page_freelist_split(szc, bin, mnode,
2868                     mtype, PFNNULL, PFNNULL, &plw)) != NULL)
2869                         return (pp);
2870
2871                 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
2872                     bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) !=  NULL)
2873                         return (pp);
2874
2875                 if (plw.plw_ceq_dif > 1)
2876                         bin = page_list_walk_next_bin(szc, bin, &plw);
2877         }
2878
2879         /* if allowed, cycle through additional mtypes */
2880         MTYPE_NEXT(mnode, mtype, flags);
2881         if (mtype >= 0)
2882                 goto try_again;
2883
2884         VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
2885
2886         return (NULL);
2887 }
2888
2889 /*
2890  * Returns the count of free pages for 'pp' with size code 'szc'.
2891  * Note: This function does not return an exact value as the page freelist
2892  * locks are not held and thus the values in the page_counters may be
2893  * changing as we walk through the data.
2894  */
2895 static int
2896 page_freecnt(int mnode, page_t *pp, uchar_t szc)
2897 {
2898         pgcnt_t pgfree;
2899         pgcnt_t cnt;
2900         ssize_t r = szc;        /* region size */
2901         ssize_t idx;
2902         int     i;
2903         int     full, range;
2904
2905         /* Make sure pagenum passed in is aligned properly */
2906         ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
2907         ASSERT(szc > 0);
2908
2909         /* Prevent page_counters dynamic memory from being freed */
2910         rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2911         idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
2912         cnt = PAGE_COUNTERS(mnode, r, idx);
2913         pgfree = cnt << PNUM_SHIFT(r - 1);
2914         range = FULL_REGION_CNT(szc);
2915
2916         /* Check for completely full region */
2917         if (cnt == range) {
2918                 rw_exit(&page_ctrs_rwlock[mnode]);
2919                 return (pgfree);
2920         }
2921
2922         while (--r > 0) {
2923                 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
2924                 full = FULL_REGION_CNT(r);
2925                 for (i = 0; i < range; i++, idx++) {
2926                         cnt = PAGE_COUNTERS(mnode, r, idx);
2927                         /*
2928                          * If cnt here is full, that means we have already
2929                          * accounted for these pages earlier.
2930                          */
2931                         if (cnt != full) {
2932                                 pgfree += (cnt << PNUM_SHIFT(r - 1));
2933                         }
2934                 }
2935                 range *= full;
2936         }
2937         rw_exit(&page_ctrs_rwlock[mnode]);
2938         return (pgfree);
2939 }
2940
2941 /*
2942  * Called from page_geti_contig_pages to exclusively lock constituent pages
2943  * starting from 'spp' for page size code 'szc'.
2944  *
2945  * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
2946  * region needs to be greater than or equal to the threshold.
2947  */
2948 static int
2949 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
2950 {
2951         pgcnt_t pgcnt = PNUM_SIZE(szc);
2952         pgcnt_t pgfree, i;
2953         page_t *pp;
2954
2955         VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
2956
2957
2958         if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
2959                 goto skipptcpcheck;
2960         /*
2961          * check if there are sufficient free pages available before attempting
2962          * to trylock. Count is approximate as page counters can change.
2963          */
2964         pgfree = page_freecnt(mnode, spp, szc);
2965
2966         /* attempt to trylock if there are sufficient already free pages */
2967         if (pgfree < pgcnt/ptcpthreshold) {
2968                 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
2969                 return (0);
2970         }
2971
2972 skipptcpcheck:
2973
2974         for (i = 0; i < pgcnt; i++) {
2975                 pp = &spp[i];
2976                 if (!page_trylock(pp, SE_EXCL)) {
2977                         VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
2978                         while (--i != (pgcnt_t)-1) {
2979                                 pp = &spp[i];
2980                                 ASSERT(PAGE_EXCL(pp));
2981                                 page_unlock_nocapture(pp);
2982                         }
2983                         return (0);
2984                 }
2985                 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
2986                 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
2987                     !PP_ISFREE(pp)) {
2988                         VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
2989                         ASSERT(i == 0);
2990                         page_unlock_nocapture(pp);
2991                         return (0);
2992                 }
2993
2994                 /*
2995                  * If a page has been marked non-relocatable or has been
2996                  * explicitly locked in memory, we don't want to relocate it;
2997                  * unlock the pages and fail the operation.
2998                  */
2999                 if (PP_ISNORELOC(pp) ||
3000                     pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
3001                         VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3002                         while (i != (pgcnt_t)-1) {
3003                                 pp = &spp[i];
3004                                 ASSERT(PAGE_EXCL(pp));
3005                                 page_unlock_nocapture(pp);
3006                                 i--;
3007                         }
3008                         return (0);
3009                 }
3010         }
3011         VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3012         return (1);
3013 }
3014
3015 /*
3016  * Claim large page pointed to by 'pp'. 'pp' is the starting set
3017  * of 'szc' constituent pages that had been locked exclusively previously.
3018  * Will attempt to relocate constituent pages in use.
3019  */
3020 static page_t *
3021 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3022 {
3023         spgcnt_t pgcnt, npgs, i;
3024         page_t *targpp, *rpp, *hpp;
3025         page_t *replpp = NULL;
3026         page_t *pplist = NULL;
3027
3028         ASSERT(pp != NULL);
3029
3030         pgcnt = page_get_pagecnt(szc);
3031         while (pgcnt) {
3032                 ASSERT(PAGE_EXCL(pp));
3033                 ASSERT(!PP_ISNORELOC(pp));
3034                 if (PP_ISFREE(pp)) {
3035                         /*
3036                          * If this is a PG_FREE_LIST page then its
3037                          * size code can change underneath us due to
3038                          * page promotion or demotion. As an optimzation
3039                          * use page_list_sub_pages() instead of
3040                          * page_list_sub().
3041                          */
3042                         if (PP_ISAGED(pp)) {
3043                                 page_list_sub_pages(pp, szc);
3044                                 if (pp->p_szc == szc) {
3045                                         return (pp);
3046                                 }
3047                                 ASSERT(pp->p_szc < szc);
3048                                 npgs = page_get_pagecnt(pp->p_szc);
3049                                 hpp = pp;
3050                                 for (i = 0; i < npgs; i++, pp++) {
3051                                         pp->p_szc = szc;
3052                                 }
3053                                 page_list_concat(&pplist, &hpp);
3054                                 pgcnt -= npgs;
3055                                 continue;
3056                         }
3057                         ASSERT(!PP_ISAGED(pp));
3058                         ASSERT(pp->p_szc == 0);
3059                         page_list_sub(pp, PG_CACHE_LIST);
3060                         page_hashout(pp, false);
3061                         PP_SETAGED(pp);
3062                         pp->p_szc = szc;
3063                         page_list_concat(&pplist, &pp);
3064                         pp++;
3065                         pgcnt--;
3066                         continue;
3067                 }
3068                 npgs = page_get_pagecnt(pp->p_szc);
3069
3070                 /*
3071                  * page_create_wait freemem accounting done by caller of
3072                  * page_get_freelist and not necessary to call it prior to
3073                  * calling page_get_replacement_page.
3074                  *
3075                  * page_get_replacement_page can call page_get_contig_pages
3076                  * to acquire a large page (szc > 0); the replacement must be
3077                  * smaller than the contig page size to avoid looping or
3078                  * szc == 0 and PGI_PGCPSZC0 is set.
3079                  */
3080                 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3081                         replpp = page_get_replacement_page(pp, NULL, 0);
3082                         if (replpp) {
3083                                 npgs = page_get_pagecnt(pp->p_szc);
3084                                 ASSERT(npgs <= pgcnt);
3085                                 targpp = pp;
3086                         }
3087                 }
3088
3089                 /*
3090                  * If replacement is NULL or do_page_relocate fails, fail
3091                  * coalescing of pages.
3092                  */
3093                 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3094                     &npgs, NULL) != 0)) {
3095                         /*
3096                          * Unlock un-processed target list
3097                          */
3098                         while (pgcnt--) {
3099                                 ASSERT(PAGE_EXCL(pp));
3100                                 page_unlock_nocapture(pp);
3101                                 pp++;
3102                         }
3103                         /*
3104                          * Free the processed target list.
3105                          */
3106                         while (pplist) {
3107                                 pp = pplist;
3108                                 page_sub(&pplist, pp);
3109                                 ASSERT(PAGE_EXCL(pp));
3110                                 ASSERT(pp->p_szc == szc);
3111                                 ASSERT(PP_ISFREE(pp));
3112                                 ASSERT(PP_ISAGED(pp));
3113                                 pp->p_szc = 0;
3114                                 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3115                                 page_unlock_nocapture(pp);
3116                         }
3117
3118                         if (replpp != NULL)
3119                                 page_free_replacement_page(replpp);
3120
3121                         return (NULL);
3122                 }
3123                 ASSERT(pp == targpp);
3124
3125                 ASSERT(hpp = pp); /* That's right, it's an assignment */
3126
3127                 pp += npgs;
3128                 pgcnt -= npgs;
3129
3130                 while (npgs--) {
3131                         ASSERT(PAGE_EXCL(targpp));
3132                         ASSERT(!PP_ISFREE(targpp));
3133                         ASSERT(!PP_ISNORELOC(targpp));
3134                         PP_SETFREE(targpp);
3135                         ASSERT(PP_ISAGED(targpp));
3136                         ASSERT(targpp->p_szc < szc || (szc == 0 &&
3137                             (flags & PGI_PGCPSZC0)));
3138                         targpp->p_szc = szc;
3139                         targpp = targpp->p_next;
3140
3141                         rpp = replpp;
3142                         ASSERT(rpp != NULL);
3143                         page_sub(&replpp, rpp);
3144                         ASSERT(PAGE_EXCL(rpp));
3145                         ASSERT(!PP_ISFREE(rpp));
3146                         page_unlock_nocapture(rpp);
3147                 }
3148                 ASSERT(targpp == hpp);
3149                 ASSERT(replpp == NULL);
3150                 page_list_concat(&pplist, &targpp);
3151         }
3152         CHK_LPG(pplist, szc);
3153         return (pplist);
3154 }
3155
3156 /*
3157  * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3158  * page with size code 'szc'. Claiming such a page requires acquiring
3159  * exclusive locks on all constituent pages (page_trylock_contig_pages),
3160  * relocating pages in use and concatenating these constituent pages into a
3161  * large page.
3162  *
3163  * The page lists do not have such a large page and page_freelist_split has
3164  * already failed to demote larger pages and/or coalesce smaller free pages.
3165  *
3166  * 'flags' may specify PG_COLOR_MATCH which would limit the search of large
3167  * pages with the same color as 'bin'.
3168  *
3169  * 'pfnflag' specifies the subset of the pfn range to search.
3170  */
3171
3172 static page_t *
3173 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3174     pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3175 {
3176         struct memseg *mseg;
3177         pgcnt_t szcpgcnt = page_get_pagecnt(szc);
3178         pgcnt_t szcpgmask = szcpgcnt - 1;
3179         pfn_t   randpfn;
3180         page_t *pp, *randpp, *endpp;
3181         uint_t colors, ceq_mask;
3182         uint_t color_mask __unused;
3183         pfn_t hi, lo;
3184         uint_t skip;
3185         MEM_NODE_ITERATOR_DECL(it);
3186
3187         ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3188
3189         pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3190
3191         if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
3192                 return (NULL);
3193
3194         ASSERT(szc < mmu_page_sizes);
3195
3196         colors = PAGE_GET_PAGECOLORS(szc);
3197         color_mask = colors - 1;
3198         if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3199                 uchar_t ceq = colorequivszc[szc];
3200                 uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3201
3202                 ASSERT(ceq_dif > 0);
3203                 ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3204         } else {
3205                 ceq_mask = 0;
3206         }
3207
3208         ASSERT(bin < colors);
3209
3210         /* clear "non-significant" color bits */
3211         bin &= ceq_mask;
3212
3213         /*
3214          * trim the pfn range to search based on pfnflag. pfnflag is set
3215          * when there have been previous page_get_contig_page failures to
3216          * limit the search.
3217          *
3218          * The high bit in pfnflag specifies the number of 'slots' in the
3219          * pfn range and the remainder of pfnflag specifies which slot.
3220          * For example, a value of 1010b would mean the second slot of
3221          * the pfn range that has been divided into 8 slots.
3222          */
3223         if (pfnflag > 1) {
3224                 int     slots = 1 << (highbit(pfnflag) - 1);
3225                 int     slotid = pfnflag & (slots - 1);
3226                 pgcnt_t szcpages;
3227                 int     slotlen;
3228
3229                 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
3230                 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3231                 slotlen = howmany(szcpages, slots);
3232                 /* skip if 'slotid' slot is empty */
3233                 if (slotid * slotlen >= szcpages)
3234                         return (NULL);
3235                 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3236                 ASSERT(pfnlo < pfnhi);
3237                 if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3238                         pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
3239         }
3240
3241         /*
3242          * This routine is can be called recursively so we shouldn't
3243          * acquire a reader lock if a write request is pending. This
3244          * could lead to a deadlock with the DR thread.
3245          *
3246          * Returning NULL informs the caller that we could not get
3247          * a contig page with the required characteristics.
3248          */
3249
3250         if (!memsegs_trylock(0))
3251                 return (NULL);
3252
3253         /*
3254          * loop through memsegs to look for contig page candidates
3255          */
3256
3257         for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3258                 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3259                         /* no overlap */
3260                         continue;
3261                 }
3262
3263                 if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3264                         /* mseg too small */
3265                         continue;
3266
3267                 lo = MAX(pfnlo, mseg->pages_base);
3268                 hi = MIN(pfnhi, (mseg->pages_end - 1));
3269
3270                 /* round to szcpgcnt boundaries */
3271                 lo = P2ROUNDUP(lo, szcpgcnt);
3272
3273                 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3274                 hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
3275
3276                 if (hi <= lo)
3277                         continue;
3278
3279                 /*
3280                  * set lo to point to the pfn for the desired bin. Large
3281                  * page sizes may only have a single page color
3282                  */
3283                 skip = szcpgcnt;
3284                 if (ceq_mask > 0 || interleaved_mnodes) {
3285                         /* set lo to point at appropriate color */
3286                         if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3287                             (interleaved_mnodes &&
3288                             PFN_2_MEM_NODE(lo) != mnode)) {
3289                                 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3290                                     color_mask, &it);
3291                         }
3292                         if (hi <= lo)
3293                                 /* mseg cannot satisfy color request */
3294                                 continue;
3295                 }
3296
3297                 /* randomly choose a point between lo and hi to begin search */
3298
3299                 randpfn = (pfn_t)GETTICK();
3300                 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3301                 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
3302                 if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
3303                         if (randpfn != (pfn_t)-1) {
3304                                 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3305                                     ceq_mask, color_mask, &it);
3306                         }
3307                         if (randpfn >= hi) {
3308                                 randpfn = lo;
3309                                 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
3310                                     &it);
3311                         }
3312                 }
3313                 randpp = mseg->pages + (randpfn - mseg->pages_base);
3314
3315                 ASSERT(randpp->p_pagenum == randpfn);
3316
3317                 pp = randpp;
3318                 endpp =  mseg->pages + (hi - mseg->pages_base) + 1;
3319
3320                 ASSERT(randpp + szcpgcnt <= endpp);
3321
3322                 do {
3323                         ASSERT(!(pp->p_pagenum & szcpgmask));
3324                         ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3325
3326                         if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3327                                 /* pages unlocked by page_claim on failure */
3328                                 if (page_claim_contig_pages(pp, szc, flags)) {
3329                                         memsegs_unlock(0);
3330                                         return (pp);
3331                                 }
3332                         }
3333
3334                         if (ceq_mask == 0 && !interleaved_mnodes) {
3335                                 pp += skip;
3336                         } else {
3337                                 pfn_t pfn = pp->p_pagenum;
3338
3339                                 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3340                                     ceq_mask, color_mask, &it);
3341                                 if (pfn == (pfn_t)-1) {
3342                                         pp = endpp;
3343                                 } else {
3344                                         pp = mseg->pages +
3345                                             (pfn - mseg->pages_base);
3346                                 }
3347                         }
3348                         if (pp >= endpp) {
3349                                 /* start from the beginning */
3350                                 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3351                                 pp = mseg->pages + (lo - mseg->pages_base);
3352                                 ASSERT(pp->p_pagenum == lo);
3353                                 ASSERT(pp + szcpgcnt <= endpp);
3354                         }
3355                 } while (pp != randpp);
3356         }
3357         memsegs_unlock(0);
3358         return (NULL);
3359 }
3360
3361
3362 /*
3363  * controlling routine that searches through physical memory in an attempt to
3364  * claim a large page based on the input parameters.
3365  * on the page free lists.
3366  *
3367  * calls page_geti_contig_pages with an initial pfn range from the mnode
3368  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3369  * that overlaps with the kernel cage or does not match the requested page
3370  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
3371  * page_geti_contig_pages may further limit the search range based on
3372  * previous failure counts (pgcpfailcnt[]).
3373  *
3374  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3375  * pagesize page that satisfies mtype.
3376  */
3377 page_t *
3378 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3379     uint_t flags)
3380 {
3381         pfn_t           pfnlo, pfnhi;   /* contig pages pfn range */
3382         page_t          *pp;
3383         pgcnt_t         pfnflag = 0;    /* no limit on search if 0 */
3384
3385         VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3386
3387         /* no allocations from cage */
3388         flags |= PGI_NOCAGE;
3389
3390         MTYPE_START(mnode, mtype, flags);
3391         if (mtype < 0) {        /* mnode does not have memory in mtype range */
3392                 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3393                 return (NULL);
3394         }
3395
3396         ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3397
3398         /* do not limit search and ignore color if hi pri */
3399
3400         if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3401                 pfnflag = pgcpfailcnt[szc];
3402
3403         /* remove color match to improve chances */
3404
3405         if (flags & PGI_PGCPHIPRI || pfnflag)
3406                 flags &= ~PG_MATCH_COLOR;
3407
3408         do {
3409                 /* get pfn range based on mnode and mtype */
3410                 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3411
3412                 ASSERT(pfnhi >= pfnlo);
3413
3414                 pp = page_geti_contig_pages(mnode, bin, szc, flags,
3415                     pfnlo, pfnhi, pfnflag);
3416
3417                 if (pp != NULL) {
3418                         pfnflag = pgcpfailcnt[szc];
3419                         if (pfnflag) {
3420                                 /* double the search size */
3421                                 pgcpfailcnt[szc] = pfnflag >> 1;
3422                         }
3423                         VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3424                         return (pp);
3425                 }
3426                 MTYPE_NEXT(mnode, mtype, flags);
3427         } while (mtype >= 0);
3428
3429         VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3430         return (NULL);
3431 }
3432
3433 #if defined(__i386) || defined(__amd64)
3434 /*
3435  * Determine the likelihood of finding/coalescing a szc page.
3436  * Return 0 if the likelihood is small otherwise return 1.
3437  *
3438  * For now, be conservative and check only 1g pages and return 0
3439  * if there had been previous coalescing failures and the szc pages
3440  * needed to satisfy request would exhaust most of freemem.
3441  */
3442 int
3443 page_chk_freelist(uint_t szc)
3444 {
3445         pgcnt_t         pgcnt;
3446
3447         if (szc <= 1)
3448                 return (1);
3449
3450         pgcnt = page_get_pagecnt(szc);
3451         if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
3452                 VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
3453                 return (0);
3454         }
3455         VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
3456         return (1);
3457 }
3458 #endif
3459
3460 /*
3461  * Find the `best' page on the freelist for this (obj,off) (as,vaddr) pair.
3462  *
3463  * Does its own locking and accounting.
3464  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3465  * pages of the proper color even if there are pages of a different color.
3466  *
3467  * Finds a page, removes it, THEN locks it.
3468  */
3469
3470 /*ARGSUSED*/
3471 page_t *
3472 page_get_freelist(struct vmobject *obj, uoff_t off, struct seg *seg,
3473         caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3474 {
3475         struct as       *as = seg->s_as;
3476         page_t          *pp = NULL;
3477         ulong_t         bin;
3478         uchar_t         szc;
3479         int             mnode;
3480         int             mtype;
3481         page_t          *(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3482         lgrp_mnode_cookie_t     lgrp_cookie;
3483
3484         page_get_func = page_get_mnode_freelist;
3485
3486         /*
3487          * If we aren't passed a specific lgroup, or passed a freed lgrp
3488          * assume we wish to allocate near to the current thread's home.
3489          */
3490         if (!LGRP_EXISTS(lgrp))
3491                 lgrp = lgrp_home_lgrp();
3492
3493         flags &= ~PG_NORELOC;
3494         flags |= PGI_NOCAGE;
3495
3496         MTYPE_INIT(mtype, obj->vnode, vaddr, flags, size);
3497
3498         /*
3499          * Convert size to page size code.
3500          */
3501         if ((szc = page_szc(size)) == (uchar_t)-1)
3502                 panic("page_get_freelist: illegal page size request");
3503         ASSERT(szc < mmu_page_sizes);
3504
3505         VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3506
3507         AS_2_BIN(as, seg, obj->vnode, vaddr, bin, szc);
3508
3509         ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3510
3511         /*
3512          * Try to get a local page first, but try remote if we can't
3513          * get a page of the right color.
3514          */
3515 pgretry:
3516         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3517         while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3518                 pp = page_get_func(mnode, bin, mtype, szc, flags);
3519                 if (pp != NULL) {
3520                         VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3521                         DTRACE_PROBE4(page__get,
3522                             lgrp_t *, lgrp,
3523                             int, mnode,
3524                             ulong_t, bin,
3525                             uint_t, flags);
3526                         return (pp);
3527                 }
3528         }
3529         ASSERT(pp == NULL);
3530
3531         /*
3532          * for non-SZC0 PAGESIZE requests, check cachelist before checking
3533          * remote free lists.  Caller expected to call page_get_cachelist which
3534          * will check local cache lists and remote free lists.
3535          */
3536         if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3537                 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3538                 return (NULL);
3539         }
3540
3541         ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3542
3543         lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3544
3545         if (!(flags & PG_LOCAL)) {
3546                 /*
3547                  * Try to get a non-local freelist page.
3548                  */
3549                 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3550                 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3551                         pp = page_get_func(mnode, bin, mtype, szc, flags);
3552                         if (pp != NULL) {
3553                                 DTRACE_PROBE4(page__get,
3554                                     lgrp_t *, lgrp,
3555                                     int, mnode,
3556                                     ulong_t, bin,
3557                                     uint_t, flags);
3558                                 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3559                                 return (pp);
3560                         }
3561                 }
3562                 ASSERT(pp == NULL);
3563         }
3564
3565         /*
3566          * when the cage is off chances are page_get_contig_pages() will fail
3567          * to lock a large page chunk therefore when the cage is off it's not
3568          * called by default.  this can be changed via /etc/system.
3569          *
3570          * page_get_contig_pages() also called to acquire a base pagesize page
3571          * for page_create_get_something().
3572          */
3573         if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3574             (pg_lpgcreate_nocage || szc == 0) &&
3575             (page_get_func != page_get_contig_pages)) {
3576
3577                 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3578                 page_get_func = page_get_contig_pages;
3579                 goto pgretry;
3580         }
3581
3582         if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3583             page_get_func == page_get_contig_pages)
3584                 SETPGCPFAILCNT(szc);
3585
3586         VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3587         return (NULL);
3588 }
3589
3590 /*
3591  * Find the `best' page on the cachelist for this (obj,off) (as,vaddr) pair.
3592  *
3593  * Does its own locking.
3594  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3595  * pages of the proper color even if there are pages of a different color.
3596  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
3597  * try to lock one of them.  If no page can be locked, try the
3598  * next bin.  Return NULL if a page can not be found and locked.
3599  *
3600  * Finds a pages, trys to lock it, then removes it.
3601  */
3602
3603 /*ARGSUSED*/
3604 struct page *
3605 page_get_cachelist(struct vmobject *obj, uoff_t off, struct seg *seg,
3606     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3607 {
3608         page_t          *pp;
3609         struct as       *as = seg->s_as;
3610         ulong_t         bin;
3611         int             mnode;
3612         int             mtype;
3613         lgrp_mnode_cookie_t     lgrp_cookie;
3614
3615         /*
3616          * If we aren't passed a specific lgroup, or pasased a freed lgrp
3617          * assume we wish to allocate near to the current thread's home.
3618          */
3619         if (!LGRP_EXISTS(lgrp))
3620                 lgrp = lgrp_home_lgrp();
3621
3622         flags &= ~PG_NORELOC;
3623         flags |= PGI_NOCAGE;
3624
3625         if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC)
3626                 return (NULL);
3627
3628         AS_2_BIN(as, seg, obj->vnode, vaddr, bin, 0);
3629
3630         ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3631
3632         MTYPE_INIT(mtype, obj->vnode, vaddr, flags, MMU_PAGESIZE);
3633
3634         VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3635
3636         /*
3637          * Try local cachelists first
3638          */
3639         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3640         while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3641                 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3642                 if (pp != NULL) {
3643                         VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3644                         DTRACE_PROBE4(page__get,
3645                             lgrp_t *, lgrp,
3646                             int, mnode,
3647                             ulong_t, bin,
3648                             uint_t, flags);
3649                         return (pp);
3650                 }
3651         }
3652
3653         lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3654
3655         /*
3656          * Try freelists/cachelists that are farther away
3657          * This is our only chance to allocate remote pages for PAGESIZE
3658          * requests.
3659          */
3660         LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3661         while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3662                 pp = page_get_mnode_freelist(mnode, bin, mtype,
3663                     0, flags);
3664                 if (pp != NULL) {
3665                         VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3666                         DTRACE_PROBE4(page__get,
3667                             lgrp_t *, lgrp,
3668                             int, mnode,
3669                             ulong_t, bin,
3670                             uint_t, flags);
3671                         return (pp);
3672                 }
3673                 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3674                 if (pp != NULL) {
3675                         VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3676                         DTRACE_PROBE4(page__get,
3677                             lgrp_t *, lgrp,
3678                             int, mnode,
3679                             ulong_t, bin,
3680                             uint_t, flags);
3681                         return (pp);
3682                 }
3683         }
3684
3685         VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3686         return (NULL);
3687 }
3688
3689 page_t *
3690 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3691 {
3692         kmutex_t                *pcm;
3693         page_t                  *pp, *first_pp;
3694         uint_t                  sbin;
3695         int                     plw_initialized;
3696         page_list_walker_t      plw;
3697
3698         VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3699
3700         MTYPE_START(mnode, mtype, flags);
3701         if (mtype < 0) {        /* mnode does not have memory in mtype range */
3702                 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3703                 return (NULL);
3704         }
3705
3706 try_again:
3707
3708         plw_initialized = 0;
3709         plw.plw_ceq_dif = 1;
3710
3711         /*
3712          * Only hold one cachelist lock at a time, that way we
3713          * can start anywhere and not have to worry about lock
3714          * ordering.
3715          */
3716
3717         for (plw.plw_count = 0;
3718             plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
3719                 sbin = bin;
3720                 do {
3721
3722                         if (!PAGE_CACHELISTS(mnode, bin, mtype))
3723                                 goto bin_empty_1;
3724                         pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3725                         mutex_enter(pcm);
3726                         pp = PAGE_CACHELISTS(mnode, bin, mtype);
3727                         if (pp == NULL)
3728                                 goto bin_empty_0;
3729
3730                         first_pp = pp;
3731                         VERIFY(pp->p_object);
3732                         ASSERT(pp->p_vnode);
3733                         ASSERT(PP_ISAGED(pp) == 0);
3734                         ASSERT(pp->p_szc == 0);
3735                         ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3736                         while (!page_trylock(pp, SE_EXCL)) {
3737                                 pp = pp->p_next;
3738                                 ASSERT(pp->p_szc == 0);
3739                                 if (pp == first_pp) {
3740                                         /*
3741                                          * We have searched the complete list!
3742                                          * And all of them (might only be one)
3743                                          * are locked. This can happen since
3744                                          * these pages can also be found via
3745                                          * the hash list. When found via the
3746                                          * hash list, they are locked first,
3747                                          * then removed. We give up to let the
3748                                          * other thread run.
3749                                          */
3750                                         pp = NULL;
3751                                         break;
3752                                 }
3753                                 VERIFY(pp->p_object);
3754                                 ASSERT(pp->p_vnode);
3755                                 ASSERT(PP_ISFREE(pp));
3756                                 ASSERT(PP_ISAGED(pp) == 0);
3757                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
3758                                     mnode);
3759                         }
3760
3761                         if (pp) {
3762                                 page_t  **ppp;
3763                                 /*
3764                                  * Found and locked a page.
3765                                  * Pull it off the list.
3766                                  */
3767                                 ASSERT(mtype == PP_2_MTYPE(pp));
3768                                 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
3769                                 page_sub(ppp, pp);
3770                                 /*
3771                                  * Subtract counters before releasing pcm mutex
3772                                  * to avoid a race with page_freelist_coalesce
3773                                  * and page_freelist_split.
3774                                  */
3775                                 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3776                                 mutex_exit(pcm);
3777                                 VERIFY(pp->p_object);
3778                                 ASSERT(pp->p_vnode);
3779                                 ASSERT(PP_ISAGED(pp) == 0);
3780                                 VM_STAT_ADD(vmm_vmstats. pgmc_allocok);
3781                                 return (pp);
3782                         }
3783 bin_empty_0:
3784                         mutex_exit(pcm);
3785 bin_empty_1:
3786                         if (plw_initialized == 0) {
3787                                 page_list_walk_init(0, flags, bin, 0, 1, &plw);
3788                                 plw_initialized = 1;
3789                         }
3790                         /* calculate the next bin with equivalent color */
3791                         bin = ADD_MASKED(bin, plw.plw_bin_step,
3792                             plw.plw_ceq_mask[0], plw.plw_color_mask);
3793                 } while (sbin != bin);
3794
3795                 if (plw.plw_ceq_dif > 1)
3796                         bin = page_list_walk_next_bin(0, bin, &plw);
3797         }
3798
3799         MTYPE_NEXT(mnode, mtype, flags);
3800         if (mtype >= 0)
3801                 goto try_again;
3802
3803         VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
3804         return (NULL);
3805 }
3806
3807 #ifdef DEBUG
3808 #define REPL_PAGE_STATS
3809 #endif /* DEBUG */
3810
3811 #ifdef REPL_PAGE_STATS
3812 struct repl_page_stats {
3813         uint_t  ngets;
3814         uint_t  ngets_noreloc;
3815         uint_t  npgr_noreloc;
3816         uint_t  nnopage_first;
3817         uint_t  nnopage;
3818         uint_t  nhashout;
3819         uint_t  nnofree;
3820         uint_t  nnext_pp;
3821 } repl_page_stats;
3822 #define REPL_STAT_INCR(v)       atomic_inc_32(&repl_page_stats.v)
3823 #else /* REPL_PAGE_STATS */
3824 #define REPL_STAT_INCR(v)
3825 #endif /* REPL_PAGE_STATS */
3826
3827 int     pgrppgcp;
3828
3829 /*
3830  * The freemem accounting must be done by the caller.
3831  * First we try to get a replacement page of the same size as like_pp,
3832  * if that is not possible, then we just get a set of discontiguous
3833  * PAGESIZE pages.
3834  */
3835 page_t *
3836 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
3837     uint_t pgrflags)
3838 {
3839         page_t          *like_pp;
3840         page_t          *pp, *pplist;
3841         page_t          *pl = NULL;
3842         ulong_t         bin;
3843         int             mnode, page_mnode;
3844         int             szc;
3845         spgcnt_t        npgs, pg_cnt;
3846         pfn_t           pfnum;
3847         int             mtype;
3848         int             flags = 0;
3849         lgrp_mnode_cookie_t     lgrp_cookie;
3850         lgrp_t          *lgrp;
3851
3852         REPL_STAT_INCR(ngets);
3853         like_pp = orig_like_pp;
3854         ASSERT(PAGE_EXCL(like_pp));
3855
3856         szc = like_pp->p_szc;
3857         npgs = page_get_pagecnt(szc);
3858         /*
3859          * Now we reset like_pp to the base page_t.
3860          * That way, we won't walk past the end of this 'szc' page.
3861          */
3862         pfnum = PFN_BASE(like_pp->p_pagenum, szc);
3863         like_pp = page_numtopp_nolock(pfnum);
3864         ASSERT(like_pp->p_szc == szc);
3865
3866         VERIFY0(PP_ISNORELOC(like_pp));
3867         VERIFY0(pgrflags & PGR_NORELOC);
3868
3869         /*
3870          * Kernel pages must always be replaced with the same size
3871          * pages, since we cannot properly handle demotion of kernel
3872          * pages.
3873          */
3874         if (PP_ISKAS(like_pp))
3875                 pgrflags |= PGR_SAMESZC;
3876
3877         MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
3878
3879         while (npgs) {
3880                 pplist = NULL;
3881                 for (;;) {
3882                         pg_cnt = page_get_pagecnt(szc);
3883                         bin = PP_2_BIN(like_pp);
3884                         ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
3885                         ASSERT(pg_cnt <= npgs);
3886
3887                         /*
3888                          * If an lgroup was specified, try to get the
3889                          * page from that lgroup.
3890                          * NOTE: Must be careful with code below because
3891                          *       lgroup may disappear and reappear since there
3892                          *       is no locking for lgroup here.
3893                          */
3894                         if (LGRP_EXISTS(lgrp_target)) {
3895                                 /*
3896                                  * Keep local variable for lgroup separate
3897                                  * from lgroup argument since this code should
3898                                  * only be exercised when lgroup argument
3899                                  * exists....
3900                                  */
3901                                 lgrp = lgrp_target;
3902
3903                                 /* Try the lgroup's freelists first */
3904                                 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
3905                                     LGRP_SRCH_LOCAL);
3906                                 while ((pplist == NULL) &&
3907                                     (mnode = lgrp_memnode_choose(&lgrp_cookie))
3908                                     != -1) {
3909                                         pplist =
3910                                             page_get_mnode_freelist(mnode, bin,
3911                                             mtype, szc, flags);
3912                                 }
3913
3914                                 /*
3915                                  * Now try it's cachelists if this is a
3916                                  * small page. Don't need to do it for
3917                                  * larger ones since page_freelist_coalesce()
3918                                  * already failed.
3919                                  */
3920                                 if (pplist != NULL || szc != 0)
3921                                         break;
3922
3923                                 /* Now try it's cachelists */
3924                                 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
3925                                     LGRP_SRCH_LOCAL);
3926
3927                                 while ((pplist == NULL) &&
3928                                     (mnode = lgrp_memnode_choose(&lgrp_cookie))
3929                                     != -1) {
3930                                         pplist =
3931                                             page_get_mnode_cachelist(bin, flags,
3932                                             mnode, mtype);
3933                                 }
3934                                 if (pplist != NULL) {
3935                                         page_hashout(pplist, false);
3936                                         PP_SETAGED(pplist);
3937                                         REPL_STAT_INCR(nhashout);
3938                                         break;
3939                                 }
3940                                 /* Done looking in this lgroup. Bail out. */
3941                                 break;
3942                         }
3943
3944                         /*
3945                          * No lgroup was specified (or lgroup was removed by
3946                          * DR, so just try to get the page as close to
3947                          * like_pp's mnode as possible.
3948                          * First try the local freelist...
3949                          */
3950                         mnode = PP_2_MEM_NODE(like_pp);
3951                         pplist = page_get_mnode_freelist(mnode, bin,
3952                             mtype, szc, flags);
3953                         if (pplist != NULL)
3954                                 break;
3955
3956                         REPL_STAT_INCR(nnofree);
3957
3958                         /*
3959                          * ...then the local cachelist. Don't need to do it for
3960                          * larger pages cause page_freelist_coalesce() already
3961                          * failed there anyway.
3962                          */
3963                         if (szc == 0) {
3964                                 pplist = page_get_mnode_cachelist(bin, flags,
3965                                     mnode, mtype);
3966                                 if (pplist != NULL) {
3967                                         page_hashout(pplist, false);
3968                                         PP_SETAGED(pplist);
3969                                         REPL_STAT_INCR(nhashout);
3970                                         break;
3971                                 }
3972                         }
3973
3974                         /* Now try remote freelists */
3975                         page_mnode = mnode;
3976                         lgrp =
3977                             lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
3978                         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
3979                             LGRP_SRCH_HIER);
3980                         while (pplist == NULL &&
3981                             (mnode = lgrp_memnode_choose(&lgrp_cookie))
3982                             != -1) {
3983                                 /*
3984                                  * Skip local mnode.
3985                                  */
3986                                 if ((mnode == page_mnode) ||
3987                                     (mem_node_config[mnode].exists == 0))
3988                                         continue;
3989
3990                                 pplist = page_get_mnode_freelist(mnode,
3991                                     bin, mtype, szc, flags);
3992                         }
3993
3994                         if (pplist != NULL)
3995                                 break;
3996
3997
3998                         /* Now try remote cachelists */
3999                         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4000                             LGRP_SRCH_HIER);
4001                         while (pplist == NULL && szc == 0) {
4002                                 mnode = lgrp_memnode_choose(&lgrp_cookie);
4003                                 if (mnode == -1)
4004                                         break;
4005                                 /*
4006                                  * Skip local mnode.
4007                                  */
4008                                 if ((mnode == page_mnode) ||
4009                                     (mem_node_config[mnode].exists == 0))
4010                                         continue;
4011
4012                                 pplist = page_get_mnode_cachelist(bin,
4013                                     flags, mnode, mtype);
4014
4015                                 if (pplist != NULL) {
4016                                         page_hashout(pplist, false);
4017                                         PP_SETAGED(pplist);
4018                                         REPL_STAT_INCR(nhashout);
4019                                         break;
4020                                 }
4021                         }
4022
4023                         /*
4024                          * Break out of while loop under the following cases:
4025                          * - If we successfully got a page.
4026                          * - If pgrflags specified only returning a specific
4027                          *   page size and we could not find that page size.
4028                          * - If we could not satisfy the request with PAGESIZE
4029                          *   or larger pages.
4030                          */
4031                         if (pplist != NULL || szc == 0)
4032                                 break;
4033
4034                         if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4035                                 /* try to find contig page */
4036
4037                                 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4038                                     LGRP_SRCH_HIER);
4039
4040                                 while ((pplist == NULL) &&
4041                                     (mnode =
4042                                     lgrp_memnode_choose(&lgrp_cookie))
4043                                     != -1) {
4044                                         pplist = page_get_contig_pages(
4045                                             mnode, bin, mtype, szc,
4046                                             flags | PGI_PGCPHIPRI);
4047                                 }
4048                                 break;
4049                         }
4050
4051                         /*
4052                          * The correct thing to do here is try the next
4053                          * page size down using szc--. Due to a bug
4054                          * with the processing of HAT_RELOAD_SHARE
4055                          * where the sfmmu_ttecnt arrays of all
4056                          * hats sharing an ISM segment don't get updated,
4057                          * using intermediate size pages for relocation
4058                          * can lead to continuous page faults.
4059                          */
4060                         szc = 0;
4061                 }
4062
4063                 if (pplist != NULL) {
4064                         DTRACE_PROBE4(page__get,
4065                             lgrp_t *, lgrp,
4066                             int, mnode,
4067                             ulong_t, bin,
4068                             uint_t, flags);
4069
4070                         while (pplist != NULL && pg_cnt--) {
4071                                 ASSERT(pplist != NULL);
4072                                 pp = pplist;
4073                                 page_sub(&pplist, pp);
4074                                 PP_CLRFREE(pp);
4075                                 PP_CLRAGED(pp);
4076                                 page_list_concat(&pl, &pp);
4077                                 npgs--;
4078                                 like_pp = like_pp + 1;
4079                                 REPL_STAT_INCR(nnext_pp);
4080                         }
4081                         ASSERT(pg_cnt == 0);
4082                 } else {
4083                         break;
4084                 }
4085         }
4086
4087         if (npgs) {
4088                 /*
4089                  * We were unable to allocate the necessary number
4090                  * of pages.
4091                  * We need to free up any pl.
4092                  */
4093                 REPL_STAT_INCR(nnopage);
4094                 page_free_replacement_page(pl);
4095                 return (NULL);
4096         } else {
4097                 return (pl);
4098         }
4099 }
4100
4101 /*
4102  * demote a free large page to it's constituent pages
4103  */
4104 void
4105 page_demote_free_pages(page_t *pp)
4106 {
4107
4108         int mnode;
4109
4110         ASSERT(pp != NULL);
4111         ASSERT(PAGE_LOCKED(pp));
4112         ASSERT(PP_ISFREE(pp));
4113         ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4114
4115         mnode = PP_2_MEM_NODE(pp);
4116         page_freelist_lock(mnode);
4117         if (pp->p_szc != 0) {
4118                 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4119                     pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4120         }
4121         page_freelist_unlock(mnode);
4122         ASSERT(pp->p_szc == 0);
4123 }
4124
4125 /*
4126  * Factor in colorequiv to check additional 'equivalent' bins.
4127  * colorequiv may be set in /etc/system
4128  */
4129 void
4130 page_set_colorequiv_arr(void)
4131 {
4132         if (colorequiv > 1) {
4133                 int i;
4134                 uint_t sv_a = lowbit(colorequiv) - 1;
4135
4136                 if (sv_a > 15)
4137                         sv_a = 15;
4138
4139                 for (i = 0; i < MMU_PAGE_SIZES; i++) {
4140                         uint_t colors;
4141                         uint_t a = sv_a;
4142
4143                         if ((colors = hw_page_array[i].hp_colors) <= 1) {
4144                                 continue;
4145                         }
4146                         while ((colors >> a) == 0)
4147                                 a--;
4148                         if ((a << 4) > colorequivszc[i]) {
4149                                 colorequivszc[i] = (a << 4);
4150                         }
4151                 }
4152         }
4153 }