kernel/vm/vm_pagelist.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24
  25 /*
  26  * Copyright 2012 Joyent, Inc.  All rights reserved.
  27  */
  28
  29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  30 /*      All Rights Reserved   */
  31
  32 /*
  33  * Portions of this source code were derived from Berkeley 4.3 BSD
  34  * under license from the Regents of the University of California.
  35  */
  36
  37
  38 /*
  39  * This file contains common functions to access and manage the page lists.
  40  * Many of these routines originated from platform dependent modules
  41  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
  42  * a platform independent manner.
  43  *
  44  * vm/vm_dep.h provides for platform specific support.
  45  */
  46
  47 #include <sys/types.h>
  48 #include <sys/debug.h>
  49 #include <sys/cmn_err.h>
  50 #include <sys/systm.h>
  51 #include <sys/atomic.h>
  52 #include <sys/sysmacros.h>
  53 #include <vm/as.h>
  54 #include <vm/page.h>
  55 #include <vm/seg_kmem.h>
  56 #include <vm/seg_vn.h>
  57 #include <sys/vmsystm.h>
  58 #include <sys/memnode.h>
  59 #include <vm/vm_dep.h>
  60 #include <sys/lgrp.h>
  61 #include <sys/mem_config.h>
  62 #include <sys/callb.h>
  63 #include <sys/mem_cage.h>
  64 #include <sys/sdt.h>
  65 #include <sys/dumphdr.h>
  66 #include <sys/swap.h>
  67
/* Defined outside this file. */
extern uint_t   vac_colors;

/* Largest alignment this file requests through #pragma align. */
#define MAX_PRAGMA_ALIGN        128

/* vm_cpu_data0 for the boot cpu before kmem is initialized */

/*
 * Align vm_cpu_data0 to L2CACHE_ALIGN_MAX, capped at MAX_PRAGMA_ALIGN.
 * cpu_vm_data_init() hands this buffer to CPU0 instead of allocating one.
 */
#if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
#pragma align   L2CACHE_ALIGN_MAX(vm_cpu_data0)
#else
#pragma align   MAX_PRAGMA_ALIGN(vm_cpu_data0)
#endif
char            vm_cpu_data0[VM_CPU_DATA_PADSIZE];
  80
/*
 * number of page colors equivalent to requested color in page_get routines.
 * If set, keeps large pages intact longer and keeps MPO allocation
 * from the local mnode in favor of acquiring the 'correct' page color from
 * a demoted large page or from a remote mnode.
 */
uint_t  colorequiv;

/*
 * color equivalency mask for each page size.
 * Mask is computed based on cpu L2$ way sizes and colorequiv global.
 * High 4 bits determine the number of high order bits of the color to ignore.
 * Low 4 bits determine the number of low order bits of the color to ignore
 * (it's only relevant for hashed index based page coloring).
 */
uchar_t colorequivszc[MMU_PAGE_SIZES];

/*
 * if set, specifies the percentage of large pages that are free from within
 * a large page region before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available when page_trylock_contig_pages
 * can be more selective.
 */

int     ptcpthreshold;
 108
 109 /*
 110  * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
 111  * Enabled by default via pgcplimitsearch.
 112  *
 113  * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
 114  * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
 115  * bound. This upper bound range guarantees:
 116  *    - all large page 'slots' will be searched over time
 117  *    - the minimum (1) large page candidates considered on each pgcp call
 118  *    - count doesn't wrap around to 0
 119  */
 120 pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES];
 121 int     pgcplimitsearch = 1;
 122
 123 #define PGCPFAILMAX             (1 << (highbit(physinstalled) - 1))
 124 #define SETPGCPFAILCNT(szc)                                             \
 125         if (++pgcpfailcnt[szc] >= PGCPFAILMAX)                          \
 126                 pgcpfailcnt[szc] = PGCPFAILMAX / 2;
 127
/* Statistics structure, only present when VM_STATS is defined. */
#ifdef VM_STATS
struct vmm_vmstats_str  vmm_vmstats;

#endif /* VM_STATS */

/* Default for pg_lpgcreate_nocage: 0 on sparc, 1 everywhere else. */
#if defined(__sparc)
#define LPGCREATE       0
#else
/* enable page_get_contig_pages */
#define LPGCREATE       1
#endif

int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;

/*
 * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
 */
#define PFNNULL         0

/* Flags involved in promotion and demotion routines */
#define PC_FREE         0x1     /* put page on freelist */
#define PC_ALLOC        0x2     /* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 */
#define PC_NO_COLOR     (-1)

/* mtype value for page_promote to use when mtype does not matter */
#define PC_MTYPE_ANY    (-1)
 160
 161 /*
 162  * page counters candidates info
 163  * See page_ctrs_cands comment below for more details.
 164  * fields are as follows:
 165  *      pcc_pages_free:         # pages which freelist coalesce can create
 166  *      pcc_color_free:         pointer to page free counts per color
 167  */
 168 typedef struct pcc_info {
 169         pgcnt_t pcc_pages_free;
 170         pgcnt_t *pcc_color_free;
 171         uint_t  pad[12];
 172 } pcc_info_t;
 173
/*
 * On big machines it can take a long time to check page_counters
 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 * updated sum of all elements of the corresponding page_counters arrays.
 * page_freelist_coalesce() searches page_counters only if an appropriate
 * element of page_ctrs_cands array is greater than 0.
 *
 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
 */
pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), and region size (r)
 *
 * val must be an lvalue; it is zeroed and then accumulated into.  The
 * macro expands to a brace block that declares its own local "i", so an
 * argument expression that mentions a caller variable named "i" would see
 * the macro's loop index instead.
 */
#define PGCTRS_CANDS_GETVALUE(m, g, r, val) {                           \
        int i;                                                          \
        val = 0;                                                        \
        for (i = 0; i < NPC_MUTEX; i++) {                               \
            val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;    \
        }                                                               \
}

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), region size (r), and color (c)
 *
 * Same conventions as PGCTRS_CANDS_GETVALUE; additionally asserts that c
 * is below PAGE_GET_PAGECOLORS(r).
 */
#define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {                   \
        int i;                                                          \
        val = 0;                                                        \
        ASSERT((c) < PAGE_GET_PAGECOLORS(r));                           \
        for (i = 0; i < NPC_MUTEX; i++) {                               \
            val +=                                                      \
                page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];  \
        }                                                               \
}
 210
/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 */
static kmutex_t *ctr_mutex[NPC_MUTEX];

/*
 * Page number shifted down by the largest page size's shift, then masked
 * with (NPC_MUTEX - 1).  The mask only spreads indices evenly if NPC_MUTEX
 * is a power of two -- NOTE(review): confirm against its definition.
 */
#define PP_CTR_LOCK_INDX(pp)                                            \
        (((pp)->p_pagenum >>                                            \
            (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))

#define INVALID_COLOR 0xffffffff
#define INVALID_MASK  0xffffffff
 226
/*
 * Local function prototypes.
 * Only page_trylock_cons is declared static here; the others have external
 * linkage.
 */

void page_ctr_add(int, int, page_t *, int);
void page_ctr_add_internal(int, int, page_t *, int);
void page_ctr_sub(int, int, page_t *, int);
void page_ctr_sub_internal(int, int, page_t *, int);
void page_freelist_lock(int);
void page_freelist_unlock(int);
page_t *page_promote(int, pfn_t, uchar_t, int, int);
page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
page_t *page_freelist_split(uchar_t,
    uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
static int page_trylock_cons(page_t *pp, se_t se);
 243
 244 /*
 245  * The page_counters array below is used to keep track of free contiguous
 246  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
 247  * This contains an array of counters, the size of the array, a shift value
 248  * used to convert a pagenum into a counter array index or vice versa, as
 249  * well as a cache of the last successful index to be promoted to a larger
 250  * page size.  As an optimization, we keep track of the last successful index
 251  * to be promoted per page color for the given size region, and this is
 252  * allocated dynamically based upon the number of colors for a given
 253  * region size.
 254  *
 255  * Conceptually, the page counters are represented as:
 256  *
 257  *      page_counters[region_size][mnode]
 258  *
 259  *      region_size:    size code of a candidate larger page made up
 260  *                      of contiguous free smaller pages.
 261  *
 262  *      page_counters[region_size][mnode].hpm_counters[index]:
 263  *              represents how many (region_size - 1) pages either
 264  *              exist or can be created within the given index range.
 265  *
 266  * Let's look at a sparc example:
 267  *      If we want to create a free 512k page, we look at region_size 2
 268  *      for the mnode we want.  We calculate the index and look at a specific
 269  *      hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
 270  *      this location, it means that 8 64k pages either exist or can be created
 271  *      from 8K pages in order to make a single free 512k page at the given
 272  *      index.  Note that when a region is full, it will contribute to the
 273  *      counts in the region above it.  Thus we will not know what page
 274  *      size the free pages will be which can be promoted to this new free
 275  *      page unless we look at all regions below the current region.
 276  */
 277
/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic. The fields are as follows:
 *
 *      hpm_counters:   dynamically allocated array to hold counter data
 *      hpm_entries:    entries in hpm_counters
 *      hpm_shift:      shift for pnum/array index conv
 *      hpm_base:       PFN mapped to counter index 0
 *      hpm_color_current:      last index in counter array for this color at
 *                              which we successfully created a large page;
 *                              one array per mrange, indexed by color
 *      pad:            sparc only; not referenced in the visible part of
 *                      this file, presumably sizing padding (confirm)
 */
typedef struct hw_page_map {
        hpmctr_t        *hpm_counters;
        size_t          hpm_entries;
        int             hpm_shift;
        pfn_t           hpm_base;
        size_t          *hpm_color_current[MAX_MNODE_MRANGES];
#if defined(__sparc)
        uint_t          pad[4];
#endif
} hw_page_map_t;
 300
/*
 * Indexed by region size code; each element points at an array of
 * hw_page_map_t indexed by mnode (set up in page_ctrs_alloc()).
 * Element zero is not used, but is allocated for convenience.
 */
static hw_page_map_t *page_counters[MMU_PAGE_SIZES];

/*
 * Cached values of MNODE_RANGE_CNT(mnode) and MNODE_MAX_MRANGE(mnode),
 * filled in by page_ctrs_sz().
 * This is a function call in x86.
 */
static int mnode_nranges[MAX_MEM_NODES];
static int mnode_maxmrange[MAX_MEM_NODES];
 312
/*
 * The following macros are convenient ways to get access to the individual
 * elements of the page_counters arrays.  They can be used on both
 * the left side and right side of equations.
 */

/* Counter at index idx for region size rg_szc on mnode. */
#define PAGE_COUNTERS(mnode, rg_szc, idx)                       \
        (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])

/* Base pointer of the counter array. */
#define PAGE_COUNTERS_COUNTERS(mnode, rg_szc)                   \
        (page_counters[(rg_szc)][(mnode)].hpm_counters)

/* Shift used to convert between page number and counter index. */
#define PAGE_COUNTERS_SHIFT(mnode, rg_szc)                      \
        (page_counters[(rg_szc)][(mnode)].hpm_shift)

/* Number of entries in the counter array. */
#define PAGE_COUNTERS_ENTRIES(mnode, rg_szc)                    \
        (page_counters[(rg_szc)][(mnode)].hpm_entries)

/* PFN that corresponds to counter index 0. */
#define PAGE_COUNTERS_BASE(mnode, rg_szc)                       \
        (page_counters[(rg_szc)][(mnode)].hpm_base)

/* Per-color "current index" array for mrange g. */
#define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)             \
        (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])

/* One element of the above: current index for the given color and mrange. */
#define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)       \
        (page_counters[(rg_szc)][(mnode)].                              \
        hpm_color_current[(mrange)][(color)])

/* Page number -> counter index: subtract the base, shift right. */
#define PNUM_TO_IDX(mnode, rg_szc, pnum)                        \
        (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>    \
                PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))

/* Counter index -> first page number it covers: shift left, add the base. */
#define IDX_TO_PNUM(mnode, rg_szc, index)                       \
        (PAGE_COUNTERS_BASE((mnode), (rg_szc)) +                \
                ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
 347
/*
 * Protects the hpm_counters and hpm_color_current memory from changing while
 * looking at page counters information.
 * Grab the write lock to modify what these fields point at.
 * Grab the read lock to prevent any pointers from changing.
 * The write lock can not be held during memory allocation due to a possible
 * recursion deadlock with trying to grab the read lock while the
 * write lock is already held.
 *
 * One lock per memory node; all MAX_MEM_NODES entries are initialized at
 * the end of page_ctrs_alloc().
 */
krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
 358
 359
 360 /*
 361  * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
 362  */
 363 void
 364 cpu_vm_data_init(struct cpu *cp)
 365 {
 366         if (cp == CPU0) {
 367                 cp->cpu_vm_data = (void *)&vm_cpu_data0;
 368         } else {
 369                 void    *kmptr;
 370                 int     align;
 371                 size_t  sz;
 372
 373                 align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
 374                 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
 375                 kmptr = kmem_zalloc(sz, KM_SLEEP);
 376                 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
 377                 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
 378                 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
 379         }
 380 }
 381
 382 /*
 383  * free cpu_vm_data
 384  */
 385 void
 386 cpu_vm_data_destroy(struct cpu *cp)
 387 {
 388         if (cp->cpu_seqid && cp->cpu_vm_data) {
 389                 ASSERT(cp != CPU0);
 390                 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
 391                     ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
 392         }
 393         cp->cpu_vm_data = NULL;
 394 }
 395
 396
 397 /*
 398  * page size to page size code
 399  */
 400 int
 401 page_szc(size_t pagesize)
 402 {
 403         int     i = 0;
 404
 405         while (hw_page_array[i].hp_size) {
 406                 if (pagesize == hw_page_array[i].hp_size)
 407                         return (i);
 408                 i++;
 409         }
 410         return (-1);
 411 }
 412
 413 /*
 414  * page size to page size code with the restriction that it be a supported
 415  * user page size.  If it's not a supported user page size, -1 will be returned.
 416  */
 417 int
 418 page_szc_user_filtered(size_t pagesize)
 419 {
 420         int szc = page_szc(pagesize);
 421         if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
 422                 return (szc);
 423         }
 424         return (-1);
 425 }
 426
 427 /*
 428  * Return how many page sizes are available for the user to use.  This is
 429  * what the hardware supports and not based upon how the OS implements the
 430  * support of different page sizes.
 431  *
 432  * If legacy is non-zero, return the number of pagesizes available to legacy
 433  * applications. The number of legacy page sizes might be less than the
 434  * exported user page sizes. This is to prevent legacy applications that
 435  * use the largest page size returned from getpagesizes(3c) from inadvertantly
 436  * using the 'new' large pagesizes.
 437  */
 438 uint_t
 439 page_num_user_pagesizes(int legacy)
 440 {
 441         if (legacy)
 442                 return (mmu_legacy_page_sizes);
 443         return (mmu_exported_page_sizes);
 444 }
 445
/*
 * Return mmu_page_sizes, the bound against which the page_get_*() accessors
 * in this file range-check a page size code.
 */
uint_t
page_num_pagesizes(void)
{
        return (mmu_page_sizes);
}
 451
 452 /*
 453  * returns the count of the number of base pagesize pages associated with szc
 454  */
 455 pgcnt_t
 456 page_get_pagecnt(uint_t szc)
 457 {
 458         if (szc >= mmu_page_sizes)
 459                 panic("page_get_pagecnt: out of range %d", szc);
 460         return (hw_page_array[szc].hp_pgcnt);
 461 }
 462
 463 size_t
 464 page_get_pagesize(uint_t szc)
 465 {
 466         if (szc >= mmu_page_sizes)
 467                 panic("page_get_pagesize: out of range %d", szc);
 468         return (hw_page_array[szc].hp_size);
 469 }
 470
 471 /*
 472  * Return the size of a page based upon the index passed in.  An index of
 473  * zero refers to the smallest page size in the system, and as index increases
 474  * it refers to the next larger supported page size in the system.
 475  * Note that szc and userszc may not be the same due to unsupported szc's on
 476  * some systems.
 477  */
 478 size_t
 479 page_get_user_pagesize(uint_t userszc)
 480 {
 481         uint_t szc = USERSZC_2_SZC(userszc);
 482
 483         if (szc >= mmu_page_sizes)
 484                 panic("page_get_user_pagesize: out of range %d", szc);
 485         return (hw_page_array[szc].hp_size);
 486 }
 487
 488 uint_t
 489 page_get_shift(uint_t szc)
 490 {
 491         if (szc >= mmu_page_sizes)
 492                 panic("page_get_shift: out of range %d", szc);
 493         return (PAGE_GET_SHIFT(szc));
 494 }
 495
 496 uint_t
 497 page_get_pagecolors(uint_t szc)
 498 {
 499         if (szc >= mmu_page_sizes)
 500                 panic("page_get_pagecolors: out of range %d", szc);
 501         return (PAGE_GET_PAGECOLORS(szc));
 502 }
 503
 504 /*
 505  * this assigns the desired equivalent color after a split
 506  */
 507 uint_t
 508 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
 509     uint_t ncolor, uint_t ceq_mask)
 510 {
 511         ASSERT(nszc > szc);
 512         ASSERT(szc < mmu_page_sizes);
 513         ASSERT(color < PAGE_GET_PAGECOLORS(szc));
 514         ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
 515
 516         color &= ceq_mask;
 517         ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
 518         return (color | (ncolor & ~ceq_mask));
 519 }
 520
/*
 * The interleaved_mnodes flag is set when mnodes overlap in
 * the physbase..physmax range, but have disjoint slices.
 * In this case hpm_counters is shared by all mnodes.
 * This flag is set dynamically by the platform.
 * Defaults to 0 (not interleaved).
 */
int interleaved_mnodes = 0;
 528
 529 /*
 530  * Called by startup().
 531  * Size up the per page size free list counters based on physmax
 532  * of each node and max_mem_nodes.
 533  *
 534  * If interleaved_mnodes is set we need to find the first mnode that
 535  * exists. hpm_counters for the first mnode will then be shared by
 536  * all other mnodes. If interleaved_mnodes is not set, just set
 537  * first=mnode each time. That means there will be no sharing.
 538  */
 539 size_t
 540 page_ctrs_sz(void)
 541 {
 542         int     r;              /* region size */
 543         int     mnode;
 544         int     firstmn;        /* first mnode that exists */
 545         int     nranges;
 546         pfn_t   physbase;
 547         pfn_t   physmax;
 548         uint_t  ctrs_sz = 0;
 549         int     i;
 550         pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
 551
 552         /*
 553          * We need to determine how many page colors there are for each
 554          * page size in order to allocate memory for any color specific
 555          * arrays.
 556          */
 557         for (i = 0; i < mmu_page_sizes; i++) {
 558                 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
 559         }
 560
 561         for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
 562
 563                 pgcnt_t r_pgcnt;
 564                 pfn_t   r_base;
 565                 pgcnt_t r_align;
 566
 567                 if (mem_node_config[mnode].exists == 0)
 568                         continue;
 569
 570                 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
 571                 nranges = MNODE_RANGE_CNT(mnode);
 572                 mnode_nranges[mnode] = nranges;
 573                 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
 574
 575                 /*
 576                  * determine size needed for page counter arrays with
 577                  * base aligned to large page size.
 578                  */
 579                 for (r = 1; r < mmu_page_sizes; r++) {
 580                         /* add in space for hpm_color_current */
 581                         ctrs_sz += sizeof (size_t) *
 582                             colors_per_szc[r] * nranges;
 583
 584                         if (firstmn != mnode)
 585                                 continue;
 586
 587                         /* add in space for hpm_counters */
 588                         r_align = page_get_pagecnt(r);
 589                         r_base = physbase;
 590                         r_base &= ~(r_align - 1);
 591                         r_pgcnt = howmany(physmax - r_base + 1, r_align);
 592
 593                         /*
 594                          * Round up to always allocate on pointer sized
 595                          * boundaries.
 596                          */
 597                         ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
 598                             sizeof (hpmctr_t *));
 599                 }
 600         }
 601
 602         for (r = 1; r < mmu_page_sizes; r++) {
 603                 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
 604         }
 605
 606         /* add in space for page_ctrs_cands and pcc_color_free */
 607         ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
 608             mmu_page_sizes * NPC_MUTEX;
 609
 610         for (mnode = 0; mnode < max_mem_nodes; mnode++) {
 611
 612                 if (mem_node_config[mnode].exists == 0)
 613                         continue;
 614
 615                 nranges = mnode_nranges[mnode];
 616                 ctrs_sz += sizeof (pcc_info_t) * nranges *
 617                     mmu_page_sizes * NPC_MUTEX;
 618                 for (r = 1; r < mmu_page_sizes; r++) {
 619                         ctrs_sz += sizeof (pgcnt_t) * nranges *
 620                             colors_per_szc[r] * NPC_MUTEX;
 621                 }
 622         }
 623
 624         /* ctr_mutex */
 625         ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
 626
 627         /* size for page list counts */
 628         PLCNT_SZ(ctrs_sz);
 629
 630         /*
 631          * add some slop for roundups. page_ctrs_alloc will roundup the start
 632          * address of the counters to ecache_alignsize boundary for every
 633          * memory node.
 634          */
 635         return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
 636 }
 637
/*
 * Carve the page counter data structures out of the memory that starts at
 * alloc_base and return the address just past what was consumed.
 *
 * The pieces are laid out in this order: the hw_page_map_t arrays, the
 * page_ctrs_cands tables with their pcc_color_free arrays, the ctr_mutex
 * arrays, the page list counts (PLCNT_INIT), and finally, per existing
 * mnode, the hpm_color_current arrays and hpm_counters.  page_ctrs_sz()
 * computes the amount of memory this needs, and relies on mnode_nranges[]
 * having been filled in there.
 */
caddr_t
page_ctrs_alloc(caddr_t alloc_base)
{
        int     mnode;
        int     mrange, nranges;
        int     r;              /* region size */
        int     i;
        int     firstmn;        /* first mnode that exists */
        pfn_t   physbase;
        pfn_t   physmax;
        pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

        /*
         * We need to determine how many page colors there are for each
         * page size in order to allocate memory for any color specific
         * arrays.
         */
        for (i = 0; i < mmu_page_sizes; i++) {
                colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
        }

        /* one array of max_mem_nodes hw_page_map_t per region size */
        for (r = 1; r < mmu_page_sizes; r++) {
                page_counters[r] = (hw_page_map_t *)alloc_base;
                alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
        }

        /* page_ctrs_cands and pcc_color_free array */
        for (i = 0; i < NPC_MUTEX; i++) {
                for (r = 1; r < mmu_page_sizes; r++) {

                        /* table of per-mnode pcc_info_t pointers */
                        page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
                        alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;

                        for (mnode = 0; mnode < max_mem_nodes; mnode++) {
                                pcc_info_t *pi;

                                if (mem_node_config[mnode].exists == 0)
                                        continue;

                                nranges = mnode_nranges[mnode];

                                /* one pcc_info_t per mrange of this mnode */
                                pi = (pcc_info_t *)alloc_base;
                                alloc_base += sizeof (pcc_info_t) * nranges;
                                page_ctrs_cands[i][r][mnode] = pi;

                                /* followed by a per-color array for each */
                                for (mrange = 0; mrange < nranges; mrange++) {
                                        pi->pcc_color_free =
                                            (pgcnt_t *)alloc_base;
                                        alloc_base += sizeof (pgcnt_t) *
                                            colors_per_szc[r];
                                        pi++;
                                }
                        }
                }
        }

        /* ctr_mutex */
        for (i = 0; i < NPC_MUTEX; i++) {
                ctr_mutex[i] = (kmutex_t *)alloc_base;
                alloc_base += (max_mem_nodes * sizeof (kmutex_t));
        }

        /* initialize page list counts */
        PLCNT_INIT(alloc_base);

        for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {

                pgcnt_t r_pgcnt;
                pfn_t   r_base;
                pgcnt_t r_align;
                int     r_shift;
                /* NOTE: shadows the function-scope nranges */
                int     nranges = mnode_nranges[mnode];

                if (mem_node_config[mnode].exists == 0)
                        continue;

                /* platform macro; expected to set physbase/physmax/firstmn */
                HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);

                for (r = 1; r < mmu_page_sizes; r++) {
                        /*
                         * the page_counters base has to be aligned to the
                         * page count of page size code r otherwise the counts
                         * will cross large page boundaries.
                         */
                        r_align = page_get_pagecnt(r);
                        r_base = physbase;
                        /* base needs to be aligned - lower to aligned value */
                        r_base &= ~(r_align - 1);
                        r_pgcnt = howmany(physmax - r_base + 1, r_align);
                        r_shift = PAGE_BSZS_SHIFT(r);

                        PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
                        PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
                        PAGE_COUNTERS_BASE(mnode, r) = r_base;
                        /* one per-color "current index" array per mrange */
                        for (mrange = 0; mrange < nranges; mrange++) {
                                PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
                                    r, mrange) = (size_t *)alloc_base;
                                alloc_base += sizeof (size_t) *
                                    colors_per_szc[r];
                        }
                        /*
                         * Seed every color's current index: the index of
                         * the pfn produced by PAGE_NEXT_PFN_FOR_COLOR
                         * starting from r_base, or 0 when the iterator
                         * reports no pfn or the index is out of range.
                         */
                        for (i = 0; i < colors_per_szc[r]; i++) {
                                uint_t color_mask = colors_per_szc[r] - 1;
                                pfn_t  pfnum = r_base;
                                size_t idx;
                                /* NOTE: shadows the function-scope mrange */
                                int mrange;
                                MEM_NODE_ITERATOR_DECL(it);

                                MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
                                if (pfnum == (pfn_t)-1) {
                                        idx = 0;
                                } else {
                                        PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
                                            color_mask, color_mask, &it);
                                        idx = PNUM_TO_IDX(mnode, r, pfnum);
                                        idx = (idx >= r_pgcnt) ? 0 : idx;
                                }
                                for (mrange = 0; mrange < nranges; mrange++) {
                                        PAGE_COUNTERS_CURRENT_COLOR(mnode,
                                            r, i, mrange) = idx;
                                }
                        }

                        /* hpm_counters may be shared by all mnodes */
                        if (firstmn == mnode) {
                                PAGE_COUNTERS_COUNTERS(mnode, r) =
                                    (hpmctr_t *)alloc_base;
                                alloc_base +=
                                    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
                                    sizeof (hpmctr_t *));
                        } else {
                                PAGE_COUNTERS_COUNTERS(mnode, r) =
                                    PAGE_COUNTERS_COUNTERS(firstmn, r);
                        }

                        /*
                         * Verify that PNUM_TO_IDX and IDX_TO_PNUM
                         * satisfy the identity requirement.
                         * We should be able to go from one to the other
                         * and get consistent values.
                         */
                        ASSERT(PNUM_TO_IDX(mnode, r,
                            (IDX_TO_PNUM(mnode, r, 0))) == 0);
                        ASSERT(IDX_TO_PNUM(mnode, r,
                            (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
                }
                /*
                 * Roundup the start address of the page_counters to
                 * cache aligned boundary for every memory node.
                 * page_ctrs_sz() has added some slop for these roundups.
                 */
                alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
                    L2CACHE_ALIGN);
        }

        /* Initialize other page counter specific data structures. */
        for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
                rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
        }

        return (alloc_base);
}
 799
/*
 * Functions to adjust the region counters for each page size free list.
 * The caller is responsible for acquiring the ctr_mutex lock if necessary;
 * the *_internal variants take no locks themselves and can therefore be
 * called during startup without locks.
 */
 805 /* ARGSUSED */
 806 void
 807 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
 808 {
 809         ssize_t         r;      /* region size */
 810         ssize_t         idx;
 811         pfn_t           pfnum;
 812         int             lckidx;
 813
 814         ASSERT(mnode == PP_2_MEM_NODE(pp));
 815         ASSERT(mtype == PP_2_MTYPE(pp));
 816
 817         ASSERT(pp->p_szc < mmu_page_sizes);
 818
 819         PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
 820
 821         /* no counter update needed for largest page size */
 822         if (pp->p_szc >= mmu_page_sizes - 1) {
 823                 return;
 824         }
 825
 826         r = pp->p_szc + 1;
 827         pfnum = pp->p_pagenum;
 828         lckidx = PP_CTR_LOCK_INDX(pp);
 829
 830         /*
 831          * Increment the count of free pages for the current
 832          * region. Continue looping up in region size incrementing
 833          * count if the preceeding region is full.
 834          */
 835         while (r < mmu_page_sizes) {
 836                 idx = PNUM_TO_IDX(mnode, r, pfnum);
 837
 838                 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
 839                 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
 840
 841                 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
 842                         break;
 843                 } else {
 844                         int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
 845                         pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
 846                             [MTYPE_2_MRANGE(mnode, root_mtype)];
 847
 848                         cand->pcc_pages_free++;
 849                         cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
 850                 }
 851                 r++;
 852         }
 853 }
 854
 855 void
 856 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
 857 {
 858         int             lckidx = PP_CTR_LOCK_INDX(pp);
 859         kmutex_t        *lock = &ctr_mutex[lckidx][mnode];
 860
 861         mutex_enter(lock);
 862         page_ctr_add_internal(mnode, mtype, pp, flags);
 863         mutex_exit(lock);
 864 }
 865
 866 void
 867 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
 868 {
 869         int             lckidx;
 870         ssize_t         r;      /* region size */
 871         ssize_t         idx;
 872         pfn_t           pfnum;
 873
 874         ASSERT(mnode == PP_2_MEM_NODE(pp));
 875         ASSERT(mtype == PP_2_MTYPE(pp));
 876
 877         ASSERT(pp->p_szc < mmu_page_sizes);
 878
 879         PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
 880
 881         /* no counter update needed for largest page size */
 882         if (pp->p_szc >= mmu_page_sizes - 1) {
 883                 return;
 884         }
 885
 886         r = pp->p_szc + 1;
 887         pfnum = pp->p_pagenum;
 888         lckidx = PP_CTR_LOCK_INDX(pp);
 889
 890         /*
 891          * Decrement the count of free pages for the current
 892          * region. Continue looping up in region size decrementing
 893          * count if the preceeding region was full.
 894          */
 895         while (r < mmu_page_sizes) {
 896                 idx = PNUM_TO_IDX(mnode, r, pfnum);
 897
 898                 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
 899                 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
 900
 901                 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
 902                         break;
 903                 } else {
 904                         int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
 905                         pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
 906                             [MTYPE_2_MRANGE(mnode, root_mtype)];
 907
 908                         ASSERT(cand->pcc_pages_free != 0);
 909                         ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
 910
 911                         cand->pcc_pages_free--;
 912                         cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
 913                 }
 914                 r++;
 915         }
 916 }
 917
 918 void
 919 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
 920 {
 921         int             lckidx = PP_CTR_LOCK_INDX(pp);
 922         kmutex_t        *lock = &ctr_mutex[lckidx][mnode];
 923
 924         mutex_enter(lock);
 925         page_ctr_sub_internal(mnode, mtype, pp, flags);
 926         mutex_exit(lock);
 927 }
 928
/*
 * Adjust page counters following a memory attach, since typically the
 * size of the array needs to change, and the PFN to counter index
 * mapping needs to change.
 *
 * It is possible this mnode did not exist at startup. In that case
 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
 * to change (a theoretical possibility on x86), which means pcc_color_free
 * arrays must be extended.
 *
 * The work is done in three phases:
 *   1. Preallocate every replacement array with KM_NOSLEEP, before any
 *      lock is taken.
 *   2. Under the page counters write lock for mnode, install the new
 *      arrays, carrying over the overlapping part of the old counters and
 *      the old pcc_info_t contents.
 *   3. After the lock has been dropped, free whatever is left in the cache
 *      arrays: unused preallocations on failure, replaced arrays on
 *      success.
 *
 * Returns 0 on success, or ENOMEM if any preallocation fails, in which
 * case the existing page counters are left unmodified.
 */
uint_t
page_ctrs_adjust(int mnode)
{
        pgcnt_t npgs;
        int     r;              /* region size */
        int     i;
        size_t  pcsz, old_csz;
        hpmctr_t *new_ctr, *old_ctr;
        pfn_t   oldbase, newbase;
        pfn_t   physbase, physmax;
        size_t  old_npgs;
        /*
         * The *_cache arrays below serve two purposes.  They first hold the
         * preallocated replacement arrays; once a replacement has been
         * installed its slot is reused to remember the array it replaced,
         * so that the cleanup code can free it after the lock is dropped.
         */
        hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
        size_t  size_cache[MMU_PAGE_SIZES];     /* entry counts of ctr_cache */
        size_t  *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
        size_t  *old_color_array[MAX_MNODE_MRANGES];
        pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
        pcc_info_t **cands_cache;       /* indexed [i * MMU_PAGE_SIZES + r] */
        pcc_info_t *old_pi, *pi;
        pgcnt_t *pgcntp;
        int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
        int cands_cache_nranges;        /* length of arrays in cands_cache */
        int old_maxmrange, new_maxmrange;
        int rc = 0;
        int oldmnode;

        /* nothing else has been allocated yet, so a plain return is safe */
        cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
            MMU_PAGE_SIZES, KM_NOSLEEP);
        if (cands_cache == NULL)
                return (ENOMEM);

        /*
         * NOTE(review): HPM_COUNTERS_LIMITS() is defined elsewhere;
         * presumably it sets physbase/physmax to the pfn limits the
         * counters of this mnode must cover - confirm against vm_dep.h.
         */
        i = -1;
        HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);

        newbase = physbase & ~PC_BASE_ALIGN_MASK;
        npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;

        /*
         * prepare to free non-null pointers on the way out; until the old
         * arrays are swapped in below, cands_cache holds arrays of nranges
         * entries.
         */
        cands_cache_nranges = nranges;
        bzero(ctr_cache, sizeof (ctr_cache));
        bzero(color_cache, sizeof (color_cache));

        /*
         * We need to determine how many page colors there are for each
         * page size in order to allocate memory for any color specific
         * arrays.
         */
        for (r = 0; r < mmu_page_sizes; r++) {
                colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
        }

        /*
         * Preallocate all of the new hpm_counters arrays as we can't
         * hold the page_ctrs_rwlock as a writer and allocate memory.
         * If we can't allocate all of the arrays, undo our work so far
         * and return failure.
         */
        for (r = 1; r < mmu_page_sizes; r++) {
                pcsz = npgs >> PAGE_BSZS_SHIFT(r);
                size_cache[r] = pcsz;
                ctr_cache[r] = kmem_zalloc(pcsz *
                    sizeof (hpmctr_t), KM_NOSLEEP);
                if (ctr_cache[r] == NULL) {
                        rc = ENOMEM;
                        goto cleanup;
                }
        }

        /*
         * Preallocate all of the new color current arrays as we can't
         * hold the page_ctrs_rwlock as a writer and allocate memory.
         * If we can't allocate all of the arrays, undo our work so far
         * and return failure.
         */
        for (r = 1; r < mmu_page_sizes; r++) {
                for (mrange = 0; mrange < nranges; mrange++) {
                        color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
                            colors_per_szc[r], KM_NOSLEEP);
                        if (color_cache[r][mrange] == NULL) {
                                rc = ENOMEM;
                                goto cleanup;
                        }
                }
        }

        /*
         * Preallocate all of the new pcc_info_t arrays as we can't
         * hold the page_ctrs_rwlock as a writer and allocate memory.
         * If we can't allocate all of the arrays, undo our work so far
         * and return failure.
         */
        for (r = 1; r < mmu_page_sizes; r++) {
                for (i = 0; i < NPC_MUTEX; i++) {
                        pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
                            KM_NOSLEEP);
                        if (pi == NULL) {
                                rc = ENOMEM;
                                goto cleanup;
                        }
                        /*
                         * Record the array before filling it in so that a
                         * later failure still finds and frees it.
                         */
                        cands_cache[i * MMU_PAGE_SIZES + r] = pi;

                        for (mrange = 0; mrange < nranges; mrange++, pi++) {
                                pgcntp = kmem_zalloc(colors_per_szc[r] *
                                    sizeof (pgcnt_t), KM_NOSLEEP);
                                if (pgcntp == NULL) {
                                        rc = ENOMEM;
                                        goto cleanup;
                                }
                                pi->pcc_color_free = pgcntp;
                        }
                }
        }

        /*
         * Grab the write lock to prevent others from walking these arrays
         * while we are modifying them.
         */
        PAGE_CTRS_WRITE_LOCK(mnode);

        /*
         * For interleaved mnodes, find the first mnode
         * with valid page counters since the current
         * mnode may have just been added and not have
         * valid page counters.
         */
        if (interleaved_mnodes) {
                for (i = 0; i < max_mem_nodes; i++)
                        if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
                                break;
                ASSERT(i < max_mem_nodes);
                oldmnode = i;
        } else
                oldmnode = mnode;

        /*
         * From here on cands_cache is going to receive the old pcc_info_t
         * arrays, which have old_nranges entries each.
         */
        old_nranges = mnode_nranges[mnode];
        cands_cache_nranges = old_nranges;
        mnode_nranges[mnode] = nranges;
        old_maxmrange = mnode_maxmrange[mnode];
        mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
        new_maxmrange = mnode_maxmrange[mnode];

        for (r = 1; r < mmu_page_sizes; r++) {
                /* capture the current state before it is overwritten */
                PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
                old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
                old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
                oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
                old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
                for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
                        old_color_array[mrange] =
                            PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
                            r, mrange);
                }

                /* take ownership of the preallocated counter array */
                pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
                new_ctr = ctr_cache[r];
                ctr_cache[r] = NULL;
                if (old_ctr != NULL &&
                    (oldbase + old_npgs > newbase) &&
                    (newbase + npgs > oldbase)) {
                        /*
                         * Map the intersection of the old and new
                         * counters into the new array.
                         */
                        size_t offset;
                        if (newbase > oldbase) {
                                /* new range starts inside the old one */
                                offset = (newbase - oldbase) >>
                                    PAGE_COUNTERS_SHIFT(mnode, r);
                                bcopy(old_ctr + offset, new_ctr,
                                    MIN(pcsz, (old_csz - offset)) *
                                    sizeof (hpmctr_t));
                        } else {
                                /* old range starts at or inside the new one */
                                offset = (oldbase - newbase) >>
                                    PAGE_COUNTERS_SHIFT(mnode, r);
                                bcopy(old_ctr, new_ctr + offset,
                                    MIN(pcsz - offset, old_csz) *
                                    sizeof (hpmctr_t));
                        }
                }

                PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
                PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
                PAGE_COUNTERS_BASE(mnode, r) = newbase;

                /* update shared hpm_counters in other mnodes */
                if (interleaved_mnodes) {
                        for (i = 0; i < max_mem_nodes; i++) {
                                if ((i == mnode) ||
                                    (mem_node_config[i].exists == 0))
                                        continue;
                                ASSERT(
                                    PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
                                    PAGE_COUNTERS_COUNTERS(i, r) == NULL);
                                PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
                                PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
                                PAGE_COUNTERS_BASE(i, r) = newbase;
                        }
                }

                /*
                 * Install the new color current arrays.  Slots at or beyond
                 * nranges were never preallocated and therefore become NULL.
                 */
                for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
                        PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
                            color_cache[r][mrange];
                        color_cache[r][mrange] = NULL;
                }
                /*
                 * for now, just reset on these events as it's probably
                 * not worthwhile to try and optimize this.
                 */
                for (i = 0; i < colors_per_szc[r]; i++) {
                        uint_t color_mask = colors_per_szc[r] - 1;
                        int mlo = interleaved_mnodes ? 0 : mnode;
                        int mhi = interleaved_mnodes ? max_mem_nodes :
                            (mnode + 1);
                        int m;
                        pfn_t  pfnum;
                        size_t idx;
                        MEM_NODE_ITERATOR_DECL(it);

                        for (m = mlo; m < mhi; m++) {
                                if (mem_node_config[m].exists == 0)
                                        continue;
                                pfnum = newbase;
                                MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
                                if (pfnum == (pfn_t)-1) {
                                        idx = 0;
                                } else {
                                        PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
                                            color_mask, color_mask, &it);
                                        idx = PNUM_TO_IDX(m, r, pfnum);
                                        /* fall back to 0 when out of range */
                                        idx = (idx < pcsz) ? idx : 0;
                                }
                                for (mrange = 0; mrange < nranges; mrange++) {
                                        if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
                                            r, mrange) != NULL)
                                                PAGE_COUNTERS_CURRENT_COLOR(m,
                                                    r, i, mrange) = idx;
                                }
                        }
                }

                /*
                 * cache info for freeing out of the critical path
                 *
                 * Only pointers inside [kernelheap, ekernelheap) are queued
                 * for kmem_free().  NOTE(review): presumably arrays set up
                 * at boot by page_ctrs_alloc() live outside the kernel heap
                 * and must never be handed to kmem_free() - confirm.
                 */
                if ((caddr_t)old_ctr >= kernelheap &&
                    (caddr_t)old_ctr < ekernelheap) {
                        ctr_cache[r] = old_ctr;
                        size_cache[r] = old_csz;
                }
                for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
                        size_t *tmp = old_color_array[mrange];
                        if ((caddr_t)tmp >= kernelheap &&
                            (caddr_t)tmp < ekernelheap) {
                                color_cache[r][mrange] = tmp;
                        }
                }
                /*
                 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
                 * satisfy the identity requirement.
                 * We should be able to go from one to the other
                 * and get consistent values.
                 */
                ASSERT(PNUM_TO_IDX(mnode, r,
                    (IDX_TO_PNUM(mnode, r, 0))) == 0);
                ASSERT(IDX_TO_PNUM(mnode, r,
                    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);

                /* pcc_info_t and pcc_color_free */
                for (i = 0; i < NPC_MUTEX; i++) {
                        pcc_info_t *epi;
                        pcc_info_t *eold_pi;

                        /*
                         * Install the new array and park the old one in
                         * cands_cache so the cleanup code frees it.
                         */
                        pi = cands_cache[i * MMU_PAGE_SIZES + r];
                        old_pi = page_ctrs_cands[i][r][mnode];
                        page_ctrs_cands[i][r][mnode] = pi;
                        cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;

                        /* preserve old pcc_color_free values, if any */
                        if (old_pi == NULL)
                                continue;

                        /*
                         * when/if x86 does DR, must account for
                         * possible change in range index when
                         * preserving pcc_info
                         */
                        epi = &pi[nranges];
                        eold_pi = &old_pi[old_nranges];
                        if (new_maxmrange > old_maxmrange) {
                                pi += new_maxmrange - old_maxmrange;
                        } else if (new_maxmrange < old_maxmrange) {
                                old_pi += old_maxmrange - new_maxmrange;
                        }
                        /*
                         * Swap entries rather than copy them: the live array
                         * receives the old contents while the freshly
                         * allocated pcc_color_free arrays end up in the old
                         * array, where the cleanup code frees them.
                         */
                        for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
                                pcc_info_t tmp = *pi;
                                *pi = *old_pi;
                                *old_pi = tmp;
                        }
                }
        }
        PAGE_CTRS_WRITE_UNLOCK(mnode);

        /*
         * Now that we have dropped the write lock, it is safe to free all
         * of the memory we have cached above.
         * We come thru here to free memory when pre-alloc fails, and also to
         * free old pointers which were recorded while locked.
         */
cleanup:
        for (r = 1; r < mmu_page_sizes; r++) {
                if (ctr_cache[r] != NULL) {
                        kmem_free(ctr_cache[r],
                            size_cache[r] * sizeof (hpmctr_t));
                }
                for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
                        if (color_cache[r][mrange] != NULL) {
                                kmem_free(color_cache[r][mrange],
                                    colors_per_szc[r] * sizeof (size_t));
                        }
                }
                for (i = 0; i < NPC_MUTEX; i++) {
                        pi = cands_cache[i * MMU_PAGE_SIZES + r];
                        if (pi == NULL)
                                continue;
                        /* array length depends on which phase we came from */
                        nr = cands_cache_nranges;
                        for (mrange = 0; mrange < nr; mrange++, pi++) {
                                pgcntp = pi->pcc_color_free;
                                if (pgcntp == NULL)
                                        continue;
                                if ((caddr_t)pgcntp >= kernelheap &&
                                    (caddr_t)pgcntp < ekernelheap) {
                                        kmem_free(pgcntp,
                                            colors_per_szc[r] *
                                            sizeof (pgcnt_t));
                                }
                        }
                        /* pi was advanced above; reload the array base */
                        pi = cands_cache[i * MMU_PAGE_SIZES + r];
                        if ((caddr_t)pi >= kernelheap &&
                            (caddr_t)pi < ekernelheap) {
                                kmem_free(pi, nr * sizeof (pcc_info_t));
                        }
                }
        }

        kmem_free(cands_cache,
            sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
        return (rc);
}
1282
1283 /*
1284  * Cleanup the hpm_counters field in the page counters
1285  * array.
1286  */
1287 void
1288 page_ctrs_cleanup(void)
1289 {
1290         int r;  /* region size */
1291         int i;  /* mnode index */
1292
1293         /*
1294          * Get the page counters write lock while we are
1295          * setting the page hpm_counters field to NULL
1296          * for non-existent mnodes.
1297          */
1298         for (i = 0; i < max_mem_nodes; i++) {
1299                 PAGE_CTRS_WRITE_LOCK(i);
1300                 if (mem_node_config[i].exists) {
1301                         PAGE_CTRS_WRITE_UNLOCK(i);
1302                         continue;
1303                 }
1304                 for (r = 1; r < mmu_page_sizes; r++) {
1305                         PAGE_COUNTERS_COUNTERS(i, r) = NULL;
1306                 }
1307                 PAGE_CTRS_WRITE_UNLOCK(i);
1308         }
1309 }
1310
1311 #ifdef DEBUG
1312
1313 /*
1314  * confirm pp is a large page corresponding to szc
1315  */
1316 void
1317 chk_lpg(page_t *pp, uchar_t szc)
1318 {
1319         spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1320         uint_t noreloc;
1321
1322         if (npgs == 1) {
1323                 ASSERT(pp->p_szc == 0);
1324                 ASSERT(pp->p_next == pp);
1325                 ASSERT(pp->p_prev == pp);
1326                 return;
1327         }
1328
1329         ASSERT(pp->p_list.largepg.next == pp || pp->p_list.largepg.next == NULL);
1330         ASSERT(pp->p_list.largepg.prev == pp || pp->p_list.largepg.prev == NULL);
1331
1332         ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1333         ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1334         ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1335         ASSERT(pp->p_prev == (pp + (npgs - 1)));
1336
1337         /*
1338          * Check list of pages.
1339          */
1340         noreloc = PP_ISNORELOC(pp);
1341         while (npgs--) {
1342                 if (npgs != 0) {
1343                         ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1344                         ASSERT(pp->p_next == (pp + 1));
1345                 }
1346                 ASSERT(pp->p_szc == szc);
1347                 ASSERT(PP_ISFREE(pp));
1348                 ASSERT(PP_ISAGED(pp));
1349                 ASSERT(pp->p_list.largepg.next == pp || pp->p_list.largepg.next == NULL);
1350                 ASSERT(pp->p_list.largepg.prev == pp || pp->p_list.largepg.prev == NULL);
1351                 ASSERT(pp->p_vnode  == NULL);
1352                 ASSERT(PP_ISNORELOC(pp) == noreloc);
1353
1354                 pp = pp->p_next;
1355         }
1356 }
1357 #endif /* DEBUG */
1358
1359 void
1360 page_freelist_lock(int mnode)
1361 {
1362         int i;
1363         for (i = 0; i < NPC_MUTEX; i++) {
1364                 mutex_enter(FPC_MUTEX(mnode, i));
1365                 mutex_enter(CPC_MUTEX(mnode, i));
1366         }
1367 }
1368
1369 void
1370 page_freelist_unlock(int mnode)
1371 {
1372         int i;
1373         for (i = 0; i < NPC_MUTEX; i++) {
1374                 mutex_exit(FPC_MUTEX(mnode, i));
1375                 mutex_exit(CPC_MUTEX(mnode, i));
1376         }
1377 }
1378
1379 /*
1380  * add pp to the specified page list. Defaults to head of the page list
1381  * unless PG_LIST_TAIL is specified.
1382  */
1383 void
1384 page_list_add(page_t *pp, int flags)
1385 {
1386         page_t          **ppp;
1387         kmutex_t        *pcm;
1388         uint_t          bin, mtype;
1389         int             mnode;
1390
1391         ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1392         ASSERT(PP_ISFREE(pp));
1393         ASSERT(!hat_page_is_mapped(pp));
1394         ASSERT(hat_page_getshare(pp) == 0);
1395
1396         /*
1397          * Large pages should be freed via page_list_add_pages().
1398          */
1399         ASSERT(pp->p_szc == 0);
1400
1401         /*
1402          * Don't need to lock the freelist first here
1403          * because the page isn't on the freelist yet.
1404          * This means p_szc can't change on us.
1405          */
1406
1407         bin = PP_2_BIN(pp);
1408         mnode = PP_2_MEM_NODE(pp);
1409         mtype = PP_2_MTYPE(pp);
1410
1411         if (flags & PG_LIST_ISINIT) {
1412                 /*
1413                  * PG_LIST_ISINIT is set during system startup (ie. single
1414                  * threaded), add a page to the free list and add to the
1415                  * the free region counters w/o any locking
1416                  */
1417                 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1418
1419                 /* inline version of page_add() */
1420                 if (*ppp != NULL) {
1421                         pp->p_next = *ppp;
1422                         pp->p_prev = (*ppp)->p_prev;
1423                         (*ppp)->p_prev = pp;
1424                         pp->p_prev->p_next = pp;
1425                 } else
1426                         *ppp = pp;
1427
1428                 page_ctr_add_internal(mnode, mtype, pp, flags);
1429                 VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1430         } else {
1431                 pcm = PC_BIN_MUTEX(mnode, bin, flags);
1432
1433                 if (flags & PG_FREE_LIST) {
1434                         VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1435                         ASSERT(PP_ISAGED(pp));
1436                         ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1437
1438                 } else {
1439                         VM_STAT_ADD(vmm_vmstats.pladd_cache);
1440                         ASSERT(pp->p_vnode);
1441                         ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1442                         ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1443                 }
1444                 mutex_enter(pcm);
1445                 page_add(ppp, pp);
1446
1447                 if (flags & PG_LIST_TAIL)
1448                         *ppp = (*ppp)->p_next;
1449                 /*
1450                  * Add counters before releasing pcm mutex to avoid a race with
1451                  * page_freelist_coalesce and page_freelist_split.
1452                  */
1453                 page_ctr_add(mnode, mtype, pp, flags);
1454                 mutex_exit(pcm);
1455         }
1456
1457
1458 #if defined(__sparc)
1459         if (PP_ISNORELOC(pp)) {
1460                 kcage_freemem_add(1);
1461         }
1462 #endif
1463         /*
1464          * It is up to the caller to unlock the page!
1465          */
1466         ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1467 }
1468
1469
1470 #ifdef __sparc
1471 /*
1472  * This routine is only used by kcage_init during system startup.
1473  * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1474  * without the overhead of taking locks and updating counters.
1475  */
1476 void
1477 page_list_noreloc_startup(page_t *pp)
1478 {
1479         page_t          **ppp;
1480         uint_t          bin;
1481         int             mnode;
1482         int             mtype;
1483         int             flags = 0;
1484
1485         /*
1486          * If this is a large page on the freelist then
1487          * break it up into smaller pages.
1488          */
1489         if (pp->p_szc != 0)
1490                 page_boot_demote(pp);
1491
1492         /*
1493          * Get list page is currently on.
1494          */
1495         bin = PP_2_BIN(pp);
1496         mnode = PP_2_MEM_NODE(pp);
1497         mtype = PP_2_MTYPE(pp);
1498         ASSERT(mtype == MTYPE_RELOC);
1499         ASSERT(pp->p_szc == 0);
1500
1501         if (PP_ISAGED(pp)) {
1502                 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1503                 flags |= PG_FREE_LIST;
1504         } else {
1505                 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1506                 flags |= PG_CACHE_LIST;
1507         }
1508
1509         ASSERT(*ppp != NULL);
1510
1511         /*
1512          * Delete page from current list.
1513          */
1514         if (*ppp == pp)
1515                 *ppp = pp->p_next;              /* go to next page */
1516         if (*ppp == pp) {
1517                 *ppp = NULL;                    /* page list is gone */
1518         } else {
1519                 pp->p_prev->p_next = pp->p_next;
1520                 pp->p_next->p_prev = pp->p_prev;
1521         }
1522
1523         /*
1524          * Decrement page counters
1525          */
1526         page_ctr_sub_internal(mnode, mtype, pp, flags);
1527
1528         /*
1529          * Set no reloc for cage initted pages.
1530          */
1531         PP_SETNORELOC(pp);
1532
1533         mtype = PP_2_MTYPE(pp);
1534         ASSERT(mtype == MTYPE_NORELOC);
1535
1536         /*
1537          * Get new list for page.
1538          */
1539         if (PP_ISAGED(pp)) {
1540                 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1541         } else {
1542                 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1543         }
1544
1545         /*
1546          * Insert page on new list.
1547          */
1548         if (*ppp == NULL) {
1549                 *ppp = pp;
1550                 pp->p_next = pp->p_prev = pp;
1551         } else {
1552                 pp->p_next = *ppp;
1553                 pp->p_prev = (*ppp)->p_prev;
1554                 (*ppp)->p_prev = pp;
1555                 pp->p_prev->p_next = pp;
1556         }
1557
1558         /*
1559          * Increment page counters
1560          */
1561         page_ctr_add_internal(mnode, mtype, pp, flags);
1562
1563         /*
1564          * Update cage freemem counter
1565          */
1566         atomic_inc_ulong(&kcage_freemem);
1567 }
1568 #else   /* __sparc */
1569
/*
 * Non-sparc stub, compiled from the #else branch of the __sparc
 * conditional.  It unconditionally panics, since the message states this
 * entry point should only be reached on sparc; pp is not used.
 */
/* ARGSUSED */
void
page_list_noreloc_startup(page_t *pp)
{
	panic("page_list_noreloc_startup: should be here only for sparc");
}
1576 #endif
1577
1578 void
1579 page_list_add_pages(page_t *pp, int flags)
1580 {
1581         kmutex_t *pcm;
1582         pgcnt_t pgcnt;
1583         uint_t  bin, mtype, i;
1584         int     mnode;
1585
1586         /* default to freelist/head */
1587         ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1588
1589         CHK_LPG(pp, pp->p_szc);
1590         VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1591
1592         bin = PP_2_BIN(pp);
1593         mnode = PP_2_MEM_NODE(pp);
1594         mtype = PP_2_MTYPE(pp);
1595
1596         if (flags & PG_LIST_ISINIT) {
1597                 ASSERT(pp->p_szc == mmu_page_sizes - 1);
1598                 page_lpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1599                 ASSERT(!PP_ISNORELOC(pp));
1600                 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1601         } else {
1602
1603                 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1604
1605                 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1606
1607                 mutex_enter(pcm);
1608                 page_lpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1609                 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1610                 mutex_exit(pcm);
1611
1612                 pgcnt = page_get_pagecnt(pp->p_szc);
1613 #if defined(__sparc)
1614                 if (PP_ISNORELOC(pp))
1615                         kcage_freemem_add(pgcnt);
1616 #endif
1617                 for (i = 0; i < pgcnt; i++, pp++)
1618                         page_unlock_nocapture(pp);
1619         }
1620 }
1621
1622 /*
1623  * During boot, need to demote a large page to base
1624  * pagesize pages for seg_kmem for use in boot_alloc()
1625  */
1626 void
1627 page_boot_demote(page_t *pp)
1628 {
1629         ASSERT(pp->p_szc != 0);
1630         ASSERT(PP_ISFREE(pp));
1631         ASSERT(PP_ISAGED(pp));
1632
1633         (void) page_demote(PP_2_MEM_NODE(pp),
1634             PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
1635             PC_FREE);
1636
1637         ASSERT(PP_ISFREE(pp));
1638         ASSERT(PP_ISAGED(pp));
1639         ASSERT(pp->p_szc == 0);
1640 }
1641
1642 /*
1643  * Take a particular page off of whatever freelist the page
1644  * is claimed to be on.
1645  *
1646  * NOTE: Only used for PAGESIZE pages.
1647  */
1648 void
1649 page_list_sub(page_t *pp, int flags)
1650 {
1651         int             bin;
1652         uint_t          mtype;
1653         int             mnode;
1654         kmutex_t        *pcm;
1655         page_t          **ppp;
1656
1657         ASSERT(PAGE_EXCL(pp));
1658         ASSERT(PP_ISFREE(pp));
1659
1660         /*
1661          * The p_szc field can only be changed by page_promote()
1662          * and page_demote(). Only free pages can be promoted and
1663          * demoted and the free list MUST be locked during these
1664          * operations. So to prevent a race in page_list_sub()
1665          * between computing which bin of the freelist lock to
1666          * grab and actually grabing the lock we check again that
1667          * the bin we locked is still the correct one. Notice that
1668          * the p_szc field could have actually changed on us but
1669          * if the bin happens to still be the same we are safe.
1670          */
1671 try_again:
1672         bin = PP_2_BIN(pp);
1673         mnode = PP_2_MEM_NODE(pp);
1674         pcm = PC_BIN_MUTEX(mnode, bin, flags);
1675         mutex_enter(pcm);
1676         if (PP_2_BIN(pp) != bin) {
1677                 mutex_exit(pcm);
1678                 goto try_again;
1679         }
1680         mtype = PP_2_MTYPE(pp);
1681
1682         if (flags & PG_FREE_LIST) {
1683                 VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1684                 ASSERT(PP_ISAGED(pp));
1685                 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1686         } else {
1687                 VM_STAT_ADD(vmm_vmstats.plsub_cache);
1688                 ASSERT(!PP_ISAGED(pp));
1689                 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1690         }
1691
1692         /*
1693          * Common PAGESIZE case.
1694          *
1695          * Note that we locked the freelist. This prevents
1696          * any page promotion/demotion operations. Therefore
1697          * the p_szc will not change until we drop pcm mutex.
1698          */
1699         if (pp->p_szc == 0) {
1700                 page_sub(ppp, pp);
1701                 /*
1702                  * Subtract counters before releasing pcm mutex
1703                  * to avoid race with page_freelist_coalesce.
1704                  */
1705                 page_ctr_sub(mnode, mtype, pp, flags);
1706                 mutex_exit(pcm);
1707
1708 #if defined(__sparc)
1709                 if (PP_ISNORELOC(pp)) {
1710                         kcage_freemem_sub(1);
1711                 }
1712 #endif
1713                 return;
1714         }
1715
1716         /*
1717          * Large pages on the cache list are not supported.
1718          */
1719         if (flags & PG_CACHE_LIST)
1720                 panic("page_list_sub: large page on cachelist");
1721
1722         /*
1723          * Slow but rare.
1724          *
1725          * Somebody wants this particular page which is part
1726          * of a large page. In this case we just demote the page
1727          * if it's on the freelist.
1728          *
1729          * We have to drop pcm before locking the entire freelist.
1730          * Once we have re-locked the freelist check to make sure
1731          * the page hasn't already been demoted or completely
1732          * freed.
1733          */
1734         mutex_exit(pcm);
1735         page_freelist_lock(mnode);
1736         if (pp->p_szc != 0) {
1737                 /*
1738                  * Large page is on freelist.
1739                  */
1740                 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1741                     0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1742         }
1743         ASSERT(PP_ISFREE(pp));
1744         ASSERT(PP_ISAGED(pp));
1745         ASSERT(pp->p_szc == 0);
1746
1747         /*
1748          * Subtract counters before releasing pcm mutex
1749          * to avoid race with page_freelist_coalesce.
1750          */
1751         bin = PP_2_BIN(pp);
1752         mtype = PP_2_MTYPE(pp);
1753         ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1754
1755         page_sub(ppp, pp);
1756         page_ctr_sub(mnode, mtype, pp, flags);
1757         page_freelist_unlock(mnode);
1758
1759 #if defined(__sparc)
1760         if (PP_ISNORELOC(pp)) {
1761                 kcage_freemem_sub(1);
1762         }
1763 #endif
1764 }
1765
1766 void
1767 page_list_sub_pages(page_t *pp, uint_t szc)
1768 {
1769         kmutex_t *pcm;
1770         uint_t  bin, mtype;
1771         int     mnode;
1772
1773         ASSERT(PAGE_EXCL(pp));
1774         ASSERT(PP_ISFREE(pp));
1775         ASSERT(PP_ISAGED(pp));
1776
1777         /*
1778          * See comment in page_list_sub().
1779          */
1780 try_again:
1781         bin = PP_2_BIN(pp);
1782         mnode = PP_2_MEM_NODE(pp);
1783         pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1784         mutex_enter(pcm);
1785         if (PP_2_BIN(pp) != bin) {
1786                 mutex_exit(pcm);
1787                 goto    try_again;
1788         }
1789
1790         /*
1791          * If we're called with a page larger than szc or it got
1792          * promoted above szc before we locked the freelist then
1793          * drop pcm and re-lock entire freelist. If page still larger
1794          * than szc then demote it.
1795          */
1796         if (pp->p_szc > szc) {
1797                 mutex_exit(pcm);
1798                 pcm = NULL;
1799                 page_freelist_lock(mnode);
1800                 if (pp->p_szc > szc) {
1801                         VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1802                         (void) page_demote(mnode,
1803                             PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
1804                             pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1805                 }
1806                 bin = PP_2_BIN(pp);
1807         }
1808         ASSERT(PP_ISFREE(pp));
1809         ASSERT(PP_ISAGED(pp));
1810         ASSERT(pp->p_szc <= szc);
1811         ASSERT(pp == PP_PAGEROOT(pp));
1812
1813         VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1814
1815         mtype = PP_2_MTYPE(pp);
1816         if (pp->p_szc != 0) {
1817                 page_lpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1818                 CHK_LPG(pp, pp->p_szc);
1819         } else {
1820                 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1821                 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1822         }
1823         page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1824
1825         if (pcm != NULL) {
1826                 mutex_exit(pcm);
1827         } else {
1828                 page_freelist_unlock(mnode);
1829         }
1830
1831 #if defined(__sparc)
1832         if (PP_ISNORELOC(pp)) {
1833                 pgcnt_t pgcnt;
1834
1835                 pgcnt = page_get_pagecnt(pp->p_szc);
1836                 kcage_freemem_sub(pgcnt);
1837         }
1838 #endif
1839 }
1840
1841 /*
1842  * Add the page to the front of a linked list of pages
1843  * using the p_next & p_prev pointers for the list.
1844  * The caller is responsible for protecting the list pointers.
1845  */
1846 void
1847 mach_page_add(page_t **ppp, page_t *pp)
1848 {
1849         if (*ppp == NULL) {
1850                 pp->p_next = pp->p_prev = pp;
1851         } else {
1852                 pp->p_next = *ppp;
1853                 pp->p_prev = (*ppp)->p_prev;
1854                 (*ppp)->p_prev = pp;
1855                 pp->p_prev->p_next = pp;
1856         }
1857         *ppp = pp;
1858 }
1859
1860 /*
1861  * Remove this page from a linked list of pages
1862  * using the p_next & p_prev pointers for the list.
1863  *
1864  * The caller is responsible for protecting the list pointers.
1865  */
1866 void
1867 mach_page_sub(page_t **ppp, page_t *pp)
1868 {
1869         ASSERT(PP_ISFREE(pp));
1870
1871         if (*ppp == NULL || pp == NULL)
1872                 panic("mach_page_sub");
1873
1874         if (*ppp == pp)
1875                 *ppp = pp->p_next;              /* go to next page */
1876
1877         if (*ppp == pp)
1878                 *ppp = NULL;                    /* page list is gone */
1879         else {
1880                 pp->p_prev->p_next = pp->p_next;
1881                 pp->p_next->p_prev = pp->p_prev;
1882         }
1883         pp->p_prev = pp->p_next = pp;           /* make pp a list of one */
1884 }
1885
1886 /*
1887  * Routine fsflush uses to gradually coalesce the free list into larger pages.
1888  */
1889 void
1890 page_promote_size(page_t *pp, uint_t cur_szc)
1891 {
1892         pfn_t pfn;
1893         int mnode;
1894         int idx;
1895         int new_szc = cur_szc + 1;
1896         int full = FULL_REGION_CNT(new_szc);
1897
1898         pfn = page_pptonum(pp);
1899         mnode = PFN_2_MEM_NODE(pfn);
1900
1901         page_freelist_lock(mnode);
1902
1903         idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1904         if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1905                 (void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1906
1907         page_freelist_unlock(mnode);
1908 }
1909
1910 static uint_t page_promote_err;
1911 static uint_t page_promote_noreloc_err;
1912
1913 /*
1914  * Create a single larger page (of szc new_szc) from smaller contiguous pages
1915  * for the given mnode starting at pfnum. Pages involved are on the freelist
1916  * before the call and may be returned to the caller if requested, otherwise
1917  * they will be placed back on the freelist.
1918  * If flags is PC_ALLOC, then the large page will be returned to the user in
1919  * a state which is consistent with a page being taken off the freelist.  If
1920  * we failed to lock the new large page, then we will return NULL to the
1921  * caller and put the large page on the freelist instead.
1922  * If flags is PC_FREE, then the large page will be placed on the freelist,
1923  * and NULL will be returned.
1924  * The caller is responsible for locking the freelist as well as any other
1925  * accounting which needs to be done for a returned page.
1926  *
1927  * RFE: For performance pass in pp instead of pfnum so
1928  *      we can avoid excessive calls to page_numtopp_nolock().
1929  *      This would depend on an assumption that all contiguous
1930  *      pages are in the same memseg so we can just add/dec
1931  *      our pp.
1932  *
1933  * Lock ordering:
1934  *
1935  *      There is a potential but rare deadlock situation
1936  *      for page promotion and demotion operations. The problem
1937  *      is there are two paths into the freelist manager and
1938  *      they have different lock orders:
1939  *
1940  *      page_create()
1941  *              lock freelist
1942  *              page_lock(EXCL)
1943  *              unlock freelist
1944  *              return
1945  *              caller drops page_lock
1946  *
1947  *      page_free() and page_reclaim()
1948  *              caller grabs page_lock(EXCL)
1949  *
1950  *              lock freelist
1951  *              unlock freelist
1952  *              drop page_lock
1953  *
1954  *      What prevents a thread in page_create() from deadlocking
1955  *      with a thread freeing or reclaiming the same page is the
1956  *      page_trylock() in page_get_freelist(). If the trylock fails
1957  *      it skips the page.
1958  *
1959  *      The lock ordering for promotion and demotion is the same as
1960  *      for page_create(). Since the same deadlock could occur during
1961  *      page promotion and freeing or reclaiming of a page on the
1962  *      cache list we might have to fail the operation and undo what
1963  *      have done so far. Again this is rare.
1964  */
1965 page_t *
1966 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
1967 {
1968         page_t          *pp, *pplist, *tpp, *start_pp;
1969         pgcnt_t         new_npgs, npgs;
1970         uint_t          bin;
1971         pgcnt_t         tmpnpgs, pages_left;
1972         uint_t          noreloc;
1973         int             which_list;
1974         ulong_t         index;
1975         kmutex_t        *phm;
1976
1977         /*
1978          * General algorithm:
1979          * Find the starting page
1980          * Walk each page struct removing it from the freelist,
1981          * and linking it to all the other pages removed.
1982          * Once all pages are off the freelist,
1983          * walk the list, modifying p_szc to new_szc and what
1984          * ever other info needs to be done to create a large free page.
1985          * According to the flags, either return the page or put it
1986          * on the freelist.
1987          */
1988
1989         start_pp = page_numtopp_nolock(pfnum);
1990         ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1991         new_npgs = page_get_pagecnt(new_szc);
1992         ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1993
1994         /* don't return page of the wrong mtype */
1995         if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
1996                         return (NULL);
1997
1998         /*
1999          * Loop through smaller pages to confirm that all pages
2000          * give the same result for PP_ISNORELOC().
2001          * We can check this reliably here as the protocol for setting
2002          * P_NORELOC requires pages to be taken off the free list first.
2003          */
2004         noreloc = PP_ISNORELOC(start_pp);
2005         for (pp = start_pp + new_npgs; --pp > start_pp; ) {
2006                 if (noreloc != PP_ISNORELOC(pp)) {
2007                         page_promote_noreloc_err++;
2008                         page_promote_err++;
2009                         return (NULL);
2010                 }
2011         }
2012
2013         pages_left = new_npgs;
2014         pplist = NULL;
2015         pp = start_pp;
2016
2017         /* Loop around coalescing the smaller pages into a big page. */
2018         while (pages_left) {
2019                 /*
2020                  * Remove from the freelist.
2021                  */
2022                 ASSERT(PP_ISFREE(pp));
2023                 bin = PP_2_BIN(pp);
2024                 ASSERT(mnode == PP_2_MEM_NODE(pp));
2025                 mtype = PP_2_MTYPE(pp);
2026                 if (PP_ISAGED(pp)) {
2027
2028                         /*
2029                          * PG_FREE_LIST
2030                          */
2031                         if (pp->p_szc) {
2032                                 page_lpsub(&PAGE_FREELISTS(mnode,
2033                                     pp->p_szc, bin, mtype), pp);
2034                         } else {
2035                                 mach_page_sub(&PAGE_FREELISTS(mnode, 0,
2036                                     bin, mtype), pp);
2037                         }
2038                         which_list = PG_FREE_LIST;
2039                 } else {
2040                         vnode_t *vp;
2041
2042                         ASSERT(pp->p_szc == 0);
2043
2044                         /*
2045                          * PG_CACHE_LIST
2046                          *
2047                          * Since this page comes from the
2048                          * cachelist, we must destroy the
2049                          * vnode association.
2050                          */
2051                         if (!page_trylock(pp, SE_EXCL)) {
2052                                 goto fail_promote;
2053                         }
2054
2055                         vp = pp->p_vnode;
2056
2057                         /*
2058                          * We need to be careful not to deadlock
2059                          * with another thread in page_lookup().
2060                          * The page_lookup() thread could be holding
2061                          * the same phm that we need if the two
2062                          * pages happen to hash to the same phm lock.
2063                          * At this point we have locked the entire
2064                          * freelist and page_lookup() could be trying
2065                          * to grab a freelist lock.
2066                          */
2067                         if (!mutex_tryenter(page_vnode_mutex(vp))) {
2068                                 page_unlock_nocapture(pp);
2069                                 goto fail_promote;
2070                         }
2071
2072                         mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
2073                         page_hashout(pp, true);
2074                         mutex_exit(page_vnode_mutex(vp));
2075                         PP_SETAGED(pp);
2076                         page_unlock_nocapture(pp);
2077                         which_list = PG_CACHE_LIST;
2078                 }
2079                 page_ctr_sub(mnode, mtype, pp, which_list);
2080
2081                 /*
2082                  * Concatenate the smaller page(s) onto
2083                  * the large page list.
2084                  */
2085                 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
2086                 pages_left -= npgs;
2087                 tpp = pp;
2088                 while (npgs--) {
2089                         tpp->p_szc = new_szc;
2090                         tpp = tpp->p_next;
2091                 }
2092                 page_list_concat(&pplist, &pp);
2093                 pp += tmpnpgs;
2094         }
2095         CHK_LPG(pplist, new_szc);
2096
2097         /*
2098          * return the page to the user if requested
2099          * in the properly locked state.
2100          */
2101         if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
2102                 return (pplist);
2103         }
2104
2105         /*
2106          * Otherwise place the new large page on the freelist
2107          */
2108         bin = PP_2_BIN(pplist);
2109         mnode = PP_2_MEM_NODE(pplist);
2110         mtype = PP_2_MTYPE(pplist);
2111         page_lpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
2112
2113         page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
2114         return (NULL);
2115
2116 fail_promote:
2117         /*
2118          * A thread must have still been freeing or
2119          * reclaiming the page on the cachelist.
2120          * To prevent a deadlock undo what we have
2121          * done sofar and return failure. This
2122          * situation can only happen while promoting
2123          * PAGESIZE pages.
2124          */
2125         page_promote_err++;
2126         while (pplist) {
2127                 pp = pplist;
2128                 mach_page_sub(&pplist, pp);
2129                 pp->p_szc = 0;
2130                 bin = PP_2_BIN(pp);
2131                 mtype = PP_2_MTYPE(pp);
2132                 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2133                 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2134         }
2135         return (NULL);
2136
2137 }
2138
2139 /*
2140  * Break up a large page into smaller size pages.
2141  * Pages involved are on the freelist before the call and may
2142  * be returned to the caller if requested, otherwise they will
2143  * be placed back on the freelist.
2144  * The caller is responsible for locking the freelist as well as any other
2145  * accounting which needs to be done for a returned page.
2146  * If flags is not PC_ALLOC, the color argument is ignored, and thus
2147  * technically, any value may be passed in but PC_NO_COLOR is the standard
2148  * which should be followed for clarity's sake.
2149  * Returns a page whose pfn is < pfnmax
2150  */
2151 page_t *
2152 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
2153     uchar_t new_szc, int color, int flags)
2154 {
2155         page_t  *pp, *pplist, *npplist;
2156         pgcnt_t npgs, n;
2157         uint_t  bin;
2158         uint_t  mtype;
2159         page_t  *ret_pp = NULL;
2160
2161         ASSERT(cur_szc != 0);
2162         ASSERT(new_szc < cur_szc);
2163
2164         pplist = page_numtopp_nolock(pfnum);
2165         ASSERT(pplist != NULL);
2166
2167         ASSERT(pplist->p_szc == cur_szc);
2168
2169         bin = PP_2_BIN(pplist);
2170         ASSERT(mnode == PP_2_MEM_NODE(pplist));
2171         mtype = PP_2_MTYPE(pplist);
2172         page_lpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2173
2174         CHK_LPG(pplist, cur_szc);
2175         page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2176
2177         /*
2178          * Number of PAGESIZE pages for smaller new_szc
2179          * page.
2180          */
2181         npgs = page_get_pagecnt(new_szc);
2182
2183         while (pplist) {
2184                 pp = pplist;
2185
2186                 ASSERT(pp->p_szc == cur_szc);
2187
2188                 /*
2189                  * We either break it up into PAGESIZE pages or larger.
2190                  */
2191                 if (npgs == 1) {        /* PAGESIZE case */
2192                         mach_page_sub(&pplist, pp);
2193                         ASSERT(pp->p_szc == cur_szc);
2194                         ASSERT(new_szc == 0);
2195                         ASSERT(mnode == PP_2_MEM_NODE(pp));
2196                         pp->p_szc = new_szc;
2197                         bin = PP_2_BIN(pp);
2198                         if ((bin == color) && (flags == PC_ALLOC) &&
2199                             (ret_pp == NULL) && (pfnmax == 0 ||
2200                             pp->p_pagenum < pfnmax) &&
2201                             page_trylock_cons(pp, SE_EXCL)) {
2202                                 ret_pp = pp;
2203                         } else {
2204                                 mtype = PP_2_MTYPE(pp);
2205                                 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2206                                     mtype), pp);
2207                                 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2208                         }
2209                 } else {
2210                         page_t *try_to_return_this_page = NULL;
2211                         int count = 0;
2212
2213                         /*
2214                          * Break down into smaller lists of pages.
2215                          */
2216                         page_list_break(&pplist, &npplist, npgs);
2217
2218                         pp = pplist;
2219                         n = npgs;
2220                         while (n--) {
2221                                 ASSERT(pp->p_szc == cur_szc);
2222                                 /*
2223                                  * Check whether all the pages in this list
2224                                  * fit the request criteria.
2225                                  */
2226                                 if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
2227                                         count++;
2228                                 }
2229                                 pp->p_szc = new_szc;
2230                                 pp = pp->p_next;
2231                         }
2232
2233                         if (count == npgs &&
2234                             (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
2235                                 try_to_return_this_page = pp;
2236                         }
2237
2238                         CHK_LPG(pplist, new_szc);
2239
2240                         bin = PP_2_BIN(pplist);
2241                         if (try_to_return_this_page)
2242                                 ASSERT(mnode ==
2243                                     PP_2_MEM_NODE(try_to_return_this_page));
2244                         if ((bin == color) && (flags == PC_ALLOC) &&
2245                             (ret_pp == NULL) && try_to_return_this_page &&
2246                             page_trylock_cons(try_to_return_this_page,
2247                             SE_EXCL)) {
2248                                 ret_pp = try_to_return_this_page;
2249                         } else {
2250                                 mtype = PP_2_MTYPE(pp);
2251                                 page_lpadd(&PAGE_FREELISTS(mnode, new_szc,
2252                                     bin, mtype), pplist);
2253
2254                                 page_ctr_add(mnode, mtype, pplist,
2255                                     PG_FREE_LIST);
2256                         }
2257                         pplist = npplist;
2258                 }
2259         }
2260         return (ret_pp);
2261 }
2262
/*
 * When non-zero, page_freelist_coalesce() returns NULL up front without
 * attempting to coalesce anything.
 */
int mpss_coalesce_disable = 0;
2264
2265 /*
2266  * Coalesce free pages into a page of the given szc and color if possible.
2267  * Return the pointer to the page created, otherwise, return NULL.
2268  *
2269  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2270  */
2271 page_t *
2272 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
2273     int mtype, pfn_t pfnhi)
2274 {
2275         int     r = szc;                /* region size */
2276         int     mrange;
2277         uint_t  full, bin, color_mask, wrap = 0;
2278         pfn_t   pfnum, lo, hi;
2279         size_t  len, idx, idx0;
2280         pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc);
2281         page_t  *ret_pp;
2282         MEM_NODE_ITERATOR_DECL(it);
2283 #if defined(__sparc)
2284         pfn_t pfnum0, nlo, nhi;
2285 #endif
2286
2287         if (mpss_coalesce_disable) {
2288                 ASSERT(szc < MMU_PAGE_SIZES);
2289                 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
2290                 return (NULL);
2291         }
2292
2293         ASSERT(szc < mmu_page_sizes);
2294         color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
2295         ASSERT(ceq_mask <= color_mask);
2296         ASSERT(color <= color_mask);
2297         color &= ceq_mask;
2298
2299         /* Prevent page_counters dynamic memory from being freed */
2300         rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2301
2302         mrange = MTYPE_2_MRANGE(mnode, mtype);
2303         ASSERT(mrange < mnode_nranges[mnode]);
2304         VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
2305
2306         /* get pfn range for mtype */
2307         len = PAGE_COUNTERS_ENTRIES(mnode, r);
2308         MNODETYPE_2_PFN(mnode, mtype, lo, hi);
2309         hi++;
2310
2311         /* use lower limit if given */
2312         if (pfnhi != PFNNULL && pfnhi < hi)
2313                 hi = pfnhi;
2314
2315         /* round to szcpgcnt boundaries */
2316         lo = P2ROUNDUP(lo, szcpgcnt);
2317         MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
2318         if (lo == (pfn_t)-1) {
2319                 rw_exit(&page_ctrs_rwlock[mnode]);
2320                 return (NULL);
2321         }
2322         hi = hi & ~(szcpgcnt - 1);
2323
2324         /* set lo to the closest pfn of the right color */
2325         if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2326             (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2327                 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2328                     &it);
2329         }
2330
2331         if (hi <= lo) {
2332                 rw_exit(&page_ctrs_rwlock[mnode]);
2333                 return (NULL);
2334         }
2335
2336         full = FULL_REGION_CNT(r);
2337
2338         /* calculate the number of page candidates and initial search index */
2339         bin = color;
2340         idx0 = (size_t)(-1);
2341         do {
2342                 pgcnt_t acand;
2343
2344                 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
2345                 if (acand) {
2346                         idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
2347                             r, bin, mrange);
2348                         idx0 = MIN(idx0, idx);
2349                         cands += acand;
2350                 }
2351                 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
2352         } while (bin != color);
2353
2354         if (cands == 0) {
2355                 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
2356                 rw_exit(&page_ctrs_rwlock[mnode]);
2357                 return (NULL);
2358         }
2359
2360         pfnum = IDX_TO_PNUM(mnode, r, idx0);
2361         if (pfnum < lo || pfnum >= hi) {
2362                 pfnum = lo;
2363         } else {
2364                 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2365                 if (pfnum == (pfn_t)-1) {
2366                         pfnum = lo;
2367                         MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2368                         ASSERT(pfnum != (pfn_t)-1);
2369                 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2370                     (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2371                         /* invalid color, get the closest correct pfn */
2372                         PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2373                             color_mask, &it);
2374                         if (pfnum >= hi) {
2375                                 pfnum = lo;
2376                                 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2377                         }
2378                 }
2379         }
2380
2381         /* set starting index */
2382         idx0 = PNUM_TO_IDX(mnode, r, pfnum);
2383         ASSERT(idx0 < len);
2384
2385 #if defined(__sparc)
2386         pfnum0 = pfnum;         /* page corresponding to idx0 */
2387         nhi = 0;                /* search kcage ranges */
2388 #endif
2389
2390         for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
2391
2392 #if defined(__sparc)
2393                 /*
2394                  * Find lowest intersection of kcage ranges and mnode.
2395                  * MTYPE_NORELOC means look in the cage, otherwise outside.
2396                  */
2397                 if (nhi <= pfnum) {
2398                         if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
2399                             (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
2400                                 goto wrapit;
2401
2402                         /* jump to the next page in the range */
2403                         if (pfnum < nlo) {
2404                                 pfnum = P2ROUNDUP(nlo, szcpgcnt);
2405                                 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2406                                 idx = PNUM_TO_IDX(mnode, r, pfnum);
2407                                 if (idx >= len || pfnum >= hi)
2408                                         goto wrapit;
2409                                 if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
2410                                     ceq_mask)
2411                                         goto next;
2412                                 if (interleaved_mnodes &&
2413                                     PFN_2_MEM_NODE(pfnum) != mnode)
2414                                         goto next;
2415                         }
2416                 }
2417 #endif
2418
2419                 if (PAGE_COUNTERS(mnode, r, idx) != full)
2420                         goto next;
2421
2422                 /*
2423                  * RFE: For performance maybe we can do something less
2424                  *      brutal than locking the entire freelist. So far
2425                  *      this doesn't seem to be a performance problem?
2426                  */
2427                 page_freelist_lock(mnode);
2428                 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2429                         ret_pp =
2430                             page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
2431                         if (ret_pp != NULL) {
2432                                 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
2433                                 PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2434                                     PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
2435                                 page_freelist_unlock(mnode);
2436                                 rw_exit(&page_ctrs_rwlock[mnode]);
2437 #if defined(__sparc)
2438                                 if (PP_ISNORELOC(ret_pp)) {
2439                                         pgcnt_t npgs;
2440
2441                                         npgs = page_get_pagecnt(ret_pp->p_szc);
2442                                         kcage_freemem_sub(npgs);
2443                                 }
2444 #endif
2445                                 return (ret_pp);
2446                         }
2447                 } else {
2448                         VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
2449                 }
2450
2451                 page_freelist_unlock(mnode);
2452                 /*
2453                  * No point looking for another page if we've
2454                  * already tried all of the ones that
2455                  * page_ctr_cands indicated.  Stash off where we left
2456                  * off.
2457                  * Note: this is not exact since we don't hold the
2458                  * page_freelist_locks before we initially get the
2459                  * value of cands for performance reasons, but should
2460                  * be a decent approximation.
2461                  */
2462                 if (--cands == 0) {
2463                         PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
2464                             idx;
2465                         break;
2466                 }
2467 next:
2468                 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2469                     color_mask, &it);
2470                 idx = PNUM_TO_IDX(mnode, r, pfnum);
2471                 if (idx >= len || pfnum >= hi) {
2472 wrapit:
2473                         pfnum = lo;
2474                         MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2475                         idx = PNUM_TO_IDX(mnode, r, pfnum);
2476                         wrap++;
2477 #if defined(__sparc)
2478                         nhi = 0;        /* search kcage ranges */
2479 #endif
2480                 }
2481         }
2482
2483         rw_exit(&page_ctrs_rwlock[mnode]);
2484         VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
2485         return (NULL);
2486 }
2487
2488 /*
2489  * For the given mnode, promote as many small pages to large pages as possible.
2490  * mnode can be -1, which means do them all
2491  */
2492 void
2493 page_freelist_coalesce_all(int mnode)
2494 {
2495         int     r;              /* region size */
2496         int     idx, full;
2497         size_t  len;
2498         int doall = interleaved_mnodes || mnode < 0;
2499         int mlo = doall ? 0 : mnode;
2500         int mhi = doall ? max_mem_nodes : (mnode + 1);
2501
2502         VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2503
2504         if (mpss_coalesce_disable) {
2505                 return;
2506         }
2507
2508         /*
2509          * Lock the entire freelist and coalesce what we can.
2510          *
2511          * Always promote to the largest page possible
2512          * first to reduce the number of page promotions.
2513          */
2514         for (mnode = mlo; mnode < mhi; mnode++) {
2515                 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2516                 page_freelist_lock(mnode);
2517         }
2518         for (r = mmu_page_sizes - 1; r > 0; r--) {
2519                 for (mnode = mlo; mnode < mhi; mnode++) {
2520                         pgcnt_t cands = 0;
2521                         int mrange, nranges = mnode_nranges[mnode];
2522
2523                         for (mrange = 0; mrange < nranges; mrange++) {
2524                                 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2525                                 if (cands != 0)
2526                                         break;
2527                         }
2528                         if (cands == 0) {
2529                                 VM_STAT_ADD(vmm_vmstats.
2530                                     page_ctrs_cands_skip_all);
2531                                 continue;
2532                         }
2533
2534                         full = FULL_REGION_CNT(r);
2535                         len  = PAGE_COUNTERS_ENTRIES(mnode, r);
2536
2537                         for (idx = 0; idx < len; idx++) {
2538                                 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2539                                         pfn_t pfnum =
2540                                             IDX_TO_PNUM(mnode, r, idx);
2541                                         int tmnode = interleaved_mnodes ?
2542                                             PFN_2_MEM_NODE(pfnum) : mnode;
2543
2544                                         ASSERT(pfnum >=
2545                                             mem_node_config[tmnode].physbase &&
2546                                             pfnum <
2547                                             mem_node_config[tmnode].physmax);
2548
2549                                         (void) page_promote(tmnode,
2550                                             pfnum, r, PC_FREE, PC_MTYPE_ANY);
2551                                 }
2552                         }
2553                         /* shared hpm_counters covers all mnodes, so we quit */
2554                         if (interleaved_mnodes)
2555                                 break;
2556                 }
2557         }
2558         for (mnode = mlo; mnode < mhi; mnode++) {
2559                 page_freelist_unlock(mnode);
2560                 rw_exit(&page_ctrs_rwlock[mnode]);
2561         }
2562 }
2563
2564 /*
2565  * This is where all polices for moving pages around
2566  * to different page size free lists is implemented.
2567  * Returns 1 on success, 0 on failure.
2568  *
2569  * So far these are the priorities for this algorithm in descending
2570  * order:
2571  *
2572  *      1) When servicing a request try to do so with a free page
2573  *         from next size up. Helps defer fragmentation as long
2574  *         as possible.
2575  *
2576  *      2) Page coalesce on demand. Only when a freelist
2577  *         larger than PAGESIZE is empty and step 1
2578  *         will not work since all larger size lists are
2579  *         also empty.
2580  *
2581  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2582  */
2583
2584 page_t *
2585 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2586     pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
2587 {
2588         uchar_t nszc = szc + 1;
2589         uint_t  bin, sbin, bin_prev;
2590         page_t  *pp, *firstpp;
2591         page_t  *ret_pp = NULL;
2592         uint_t  color_mask;
2593
2594         if (nszc == mmu_page_sizes)
2595                 return (NULL);
2596
2597         ASSERT(nszc < mmu_page_sizes);
2598         color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
2599         bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
2600         bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2601             PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2602
2603         VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2604         /*
2605          * First try to break up a larger page to fill current size freelist.
2606          */
2607         while (plw->plw_bins[nszc] != 0) {
2608
2609                 ASSERT(nszc < mmu_page_sizes);
2610
2611                 /*
2612                  * If page found then demote it.
2613                  */
2614                 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2615                         page_freelist_lock(mnode);
2616                         firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2617
2618                         /*
2619                          * If pfnhi is not PFNNULL, look for large page below
2620                          * pfnhi. PFNNULL signifies no pfn requirement.
2621                          */
2622                         if (pp &&
2623                             ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
2624                             (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
2625                                 do {
2626                                         pp = pp->p_list.largepg.next;
2627                                         if (pp == firstpp) {
2628                                                 pp = NULL;
2629                                                 break;
2630                                         }
2631                                 } while ((pfnhi != PFNNULL &&
2632                                     pp->p_pagenum >= pfnhi) ||
2633                                     (pfnlo != PFNNULL &&
2634                                     pp->p_pagenum < pfnlo));
2635
2636                                 if (pfnhi != PFNNULL && pp != NULL)
2637                                         ASSERT(pp->p_pagenum < pfnhi);
2638
2639                                 if (pfnlo != PFNNULL && pp != NULL)
2640                                         ASSERT(pp->p_pagenum >= pfnlo);
2641                         }
2642                         if (pp) {
2643                                 uint_t ccolor = page_correct_color(szc, nszc,
2644                                     color, bin, plw->plw_ceq_mask[szc]);
2645
2646                                 ASSERT(pp->p_szc == nszc);
2647                                 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2648                                 ret_pp = page_demote(mnode, pp->p_pagenum,
2649                                     pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
2650                                 if (ret_pp) {
2651                                         page_freelist_unlock(mnode);
2652 #if defined(__sparc)
2653                                         if (PP_ISNORELOC(ret_pp)) {
2654                                                 pgcnt_t npgs;
2655
2656                                                 npgs = page_get_pagecnt(
2657                                                     ret_pp->p_szc);
2658                                                 kcage_freemem_sub(npgs);
2659                                         }
2660 #endif
2661                                         return (ret_pp);
2662                                 }
2663                         }
2664                         page_freelist_unlock(mnode);
2665                 }
2666
2667                 /* loop through next size bins */
2668                 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
2669                 plw->plw_bins[nszc]--;
2670
2671                 if (bin == sbin) {
2672                         uchar_t nnszc = nszc + 1;
2673
2674                         /* we are done with this page size - check next */
2675                         if (plw->plw_bins[nnszc] == 0)
2676                                 /* we have already checked next size bins */
2677                                 break;
2678
2679                         bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2680                         if (bin_prev != INVALID_COLOR) {
2681                                 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2682                                 if (!((bin ^ bin_prev) &
2683                                     plw->plw_ceq_mask[nnszc]))
2684                                         break;
2685                         }
2686                         ASSERT(nnszc < mmu_page_sizes);
2687                         color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2688                         nszc = nnszc;
2689                         ASSERT(nszc < mmu_page_sizes);
2690                 }
2691         }
2692
2693         return (ret_pp);
2694 }
2695
2696 /*
2697  * Helper routine used only by the freelist code to lock
2698  * a page. If the page is a large page then it succeeds in
2699  * locking all the constituent pages or none at all.
2700  * Returns 1 on sucess, 0 on failure.
2701  */
2702 static int
2703 page_trylock_cons(page_t *pp, se_t se)
2704 {
2705         page_t  *tpp, *first_pp = pp;
2706
2707         /*
2708          * Fail if can't lock first or only page.
2709          */
2710         if (!page_trylock(pp, se)) {
2711                 return (0);
2712         }
2713
2714         /*
2715          * PAGESIZE: common case.
2716          */
2717         if (pp->p_szc == 0) {
2718                 return (1);
2719         }
2720
2721         /*
2722          * Large page case.
2723          */
2724         tpp = pp->p_next;
2725         while (tpp != pp) {
2726                 if (!page_trylock(tpp, se)) {
2727                         /*
2728                          * On failure unlock what we have locked so far.
2729                          * We want to avoid attempting to capture these
2730                          * pages as the pcm mutex may be held which could
2731                          * lead to a recursive mutex panic.
2732                          */
2733                         while (first_pp != tpp) {
2734                                 page_unlock_nocapture(first_pp);
2735                                 first_pp = first_pp->p_next;
2736                         }
2737                         return (0);
2738                 }
2739                 tpp = tpp->p_next;
2740         }
2741         return (1);
2742 }
2743
2744 /*
2745  * init context for walking page lists
2746  * Called when a page of the given szc in unavailable. Sets markers
2747  * for the beginning of the search to detect when search has
2748  * completed a full cycle. Sets flags for splitting larger pages
2749  * and coalescing smaller pages. Page walking procedes until a page
2750  * of the desired equivalent color is found.
2751  */
2752 void
2753 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2754     int use_ceq, page_list_walker_t *plw)
2755 {
2756         uint_t  nszc, ceq_mask, colors;
2757         uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2758
2759         ASSERT(szc < mmu_page_sizes);
2760         colors = PAGE_GET_PAGECOLORS(szc);
2761
2762         plw->plw_colors = colors;
2763         plw->plw_color_mask = colors - 1;
2764         plw->plw_bin_marker = plw->plw_bin0 = bin;
2765         plw->plw_bin_split_prev = bin;
2766         plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2767
2768         /*
2769          * if vac aliasing is possible make sure lower order color
2770          * bits are never ignored
2771          */
2772         if (vac_colors > 1)
2773                 ceq &= 0xf0;
2774
2775         /*
2776          * calculate the number of non-equivalent colors and
2777          * color equivalency mask
2778          */
2779         plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2780         ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2781         ASSERT(plw->plw_ceq_dif > 0);
2782         plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
2783
2784         if (flags & PG_MATCH_COLOR) {
2785                 if (cpu_page_colors <  0) {
2786                         /*
2787                          * this is a heterogeneous machine with different CPUs
2788                          * having different size e$ (not supported for ni2/rock
2789                          */
2790                         uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2791                         cpucolors = MAX(cpucolors, 1);
2792                         ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2793                         plw->plw_ceq_mask[szc] =
2794                             MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2795                 }
2796                 plw->plw_ceq_dif = 1;
2797         }
2798
2799         /* we can split pages in the freelist, but not the cachelist */
2800         if (can_split) {
2801                 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2802
2803                 /* set next szc color masks and number of free list bins */
2804                 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2805                         plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2806                             plw->plw_ceq_mask[szc]);
2807                         plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2808                 }
2809                 plw->plw_ceq_mask[nszc] = INVALID_MASK;
2810                 plw->plw_bins[nszc] = 0;
2811
2812         } else {
2813                 ASSERT(szc == 0);
2814                 plw->plw_do_split = 0;
2815                 plw->plw_bins[1] = 0;
2816                 plw->plw_ceq_mask[1] = INVALID_MASK;
2817         }
2818 }
2819
/*
 * set mark to flag where next split should occur
 *
 * Stores the result in plw->plw_split_next: the next-size color following
 * that of 'bin', advanced once more if it is equivalent (under
 * plw_ceq_mask[nszc]) to the next-size color of plw_bin0.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement (e.g. in an unbraced if/else), and every argument is
 * parenthesized.  Arguments are evaluated more than once, so they must not
 * have side effects.
 */
#define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) do {                 \
        uint_t bin_nsz = PAGE_GET_NSZ_COLOR((szc), (bin));                   \
        uint_t bin0_nsz = PAGE_GET_NSZ_COLOR((szc), (plw)->plw_bin0);        \
        uint_t neq_mask = ~(plw)->plw_ceq_mask[(nszc)] &                     \
            (plw)->plw_color_mask;                                           \
        (plw)->plw_split_next =                                              \
            INC_MASKED(bin_nsz, neq_mask, (plw)->plw_color_mask);            \
        if (!(((plw)->plw_split_next ^ bin0_nsz) &                           \
            (plw)->plw_ceq_mask[(nszc)])) {                                  \
                (plw)->plw_split_next =                                      \
                    INC_MASKED((plw)->plw_split_next,                        \
                    neq_mask, (plw)->plw_color_mask);                        \
        }                                                                    \
} while (0)
2835
2836 uint_t
2837 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2838 {
2839         uint_t  neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2840         uint_t  bin0_nsz, nbin_nsz, nbin0, nbin;
2841         uchar_t nszc = szc + 1;
2842
2843         nbin = ADD_MASKED(bin,
2844             plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2845
2846         if (plw->plw_do_split) {
2847                 plw->plw_bin_split_prev = bin;
2848                 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2849                 plw->plw_do_split = 0;
2850         }
2851
2852         if (szc == 0) {
2853                 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
2854                         if (nbin == plw->plw_bin0 &&
2855                             (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2856                                 nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2857                                     neq_mask, plw->plw_color_mask);
2858                                 plw->plw_bin_split_prev = plw->plw_bin0;
2859                         }
2860
2861                         if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2862                                 plw->plw_bin_marker =
2863                                     nbin = INC_MASKED(nbin, neq_mask,
2864                                     plw->plw_color_mask);
2865                                 plw->plw_bin_split_prev = plw->plw_bin0;
2866                                 /*
2867                                  * large pages all have the same vac color
2868                                  * so by now we should be done with next
2869                                  * size page splitting process
2870                                  */
2871                                 ASSERT(plw->plw_bins[1] == 0);
2872                                 plw->plw_do_split = 0;
2873                                 return (nbin);
2874                         }
2875
2876                 } else {
2877                         uint_t bin_jump = (vac_colors == 1) ?
2878                             (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
2879
2880                         bin_jump &= ~(vac_colors - 1);
2881
2882                         nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
2883                             plw->plw_color_mask);
2884
2885                         if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
2886
2887                                 plw->plw_bin_marker = nbin = nbin0;
2888
2889                                 if (plw->plw_bins[nszc] != 0) {
2890                                         /*
2891                                          * check if next page size bin is the
2892                                          * same as the next page size bin for
2893                                          * bin0
2894                                          */
2895                                         nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
2896                                             nbin);
2897                                         bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
2898                                             plw->plw_bin0);
2899
2900                                         if ((bin0_nsz ^ nbin_nsz) &
2901                                             plw->plw_ceq_mask[nszc])
2902                                                 plw->plw_do_split = 1;
2903                                 }
2904                                 return (nbin);
2905                         }
2906                 }
2907         }
2908
2909         if (plw->plw_bins[nszc] != 0) {
2910                 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
2911                 if (!((plw->plw_split_next ^ nbin_nsz) &
2912                     plw->plw_ceq_mask[nszc]))
2913                         plw->plw_do_split = 1;
2914         }
2915
2916         return (nbin);
2917 }
2918
/*
 * Try to get a free page of size szc from the freelists of the given mnode,
 * starting at color bin 'bin' and at the memory type selected by
 * MTYPE_START() from mtype and flags.
 *
 * For each mtype the bins of equivalent color are scanned first.  If none
 * of them yields a page that can be locked, the routine tries
 * page_freelist_split() (when the walker requests a split) and, for
 * szc > 0, page_freelist_coalesce(), then moves to the bin picked by
 * page_list_walk_next_bin().  MTYPE_NEXT() selects further mtypes to retry
 * with.
 *
 * A page taken directly from a freelist is returned locked with SE_EXCL
 * (via page_trylock_cons()) and removed from its list.  Returns NULL if no
 * page could be found.
 */
page_t *
page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
    uint_t flags)
{
        kmutex_t                *pcm;
        page_t                  *pp, *first_pp;
        uint_t                  sbin;
        int                     plw_initialized;
        page_list_walker_t      plw;

        ASSERT(szc < mmu_page_sizes);

        VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);

        MTYPE_START(mnode, mtype, flags);
        if (mtype < 0) {        /* mnode does not have memory in mtype range */
                VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
                return (NULL);
        }
try_again:

        /*
         * The walker is initialized lazily, the first time a bin turns out
         * to be unusable.  Until then plw_ceq_dif is 1 so that the outer
         * loop below runs at least once.
         */
        plw_initialized = 0;
        plw.plw_ceq_dif = 1;

        /*
         * Only hold one freelist lock at a time, that way we
         * can start anywhere and not have to worry about lock
         * ordering.
         */
        for (plw.plw_count = 0;
            plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
                sbin = bin;
                do {
                        /*
                         * Unlocked peek so an empty bin costs no mutex;
                         * the list head is read again under pcm below.
                         */
                        if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
                                goto bin_empty_1;

                        pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
                        mutex_enter(pcm);
                        pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
                        if (pp == NULL)
                                goto bin_empty_0;

                        /*
                         * These were set before the page
                         * was put on the free list,
                         * they must still be set.
                         */
                        ASSERT(PP_ISFREE(pp));
                        ASSERT(PP_ISAGED(pp));
                        ASSERT(pp->p_vnode == NULL);
                        ASSERT(pp->p_offset == (uoff_t)-1);
                        ASSERT(pp->p_szc == szc);
                        ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

                        /*
                         * Walk down the hash chain.  4k/8k pages are linked
                         * on p_next and p_prev fields. Large pages are a
                         * contiguous group of constituent pages linked
                         * together on their p_next and p_prev fields.  The
                         * large pages are linked together on the hash chain
                         * using p_list.largepg of the base constituent page
                         * of each large page.
                         */
                        first_pp = pp;
                        while (!page_trylock_cons(pp, SE_EXCL)) {
                                if (szc == 0) {
                                        pp = pp->p_next;
                                } else {
                                        pp = pp->p_list.largepg.next;
                                }

                                ASSERT(PP_ISFREE(pp));
                                ASSERT(PP_ISAGED(pp));
                                ASSERT(pp->p_vnode == NULL);
                                ASSERT(pp->p_offset == (uoff_t)-1);
                                ASSERT(pp->p_szc == szc);
                                ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

                                /*
                                 * Back at the head: no page on this list
                                 * could be locked, treat the bin as empty.
                                 */
                                if (pp == first_pp)
                                        goto bin_empty_0;
                        }

                        /* pp is now locked; take it off its freelist */
                        ASSERT(pp != NULL);
                        ASSERT(mtype == PP_2_MTYPE(pp));
                        ASSERT(pp->p_szc == szc);
                        if (szc == 0) {
                                page_sub(&PAGE_FREELISTS(mnode,
                                    szc, bin, mtype), pp);
                        } else {
                                page_lpsub(&PAGE_FREELISTS(mnode,
                                    szc, bin, mtype), pp);
                                CHK_LPG(pp, szc);
                        }
                        page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

                        if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
                                panic("free page is not. pp %p", (void *)pp);
                        mutex_exit(pcm);

#if defined(__sparc)
                        ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
                            (flags & PG_NORELOC) == 0);

                        if (PP_ISNORELOC(pp))
                                kcage_freemem_sub(page_get_pagecnt(szc));
#endif
                        VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
                        return (pp);

                        /*
                         * bin_empty_0 is entered with pcm held,
                         * bin_empty_1 without it.
                         */
bin_empty_0:
                        mutex_exit(pcm);
bin_empty_1:
                        if (plw_initialized == 0) {
                                page_list_walk_init(szc, flags, bin, 1, 1,
                                    &plw);
                                plw_initialized = 1;
                                ASSERT(plw.plw_colors <=
                                    PAGE_GET_PAGECOLORS(szc));
                                ASSERT(plw.plw_colors > 0);
                                ASSERT((plw.plw_colors &
                                    (plw.plw_colors - 1)) == 0);
                                ASSERT(bin < plw.plw_colors);
                                ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
                        }
                        /* calculate the next bin with equivalent color */
                        bin = ADD_MASKED(bin, plw.plw_bin_step,
                            plw.plw_ceq_mask[szc], plw.plw_color_mask);
                } while (sbin != bin);

                /*
                 * color bins are all empty if color match. Try and
                 * satisfy the request by breaking up or coalescing
                 * pages from a different size freelist of the correct
                 * color that satisfies the ORIGINAL color requested.
                 * If that fails then try pages of the same size but
                 * different colors assuming we are not called with
                 * PG_MATCH_COLOR.
                 */
                if (plw.plw_do_split &&
                    (pp = page_freelist_split(szc, bin, mnode,
                    mtype, PFNNULL, PFNNULL, &plw)) != NULL)
                        return (pp);

                if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
                    bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) !=  NULL)
                        return (pp);

                /* more than one non-equivalent color: move to the next */
                if (plw.plw_ceq_dif > 1)
                        bin = page_list_walk_next_bin(szc, bin, &plw);
        }

        /* if allowed, cycle through additional mtypes */
        MTYPE_NEXT(mnode, mtype, flags);
        if (mtype >= 0)
                goto try_again;

        VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);

        return (NULL);
}
3079
3080 /*
3081  * Returns the count of free pages for 'pp' with size code 'szc'.
3082  * Note: This function does not return an exact value as the page freelist
3083  * locks are not held and thus the values in the page_counters may be
3084  * changing as we walk through the data.
      *
      * 'mnode' selects the page_ctrs_rwlock entry that is held as reader and
      * the page counters that are read.  'pp' is the first page of the region;
      * its p_pagenum must be aligned to PNUM_SIZE(szc).  'szc' must be greater
      * than 0.  Both requirements are only checked by ASSERT.
      *
      * The total is accumulated in a pgcnt_t but handed back through the int
      * return type.
3085  */
3086 static int
3087 page_freecnt(int mnode, page_t *pp, uchar_t szc)
3088 {
3089         pgcnt_t pgfree;
3090         pgcnt_t cnt;
3091         ssize_t r = szc;        /* region size */
3092         ssize_t idx;
3093         int     i;
3094         int     full, range;
3095
3096         /* Make sure pagenum passed in is aligned properly */
3097         ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
3098         ASSERT(szc > 0);
3099
3100         /* Prevent page_counters dynamic memory from being freed */
3101         rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
             /*
              * Seed the total from the counter for the 'szc' region itself,
              * scaled by PNUM_SHIFT(szc - 1).
              */
3102         idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3103         cnt = PAGE_COUNTERS(mnode, r, idx);
3104         pgfree = cnt << PNUM_SHIFT(r - 1);
3105         range = FULL_REGION_CNT(szc);
3106
3107         /* Check for completely full region */
3108         if (cnt == range) {
3109                 rw_exit(&page_ctrs_rwlock[mnode]);
3110                 return (pgfree);
3111         }
3112
             /*
              * Descend through the smaller region sizes.  'range' is the number
              * of counters examined at the current size and is multiplied by
              * 'full' each time we step down one size.
              */
3113         while (--r > 0) {
3114                 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3115                 full = FULL_REGION_CNT(r);
3116                 for (i = 0; i < range; i++, idx++) {
3117                         cnt = PAGE_COUNTERS(mnode, r, idx);
3118                         /*
3119                          * If cnt here is full, that means we have already
3120                          * accounted for these pages earlier.
3121                          */
3122                         if (cnt != full) {
3123                                 pgfree += (cnt << PNUM_SHIFT(r - 1));
3124                         }
3125                 }
3126                 range *= full;
3127         }
3128         rw_exit(&page_ctrs_rwlock[mnode]);
3129         return (pgfree);
3130 }
3131
3132 /*
3133  * Called from page_geti_contig_pages to exclusively lock constituent pages
3134  * starting from 'spp' for page size code 'szc'.
3135  *
3136  * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
3137  * region needs to be greater than or equal to the threshold.
      * The check compares page_freecnt() against PNUM_SIZE(szc) / ptcpthreshold
      * and is skipped entirely when ptcpthreshold is 0 or 'flags' has
      * PGI_PGCPHIPRI set.
      *
      * Returns 1 with all PNUM_SIZE(szc) pages locked SE_EXCL.  Returns 0 when
      * the free page check fails, when a page cannot be trylocked, when a page
      * fails the size code test, or when a page is PP_ISNORELOC or has a
      * non-zero p_lckcnt or p_cowcnt; the locks taken here are dropped again
      * before returning 0.
3138  */
3139 static int
3140 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
3141 {
3142         pgcnt_t pgcnt = PNUM_SIZE(szc);
3143         pgcnt_t pgfree, i;
3144         page_t *pp;
3145
3146         VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
3147
3148
3149         if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
3150                 goto skipptcpcheck;
3151         /*
3152          * check if there are sufficient free pages available before attempting
3153          * to trylock. Count is approximate as page counters can change.
3154          */
3155         pgfree = page_freecnt(mnode, spp, szc);
3156
3157         /* attempt to trylock if there are sufficient already free pages */
3158         if (pgfree < pgcnt/ptcpthreshold) {
3159                 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
3160                 return (0);
3161         }
3162
3163 skipptcpcheck:
3164
3165         for (i = 0; i < pgcnt; i++) {
3166                 pp = &spp[i];
3167                 if (!page_trylock(pp, SE_EXCL)) {
3168                         VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
                             /*
                              * Page i was not locked, so unwind pages i-1 .. 0.
                              * i is unsigned; the loop stops when the
                              * decrement wraps around to (pgcnt_t)-1.
                              */
3169                         while (--i != (pgcnt_t)-1) {
3170                                 pp = &spp[i];
3171                                 ASSERT(PAGE_EXCL(pp));
3172                                 page_unlock_nocapture(pp);
3173                         }
3174                         return (0);
3175                 }
3176                 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
                     /*
                      * Reject a page that is in use and whose size code is
                      * larger than 'szc', or equal to a non-zero 'szc'.  This
                      * is only expected for the first page (see the ASSERT),
                      * so only the current page is unlocked.
                      */
3177                 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
3178                     !PP_ISFREE(pp)) {
3179                         VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
3180                         ASSERT(i == 0);
3181                         page_unlock_nocapture(pp);
3182                         return (0);
3183                 }
3184
3185                 /*
3186                  * If a page has been marked non-relocatable or has been
3187                  * explicitly locked in memory, we don't want to relocate it;
3188                  * unlock the pages and fail the operation.
                      * Page i is locked at this point, so the unwind below
                      * covers pages i .. 0 inclusive.
3189                  */
3190                 if (PP_ISNORELOC(pp) ||
3191                     pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
3192                         VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3193                         while (i != (pgcnt_t)-1) {
3194                                 pp = &spp[i];
3195                                 ASSERT(PAGE_EXCL(pp));
3196                                 page_unlock_nocapture(pp);
3197                                 i--;
3198                         }
3199                         return (0);
3200                 }
3201         }
3202         VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3203         return (1);
3204 }
3205
3206 /*
3207  * Claim large page pointed to by 'pp'. 'pp' is the starting set
3208  * of 'szc' constituent pages that had been locked exclusively previously.
3209  * Will attempt to relocate constituent pages in use.
      *
      * Returns:
      *   - 'pp' itself when a free PG_FREE_LIST page already has size code
      *     'szc' after being pulled off its list;
      *   - the list of claimed constituent pages, each with p_szc set to
      *     'szc', once all page_get_pagecnt(szc) pages have been processed;
      *   - NULL when no replacement page could be obtained or
      *     do_page_relocate() failed.  In that case the unprocessed pages
      *     are unlocked, the already processed pages are put back on the
      *     free list with p_szc 0 and unlocked, and any unused replacement
      *     page is freed.
3210  */
3211 static page_t *
3212 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3213 {
3214         spgcnt_t pgcnt, npgs, i;
3215         page_t *targpp, *rpp, *hpp;
3216         page_t *replpp = NULL;
3217         page_t *pplist = NULL;
3218
3219         ASSERT(pp != NULL);
3220
3221         pgcnt = page_get_pagecnt(szc);
3222         while (pgcnt) {
3223                 ASSERT(PAGE_EXCL(pp));
3224                 ASSERT(!PP_ISNORELOC(pp));
3225                 if (PP_ISFREE(pp)) {
3226                         /*
3227                          * If this is a PG_FREE_LIST page then its
3228                          * size code can change underneath us due to
3229                          * page promotion or demotion. As an optimization
3230                          * use page_list_sub_pages() instead of
3231                          * page_list_sub().
3232                          */
3233                         if (PP_ISAGED(pp)) {
3234                                 page_list_sub_pages(pp, szc);
                                     /*
                                      * NOTE(review): pplist is not consulted
                                      * on this return; presumably it can only
                                      * be taken for the first page of the
                                      * region - confirm.
                                      */
3235                                 if (pp->p_szc == szc) {
3236                                         return (pp);
3237                                 }
3238                                 ASSERT(pp->p_szc < szc);
3239                                 npgs = page_get_pagecnt(pp->p_szc);
3240                                 hpp = pp;
3241                                 for (i = 0; i < npgs; i++, pp++) {
3242                                         pp->p_szc = szc;
3243                                 }
3244                                 page_list_concat(&pplist, &hpp);
3245                                 pgcnt -= npgs;
3246                                 continue;
3247                         }
                             /*
                              * Free but not aged: take the page off the cache
                              * list, hash it out and mark it aged before
                              * adding it to the claimed list.
                              */
3248                         ASSERT(!PP_ISAGED(pp));
3249                         ASSERT(pp->p_szc == 0);
3250                         page_list_sub(pp, PG_CACHE_LIST);
3251                         page_hashout(pp, false);
3252                         PP_SETAGED(pp);
3253                         pp->p_szc = szc;
3254                         page_list_concat(&pplist, &pp);
3255                         pp++;
3256                         pgcnt--;
3257                         continue;
3258                 }
                     /* The page is in use and has to be relocated. */
3259                 npgs = page_get_pagecnt(pp->p_szc);
3260
3261                 /*
3262                  * page_create_wait freemem accounting done by caller of
3263                  * page_get_freelist and not necessary to call it prior to
3264                  * calling page_get_replacement_page.
3265                  *
3266                  * page_get_replacement_page can call page_get_contig_pages
3267                  * to acquire a large page (szc > 0); the replacement must be
3268                  * smaller than the contig page size to avoid looping or
3269                  * szc == 0 and PGI_PGCPSZC0 is set.
3270                  */
3271                 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3272                         replpp = page_get_replacement_page(pp, NULL, 0);
3273                         if (replpp) {
3274                                 npgs = page_get_pagecnt(pp->p_szc);
3275                                 ASSERT(npgs <= pgcnt);
3276                                 targpp = pp;
3277                         }
3278                 }
3279
3280                 /*
3281                  * If replacement is NULL or do_page_relocate fails, fail
3282                  * coalescing of pages.
                      * targpp is only set when replpp is non-NULL; the
                      * replpp == NULL test short-circuits before it is read.
3283                  */
3284                 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3285                     &npgs, NULL) != 0)) {
3286                         /*
3287                          * Unlock un-processed target list
3288                          */
3289                         while (pgcnt--) {
3290                                 ASSERT(PAGE_EXCL(pp));
3291                                 page_unlock_nocapture(pp);
3292                                 pp++;
3293                         }
3294                         /*
3295                          * Free the processed target list.
3296                          */
3297                         while (pplist) {
3298                                 pp = pplist;
3299                                 page_sub(&pplist, pp);
3300                                 ASSERT(PAGE_EXCL(pp));
3301                                 ASSERT(pp->p_szc == szc);
3302                                 ASSERT(PP_ISFREE(pp));
3303                                 ASSERT(PP_ISAGED(pp));
3304                                 pp->p_szc = 0;
3305                                 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3306                                 page_unlock_nocapture(pp);
3307                         }
3308
3309                         if (replpp != NULL)
3310                                 page_free_replacement_page(replpp);
3311
3312                         return (NULL);
3313                 }
3314                 ASSERT(pp == targpp);
3315
                     /*
                      * NOTE(review): hpp is assigned inside ASSERT, so it is
                      * presumably only set when ASSERT is compiled in; on this
                      * path it is only read by the ASSERT after the loop.
                      */
3316                 ASSERT(hpp = pp); /* That's right, it's an assignment */
3317
3318                 pp += npgs;
3319                 pgcnt -= npgs;
3320
                     /*
                      * Mark each relocated target page free with size code
                      * 'szc' and unlock the replacement page that took its
                      * place.
                      */
3321                 while (npgs--) {
3322                         ASSERT(PAGE_EXCL(targpp));
3323                         ASSERT(!PP_ISFREE(targpp));
3324                         ASSERT(!PP_ISNORELOC(targpp));
3325                         PP_SETFREE(targpp);
3326                         ASSERT(PP_ISAGED(targpp));
3327                         ASSERT(targpp->p_szc < szc || (szc == 0 &&
3328                             (flags & PGI_PGCPSZC0)));
3329                         targpp->p_szc = szc;
3330                         targpp = targpp->p_next;
3331
3332                         rpp = replpp;
3333                         ASSERT(rpp != NULL);
3334                         page_sub(&replpp, rpp);
3335                         ASSERT(PAGE_EXCL(rpp));
3336                         ASSERT(!PP_ISFREE(rpp));
3337                         page_unlock_nocapture(rpp);
3338                 }
3339                 ASSERT(targpp == hpp);
3340                 ASSERT(replpp == NULL);
3341                 page_list_concat(&pplist, &targpp);
3342         }
3343         CHK_LPG(pplist, szc);
3344         return (pplist);
3345 }
3346
3347 /*
3348  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3349  * of 0 means nothing left after trim.
      *
      * 'mseg' is the memseg being examined; 'pfnlo' and 'pfnhi' bound the
      * range the caller is interested in.  '*lo' and '*hi' are written only
      * when 1 is returned.
      *
      * Whether the first and last page of the memseg are PP_ISNORELOC decides
      * which part of the memseg lies inside the cage:
      *   first and last     - entire memseg in the cage, return 0
      *   first only         - lower part in the cage
      *   last only          - upper part in the cage
      *   neither            - entire memseg outside the cage
      * In the two partial cases the boundary comes from kcage_current_pfn()
      * and 0 is returned if that pfn is not within the memseg.
3350  */
3351 int
3352 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3353 {
3354         pfn_t   kcagepfn;
3355         int     decr;
3356         int     rc = 0;
3357
3358         if (PP_ISNORELOC(mseg->pages)) {
3359                 if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3360
3361                         /* lower part of this mseg inside kernel cage */
3362                         decr = kcage_current_pfn(&kcagepfn);
3363
3364                         /* kernel cage may have transitioned past mseg */
3365                         if (kcagepfn >= mseg->pages_base &&
3366                             kcagepfn < mseg->pages_end) {
3367                                 ASSERT(decr == 0);
3368                                 *lo = MAX(kcagepfn, pfnlo);
3369                                 *hi = MIN(pfnhi, (mseg->pages_end - 1));
3370                                 rc = 1;
3371                         }
3372                 }
3373                 /* else entire mseg in the cage */
3374         } else {
3375                 if (PP_ISNORELOC(mseg->epages - 1)) {
3376
3377                         /* upper part of this mseg inside kernel cage */
3378                         decr = kcage_current_pfn(&kcagepfn);
3379
3380                         /* kernel cage may have transitioned past mseg */
3381                         if (kcagepfn >= mseg->pages_base &&
3382                             kcagepfn < mseg->pages_end) {
3383                                 ASSERT(decr);
3384                                 *hi = MIN(kcagepfn, pfnhi);
3385                                 *lo = MAX(pfnlo, mseg->pages_base);
3386                                 rc = 1;
3387                         }
3388                 } else {
3389                         /* entire mseg outside of kernel cage */
3390                         *lo = MAX(pfnlo, mseg->pages_base);
3391                         *hi = MIN(pfnhi, (mseg->pages_end - 1));
3392                         rc = 1;
3393                 }
3394         }
3395         return (rc);
3396 }
3397
3398 /*
3399  * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3400  * page with size code 'szc'. Claiming such a page requires acquiring
3401  * exclusive locks on all constituent pages (page_trylock_contig_pages),
3402  * relocating pages in use and concatenating these constituent pages into a
3403  * large page.
3404  *
3405  * The page lists do not have such a large page and page_freelist_split has
3406  * already failed to demote larger pages and/or coalesce smaller free pages.
3407  *
3408  * 'flags' may specify PG_MATCH_COLOR which would limit the search of large
3409  * pages with the same color as 'bin'.
3410  *
3411  * 'pfnflag' specifies the subset of the pfn range to search.
      *
      * Returns the first constituent page of the claimed large page, or NULL
      * if the pfn range is too small, the memsegs lock could not be taken
      * with memsegs_trylock(), or no candidate could be locked and claimed.
      * The memsegs lock is dropped before returning on every path that
      * acquired it.
3412  */
3413
3414 static page_t *
3415 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3416     pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3417 {
3418         struct memseg *mseg;
3419         pgcnt_t szcpgcnt = page_get_pagecnt(szc);
3420         pgcnt_t szcpgmask = szcpgcnt - 1;
3421         pfn_t   randpfn;
3422         page_t *pp, *randpp, *endpp;
3423         uint_t colors, ceq_mask;
3424         uint_t color_mask;
3425         pfn_t hi, lo;
3426         uint_t skip;
3427         MEM_NODE_ITERATOR_DECL(it);
3428
3429         ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3430
3431         pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3432
             /*
              * The second test also catches pfnlo having been rounded up past
              * pfnhi, where the unsigned subtraction would wrap.
              */
3433         if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
3434                 return (NULL);
3435
3436         ASSERT(szc < mmu_page_sizes);
3437
             /*
              * ceq_mask selects the color bits that must match 'bin'; it is 0
              * (any color accepted) unless PG_MATCH_COLOR is set and there is
              * more than one color for this size.
              */
3438         colors = PAGE_GET_PAGECOLORS(szc);
3439         color_mask = colors - 1;
3440         if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3441                 uchar_t ceq = colorequivszc[szc];
3442                 uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3443
3444                 ASSERT(ceq_dif > 0);
3445                 ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3446         } else {
3447                 ceq_mask = 0;
3448         }
3449
3450         ASSERT(bin < colors);
3451
3452         /* clear "non-significant" color bits */
3453         bin &= ceq_mask;
3454
3455         /*
3456          * trim the pfn range to search based on pfnflag. pfnflag is set
3457          * when there have been previous page_get_contig_page failures to
3458          * limit the search.
3459          *
3460          * The high bit in pfnflag specifies the number of 'slots' in the
3461          * pfn range and the remainder of pfnflag specifies which slot.
3462          * For example, a value of 1010b would mean the second slot of
3463          * the pfn range that has been divided into 8 slots.
3464          */
3465         if (pfnflag > 1) {
3466                 int     slots = 1 << (highbit(pfnflag) - 1);
3467                 int     slotid = pfnflag & (slots - 1);
3468                 pgcnt_t szcpages;
3469                 int     slotlen;
3470
3471                 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
3472                 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3473                 slotlen = howmany(szcpages, slots);
3474                 /* skip if 'slotid' slot is empty */
3475                 if (slotid * slotlen >= szcpages)
3476                         return (NULL);
3477                 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3478                 ASSERT(pfnlo < pfnhi);
3479                 if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3480                         pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
3481         }
3482
3483         /*
3484          * This routine can be called recursively so we shouldn't
3485          * acquire a reader lock if a write request is pending. This
3486          * could lead to a deadlock with the DR thread.
3487          *
3488          * Returning NULL informs the caller that we could not get
3489          * a contig page with the required characteristics.
3490          */
3491
3492         if (!memsegs_trylock(0))
3493                 return (NULL);
3494
3495         /*
3496          * loop through memsegs to look for contig page candidates
3497          */
3498
3499         for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3500                 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3501                         /* no overlap */
3502                         continue;
3503                 }
3504
3505                 if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3506                         /* mseg too small */
3507                         continue;
3508
3509                 /*
3510                  * trim off kernel cage pages from pfn range and check for
3511                  * a trimmed pfn range returned that does not span the
3512                  * desired large page size.
3513                  */
3514                 if (kcage_on) {
3515                         if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
3516                             lo >= hi || ((hi - lo) + 1) < szcpgcnt)
3517                                 continue;
3518                 } else {
3519                         lo = MAX(pfnlo, mseg->pages_base);
3520                         hi = MIN(pfnhi, (mseg->pages_end - 1));
3521                 }
3522
3523                 /* round to szcpgcnt boundaries */
3524                 lo = P2ROUNDUP(lo, szcpgcnt);
3525
3526                 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3527                 hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
3528
3529                 if (hi <= lo)
3530                         continue;
3531
3532                 /*
3533                  * set lo to point to the pfn for the desired bin. Large
3534                  * page sizes may only have a single page color
3535                  */
3536                 skip = szcpgcnt;
3537                 if (ceq_mask > 0 || interleaved_mnodes) {
3538                         /* set lo to point at appropriate color */
3539                         if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3540                             (interleaved_mnodes &&
3541                             PFN_2_MEM_NODE(lo) != mnode)) {
3542                                 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3543                                     color_mask, &it);
3544                         }
3545                         if (hi <= lo)
3546                                 /* mseg cannot satisfy color request */
3547                                 continue;
3548                 }
3549
3550                 /* randomly choose a point between lo and hi to begin search */
3551
3552                 randpfn = (pfn_t)GETTICK();
3553                 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3554                 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
                     /*
                      * Move randpfn to a pfn of the wanted color; fall back to
                      * 'lo' if that lands at or beyond 'hi' (which includes
                      * randpfn being (pfn_t)-1).
                      */
3555                 if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
3556                         if (randpfn != (pfn_t)-1) {
3557                                 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3558                                     ceq_mask, color_mask, &it);
3559                         }
3560                         if (randpfn >= hi) {
3561                                 randpfn = lo;
3562                                 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
3563                                     &it);
3564                         }
3565                 }
3566                 randpp = mseg->pages + (randpfn - mseg->pages_base);
3567
3568                 ASSERT(randpp->p_pagenum == randpfn);
3569
3570                 pp = randpp;
3571                 endpp =  mseg->pages + (hi - mseg->pages_base) + 1;
3572
3573                 ASSERT(randpp + szcpgcnt <= endpp);
3574
                     /*
                      * Walk candidates from randpp, wrapping from endpp back to
                      * 'lo', until we are back at randpp.
                      */
3575                 do {
3576                         ASSERT(!(pp->p_pagenum & szcpgmask));
3577                         ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3578
3579                         if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3580                                 /* pages unlocked by page_claim on failure */
                                     /*
                                      * Only success or failure of the claim
                                      * is used here; 'pp' is what gets
                                      * returned to the caller.
                                      */
3581                                 if (page_claim_contig_pages(pp, szc, flags)) {
3582                                         memsegs_unlock(0);
3583                                         return (pp);
3584                                 }
3585                         }
3586
3587                         if (ceq_mask == 0 && !interleaved_mnodes) {
3588                                 pp += skip;
3589                         } else {
3590                                 pfn_t pfn = pp->p_pagenum;
3591
3592                                 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3593                                     ceq_mask, color_mask, &it);
3594                                 if (pfn == (pfn_t)-1) {
3595                                         pp = endpp;
3596                                 } else {
3597                                         pp = mseg->pages +
3598                                             (pfn - mseg->pages_base);
3599                                 }
3600                         }
3601                         if (pp >= endpp) {
3602                                 /* start from the beginning */
3603                                 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3604                                 pp = mseg->pages + (lo - mseg->pages_base);
3605                                 ASSERT(pp->p_pagenum == lo);
3606                                 ASSERT(pp + szcpgcnt <= endpp);
3607                         }
3608                 } while (pp != randpp);
3609         }
3610         memsegs_unlock(0);
3611         return (NULL);
3612 }
3613
3614
3615 /*
3616  * controlling routine that searches through physical memory in an attempt to
3617  * claim a large page based on the input parameters, for use when a suitable
3618  * page could not be obtained from the page free lists.
3619  *
3620  * calls page_geti_contig_pages with an initial pfn range from the mnode
3621  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3622  * that overlaps with the kernel cage or does not match the requested page
3623  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
3624  * page_geti_contig_pages may further limit the search range based on
3625  * previous failure counts (pgcpfailcnt[]).
3626  *
3627  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3628  * pagesize page that satisfies mtype.
      *
      * Returns the page obtained from page_geti_contig_pages, or NULL if the
      * mnode has no memory in the mtype range or every mtype tried failed.
      * On success a non-zero pgcpfailcnt[szc] is halved.
3629  */
3630 page_t *
3631 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3632     uint_t flags)
3633 {
3634         pfn_t           pfnlo, pfnhi;   /* contig pages pfn range */
3635         page_t          *pp;
3636         pgcnt_t         pfnflag = 0;    /* no limit on search if 0 */
3637
3638         VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3639
3640         /* no allocations from cage */
3641         flags |= PGI_NOCAGE;
3642
3643         MTYPE_START(mnode, mtype, flags);
3644         if (mtype < 0) {        /* mnode does not have memory in mtype range */
3645                 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3646                 return (NULL);
3647         }
3648
3649         ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3650
3651         /* do not limit search and ignore color if hi pri */
3652
3653         if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3654                 pfnflag = pgcpfailcnt[szc];
3655
3656         /* remove color match to improve chances */
3657
3658         if (flags & PGI_PGCPHIPRI || pfnflag)
3659                 flags &= ~PG_MATCH_COLOR;
3660
             /* try each mtype in turn until one yields a page */
3661         do {
3662                 /* get pfn range based on mnode and mtype */
3663                 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3664
3665                 ASSERT(pfnhi >= pfnlo);
3666
3667                 pp = page_geti_contig_pages(mnode, bin, szc, flags,
3668                     pfnlo, pfnhi, pfnflag);
3669
3670                 if (pp != NULL) {
3671                         pfnflag = pgcpfailcnt[szc];
3672                         if (pfnflag) {
3673                                 /* double the search size */
3674                                 pgcpfailcnt[szc] = pfnflag >> 1;
3675                         }
3676                         VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3677                         return (pp);
3678                 }
3679                 MTYPE_NEXT(mnode, mtype, flags);
3680         } while (mtype >= 0);
3681
3682         VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3683         return (NULL);
3684 }
3685
3686 #if defined(__i386) || defined(__amd64)
3687 /*
3688  * Determine the likelihood of finding/coalescing a szc page.
3689  * Return 0 if the likelihood is small otherwise return 1.
3690  *
3691  * For now, be conservative and check only 1g pages and return 0
3692  * if there had been previous coalescing failures and the szc pages
3693  * needed to satisfy request would exhaust most of freemem.
      *
      * Concretely: size codes 0 and 1 always return 1.  For larger size
      * codes 0 is returned when pgcpfailcnt[szc] is non-zero and
      * page_get_pagecnt(szc) + throttlefree >= freemem.
3694  */
3695 int
3696 page_chk_freelist(uint_t szc)
3697 {
3698         pgcnt_t         pgcnt;
3699
3700         if (szc <= 1)
3701                 return (1);
3702
3703         pgcnt = page_get_pagecnt(szc);
3704         if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
3705                 VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
3706                 return (0);
3707         }
3708         VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
3709         return (1);
3710 }
3711 #endif
3712
3713 /*
3714  * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
3715  *
3716  * Does its own locking and accounting.
3717  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3718  * pages of the proper color even if there are pages of a different color.
3719  *
3720  * Finds a page, removes it, THEN locks it.
      *
      * 'seg' supplies the address space (seg->s_as) and, together with 'vp'
      * and 'vaddr', is used to pick the color bin.  'off' is not passed to
      * anything in this function.  'size' is converted to a page size code
      * with page_szc(); an illegal size panics.  'lgrp' is the preferred
      * lgroup; if it does not satisfy LGRP_EXISTS() the current thread's home
      * lgroup is used.
      *
      * The memnodes are searched first with page_get_mnode_freelist and, if
      * that finds nothing and the conditions near the end of the function
      * hold, a second time with page_get_contig_pages.
      *
      * Returns the page found, or NULL.  For a base page size request without
      * PGI_PGCPSZC0, NULL is returned after the local search alone.
3721  */
3722
3723 /*ARGSUSED*/
3724 page_t *
3725 page_get_freelist(struct vnode *vp, uoff_t off, struct seg *seg,
3726         caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3727 {
3728         struct as       *as = seg->s_as;
3729         page_t          *pp = NULL;
3730         ulong_t         bin;
3731         uchar_t         szc;
3732         int             mnode;
3733         int             mtype;
3734         page_t          *(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3735         lgrp_mnode_cookie_t     lgrp_cookie;
3736
             /* first pass uses the free lists; may switch to contig pages below */
3737         page_get_func = page_get_mnode_freelist;
3738
3739         /*
3740          * If we aren't passed a specific lgroup, or passed a freed lgrp
3741          * assume we wish to allocate near to the current thread's home.
3742          */
3743         if (!LGRP_EXISTS(lgrp))
3744                 lgrp = lgrp_home_lgrp();
3745
3746         if (kcage_on) {
3747                 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3748                     kcage_freemem < kcage_throttlefree + btop(size) &&
3749                     curthread != kcage_cageout_thread) {
3750                         /*
3751                          * Set a "reserve" of kcage_throttlefree pages for
3752                          * PG_PANIC and cageout thread allocations.
3753                          *
3754                          * Everybody else has to serialize in
3755                          * page_create_get_something() to get a cage page, so
3756                          * that we don't deadlock cageout!
3757                          */
3758                         return (NULL);
3759                 }
3760         } else {
                     /* no cage: drop PG_NORELOC and set PGI_NOCAGE */
3761                 flags &= ~PG_NORELOC;
3762                 flags |= PGI_NOCAGE;
3763         }
3764
3765         MTYPE_INIT(mtype, vp, vaddr, flags, size);
3766
3767         /*
3768          * Convert size to page size code.
3769          */
3770         if ((szc = page_szc(size)) == (uchar_t)-1)
3771                 panic("page_get_freelist: illegal page size request");
3772         ASSERT(szc < mmu_page_sizes);
3773
3774         VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3775
3776         AS_2_BIN(as, seg, vp, vaddr, bin, szc);
3777
3778         ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3779
3780         /*
3781          * Try to get a local page first, but try remote if we can't
3782          * get a page of the right color.
3783          */
3784 pgretry:
3785         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3786         while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3787                 pp = page_get_func(mnode, bin, mtype, szc, flags);
3788                 if (pp != NULL) {
3789                         VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3790                         DTRACE_PROBE4(page__get,
3791                             lgrp_t *, lgrp,
3792                             int, mnode,
3793                             ulong_t, bin,
3794                             uint_t, flags);
3795                         return (pp);
3796                 }
3797         }
3798         ASSERT(pp == NULL);
3799
3800         /*
3801          * for non-SZC0 PAGESIZE requests, check cachelist before checking
3802          * remote free lists.  Caller expected to call page_get_cachelist which
3803          * will check local cache lists and remote free lists.
3804          */
3805         if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3806                 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3807                 return (NULL);
3808         }
3809
3810         ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3811
3812         lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3813
3814         if (!(flags & PG_LOCAL)) {
3815                 /*
3816                  * Try to get a non-local freelist page.
3817                  */
3818                 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3819                 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3820                         pp = page_get_func(mnode, bin, mtype, szc, flags);
3821                         if (pp != NULL) {
3822                                 DTRACE_PROBE4(page__get,
3823                                     lgrp_t *, lgrp,
3824                                     int, mnode,
3825                                     ulong_t, bin,
3826                                     uint_t, flags);
3827                                 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3828                                 return (pp);
3829                         }
3830                 }
3831                 ASSERT(pp == NULL);
3832         }
3833
3834         /*
3835          * when the cage is off chances are page_get_contig_pages() will fail
3836          * to lock a large page chunk therefore when the cage is off it's not
3837          * called by default.  this can be changed via /etc/system.
3838          *
3839          * page_get_contig_pages() also called to acquire a base pagesize page
3840          * for page_create_get_something().
              *
              * The page_get_func test ensures the retry happens at most once.
3841          */
3842         if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3843             (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3844             (page_get_func != page_get_contig_pages)) {
3845
3846                 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3847                 page_get_func = page_get_contig_pages;
3848                 goto pgretry;
3849         }
3850
             /*
              * Record the failure for this size code only when the contig
              * search was actually tried and was not restricted to local
              * memory.
              */
3851         if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3852             page_get_func == page_get_contig_pages)
3853                 SETPGCPFAILCNT(szc);
3854
3855         VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3856         return (NULL);
3857 }
3858
3859 /*
3860  * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
3861  *
3862  * Does its own locking.
3863  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3864  * pages of the proper color even if there are pages of a different color.
3865  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
3866  * try to lock one of them.  If no page can be locked, try the
3867  * next bin.  Return NULL if a page can not be found and locked.
3868  *
3869  * Finds a pages, trys to lock it, then removes it.
3870  */
3871
3872 /*ARGSUSED*/
3873 page_t *
3874 page_get_cachelist(struct vnode *vp, uoff_t off, struct seg *seg,
3875     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3876 {
3877         page_t          *pp;
3878         struct as       *as = seg->s_as;
3879         ulong_t         bin;
3880         int             mnode;
3881         int             mtype;
3882         lgrp_mnode_cookie_t     lgrp_cookie;
3883
3884         /*
3885          * If we aren't passed a specific lgroup, or pasased a freed lgrp
3886          * assume we wish to allocate near to the current thread's home.
3887          */
3888         if (!LGRP_EXISTS(lgrp))
3889                 lgrp = lgrp_home_lgrp();
3890
3891         if (!kcage_on) {
3892                 flags &= ~PG_NORELOC;
3893                 flags |= PGI_NOCAGE;
3894         }
3895
3896         if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3897             kcage_freemem <= kcage_throttlefree) {
3898                 /*
3899                  * Reserve kcage_throttlefree pages for critical kernel
3900                  * threads.
3901                  *
3902                  * Everybody else has to go to page_create_get_something()
3903                  * to get a cage page, so we don't deadlock cageout.
3904                  */
3905                 return (NULL);
3906         }
3907
3908         AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3909
3910         ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3911
3912         MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
3913
3914         VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3915
3916         /*
3917          * Try local cachelists first
3918          */
3919         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3920         while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3921                 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3922                 if (pp != NULL) {
3923                         VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3924                         DTRACE_PROBE4(page__get,
3925                             lgrp_t *, lgrp,
3926                             int, mnode,
3927                             ulong_t, bin,
3928                             uint_t, flags);
3929                         return (pp);
3930                 }
3931         }
3932
3933         lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3934
3935         /*
3936          * Try freelists/cachelists that are farther away
3937          * This is our only chance to allocate remote pages for PAGESIZE
3938          * requests.
3939          */
3940         LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3941         while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3942                 pp = page_get_mnode_freelist(mnode, bin, mtype,
3943                     0, flags);
3944                 if (pp != NULL) {
3945                         VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3946                         DTRACE_PROBE4(page__get,
3947                             lgrp_t *, lgrp,
3948                             int, mnode,
3949                             ulong_t, bin,
3950                             uint_t, flags);
3951                         return (pp);
3952                 }
3953                 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3954                 if (pp != NULL) {
3955                         VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3956                         DTRACE_PROBE4(page__get,
3957                             lgrp_t *, lgrp,
3958                             int, mnode,
3959                             ulong_t, bin,
3960                             uint_t, flags);
3961                         return (pp);
3962                 }
3963         }
3964
3965         VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3966         return (NULL);
3967 }
3968
3969 page_t *
3970 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3971 {
3972         kmutex_t                *pcm;
3973         page_t                  *pp, *first_pp;
3974         uint_t                  sbin;
3975         int                     plw_initialized;
3976         page_list_walker_t      plw;
3977
3978         VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3979
3980         MTYPE_START(mnode, mtype, flags);
3981         if (mtype < 0) {        /* mnode does not have memory in mtype range */
3982                 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3983                 return (NULL);
3984         }
3985
3986 try_again:
3987
3988         plw_initialized = 0;
3989         plw.plw_ceq_dif = 1;
3990
3991         /*
3992          * Only hold one cachelist lock at a time, that way we
3993          * can start anywhere and not have to worry about lock
3994          * ordering.
3995          */
3996
3997         for (plw.plw_count = 0;
3998             plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
3999                 sbin = bin;
4000                 do {
4001
4002                         if (!PAGE_CACHELISTS(mnode, bin, mtype))
4003                                 goto bin_empty_1;
4004                         pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
4005                         mutex_enter(pcm);
4006                         pp = PAGE_CACHELISTS(mnode, bin, mtype);
4007                         if (pp == NULL)
4008                                 goto bin_empty_0;
4009
4010                         first_pp = pp;
4011                         ASSERT(pp->p_vnode);
4012                         ASSERT(PP_ISAGED(pp) == 0);
4013                         ASSERT(pp->p_szc == 0);
4014                         ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
4015                         while (!page_trylock(pp, SE_EXCL)) {
4016                                 pp = pp->p_next;
4017                                 ASSERT(pp->p_szc == 0);
4018                                 if (pp == first_pp) {
4019                                         /*
4020                                          * We have searched the complete list!
4021                                          * And all of them (might only be one)
4022                                          * are locked. This can happen since
4023                                          * these pages can also be found via
4024                                          * the hash list. When found via the
4025                                          * hash list, they are locked first,
4026                                          * then removed. We give up to let the
4027                                          * other thread run.
4028                                          */
4029                                         pp = NULL;
4030                                         break;
4031                                 }
4032                                 ASSERT(pp->p_vnode);
4033                                 ASSERT(PP_ISFREE(pp));
4034                                 ASSERT(PP_ISAGED(pp) == 0);
4035                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
4036                                     mnode);
4037                         }
4038
4039                         if (pp) {
4040                                 page_t  **ppp;
4041                                 /*
4042                                  * Found and locked a page.
4043                                  * Pull it off the list.
4044                                  */
4045                                 ASSERT(mtype == PP_2_MTYPE(pp));
4046                                 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
4047                                 page_sub(ppp, pp);
4048                                 /*
4049                                  * Subtract counters before releasing pcm mutex
4050                                  * to avoid a race with page_freelist_coalesce
4051                                  * and page_freelist_split.
4052                                  */
4053                                 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
4054                                 mutex_exit(pcm);
4055                                 ASSERT(pp->p_vnode);
4056                                 ASSERT(PP_ISAGED(pp) == 0);
4057 #if defined(__sparc)
4058                                 ASSERT(!kcage_on ||
4059                                     (flags & PG_NORELOC) == 0 ||
4060                                     PP_ISNORELOC(pp));
4061                                 if (PP_ISNORELOC(pp)) {
4062                                         kcage_freemem_sub(1);
4063                                 }
4064 #endif
4065                                 VM_STAT_ADD(vmm_vmstats. pgmc_allocok);
4066                                 return (pp);
4067                         }
4068 bin_empty_0:
4069                         mutex_exit(pcm);
4070 bin_empty_1:
4071                         if (plw_initialized == 0) {
4072                                 page_list_walk_init(0, flags, bin, 0, 1, &plw);
4073                                 plw_initialized = 1;
4074                         }
4075                         /* calculate the next bin with equivalent color */
4076                         bin = ADD_MASKED(bin, plw.plw_bin_step,
4077                             plw.plw_ceq_mask[0], plw.plw_color_mask);
4078                 } while (sbin != bin);
4079
4080                 if (plw.plw_ceq_dif > 1)
4081                         bin = page_list_walk_next_bin(0, bin, &plw);
4082         }
4083
4084         MTYPE_NEXT(mnode, mtype, flags);
4085         if (mtype >= 0)
4086                 goto try_again;
4087
4088         VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
4089         return (NULL);
4090 }
4091
/*
 * Replacement-page statistics are compiled in only when DEBUG is defined.
 */
#ifdef DEBUG
#define REPL_PAGE_STATS
#endif /* DEBUG */

#ifdef REPL_PAGE_STATS
/*
 * Event counters for page_get_replacement_page(), bumped through
 * REPL_STAT_INCR().
 */
struct repl_page_stats {
        uint_t  ngets;          /* calls to page_get_replacement_page() */
        uint_t  ngets_noreloc;  /* calls where like_pp was PP_ISNORELOC */
        uint_t  npgr_noreloc;   /* calls made with PGR_NORELOC set */
        uint_t  nnopage_first;  /* not incremented by the code seen here */
        uint_t  nnopage;        /* calls that could not get all pages */
        uint_t  nhashout;       /* cachelist pages hashed out for reuse */
        uint_t  nnofree;        /* misses on the local mnode freelist */
        uint_t  nnext_pp;       /* pages moved onto the returned list */
} repl_page_stats;
/* Atomically increment counter `v' of repl_page_stats. */
#define REPL_STAT_INCR(v)       atomic_inc_32(&repl_page_stats.v)
#else /* REPL_PAGE_STATS */
/* Statistics disabled: REPL_STAT_INCR() expands to nothing. */
#define REPL_STAT_INCR(v)
#endif /* REPL_PAGE_STATS */

/*
 * When non-zero, page_get_replacement_page() tries page_get_contig_pages()
 * for a large-page request even if the caller did not ask for PGR_SAMESZC.
 */
int     pgrppgcp;
4113
4114 /*
4115  * The freemem accounting must be done by the caller.
4116  * First we try to get a replacement page of the same size as like_pp,
4117  * if that is not possible, then we just get a set of discontiguous
4118  * PAGESIZE pages.
4119  */
4120 page_t *
4121 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
4122     uint_t pgrflags)
4123 {
4124         page_t          *like_pp;
4125         page_t          *pp, *pplist;
4126         page_t          *pl = NULL;
4127         ulong_t         bin;
4128         int             mnode, page_mnode;
4129         int             szc;
4130         spgcnt_t        npgs, pg_cnt;
4131         pfn_t           pfnum;
4132         int             mtype;
4133         int             flags = 0;
4134         lgrp_mnode_cookie_t     lgrp_cookie;
4135         lgrp_t          *lgrp;
4136
4137         REPL_STAT_INCR(ngets);
4138         like_pp = orig_like_pp;
4139         ASSERT(PAGE_EXCL(like_pp));
4140
4141         szc = like_pp->p_szc;
4142         npgs = page_get_pagecnt(szc);
4143         /*
4144          * Now we reset like_pp to the base page_t.
4145          * That way, we won't walk past the end of this 'szc' page.
4146          */
4147         pfnum = PFN_BASE(like_pp->p_pagenum, szc);
4148         like_pp = page_numtopp_nolock(pfnum);
4149         ASSERT(like_pp->p_szc == szc);
4150
4151         if (PP_ISNORELOC(like_pp)) {
4152                 ASSERT(kcage_on);
4153                 REPL_STAT_INCR(ngets_noreloc);
4154                 flags = PGI_RELOCONLY;
4155         } else if (pgrflags & PGR_NORELOC) {
4156                 ASSERT(kcage_on);
4157                 REPL_STAT_INCR(npgr_noreloc);
4158                 flags = PG_NORELOC;
4159         }
4160
4161         /*
4162          * Kernel pages must always be replaced with the same size
4163          * pages, since we cannot properly handle demotion of kernel
4164          * pages.
4165          */
4166         if (PP_ISKAS(like_pp))
4167                 pgrflags |= PGR_SAMESZC;
4168
4169         MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
4170
4171         while (npgs) {
4172                 pplist = NULL;
4173                 for (;;) {
4174                         pg_cnt = page_get_pagecnt(szc);
4175                         bin = PP_2_BIN(like_pp);
4176                         ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
4177                         ASSERT(pg_cnt <= npgs);
4178
4179                         /*
4180                          * If an lgroup was specified, try to get the
4181                          * page from that lgroup.
4182                          * NOTE: Must be careful with code below because
4183                          *       lgroup may disappear and reappear since there
4184                          *       is no locking for lgroup here.
4185                          */
4186                         if (LGRP_EXISTS(lgrp_target)) {
4187                                 /*
4188                                  * Keep local variable for lgroup separate
4189                                  * from lgroup argument since this code should
4190                                  * only be exercised when lgroup argument
4191                                  * exists....
4192                                  */
4193                                 lgrp = lgrp_target;
4194
4195                                 /* Try the lgroup's freelists first */
4196                                 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4197                                     LGRP_SRCH_LOCAL);
4198                                 while ((pplist == NULL) &&
4199                                     (mnode = lgrp_memnode_choose(&lgrp_cookie))
4200                                     != -1) {
4201                                         pplist =
4202                                             page_get_mnode_freelist(mnode, bin,
4203                                             mtype, szc, flags);
4204                                 }
4205
4206                                 /*
4207                                  * Now try it's cachelists if this is a
4208                                  * small page. Don't need to do it for
4209                                  * larger ones since page_freelist_coalesce()
4210                                  * already failed.
4211                                  */
4212                                 if (pplist != NULL || szc != 0)
4213                                         break;
4214
4215                                 /* Now try it's cachelists */
4216                                 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4217                                     LGRP_SRCH_LOCAL);
4218
4219                                 while ((pplist == NULL) &&
4220                                     (mnode = lgrp_memnode_choose(&lgrp_cookie))
4221                                     != -1) {
4222                                         pplist =
4223                                             page_get_mnode_cachelist(bin, flags,
4224                                             mnode, mtype);
4225                                 }
4226                                 if (pplist != NULL) {
4227                                         page_hashout(pplist, false);
4228                                         PP_SETAGED(pplist);
4229                                         REPL_STAT_INCR(nhashout);
4230                                         break;
4231                                 }
4232                                 /* Done looking in this lgroup. Bail out. */
4233                                 break;
4234                         }
4235
4236                         /*
4237                          * No lgroup was specified (or lgroup was removed by
4238                          * DR, so just try to get the page as close to
4239                          * like_pp's mnode as possible.
4240                          * First try the local freelist...
4241                          */
4242                         mnode = PP_2_MEM_NODE(like_pp);
4243                         pplist = page_get_mnode_freelist(mnode, bin,
4244                             mtype, szc, flags);
4245                         if (pplist != NULL)
4246                                 break;
4247
4248                         REPL_STAT_INCR(nnofree);
4249
4250                         /*
4251                          * ...then the local cachelist. Don't need to do it for
4252                          * larger pages cause page_freelist_coalesce() already
4253                          * failed there anyway.
4254                          */
4255                         if (szc == 0) {
4256                                 pplist = page_get_mnode_cachelist(bin, flags,
4257                                     mnode, mtype);
4258                                 if (pplist != NULL) {
4259                                         page_hashout(pplist, false);
4260                                         PP_SETAGED(pplist);
4261                                         REPL_STAT_INCR(nhashout);
4262                                         break;
4263                                 }
4264                         }
4265
4266                         /* Now try remote freelists */
4267                         page_mnode = mnode;
4268                         lgrp =
4269                             lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4270                         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4271                             LGRP_SRCH_HIER);
4272                         while (pplist == NULL &&
4273                             (mnode = lgrp_memnode_choose(&lgrp_cookie))
4274                             != -1) {
4275                                 /*
4276                                  * Skip local mnode.
4277                                  */
4278                                 if ((mnode == page_mnode) ||
4279                                     (mem_node_config[mnode].exists == 0))
4280                                         continue;
4281
4282                                 pplist = page_get_mnode_freelist(mnode,
4283                                     bin, mtype, szc, flags);
4284                         }
4285
4286                         if (pplist != NULL)
4287                                 break;
4288
4289
4290                         /* Now try remote cachelists */
4291                         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4292                             LGRP_SRCH_HIER);
4293                         while (pplist == NULL && szc == 0) {
4294                                 mnode = lgrp_memnode_choose(&lgrp_cookie);
4295                                 if (mnode == -1)
4296                                         break;
4297                                 /*
4298                                  * Skip local mnode.
4299                                  */
4300                                 if ((mnode == page_mnode) ||
4301                                     (mem_node_config[mnode].exists == 0))
4302                                         continue;
4303
4304                                 pplist = page_get_mnode_cachelist(bin,
4305                                     flags, mnode, mtype);
4306
4307                                 if (pplist != NULL) {
4308                                         page_hashout(pplist, false);
4309                                         PP_SETAGED(pplist);
4310                                         REPL_STAT_INCR(nhashout);
4311                                         break;
4312                                 }
4313                         }
4314
4315                         /*
4316                          * Break out of while loop under the following cases:
4317                          * - If we successfully got a page.
4318                          * - If pgrflags specified only returning a specific
4319                          *   page size and we could not find that page size.
4320                          * - If we could not satisfy the request with PAGESIZE
4321                          *   or larger pages.
4322                          */
4323                         if (pplist != NULL || szc == 0)
4324                                 break;
4325
4326                         if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4327                                 /* try to find contig page */
4328
4329                                 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4330                                     LGRP_SRCH_HIER);
4331
4332                                 while ((pplist == NULL) &&
4333                                     (mnode =
4334                                     lgrp_memnode_choose(&lgrp_cookie))
4335                                     != -1) {
4336                                         pplist = page_get_contig_pages(
4337                                             mnode, bin, mtype, szc,
4338                                             flags | PGI_PGCPHIPRI);
4339                                 }
4340                                 break;
4341                         }
4342
4343                         /*
4344                          * The correct thing to do here is try the next
4345                          * page size down using szc--. Due to a bug
4346                          * with the processing of HAT_RELOAD_SHARE
4347                          * where the sfmmu_ttecnt arrays of all
4348                          * hats sharing an ISM segment don't get updated,
4349                          * using intermediate size pages for relocation
4350                          * can lead to continuous page faults.
4351                          */
4352                         szc = 0;
4353                 }
4354
4355                 if (pplist != NULL) {
4356                         DTRACE_PROBE4(page__get,
4357                             lgrp_t *, lgrp,
4358                             int, mnode,
4359                             ulong_t, bin,
4360                             uint_t, flags);
4361
4362                         while (pplist != NULL && pg_cnt--) {
4363                                 ASSERT(pplist != NULL);
4364                                 pp = pplist;
4365                                 page_sub(&pplist, pp);
4366                                 PP_CLRFREE(pp);
4367                                 PP_CLRAGED(pp);
4368                                 page_list_concat(&pl, &pp);
4369                                 npgs--;
4370                                 like_pp = like_pp + 1;
4371                                 REPL_STAT_INCR(nnext_pp);
4372                         }
4373                         ASSERT(pg_cnt == 0);
4374                 } else {
4375                         break;
4376                 }
4377         }
4378
4379         if (npgs) {
4380                 /*
4381                  * We were unable to allocate the necessary number
4382                  * of pages.
4383                  * We need to free up any pl.
4384                  */
4385                 REPL_STAT_INCR(nnopage);
4386                 page_free_replacement_page(pl);
4387                 return (NULL);
4388         } else {
4389                 return (pl);
4390         }
4391 }
4392
4393 /*
4394  * demote a free large page to it's constituent pages
4395  */
4396 void
4397 page_demote_free_pages(page_t *pp)
4398 {
4399
4400         int mnode;
4401
4402         ASSERT(pp != NULL);
4403         ASSERT(PAGE_LOCKED(pp));
4404         ASSERT(PP_ISFREE(pp));
4405         ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4406
4407         mnode = PP_2_MEM_NODE(pp);
4408         page_freelist_lock(mnode);
4409         if (pp->p_szc != 0) {
4410                 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4411                     pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4412         }
4413         page_freelist_unlock(mnode);
4414         ASSERT(pp->p_szc == 0);
4415 }
4416
4417 /*
4418  * Factor in colorequiv to check additional 'equivalent' bins.
4419  * colorequiv may be set in /etc/system
4420  */
4421 void
4422 page_set_colorequiv_arr(void)
4423 {
4424         if (colorequiv > 1) {
4425                 int i;
4426                 uint_t sv_a = lowbit(colorequiv) - 1;
4427
4428                 if (sv_a > 15)
4429                         sv_a = 15;
4430
4431                 for (i = 0; i < MMU_PAGE_SIZES; i++) {
4432                         uint_t colors;
4433                         uint_t a = sv_a;
4434
4435                         if ((colors = hw_page_array[i].hp_colors) <= 1) {
4436                                 continue;
4437                         }
4438                         while ((colors >> a) == 0)
4439                                 a--;
4440                         if ((a << 4) > colorequivszc[i]) {
4441                                 colorequivszc[i] = (a << 4);
4442                         }
4443                 }
4444         }
4445 }