kernel/vm/vm_pagelist.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24
  25 /*
  26  * Copyright 2012 Joyent, Inc.  All rights reserved.
  27  */
  28
  29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  30 /*      All Rights Reserved   */
  31
  32 /*
  33  * Portions of this source code were derived from Berkeley 4.3 BSD
  34  * under license from the Regents of the University of California.
  35  */
  36
  37
  38 /*
  39  * This file contains common functions to access and manage the page lists.
  40  * Many of these routines originated from platform dependent modules
  41  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
  42  * a platform independent manner.
  43  *
  44  * vm/vm_dep.h provides for platform specific support.
  45  */
  46
  47 #include <sys/types.h>
  48 #include <sys/debug.h>
  49 #include <sys/cmn_err.h>
  50 #include <sys/systm.h>
  51 #include <sys/atomic.h>
  52 #include <sys/sysmacros.h>
  53 #include <vm/as.h>
  54 #include <vm/page.h>
  55 #include <vm/seg_kmem.h>
  56 #include <vm/seg_vn.h>
  57 #include <sys/vmsystm.h>
  58 #include <sys/memnode.h>
  59 #include <vm/vm_dep.h>
  60 #include <sys/lgrp.h>
  61 #include <sys/mem_config.h>
  62 #include <sys/callb.h>
  63 #include <sys/mem_cage.h>
  64 #include <sys/sdt.h>
  65 #include <sys/dumphdr.h>
  66 #include <sys/swap.h>
  67
/* Defined outside this file; not referenced in the visible part of it. */
extern uint_t   vac_colors;

/* Largest alignment this file will request through "#pragma align". */
#define MAX_PRAGMA_ALIGN        128

/*
 * vm_cpu_data0 for the boot cpu before kmem is initialized.
 *
 * cpu_vm_data_init() hands out this static buffer for CPU0 instead of
 * calling kmem_zalloc().  The buffer is aligned to L2CACHE_ALIGN_MAX,
 * capped at MAX_PRAGMA_ALIGN.
 */

#if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
#pragma align   L2CACHE_ALIGN_MAX(vm_cpu_data0)
#else
#pragma align   MAX_PRAGMA_ALIGN(vm_cpu_data0)
#endif
char            vm_cpu_data0[VM_CPU_DATA_PADSIZE];
  80
/*
 * number of page colors equivalent to requested color in page_get routines.
 * If set, keeps large pages intact longer and keeps MPO allocation
 * from the local mnode in favor of acquiring the 'correct' page color from
 * a demoted large page or from a remote mnode.
 */
uint_t  colorequiv;

/*
 * color equivalency mask for each page size.
 * Mask is computed based on cpu L2$ way sizes and colorequiv global.
 * High 4 bits determine the number of high order bits of the color to ignore.
 * Low 4 bits determine the number of low order bits of color to ignore (it's
 * only relevant for hashed index based page coloring).
 */
uchar_t colorequivszc[MMU_PAGE_SIZES];

/*
 * if set, specifies the percentage of large pages that are free from within
 * a large page region before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available when page_trylock_contig_pages
 * can be more selective.
 */

int     ptcpthreshold;
 108
/*
 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
 * Enabled by default via pgcplimitsearch.
 *
 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
 * bound. This upper bound range guarantees:
 *    - all large page 'slots' will be searched over time
 *    - the minimum (1) large page candidates considered on each pgcp call
 *    - count doesn't wrap around to 0
 *
 * pgcpfailcnt[] holds one failure count per page size code; it is updated
 * through the SETPGCPFAILCNT() macro.
 */
pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES];
int     pgcplimitsearch = 1;
 122
 123 #define PGCPFAILMAX             (1 << (highbit(physinstalled) - 1))
 124 #define SETPGCPFAILCNT(szc)                                             \
 125         if (++pgcpfailcnt[szc] >= PGCPFAILMAX)                          \
 126                 pgcpfailcnt[szc] = PGCPFAILMAX / 2;
 127
/* Statistics structure; only present in builds that define VM_STATS. */
#ifdef VM_STATS
struct vmm_vmstats_str  vmm_vmstats;

#endif /* VM_STATS */

/*
 * LPGCREATE is the initial value of pg_lpgcreate_nocage:
 * 0 on sparc, 1 everywhere else.
 */
#if defined(__sparc)
#define LPGCREATE       0
#else
/* enable page_get_contig_pages */
#define LPGCREATE       1
#endif

/*
 * NOTE(review): presumably a non-zero pg_contig_disable turns
 * page_get_contig_pages off -- confirm against its use sites, which are
 * outside this part of the file.
 */
int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;

/*
 * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
 */
#define PFNNULL         0

/* Flags involved in promotion and demotion routines */
#define PC_FREE         0x1     /* put page on freelist */
#define PC_ALLOC        0x2     /* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 */
#define PC_NO_COLOR     (-1)

/* mtype value for page_promote to use when mtype does not matter */
#define PC_MTYPE_ANY    (-1)
 160
/*
 * page counters candidates info
 * See page_ctrs_cands comment below for more details.
 * fields are as follows:
 *      pcc_pages_free:         # pages which freelist coalesce can create
 *      pcc_color_free:         pointer to page free counts per color
 *      pad:                    unused filler words
 *                              (NOTE(review): presumably sizes the entry to
 *                              a cache line -- confirm)
 *
 * page_ctrs_alloc() points pcc_color_free at an array with one pgcnt_t
 * per page color of the corresponding region size.
 */
typedef struct pcc_info {
        pgcnt_t pcc_pages_free;
        pgcnt_t *pcc_color_free;
        uint_t  pad[12];
} pcc_info_t;
 173
/*
 * On big machines it can take a long time to check page_counters
 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 * updated sum of all elements of the corresponding page_counters arrays.
 * page_freelist_coalesce() searches page_counters only if an appropriate
 * element of page_ctrs_cands array is greater than 0.
 *
 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
 */
pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), and region size (r)
 *
 * The total is the sum of pcc_pages_free over all NPC_MUTEX slots.
 * The macro expands to a braced block that declares its own "int i", so
 * none of the arguments may be an expression that refers to a caller
 * variable named i.  val must be an assignable lvalue.
 */
#define PGCTRS_CANDS_GETVALUE(m, g, r, val) {                           \
        int i;                                                          \
        val = 0;                                                        \
        for (i = 0; i < NPC_MUTEX; i++) {                               \
            val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;    \
        }                                                               \
}

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), region size (r), and color (c)
 *
 * Same summation as PGCTRS_CANDS_GETVALUE, but over pcc_color_free[c].
 * Asserts that c is a valid color for region size r.  The same caveat
 * about the macro-local "int i" applies.
 */
#define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {                   \
        int i;                                                          \
        val = 0;                                                        \
        ASSERT((c) < PAGE_GET_PAGECOLORS(r));                           \
        for (i = 0; i < NPC_MUTEX; i++) {                               \
            val +=                                                      \
                page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];  \
        }                                                               \
}
 210
/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 *
 * page_ctrs_alloc() points each ctr_mutex[i] at an array of max_mem_nodes
 * mutexes.
 */
static kmutex_t *ctr_mutex[NPC_MUTEX];

/*
 * Lock index for page pp: its page frame number shifted right by the
 * shift of the largest page size, masked with (NPC_MUTEX - 1).  The mask
 * only yields a uniform index if NPC_MUTEX is a power of two.
 */
#define PP_CTR_LOCK_INDX(pp)                                            \
        (((pp)->p_pagenum >>                                            \
            (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))

/* All-ones sentinels; their users are outside this part of the file. */
#define INVALID_COLOR 0xffffffff
#define INVALID_MASK  0xffffffff
 226
 227 /*
 228  * Local functions prototypes.
 229  */
 230
 231 void page_ctr_add(int, int, page_t *, int);
 232 void page_ctr_add_internal(int, int, page_t *, int);
 233 void page_ctr_sub(int, int, page_t *, int);
 234 void page_ctr_sub_internal(int, int, page_t *, int);
 235 void page_freelist_lock(int);
 236 void page_freelist_unlock(int);
 237 page_t *page_promote(int, pfn_t, uchar_t, int, int);
 238 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
 239 page_t *page_freelist_split(uchar_t,
 240     uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
 241 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
 242 static int page_trylock_cons(page_t *pp, se_t se);
 243
 244 /*
 245  * The page_counters array below is used to keep track of free contiguous
 246  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
 247  * This contains an array of counters, the size of the array, a shift value
 248  * used to convert a pagenum into a counter array index or vice versa, as
 249  * well as a cache of the last successful index to be promoted to a larger
 250  * page size.  As an optimization, we keep track of the last successful index
 251  * to be promoted per page color for the given size region, and this is
 252  * allocated dynamically based upon the number of colors for a given
 253  * region size.
 254  *
 255  * Conceptually, the page counters are represented as:
 256  *
 257  *      page_counters[region_size][mnode]
 258  *
 259  *      region_size:    size code of a candidate larger page made up
 260  *                      of contiguous free smaller pages.
 261  *
 262  *      page_counters[region_size][mnode].hpm_counters[index]:
 263  *              represents how many (region_size - 1) pages either
 264  *              exist or can be created within the given index range.
 265  *
 266  * Let's look at a sparc example:
 267  *      If we want to create a free 512k page, we look at region_size 2
 268  *      for the mnode we want.  We calculate the index and look at a specific
 269  *      hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
 270  *      this location, it means that 8 64k pages either exist or can be created
 271  *      from 8K pages in order to make a single free 512k page at the given
 272  *      index.  Note that when a region is full, it will contribute to the
 273  *      counts in the region above it.  Thus we will not know what page
 274  *      size the free pages will be which can be promoted to this new free
 275  *      page unless we look at all regions below the current region.
 276  */
 277
/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic. The fields are as follows:
 *
 *      hpm_counters:   dynamically allocated array to hold counter data
 *      hpm_entries:    entries in hpm_counters
 *      hpm_shift:      shift for pnum/array index conv
 *      hpm_base:       PFN mapped to counter index 0
 *      hpm_color_current:      last index in counter array for this color at
 *                              which we successfully created a large page
 *                              (one array of per-color indices for each
 *                              memory range, up to MAX_MNODE_MRANGES)
 *      pad:            sparc only; unused filler words
 *
 * When interleaved_mnodes is set, hpm_counters of several mnodes point at
 * the same array (see page_ctrs_alloc()).
 */
typedef struct hw_page_map {
        hpmctr_t        *hpm_counters;
        size_t          hpm_entries;
        int             hpm_shift;
        pfn_t           hpm_base;
        size_t          *hpm_color_current[MAX_MNODE_MRANGES];
#if defined(__sparc)
        uint_t          pad[4];
#endif
} hw_page_map_t;
 300
/*
 * Per region size pointer to an array of max_mem_nodes hw_page_map_t,
 * set up by page_ctrs_alloc().
 * Element zero is not used, but is allocated for convenience.
 */
static hw_page_map_t *page_counters[MMU_PAGE_SIZES];

/*
 * Cached value of MNODE_RANGE_CNT(mnode).
 * This is a function call in x86.
 *
 * mnode_maxmrange[] likewise caches MNODE_MAX_MRANGE(mnode).
 * Both arrays are filled in by page_ctrs_sz() for every mnode that exists.
 */
static int mnode_nranges[MAX_MEM_NODES];
static int mnode_maxmrange[MAX_MEM_NODES];
 312
/*
 * The following macros are convenient ways to get access to the individual
 * elements of the page_counters arrays.  They can be used on both
 * the left side and right side of equations.
 */

/* Counter at position idx for region size rg_szc in mnode. */
#define PAGE_COUNTERS(mnode, rg_szc, idx)                       \
        (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])

/* Base pointer of the counter array itself. */
#define PAGE_COUNTERS_COUNTERS(mnode, rg_szc)                   \
        (page_counters[(rg_szc)][(mnode)].hpm_counters)

/* Shift used to convert between a pfn and a counter index. */
#define PAGE_COUNTERS_SHIFT(mnode, rg_szc)                      \
        (page_counters[(rg_szc)][(mnode)].hpm_shift)

/* Number of entries in the counter array. */
#define PAGE_COUNTERS_ENTRIES(mnode, rg_szc)                    \
        (page_counters[(rg_szc)][(mnode)].hpm_entries)

/* PFN that corresponds to counter index 0. */
#define PAGE_COUNTERS_BASE(mnode, rg_szc)                       \
        (page_counters[(rg_szc)][(mnode)].hpm_base)

/* Per-color "current index" array for memory range g. */
#define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)             \
        (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])

/* Single "current index" slot for the given color and memory range. */
#define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)       \
        (page_counters[(rg_szc)][(mnode)].                              \
        hpm_color_current[(mrange)][(color)])

/* Convert a page frame number into an index into the counter array. */
#define PNUM_TO_IDX(mnode, rg_szc, pnum)                        \
        (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>    \
                PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))

/* Convert a counter array index back into the first pfn it covers. */
#define IDX_TO_PNUM(mnode, rg_szc, index)                       \
        (PAGE_COUNTERS_BASE((mnode), (rg_szc)) +                \
                ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))

/*
 * Protects the hpm_counters and hpm_color_current memory from changing while
 * looking at page counters information.
 * Grab the write lock to modify what these fields point at.
 * Grab the read lock to prevent any pointers from changing.
 * The write lock can not be held during memory allocation due to a possible
 * recursion deadlock with trying to grab the read lock while the
 * write lock is already held.
 *
 * One lock per possible mnode; all of them are initialized at the end of
 * page_ctrs_alloc().
 */
krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
 358
 359
 360 /*
 361  * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
 362  */
 363 void
 364 cpu_vm_data_init(struct cpu *cp)
 365 {
 366         if (cp == CPU0) {
 367                 cp->cpu_vm_data = (void *)&vm_cpu_data0;
 368         } else {
 369                 void    *kmptr;
 370                 int     align;
 371                 size_t  sz;
 372
 373                 align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
 374                 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
 375                 kmptr = kmem_zalloc(sz, KM_SLEEP);
 376                 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
 377                 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
 378                 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
 379         }
 380 }
 381
 382 /*
 383  * free cpu_vm_data
 384  */
 385 void
 386 cpu_vm_data_destroy(struct cpu *cp)
 387 {
 388         if (cp->cpu_seqid && cp->cpu_vm_data) {
 389                 ASSERT(cp != CPU0);
 390                 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
 391                     ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
 392         }
 393         cp->cpu_vm_data = NULL;
 394 }
 395
 396
 397 /*
 398  * page size to page size code
 399  */
 400 int
 401 page_szc(size_t pagesize)
 402 {
 403         int     i = 0;
 404
 405         while (hw_page_array[i].hp_size) {
 406                 if (pagesize == hw_page_array[i].hp_size)
 407                         return (i);
 408                 i++;
 409         }
 410         return (-1);
 411 }
 412
 413 /*
 414  * page size to page size code with the restriction that it be a supported
 415  * user page size.  If it's not a supported user page size, -1 will be returned.
 416  */
 417 int
 418 page_szc_user_filtered(size_t pagesize)
 419 {
 420         int szc = page_szc(pagesize);
 421         if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
 422                 return (szc);
 423         }
 424         return (-1);
 425 }
 426
 427 /*
 428  * Return how many page sizes are available for the user to use.  This is
 429  * what the hardware supports and not based upon how the OS implements the
 430  * support of different page sizes.
 431  *
 432  * If legacy is non-zero, return the number of pagesizes available to legacy
 433  * applications. The number of legacy page sizes might be less than the
 434  * exported user page sizes. This is to prevent legacy applications that
 435  * use the largest page size returned from getpagesizes(3c) from inadvertantly
 436  * using the 'new' large pagesizes.
 437  */
 438 uint_t
 439 page_num_user_pagesizes(int legacy)
 440 {
 441         if (legacy)
 442                 return (mmu_legacy_page_sizes);
 443         return (mmu_exported_page_sizes);
 444 }
 445
/*
 * Return the total number of page sizes (mmu_page_sizes), as opposed to
 * the user-visible counts reported by page_num_user_pagesizes().
 */
uint_t
page_num_pagesizes(void)
{
        return (mmu_page_sizes);
}
 451
 452 /*
 453  * returns the count of the number of base pagesize pages associated with szc
 454  */
 455 pgcnt_t
 456 page_get_pagecnt(uint_t szc)
 457 {
 458         if (szc >= mmu_page_sizes)
 459                 panic("page_get_pagecnt: out of range %d", szc);
 460         return (hw_page_array[szc].hp_pgcnt);
 461 }
 462
 463 size_t
 464 page_get_pagesize(uint_t szc)
 465 {
 466         if (szc >= mmu_page_sizes)
 467                 panic("page_get_pagesize: out of range %d", szc);
 468         return (hw_page_array[szc].hp_size);
 469 }
 470
 471 /*
 472  * Return the size of a page based upon the index passed in.  An index of
 473  * zero refers to the smallest page size in the system, and as index increases
 474  * it refers to the next larger supported page size in the system.
 475  * Note that szc and userszc may not be the same due to unsupported szc's on
 476  * some systems.
 477  */
 478 size_t
 479 page_get_user_pagesize(uint_t userszc)
 480 {
 481         uint_t szc = USERSZC_2_SZC(userszc);
 482
 483         if (szc >= mmu_page_sizes)
 484                 panic("page_get_user_pagesize: out of range %d", szc);
 485         return (hw_page_array[szc].hp_size);
 486 }
 487
 488 uint_t
 489 page_get_shift(uint_t szc)
 490 {
 491         if (szc >= mmu_page_sizes)
 492                 panic("page_get_shift: out of range %d", szc);
 493         return (PAGE_GET_SHIFT(szc));
 494 }
 495
 496 uint_t
 497 page_get_pagecolors(uint_t szc)
 498 {
 499         if (szc >= mmu_page_sizes)
 500                 panic("page_get_pagecolors: out of range %d", szc);
 501         return (PAGE_GET_PAGECOLORS(szc));
 502 }
 503
 504 /*
 505  * this assigns the desired equivalent color after a split
 506  */
 507 uint_t
 508 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
 509     uint_t ncolor, uint_t ceq_mask)
 510 {
 511         ASSERT(nszc > szc);
 512         ASSERT(szc < mmu_page_sizes);
 513         ASSERT(color < PAGE_GET_PAGECOLORS(szc));
 514         ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
 515
 516         color &= ceq_mask;
 517         ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
 518         return (color | (ncolor & ~ceq_mask));
 519 }
 520
/*
 * The interleaved_mnodes flag is set when mnodes overlap in
 * the physbase..physmax range, but have disjoint slices.
 * In this case hpm_counters is shared by all mnodes.
 * This flag is set dynamically by the platform.
 * Defaults to 0 (no sharing).
 */
int interleaved_mnodes = 0;
 528
 529 /*
 530  * Called by startup().
 531  * Size up the per page size free list counters based on physmax
 532  * of each node and max_mem_nodes.
 533  *
 534  * If interleaved_mnodes is set we need to find the first mnode that
 535  * exists. hpm_counters for the first mnode will then be shared by
 536  * all other mnodes. If interleaved_mnodes is not set, just set
 537  * first=mnode each time. That means there will be no sharing.
 538  */
 539 size_t
 540 page_ctrs_sz(void)
 541 {
 542         int     r;              /* region size */
 543         int     mnode;
 544         int     firstmn;        /* first mnode that exists */
 545         int     nranges;
 546         pfn_t   physbase;
 547         pfn_t   physmax;
 548         uint_t  ctrs_sz = 0;
 549         int     i;
 550         pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
 551
 552         /*
 553          * We need to determine how many page colors there are for each
 554          * page size in order to allocate memory for any color specific
 555          * arrays.
 556          */
 557         for (i = 0; i < mmu_page_sizes; i++) {
 558                 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
 559         }
 560
 561         for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
 562
 563                 pgcnt_t r_pgcnt;
 564                 pfn_t   r_base;
 565                 pgcnt_t r_align;
 566
 567                 if (mem_node_config[mnode].exists == 0)
 568                         continue;
 569
 570                 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
 571                 nranges = MNODE_RANGE_CNT(mnode);
 572                 mnode_nranges[mnode] = nranges;
 573                 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
 574
 575                 /*
 576                  * determine size needed for page counter arrays with
 577                  * base aligned to large page size.
 578                  */
 579                 for (r = 1; r < mmu_page_sizes; r++) {
 580                         /* add in space for hpm_color_current */
 581                         ctrs_sz += sizeof (size_t) *
 582                             colors_per_szc[r] * nranges;
 583
 584                         if (firstmn != mnode)
 585                                 continue;
 586
 587                         /* add in space for hpm_counters */
 588                         r_align = page_get_pagecnt(r);
 589                         r_base = physbase;
 590                         r_base &= ~(r_align - 1);
 591                         r_pgcnt = howmany(physmax - r_base + 1, r_align);
 592
 593                         /*
 594                          * Round up to always allocate on pointer sized
 595                          * boundaries.
 596                          */
 597                         ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
 598                             sizeof (hpmctr_t *));
 599                 }
 600         }
 601
 602         for (r = 1; r < mmu_page_sizes; r++) {
 603                 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
 604         }
 605
 606         /* add in space for page_ctrs_cands and pcc_color_free */
 607         ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
 608             mmu_page_sizes * NPC_MUTEX;
 609
 610         for (mnode = 0; mnode < max_mem_nodes; mnode++) {
 611
 612                 if (mem_node_config[mnode].exists == 0)
 613                         continue;
 614
 615                 nranges = mnode_nranges[mnode];
 616                 ctrs_sz += sizeof (pcc_info_t) * nranges *
 617                     mmu_page_sizes * NPC_MUTEX;
 618                 for (r = 1; r < mmu_page_sizes; r++) {
 619                         ctrs_sz += sizeof (pgcnt_t) * nranges *
 620                             colors_per_szc[r] * NPC_MUTEX;
 621                 }
 622         }
 623
 624         /* ctr_mutex */
 625         ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
 626
 627         /* size for page list counts */
 628         PLCNT_SZ(ctrs_sz);
 629
 630         /*
 631          * add some slop for roundups. page_ctrs_alloc will roundup the start
 632          * address of the counters to ecache_alignsize boundary for every
 633          * memory node.
 634          */
 635         return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
 636 }
 637
/*
 * Carve the page counter data structures out of the memory that starts at
 * alloc_base and return the address just past what was consumed.
 *
 * The pieces are laid out in this order:
 *   1. one array of max_mem_nodes hw_page_map_t per region size
 *      (page_counters[r])
 *   2. page_ctrs_cands pointer arrays, the pcc_info_t arrays they point
 *      at, and the per color pcc_color_free arrays
 *   3. the ctr_mutex arrays
 *   4. whatever PLCNT_INIT() takes for the page list counts
 *   5. per existing mnode and region size: the hpm_color_current arrays
 *      and (once per set of sharing mnodes) the hpm_counters array
 *
 * The layout has to stay within what page_ctrs_sz() computed, and
 * mnode_nranges[] is read here, so page_ctrs_sz() must have run first.
 * Nothing in this function clears the memory it hands out.
 * NOTE(review): presumably the caller supplies zeroed memory -- confirm.
 */
caddr_t
page_ctrs_alloc(caddr_t alloc_base)
{
        int     mnode;
        int     mrange, nranges;
        int     r;              /* region size */
        int     i;
        int     firstmn;        /* first mnode that exists */
        pfn_t   physbase;
        pfn_t   physmax;
        pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

        /*
         * We need to determine how many page colors there are for each
         * page size in order to allocate memory for any color specific
         * arrays.
         */
        for (i = 0; i < mmu_page_sizes; i++) {
                colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
        }

        /* hw_page_map_t arrays; page_counters[0] is left untouched */
        for (r = 1; r < mmu_page_sizes; r++) {
                page_counters[r] = (hw_page_map_t *)alloc_base;
                alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
        }

        /* page_ctrs_cands and pcc_color_free array */
        for (i = 0; i < NPC_MUTEX; i++) {
                for (r = 1; r < mmu_page_sizes; r++) {

                        /* per mnode pointer table for this (i, r) pair */
                        page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
                        alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;

                        for (mnode = 0; mnode < max_mem_nodes; mnode++) {
                                pcc_info_t *pi;

                                /* slots of absent mnodes are not set */
                                if (mem_node_config[mnode].exists == 0)
                                        continue;

                                nranges = mnode_nranges[mnode];

                                /* one pcc_info_t per memory range */
                                pi = (pcc_info_t *)alloc_base;
                                alloc_base += sizeof (pcc_info_t) * nranges;
                                page_ctrs_cands[i][r][mnode] = pi;

                                /* one free count per color, per range */
                                for (mrange = 0; mrange < nranges; mrange++) {
                                        pi->pcc_color_free =
                                            (pgcnt_t *)alloc_base;
                                        alloc_base += sizeof (pgcnt_t) *
                                            colors_per_szc[r];
                                        pi++;
                                }
                        }
                }
        }

        /* ctr_mutex */
        for (i = 0; i < NPC_MUTEX; i++) {
                ctr_mutex[i] = (kmutex_t *)alloc_base;
                alloc_base += (max_mem_nodes * sizeof (kmutex_t));
        }

        /* initialize page list counts */
        PLCNT_INIT(alloc_base);

        for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {

                pgcnt_t r_pgcnt;
                pfn_t   r_base;
                pgcnt_t r_align;
                int     r_shift;
                /* note: shadows the function level nranges */
                int     nranges = mnode_nranges[mnode];

                if (mem_node_config[mnode].exists == 0)
                        continue;

                HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);

                for (r = 1; r < mmu_page_sizes; r++) {
                        /*
                         * the page_counters base has to be aligned to the
                         * page count of page size code r otherwise the counts
                         * will cross large page boundaries.
                         */
                        r_align = page_get_pagecnt(r);
                        r_base = physbase;
                        /* base needs to be aligned - lower to aligned value */
                        r_base &= ~(r_align - 1);
                        r_pgcnt = howmany(physmax - r_base + 1, r_align);
                        r_shift = PAGE_BSZS_SHIFT(r);

                        PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
                        PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
                        PAGE_COUNTERS_BASE(mnode, r) = r_base;
                        /* hpm_color_current: one index per color, per range */
                        for (mrange = 0; mrange < nranges; mrange++) {
                                PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
                                    r, mrange) = (size_t *)alloc_base;
                                alloc_base += sizeof (size_t) *
                                    colors_per_szc[r];
                        }
                        /*
                         * Seed every color's current index with the index of
                         * the pfn that PAGE_NEXT_PFN_FOR_COLOR() yields when
                         * started at r_base, or with 0 if the iterator finds
                         * no pfn or the index is outside the counter array.
                         */
                        for (i = 0; i < colors_per_szc[r]; i++) {
                                uint_t color_mask = colors_per_szc[r] - 1;
                                pfn_t  pfnum = r_base;
                                size_t idx;
                                /* note: shadows the function level mrange */
                                int mrange;
                                MEM_NODE_ITERATOR_DECL(it);

                                MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
                                if (pfnum == (pfn_t)-1) {
                                        idx = 0;
                                } else {
                                        PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
                                            color_mask, color_mask, &it);
                                        idx = PNUM_TO_IDX(mnode, r, pfnum);
                                        idx = (idx >= r_pgcnt) ? 0 : idx;
                                }
                                for (mrange = 0; mrange < nranges; mrange++) {
                                        PAGE_COUNTERS_CURRENT_COLOR(mnode,
                                            r, i, mrange) = idx;
                                }
                        }

                        /* hpm_counters may be shared by all mnodes */
                        if (firstmn == mnode) {
                                PAGE_COUNTERS_COUNTERS(mnode, r) =
                                    (hpmctr_t *)alloc_base;
                                alloc_base +=
                                    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
                                    sizeof (hpmctr_t *));
                        } else {
                                PAGE_COUNTERS_COUNTERS(mnode, r) =
                                    PAGE_COUNTERS_COUNTERS(firstmn, r);
                        }

                        /*
                         * Verify that PNUM_TO_IDX and IDX_TO_PNUM
                         * satisfy the identity requirement.
                         * We should be able to go from one to the other
                         * and get consistent values.
                         */
                        ASSERT(PNUM_TO_IDX(mnode, r,
                            (IDX_TO_PNUM(mnode, r, 0))) == 0);
                        ASSERT(IDX_TO_PNUM(mnode, r,
                            (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
                }
                /*
                 * Roundup the start address of the page_counters to
                 * cache aligned boundary for every memory node.
                 * page_ctrs_sz() has added some slop for these roundups.
                 */
                alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
                    L2CACHE_ALIGN);
        }

        /*
         * Initialize other page counter specific data structures.
         * This walks MAX_MEM_NODES rather than max_mem_nodes, so every
         * element of page_ctrs_rwlock[] is initialized.
         */
        for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
                rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
        }

        return (alloc_base);
}
 799
 800 /*
 801  * Functions to adjust region counters for each size free list.
 802  * Caller is responsible to acquire the ctr_mutex lock if necessary and
 803  * thus can be called during startup without locks.
 804  */
 805 /* ARGSUSED */
 806 void
 807 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
 808 {
 809         ssize_t         r;      /* region size */
 810         ssize_t         idx;
 811         pfn_t           pfnum;
 812         int             lckidx;
 813
 814         ASSERT(mnode == PP_2_MEM_NODE(pp));
 815         ASSERT(mtype == PP_2_MTYPE(pp));
 816
 817         ASSERT(pp->p_szc < mmu_page_sizes);
 818
 819         PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
 820
 821         /* no counter update needed for largest page size */
 822         if (pp->p_szc >= mmu_page_sizes - 1) {
 823                 return;
 824         }
 825
 826         r = pp->p_szc + 1;
 827         pfnum = pp->p_pagenum;
 828         lckidx = PP_CTR_LOCK_INDX(pp);
 829
 830         /*
 831          * Increment the count of free pages for the current
 832          * region. Continue looping up in region size incrementing
 833          * count if the preceeding region is full.
 834          */
 835         while (r < mmu_page_sizes) {
 836                 idx = PNUM_TO_IDX(mnode, r, pfnum);
 837
 838                 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
 839                 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
 840
 841                 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
 842                         break;
 843                 } else {
 844                         int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
 845                         pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
 846                             [MTYPE_2_MRANGE(mnode, root_mtype)];
 847
 848                         cand->pcc_pages_free++;
 849                         cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
 850                 }
 851                 r++;
 852         }
 853 }
 854
 855 void
 856 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
 857 {
 858         int             lckidx = PP_CTR_LOCK_INDX(pp);
 859         kmutex_t        *lock = &ctr_mutex[lckidx][mnode];
 860
 861         mutex_enter(lock);
 862         page_ctr_add_internal(mnode, mtype, pp, flags);
 863         mutex_exit(lock);
 864 }
 865
 866 void
 867 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
 868 {
 869         int             lckidx;
 870         ssize_t         r;      /* region size */
 871         ssize_t         idx;
 872         pfn_t           pfnum;
 873
 874         ASSERT(mnode == PP_2_MEM_NODE(pp));
 875         ASSERT(mtype == PP_2_MTYPE(pp));
 876
 877         ASSERT(pp->p_szc < mmu_page_sizes);
 878
 879         PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
 880
 881         /* no counter update needed for largest page size */
 882         if (pp->p_szc >= mmu_page_sizes - 1) {
 883                 return;
 884         }
 885
 886         r = pp->p_szc + 1;
 887         pfnum = pp->p_pagenum;
 888         lckidx = PP_CTR_LOCK_INDX(pp);
 889
 890         /*
 891          * Decrement the count of free pages for the current
 892          * region. Continue looping up in region size decrementing
 893          * count if the preceeding region was full.
 894          */
 895         while (r < mmu_page_sizes) {
 896                 idx = PNUM_TO_IDX(mnode, r, pfnum);
 897
 898                 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
 899                 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
 900
 901                 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
 902                         break;
 903                 } else {
 904                         int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
 905                         pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
 906                             [MTYPE_2_MRANGE(mnode, root_mtype)];
 907
 908                         ASSERT(cand->pcc_pages_free != 0);
 909                         ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
 910
 911                         cand->pcc_pages_free--;
 912                         cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
 913                 }
 914                 r++;
 915         }
 916 }
 917
 918 void
 919 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
 920 {
 921         int             lckidx = PP_CTR_LOCK_INDX(pp);
 922         kmutex_t        *lock = &ctr_mutex[lckidx][mnode];
 923
 924         mutex_enter(lock);
 925         page_ctr_sub_internal(mnode, mtype, pp, flags);
 926         mutex_exit(lock);
 927 }
 928
/*
 * Adjust page counters following a memory attach, since typically the
 * size of the array needs to change, and the PFN to counter index
 * mapping needs to change.
 *
 * It is possible this mnode did not exist at startup. In that case
 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
 * to change (a theoretical possibility on x86), which means pcc_color_free
 * arrays must be extended.
 *
 * The work is done in three phases:
 *   1. every replacement array is preallocated with KM_NOSLEEP;
 *   2. under the page counters write lock the new arrays are installed
 *      and the displaced ones are remembered in the same cache variables;
 *   3. after the lock is dropped, whatever the caches still hold is freed.
 *
 * Returns 0 on success.  Returns ENOMEM if any preallocation fails; all
 * such failures happen before the write lock is taken, so the existing
 * page counters have not been modified in that case.
 */
uint_t
page_ctrs_adjust(int mnode)
{
        pgcnt_t npgs;
        int     r;              /* region size */
        int     i;
        size_t  pcsz, old_csz;
        hpmctr_t *new_ctr, *old_ctr;
        pfn_t   oldbase, newbase;
        pfn_t   physbase, physmax;
        size_t  old_npgs;
        /*
         * The *_cache variables do double duty: before the lock they hold
         * the preallocated replacements, afterwards they hold the displaced
         * arrays that cleanup must free.
         */
        hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
        size_t  size_cache[MMU_PAGE_SIZES];
        size_t  *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
        size_t  *old_color_array[MAX_MNODE_MRANGES];
        pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
        pcc_info_t **cands_cache;
        pcc_info_t *old_pi, *pi;
        pgcnt_t *pgcntp;
        int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
        int cands_cache_nranges;
        int old_maxmrange, new_maxmrange;
        int rc = 0;
        int oldmnode;

        /*
         * cands_cache is a flat NPC_MUTEX x MMU_PAGE_SIZES table of
         * pcc_info_t array pointers, indexed as [i * MMU_PAGE_SIZES + r].
         */
        cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
            MMU_PAGE_SIZES, KM_NOSLEEP);
        if (cands_cache == NULL)
                return (ENOMEM);

        /*
         * NOTE(review): HPM_COUNTERS_LIMITS is defined elsewhere; it
         * presumably fills in physbase/physmax for this mnode - confirm.
         */
        i = -1;
        HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);

        /*
         * newbase is physbase with the PC_BASE_ALIGN_MASK bits cleared;
         * npgs spans from there up to physmax rounded up to PC_BASE_ALIGN.
         */
        newbase = physbase & ~PC_BASE_ALIGN_MASK;
        npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;

        /* prepare to free non-null pointers on the way out */
        /* until the swap below, cached pcc_info_t arrays have nranges entries */
        cands_cache_nranges = nranges;
        bzero(ctr_cache, sizeof (ctr_cache));
        bzero(color_cache, sizeof (color_cache));

        /*
         * We need to determine how many page colors there are for each
         * page size in order to allocate memory for any color specific
         * arrays.
         */
        for (r = 0; r < mmu_page_sizes; r++) {
                colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
        }

        /*
         * Preallocate all of the new hpm_counters arrays as we can't
         * hold the page_ctrs_rwlock as a writer and allocate memory.
         * If we can't allocate all of the arrays, undo our work so far
         * and return failure.
         */
        for (r = 1; r < mmu_page_sizes; r++) {
                /* one counter per region of size r within npgs pages */
                pcsz = npgs >> PAGE_BSZS_SHIFT(r);
                /* remembered so cleanup frees with the matching size */
                size_cache[r] = pcsz;
                ctr_cache[r] = kmem_zalloc(pcsz *
                    sizeof (hpmctr_t), KM_NOSLEEP);
                if (ctr_cache[r] == NULL) {
                        rc = ENOMEM;
                        goto cleanup;
                }
        }

        /*
         * Preallocate all of the new color current arrays as we can't
         * hold the page_ctrs_rwlock as a writer and allocate memory.
         * If we can't allocate all of the arrays, undo our work so far
         * and return failure.
         */
        for (r = 1; r < mmu_page_sizes; r++) {
                for (mrange = 0; mrange < nranges; mrange++) {
                        color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
                            colors_per_szc[r], KM_NOSLEEP);
                        if (color_cache[r][mrange] == NULL) {
                                rc = ENOMEM;
                                goto cleanup;
                        }
                }
        }

        /*
         * Preallocate all of the new pcc_info_t arrays as we can't
         * hold the page_ctrs_rwlock as a writer and allocate memory.
         * If we can't allocate all of the arrays, undo our work so far
         * and return failure.
         */
        for (r = 1; r < mmu_page_sizes; r++) {
                for (i = 0; i < NPC_MUTEX; i++) {
                        pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
                            KM_NOSLEEP);
                        if (pi == NULL) {
                                rc = ENOMEM;
                                goto cleanup;
                        }
                        /*
                         * Publish the array in the cache before filling it
                         * so that a later failure still frees it in cleanup.
                         */
                        cands_cache[i * MMU_PAGE_SIZES + r] = pi;

                        for (mrange = 0; mrange < nranges; mrange++, pi++) {
                                pgcntp = kmem_zalloc(colors_per_szc[r] *
                                    sizeof (pgcnt_t), KM_NOSLEEP);
                                if (pgcntp == NULL) {
                                        rc = ENOMEM;
                                        goto cleanup;
                                }
                                pi->pcc_color_free = pgcntp;
                        }
                }
        }

        /*
         * Grab the write lock to prevent others from walking these arrays
         * while we are modifying them.
         */
        PAGE_CTRS_WRITE_LOCK(mnode);

        /*
         * For interleaved mnodes, find the first mnode
         * with valid page counters since the current
         * mnode may have just been added and not have
         * valid page counters.
         */
        if (interleaved_mnodes) {
                for (i = 0; i < max_mem_nodes; i++)
                        if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
                                break;
                ASSERT(i < max_mem_nodes);
                oldmnode = i;
        } else
                oldmnode = mnode;

        old_nranges = mnode_nranges[mnode];
        /*
         * From here on the pcc_info_t arrays left in cands_cache are the
         * displaced ones, so cleanup must walk them with old_nranges entries.
         */
        cands_cache_nranges = old_nranges;
        mnode_nranges[mnode] = nranges;
        old_maxmrange = mnode_maxmrange[mnode];
        mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
        new_maxmrange = mnode_maxmrange[mnode];

        for (r = 1; r < mmu_page_sizes; r++) {
                /* capture the old geometry before anything is overwritten */
                PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
                old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
                old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
                oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
                old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
                for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
                        old_color_array[mrange] =
                            PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
                            r, mrange);
                }

                pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
                /* take ownership of the preallocated array; cache slot is reused */
                new_ctr = ctr_cache[r];
                ctr_cache[r] = NULL;
                /* only copy when the old and new pfn ranges actually overlap */
                if (old_ctr != NULL &&
                    (oldbase + old_npgs > newbase) &&
                    (newbase + npgs > oldbase)) {
                        /*
                         * Map the intersection of the old and new
                         * counters into the new array.
                         */
                        size_t offset;
                        if (newbase > oldbase) {
                                /* new range starts inside the old one */
                                offset = (newbase - oldbase) >>
                                    PAGE_COUNTERS_SHIFT(mnode, r);
                                bcopy(old_ctr + offset, new_ctr,
                                    MIN(pcsz, (old_csz - offset)) *
                                    sizeof (hpmctr_t));
                        } else {
                                /* old range starts at or after the new base */
                                offset = (oldbase - newbase) >>
                                    PAGE_COUNTERS_SHIFT(mnode, r);
                                bcopy(old_ctr, new_ctr + offset,
                                    MIN(pcsz - offset, old_csz) *
                                    sizeof (hpmctr_t));
                        }
                }

                PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
                PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
                PAGE_COUNTERS_BASE(mnode, r) = newbase;

                /* update shared hpm_counters in other mnodes */
                if (interleaved_mnodes) {
                        for (i = 0; i < max_mem_nodes; i++) {
                                if ((i == mnode) ||
                                    (mem_node_config[i].exists == 0))
                                        continue;
                                ASSERT(
                                    PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
                                    PAGE_COUNTERS_COUNTERS(i, r) == NULL);
                                PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
                                PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
                                PAGE_COUNTERS_BASE(i, r) = newbase;
                        }
                }

                /*
                 * Install the new current-color arrays.  Slots at or beyond
                 * nranges were never allocated and so install NULL.
                 */
                for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
                        PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
                            color_cache[r][mrange];
                        color_cache[r][mrange] = NULL;
                }
                /*
                 * for now, just reset on these events as it's probably
                 * not worthwhile to try and optimize this.
                 */
                for (i = 0; i < colors_per_szc[r]; i++) {
                        uint_t color_mask = colors_per_szc[r] - 1;
                        int mlo = interleaved_mnodes ? 0 : mnode;
                        int mhi = interleaved_mnodes ? max_mem_nodes :
                            (mnode + 1);
                        int m;
                        pfn_t  pfnum;
                        size_t idx;
                        MEM_NODE_ITERATOR_DECL(it);

                        for (m = mlo; m < mhi; m++) {
                                if (mem_node_config[m].exists == 0)
                                        continue;
                                pfnum = newbase;
                                MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
                                if (pfnum == (pfn_t)-1) {
                                        idx = 0;
                                } else {
                                        PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
                                            color_mask, color_mask, &it);
                                        idx = PNUM_TO_IDX(m, r, pfnum);
                                        /* fall back to 0 if out of range */
                                        idx = (idx < pcsz) ? idx : 0;
                                }
                                for (mrange = 0; mrange < nranges; mrange++) {
                                        if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
                                            r, mrange) != NULL)
                                                PAGE_COUNTERS_CURRENT_COLOR(m,
                                                    r, i, mrange) = idx;
                                }
                        }
                }

                /* cache info for freeing out of the critical path */
                /*
                 * Only pointers inside [kernelheap, ekernelheap) are queued
                 * for kmem_free.  NOTE(review): presumably this excludes the
                 * startup arrays carved out of alloc_base - confirm.
                 */
                if ((caddr_t)old_ctr >= kernelheap &&
                    (caddr_t)old_ctr < ekernelheap) {
                        ctr_cache[r] = old_ctr;
                        size_cache[r] = old_csz;
                }
                for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
                        size_t *tmp = old_color_array[mrange];
                        if ((caddr_t)tmp >= kernelheap &&
                            (caddr_t)tmp < ekernelheap) {
                                color_cache[r][mrange] = tmp;
                        }
                }
                /*
                 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
                 * satisfy the identity requirement.
                 * We should be able to go from one to the other
                 * and get consistent values.
                 */
                ASSERT(PNUM_TO_IDX(mnode, r,
                    (IDX_TO_PNUM(mnode, r, 0))) == 0);
                ASSERT(IDX_TO_PNUM(mnode, r,
                    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);

                /* pcc_info_t and pcc_color_free */
                for (i = 0; i < NPC_MUTEX; i++) {
                        pcc_info_t *epi;
                        pcc_info_t *eold_pi;

                        /* install the new array; park the old one for cleanup */
                        pi = cands_cache[i * MMU_PAGE_SIZES + r];
                        old_pi = page_ctrs_cands[i][r][mnode];
                        page_ctrs_cands[i][r][mnode] = pi;
                        cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;

                        /* preserve old pcc_color_free values, if any */
                        if (old_pi == NULL)
                                continue;

                        /*
                         * when/if x86 does DR, must account for
                         * possible change in range index when
                         * preserving pcc_info
                         */
                        epi = &pi[nranges];
                        eold_pi = &old_pi[old_nranges];
                        if (new_maxmrange > old_maxmrange) {
                                pi += new_maxmrange - old_maxmrange;
                        } else if (new_maxmrange < old_maxmrange) {
                                old_pi += old_maxmrange - new_maxmrange;
                        }
                        /*
                         * Exchange whole entries: the installed array ends up
                         * with the old values (and their pcc_color_free
                         * arrays), while the fresh zeroed entries land in the
                         * displaced array and are freed in cleanup.
                         */
                        for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
                                pcc_info_t tmp = *pi;
                                *pi = *old_pi;
                                *old_pi = tmp;
                        }
                }
        }
        PAGE_CTRS_WRITE_UNLOCK(mnode);

        /*
         * Now that we have dropped the write lock, it is safe to free all
         * of the memory we have cached above.
         * We come thru here to free memory when pre-alloc fails, and also to
         * free old pointers which were recorded while locked.
         */
cleanup:
        for (r = 1; r < mmu_page_sizes; r++) {
                if (ctr_cache[r] != NULL) {
                        kmem_free(ctr_cache[r],
                            size_cache[r] * sizeof (hpmctr_t));
                }
                for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
                        if (color_cache[r][mrange] != NULL) {
                                kmem_free(color_cache[r][mrange],
                                    colors_per_szc[r] * sizeof (size_t));
                        }
                }
                for (i = 0; i < NPC_MUTEX; i++) {
                        pi = cands_cache[i * MMU_PAGE_SIZES + r];
                        if (pi == NULL)
                                continue;
                        /* entry count matches whichever arrays are cached */
                        nr = cands_cache_nranges;
                        /* free each entry's color array, then the array itself */
                        for (mrange = 0; mrange < nr; mrange++, pi++) {
                                pgcntp = pi->pcc_color_free;
                                if (pgcntp == NULL)
                                        continue;
                                if ((caddr_t)pgcntp >= kernelheap &&
                                    (caddr_t)pgcntp < ekernelheap) {
                                        kmem_free(pgcntp,
                                            colors_per_szc[r] *
                                            sizeof (pgcnt_t));
                                }
                        }
                        /* pi was advanced by the loop above; reload the base */
                        pi = cands_cache[i * MMU_PAGE_SIZES + r];
                        if ((caddr_t)pi >= kernelheap &&
                            (caddr_t)pi < ekernelheap) {
                                kmem_free(pi, nr * sizeof (pcc_info_t));
                        }
                }
        }

        kmem_free(cands_cache,
            sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
        return (rc);
}
1282
1283 /*
1284  * Cleanup the hpm_counters field in the page counters
1285  * array.
1286  */
1287 void
1288 page_ctrs_cleanup(void)
1289 {
1290         int r;  /* region size */
1291         int i;  /* mnode index */
1292
1293         /*
1294          * Get the page counters write lock while we are
1295          * setting the page hpm_counters field to NULL
1296          * for non-existent mnodes.
1297          */
1298         for (i = 0; i < max_mem_nodes; i++) {
1299                 PAGE_CTRS_WRITE_LOCK(i);
1300                 if (mem_node_config[i].exists) {
1301                         PAGE_CTRS_WRITE_UNLOCK(i);
1302                         continue;
1303                 }
1304                 for (r = 1; r < mmu_page_sizes; r++) {
1305                         PAGE_COUNTERS_COUNTERS(i, r) = NULL;
1306                 }
1307                 PAGE_CTRS_WRITE_UNLOCK(i);
1308         }
1309 }
1310
#ifdef DEBUG

/*
 * Debug-only consistency check that pp heads a well-formed large page.
 *
 * The constituent page count is derived from pp->p_szc.  A single page
 * must have p_szc 0 and be self-linked.  Otherwise the constituent pages
 * must be contiguous both by page number and in the page_t array, linked
 * in order through p_next, and each must have size code szc, be free and
 * aged, belong to no object or vnode, and agree with the first page on
 * PP_ISNORELOC.
 */
void
chk_lpg(page_t *pp, uchar_t szc)
{
        spgcnt_t nleft = page_get_pagecnt(pp->p_szc);
        uint_t noreloc;

        if (nleft == 1) {
                ASSERT(pp->p_szc == 0);
                ASSERT(pp->p_next == pp);
                ASSERT(pp->p_prev == pp);
                return;
        }

        ASSERT(pp->p_list.largepg.next == pp || pp->p_list.largepg.next == NULL);
        ASSERT(pp->p_list.largepg.prev == pp || pp->p_list.largepg.prev == NULL);

        /* the root page is size-aligned and p_prev is the last constituent */
        ASSERT(IS_P2ALIGNED(pp->p_pagenum, nleft));
        ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
        ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (nleft - 1)));
        ASSERT(pp->p_prev == (pp + (nleft - 1)));

        /*
         * Walk every constituent page; nleft counts the pages not yet
         * checked, including the current one.
         */
        noreloc = PP_ISNORELOC(pp);
        for (; nleft != 0; nleft--, pp = pp->p_next) {
                /* every page but the last must be followed by its neighbor */
                if (nleft != 1) {
                        ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
                        ASSERT(pp->p_next == (pp + 1));
                }
                ASSERT(pp->p_szc == szc);
                ASSERT(PP_ISFREE(pp));
                ASSERT(PP_ISAGED(pp));
                ASSERT(pp->p_list.largepg.next == pp || pp->p_list.largepg.next == NULL);
                ASSERT(pp->p_list.largepg.prev == pp || pp->p_list.largepg.prev == NULL);
                VERIFY(pp->p_object == NULL);
                ASSERT(pp->p_vnode  == NULL);
                ASSERT(PP_ISNORELOC(pp) == noreloc);
        }
}
#endif /* DEBUG */
1359
1360 void
1361 page_freelist_lock(int mnode)
1362 {
1363         int i;
1364         for (i = 0; i < NPC_MUTEX; i++) {
1365                 mutex_enter(FPC_MUTEX(mnode, i));
1366                 mutex_enter(CPC_MUTEX(mnode, i));
1367         }
1368 }
1369
1370 void
1371 page_freelist_unlock(int mnode)
1372 {
1373         int i;
1374         for (i = 0; i < NPC_MUTEX; i++) {
1375                 mutex_exit(FPC_MUTEX(mnode, i));
1376                 mutex_exit(CPC_MUTEX(mnode, i));
1377         }
1378 }
1379
1380 /*
1381  * add pp to the specified page list. Defaults to head of the page list
1382  * unless PG_LIST_TAIL is specified.
1383  */
1384 void
1385 page_list_add(page_t *pp, int flags)
1386 {
1387         page_t          **ppp;
1388         kmutex_t        *pcm;
1389         uint_t          bin, mtype;
1390         int             mnode;
1391
1392         ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1393         ASSERT(PP_ISFREE(pp));
1394         ASSERT(!hat_page_is_mapped(pp));
1395         ASSERT(hat_page_getshare(pp) == 0);
1396
1397         /*
1398          * Large pages should be freed via page_list_add_pages().
1399          */
1400         ASSERT(pp->p_szc == 0);
1401
1402         /*
1403          * Don't need to lock the freelist first here
1404          * because the page isn't on the freelist yet.
1405          * This means p_szc can't change on us.
1406          */
1407
1408         bin = PP_2_BIN(pp);
1409         mnode = PP_2_MEM_NODE(pp);
1410         mtype = PP_2_MTYPE(pp);
1411
1412         if (flags & PG_LIST_ISINIT) {
1413                 /*
1414                  * PG_LIST_ISINIT is set during system startup (ie. single
1415                  * threaded), add a page to the free list and add to the
1416                  * the free region counters w/o any locking
1417                  */
1418                 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1419
1420                 /* inline version of page_add() */
1421                 if (*ppp != NULL) {
1422                         pp->p_next = *ppp;
1423                         pp->p_prev = (*ppp)->p_prev;
1424                         (*ppp)->p_prev = pp;
1425                         pp->p_prev->p_next = pp;
1426                 } else
1427                         *ppp = pp;
1428
1429                 page_ctr_add_internal(mnode, mtype, pp, flags);
1430                 VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1431         } else {
1432                 pcm = PC_BIN_MUTEX(mnode, bin, flags);
1433
1434                 if (flags & PG_FREE_LIST) {
1435                         VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1436                         ASSERT(PP_ISAGED(pp));
1437                         ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1438
1439                 } else {
1440                         VM_STAT_ADD(vmm_vmstats.pladd_cache);
1441                         VERIFY(pp->p_object);
1442                         ASSERT(pp->p_vnode);
1443                         ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1444                         ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1445                 }
1446                 mutex_enter(pcm);
1447                 page_add(ppp, pp);
1448
1449                 if (flags & PG_LIST_TAIL)
1450                         *ppp = (*ppp)->p_next;
1451                 /*
1452                  * Add counters before releasing pcm mutex to avoid a race with
1453                  * page_freelist_coalesce and page_freelist_split.
1454                  */
1455                 page_ctr_add(mnode, mtype, pp, flags);
1456                 mutex_exit(pcm);
1457         }
1458
1459
1460 #if defined(__sparc)
1461         if (PP_ISNORELOC(pp)) {
1462                 kcage_freemem_add(1);
1463         }
1464 #endif
1465         /*
1466          * It is up to the caller to unlock the page!
1467          */
1468         ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1469 }
1470
1471
1472 #ifdef __sparc
1473 /*
1474  * This routine is only used by kcage_init during system startup.
1475  * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1476  * without the overhead of taking locks and updating counters.
1477  */
1478 void
1479 page_list_noreloc_startup(page_t *pp)
1480 {
1481         page_t          **ppp;
1482         uint_t          bin;
1483         int             mnode;
1484         int             mtype;
1485         int             flags = 0;
1486
1487         /*
1488          * If this is a large page on the freelist then
1489          * break it up into smaller pages.
1490          */
1491         if (pp->p_szc != 0)
1492                 page_boot_demote(pp);
1493
1494         /*
1495          * Get list page is currently on.
1496          */
1497         bin = PP_2_BIN(pp);
1498         mnode = PP_2_MEM_NODE(pp);
1499         mtype = PP_2_MTYPE(pp);
1500         ASSERT(mtype == MTYPE_RELOC);
1501         ASSERT(pp->p_szc == 0);
1502
1503         if (PP_ISAGED(pp)) {
1504                 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1505                 flags |= PG_FREE_LIST;
1506         } else {
1507                 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1508                 flags |= PG_CACHE_LIST;
1509         }
1510
1511         ASSERT(*ppp != NULL);
1512
1513         /*
1514          * Delete page from current list.
1515          */
1516         if (*ppp == pp)
1517                 *ppp = pp->p_next;              /* go to next page */
1518         if (*ppp == pp) {
1519                 *ppp = NULL;                    /* page list is gone */
1520         } else {
1521                 pp->p_prev->p_next = pp->p_next;
1522                 pp->p_next->p_prev = pp->p_prev;
1523         }
1524
1525         /*
1526          * Decrement page counters
1527          */
1528         page_ctr_sub_internal(mnode, mtype, pp, flags);
1529
1530         /*
1531          * Set no reloc for cage initted pages.
1532          */
1533         PP_SETNORELOC(pp);
1534
1535         mtype = PP_2_MTYPE(pp);
1536         ASSERT(mtype == MTYPE_NORELOC);
1537
1538         /*
1539          * Get new list for page.
1540          */
1541         if (PP_ISAGED(pp)) {
1542                 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1543         } else {
1544                 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1545         }
1546
1547         /*
1548          * Insert page on new list.
1549          */
1550         if (*ppp == NULL) {
1551                 *ppp = pp;
1552                 pp->p_next = pp->p_prev = pp;
1553         } else {
1554                 pp->p_next = *ppp;
1555                 pp->p_prev = (*ppp)->p_prev;
1556                 (*ppp)->p_prev = pp;
1557                 pp->p_prev->p_next = pp;
1558         }
1559
1560         /*
1561          * Increment page counters
1562          */
1563         page_ctr_add_internal(mnode, mtype, pp, flags);
1564
1565         /*
1566          * Update cage freemem counter
1567          */
1568         atomic_inc_ulong(&kcage_freemem);
1569 }
1570 #else   /* __sparc */
1571
1572 /* ARGSUSED */
1573 void
1574 page_list_noreloc_startup(page_t *pp)
1575 {
1576         panic("page_list_noreloc_startup: should be here only for sparc");
1577 }
1578 #endif
1579
1580 void
1581 page_list_add_pages(page_t *pp, int flags)
1582 {
1583         kmutex_t *pcm;
1584         pgcnt_t pgcnt;
1585         uint_t  bin, mtype, i;
1586         int     mnode;
1587
1588         /* default to freelist/head */
1589         ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1590
1591         CHK_LPG(pp, pp->p_szc);
1592         VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1593
1594         bin = PP_2_BIN(pp);
1595         mnode = PP_2_MEM_NODE(pp);
1596         mtype = PP_2_MTYPE(pp);
1597
1598         if (flags & PG_LIST_ISINIT) {
1599                 ASSERT(pp->p_szc == mmu_page_sizes - 1);
1600                 page_lpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1601                 ASSERT(!PP_ISNORELOC(pp));
1602                 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1603         } else {
1604
1605                 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1606
1607                 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1608
1609                 mutex_enter(pcm);
1610                 page_lpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1611                 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1612                 mutex_exit(pcm);
1613
1614                 pgcnt = page_get_pagecnt(pp->p_szc);
1615 #if defined(__sparc)
1616                 if (PP_ISNORELOC(pp))
1617                         kcage_freemem_add(pgcnt);
1618 #endif
1619                 for (i = 0; i < pgcnt; i++, pp++)
1620                         page_unlock_nocapture(pp);
1621         }
1622 }
1623
1624 /*
1625  * During boot, need to demote a large page to base
1626  * pagesize pages for seg_kmem for use in boot_alloc()
1627  */
1628 void
1629 page_boot_demote(page_t *pp)
1630 {
1631         ASSERT(pp->p_szc != 0);
1632         ASSERT(PP_ISFREE(pp));
1633         ASSERT(PP_ISAGED(pp));
1634
1635         (void) page_demote(PP_2_MEM_NODE(pp),
1636             PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
1637             PC_FREE);
1638
1639         ASSERT(PP_ISFREE(pp));
1640         ASSERT(PP_ISAGED(pp));
1641         ASSERT(pp->p_szc == 0);
1642 }
1643
1644 /*
1645  * Take a particular page off of whatever freelist the page
1646  * is claimed to be on.
1647  *
1648  * NOTE: Only used for PAGESIZE pages.
1649  */
1650 void
1651 page_list_sub(page_t *pp, int flags)
1652 {
1653         int             bin;
1654         uint_t          mtype;
1655         int             mnode;
1656         kmutex_t        *pcm;
1657         page_t          **ppp;
1658
1659         ASSERT(PAGE_EXCL(pp));
1660         ASSERT(PP_ISFREE(pp));
1661
1662         /*
1663          * The p_szc field can only be changed by page_promote()
1664          * and page_demote(). Only free pages can be promoted and
1665          * demoted and the free list MUST be locked during these
1666          * operations. So to prevent a race in page_list_sub()
1667          * between computing which bin of the freelist lock to
1668          * grab and actually grabing the lock we check again that
1669          * the bin we locked is still the correct one. Notice that
1670          * the p_szc field could have actually changed on us but
1671          * if the bin happens to still be the same we are safe.
1672          */
1673 try_again:
1674         bin = PP_2_BIN(pp);
1675         mnode = PP_2_MEM_NODE(pp);
1676         pcm = PC_BIN_MUTEX(mnode, bin, flags);
1677         mutex_enter(pcm);
1678         if (PP_2_BIN(pp) != bin) {
1679                 mutex_exit(pcm);
1680                 goto try_again;
1681         }
1682         mtype = PP_2_MTYPE(pp);
1683
1684         if (flags & PG_FREE_LIST) {
1685                 VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1686                 ASSERT(PP_ISAGED(pp));
1687                 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1688         } else {
1689                 VM_STAT_ADD(vmm_vmstats.plsub_cache);
1690                 ASSERT(!PP_ISAGED(pp));
1691                 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1692         }
1693
1694         /*
1695          * Common PAGESIZE case.
1696          *
1697          * Note that we locked the freelist. This prevents
1698          * any page promotion/demotion operations. Therefore
1699          * the p_szc will not change until we drop pcm mutex.
1700          */
1701         if (pp->p_szc == 0) {
1702                 page_sub(ppp, pp);
1703                 /*
1704                  * Subtract counters before releasing pcm mutex
1705                  * to avoid race with page_freelist_coalesce.
1706                  */
1707                 page_ctr_sub(mnode, mtype, pp, flags);
1708                 mutex_exit(pcm);
1709
1710 #if defined(__sparc)
1711                 if (PP_ISNORELOC(pp)) {
1712                         kcage_freemem_sub(1);
1713                 }
1714 #endif
1715                 return;
1716         }
1717
1718         /*
1719          * Large pages on the cache list are not supported.
1720          */
1721         if (flags & PG_CACHE_LIST)
1722                 panic("page_list_sub: large page on cachelist");
1723
1724         /*
1725          * Slow but rare.
1726          *
1727          * Somebody wants this particular page which is part
1728          * of a large page. In this case we just demote the page
1729          * if it's on the freelist.
1730          *
1731          * We have to drop pcm before locking the entire freelist.
1732          * Once we have re-locked the freelist check to make sure
1733          * the page hasn't already been demoted or completely
1734          * freed.
1735          */
1736         mutex_exit(pcm);
1737         page_freelist_lock(mnode);
1738         if (pp->p_szc != 0) {
1739                 /*
1740                  * Large page is on freelist.
1741                  */
1742                 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1743                     0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1744         }
1745         ASSERT(PP_ISFREE(pp));
1746         ASSERT(PP_ISAGED(pp));
1747         ASSERT(pp->p_szc == 0);
1748
1749         /*
1750          * Subtract counters before releasing pcm mutex
1751          * to avoid race with page_freelist_coalesce.
1752          */
1753         bin = PP_2_BIN(pp);
1754         mtype = PP_2_MTYPE(pp);
1755         ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1756
1757         page_sub(ppp, pp);
1758         page_ctr_sub(mnode, mtype, pp, flags);
1759         page_freelist_unlock(mnode);
1760
1761 #if defined(__sparc)
1762         if (PP_ISNORELOC(pp)) {
1763                 kcage_freemem_sub(1);
1764         }
1765 #endif
1766 }
1767
1768 void
1769 page_list_sub_pages(page_t *pp, uint_t szc)
1770 {
1771         kmutex_t *pcm;
1772         uint_t  bin, mtype;
1773         int     mnode;
1774
1775         ASSERT(PAGE_EXCL(pp));
1776         ASSERT(PP_ISFREE(pp));
1777         ASSERT(PP_ISAGED(pp));
1778
1779         /*
1780          * See comment in page_list_sub().
1781          */
1782 try_again:
1783         bin = PP_2_BIN(pp);
1784         mnode = PP_2_MEM_NODE(pp);
1785         pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1786         mutex_enter(pcm);
1787         if (PP_2_BIN(pp) != bin) {
1788                 mutex_exit(pcm);
1789                 goto    try_again;
1790         }
1791
1792         /*
1793          * If we're called with a page larger than szc or it got
1794          * promoted above szc before we locked the freelist then
1795          * drop pcm and re-lock entire freelist. If page still larger
1796          * than szc then demote it.
1797          */
1798         if (pp->p_szc > szc) {
1799                 mutex_exit(pcm);
1800                 pcm = NULL;
1801                 page_freelist_lock(mnode);
1802                 if (pp->p_szc > szc) {
1803                         VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1804                         (void) page_demote(mnode,
1805                             PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
1806                             pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1807                 }
1808                 bin = PP_2_BIN(pp);
1809         }
1810         ASSERT(PP_ISFREE(pp));
1811         ASSERT(PP_ISAGED(pp));
1812         ASSERT(pp->p_szc <= szc);
1813         ASSERT(pp == PP_PAGEROOT(pp));
1814
1815         VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1816
1817         mtype = PP_2_MTYPE(pp);
1818         if (pp->p_szc != 0) {
1819                 page_lpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1820                 CHK_LPG(pp, pp->p_szc);
1821         } else {
1822                 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1823                 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1824         }
1825         page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1826
1827         if (pcm != NULL) {
1828                 mutex_exit(pcm);
1829         } else {
1830                 page_freelist_unlock(mnode);
1831         }
1832
1833 #if defined(__sparc)
1834         if (PP_ISNORELOC(pp)) {
1835                 pgcnt_t pgcnt;
1836
1837                 pgcnt = page_get_pagecnt(pp->p_szc);
1838                 kcage_freemem_sub(pgcnt);
1839         }
1840 #endif
1841 }
1842
1843 /*
1844  * Add the page to the front of a linked list of pages
1845  * using the p_next & p_prev pointers for the list.
1846  * The caller is responsible for protecting the list pointers.
1847  */
1848 void
1849 mach_page_add(page_t **ppp, page_t *pp)
1850 {
1851         if (*ppp == NULL) {
1852                 pp->p_next = pp->p_prev = pp;
1853         } else {
1854                 pp->p_next = *ppp;
1855                 pp->p_prev = (*ppp)->p_prev;
1856                 (*ppp)->p_prev = pp;
1857                 pp->p_prev->p_next = pp;
1858         }
1859         *ppp = pp;
1860 }
1861
1862 /*
1863  * Remove this page from a linked list of pages
1864  * using the p_next & p_prev pointers for the list.
1865  *
1866  * The caller is responsible for protecting the list pointers.
1867  */
1868 void
1869 mach_page_sub(page_t **ppp, page_t *pp)
1870 {
1871         ASSERT(PP_ISFREE(pp));
1872
1873         if (*ppp == NULL || pp == NULL)
1874                 panic("mach_page_sub");
1875
1876         if (*ppp == pp)
1877                 *ppp = pp->p_next;              /* go to next page */
1878
1879         if (*ppp == pp)
1880                 *ppp = NULL;                    /* page list is gone */
1881         else {
1882                 pp->p_prev->p_next = pp->p_next;
1883                 pp->p_next->p_prev = pp->p_prev;
1884         }
1885         pp->p_prev = pp->p_next = pp;           /* make pp a list of one */
1886 }
1887
1888 /*
1889  * Routine fsflush uses to gradually coalesce the free list into larger pages.
1890  */
1891 void
1892 page_promote_size(page_t *pp, uint_t cur_szc)
1893 {
1894         pfn_t pfn;
1895         int mnode;
1896         int idx;
1897         int new_szc = cur_szc + 1;
1898         int full = FULL_REGION_CNT(new_szc);
1899
1900         pfn = page_pptonum(pp);
1901         mnode = PFN_2_MEM_NODE(pfn);
1902
1903         page_freelist_lock(mnode);
1904
1905         idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1906         if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1907                 (void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1908
1909         page_freelist_unlock(mnode);
1910 }
1911
1912 static uint_t page_promote_err;
1913 static uint_t page_promote_noreloc_err;
1914
1915 /*
1916  * Create a single larger page (of szc new_szc) from smaller contiguous pages
1917  * for the given mnode starting at pfnum. Pages involved are on the freelist
1918  * before the call and may be returned to the caller if requested, otherwise
1919  * they will be placed back on the freelist.
1920  * If flags is PC_ALLOC, then the large page will be returned to the user in
1921  * a state which is consistent with a page being taken off the freelist.  If
1922  * we failed to lock the new large page, then we will return NULL to the
1923  * caller and put the large page on the freelist instead.
1924  * If flags is PC_FREE, then the large page will be placed on the freelist,
1925  * and NULL will be returned.
1926  * The caller is responsible for locking the freelist as well as any other
1927  * accounting which needs to be done for a returned page.
1928  *
1929  * RFE: For performance pass in pp instead of pfnum so
1930  *      we can avoid excessive calls to page_numtopp_nolock().
1931  *      This would depend on an assumption that all contiguous
1932  *      pages are in the same memseg so we can just add/dec
1933  *      our pp.
1934  *
1935  * Lock ordering:
1936  *
1937  *      There is a potential but rare deadlock situation
1938  *      for page promotion and demotion operations. The problem
1939  *      is there are two paths into the freelist manager and
1940  *      they have different lock orders:
1941  *
1942  *      page_create()
1943  *              lock freelist
1944  *              page_lock(EXCL)
1945  *              unlock freelist
1946  *              return
1947  *              caller drops page_lock
1948  *
1949  *      page_free() and page_reclaim()
1950  *              caller grabs page_lock(EXCL)
1951  *
1952  *              lock freelist
1953  *              unlock freelist
1954  *              drop page_lock
1955  *
1956  *      What prevents a thread in page_create() from deadlocking
1957  *      with a thread freeing or reclaiming the same page is the
1958  *      page_trylock() in page_get_freelist(). If the trylock fails
1959  *      it skips the page.
1960  *
1961  *      The lock ordering for promotion and demotion is the same as
1962  *      for page_create(). Since the same deadlock could occur during
1963  *      page promotion and freeing or reclaiming of a page on the
1964  *      cache list we might have to fail the operation and undo what
1965  *      have done so far. Again this is rare.
1966  */
1967 page_t *
1968 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
1969 {
1970         page_t          *pp, *pplist, *tpp, *start_pp;
1971         pgcnt_t         new_npgs, npgs;
1972         uint_t          bin;
1973         pgcnt_t         tmpnpgs, pages_left;
1974         uint_t          noreloc;
1975         int             which_list;
1976         ulong_t         index;
1977         kmutex_t        *phm;
1978
1979         /*
1980          * General algorithm:
1981          * Find the starting page
1982          * Walk each page struct removing it from the freelist,
1983          * and linking it to all the other pages removed.
1984          * Once all pages are off the freelist,
1985          * walk the list, modifying p_szc to new_szc and what
1986          * ever other info needs to be done to create a large free page.
1987          * According to the flags, either return the page or put it
1988          * on the freelist.
1989          */
1990
1991         start_pp = page_numtopp_nolock(pfnum);
1992         ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1993         new_npgs = page_get_pagecnt(new_szc);
1994         ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1995
1996         /* don't return page of the wrong mtype */
1997         if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
1998                         return (NULL);
1999
2000         /*
2001          * Loop through smaller pages to confirm that all pages
2002          * give the same result for PP_ISNORELOC().
2003          * We can check this reliably here as the protocol for setting
2004          * P_NORELOC requires pages to be taken off the free list first.
2005          */
2006         noreloc = PP_ISNORELOC(start_pp);
2007         for (pp = start_pp + new_npgs; --pp > start_pp; ) {
2008                 if (noreloc != PP_ISNORELOC(pp)) {
2009                         page_promote_noreloc_err++;
2010                         page_promote_err++;
2011                         return (NULL);
2012                 }
2013         }
2014
2015         pages_left = new_npgs;
2016         pplist = NULL;
2017         pp = start_pp;
2018
2019         /* Loop around coalescing the smaller pages into a big page. */
2020         while (pages_left) {
2021                 /*
2022                  * Remove from the freelist.
2023                  */
2024                 ASSERT(PP_ISFREE(pp));
2025                 bin = PP_2_BIN(pp);
2026                 ASSERT(mnode == PP_2_MEM_NODE(pp));
2027                 mtype = PP_2_MTYPE(pp);
2028                 if (PP_ISAGED(pp)) {
2029
2030                         /*
2031                          * PG_FREE_LIST
2032                          */
2033                         if (pp->p_szc) {
2034                                 page_lpsub(&PAGE_FREELISTS(mnode,
2035                                     pp->p_szc, bin, mtype), pp);
2036                         } else {
2037                                 mach_page_sub(&PAGE_FREELISTS(mnode, 0,
2038                                     bin, mtype), pp);
2039                         }
2040                         which_list = PG_FREE_LIST;
2041                 } else {
2042                         struct vmobject *obj;
2043
2044                         ASSERT(pp->p_szc == 0);
2045
2046                         /*
2047                          * PG_CACHE_LIST
2048                          *
2049                          * Since this page comes from the
2050                          * cachelist, we must destroy the
2051                          * vnode association.
2052                          */
2053                         if (!page_trylock(pp, SE_EXCL)) {
2054                                 goto fail_promote;
2055                         }
2056
2057                         obj = &pp->p_vnode->v_object;
2058
2059                         /*
2060                          * We need to be careful not to deadlock
2061                          * with another thread in page_lookup().
2062                          * The page_lookup() thread could be holding
2063                          * the same phm that we need if the two
2064                          * pages happen to hash to the same phm lock.
2065                          * At this point we have locked the entire
2066                          * freelist and page_lookup() could be trying
2067                          * to grab a freelist lock.
2068                          */
2069                         if (!vmobject_trylock(obj)) {
2070                                 page_unlock_nocapture(pp);
2071                                 goto fail_promote;
2072                         }
2073
2074                         mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
2075                         page_hashout(pp, true);
2076                         vmobject_unlock(obj);
2077                         PP_SETAGED(pp);
2078                         page_unlock_nocapture(pp);
2079                         which_list = PG_CACHE_LIST;
2080                 }
2081                 page_ctr_sub(mnode, mtype, pp, which_list);
2082
2083                 /*
2084                  * Concatenate the smaller page(s) onto
2085                  * the large page list.
2086                  */
2087                 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
2088                 pages_left -= npgs;
2089                 tpp = pp;
2090                 while (npgs--) {
2091                         tpp->p_szc = new_szc;
2092                         tpp = tpp->p_next;
2093                 }
2094                 page_list_concat(&pplist, &pp);
2095                 pp += tmpnpgs;
2096         }
2097         CHK_LPG(pplist, new_szc);
2098
2099         /*
2100          * return the page to the user if requested
2101          * in the properly locked state.
2102          */
2103         if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
2104                 return (pplist);
2105         }
2106
2107         /*
2108          * Otherwise place the new large page on the freelist
2109          */
2110         bin = PP_2_BIN(pplist);
2111         mnode = PP_2_MEM_NODE(pplist);
2112         mtype = PP_2_MTYPE(pplist);
2113         page_lpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
2114
2115         page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
2116         return (NULL);
2117
2118 fail_promote:
2119         /*
2120          * A thread must have still been freeing or
2121          * reclaiming the page on the cachelist.
2122          * To prevent a deadlock undo what we have
2123          * done sofar and return failure. This
2124          * situation can only happen while promoting
2125          * PAGESIZE pages.
2126          */
2127         page_promote_err++;
2128         while (pplist) {
2129                 pp = pplist;
2130                 mach_page_sub(&pplist, pp);
2131                 pp->p_szc = 0;
2132                 bin = PP_2_BIN(pp);
2133                 mtype = PP_2_MTYPE(pp);
2134                 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2135                 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2136         }
2137         return (NULL);
2138
2139 }
2140
2141 /*
2142  * Break up a large page into smaller size pages.
2143  * Pages involved are on the freelist before the call and may
2144  * be returned to the caller if requested, otherwise they will
2145  * be placed back on the freelist.
2146  * The caller is responsible for locking the freelist as well as any other
2147  * accounting which needs to be done for a returned page.
2148  * If flags is not PC_ALLOC, the color argument is ignored, and thus
2149  * technically, any value may be passed in but PC_NO_COLOR is the standard
2150  * which should be followed for clarity's sake.
2151  * Returns a page whose pfn is < pfnmax
2152  */
2153 page_t *
2154 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
2155     uchar_t new_szc, int color, int flags)
2156 {
2157         page_t  *pp, *pplist, *npplist;
2158         pgcnt_t npgs, n;
2159         uint_t  bin;
2160         uint_t  mtype;
2161         page_t  *ret_pp = NULL;
2162
2163         ASSERT(cur_szc != 0);
2164         ASSERT(new_szc < cur_szc);
2165
2166         pplist = page_numtopp_nolock(pfnum);
2167         ASSERT(pplist != NULL);
2168
2169         ASSERT(pplist->p_szc == cur_szc);
2170
2171         bin = PP_2_BIN(pplist);
2172         ASSERT(mnode == PP_2_MEM_NODE(pplist));
2173         mtype = PP_2_MTYPE(pplist);
2174         page_lpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2175
2176         CHK_LPG(pplist, cur_szc);
2177         page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2178
2179         /*
2180          * Number of PAGESIZE pages for smaller new_szc
2181          * page.
2182          */
2183         npgs = page_get_pagecnt(new_szc);
2184
2185         while (pplist) {
2186                 pp = pplist;
2187
2188                 ASSERT(pp->p_szc == cur_szc);
2189
2190                 /*
2191                  * We either break it up into PAGESIZE pages or larger.
2192                  */
2193                 if (npgs == 1) {        /* PAGESIZE case */
2194                         mach_page_sub(&pplist, pp);
2195                         ASSERT(pp->p_szc == cur_szc);
2196                         ASSERT(new_szc == 0);
2197                         ASSERT(mnode == PP_2_MEM_NODE(pp));
2198                         pp->p_szc = new_szc;
2199                         bin = PP_2_BIN(pp);
2200                         if ((bin == color) && (flags == PC_ALLOC) &&
2201                             (ret_pp == NULL) && (pfnmax == 0 ||
2202                             pp->p_pagenum < pfnmax) &&
2203                             page_trylock_cons(pp, SE_EXCL)) {
2204                                 ret_pp = pp;
2205                         } else {
2206                                 mtype = PP_2_MTYPE(pp);
2207                                 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2208                                     mtype), pp);
2209                                 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2210                         }
2211                 } else {
2212                         page_t *try_to_return_this_page = NULL;
2213                         int count = 0;
2214
2215                         /*
2216                          * Break down into smaller lists of pages.
2217                          */
2218                         page_list_break(&pplist, &npplist, npgs);
2219
2220                         pp = pplist;
2221                         n = npgs;
2222                         while (n--) {
2223                                 ASSERT(pp->p_szc == cur_szc);
2224                                 /*
2225                                  * Check whether all the pages in this list
2226                                  * fit the request criteria.
2227                                  */
2228                                 if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
2229                                         count++;
2230                                 }
2231                                 pp->p_szc = new_szc;
2232                                 pp = pp->p_next;
2233                         }
2234
2235                         if (count == npgs &&
2236                             (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
2237                                 try_to_return_this_page = pp;
2238                         }
2239
2240                         CHK_LPG(pplist, new_szc);
2241
2242                         bin = PP_2_BIN(pplist);
2243                         if (try_to_return_this_page)
2244                                 ASSERT(mnode ==
2245                                     PP_2_MEM_NODE(try_to_return_this_page));
2246                         if ((bin == color) && (flags == PC_ALLOC) &&
2247                             (ret_pp == NULL) && try_to_return_this_page &&
2248                             page_trylock_cons(try_to_return_this_page,
2249                             SE_EXCL)) {
2250                                 ret_pp = try_to_return_this_page;
2251                         } else {
2252                                 mtype = PP_2_MTYPE(pp);
2253                                 page_lpadd(&PAGE_FREELISTS(mnode, new_szc,
2254                                     bin, mtype), pplist);
2255
2256                                 page_ctr_add(mnode, mtype, pplist,
2257                                     PG_FREE_LIST);
2258                         }
2259                         pplist = npplist;
2260                 }
2261         }
2262         return (ret_pp);
2263 }
2264
/*
 * When set to a non-zero value, page_freelist_coalesce() returns NULL
 * right away without attempting to coalesce any pages.
 */
int mpss_coalesce_disable = 0;
2266
2267 /*
2268  * Coalesce free pages into a page of the given szc and color if possible.
2269  * Return the pointer to the page created, otherwise, return NULL.
2270  *
2271  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2272  */
2273 page_t *
2274 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
2275     int mtype, pfn_t pfnhi)
2276 {
2277         int     r = szc;                /* region size */
2278         int     mrange;
2279         uint_t  full, bin, color_mask, wrap = 0;
2280         pfn_t   pfnum, lo, hi;
2281         size_t  len, idx, idx0;
2282         pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc);
2283         page_t  *ret_pp;
2284         MEM_NODE_ITERATOR_DECL(it);
2285 #if defined(__sparc)
2286         pfn_t pfnum0, nlo, nhi;
2287 #endif
2288
2289         if (mpss_coalesce_disable) {
2290                 ASSERT(szc < MMU_PAGE_SIZES);
2291                 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
2292                 return (NULL);
2293         }
2294
2295         ASSERT(szc < mmu_page_sizes);
2296         color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
2297         ASSERT(ceq_mask <= color_mask);
2298         ASSERT(color <= color_mask);
2299         color &= ceq_mask;
2300
2301         /* Prevent page_counters dynamic memory from being freed */
2302         rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2303
2304         mrange = MTYPE_2_MRANGE(mnode, mtype);
2305         ASSERT(mrange < mnode_nranges[mnode]);
2306         VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
2307
2308         /* get pfn range for mtype */
2309         len = PAGE_COUNTERS_ENTRIES(mnode, r);
2310         MNODETYPE_2_PFN(mnode, mtype, lo, hi);
2311         hi++;
2312
2313         /* use lower limit if given */
2314         if (pfnhi != PFNNULL && pfnhi < hi)
2315                 hi = pfnhi;
2316
2317         /* round to szcpgcnt boundaries */
2318         lo = P2ROUNDUP(lo, szcpgcnt);
2319         MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
2320         if (lo == (pfn_t)-1) {
2321                 rw_exit(&page_ctrs_rwlock[mnode]);
2322                 return (NULL);
2323         }
2324         hi = hi & ~(szcpgcnt - 1);
2325
2326         /* set lo to the closest pfn of the right color */
2327         if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2328             (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2329                 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2330                     &it);
2331         }
2332
2333         if (hi <= lo) {
2334                 rw_exit(&page_ctrs_rwlock[mnode]);
2335                 return (NULL);
2336         }
2337
2338         full = FULL_REGION_CNT(r);
2339
2340         /* calculate the number of page candidates and initial search index */
2341         bin = color;
2342         idx0 = (size_t)(-1);
2343         do {
2344                 pgcnt_t acand;
2345
2346                 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
2347                 if (acand) {
2348                         idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
2349                             r, bin, mrange);
2350                         idx0 = MIN(idx0, idx);
2351                         cands += acand;
2352                 }
2353                 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
2354         } while (bin != color);
2355
2356         if (cands == 0) {
2357                 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
2358                 rw_exit(&page_ctrs_rwlock[mnode]);
2359                 return (NULL);
2360         }
2361
2362         pfnum = IDX_TO_PNUM(mnode, r, idx0);
2363         if (pfnum < lo || pfnum >= hi) {
2364                 pfnum = lo;
2365         } else {
2366                 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2367                 if (pfnum == (pfn_t)-1) {
2368                         pfnum = lo;
2369                         MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2370                         ASSERT(pfnum != (pfn_t)-1);
2371                 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2372                     (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2373                         /* invalid color, get the closest correct pfn */
2374                         PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2375                             color_mask, &it);
2376                         if (pfnum >= hi) {
2377                                 pfnum = lo;
2378                                 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2379                         }
2380                 }
2381         }
2382
2383         /* set starting index */
2384         idx0 = PNUM_TO_IDX(mnode, r, pfnum);
2385         ASSERT(idx0 < len);
2386
2387 #if defined(__sparc)
2388         pfnum0 = pfnum;         /* page corresponding to idx0 */
2389         nhi = 0;                /* search kcage ranges */
2390 #endif
2391
2392         for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
2393
2394 #if defined(__sparc)
2395                 /*
2396                  * Find lowest intersection of kcage ranges and mnode.
2397                  * MTYPE_NORELOC means look in the cage, otherwise outside.
2398                  */
2399                 if (nhi <= pfnum) {
2400                         if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
2401                             (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
2402                                 goto wrapit;
2403
2404                         /* jump to the next page in the range */
2405                         if (pfnum < nlo) {
2406                                 pfnum = P2ROUNDUP(nlo, szcpgcnt);
2407                                 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2408                                 idx = PNUM_TO_IDX(mnode, r, pfnum);
2409                                 if (idx >= len || pfnum >= hi)
2410                                         goto wrapit;
2411                                 if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
2412                                     ceq_mask)
2413                                         goto next;
2414                                 if (interleaved_mnodes &&
2415                                     PFN_2_MEM_NODE(pfnum) != mnode)
2416                                         goto next;
2417                         }
2418                 }
2419 #endif
2420
2421                 if (PAGE_COUNTERS(mnode, r, idx) != full)
2422                         goto next;
2423
2424                 /*
2425                  * RFE: For performance maybe we can do something less
2426                  *      brutal than locking the entire freelist. So far
2427                  *      this doesn't seem to be a performance problem?
2428                  */
2429                 page_freelist_lock(mnode);
2430                 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2431                         ret_pp =
2432                             page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
2433                         if (ret_pp != NULL) {
2434                                 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
2435                                 PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2436                                     PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
2437                                 page_freelist_unlock(mnode);
2438                                 rw_exit(&page_ctrs_rwlock[mnode]);
2439 #if defined(__sparc)
2440                                 if (PP_ISNORELOC(ret_pp)) {
2441                                         pgcnt_t npgs;
2442
2443                                         npgs = page_get_pagecnt(ret_pp->p_szc);
2444                                         kcage_freemem_sub(npgs);
2445                                 }
2446 #endif
2447                                 return (ret_pp);
2448                         }
2449                 } else {
2450                         VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
2451                 }
2452
2453                 page_freelist_unlock(mnode);
2454                 /*
2455                  * No point looking for another page if we've
2456                  * already tried all of the ones that
2457                  * page_ctr_cands indicated.  Stash off where we left
2458                  * off.
2459                  * Note: this is not exact since we don't hold the
2460                  * page_freelist_locks before we initially get the
2461                  * value of cands for performance reasons, but should
2462                  * be a decent approximation.
2463                  */
2464                 if (--cands == 0) {
2465                         PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
2466                             idx;
2467                         break;
2468                 }
2469 next:
2470                 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2471                     color_mask, &it);
2472                 idx = PNUM_TO_IDX(mnode, r, pfnum);
2473                 if (idx >= len || pfnum >= hi) {
2474 wrapit:
2475                         pfnum = lo;
2476                         MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2477                         idx = PNUM_TO_IDX(mnode, r, pfnum);
2478                         wrap++;
2479 #if defined(__sparc)
2480                         nhi = 0;        /* search kcage ranges */
2481 #endif
2482                 }
2483         }
2484
2485         rw_exit(&page_ctrs_rwlock[mnode]);
2486         VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
2487         return (NULL);
2488 }
2489
2490 /*
2491  * For the given mnode, promote as many small pages to large pages as possible.
2492  * mnode can be -1, which means do them all
2493  */
2494 void
2495 page_freelist_coalesce_all(int mnode)
2496 {
2497         int     r;              /* region size */
2498         int     idx, full;
2499         size_t  len;
2500         int doall = interleaved_mnodes || mnode < 0;
2501         int mlo = doall ? 0 : mnode;
2502         int mhi = doall ? max_mem_nodes : (mnode + 1);
2503
2504         VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2505
2506         if (mpss_coalesce_disable) {
2507                 return;
2508         }
2509
2510         /*
2511          * Lock the entire freelist and coalesce what we can.
2512          *
2513          * Always promote to the largest page possible
2514          * first to reduce the number of page promotions.
2515          */
2516         for (mnode = mlo; mnode < mhi; mnode++) {
2517                 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2518                 page_freelist_lock(mnode);
2519         }
2520         for (r = mmu_page_sizes - 1; r > 0; r--) {
2521                 for (mnode = mlo; mnode < mhi; mnode++) {
2522                         pgcnt_t cands = 0;
2523                         int mrange, nranges = mnode_nranges[mnode];
2524
2525                         for (mrange = 0; mrange < nranges; mrange++) {
2526                                 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2527                                 if (cands != 0)
2528                                         break;
2529                         }
2530                         if (cands == 0) {
2531                                 VM_STAT_ADD(vmm_vmstats.
2532                                     page_ctrs_cands_skip_all);
2533                                 continue;
2534                         }
2535
2536                         full = FULL_REGION_CNT(r);
2537                         len  = PAGE_COUNTERS_ENTRIES(mnode, r);
2538
2539                         for (idx = 0; idx < len; idx++) {
2540                                 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2541                                         pfn_t pfnum =
2542                                             IDX_TO_PNUM(mnode, r, idx);
2543                                         int tmnode = interleaved_mnodes ?
2544                                             PFN_2_MEM_NODE(pfnum) : mnode;
2545
2546                                         ASSERT(pfnum >=
2547                                             mem_node_config[tmnode].physbase &&
2548                                             pfnum <
2549                                             mem_node_config[tmnode].physmax);
2550
2551                                         (void) page_promote(tmnode,
2552                                             pfnum, r, PC_FREE, PC_MTYPE_ANY);
2553                                 }
2554                         }
2555                         /* shared hpm_counters covers all mnodes, so we quit */
2556                         if (interleaved_mnodes)
2557                                 break;
2558                 }
2559         }
2560         for (mnode = mlo; mnode < mhi; mnode++) {
2561                 page_freelist_unlock(mnode);
2562                 rw_exit(&page_ctrs_rwlock[mnode]);
2563         }
2564 }
2565
2566 /*
2567  * This is where all policies for moving pages around
2568  * to different page size free lists are implemented.
2569  * Returns the page_t obtained from page_demote() on success, or NULL
      * when no larger free page could be split.
2570  *
2571  * So far these are the priorities for this algorithm in descending
2572  * order:
2573  *
2574  *      1) When servicing a request try to do so with a free page
2575  *         from next size up. Helps defer fragmentation as long
2576  *         as possible.
2577  *
2578  *      2) Page coalesce on demand. Only when a freelist
2579  *         larger than PAGESIZE is empty and step 1
2580  *         will not work since all larger size lists are
2581  *         also empty.
2582  *
2583  * If pfnhi is not PFNNULL, only large pages whose p_pagenum is below pfnhi
      * are considered.  If pfnlo is not PFNNULL, only large pages whose
      * p_pagenum is at or above pfnlo are considered.
      *
      * Note that this function itself only performs step 1 (splitting a
      * larger page); it contains no call that coalesces pages.
      *
      * plw is the caller's page list walker; plw_bins[] is decremented here
      * as bins of the larger sizes are examined.
2584  */
2585
2586 page_t *
2587 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2588     pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
2589 {
2590         uchar_t nszc = szc + 1;
2591         uint_t  bin, sbin, bin_prev;
2592         page_t  *pp, *firstpp;
2593         page_t  *ret_pp = NULL;
2594         uint_t  color_mask;
2595
             /* szc is already the largest page size: nothing to split */
2596         if (nszc == mmu_page_sizes)
2597                 return (NULL);
2598
2599         ASSERT(nszc < mmu_page_sizes);
2600         color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
             /* bin walks the nszc freelist bins; sbin remembers the start */
2601         bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
             /*
              * bin_prev is INVALID_COLOR when the walker's previous split bin
              * equals the requested color; otherwise it is that bin converted
              * with PAGE_GET_NSZ_COLOR().  It is used below to stop before
              * moving up to a size whose bin is equivalent to it.
              */
2602         bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2603             PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2604
2605         VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2606         /*
2607          * First try to break up a larger page to fill current size freelist.
2608          */
2609         while (plw->plw_bins[nszc] != 0) {
2610
2611                 ASSERT(nszc < mmu_page_sizes);
2612
2613                 /*
2614                  * If page found then demote it.
                      * The list head is first peeked at without the lock and
                      * then re-read once page_freelist_lock() is held.
2615                  */
2616                 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2617                         page_freelist_lock(mnode);
2618                         firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2619
2620                         /*
2621                          * If pfnhi is not PFNNULL, look for large page below
2622                          * pfnhi. PFNNULL signifies no pfn requirement.
                              * The same applies to pfnlo as a lower bound.
                              * The largepg list is followed until a page in
                              * range is found or we are back at firstpp, in
                              * which case pp is set to NULL.
2623                          */
2624                         if (pp &&
2625                             ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
2626                             (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
2627                                 do {
2628                                         pp = pp->p_list.largepg.next;
2629                                         if (pp == firstpp) {
2630                                                 pp = NULL;
2631                                                 break;
2632                                         }
2633                                 } while ((pfnhi != PFNNULL &&
2634                                     pp->p_pagenum >= pfnhi) ||
2635                                     (pfnlo != PFNNULL &&
2636                                     pp->p_pagenum < pfnlo));
2637
2638                                 if (pfnhi != PFNNULL && pp != NULL)
2639                                         ASSERT(pp->p_pagenum < pfnhi);
2640
2641                                 if (pfnlo != PFNNULL && pp != NULL)
2642                                         ASSERT(pp->p_pagenum >= pfnlo);
2643                         }
2644                         if (pp) {
                                     /* color handed to page_demote() */
2645                                 uint_t ccolor = page_correct_color(szc, nszc,
2646                                     color, bin, plw->plw_ceq_mask[szc]);
2647
2648                                 ASSERT(pp->p_szc == nszc);
2649                                 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2650                                 ret_pp = page_demote(mnode, pp->p_pagenum,
2651                                     pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
2652                                 if (ret_pp) {
                                             /* success: drop the lock and return */
2653                                         page_freelist_unlock(mnode);
2654 #if defined(__sparc)
                                             /*
                                              * For a NORELOC page, subtract its
                                              * page count via kcage_freemem_sub().
                                              */
2655                                         if (PP_ISNORELOC(ret_pp)) {
2656                                                 pgcnt_t npgs;
2657
2658                                                 npgs = page_get_pagecnt(
2659                                                     ret_pp->p_szc);
2660                                                 kcage_freemem_sub(npgs);
2661                                         }
2662 #endif
2663                                         return (ret_pp);
2664                                 }
2665                         }
2666                         page_freelist_unlock(mnode);
2667                 }
2668
2669                 /* loop through next size bins */
2670                 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
                     /* one fewer bin of this size left to examine */
2671                 plw->plw_bins[nszc]--;
2672
                     /*
                      * Back at the starting bin: this size has been fully
                      * walked, so consider moving up one more page size.
                      */
2673                 if (bin == sbin) {
2674                         uchar_t nnszc = nszc + 1;
2675
2676                         /* we are done with this page size - check next */
2677                         if (plw->plw_bins[nnszc] == 0)
2678                                 /* we have already checked next size bins */
2679                                 break;
2680
2681                         bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2682                         if (bin_prev != INVALID_COLOR) {
                                     /*
                                      * Stop if the new bin is equivalent (under
                                      * the nnszc mask) to the previous split bin.
                                      */
2683                                 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2684                                 if (!((bin ^ bin_prev) &
2685                                     plw->plw_ceq_mask[nnszc]))
2686                                         break;
2687                         }
2688                         ASSERT(nnszc < mmu_page_sizes);
2689                         color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2690                         nszc = nnszc;
2691                         ASSERT(nszc < mmu_page_sizes);
2692                 }
2693         }
2694
             /* ret_pp is still NULL here: nothing could be split */
2695         return (ret_pp);
2696 }
2697
2698 /*
2699  * Helper routine used only by the freelist code to lock
2700  * a page. If the page is a large page then it succeeds in
2701  * locking all the constituent pages or none at all.
2702  * Returns 1 on sucess, 0 on failure.
2703  */
2704 static int
2705 page_trylock_cons(page_t *pp, se_t se)
2706 {
2707         page_t  *tpp, *first_pp = pp;
2708
2709         /*
2710          * Fail if can't lock first or only page.
2711          */
2712         if (!page_trylock(pp, se)) {
2713                 return (0);
2714         }
2715
2716         /*
2717          * PAGESIZE: common case.
2718          */
2719         if (pp->p_szc == 0) {
2720                 return (1);
2721         }
2722
2723         /*
2724          * Large page case.
2725          */
2726         tpp = pp->p_next;
2727         while (tpp != pp) {
2728                 if (!page_trylock(tpp, se)) {
2729                         /*
2730                          * On failure unlock what we have locked so far.
2731                          * We want to avoid attempting to capture these
2732                          * pages as the pcm mutex may be held which could
2733                          * lead to a recursive mutex panic.
2734                          */
2735                         while (first_pp != tpp) {
2736                                 page_unlock_nocapture(first_pp);
2737                                 first_pp = first_pp->p_next;
2738                         }
2739                         return (0);
2740                 }
2741                 tpp = tpp->p_next;
2742         }
2743         return (1);
2744 }
2745
2746 /*
2747  * init context for walking page lists
2748  * Called when a page of the given szc in unavailable. Sets markers
2749  * for the beginning of the search to detect when search has
2750  * completed a full cycle. Sets flags for splitting larger pages
2751  * and coalescing smaller pages. Page walking procedes until a page
2752  * of the desired equivalent color is found.
2753  */
2754 void
2755 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2756     int use_ceq, page_list_walker_t *plw)
2757 {
2758         uint_t  nszc, ceq_mask, colors;
2759         uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2760
2761         ASSERT(szc < mmu_page_sizes);
2762         colors = PAGE_GET_PAGECOLORS(szc);
2763
2764         plw->plw_colors = colors;
2765         plw->plw_color_mask = colors - 1;
2766         plw->plw_bin_marker = plw->plw_bin0 = bin;
2767         plw->plw_bin_split_prev = bin;
2768         plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2769
2770         /*
2771          * if vac aliasing is possible make sure lower order color
2772          * bits are never ignored
2773          */
2774         if (vac_colors > 1)
2775                 ceq &= 0xf0;
2776
2777         /*
2778          * calculate the number of non-equivalent colors and
2779          * color equivalency mask
2780          */
2781         plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2782         ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2783         ASSERT(plw->plw_ceq_dif > 0);
2784         plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
2785
2786         if (flags & PG_MATCH_COLOR) {
2787                 if (cpu_page_colors <  0) {
2788                         /*
2789                          * this is a heterogeneous machine with different CPUs
2790                          * having different size e$ (not supported for ni2/rock
2791                          */
2792                         uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2793                         cpucolors = MAX(cpucolors, 1);
2794                         ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2795                         plw->plw_ceq_mask[szc] =
2796                             MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2797                 }
2798                 plw->plw_ceq_dif = 1;
2799         }
2800
2801         /* we can split pages in the freelist, but not the cachelist */
2802         if (can_split) {
2803                 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2804
2805                 /* set next szc color masks and number of free list bins */
2806                 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2807                         plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2808                             plw->plw_ceq_mask[szc]);
2809                         plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2810                 }
2811                 plw->plw_ceq_mask[nszc] = INVALID_MASK;
2812                 plw->plw_bins[nszc] = 0;
2813
2814         } else {
2815                 ASSERT(szc == 0);
2816                 plw->plw_do_split = 0;
2817                 plw->plw_bins[1] = 0;
2818                 plw->plw_ceq_mask[1] = INVALID_MASK;
2819         }
2820 }
2821
/*
 * Set the mark that flags where the next split should occur.
 *
 * Stores into plw->plw_split_next the result of INC_MASKED() applied to
 * the next-size color of 'bin'.  If that value is equivalent (under
 * plw_ceq_mask[nszc]) to the next-size color of plw_bin0, it is advanced
 * once more.
 *
 * The body is wrapped in do { } while (0) so the macro acts as a single
 * statement (safe in an unbraced if/else), and 'plw' is parenthesized so
 * that any pointer expression may be passed.  Invoke it with a trailing
 * semicolon.
 */
#define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) do {                  \
        uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin);                        \
        uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, (plw)->plw_bin0);           \
        uint_t neq_mask =                                                     \
            ~(plw)->plw_ceq_mask[nszc] & (plw)->plw_color_mask;               \
        (plw)->plw_split_next =                                               \
            INC_MASKED(bin_nsz, neq_mask, (plw)->plw_color_mask);             \
        if (!(((plw)->plw_split_next ^ bin0_nsz) &                            \
            (plw)->plw_ceq_mask[nszc])) {                                     \
                (plw)->plw_split_next =                                       \
                    INC_MASKED((plw)->plw_split_next,                         \
                    neq_mask, (plw)->plw_color_mask);                         \
        }                                                                     \
} while (0)
2837
/*
 * Compute the bin to examine after 'bin' while walking the page lists for
 * page size szc, and update the split state kept in the walker.
 *
 * Returns the next bin.  As side effects it may update plw_bin_split_prev,
 * plw_split_next, plw_bin_marker and plw_do_split in *plw.
 */
2838 uint_t
2839 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2840 {
             /* color bits that are NOT part of the szc equivalence mask */
2841         uint_t  neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2842         uint_t  bin0_nsz, nbin_nsz, nbin0, nbin;
2843         uchar_t nszc = szc + 1;
2844
             /* default candidate: step from bin by plw_bin_step */
2845         nbin = ADD_MASKED(bin,
2846             plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2847
             /*
              * A split was flagged for the bin we are leaving: remember that
              * bin, compute where the next split should happen, and clear
              * the flag.
              */
2848         if (plw->plw_do_split) {
2849                 plw->plw_bin_split_prev = bin;
2850                 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2851                 plw->plw_do_split = 0;
2852         }
2853
2854         if (szc == 0) {
                     /*
                      * NOTE(review): plw_count is not set in this function; it
                      * appears to be the caller's pass counter - confirm.
                      */
2855                 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
                             /*
                              * Landed on the starting bin again: step past it
                              * and reset the previous split bin to bin0.
                              */
2856                         if (nbin == plw->plw_bin0 &&
2857                             (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2858                                 nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2859                                     neq_mask, plw->plw_color_mask);
2860                                 plw->plw_bin_split_prev = plw->plw_bin0;
2861                         }
2862
                             /*
                              * With more than one vac color, reaching the marker
                              * moves both the marker and the result one step
                              * further and returns without requesting a split.
                              */
2863                         if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2864                                 plw->plw_bin_marker =
2865                                     nbin = INC_MASKED(nbin, neq_mask,
2866                                     plw->plw_color_mask);
2867                                 plw->plw_bin_split_prev = plw->plw_bin0;
2868                                 /*
2869                                  * large pages all have the same vac color
2870                                  * so by now we should be done with next
2871                                  * size page splitting process
2872                                  */
2873                                 ASSERT(plw->plw_bins[1] == 0);
2874                                 plw->plw_do_split = 0;
2875                                 return (nbin);
2876                         }
2877
2878                 } else {
                             /*
                              * plw_count == 0 here: instead of the adjacent bin,
                              * try a jump of bin_jump bins away from bin0.
                              * bin_jump is derived from BIN_STEP with the low
                              * (vac_colors - 1) bits cleared.
                              */
2879                         uint_t bin_jump = (vac_colors == 1) ?
2880                             (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
2881
2882                         bin_jump &= ~(vac_colors - 1);
2883
2884                         nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
2885                             plw->plw_color_mask);
2886
                             /*
                              * Use the jump target only if it is not equivalent
                              * to bin0 under the szc mask; it then also becomes
                              * the marker.
                              */
2887                         if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
2888
2889                                 plw->plw_bin_marker = nbin = nbin0;
2890
2891                                 if (plw->plw_bins[nszc] != 0) {
2892                                         /*
2893                                          * check if next page size bin is the
2894                                          * same as the next page size bin for
2895                                          * bin0
                                              * (if it differs, request a split)
2896                                          */
2897                                         nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
2898                                             nbin);
2899                                         bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
2900                                             plw->plw_bin0);
2901
2902                                         if ((bin0_nsz ^ nbin_nsz) &
2903                                             plw->plw_ceq_mask[nszc])
2904                                                 plw->plw_do_split = 1;
2905                                 }
2906                                 return (nbin);
2907                         }
2908                 }
2909         }
2910
             /*
              * While next-size bins remain, request a split when the
              * next-size color of nbin is equivalent to plw_split_next.
              */
2911         if (plw->plw_bins[nszc] != 0) {
2912                 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
2913                 if (!((plw->plw_split_next ^ nbin_nsz) &
2914                     plw->plw_ceq_mask[nszc]))
2915                         plw->plw_do_split = 1;
2916         }
2917
2918         return (nbin);
2919 }
2920
/*
 * Try to obtain a free page of size szc from the freelists of mnode,
 * starting the search at color bin 'bin' and memory type 'mtype'.
 *
 * For each color group the equivalent bins are scanned first; if they are
 * all empty (or every page fails page_trylock_cons()), a larger page is
 * split via page_freelist_split() when the walker allows it, then, for
 * szc > 0, smaller pages are coalesced via page_freelist_coalesce().
 * MTYPE_NEXT() is used to move on to further mtypes.
 *
 * A page taken directly from a freelist is returned locked SE_EXCL and
 * already removed from its list.  Returns NULL if nothing was found.
 */
2921 page_t *
2922 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
2923     uint_t flags)
2924 {
2925         kmutex_t                *pcm;
2926         page_t                  *pp, *first_pp;
2927         uint_t                  sbin;
2928         int                     plw_initialized;
2929         page_list_walker_t      plw;
2930
2931         ASSERT(szc < mmu_page_sizes);
2932
2933         VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
2934
             /* MTYPE_START is a macro; mtype is checked for < 0 right after */
2935         MTYPE_START(mnode, mtype, flags);
2936         if (mtype < 0) {        /* mnode does not have memory in mtype range */
2937                 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
2938                 return (NULL);
2939         }
2940 try_again:
2941
             /*
              * The walker is set up lazily, the first time a bin turns out
              * to be unusable.  plw_ceq_dif starts at 1 so that the loop
              * below runs at least once before the walker is initialized.
              */
2942         plw_initialized = 0;
2943         plw.plw_ceq_dif = 1;
2944
2945         /*
2946          * Only hold one freelist lock at a time, that way we
2947          * can start anywhere and not have to worry about lock
2948          * ordering.
2949          */
2950         for (plw.plw_count = 0;
2951             plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
2952                 sbin = bin;
2953                 do {
                             /* unlocked peek; re-checked under pcm below */
2954                         if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
2955                                 goto bin_empty_1;
2956
2957                         pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
2958                         mutex_enter(pcm);
2959                         pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
2960                         if (pp == NULL)
2961                                 goto bin_empty_0;
2962
2963                         /*
2964                          * These were set before the page
2965                          * was put on the free list,
2966                          * they must still be set.
2967                          */
2968                         ASSERT(PP_ISFREE(pp));
2969                         ASSERT(PP_ISAGED(pp));
2970                         VERIFY(pp->p_object == NULL);
2971                         ASSERT(pp->p_vnode == NULL);
2972                         ASSERT(pp->p_offset == (uoff_t)-1);
2973                         ASSERT(pp->p_szc == szc);
2974                         ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2975
2976                         /*
2977                          * Walk down the hash chain.  4k/8k pages are linked
2978                          * on p_next and p_prev fields. Large pages are a
2979                          * contiguous group of constituent pages linked
2980                          * together on their p_next and p_prev fields.  The
2981                          * large pages are linked together on the hash chain
2982                          * using p_list.largepg of the base constituent page
2983                          * of each large page.
                              *
                              * The walk stops at the first page that can be
                              * locked SE_EXCL; if we come back around to
                              * first_pp the bin is treated as empty.
2984                          */
2985                         first_pp = pp;
2986                         while (!page_trylock_cons(pp, SE_EXCL)) {
2987                                 if (szc == 0) {
2988                                         pp = pp->p_next;
2989                                 } else {
2990                                         pp = pp->p_list.largepg.next;
2991                                 }
2992
2993                                 ASSERT(PP_ISFREE(pp));
2994                                 ASSERT(PP_ISAGED(pp));
2995                                 VERIFY(pp->p_object == NULL);
2996                                 ASSERT(pp->p_vnode == NULL);
2997                                 ASSERT(pp->p_offset == (uoff_t)-1);
2998                                 ASSERT(pp->p_szc == szc);
2999                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3000
3001                                 if (pp == first_pp)
3002                                         goto bin_empty_0;
3003                         }
3004
                             /* pp is now locked: take it off the freelist */
3005                         ASSERT(pp != NULL);
3006                         ASSERT(mtype == PP_2_MTYPE(pp));
3007                         ASSERT(pp->p_szc == szc);
3008                         if (szc == 0) {
3009                                 page_sub(&PAGE_FREELISTS(mnode,
3010                                     szc, bin, mtype), pp);
3011                         } else {
3012                                 page_lpsub(&PAGE_FREELISTS(mnode,
3013                                     szc, bin, mtype), pp);
3014                                 CHK_LPG(pp, szc);
3015                         }
3016                         page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
3017
3018                         if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
3019                                 panic("free page is not. pp %p", (void *)pp);
3020                         mutex_exit(pcm);
3021
3022 #if defined(__sparc)
                             /*
                              * With the cage on, a PG_NORELOC request must have
                              * produced a NORELOC page.
                              */
3023                         ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
3024                             (flags & PG_NORELOC) == 0);
3025
3026                         if (PP_ISNORELOC(pp))
3027                                 kcage_freemem_sub(page_get_pagecnt(szc));
3028 #endif
3029                         VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
3030                         return (pp);
3031
                             /* bin_empty_0: reached with pcm held, so drop it */
3032 bin_empty_0:
3033                         mutex_exit(pcm);
                             /* bin_empty_1: reached without pcm held */
3034 bin_empty_1:
3035                         if (plw_initialized == 0) {
3036                                 page_list_walk_init(szc, flags, bin, 1, 1,
3037                                     &plw);
3038                                 plw_initialized = 1;
3039                                 ASSERT(plw.plw_colors <=
3040                                     PAGE_GET_PAGECOLORS(szc));
3041                                 ASSERT(plw.plw_colors > 0);
3042                                 ASSERT((plw.plw_colors &
3043                                     (plw.plw_colors - 1)) == 0);
3044                                 ASSERT(bin < plw.plw_colors);
3045                                 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
3046                         }
3047                         /* calculate the next bin with equivalent color */
3048                         bin = ADD_MASKED(bin, plw.plw_bin_step,
3049                             plw.plw_ceq_mask[szc], plw.plw_color_mask);
3050                 } while (sbin != bin);
3051
3052                 /*
3053                  * color bins are all empty if color match. Try and
3054                  * satisfy the request by breaking up or coalescing
3055                  * pages from a different size freelist of the correct
3056                  * color that satisfies the ORIGINAL color requested.
3057                  * If that fails then try pages of the same size but
3058                  * different colors assuming we are not called with
3059                  * PG_MATCH_COLOR.
3060                  */
3061                 if (plw.plw_do_split &&
3062                     (pp = page_freelist_split(szc, bin, mnode,
3063                     mtype, PFNNULL, PFNNULL, &plw)) != NULL)
3064                         return (pp);
3065
3066                 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
3067                     bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) !=  NULL)
3068                         return (pp);
3069
                     /* more than one color group: move on to the next one */
3070                 if (plw.plw_ceq_dif > 1)
3071                         bin = page_list_walk_next_bin(szc, bin, &plw);
3072         }
3073
3074         /* if allowed, cycle through additional mtypes */
3075         MTYPE_NEXT(mnode, mtype, flags);
3076         if (mtype >= 0)
3077                 goto try_again;
3078
3079         VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
3080
3081         return (NULL);
3082 }
3083
3084 /*
3085  * Returns the count of free pages for 'pp' with size code 'szc'.
3086  * Note: This function does not return an exact value as the page freelist
3087  * locks are not held and thus the values in the page_counters may be
3088  * changing as we walk through the data.
3089  */
3090 static int
3091 page_freecnt(int mnode, page_t *pp, uchar_t szc)
3092 {
3093         pgcnt_t pgfree;
3094         pgcnt_t cnt;
3095         ssize_t r = szc;        /* region size */
3096         ssize_t idx;
3097         int     i;
3098         int     full, range;
3099
3100         /* Make sure pagenum passed in is aligned properly */
3101         ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
3102         ASSERT(szc > 0);
3103
3104         /* Prevent page_counters dynamic memory from being freed */
3105         rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
3106         idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3107         cnt = PAGE_COUNTERS(mnode, r, idx);
3108         pgfree = cnt << PNUM_SHIFT(r - 1);
3109         range = FULL_REGION_CNT(szc);
3110
3111         /* Check for completely full region */
3112         if (cnt == range) {
3113                 rw_exit(&page_ctrs_rwlock[mnode]);
3114                 return (pgfree);
3115         }
3116
3117         while (--r > 0) {
3118                 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3119                 full = FULL_REGION_CNT(r);
3120                 for (i = 0; i < range; i++, idx++) {
3121                         cnt = PAGE_COUNTERS(mnode, r, idx);
3122                         /*
3123                          * If cnt here is full, that means we have already
3124                          * accounted for these pages earlier.
3125                          */
3126                         if (cnt != full) {
3127                                 pgfree += (cnt << PNUM_SHIFT(r - 1));
3128                         }
3129                 }
3130                 range *= full;
3131         }
3132         rw_exit(&page_ctrs_rwlock[mnode]);
3133         return (pgfree);
3134 }
3135
3136 /*
3137  * Called from page_geti_contig_pages to exclusively lock constituent pages
3138  * starting from 'spp' for page size code 'szc'.
3139  *
3140  * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
3141  * region needs to be greater than or equal to the threshold.
3142  */
3143 static int
3144 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
3145 {
3146         pgcnt_t pgcnt = PNUM_SIZE(szc);
3147         pgcnt_t pgfree, i;
3148         page_t *pp;
3149
3150         VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
3151
3152
3153         if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
3154                 goto skipptcpcheck;
3155         /*
3156          * check if there are sufficient free pages available before attempting
3157          * to trylock. Count is approximate as page counters can change.
3158          */
3159         pgfree = page_freecnt(mnode, spp, szc);
3160
3161         /* attempt to trylock if there are sufficient already free pages */
3162         if (pgfree < pgcnt/ptcpthreshold) {
3163                 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
3164                 return (0);
3165         }
3166
3167 skipptcpcheck:
3168
3169         for (i = 0; i < pgcnt; i++) {
3170                 pp = &spp[i];
3171                 if (!page_trylock(pp, SE_EXCL)) {
3172                         VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
3173                         while (--i != (pgcnt_t)-1) {
3174                                 pp = &spp[i];
3175                                 ASSERT(PAGE_EXCL(pp));
3176                                 page_unlock_nocapture(pp);
3177                         }
3178                         return (0);
3179                 }
3180                 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
3181                 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
3182                     !PP_ISFREE(pp)) {
3183                         VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
3184                         ASSERT(i == 0);
3185                         page_unlock_nocapture(pp);
3186                         return (0);
3187                 }
3188
3189                 /*
3190                  * If a page has been marked non-relocatable or has been
3191                  * explicitly locked in memory, we don't want to relocate it;
3192                  * unlock the pages and fail the operation.
3193                  */
3194                 if (PP_ISNORELOC(pp) ||
3195                     pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
3196                         VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3197                         while (i != (pgcnt_t)-1) {
3198                                 pp = &spp[i];
3199                                 ASSERT(PAGE_EXCL(pp));
3200                                 page_unlock_nocapture(pp);
3201                                 i--;
3202                         }
3203                         return (0);
3204                 }
3205         }
3206         VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3207         return (1);
3208 }
3209
3210 /*
3211  * Claim large page pointed to by 'pp'. 'pp' is the starting set
3212  * of 'szc' constituent pages that had been locked exclusively previously.
3213  * Will attempt to relocate constituent pages in use.
3214  */
3215 static page_t *
3216 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3217 {
3218         spgcnt_t pgcnt, npgs, i;
3219         page_t *targpp, *rpp, *hpp;
3220         page_t *replpp = NULL;
3221         page_t *pplist = NULL;
3222
3223         ASSERT(pp != NULL);
3224
3225         pgcnt = page_get_pagecnt(szc);
3226         while (pgcnt) {
3227                 ASSERT(PAGE_EXCL(pp));
3228                 ASSERT(!PP_ISNORELOC(pp));
3229                 if (PP_ISFREE(pp)) {
3230                         /*
3231                          * If this is a PG_FREE_LIST page then its
3232                          * size code can change underneath us due to
3233                          * page promotion or demotion. As an optimzation
3234                          * use page_list_sub_pages() instead of
3235                          * page_list_sub().
3236                          */
3237                         if (PP_ISAGED(pp)) {
3238                                 page_list_sub_pages(pp, szc);
3239                                 if (pp->p_szc == szc) {
3240                                         return (pp);
3241                                 }
3242                                 ASSERT(pp->p_szc < szc);
3243                                 npgs = page_get_pagecnt(pp->p_szc);
3244                                 hpp = pp;
3245                                 for (i = 0; i < npgs; i++, pp++) {
3246                                         pp->p_szc = szc;
3247                                 }
3248                                 page_list_concat(&pplist, &hpp);
3249                                 pgcnt -= npgs;
3250                                 continue;
3251                         }
3252                         ASSERT(!PP_ISAGED(pp));
3253                         ASSERT(pp->p_szc == 0);
3254                         page_list_sub(pp, PG_CACHE_LIST);
3255                         page_hashout(pp, false);
3256                         PP_SETAGED(pp);
3257                         pp->p_szc = szc;
3258                         page_list_concat(&pplist, &pp);
3259                         pp++;
3260                         pgcnt--;
3261                         continue;
3262                 }
3263                 npgs = page_get_pagecnt(pp->p_szc);
3264
3265                 /*
3266                  * page_create_wait freemem accounting done by caller of
3267                  * page_get_freelist and not necessary to call it prior to
3268                  * calling page_get_replacement_page.
3269                  *
3270                  * page_get_replacement_page can call page_get_contig_pages
3271                  * to acquire a large page (szc > 0); the replacement must be
3272                  * smaller than the contig page size to avoid looping or
3273                  * szc == 0 and PGI_PGCPSZC0 is set.
3274                  */
3275                 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3276                         replpp = page_get_replacement_page(pp, NULL, 0);
3277                         if (replpp) {
3278                                 npgs = page_get_pagecnt(pp->p_szc);
3279                                 ASSERT(npgs <= pgcnt);
3280                                 targpp = pp;
3281                         }
3282                 }
3283
3284                 /*
3285                  * If replacement is NULL or do_page_relocate fails, fail
3286                  * coalescing of pages.
3287                  */
3288                 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3289                     &npgs, NULL) != 0)) {
3290                         /*
3291                          * Unlock un-processed target list
3292                          */
3293                         while (pgcnt--) {
3294                                 ASSERT(PAGE_EXCL(pp));
3295                                 page_unlock_nocapture(pp);
3296                                 pp++;
3297                         }
3298                         /*
3299                          * Free the processed target list.
3300                          */
3301                         while (pplist) {
3302                                 pp = pplist;
3303                                 page_sub(&pplist, pp);
3304                                 ASSERT(PAGE_EXCL(pp));
3305                                 ASSERT(pp->p_szc == szc);
3306                                 ASSERT(PP_ISFREE(pp));
3307                                 ASSERT(PP_ISAGED(pp));
3308                                 pp->p_szc = 0;
3309                                 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3310                                 page_unlock_nocapture(pp);
3311                         }
3312
3313                         if (replpp != NULL)
3314                                 page_free_replacement_page(replpp);
3315
3316                         return (NULL);
3317                 }
3318                 ASSERT(pp == targpp);
3319
3320                 ASSERT(hpp = pp); /* That's right, it's an assignment */
3321
3322                 pp += npgs;
3323                 pgcnt -= npgs;
3324
3325                 while (npgs--) {
3326                         ASSERT(PAGE_EXCL(targpp));
3327                         ASSERT(!PP_ISFREE(targpp));
3328                         ASSERT(!PP_ISNORELOC(targpp));
3329                         PP_SETFREE(targpp);
3330                         ASSERT(PP_ISAGED(targpp));
3331                         ASSERT(targpp->p_szc < szc || (szc == 0 &&
3332                             (flags & PGI_PGCPSZC0)));
3333                         targpp->p_szc = szc;
3334                         targpp = targpp->p_next;
3335
3336                         rpp = replpp;
3337                         ASSERT(rpp != NULL);
3338                         page_sub(&replpp, rpp);
3339                         ASSERT(PAGE_EXCL(rpp));
3340                         ASSERT(!PP_ISFREE(rpp));
3341                         page_unlock_nocapture(rpp);
3342                 }
3343                 ASSERT(targpp == hpp);
3344                 ASSERT(replpp == NULL);
3345                 page_list_concat(&pplist, &targpp);
3346         }
3347         CHK_LPG(pplist, szc);
3348         return (pplist);
3349 }
3350
3351 /*
3352  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3353  * of 0 means nothing left after trim.
3354  */
3355 int
3356 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3357 {
3358         pfn_t   kcagepfn;
3359         int     decr;
3360         int     rc = 0;
3361
3362         if (PP_ISNORELOC(mseg->pages)) {
3363                 if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3364
3365                         /* lower part of this mseg inside kernel cage */
3366                         decr = kcage_current_pfn(&kcagepfn);
3367
3368                         /* kernel cage may have transitioned past mseg */
3369                         if (kcagepfn >= mseg->pages_base &&
3370                             kcagepfn < mseg->pages_end) {
3371                                 ASSERT(decr == 0);
3372                                 *lo = MAX(kcagepfn, pfnlo);
3373                                 *hi = MIN(pfnhi, (mseg->pages_end - 1));
3374                                 rc = 1;
3375                         }
3376                 }
3377                 /* else entire mseg in the cage */
3378         } else {
3379                 if (PP_ISNORELOC(mseg->epages - 1)) {
3380
3381                         /* upper part of this mseg inside kernel cage */
3382                         decr = kcage_current_pfn(&kcagepfn);
3383
3384                         /* kernel cage may have transitioned past mseg */
3385                         if (kcagepfn >= mseg->pages_base &&
3386                             kcagepfn < mseg->pages_end) {
3387                                 ASSERT(decr);
3388                                 *hi = MIN(kcagepfn, pfnhi);
3389                                 *lo = MAX(pfnlo, mseg->pages_base);
3390                                 rc = 1;
3391                         }
3392                 } else {
3393                         /* entire mseg outside of kernel cage */
3394                         *lo = MAX(pfnlo, mseg->pages_base);
3395                         *hi = MIN(pfnhi, (mseg->pages_end - 1));
3396                         rc = 1;
3397                 }
3398         }
3399         return (rc);
3400 }
3401
3402 /*
3403  * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3404  * page with size code 'szc'. Claiming such a page requires acquiring
3405  * exclusive locks on all constituent pages (page_trylock_contig_pages),
3406  * relocating pages in use and concatenating these constituent pages into a
3407  * large page.
3408  *
3409  * The page lists do not have such a large page and page_freelist_split has
3410  * already failed to demote larger pages and/or coalesce smaller free pages.
3411  *
3412  * 'flags' may specify PG_COLOR_MATCH which would limit the search of large
3413  * pages with the same color as 'bin'.
3414  *
3415  * 'pfnflag' specifies the subset of the pfn range to search.
3416  */
3417
3418 static page_t *
3419 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3420     pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3421 {
3422         struct memseg *mseg;
3423         pgcnt_t szcpgcnt = page_get_pagecnt(szc);
3424         pgcnt_t szcpgmask = szcpgcnt - 1;
3425         pfn_t   randpfn;
3426         page_t *pp, *randpp, *endpp;
3427         uint_t colors, ceq_mask;
3428         uint_t color_mask;
3429         pfn_t hi, lo;
3430         uint_t skip;
3431         MEM_NODE_ITERATOR_DECL(it);
3432
3433         ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3434
3435         pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3436
3437         if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
3438                 return (NULL);
3439
3440         ASSERT(szc < mmu_page_sizes);
3441
3442         colors = PAGE_GET_PAGECOLORS(szc);
3443         color_mask = colors - 1;
3444         if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3445                 uchar_t ceq = colorequivszc[szc];
3446                 uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3447
3448                 ASSERT(ceq_dif > 0);
3449                 ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3450         } else {
3451                 ceq_mask = 0;
3452         }
3453
3454         ASSERT(bin < colors);
3455
3456         /* clear "non-significant" color bits */
3457         bin &= ceq_mask;
3458
3459         /*
3460          * trim the pfn range to search based on pfnflag. pfnflag is set
3461          * when there have been previous page_get_contig_page failures to
3462          * limit the search.
3463          *
3464          * The high bit in pfnflag specifies the number of 'slots' in the
3465          * pfn range and the remainder of pfnflag specifies which slot.
3466          * For example, a value of 1010b would mean the second slot of
3467          * the pfn range that has been divided into 8 slots.
3468          */
3469         if (pfnflag > 1) {
3470                 int     slots = 1 << (highbit(pfnflag) - 1);
3471                 int     slotid = pfnflag & (slots - 1);
3472                 pgcnt_t szcpages;
3473                 int     slotlen;
3474
3475                 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
3476                 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3477                 slotlen = howmany(szcpages, slots);
3478                 /* skip if 'slotid' slot is empty */
3479                 if (slotid * slotlen >= szcpages)
3480                         return (NULL);
3481                 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3482                 ASSERT(pfnlo < pfnhi);
3483                 if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3484                         pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
3485         }
3486
3487         /*
3488          * This routine is can be called recursively so we shouldn't
3489          * acquire a reader lock if a write request is pending. This
3490          * could lead to a deadlock with the DR thread.
3491          *
3492          * Returning NULL informs the caller that we could not get
3493          * a contig page with the required characteristics.
3494          */
3495
3496         if (!memsegs_trylock(0))
3497                 return (NULL);
3498
3499         /*
3500          * loop through memsegs to look for contig page candidates
3501          */
3502
3503         for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3504                 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3505                         /* no overlap */
3506                         continue;
3507                 }
3508
3509                 if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3510                         /* mseg too small */
3511                         continue;
3512
3513                 /*
3514                  * trim off kernel cage pages from pfn range and check for
3515                  * a trimmed pfn range returned that does not span the
3516                  * desired large page size.
3517                  */
3518                 if (kcage_on) {
3519                         if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
3520                             lo >= hi || ((hi - lo) + 1) < szcpgcnt)
3521                                 continue;
3522                 } else {
3523                         lo = MAX(pfnlo, mseg->pages_base);
3524                         hi = MIN(pfnhi, (mseg->pages_end - 1));
3525                 }
3526
3527                 /* round to szcpgcnt boundaries */
3528                 lo = P2ROUNDUP(lo, szcpgcnt);
3529
3530                 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3531                 hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
3532
3533                 if (hi <= lo)
3534                         continue;
3535
3536                 /*
3537                  * set lo to point to the pfn for the desired bin. Large
3538                  * page sizes may only have a single page color
3539                  */
3540                 skip = szcpgcnt;
3541                 if (ceq_mask > 0 || interleaved_mnodes) {
3542                         /* set lo to point at appropriate color */
3543                         if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3544                             (interleaved_mnodes &&
3545                             PFN_2_MEM_NODE(lo) != mnode)) {
3546                                 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3547                                     color_mask, &it);
3548                         }
3549                         if (hi <= lo)
3550                                 /* mseg cannot satisfy color request */
3551                                 continue;
3552                 }
3553
3554                 /* randomly choose a point between lo and hi to begin search */
3555
3556                 randpfn = (pfn_t)GETTICK();
3557                 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3558                 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
3559                 if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
3560                         if (randpfn != (pfn_t)-1) {
3561                                 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3562                                     ceq_mask, color_mask, &it);
3563                         }
3564                         if (randpfn >= hi) {
3565                                 randpfn = lo;
3566                                 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
3567                                     &it);
3568                         }
3569                 }
3570                 randpp = mseg->pages + (randpfn - mseg->pages_base);
3571
3572                 ASSERT(randpp->p_pagenum == randpfn);
3573
3574                 pp = randpp;
3575                 endpp =  mseg->pages + (hi - mseg->pages_base) + 1;
3576
3577                 ASSERT(randpp + szcpgcnt <= endpp);
3578
3579                 do {
3580                         ASSERT(!(pp->p_pagenum & szcpgmask));
3581                         ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3582
3583                         if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3584                                 /* pages unlocked by page_claim on failure */
3585                                 if (page_claim_contig_pages(pp, szc, flags)) {
3586                                         memsegs_unlock(0);
3587                                         return (pp);
3588                                 }
3589                         }
3590
3591                         if (ceq_mask == 0 && !interleaved_mnodes) {
3592                                 pp += skip;
3593                         } else {
3594                                 pfn_t pfn = pp->p_pagenum;
3595
3596                                 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3597                                     ceq_mask, color_mask, &it);
3598                                 if (pfn == (pfn_t)-1) {
3599                                         pp = endpp;
3600                                 } else {
3601                                         pp = mseg->pages +
3602                                             (pfn - mseg->pages_base);
3603                                 }
3604                         }
3605                         if (pp >= endpp) {
3606                                 /* start from the beginning */
3607                                 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3608                                 pp = mseg->pages + (lo - mseg->pages_base);
3609                                 ASSERT(pp->p_pagenum == lo);
3610                                 ASSERT(pp + szcpgcnt <= endpp);
3611                         }
3612                 } while (pp != randpp);
3613         }
3614         memsegs_unlock(0);
3615         return (NULL);
3616 }
3617
3618
3619 /*
3620  * controlling routine that searches through physical memory in an attempt to
3621  * claim a large page based on the input parameters.
3622  * on the page free lists.
3623  *
3624  * calls page_geti_contig_pages with an initial pfn range from the mnode
3625  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3626  * that overlaps with the kernel cage or does not match the requested page
3627  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
3628  * page_geti_contig_pages may further limit the search range based on
3629  * previous failure counts (pgcpfailcnt[]).
3630  *
3631  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3632  * pagesize page that satisfies mtype.
3633  */
3634 page_t *
3635 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3636     uint_t flags)
3637 {
3638         pfn_t           pfnlo, pfnhi;   /* contig pages pfn range */
3639         page_t          *pp;
3640         pgcnt_t         pfnflag = 0;    /* no limit on search if 0 */
3641
3642         VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3643
3644         /* no allocations from cage */
3645         flags |= PGI_NOCAGE;
3646
3647         MTYPE_START(mnode, mtype, flags);
3648         if (mtype < 0) {        /* mnode does not have memory in mtype range */
3649                 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3650                 return (NULL);
3651         }
3652
3653         ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3654
3655         /* do not limit search and ignore color if hi pri */
3656
3657         if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3658                 pfnflag = pgcpfailcnt[szc];
3659
3660         /* remove color match to improve chances */
3661
3662         if (flags & PGI_PGCPHIPRI || pfnflag)
3663                 flags &= ~PG_MATCH_COLOR;
3664
3665         do {
3666                 /* get pfn range based on mnode and mtype */
3667                 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3668
3669                 ASSERT(pfnhi >= pfnlo);
3670
3671                 pp = page_geti_contig_pages(mnode, bin, szc, flags,
3672                     pfnlo, pfnhi, pfnflag);
3673
3674                 if (pp != NULL) {
3675                         pfnflag = pgcpfailcnt[szc];
3676                         if (pfnflag) {
3677                                 /* double the search size */
3678                                 pgcpfailcnt[szc] = pfnflag >> 1;
3679                         }
3680                         VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3681                         return (pp);
3682                 }
3683                 MTYPE_NEXT(mnode, mtype, flags);
3684         } while (mtype >= 0);
3685
3686         VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3687         return (NULL);
3688 }
3689
3690 #if defined(__i386) || defined(__amd64)
3691 /*
3692  * Determine the likelihood of finding/coalescing a szc page.
3693  * Return 0 if the likelihood is small otherwise return 1.
3694  *
3695  * For now, be conservative and check only 1g pages and return 0
3696  * if there had been previous coalescing failures and the szc pages
3697  * needed to satisfy request would exhaust most of freemem.
3698  */
3699 int
3700 page_chk_freelist(uint_t szc)
3701 {
3702         pgcnt_t         pgcnt;
3703
3704         if (szc <= 1)
3705                 return (1);
3706
3707         pgcnt = page_get_pagecnt(szc);
3708         if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
3709                 VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
3710                 return (0);
3711         }
3712         VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
3713         return (1);
3714 }
3715 #endif
3716
3717 /*
3718  * Find the `best' page on the freelist for this (obj,off) (as,vaddr) pair.
3719  *
3720  * Does its own locking and accounting.
3721  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3722  * pages of the proper color even if there are pages of a different color.
3723  *
3724  * Finds a page, removes it, THEN locks it.
3725  */
3726
3727 /*ARGSUSED*/
3728 page_t *
3729 page_get_freelist(struct vmobject *obj, uoff_t off, struct seg *seg,
3730         caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3731 {
3732         struct as       *as = seg->s_as;
3733         page_t          *pp = NULL;
3734         ulong_t         bin;
3735         uchar_t         szc;
3736         int             mnode;
3737         int             mtype;
3738         page_t          *(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3739         lgrp_mnode_cookie_t     lgrp_cookie;
3740
3741         page_get_func = page_get_mnode_freelist;
3742
3743         /*
3744          * If we aren't passed a specific lgroup, or passed a freed lgrp
3745          * assume we wish to allocate near to the current thread's home.
3746          */
3747         if (!LGRP_EXISTS(lgrp))
3748                 lgrp = lgrp_home_lgrp();
3749
3750         if (kcage_on) {
3751                 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3752                     kcage_freemem < kcage_throttlefree + btop(size) &&
3753                     curthread != kcage_cageout_thread) {
3754                         /*
3755                          * Set a "reserve" of kcage_throttlefree pages for
3756                          * PG_PANIC and cageout thread allocations.
3757                          *
3758                          * Everybody else has to serialize in
3759                          * page_create_get_something() to get a cage page, so
3760                          * that we don't deadlock cageout!
3761                          */
3762                         return (NULL);
3763                 }
3764         } else {
3765                 flags &= ~PG_NORELOC;
3766                 flags |= PGI_NOCAGE;
3767         }
3768
3769         MTYPE_INIT(mtype, obj->vnode, vaddr, flags, size);
3770
3771         /*
3772          * Convert size to page size code.
3773          */
3774         if ((szc = page_szc(size)) == (uchar_t)-1)
3775                 panic("page_get_freelist: illegal page size request");
3776         ASSERT(szc < mmu_page_sizes);
3777
3778         VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3779
3780         AS_2_BIN(as, seg, obj->vnode, vaddr, bin, szc);
3781
3782         ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3783
3784         /*
3785          * Try to get a local page first, but try remote if we can't
3786          * get a page of the right color.
3787          */
3788 pgretry:
3789         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3790         while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3791                 pp = page_get_func(mnode, bin, mtype, szc, flags);
3792                 if (pp != NULL) {
3793                         VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3794                         DTRACE_PROBE4(page__get,
3795                             lgrp_t *, lgrp,
3796                             int, mnode,
3797                             ulong_t, bin,
3798                             uint_t, flags);
3799                         return (pp);
3800                 }
3801         }
3802         ASSERT(pp == NULL);
3803
3804         /*
3805          * for non-SZC0 PAGESIZE requests, check cachelist before checking
3806          * remote free lists.  Caller expected to call page_get_cachelist which
3807          * will check local cache lists and remote free lists.
3808          */
3809         if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3810                 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3811                 return (NULL);
3812         }
3813
3814         ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3815
3816         lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3817
3818         if (!(flags & PG_LOCAL)) {
3819                 /*
3820                  * Try to get a non-local freelist page.
3821                  */
3822                 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3823                 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3824                         pp = page_get_func(mnode, bin, mtype, szc, flags);
3825                         if (pp != NULL) {
3826                                 DTRACE_PROBE4(page__get,
3827                                     lgrp_t *, lgrp,
3828                                     int, mnode,
3829                                     ulong_t, bin,
3830                                     uint_t, flags);
3831                                 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3832                                 return (pp);
3833                         }
3834                 }
3835                 ASSERT(pp == NULL);
3836         }
3837
3838         /*
3839          * when the cage is off chances are page_get_contig_pages() will fail
3840          * to lock a large page chunk therefore when the cage is off it's not
3841          * called by default.  this can be changed via /etc/system.
3842          *
3843          * page_get_contig_pages() also called to acquire a base pagesize page
3844          * for page_create_get_something().
3845          */
3846         if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3847             (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3848             (page_get_func != page_get_contig_pages)) {
3849
3850                 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3851                 page_get_func = page_get_contig_pages;
3852                 goto pgretry;
3853         }
3854
3855         if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3856             page_get_func == page_get_contig_pages)
3857                 SETPGCPFAILCNT(szc);
3858
3859         VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3860         return (NULL);
3861 }
3862
3863 /*
3864  * Find the `best' page on the cachelist for this (obj,off) (as,vaddr) pair.
3865  *
3866  * Does its own locking.
3867  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3868  * pages of the proper color even if there are pages of a different color.
3869  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
3870  * try to lock one of them.  If no page can be locked, try the
3871  * next bin.  Return NULL if a page can not be found and locked.
3872  *
3873  * Finds a page, tries to lock it, then removes it from its list.
      *
      * Search order, as implemented below:
      *   1. the cachelists of the memory nodes local to lgrp;
      *   2. for each memory node farther away, its PAGESIZE freelist and
      *      then its cachelist.
      *
      * obj, seg and vaddr feed the color bin (AS_2_BIN) and memory type
      * (MTYPE_INIT) computations; off is not used.  If lgrp does not exist,
      * the current thread's home lgroup is used instead.
      *
      * Returns the page obtained from page_get_mnode_cachelist() or
      * page_get_mnode_freelist(), or NULL if none could be obtained, or if a
      * PG_NORELOC request (without PG_PANIC or PG_PUSHPAGE) arrives while
      * kcage_freemem is at or below kcage_throttlefree.
3874  */
3875
3876 /*ARGSUSED*/
3877 struct page *
3878 page_get_cachelist(struct vmobject *obj, uoff_t off, struct seg *seg,
3879     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3880 {
3881         page_t          *pp;
3882         struct as       *as = seg->s_as;
3883         ulong_t         bin;
3884         int             mnode;
3885         int             mtype;
3886         lgrp_mnode_cookie_t     lgrp_cookie;
3887
3888         /*
3889          * If we aren't passed a specific lgroup, or are passed a freed lgrp,
3890          * assume we wish to allocate near to the current thread's home.
3891          */
3892         if (!LGRP_EXISTS(lgrp))
3893                 lgrp = lgrp_home_lgrp();
3894
             /*
              * With the kernel cage off, drop PG_NORELOC from the request and
              * tag it PGI_NOCAGE instead.
              */
3895         if (!kcage_on) {
3896                 flags &= ~PG_NORELOC;
3897                 flags |= PGI_NOCAGE;
3898         }
3899
3900         if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3901             kcage_freemem <= kcage_throttlefree) {
3902                 /*
3903                  * Reserve kcage_throttlefree pages for critical kernel
3904                  * threads.
3905                  *
3906                  * Everybody else has to go to page_create_get_something()
3907                  * to get a cage page, so we don't deadlock cageout.
3908                  */
3909                 return (NULL);
3910         }
3911
             /*
              * Color bin for a base-page (szc 0) request.  AS_2_BIN and
              * MTYPE_INIT are defined outside this function; they are expected
              * to fill in bin and mtype respectively.
              */
3912         AS_2_BIN(as, seg, obj->vnode, vaddr, bin, 0);
3913
3914         ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3915
3916         MTYPE_INIT(mtype, obj->vnode, vaddr, flags, MMU_PAGESIZE);
3917
3918         VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3919
3920         /*
3921          * Try local cachelists first
3922          */
3923         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3924         while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3925                 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3926                 if (pp != NULL) {
3927                         VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3928                         DTRACE_PROBE4(page__get,
3929                             lgrp_t *, lgrp,
3930                             int, mnode,
3931                             ulong_t, bin,
3932                             uint_t, flags);
3933                         return (pp);
3934                 }
3935         }
3936
             /* Nothing local: count an allocation failure against this lgroup. */
3937         lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3938
3939         /*
3940          * Try freelists/cachelists that are farther away
3941          * This is our only chance to allocate remote pages for PAGESIZE
3942          * requests.
3943          */
3944         LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3945         while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
                     /* On each node the freelist is tried before the cachelist. */
3946                 pp = page_get_mnode_freelist(mnode, bin, mtype,
3947                     0, flags);
3948                 if (pp != NULL) {
3949                         VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3950                         DTRACE_PROBE4(page__get,
3951                             lgrp_t *, lgrp,
3952                             int, mnode,
3953                             ulong_t, bin,
3954                             uint_t, flags);
3955                         return (pp);
3956                 }
3957                 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3958                 if (pp != NULL) {
3959                         VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3960                         DTRACE_PROBE4(page__get,
3961                             lgrp_t *, lgrp,
3962                             int, mnode,
3963                             ulong_t, bin,
3964                             uint_t, flags);
3965                         return (pp);
3966                 }
3967         }
3968
3969         VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3970         return (NULL);
3971 }
3972
3973 page_t *
3974 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3975 {
             /*
              * Take one page off a cachelist of memory node mnode.
              *
              * The search starts at color bin `bin' and memory type `mtype' and
              * walks further bins (and, via MTYPE_NEXT, further memory types)
              * until a page can be locked SE_EXCL.  On success the page has
              * been removed from its cachelist, the page counters have been
              * adjusted, and the still-locked page is returned.  Returns NULL
              * if the mnode has no memory in the mtype range or no page could
              * be locked.
              */
3976         kmutex_t                *pcm;
3977         page_t                  *pp, *first_pp;
3978         uint_t                  sbin;
3979         int                     plw_initialized;
3980         page_list_walker_t      plw;
3981
3982         VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3983
3984         MTYPE_START(mnode, mtype, flags);
3985         if (mtype < 0) {        /* mnode does not have memory in mtype range */
3986                 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3987                 return (NULL);
3988         }
3989
3990 try_again:
3991
             /*
              * The walker is only filled in by page_list_walk_init() once the
              * first bin turns out to be unusable (see bin_empty_1).  Until
              * then plw_ceq_dif == 1 makes the outer loop run exactly once.
              */
3992         plw_initialized = 0;
3993         plw.plw_ceq_dif = 1;
3994
3995         /*
3996          * Only hold one cachelist lock at a time, that way we
3997          * can start anywhere and not have to worry about lock
3998          * ordering.
3999          */
4000
4001         for (plw.plw_count = 0;
4002             plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
4003                 sbin = bin;
4004                 do {
4005
                             /*
                              * Peek at the list head without the lock; only take
                              * the bin mutex if the list looks non-empty, then
                              * re-read the head under the mutex.
                              */
4006                         if (!PAGE_CACHELISTS(mnode, bin, mtype))
4007                                 goto bin_empty_1;
4008                         pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
4009                         mutex_enter(pcm);
4010                         pp = PAGE_CACHELISTS(mnode, bin, mtype);
4011                         if (pp == NULL)
4012                                 goto bin_empty_0;
4013
4014                         first_pp = pp;
4015                         VERIFY(pp->p_object);
4016                         ASSERT(pp->p_vnode);
4017                         ASSERT(PP_ISAGED(pp) == 0);
4018                         ASSERT(pp->p_szc == 0);
4019                         ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
                             /* Walk the circular list until a page can be locked. */
4020                         while (!page_trylock(pp, SE_EXCL)) {
4021                                 pp = pp->p_next;
4022                                 ASSERT(pp->p_szc == 0);
4023                                 if (pp == first_pp) {
4024                                         /*
4025                                          * We have searched the complete list!
4026                                          * And all of them (might only be one)
4027                                          * are locked. This can happen since
4028                                          * these pages can also be found via
4029                                          * the hash list. When found via the
4030                                          * hash list, they are locked first,
4031                                          * then removed. We give up to let the
4032                                          * other thread run.
4033                                          */
4034                                         pp = NULL;
4035                                         break;
4036                                 }
4037                                 VERIFY(pp->p_object);
4038                                 ASSERT(pp->p_vnode);
4039                                 ASSERT(PP_ISFREE(pp));
4040                                 ASSERT(PP_ISAGED(pp) == 0);
4041                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
4042                                     mnode);
4043                         }
4044
4045                         if (pp) {
4046                                 page_t  **ppp;
4047                                 /*
4048                                  * Found and locked a page.
4049                                  * Pull it off the list.
4050                                  */
4051                                 ASSERT(mtype == PP_2_MTYPE(pp));
4052                                 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
4053                                 page_sub(ppp, pp);
4054                                 /*
4055                                  * Subtract counters before releasing pcm mutex
4056                                  * to avoid a race with page_freelist_coalesce
4057                                  * and page_freelist_split.
4058                                  */
4059                                 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
4060                                 mutex_exit(pcm);
4061                                 VERIFY(pp->p_object);
4062                                 ASSERT(pp->p_vnode);
4063                                 ASSERT(PP_ISAGED(pp) == 0);
4064 #if defined(__sparc)
4065                                 ASSERT(!kcage_on ||
4066                                     (flags & PG_NORELOC) == 0 ||
4067                                     PP_ISNORELOC(pp));
                                     /* A non-relocatable page leaves the cage's free count. */
4068                                 if (PP_ISNORELOC(pp)) {
4069                                         kcage_freemem_sub(1);
4070                                 }
4071 #endif
4072                                 VM_STAT_ADD(vmm_vmstats. pgmc_allocok);
4073                                 return (pp);
4074                         }
                             /*
                              * bin_empty_0 is reached with pcm held (list empty
                              * under the lock, or every page on it was locked);
                              * bin_empty_1 is reached without the mutex.
                              */
4075 bin_empty_0:
4076                         mutex_exit(pcm);
4077 bin_empty_1:
4078                         if (plw_initialized == 0) {
4079                                 page_list_walk_init(0, flags, bin, 0, 1, &plw);
4080                                 plw_initialized = 1;
4081                         }
4082                         /* calculate the next bin with equivalent color */
4083                         bin = ADD_MASKED(bin, plw.plw_bin_step,
4084                             plw.plw_ceq_mask[0], plw.plw_color_mask);
4085                 } while (sbin != bin);
4086
                     /* Equivalent bins exhausted; move on if the walker has more. */
4087                 if (plw.plw_ceq_dif > 1)
4088                         bin = page_list_walk_next_bin(0, bin, &plw);
4089         }
4090
             /* Retry the whole walk with the next memory type, if there is one. */
4091         MTYPE_NEXT(mnode, mtype, flags);
4092         if (mtype >= 0)
4093                 goto try_again;
4094
4095         VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
4096         return (NULL);
4097 }
4098
4099 #ifdef DEBUG
4100 #define REPL_PAGE_STATS
4101 #endif /* DEBUG */
4102
4103 #ifdef REPL_PAGE_STATS
     /*
      * Counters for page_get_replacement_page(), compiled in only when
      * REPL_PAGE_STATS is defined (which DEBUG does above).  They are bumped
      * with REPL_STAT_INCR(), which uses atomic_inc_32(); without
      * REPL_PAGE_STATS the macro expands to nothing.
      */
4104 struct repl_page_stats {
4105         uint_t  ngets;          /* calls to page_get_replacement_page() */
4106         uint_t  ngets_noreloc;  /* like_pp was PP_ISNORELOC */
4107         uint_t  npgr_noreloc;   /* PGR_NORELOC requested, like_pp relocatable */
4108         uint_t  nnopage_first;  /* NOTE(review): no increment seen in the code reviewed here */
4109         uint_t  nnopage;        /* requests that failed and returned NULL */
4110         uint_t  nhashout;       /* cachelist pages hashed out for reuse */
4111         uint_t  nnofree;        /* local freelist empty (no target lgroup) */
4112         uint_t  nnext_pp;       /* pages moved onto the result list */
4113 } repl_page_stats;
4114 #define REPL_STAT_INCR(v)       atomic_inc_32(&repl_page_stats.v)
4115 #else /* REPL_PAGE_STATS */
4116 #define REPL_STAT_INCR(v)
4117 #endif /* REPL_PAGE_STATS */
4118
     /*
      * When nonzero, page_get_replacement_page() tries page_get_contig_pages()
      * for a large-page request that the freelists could not satisfy, even if
      * the caller did not pass PGR_SAMESZC.  As written, that path does not
      * fall back to PAGESIZE pages afterwards.
      */
4119 int     pgrppgcp;
4120
4121 /*
4122  * The freemem accounting must be done by the caller.
4123  * First we try to get a replacement page of the same size as like_pp,
4124  * if that is not possible, then we just get a set of discontiguous
4125  * PAGESIZE pages.
      *
      * orig_like_pp  page being replaced; the caller must hold it SE_EXCL
      *               (asserted).  Its p_szc determines how many PAGESIZE
      *               pages are needed.
      * lgrp_target   if this lgroup exists, pages are taken only from its
      *               local memory nodes (freelists, then cachelists for
      *               PAGESIZE pages).  Otherwise the search starts at
      *               like_pp's memory node and then moves to remote nodes.
      * pgrflags      PGR_NORELOC asks for PG_NORELOC pages; PGR_SAMESZC
      *               forbids falling back to PAGESIZE pages and is forced
      *               for kernel pages (PP_ISKAS).
      *
      * Returns a list of pages with their free and aged state cleared, or
      * NULL if the full page count could not be obtained; any partial list
      * is released with page_free_replacement_page().
4126  */
4127 page_t *
4128 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
4129     uint_t pgrflags)
4130 {
4131         page_t          *like_pp;
4132         page_t          *pp, *pplist;
4133         page_t          *pl = NULL;
4134         ulong_t         bin;
4135         int             mnode, page_mnode;
4136         int             szc;
4137         spgcnt_t        npgs, pg_cnt;
4138         pfn_t           pfnum;
4139         int             mtype;
4140         int             flags = 0;
4141         lgrp_mnode_cookie_t     lgrp_cookie;
             /*
              * Initialized because the page__get probe below reads lgrp even
              * on paths that never assign it: no lgrp_target and the local
              * freelist or cachelist satisfied the request.
              */
4142         lgrp_t          *lgrp = NULL;
4143
4144         REPL_STAT_INCR(ngets);
4145         like_pp = orig_like_pp;
4146         ASSERT(PAGE_EXCL(like_pp));
4147
4148         szc = like_pp->p_szc;
4149         npgs = page_get_pagecnt(szc);
4150         /*
4151          * Now we reset like_pp to the base page_t.
4152          * That way, we won't walk past the end of this 'szc' page.
4153          */
4154         pfnum = PFN_BASE(like_pp->p_pagenum, szc);
4155         like_pp = page_numtopp_nolock(pfnum);
4156         ASSERT(like_pp->p_szc == szc);
4157
             /*
              * A non-relocatable like_pp is replaced with PGI_RELOCONLY;
              * otherwise PGR_NORELOC from the caller becomes PG_NORELOC.
              */
4158         if (PP_ISNORELOC(like_pp)) {
4159                 ASSERT(kcage_on);
4160                 REPL_STAT_INCR(ngets_noreloc);
4161                 flags = PGI_RELOCONLY;
4162         } else if (pgrflags & PGR_NORELOC) {
4163                 ASSERT(kcage_on);
4164                 REPL_STAT_INCR(npgr_noreloc);
4165                 flags = PG_NORELOC;
4166         }
4167
4168         /*
4169          * Kernel pages must always be replaced with the same size
4170          * pages, since we cannot properly handle demotion of kernel
4171          * pages.
4172          */
4173         if (PP_ISKAS(like_pp))
4174                 pgrflags |= PGR_SAMESZC;
4175
4176         MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
4177
             /*
              * Each pass of the outer loop obtains one chunk (a page of the
              * current szc) and moves it onto pl; the inner for (;;) is the
              * search for that chunk and is left via break.
              */
4178         while (npgs) {
4179                 pplist = NULL;
4180                 for (;;) {
4181                         pg_cnt = page_get_pagecnt(szc);
4182                         bin = PP_2_BIN(like_pp);
4183                         ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
4184                         ASSERT(pg_cnt <= npgs);
4185
4186                         /*
4187                          * If an lgroup was specified, try to get the
4188                          * page from that lgroup.
4189                          * NOTE: Must be careful with code below because
4190                          *       lgroup may disappear and reappear since there
4191                          *       is no locking for lgroup here.
4192                          */
4193                         if (LGRP_EXISTS(lgrp_target)) {
4194                                 /*
4195                                  * Keep local variable for lgroup separate
4196                                  * from lgroup argument since this code should
4197                                  * only be exercised when lgroup argument
4198                                  * exists....
4199                                  */
4200                                 lgrp = lgrp_target;
4201
4202                                 /* Try the lgroup's freelists first */
4203                                 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4204                                     LGRP_SRCH_LOCAL);
4205                                 while ((pplist == NULL) &&
4206                                     (mnode = lgrp_memnode_choose(&lgrp_cookie))
4207                                     != -1) {
4208                                         pplist =
4209                                             page_get_mnode_freelist(mnode, bin,
4210                                             mtype, szc, flags);
4211                                 }
4212
4213                                 /*
4214                                  * Now try its cachelists if this is a
4215                                  * small page. Don't need to do it for
4216                                  * larger ones since page_freelist_coalesce()
4217                                  * already failed.
4218                                  */
4219                                 if (pplist != NULL || szc != 0)
4220                                         break;
4221
4222                                 /* Now try its cachelists */
4223                                 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4224                                     LGRP_SRCH_LOCAL);
4225
4226                                 while ((pplist == NULL) &&
4227                                     (mnode = lgrp_memnode_choose(&lgrp_cookie))
4228                                     != -1) {
4229                                         pplist =
4230                                             page_get_mnode_cachelist(bin, flags,
4231                                             mnode, mtype);
4232                                 }
4233                                 if (pplist != NULL) {
                                             /*
                                              * A cachelist page is hashed out
                                              * and marked aged before reuse.
                                              */
4234                                         page_hashout(pplist, false);
4235                                         PP_SETAGED(pplist);
4236                                         REPL_STAT_INCR(nhashout);
4237                                         break;
4238                                 }
4239                                 /* Done looking in this lgroup. Bail out. */
4240                                 break;
4241                         }
4242
4243                         /*
4244                          * No lgroup was specified (or lgroup was removed by
4245                          * DR), so just try to get the page as close to
4246                          * like_pp's mnode as possible.
4247                          * First try the local freelist...
4248                          */
4249                         mnode = PP_2_MEM_NODE(like_pp);
4250                         pplist = page_get_mnode_freelist(mnode, bin,
4251                             mtype, szc, flags);
4252                         if (pplist != NULL)
4253                                 break;
4254
4255                         REPL_STAT_INCR(nnofree);
4256
4257                         /*
4258                          * ...then the local cachelist. Don't need to do it for
4259                          * larger pages cause page_freelist_coalesce() already
4260                          * failed there anyway.
4261                          */
4262                         if (szc == 0) {
4263                                 pplist = page_get_mnode_cachelist(bin, flags,
4264                                     mnode, mtype);
4265                                 if (pplist != NULL) {
4266                                         page_hashout(pplist, false);
4267                                         PP_SETAGED(pplist);
4268                                         REPL_STAT_INCR(nhashout);
4269                                         break;
4270                                 }
4271                         }
4272
4273                         /* Now try remote freelists */
4274                         page_mnode = mnode;
4275                         lgrp =
4276                             lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4277                         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4278                             LGRP_SRCH_HIER);
4279                         while (pplist == NULL &&
4280                             (mnode = lgrp_memnode_choose(&lgrp_cookie))
4281                             != -1) {
4282                                 /*
4283                                  * Skip local mnode.
4284                                  */
4285                                 if ((mnode == page_mnode) ||
4286                                     (mem_node_config[mnode].exists == 0))
4287                                         continue;
4288
4289                                 pplist = page_get_mnode_freelist(mnode,
4290                                     bin, mtype, szc, flags);
4291                         }
4292
4293                         if (pplist != NULL)
4294                                 break;
4295
4296
4297                         /* Now try remote cachelists */
4298                         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4299                             LGRP_SRCH_HIER);
4300                         while (pplist == NULL && szc == 0) {
4301                                 mnode = lgrp_memnode_choose(&lgrp_cookie);
4302                                 if (mnode == -1)
4303                                         break;
4304                                 /*
4305                                  * Skip local mnode.
4306                                  */
4307                                 if ((mnode == page_mnode) ||
4308                                     (mem_node_config[mnode].exists == 0))
4309                                         continue;
4310
4311                                 pplist = page_get_mnode_cachelist(bin,
4312                                     flags, mnode, mtype);
4313
4314                                 if (pplist != NULL) {
4315                                         page_hashout(pplist, false);
4316                                         PP_SETAGED(pplist);
4317                                         REPL_STAT_INCR(nhashout);
4318                                         break;
4319                                 }
4320                         }
4321
4322                         /*
4323                          * Break out of while loop under the following cases:
4324                          * - If we successfully got a page.
4325                          * - If pgrflags specified only returning a specific
4326                          *   page size and we could not find that page size.
4327                          * - If we could not satisfy the request with PAGESIZE
4328                          *   or larger pages.
4329                          */
4330                         if (pplist != NULL || szc == 0)
4331                                 break;
4332
4333                         if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4334                                 /* try to find contig page */
4335
4336                                 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4337                                     LGRP_SRCH_HIER);
4338
4339                                 while ((pplist == NULL) &&
4340                                     (mnode =
4341                                     lgrp_memnode_choose(&lgrp_cookie))
4342                                     != -1) {
4343                                         pplist = page_get_contig_pages(
4344                                             mnode, bin, mtype, szc,
4345                                             flags | PGI_PGCPHIPRI);
4346                                 }
                                     /* Success or not, there is no smaller-size retry here. */
4347                                 break;
4348                         }
4349
4350                         /*
4351                          * The correct thing to do here is try the next
4352                          * page size down using szc--. Due to a bug
4353                          * with the processing of HAT_RELOAD_SHARE
4354                          * where the sfmmu_ttecnt arrays of all
4355                          * hats sharing an ISM segment don't get updated,
4356                          * using intermediate size pages for relocation
4357                          * can lead to continuous page faults.
4358                          */
4359                         szc = 0;
4360                 }
4361
4362                 if (pplist != NULL) {
4363                         DTRACE_PROBE4(page__get,
4364                             lgrp_t *, lgrp,
4365                             int, mnode,
4366                             ulong_t, bin,
4367                             uint_t, flags);
4368
                             /*
                              * Move the chunk's pages onto the result list pl,
                              * clearing their free/aged state, and step like_pp
                              * past the pages that are now covered.
                              */
4369                         while (pplist != NULL && pg_cnt--) {
4370                                 ASSERT(pplist != NULL);
4371                                 pp = pplist;
4372                                 page_sub(&pplist, pp);
4373                                 PP_CLRFREE(pp);
4374                                 PP_CLRAGED(pp);
4375                                 page_list_concat(&pl, &pp);
4376                                 npgs--;
4377                                 like_pp = like_pp + 1;
4378                                 REPL_STAT_INCR(nnext_pp);
4379                         }
4380                         ASSERT(pg_cnt == 0);
4381                 } else {
4382                         break;
4383                 }
4384         }
4385
4386         if (npgs) {
4387                 /*
4388                  * We were unable to allocate the necessary number
4389                  * of pages.
4390                  * We need to free up any pl.
4391                  */
4392                 REPL_STAT_INCR(nnopage);
4393                 page_free_replacement_page(pl);
4394                 return (NULL);
4395         } else {
4396                 return (pl);
4397         }
4398 }
4399
4400 /*
4401  * demote a free large page to its constituent pages
      *
      * pp must be non-NULL, locked, free and have a large page size code
      * (0 < p_szc < mmu_page_sizes); all of this is asserted.  On return
      * pp->p_szc is asserted to be 0.
4402  */
4403 void
4404 page_demote_free_pages(page_t *pp)
4405 {
4406
4407         int mnode;
4408
4409         ASSERT(pp != NULL);
4410         ASSERT(PAGE_LOCKED(pp));
4411         ASSERT(PP_ISFREE(pp));
4412         ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4413
4414         mnode = PP_2_MEM_NODE(pp);
4415         page_freelist_lock(mnode);
             /*
              * p_szc is checked again now that the mnode's freelists are
              * locked.  NOTE(review): presumably it can drop to 0 before the
              * lock is taken -- confirm.
              */
4416         if (pp->p_szc != 0) {
                     /* Demote starting from the base pfn of the large page. */
4417                 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4418                     pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4419         }
4420         page_freelist_unlock(mnode);
4421         ASSERT(pp->p_szc == 0);
4422 }
4423
4424 /*
4425  * Factor in colorequiv to check additional 'equivalent' bins.
4426  * colorequiv may be set in /etc/system
      *
      * Does nothing unless colorequiv > 1.  Otherwise, for every page size
      * with more than one color, a shift count derived from colorequiv is
      * stored, shifted left by 4, into colorequivszc[] -- but only if that
      * raises the existing entry.
4427  */
4428 void
4429 page_set_colorequiv_arr(void)
4430 {
4431         if (colorequiv > 1) {
4432                 int i;
                     /*
                      * NOTE(review): presumably lowbit() returns the 1-based
                      * index of the lowest set bit, so sv_a is log2(colorequiv)
                      * for a power of two -- confirm.
                      */
4433                 uint_t sv_a = lowbit(colorequiv) - 1;
4434
                     /* Clamp so that (a << 4) below stays within bits 4-7. */
4435                 if (sv_a > 15)
4436                         sv_a = 15;
4437
4438                 for (i = 0; i < MMU_PAGE_SIZES; i++) {
4439                         uint_t colors;
4440                         uint_t a = sv_a;
4441
                             /* Nothing to do for sizes with at most one color. */
4442                         if ((colors = hw_page_array[i].hp_colors) <= 1) {
4443                                 continue;
4444                         }
                             /*
                              * Lower the shift until (colors >> a) is nonzero.
                              * colors > 1 here, so this stops at a == 0 at the
                              * latest.
                              */
4445                         while ((colors >> a) == 0)
4446                                 a--;
                             /* Only ever raise the stored value, never lower it. */
4447                         if ((a << 4) > colorequivszc[i]) {
4448                                 colorequivszc[i] = (a << 4);
4449                         }
4450                 }
4451         }
4452 }