1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
26 * Copyright 2012 Joyent, Inc. All rights reserved.
29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
30 /* All Rights Reserved */
33 * Portions of this source code were derived from Berkeley 4.3 BSD
34 * under license from the Regents of the University of California.
39 * This file contains common functions to access and manage the page lists.
40 * Many of these routines originated from platform dependent modules
41 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
42 * a platform independent manner.
44 * vm/vm_dep.h provides for platform specific support.
47 #include <sys/types.h>
48 #include <sys/debug.h>
49 #include <sys/cmn_err.h>
50 #include <sys/systm.h>
51 #include <sys/atomic.h>
52 #include <sys/sysmacros.h>
53 #include <vm/as.h>
54 #include <vm/page.h>
55 #include <vm/seg_kmem.h>
56 #include <vm/seg_vn.h>
57 #include <sys/vmsystm.h>
58 #include <sys/memnode.h>
59 #include <vm/vm_dep.h>
60 #include <sys/lgrp.h>
61 #include <sys/mem_config.h>
62 #include <sys/callb.h>
63 #include <sys/sdt.h>
64 #include <sys/dumphdr.h>
65 #include <sys/swap.h>
67 extern uint_t vac_colors;
69 #define MAX_PRAGMA_ALIGN 128
71 /* vm_cpu_data0 for the boot cpu before kmem is initialized */
73 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
74 #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0)
75 #else
76 #pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0)
77 #endif
78 char vm_cpu_data0[VM_CPU_DATA_PADSIZE];
81  * number of page colors equivalent to requested color in page_get routines.
82 * If set, keeps large pages intact longer and keeps MPO allocation
83 * from the local mnode in favor of acquiring the 'correct' page color from
84 * a demoted large page or from a remote mnode.
86 uint_t colorequiv;
89 * color equivalency mask for each page size.
90 * Mask is computed based on cpu L2$ way sizes and colorequiv global.
91 * High 4 bits determine the number of high order bits of the color to ignore.
92  * Low 4 bits determine the number of low order bits of the color to ignore (it's only
93 * relevant for hashed index based page coloring).
95 uchar_t colorequivszc[MMU_PAGE_SIZES];
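 * An illustrative reading of the encoding above (the value is hypothetical,
 * not taken from any platform): colorequivszc[szc] == 0x21 would mean the
 * high nibble (2) high order color bits and the low nibble (1) low order
 * color bit are ignored when deciding whether two colors are equivalent;
 * the low order part only matters for hashed index based page coloring.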
98 * if set, specifies the percentage of large pages that are free from within
99 * a large page region before attempting to lock those pages for
100 * page_get_contig_pages processing.
102  * Should be turned on when kpr is available, since page_trylock_contig_pages
103  * can then be more selective.
106 int ptcpthreshold;
109 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
110 * Enabled by default via pgcplimitsearch.
112 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
113 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
114 * bound. This upper bound range guarantees:
115 * - all large page 'slots' will be searched over time
116  * - at least one large page candidate is considered on each pgcp call
117 * - count doesn't wrap around to 0
119 pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES];
120 int pgcplimitsearch = 1;
122 #define PGCPFAILMAX (1 << (highbit(physinstalled) - 1))
123 #define SETPGCPFAILCNT(szc) \
124 if (++pgcpfailcnt[szc] >= PGCPFAILMAX) \
125 pgcpfailcnt[szc] = PGCPFAILMAX / 2;
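 * Worked example of the bound above (numbers are hypothetical): with
 * physinstalled == 0x300000 base pages, highbit(physinstalled) == 22, so
 * PGCPFAILMAX == 1 << 21 == 0x200000, which is at least half of installed
 * memory. When pgcpfailcnt[szc] reaches 0x200000 it is reset to 0x100000,
 * so the counter never wraps to 0 and every large page 'slot' is still
 * revisited over time.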
127 #ifdef VM_STATS
128 struct vmm_vmstats_str vmm_vmstats;
130 #endif /* VM_STATS */
132 /* enable page_get_contig_pages */
133 #define LPGCREATE 1
135 int pg_contig_disable;
136 int pg_lpgcreate_nocage = LPGCREATE;
139 * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
141 #define PFNNULL 0
143 /* Flags involved in promotion and demotion routines */
144 #define PC_FREE 0x1 /* put page on freelist */
145 #define PC_ALLOC 0x2 /* return page for allocation */
148 * Flag for page_demote to be used with PC_FREE to denote that we don't care
149 * what the color is as the color parameter to the function is ignored.
151 #define PC_NO_COLOR (-1)
153 /* mtype value for page_promote to use when mtype does not matter */
154 #define PC_MTYPE_ANY (-1)
157 * page counters candidates info
158 * See page_ctrs_cands comment below for more details.
159 * fields are as follows:
160 * pcc_pages_free: # pages which freelist coalesce can create
161 * pcc_color_free: pointer to page free counts per color
163 typedef struct pcc_info {
164 pgcnt_t pcc_pages_free;
165 pgcnt_t *pcc_color_free;
166 uint_t pad[12];
167 } pcc_info_t;
170 * On big machines it can take a long time to check page_counters
171 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
172 * updated sum of all elements of the corresponding page_counters arrays.
173 * page_freelist_coalesce() searches page_counters only if an appropriate
174 * element of page_ctrs_cands array is greater than 0.
176 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
178 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
181 * Return in val the total number of free pages which can be created
182 * for the given mnode (m), mrange (g), and region size (r)
184 #define PGCTRS_CANDS_GETVALUE(m, g, r, val) { \
185 int i; \
186 val = 0; \
187 for (i = 0; i < NPC_MUTEX; i++) { \
188 val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free; \
193 * Return in val the total number of free pages which can be created
194 * for the given mnode (m), mrange (g), region size (r), and color (c)
196 #define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) { \
197 int i; \
198 val = 0; \
199 ASSERT((c) < PAGE_GET_PAGECOLORS(r)); \
200 for (i = 0; i < NPC_MUTEX; i++) { \
201 val += \
202 page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)]; \
207 * We can only allow a single thread to update a counter within the physical
208 * range of the largest supported page size. That is the finest granularity
209 * possible since the counter values are dependent on each other
210  * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
211 * ctr_mutex lock index for a particular physical range.
213 static kmutex_t *ctr_mutex[NPC_MUTEX];
215 #define PP_CTR_LOCK_INDX(pp) \
216 (((pp)->p_pagenum >> \
217 (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
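 * Example of the lock index mapping above (the constants are hypothetical,
 * platform dependent values): if the largest page size spans 2^18 base
 * pages (PAGE_BSZS_SHIFT(mmu_page_sizes - 1) == 18) and NPC_MUTEX == 4,
 * pfns 0 .. 2^18 - 1 map to ctr_mutex index 0, the next 2^18 pfns to
 * index 1, and so on, wrapping modulo NPC_MUTEX. All base pages within
 * one largest-size page therefore always hash to the same ctr_mutex.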
219 #define INVALID_COLOR 0xffffffff
220 #define INVALID_MASK 0xffffffff
223 * Local functions prototypes.
226 void page_ctr_add(int, int, page_t *, int);
227 void page_ctr_add_internal(int, int, page_t *, int);
228 void page_ctr_sub(int, int, page_t *, int);
229 void page_ctr_sub_internal(int, int, page_t *, int);
230 void page_freelist_lock(int);
231 void page_freelist_unlock(int);
232 page_t *page_promote(int, pfn_t, uchar_t, int, int);
233 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
234 page_t *page_freelist_split(uchar_t,
235 uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
236 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
237 static int page_trylock_cons(page_t *pp, se_t se);
240 * The page_counters array below is used to keep track of free contiguous
241 * physical memory. A hw_page_map_t will be allocated per mnode per szc.
242 * This contains an array of counters, the size of the array, a shift value
243 * used to convert a pagenum into a counter array index or vice versa, as
244 * well as a cache of the last successful index to be promoted to a larger
245 * page size. As an optimization, we keep track of the last successful index
246 * to be promoted per page color for the given size region, and this is
247 * allocated dynamically based upon the number of colors for a given
248 * region size.
250 * Conceptually, the page counters are represented as:
252 * page_counters[region_size][mnode]
254 * region_size: size code of a candidate larger page made up
255 * of contiguous free smaller pages.
257 * page_counters[region_size][mnode].hpm_counters[index]:
258 * represents how many (region_size - 1) pages either
259 * exist or can be created within the given index range.
261 * Let's look at a sparc example:
262 * If we want to create a free 512k page, we look at region_size 2
263 * for the mnode we want. We calculate the index and look at a specific
264 * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at
265 * this location, it means that 8 64k pages either exist or can be created
266 * from 8K pages in order to make a single free 512k page at the given
267 * index. Note that when a region is full, it will contribute to the
268 * counts in the region above it. Thus we will not know what page
269 * size the free pages will be which can be promoted to this new free
270 * page unless we look at all regions below the current region.
274 * Note: hpmctr_t is defined in platform vm_dep.h
275 * hw_page_map_t contains all the information needed for the page_counters
276 * logic. The fields are as follows:
278 * hpm_counters: dynamically allocated array to hold counter data
279 * hpm_entries: entries in hpm_counters
280 * hpm_shift: shift for pnum/array index conv
281 * hpm_base: PFN mapped to counter index 0
282 * hpm_color_current: last index in counter array for this color at
283 * which we successfully created a large page
285 typedef struct hw_page_map {
286 hpmctr_t *hpm_counters;
287 size_t hpm_entries;
288 int hpm_shift;
289 pfn_t hpm_base;
290 size_t *hpm_color_current[MAX_MNODE_MRANGES];
291 } hw_page_map_t;
294 * Element zero is not used, but is allocated for convenience.
296 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
299 * Cached value of MNODE_RANGE_CNT(mnode).
300  * This is a function call on x86.
302 static int mnode_nranges[MAX_MEM_NODES];
303 static int mnode_maxmrange[MAX_MEM_NODES];
306 * The following macros are convenient ways to get access to the individual
307 * elements of the page_counters arrays. They can be used on both
308 * the left side and right side of equations.
310 #define PAGE_COUNTERS(mnode, rg_szc, idx) \
311 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
313 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \
314 (page_counters[(rg_szc)][(mnode)].hpm_counters)
316 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \
317 (page_counters[(rg_szc)][(mnode)].hpm_shift)
319 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \
320 (page_counters[(rg_szc)][(mnode)].hpm_entries)
322 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \
323 (page_counters[(rg_szc)][(mnode)].hpm_base)
325 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g) \
326 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
328 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange) \
329 (page_counters[(rg_szc)][(mnode)]. \
330 hpm_color_current[(mrange)][(color)])
332 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \
333 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \
334 PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
336 #define IDX_TO_PNUM(mnode, rg_szc, index) \
337 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \
338 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
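 * Worked example of the two macros above (values are hypothetical): with
 * hpm_base == 0x2000 and hpm_shift == 9 (one counter per 512 base pages),
 * PNUM_TO_IDX maps pfn 0x2a00 to (0x2a00 - 0x2000) >> 9 == 5, and
 * IDX_TO_PNUM maps index 5 back to 0x2000 + (5 << 9) == 0x2a00.
 * page_ctrs_alloc() below asserts this identity for index 0 and for the
 * base pfn of each region size.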
341 * Protects the hpm_counters and hpm_color_current memory from changing while
342 * looking at page counters information.
343 * Grab the write lock to modify what these fields point at.
344 * Grab the read lock to prevent any pointers from changing.
345  * The write lock cannot be held during memory allocation due to a possible
346 * recursion deadlock with trying to grab the read lock while the
347 * write lock is already held.
349 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
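 * Typical use of the lock discipline described above: readers such as
 * page_freelist_coalesce() take rw_enter(&page_ctrs_rwlock[mnode],
 * RW_READER) before walking hpm_counters, while page_ctrs_adjust() takes
 * the write side (through the platform's PAGE_CTRS_WRITE_LOCK() macro)
 * only after preallocating all the memory it needs.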
353 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
355 void
356 cpu_vm_data_init(struct cpu *cp)
358 if (cp == CPU0) {
359 cp->cpu_vm_data = (void *)&vm_cpu_data0;
360 } else {
361 void *kmptr;
362 int align;
363 size_t sz;
365 align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
366 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
367 kmptr = kmem_zalloc(sz, KM_SLEEP);
368 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
369 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
370 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
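 * Sketch of the alignment trick above (sizes are illustrative): with
 * align == 64 and sizeof (vm_cpu_data_t) == 200, sz becomes
 * P2ROUNDUP(200, 64) + 64 == 320 bytes. kmem_zalloc() may return a
 * pointer that is not 64-byte aligned; P2ROUNDUP() advances it to the
 * next 64-byte boundary and the extra 'align' bytes in sz guarantee the
 * aligned structure still fits inside the allocation. The original
 * pointer and size are saved in vc_kmptr/vc_kmsize so that
 * cpu_vm_data_destroy() can free the buffer later.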
375 * free cpu_vm_data
377 void
378 cpu_vm_data_destroy(struct cpu *cp)
380 if (cp->cpu_seqid && cp->cpu_vm_data) {
381 ASSERT(cp != CPU0);
382 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
383 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
385 cp->cpu_vm_data = NULL;
390 * page size to page size code
393 page_szc(size_t pagesize)
395 int i = 0;
397 while (hw_page_array[i].hp_size) {
398 if (pagesize == hw_page_array[i].hp_size)
399 return (i);
400 i++;
402 return (-1);
406 * page size to page size code with the restriction that it be a supported
407 * user page size. If it's not a supported user page size, -1 will be returned.
410 page_szc_user_filtered(size_t pagesize)
412 int szc = page_szc(pagesize);
413 if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
414 return (szc);
416 return (-1);
420 * Return how many page sizes are available for the user to use. This is
421 * what the hardware supports and not based upon how the OS implements the
422 * support of different page sizes.
424 * If legacy is non-zero, return the number of pagesizes available to legacy
425 * applications. The number of legacy page sizes might be less than the
426 * exported user page sizes. This is to prevent legacy applications that
427  * use the largest page size returned from getpagesizes(3c) from inadvertently
428 * using the 'new' large pagesizes.
430 uint_t
431 page_num_user_pagesizes(int legacy)
433 if (legacy)
434 return (mmu_legacy_page_sizes);
435 return (mmu_exported_page_sizes);
438 uint_t
439 page_num_pagesizes(void)
441 return (mmu_page_sizes);
445  * returns the number of base pagesize pages associated with szc
447 pgcnt_t
448 page_get_pagecnt(uint_t szc)
450 if (szc >= mmu_page_sizes)
451 panic("page_get_pagecnt: out of range %d", szc);
452 return (hw_page_array[szc].hp_pgcnt);
455 size_t
456 page_get_pagesize(uint_t szc)
458 if (szc >= mmu_page_sizes)
459 panic("page_get_pagesize: out of range %d", szc);
460 return (hw_page_array[szc].hp_size);
464 * Return the size of a page based upon the index passed in. An index of
465 * zero refers to the smallest page size in the system, and as index increases
466 * it refers to the next larger supported page size in the system.
467 * Note that szc and userszc may not be the same due to unsupported szc's on
468 * some systems.
470 size_t
471 page_get_user_pagesize(uint_t userszc)
473 uint_t szc = USERSZC_2_SZC(userszc);
475 if (szc >= mmu_page_sizes)
476 panic("page_get_user_pagesize: out of range %d", szc);
477 return (hw_page_array[szc].hp_size);
480 uint_t
481 page_get_shift(uint_t szc)
483 if (szc >= mmu_page_sizes)
484 panic("page_get_shift: out of range %d", szc);
485 return (PAGE_GET_SHIFT(szc));
488 uint_t
489 page_get_pagecolors(uint_t szc)
491 if (szc >= mmu_page_sizes)
492 panic("page_get_pagecolors: out of range %d", szc);
493 return (PAGE_GET_PAGECOLORS(szc));
497 * this assigns the desired equivalent color after a split
499 uint_t
500 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
501 uint_t ncolor, uint_t ceq_mask)
503 ASSERT(nszc > szc);
504 ASSERT(szc < mmu_page_sizes);
505 ASSERT(color < PAGE_GET_PAGECOLORS(szc));
506 ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
508 color &= ceq_mask;
509 ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
510 return (color | (ncolor & ~ceq_mask));
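 * Worked example of the color merge above (values are hypothetical): with
 * ceq_mask == 0x0f, color == 0x12 and a converted ncolor of 0x35, the
 * equivalent bits 0x02 are kept from the requested color and the
 * non-equivalent bits 0x30 are taken from the parent page's color,
 * giving a result of 0x32.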
514 * The interleaved_mnodes flag is set when mnodes overlap in
515 * the physbase..physmax range, but have disjoint slices.
516 * In this case hpm_counters is shared by all mnodes.
517 * This flag is set dynamically by the platform.
519 int interleaved_mnodes = 0;
522 * Called by startup().
523 * Size up the per page size free list counters based on physmax
524 * of each node and max_mem_nodes.
526 * If interleaved_mnodes is set we need to find the first mnode that
527 * exists. hpm_counters for the first mnode will then be shared by
528 * all other mnodes. If interleaved_mnodes is not set, just set
529 * first=mnode each time. That means there will be no sharing.
531 size_t
532 page_ctrs_sz(void)
534 int r; /* region size */
535 int mnode;
536 int firstmn; /* first mnode that exists */
537 int nranges;
538 pfn_t physbase;
539 pfn_t physmax;
540 uint_t ctrs_sz = 0;
541 int i;
542 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
545 * We need to determine how many page colors there are for each
546 * page size in order to allocate memory for any color specific
547 * arrays.
549 for (i = 0; i < mmu_page_sizes; i++) {
550 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
553 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
555 pgcnt_t r_pgcnt;
556 pfn_t r_base;
557 pgcnt_t r_align;
559 if (mem_node_config[mnode].exists == 0)
560 continue;
562 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
563 nranges = MNODE_RANGE_CNT(mnode);
564 mnode_nranges[mnode] = nranges;
565 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
568 * determine size needed for page counter arrays with
569 * base aligned to large page size.
571 for (r = 1; r < mmu_page_sizes; r++) {
572 /* add in space for hpm_color_current */
573 ctrs_sz += sizeof (size_t) *
574 colors_per_szc[r] * nranges;
576 if (firstmn != mnode)
577 continue;
579 /* add in space for hpm_counters */
580 r_align = page_get_pagecnt(r);
581 r_base = physbase;
582 r_base &= ~(r_align - 1);
583 r_pgcnt = howmany(physmax - r_base + 1, r_align);
586 * Round up to always allocate on pointer sized
587 * boundaries.
589 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
590 sizeof (hpmctr_t *));
594 for (r = 1; r < mmu_page_sizes; r++) {
595 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
598 /* add in space for page_ctrs_cands and pcc_color_free */
599 ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
600 mmu_page_sizes * NPC_MUTEX;
602 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
604 if (mem_node_config[mnode].exists == 0)
605 continue;
607 nranges = mnode_nranges[mnode];
608 ctrs_sz += sizeof (pcc_info_t) * nranges *
609 mmu_page_sizes * NPC_MUTEX;
610 for (r = 1; r < mmu_page_sizes; r++) {
611 ctrs_sz += sizeof (pgcnt_t) * nranges *
612 colors_per_szc[r] * NPC_MUTEX;
616 /* ctr_mutex */
617 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
619 /* size for page list counts */
620 PLCNT_SZ(ctrs_sz);
623  * add some slop for roundups. page_ctrs_alloc will round up the start
624 * address of the counters to ecache_alignsize boundary for every
625 * memory node.
627 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
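 * Example of the hpm_counters sizing above (layout is hypothetical): for a
 * region size r spanning 512 base pages (r_align == 512), an mnode with
 * physbase == 0x2010 and physmax == 0x7fff yields r_base == 0x2000 and
 * r_pgcnt == howmany(0x6000, 512) == 48 counters, which is then rounded
 * up to a pointer sized boundary before being added to ctrs_sz.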
630 caddr_t
631 page_ctrs_alloc(caddr_t alloc_base)
633 int mnode;
634 int mrange, nranges;
635 int r; /* region size */
636 int i;
637 int firstmn; /* first mnode that exists */
638 pfn_t physbase;
639 pfn_t physmax;
640 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
643 * We need to determine how many page colors there are for each
644 * page size in order to allocate memory for any color specific
645 * arrays.
647 for (i = 0; i < mmu_page_sizes; i++) {
648 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
651 for (r = 1; r < mmu_page_sizes; r++) {
652 page_counters[r] = (hw_page_map_t *)alloc_base;
653 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
656 /* page_ctrs_cands and pcc_color_free array */
657 for (i = 0; i < NPC_MUTEX; i++) {
658 for (r = 1; r < mmu_page_sizes; r++) {
660 page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
661 alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
663 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
664 pcc_info_t *pi;
666 if (mem_node_config[mnode].exists == 0)
667 continue;
669 nranges = mnode_nranges[mnode];
671 pi = (pcc_info_t *)alloc_base;
672 alloc_base += sizeof (pcc_info_t) * nranges;
673 page_ctrs_cands[i][r][mnode] = pi;
675 for (mrange = 0; mrange < nranges; mrange++) {
676 pi->pcc_color_free =
677 (pgcnt_t *)alloc_base;
678 alloc_base += sizeof (pgcnt_t) *
679 colors_per_szc[r];
680 pi++;
686 /* ctr_mutex */
687 for (i = 0; i < NPC_MUTEX; i++) {
688 ctr_mutex[i] = (kmutex_t *)alloc_base;
689 alloc_base += (max_mem_nodes * sizeof (kmutex_t));
692 /* initialize page list counts */
693 PLCNT_INIT(alloc_base);
695 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
697 pgcnt_t r_pgcnt;
698 pfn_t r_base;
699 pgcnt_t r_align;
700 int r_shift;
701 int nranges = mnode_nranges[mnode];
703 if (mem_node_config[mnode].exists == 0)
704 continue;
706 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
708 for (r = 1; r < mmu_page_sizes; r++) {
710 * the page_counters base has to be aligned to the
711 * page count of page size code r otherwise the counts
712 * will cross large page boundaries.
714 r_align = page_get_pagecnt(r);
715 r_base = physbase;
716 /* base needs to be aligned - lower to aligned value */
717 r_base &= ~(r_align - 1);
718 r_pgcnt = howmany(physmax - r_base + 1, r_align);
719 r_shift = PAGE_BSZS_SHIFT(r);
721 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
722 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
723 PAGE_COUNTERS_BASE(mnode, r) = r_base;
724 for (mrange = 0; mrange < nranges; mrange++) {
725 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
726 r, mrange) = (size_t *)alloc_base;
727 alloc_base += sizeof (size_t) *
728 colors_per_szc[r];
730 for (i = 0; i < colors_per_szc[r]; i++) {
731 uint_t color_mask = colors_per_szc[r] - 1;
732 pfn_t pfnum = r_base;
733 size_t idx;
734 int mrange;
735 MEM_NODE_ITERATOR_DECL(it);
737 MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
738 if (pfnum == (pfn_t)-1) {
739 idx = 0;
740 } else {
741 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
742 color_mask, color_mask, &it);
743 idx = PNUM_TO_IDX(mnode, r, pfnum);
744 idx = (idx >= r_pgcnt) ? 0 : idx;
746 for (mrange = 0; mrange < nranges; mrange++) {
747 PAGE_COUNTERS_CURRENT_COLOR(mnode,
748 r, i, mrange) = idx;
752 /* hpm_counters may be shared by all mnodes */
753 if (firstmn == mnode) {
754 PAGE_COUNTERS_COUNTERS(mnode, r) =
755 (hpmctr_t *)alloc_base;
756 alloc_base +=
757 P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
758 sizeof (hpmctr_t *));
759 } else {
760 PAGE_COUNTERS_COUNTERS(mnode, r) =
761 PAGE_COUNTERS_COUNTERS(firstmn, r);
765 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
766 * satisfy the identity requirement.
767 * We should be able to go from one to the other
768 * and get consistent values.
770 ASSERT(PNUM_TO_IDX(mnode, r,
771 (IDX_TO_PNUM(mnode, r, 0))) == 0);
772 ASSERT(IDX_TO_PNUM(mnode, r,
773 (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
776  * Round up the start address of the page_counters to
777 * cache aligned boundary for every memory node.
778 * page_ctrs_sz() has added some slop for these roundups.
780 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
781 L2CACHE_ALIGN);
784 /* Initialize other page counter specific data structures. */
785 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
786 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
789 return (alloc_base);
793 * Functions to adjust region counters for each size free list.
794  * Caller is responsible for acquiring the ctr_mutex lock if necessary and
795 * thus can be called during startup without locks.
797 /* ARGSUSED */
798 void
799 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
801 ssize_t r; /* region size */
802 ssize_t idx;
803 pfn_t pfnum;
804 int lckidx;
806 ASSERT(mnode == PP_2_MEM_NODE(pp));
807 ASSERT(mtype == PP_2_MTYPE(pp));
809 ASSERT(pp->p_szc < mmu_page_sizes);
811 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
813 /* no counter update needed for largest page size */
814 if (pp->p_szc >= mmu_page_sizes - 1) {
815 return;
818 r = pp->p_szc + 1;
819 pfnum = pp->p_pagenum;
820 lckidx = PP_CTR_LOCK_INDX(pp);
823 * Increment the count of free pages for the current
824 * region. Continue looping up in region size incrementing
825  * count if the preceding region is full.
827 while (r < mmu_page_sizes) {
828 idx = PNUM_TO_IDX(mnode, r, pfnum);
830 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
831 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
833 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
834 break;
835 } else {
836 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
837 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
838 [MTYPE_2_MRANGE(mnode, root_mtype)];
840 cand->pcc_pages_free++;
841 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
843 r++;
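 * Example of the upward propagation above, using the sparc numbers from
 * the page_counters comment (FULL_REGION_CNT of 8): freeing the 8K page
 * that brings a 64K region's counter to 8 records a new 64K coalesce
 * candidate in page_ctrs_cands and continues to increment the enclosing
 * 512K region's counter; the loop stops at the first region whose counter
 * is still not full.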
847 void
848 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
850 int lckidx = PP_CTR_LOCK_INDX(pp);
851 kmutex_t *lock = &ctr_mutex[lckidx][mnode];
853 mutex_enter(lock);
854 page_ctr_add_internal(mnode, mtype, pp, flags);
855 mutex_exit(lock);
858 void
859 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
861 int lckidx;
862 ssize_t r; /* region size */
863 ssize_t idx;
864 pfn_t pfnum;
866 ASSERT(mnode == PP_2_MEM_NODE(pp));
867 ASSERT(mtype == PP_2_MTYPE(pp));
869 ASSERT(pp->p_szc < mmu_page_sizes);
871 PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
873 /* no counter update needed for largest page size */
874 if (pp->p_szc >= mmu_page_sizes - 1) {
875 return;
878 r = pp->p_szc + 1;
879 pfnum = pp->p_pagenum;
880 lckidx = PP_CTR_LOCK_INDX(pp);
883 * Decrement the count of free pages for the current
884 * region. Continue looping up in region size decrementing
885  * count if the preceding region was full.
887 while (r < mmu_page_sizes) {
888 idx = PNUM_TO_IDX(mnode, r, pfnum);
890 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
891 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
893 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
894 break;
895 } else {
896 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
897 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
898 [MTYPE_2_MRANGE(mnode, root_mtype)];
900 ASSERT(cand->pcc_pages_free != 0);
901 ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
903 cand->pcc_pages_free--;
904 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
906 r++;
910 void
911 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
913 int lckidx = PP_CTR_LOCK_INDX(pp);
914 kmutex_t *lock = &ctr_mutex[lckidx][mnode];
916 mutex_enter(lock);
917 page_ctr_sub_internal(mnode, mtype, pp, flags);
918 mutex_exit(lock);
922 * Adjust page counters following a memory attach, since typically the
923 * size of the array needs to change, and the PFN to counter index
924 * mapping needs to change.
926 * It is possible this mnode did not exist at startup. In that case
927 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
928 * to change (a theoretical possibility on x86), which means pcc_color_free
929 * arrays must be extended.
931 uint_t
932 page_ctrs_adjust(int mnode)
934 pgcnt_t npgs;
935 int r; /* region size */
936 int i;
937 size_t pcsz, old_csz;
938 hpmctr_t *new_ctr, *old_ctr;
939 pfn_t oldbase, newbase;
940 pfn_t physbase, physmax;
941 size_t old_npgs;
942 hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
943 size_t size_cache[MMU_PAGE_SIZES];
944 size_t *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
945 size_t *old_color_array[MAX_MNODE_MRANGES];
946 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
947 pcc_info_t **cands_cache;
948 pcc_info_t *old_pi, *pi;
949 pgcnt_t *pgcntp;
950 int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
951 int cands_cache_nranges;
952 int old_maxmrange, new_maxmrange;
953 int rc = 0;
954 int oldmnode;
956 cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
957 MMU_PAGE_SIZES, KM_NOSLEEP);
958 if (cands_cache == NULL)
959 return (ENOMEM);
961 i = -1;
962 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
964 newbase = physbase & ~PC_BASE_ALIGN_MASK;
965 npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
967 /* prepare to free non-null pointers on the way out */
968 cands_cache_nranges = nranges;
969 bzero(ctr_cache, sizeof (ctr_cache));
970 bzero(color_cache, sizeof (color_cache));
973 * We need to determine how many page colors there are for each
974 * page size in order to allocate memory for any color specific
975 * arrays.
977 for (r = 0; r < mmu_page_sizes; r++) {
978 colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
982 * Preallocate all of the new hpm_counters arrays as we can't
983 * hold the page_ctrs_rwlock as a writer and allocate memory.
984 * If we can't allocate all of the arrays, undo our work so far
985 * and return failure.
987 for (r = 1; r < mmu_page_sizes; r++) {
988 pcsz = npgs >> PAGE_BSZS_SHIFT(r);
989 size_cache[r] = pcsz;
990 ctr_cache[r] = kmem_zalloc(pcsz *
991 sizeof (hpmctr_t), KM_NOSLEEP);
992 if (ctr_cache[r] == NULL) {
993 rc = ENOMEM;
994 goto cleanup;
999 * Preallocate all of the new color current arrays as we can't
1000 * hold the page_ctrs_rwlock as a writer and allocate memory.
1001 * If we can't allocate all of the arrays, undo our work so far
1002 * and return failure.
1004 for (r = 1; r < mmu_page_sizes; r++) {
1005 for (mrange = 0; mrange < nranges; mrange++) {
1006 color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
1007 colors_per_szc[r], KM_NOSLEEP);
1008 if (color_cache[r][mrange] == NULL) {
1009 rc = ENOMEM;
1010 goto cleanup;
1016 * Preallocate all of the new pcc_info_t arrays as we can't
1017 * hold the page_ctrs_rwlock as a writer and allocate memory.
1018 * If we can't allocate all of the arrays, undo our work so far
1019 * and return failure.
1021 for (r = 1; r < mmu_page_sizes; r++) {
1022 for (i = 0; i < NPC_MUTEX; i++) {
1023 pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
1024 KM_NOSLEEP);
1025 if (pi == NULL) {
1026 rc = ENOMEM;
1027 goto cleanup;
1029 cands_cache[i * MMU_PAGE_SIZES + r] = pi;
1031 for (mrange = 0; mrange < nranges; mrange++, pi++) {
1032 pgcntp = kmem_zalloc(colors_per_szc[r] *
1033 sizeof (pgcnt_t), KM_NOSLEEP);
1034 if (pgcntp == NULL) {
1035 rc = ENOMEM;
1036 goto cleanup;
1038 pi->pcc_color_free = pgcntp;
1044 * Grab the write lock to prevent others from walking these arrays
1045 * while we are modifying them.
1047 PAGE_CTRS_WRITE_LOCK(mnode);
1050 * For interleaved mnodes, find the first mnode
1051 * with valid page counters since the current
1052 * mnode may have just been added and not have
1053 * valid page counters.
1055 if (interleaved_mnodes) {
1056 for (i = 0; i < max_mem_nodes; i++)
1057 if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
1058 break;
1059 ASSERT(i < max_mem_nodes);
1060 oldmnode = i;
1061 } else
1062 oldmnode = mnode;
1064 old_nranges = mnode_nranges[mnode];
1065 cands_cache_nranges = old_nranges;
1066 mnode_nranges[mnode] = nranges;
1067 old_maxmrange = mnode_maxmrange[mnode];
1068 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
1069 new_maxmrange = mnode_maxmrange[mnode];
1071 for (r = 1; r < mmu_page_sizes; r++) {
1072 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
1073 old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
1074 old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
1075 oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
1076 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
1077 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1078 old_color_array[mrange] =
1079 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
1080 r, mrange);
1083 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
1084 new_ctr = ctr_cache[r];
1085 ctr_cache[r] = NULL;
1086 if (old_ctr != NULL &&
1087 (oldbase + old_npgs > newbase) &&
1088 (newbase + npgs > oldbase)) {
1090 * Map the intersection of the old and new
1091 * counters into the new array.
1093 size_t offset;
1094 if (newbase > oldbase) {
1095 offset = (newbase - oldbase) >>
1096 PAGE_COUNTERS_SHIFT(mnode, r);
1097 bcopy(old_ctr + offset, new_ctr,
1098 MIN(pcsz, (old_csz - offset)) *
1099 sizeof (hpmctr_t));
1100 } else {
1101 offset = (oldbase - newbase) >>
1102 PAGE_COUNTERS_SHIFT(mnode, r);
1103 bcopy(old_ctr, new_ctr + offset,
1104 MIN(pcsz - offset, old_csz) *
1105 sizeof (hpmctr_t));
1109 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
1110 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
1111 PAGE_COUNTERS_BASE(mnode, r) = newbase;
1113 /* update shared hpm_counters in other mnodes */
1114 if (interleaved_mnodes) {
1115 for (i = 0; i < max_mem_nodes; i++) {
1116 if ((i == mnode) ||
1117 (mem_node_config[i].exists == 0))
1118 continue;
1119 ASSERT(
1120 PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
1121 PAGE_COUNTERS_COUNTERS(i, r) == NULL);
1122 PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
1123 PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
1124 PAGE_COUNTERS_BASE(i, r) = newbase;
1128 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1129 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
1130 color_cache[r][mrange];
1131 color_cache[r][mrange] = NULL;
1134 * for now, just reset on these events as it's probably
1135 * not worthwhile to try and optimize this.
1137 for (i = 0; i < colors_per_szc[r]; i++) {
1138 uint_t color_mask = colors_per_szc[r] - 1;
1139 int mlo = interleaved_mnodes ? 0 : mnode;
1140 int mhi = interleaved_mnodes ? max_mem_nodes :
1141 (mnode + 1);
1142 int m;
1143 pfn_t pfnum;
1144 size_t idx;
1145 MEM_NODE_ITERATOR_DECL(it);
1147 for (m = mlo; m < mhi; m++) {
1148 if (mem_node_config[m].exists == 0)
1149 continue;
1150 pfnum = newbase;
1151 MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
1152 if (pfnum == (pfn_t)-1) {
1153 idx = 0;
1154 } else {
1155 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
1156 color_mask, color_mask, &it);
1157 idx = PNUM_TO_IDX(m, r, pfnum);
1158 idx = (idx < pcsz) ? idx : 0;
1160 for (mrange = 0; mrange < nranges; mrange++) {
1161 if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
1162 r, mrange) != NULL)
1163 PAGE_COUNTERS_CURRENT_COLOR(m,
1164 r, i, mrange) = idx;
1169 /* cache info for freeing out of the critical path */
1170 if ((caddr_t)old_ctr >= kernelheap &&
1171 (caddr_t)old_ctr < ekernelheap) {
1172 ctr_cache[r] = old_ctr;
1173 size_cache[r] = old_csz;
1175 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1176 size_t *tmp = old_color_array[mrange];
1177 if ((caddr_t)tmp >= kernelheap &&
1178 (caddr_t)tmp < ekernelheap) {
1179 color_cache[r][mrange] = tmp;
1183 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
1184 * satisfy the identity requirement.
1185 * We should be able to go from one to the other
1186 * and get consistent values.
1188 ASSERT(PNUM_TO_IDX(mnode, r,
1189 (IDX_TO_PNUM(mnode, r, 0))) == 0);
1190 ASSERT(IDX_TO_PNUM(mnode, r,
1191 (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
1193 /* pcc_info_t and pcc_color_free */
1194 for (i = 0; i < NPC_MUTEX; i++) {
1195 pcc_info_t *epi;
1196 pcc_info_t *eold_pi;
1198 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1199 old_pi = page_ctrs_cands[i][r][mnode];
1200 page_ctrs_cands[i][r][mnode] = pi;
1201 cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;
1203 /* preserve old pcc_color_free values, if any */
1204 if (old_pi == NULL)
1205 continue;
1208 * when/if x86 does DR, must account for
1209 * possible change in range index when
1210 * preserving pcc_info
1212 epi = &pi[nranges];
1213 eold_pi = &old_pi[old_nranges];
1214 if (new_maxmrange > old_maxmrange) {
1215 pi += new_maxmrange - old_maxmrange;
1216 } else if (new_maxmrange < old_maxmrange) {
1217 old_pi += old_maxmrange - new_maxmrange;
1219 for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
1220 pcc_info_t tmp = *pi;
1221 *pi = *old_pi;
1222 *old_pi = tmp;
1226 PAGE_CTRS_WRITE_UNLOCK(mnode);
1229 * Now that we have dropped the write lock, it is safe to free all
1230 * of the memory we have cached above.
1231  * We come through here to free memory when pre-alloc fails, and also to
1232 * free old pointers which were recorded while locked.
1234 cleanup:
1235 for (r = 1; r < mmu_page_sizes; r++) {
1236 if (ctr_cache[r] != NULL) {
1237 kmem_free(ctr_cache[r],
1238 size_cache[r] * sizeof (hpmctr_t));
1240 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1241 if (color_cache[r][mrange] != NULL) {
1242 kmem_free(color_cache[r][mrange],
1243 colors_per_szc[r] * sizeof (size_t));
1246 for (i = 0; i < NPC_MUTEX; i++) {
1247 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1248 if (pi == NULL)
1249 continue;
1250 nr = cands_cache_nranges;
1251 for (mrange = 0; mrange < nr; mrange++, pi++) {
1252 pgcntp = pi->pcc_color_free;
1253 if (pgcntp == NULL)
1254 continue;
1255 if ((caddr_t)pgcntp >= kernelheap &&
1256 (caddr_t)pgcntp < ekernelheap) {
1257 kmem_free(pgcntp,
1258 colors_per_szc[r] *
1259 sizeof (pgcnt_t));
1262 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1263 if ((caddr_t)pi >= kernelheap &&
1264 (caddr_t)pi < ekernelheap) {
1265 kmem_free(pi, nr * sizeof (pcc_info_t));
1270 kmem_free(cands_cache,
1271 sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
1272 return (rc);
1276  * Clean up the hpm_counters field in the page counters
1277 * array.
1279 void
1280 page_ctrs_cleanup(void)
1282 int r; /* region size */
1283 int i; /* mnode index */
1286 * Get the page counters write lock while we are
1287 * setting the page hpm_counters field to NULL
1288 * for non-existent mnodes.
1290 for (i = 0; i < max_mem_nodes; i++) {
1291 PAGE_CTRS_WRITE_LOCK(i);
1292 if (mem_node_config[i].exists) {
1293 PAGE_CTRS_WRITE_UNLOCK(i);
1294 continue;
1296 for (r = 1; r < mmu_page_sizes; r++) {
1297 PAGE_COUNTERS_COUNTERS(i, r) = NULL;
1299 PAGE_CTRS_WRITE_UNLOCK(i);
1303 #ifdef DEBUG
1306 * confirm pp is a large page corresponding to szc
1308 void
1309 chk_lpg(page_t *pp, uchar_t szc)
1311 spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1312 uint_t noreloc;
1314 if (npgs == 1) {
1315 ASSERT(pp->p_szc == 0);
1316 ASSERT(pp->p_next == pp);
1317 ASSERT(pp->p_prev == pp);
1318 return;
1321 ASSERT(pp->p_list.largepg.next == pp || pp->p_list.largepg.next == NULL);
1322 ASSERT(pp->p_list.largepg.prev == pp || pp->p_list.largepg.prev == NULL);
1324 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1325 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1326 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1327 ASSERT(pp->p_prev == (pp + (npgs - 1)));
1330 * Check list of pages.
1332 noreloc = PP_ISNORELOC(pp);
1333 while (npgs--) {
1334 if (npgs != 0) {
1335 ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1336 ASSERT(pp->p_next == (pp + 1));
1338 ASSERT(pp->p_szc == szc);
1339 ASSERT(PP_ISFREE(pp));
1340 ASSERT(PP_ISAGED(pp));
1341 ASSERT(pp->p_list.largepg.next == pp || pp->p_list.largepg.next == NULL);
1342 ASSERT(pp->p_list.largepg.prev == pp || pp->p_list.largepg.prev == NULL);
1343 VERIFY(pp->p_object == NULL);
1344 ASSERT(pp->p_vnode == NULL);
1345 ASSERT(PP_ISNORELOC(pp) == noreloc);
1347 pp = pp->p_next;
1350 #endif /* DEBUG */
1352 void
1353 page_freelist_lock(int mnode)
1355 int i;
1356 for (i = 0; i < NPC_MUTEX; i++) {
1357 mutex_enter(FPC_MUTEX(mnode, i));
1358 mutex_enter(CPC_MUTEX(mnode, i));
1362 void
1363 page_freelist_unlock(int mnode)
1365 int i;
1366 for (i = 0; i < NPC_MUTEX; i++) {
1367 mutex_exit(FPC_MUTEX(mnode, i));
1368 mutex_exit(CPC_MUTEX(mnode, i));
1373 * add pp to the specified page list. Defaults to head of the page list
1374 * unless PG_LIST_TAIL is specified.
1376 void
1377 page_list_add(page_t *pp, int flags)
1379 page_t **ppp;
1380 kmutex_t *pcm;
1381 uint_t bin, mtype;
1382 int mnode;
1384 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1385 ASSERT(PP_ISFREE(pp));
1386 ASSERT(!hat_page_is_mapped(pp));
1387 ASSERT(hat_page_getshare(pp) == 0);
1390 * Large pages should be freed via page_list_add_pages().
1392 ASSERT(pp->p_szc == 0);
1395 * Don't need to lock the freelist first here
1396 * because the page isn't on the freelist yet.
1397 * This means p_szc can't change on us.
1400 bin = PP_2_BIN(pp);
1401 mnode = PP_2_MEM_NODE(pp);
1402 mtype = PP_2_MTYPE(pp);
1404 if (flags & PG_LIST_ISINIT) {
1406  * PG_LIST_ISINIT is set during system startup (i.e., single
1407  * threaded); add the page to the free list and to the
1408  * free region counters w/o any locking
1410 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1412 /* inline version of page_add() */
1413 if (*ppp != NULL) {
1414 pp->p_next = *ppp;
1415 pp->p_prev = (*ppp)->p_prev;
1416 (*ppp)->p_prev = pp;
1417 pp->p_prev->p_next = pp;
1418 } else
1419 *ppp = pp;
1421 page_ctr_add_internal(mnode, mtype, pp, flags);
1422 VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1423 } else {
1424 pcm = PC_BIN_MUTEX(mnode, bin, flags);
1426 if (flags & PG_FREE_LIST) {
1427 VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1428 ASSERT(PP_ISAGED(pp));
1429 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1431 } else {
1432 VM_STAT_ADD(vmm_vmstats.pladd_cache);
1433 VERIFY(pp->p_object);
1434 ASSERT(pp->p_vnode);
1435 ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1436 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1438 mutex_enter(pcm);
1439 page_add(ppp, pp);
1441 if (flags & PG_LIST_TAIL)
1442 *ppp = (*ppp)->p_next;
1444 * Add counters before releasing pcm mutex to avoid a race with
1445 * page_freelist_coalesce and page_freelist_split.
1447 page_ctr_add(mnode, mtype, pp, flags);
1448 mutex_exit(pcm);
1453 * It is up to the caller to unlock the page!
1455 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1460 /* ARGSUSED */
1461 void
1462 page_list_noreloc_startup(page_t *pp)
1464 panic("page_list_noreloc_startup: should be here only for sparc");
1467 void
1468 page_list_add_pages(page_t *pp, int flags)
1470 kmutex_t *pcm;
1471 pgcnt_t pgcnt;
1472 uint_t bin, mtype, i;
1473 int mnode;
1475 /* default to freelist/head */
1476 ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1478 CHK_LPG(pp, pp->p_szc);
1479 VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1481 bin = PP_2_BIN(pp);
1482 mnode = PP_2_MEM_NODE(pp);
1483 mtype = PP_2_MTYPE(pp);
1485 if (flags & PG_LIST_ISINIT) {
1486 ASSERT(pp->p_szc == mmu_page_sizes - 1);
1487 page_lpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1488 ASSERT(!PP_ISNORELOC(pp));
1489 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1490 } else {
1492 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1494 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1496 mutex_enter(pcm);
1497 page_lpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1498 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1499 mutex_exit(pcm);
1501 pgcnt = page_get_pagecnt(pp->p_szc);
1502 for (i = 0; i < pgcnt; i++, pp++)
1503 page_unlock_nocapture(pp);
1508 * During boot, need to demote a large page to base
1509 * pagesize pages for seg_kmem for use in boot_alloc()
1511 void
1512 page_boot_demote(page_t *pp)
1514 ASSERT(pp->p_szc != 0);
1515 ASSERT(PP_ISFREE(pp));
1516 ASSERT(PP_ISAGED(pp));
1518 (void) page_demote(PP_2_MEM_NODE(pp),
1519 PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
1520 PC_FREE);
1522 ASSERT(PP_ISFREE(pp));
1523 ASSERT(PP_ISAGED(pp));
1524 ASSERT(pp->p_szc == 0);
1528 * Take a particular page off of whatever freelist the page
1529 * is claimed to be on.
1531 * NOTE: Only used for PAGESIZE pages.
1533 void
1534 page_list_sub(page_t *pp, int flags)
1536 int bin;
1537 uint_t mtype;
1538 int mnode;
1539 kmutex_t *pcm;
1540 page_t **ppp;
1542 ASSERT(PAGE_EXCL(pp));
1543 ASSERT(PP_ISFREE(pp));
1546 * The p_szc field can only be changed by page_promote()
1547 * and page_demote(). Only free pages can be promoted and
1548 * demoted and the free list MUST be locked during these
1549 * operations. So to prevent a race in page_list_sub()
1550 * between computing which bin of the freelist lock to
1551  * grab and actually grabbing the lock, we check again that
1552 * the bin we locked is still the correct one. Notice that
1553 * the p_szc field could have actually changed on us but
1554 * if the bin happens to still be the same we are safe.
1556 try_again:
1557 bin = PP_2_BIN(pp);
1558 mnode = PP_2_MEM_NODE(pp);
1559 pcm = PC_BIN_MUTEX(mnode, bin, flags);
1560 mutex_enter(pcm);
1561 if (PP_2_BIN(pp) != bin) {
1562 mutex_exit(pcm);
1563 goto try_again;
1565 mtype = PP_2_MTYPE(pp);
1567 if (flags & PG_FREE_LIST) {
1568 VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1569 ASSERT(PP_ISAGED(pp));
1570 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1571 } else {
1572 VM_STAT_ADD(vmm_vmstats.plsub_cache);
1573 ASSERT(!PP_ISAGED(pp));
1574 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1578 * Common PAGESIZE case.
1580 * Note that we locked the freelist. This prevents
1581 * any page promotion/demotion operations. Therefore
1582 * the p_szc will not change until we drop pcm mutex.
1584 if (pp->p_szc == 0) {
1585 page_sub(ppp, pp);
1587 * Subtract counters before releasing pcm mutex
1588 * to avoid race with page_freelist_coalesce.
1590 page_ctr_sub(mnode, mtype, pp, flags);
1591 mutex_exit(pcm);
1593 return;
1597 * Large pages on the cache list are not supported.
1599 if (flags & PG_CACHE_LIST)
1600 panic("page_list_sub: large page on cachelist");
1603 * Slow but rare.
1605 * Somebody wants this particular page which is part
1606 * of a large page. In this case we just demote the page
1607 * if it's on the freelist.
1609 * We have to drop pcm before locking the entire freelist.
1610 * Once we have re-locked the freelist check to make sure
1611 * the page hasn't already been demoted or completely
1612 * freed.
1614 mutex_exit(pcm);
1615 page_freelist_lock(mnode);
1616 if (pp->p_szc != 0) {
1618 * Large page is on freelist.
1620 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1621 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1623 ASSERT(PP_ISFREE(pp));
1624 ASSERT(PP_ISAGED(pp));
1625 ASSERT(pp->p_szc == 0);
1628 * Subtract counters before releasing pcm mutex
1629 * to avoid race with page_freelist_coalesce.
1631 bin = PP_2_BIN(pp);
1632 mtype = PP_2_MTYPE(pp);
1633 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1635 page_sub(ppp, pp);
1636 page_ctr_sub(mnode, mtype, pp, flags);
1637 page_freelist_unlock(mnode);
1641 void
1642 page_list_sub_pages(page_t *pp, uint_t szc)
1644 kmutex_t *pcm;
1645 uint_t bin, mtype;
1646 int mnode;
1648 ASSERT(PAGE_EXCL(pp));
1649 ASSERT(PP_ISFREE(pp));
1650 ASSERT(PP_ISAGED(pp));
1653 * See comment in page_list_sub().
1655 try_again:
1656 bin = PP_2_BIN(pp);
1657 mnode = PP_2_MEM_NODE(pp);
1658 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1659 mutex_enter(pcm);
1660 if (PP_2_BIN(pp) != bin) {
1661 mutex_exit(pcm);
1662 goto try_again;
1666 * If we're called with a page larger than szc or it got
1667 * promoted above szc before we locked the freelist then
1668 * drop pcm and re-lock entire freelist. If page still larger
1669 * than szc then demote it.
1671 if (pp->p_szc > szc) {
1672 mutex_exit(pcm);
1673 pcm = NULL;
1674 page_freelist_lock(mnode);
1675 if (pp->p_szc > szc) {
1676 VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1677 (void) page_demote(mnode,
1678 PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
1679 pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1681 bin = PP_2_BIN(pp);
1683 ASSERT(PP_ISFREE(pp));
1684 ASSERT(PP_ISAGED(pp));
1685 ASSERT(pp->p_szc <= szc);
1686 ASSERT(pp == PP_PAGEROOT(pp));
1688 VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1690 mtype = PP_2_MTYPE(pp);
1691 if (pp->p_szc != 0) {
1692 page_lpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1693 CHK_LPG(pp, pp->p_szc);
1694 } else {
1695 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1696 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1698 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1700 if (pcm != NULL) {
1701 mutex_exit(pcm);
1702 } else {
1703 page_freelist_unlock(mnode);
1709 * Add the page to the front of a linked list of pages
1710 * using the p_next & p_prev pointers for the list.
1711 * The caller is responsible for protecting the list pointers.
1713 void
1714 mach_page_add(page_t **ppp, page_t *pp)
1716 if (*ppp == NULL) {
1717 pp->p_next = pp->p_prev = pp;
1718 } else {
1719 pp->p_next = *ppp;
1720 pp->p_prev = (*ppp)->p_prev;
1721 (*ppp)->p_prev = pp;
1722 pp->p_prev->p_next = pp;
1724 *ppp = pp;
1728 * Remove this page from a linked list of pages
1729 * using the p_next & p_prev pointers for the list.
1731 * The caller is responsible for protecting the list pointers.
1733 void
1734 mach_page_sub(page_t **ppp, page_t *pp)
1736 ASSERT(PP_ISFREE(pp));
1738 if (*ppp == NULL || pp == NULL)
1739 panic("mach_page_sub");
1741 if (*ppp == pp)
1742 *ppp = pp->p_next; /* go to next page */
1744 if (*ppp == pp)
1745 *ppp = NULL; /* page list is gone */
1746 else {
1747 pp->p_prev->p_next = pp->p_next;
1748 pp->p_next->p_prev = pp->p_prev;
1750 pp->p_prev = pp->p_next = pp; /* make pp a list of one */
1754 * Routine fsflush uses to gradually coalesce the free list into larger pages.
1756 void
1757 page_promote_size(page_t *pp, uint_t cur_szc)
1759 pfn_t pfn;
1760 int mnode;
1761 int idx;
1762 int new_szc = cur_szc + 1;
1763 int full = FULL_REGION_CNT(new_szc);
1765 pfn = page_pptonum(pp);
1766 mnode = PFN_2_MEM_NODE(pfn);
1768 page_freelist_lock(mnode);
1770 idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1771 if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1772 (void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1774 page_freelist_unlock(mnode);
1777 static uint_t page_promote_err;
1778 static uint_t page_promote_noreloc_err;
1781 * Create a single larger page (of szc new_szc) from smaller contiguous pages
1782 * for the given mnode starting at pfnum. Pages involved are on the freelist
1783 * before the call and may be returned to the caller if requested, otherwise
1784 * they will be placed back on the freelist.
1785 * If flags is PC_ALLOC, then the large page will be returned to the user in
1786 * a state which is consistent with a page being taken off the freelist. If
1787 * we failed to lock the new large page, then we will return NULL to the
1788 * caller and put the large page on the freelist instead.
1789 * If flags is PC_FREE, then the large page will be placed on the freelist,
1790 * and NULL will be returned.
1791 * The caller is responsible for locking the freelist as well as any other
1792 * accounting which needs to be done for a returned page.
1794 * RFE: For performance pass in pp instead of pfnum so
1795 * we can avoid excessive calls to page_numtopp_nolock().
1796 * This would depend on an assumption that all contiguous
1797 * pages are in the same memseg so we can just add/dec
1798 * our pp.
1800 * Lock ordering:
1802 * There is a potential but rare deadlock situation
1803 * for page promotion and demotion operations. The problem
1804 * is there are two paths into the freelist manager and
1805 * they have different lock orders:
1807 * page_create()
1808 * lock freelist
1809 * page_lock(EXCL)
1810 * unlock freelist
1811 * return
1812 * caller drops page_lock
1814 * page_free() and page_reclaim()
1815 * caller grabs page_lock(EXCL)
1817 * lock freelist
1818 * unlock freelist
1819 * drop page_lock
1821 * What prevents a thread in page_create() from deadlocking
1822 * with a thread freeing or reclaiming the same page is the
1823 * page_trylock() in page_get_freelist(). If the trylock fails
1824 * it skips the page.
1826 * The lock ordering for promotion and demotion is the same as
1827 * for page_create(). Since the same deadlock could occur during
1828 * page promotion and freeing or reclaiming of a page on the
1829 * cache list we might have to fail the operation and undo what
1830  * we have done so far. Again this is rare.
1832 page_t *
1833 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
1835 page_t *pp, *pplist, *tpp, *start_pp;
1836 pgcnt_t new_npgs, npgs;
1837 uint_t bin;
1838 pgcnt_t tmpnpgs, pages_left;
1839 uint_t noreloc;
1840 int which_list;
1841 ulong_t index;
1842 kmutex_t *phm;
1845 * General algorithm:
1846 * Find the starting page
1847 * Walk each page struct removing it from the freelist,
1848 * and linking it to all the other pages removed.
1849 * Once all pages are off the freelist,
1850 * walk the list, modifying p_szc to new_szc and what
1851 * ever other info needs to be done to create a large free page.
1852 * According to the flags, either return the page or put it
1853 * on the freelist.
1856 start_pp = page_numtopp_nolock(pfnum);
1857 ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1858 new_npgs = page_get_pagecnt(new_szc);
1859 ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1861 /* don't return page of the wrong mtype */
1862 if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
1863 return (NULL);
1866 * Loop through smaller pages to confirm that all pages
1867 * give the same result for PP_ISNORELOC().
1868 * We can check this reliably here as the protocol for setting
1869 * P_NORELOC requires pages to be taken off the free list first.
1871 noreloc = PP_ISNORELOC(start_pp);
1872 for (pp = start_pp + new_npgs; --pp > start_pp; ) {
1873 if (noreloc != PP_ISNORELOC(pp)) {
1874 page_promote_noreloc_err++;
1875 page_promote_err++;
1876 return (NULL);
1880 pages_left = new_npgs;
1881 pplist = NULL;
1882 pp = start_pp;
1884 /* Loop around coalescing the smaller pages into a big page. */
1885 while (pages_left) {
1887 * Remove from the freelist.
1889 ASSERT(PP_ISFREE(pp));
1890 bin = PP_2_BIN(pp);
1891 ASSERT(mnode == PP_2_MEM_NODE(pp));
1892 mtype = PP_2_MTYPE(pp);
1893 if (PP_ISAGED(pp)) {
1896 * PG_FREE_LIST
1898 if (pp->p_szc) {
1899 page_lpsub(&PAGE_FREELISTS(mnode,
1900 pp->p_szc, bin, mtype), pp);
1901 } else {
1902 mach_page_sub(&PAGE_FREELISTS(mnode, 0,
1903 bin, mtype), pp);
1905 which_list = PG_FREE_LIST;
1906 } else {
1907 struct vmobject *obj;
1909 ASSERT(pp->p_szc == 0);
1912 * PG_CACHE_LIST
1914 * Since this page comes from the
1915 * cachelist, we must destroy the
1916 * vnode association.
1918 if (!page_trylock(pp, SE_EXCL)) {
1919 goto fail_promote;
1922 obj = &pp->p_vnode->v_object;
1925 * We need to be careful not to deadlock
1926 * with another thread in page_lookup().
1927 * The page_lookup() thread could be holding
1928 * the same phm that we need if the two
1929 * pages happen to hash to the same phm lock.
1930 * At this point we have locked the entire
1931 * freelist and page_lookup() could be trying
1932 * to grab a freelist lock.
1934 if (!vmobject_trylock(obj)) {
1935 page_unlock_nocapture(pp);
1936 goto fail_promote;
1939 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
1940 page_hashout(pp, true);
1941 vmobject_unlock(obj);
1942 PP_SETAGED(pp);
1943 page_unlock_nocapture(pp);
1944 which_list = PG_CACHE_LIST;
1946 page_ctr_sub(mnode, mtype, pp, which_list);
1949 * Concatenate the smaller page(s) onto
1950 * the large page list.
1952 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
1953 pages_left -= npgs;
1954 tpp = pp;
1955 while (npgs--) {
1956 tpp->p_szc = new_szc;
1957 tpp = tpp->p_next;
1959 page_list_concat(&pplist, &pp);
1960 pp += tmpnpgs;
1962 CHK_LPG(pplist, new_szc);
1965 * return the page to the user if requested
1966 * in the properly locked state.
1968 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
1969 return (pplist);
1973 * Otherwise place the new large page on the freelist
1975 bin = PP_2_BIN(pplist);
1976 mnode = PP_2_MEM_NODE(pplist);
1977 mtype = PP_2_MTYPE(pplist);
1978 page_lpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
1980 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
1981 return (NULL);
1983 fail_promote:
1985 * A thread must have still been freeing or
1986 * reclaiming the page on the cachelist.
1987  * To prevent a deadlock, undo what we have
1988  * done so far and return failure. This
1989 * situation can only happen while promoting
1990 * PAGESIZE pages.
1992 page_promote_err++;
1993 while (pplist) {
1994 pp = pplist;
1995 mach_page_sub(&pplist, pp);
1996 pp->p_szc = 0;
1997 bin = PP_2_BIN(pp);
1998 mtype = PP_2_MTYPE(pp);
1999 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2000 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2002 return (NULL);
2007 * Break up a large page into smaller size pages.
2008 * Pages involved are on the freelist before the call and may
2009 * be returned to the caller if requested, otherwise they will
2010 * be placed back on the freelist.
2011 * The caller is responsible for locking the freelist as well as any other
2012 * accounting which needs to be done for a returned page.
2013 * If flags is not PC_ALLOC, the color argument is ignored, and thus
2014 * technically, any value may be passed in but PC_NO_COLOR is the standard
2015 * which should be followed for clarity's sake.
2016 * Returns a page whose pfn is < pfnmax
2018 page_t *
2019 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
2020 uchar_t new_szc, int color, int flags)
2022 page_t *pp, *pplist, *npplist;
2023 pgcnt_t npgs, n;
2024 uint_t bin;
2025 uint_t mtype;
2026 page_t *ret_pp = NULL;
2028 ASSERT(cur_szc != 0);
2029 ASSERT(new_szc < cur_szc);
2031 pplist = page_numtopp_nolock(pfnum);
2032 ASSERT(pplist != NULL);
2034 ASSERT(pplist->p_szc == cur_szc);
2036 bin = PP_2_BIN(pplist);
2037 ASSERT(mnode == PP_2_MEM_NODE(pplist));
2038 mtype = PP_2_MTYPE(pplist);
2039 page_lpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2041 CHK_LPG(pplist, cur_szc);
2042 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2045 * Number of PAGESIZE pages for smaller new_szc
2046 * page.
2048 npgs = page_get_pagecnt(new_szc);
2050 while (pplist) {
2051 pp = pplist;
2053 ASSERT(pp->p_szc == cur_szc);
2056 * We either break it up into PAGESIZE pages or larger.
2058 if (npgs == 1) { /* PAGESIZE case */
2059 mach_page_sub(&pplist, pp);
2060 ASSERT(pp->p_szc == cur_szc);
2061 ASSERT(new_szc == 0);
2062 ASSERT(mnode == PP_2_MEM_NODE(pp));
2063 pp->p_szc = new_szc;
2064 bin = PP_2_BIN(pp);
2065 if ((bin == color) && (flags == PC_ALLOC) &&
2066 (ret_pp == NULL) && (pfnmax == 0 ||
2067 pp->p_pagenum < pfnmax) &&
2068 page_trylock_cons(pp, SE_EXCL)) {
2069 ret_pp = pp;
2070 } else {
2071 mtype = PP_2_MTYPE(pp);
2072 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2073 mtype), pp);
2074 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2076 } else {
2077 page_t *try_to_return_this_page = NULL;
2078 int count = 0;
2081 * Break down into smaller lists of pages.
2083 page_list_break(&pplist, &npplist, npgs);
2085 pp = pplist;
2086 n = npgs;
2087 while (n--) {
2088 ASSERT(pp->p_szc == cur_szc);
2090 * Check whether all the pages in this list
2091 * fit the request criteria.
2093 if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
2094 count++;
2096 pp->p_szc = new_szc;
2097 pp = pp->p_next;
2100 if (count == npgs &&
2101 (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
2102 try_to_return_this_page = pp;
2105 CHK_LPG(pplist, new_szc);
2107 bin = PP_2_BIN(pplist);
2108 if (try_to_return_this_page)
2109 ASSERT(mnode ==
2110 PP_2_MEM_NODE(try_to_return_this_page));
2111 if ((bin == color) && (flags == PC_ALLOC) &&
2112 (ret_pp == NULL) && try_to_return_this_page &&
2113 page_trylock_cons(try_to_return_this_page,
2114 SE_EXCL)) {
2115 ret_pp = try_to_return_this_page;
2116 } else {
2117 mtype = PP_2_MTYPE(pp);
2118 page_lpadd(&PAGE_FREELISTS(mnode, new_szc,
2119 bin, mtype), pplist);
2121 page_ctr_add(mnode, mtype, pplist,
2122 PG_FREE_LIST);
2124 pplist = npplist;
2127 return (ret_pp);
2130 int mpss_coalesce_disable = 0;
2133 * Coalesce free pages into a page of the given szc and color if possible.
2134 * Return the pointer to the page created, otherwise, return NULL.
2136 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
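/*
 * Worked example with hypothetical values: if PAGE_GET_PAGECOLORS(szc)
 * is 16 (color_mask 0xf), color is 0xb and ceq_mask is 0xc, then
 * color &= ceq_mask leaves 0x8 and the candidate scan below considers
 * every bin that agrees with 0x8 in the 0xc bits, i.e. bins 0x8-0xb;
 * a region counter equal to FULL_REGION_CNT(szc) in any of those bins
 * marks a fully free region that page_promote() can turn into one
 * szc-sized page.
 */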
2138 page_t *
2139 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
2140 int mtype, pfn_t pfnhi)
2142 int r = szc; /* region size */
2143 int mrange;
2144 uint_t full, bin, color_mask, wrap = 0;
2145 pfn_t pfnum, lo, hi;
2146 size_t len, idx, idx0;
2147 pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc);
2148 page_t *ret_pp;
2149 MEM_NODE_ITERATOR_DECL(it);
2151 if (mpss_coalesce_disable) {
2152 ASSERT(szc < MMU_PAGE_SIZES);
2153 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
2154 return (NULL);
2157 ASSERT(szc < mmu_page_sizes);
2158 color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
2159 ASSERT(ceq_mask <= color_mask);
2160 ASSERT(color <= color_mask);
2161 color &= ceq_mask;
2163 /* Prevent page_counters dynamic memory from being freed */
2164 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2166 mrange = MTYPE_2_MRANGE(mnode, mtype);
2167 ASSERT(mrange < mnode_nranges[mnode]);
2168 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
2170 /* get pfn range for mtype */
2171 len = PAGE_COUNTERS_ENTRIES(mnode, r);
2172 MNODETYPE_2_PFN(mnode, mtype, lo, hi);
2173 hi++;
2175 /* use lower limit if given */
2176 if (pfnhi != PFNNULL && pfnhi < hi)
2177 hi = pfnhi;
2179 /* round to szcpgcnt boundaries */
2180 lo = P2ROUNDUP(lo, szcpgcnt);
2181 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
2182 if (lo == (pfn_t)-1) {
2183 rw_exit(&page_ctrs_rwlock[mnode]);
2184 return (NULL);
2186 hi = hi & ~(szcpgcnt - 1);
2188 /* set lo to the closest pfn of the right color */
2189 if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2190 (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2191 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2192 &it);
2195 if (hi <= lo) {
2196 rw_exit(&page_ctrs_rwlock[mnode]);
2197 return (NULL);
2200 full = FULL_REGION_CNT(r);
2202 /* calculate the number of page candidates and initial search index */
2203 bin = color;
2204 idx0 = (size_t)(-1);
2205 do {
2206 pgcnt_t acand;
2208 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
2209 if (acand) {
2210 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
2211 r, bin, mrange);
2212 idx0 = MIN(idx0, idx);
2213 cands += acand;
2215 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
2216 } while (bin != color);
2218 if (cands == 0) {
2219 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
2220 rw_exit(&page_ctrs_rwlock[mnode]);
2221 return (NULL);
2224 pfnum = IDX_TO_PNUM(mnode, r, idx0);
2225 if (pfnum < lo || pfnum >= hi) {
2226 pfnum = lo;
2227 } else {
2228 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2229 if (pfnum == (pfn_t)-1) {
2230 pfnum = lo;
2231 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2232 ASSERT(pfnum != (pfn_t)-1);
2233 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2234 (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2235 /* invalid color, get the closest correct pfn */
2236 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2237 color_mask, &it);
2238 if (pfnum >= hi) {
2239 pfnum = lo;
2240 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2245 /* set starting index */
2246 idx0 = PNUM_TO_IDX(mnode, r, pfnum);
2247 ASSERT(idx0 < len);
2250 for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
2253 if (PAGE_COUNTERS(mnode, r, idx) != full)
2254 goto next;
2257 * RFE: For performance maybe we can do something less
2258 * brutal than locking the entire freelist. So far
2259 * this doesn't seem to be a performance problem?
2261 page_freelist_lock(mnode);
2262 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2263 ret_pp =
2264 page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
2265 if (ret_pp != NULL) {
2266 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
2267 PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2268 PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
2269 page_freelist_unlock(mnode);
2270 rw_exit(&page_ctrs_rwlock[mnode]);
2271 return (ret_pp);
2273 } else {
2274 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
2277 page_freelist_unlock(mnode);
2279 * No point looking for another page if we've
2280 * already tried all of the ones that
2281 * page_ctr_cands indicated. Stash off where we left
2282 * off.
2283 * Note: this is not exact since we don't hold the
2284 * page_freelist_locks before we initially get the
2285 * value of cands for performance reasons, but should
2286 * be a decent approximation.
2288 if (--cands == 0) {
2289 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
2290 idx;
2291 break;
2293 next:
2294 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2295 color_mask, &it);
2296 idx = PNUM_TO_IDX(mnode, r, pfnum);
2297 if (idx >= len || pfnum >= hi) {
2298 wrapit:
2299 pfnum = lo;
2300 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2301 idx = PNUM_TO_IDX(mnode, r, pfnum);
2302 wrap++;
2306 rw_exit(&page_ctrs_rwlock[mnode]);
2307 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
2308 return (NULL);
2312 * For the given mnode, promote as many small pages to large pages as possible.
2313 * mnode can be -1, which means do them all
2315 void
2316 page_freelist_coalesce_all(int mnode)
2318 int r; /* region size */
2319 int idx, full;
2320 size_t len;
2321 int doall = interleaved_mnodes || mnode < 0;
2322 int mlo = doall ? 0 : mnode;
2323 int mhi = doall ? max_mem_nodes : (mnode + 1);
2325 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2327 if (mpss_coalesce_disable) {
2328 return;
2332 * Lock the entire freelist and coalesce what we can.
2334 * Always promote to the largest page possible
2335 * first to reduce the number of page promotions.
2337 for (mnode = mlo; mnode < mhi; mnode++) {
2338 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2339 page_freelist_lock(mnode);
2341 for (r = mmu_page_sizes - 1; r > 0; r--) {
2342 for (mnode = mlo; mnode < mhi; mnode++) {
2343 pgcnt_t cands = 0;
2344 int mrange, nranges = mnode_nranges[mnode];
2346 for (mrange = 0; mrange < nranges; mrange++) {
2347 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2348 if (cands != 0)
2349 break;
2351 if (cands == 0) {
2352 VM_STAT_ADD(vmm_vmstats.
2353 page_ctrs_cands_skip_all);
2354 continue;
2357 full = FULL_REGION_CNT(r);
2358 len = PAGE_COUNTERS_ENTRIES(mnode, r);
2360 for (idx = 0; idx < len; idx++) {
2361 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2362 pfn_t pfnum =
2363 IDX_TO_PNUM(mnode, r, idx);
2364 int tmnode = interleaved_mnodes ?
2365 PFN_2_MEM_NODE(pfnum) : mnode;
2367 ASSERT(pfnum >=
2368 mem_node_config[tmnode].physbase &&
2369 pfnum <
2370 mem_node_config[tmnode].physmax);
2372 (void) page_promote(tmnode,
2373 pfnum, r, PC_FREE, PC_MTYPE_ANY);
2376 /* shared hpm_counters covers all mnodes, so we quit */
2377 if (interleaved_mnodes)
2378 break;
2381 for (mnode = mlo; mnode < mhi; mnode++) {
2382 page_freelist_unlock(mnode);
2383 rw_exit(&page_ctrs_rwlock[mnode]);
2388 * This is where all policies for moving pages around
2389 * to different page size free lists are implemented.
2390 * Returns the page obtained on success, NULL on failure.
2392 * So far these are the priorities for this algorithm in descending
2393 * order:
2395 * 1) When servicing a request try to do so with a free page
2396 * from next size up. Helps defer fragmentation as long
2397 * as possible.
2399 * 2) Page coalesce on demand. Only when a freelist
2400 * larger than PAGESIZE is empty and step 1
2401 * will not work since all larger size lists are
2402 * also empty.
2404 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
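/*
 * Illustrative flow (hypothetical size codes): a request for an szc 1
 * page of color c first maps c to a next-size bin with
 * PAGE_GET_NSZ_COLOR(1, c) and inspects PAGE_FREELISTS(mnode, 2, bin,
 * mtype); if a suitable szc 2 page is found there, page_demote() splits
 * it, one szc 1 page of the requested color is returned locked, and the
 * remaining pieces go back on the szc 1 freelists.
 */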
2407 page_t *
2408 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2409 pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
2411 uchar_t nszc = szc + 1;
2412 uint_t bin, sbin, bin_prev;
2413 page_t *pp, *firstpp;
2414 page_t *ret_pp = NULL;
2415 uint_t color_mask;
2417 if (nszc == mmu_page_sizes)
2418 return (NULL);
2420 ASSERT(nszc < mmu_page_sizes);
2421 color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
2422 bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
2423 bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2424 PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2426 VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2428 * First try to break up a larger page to fill current size freelist.
2430 while (plw->plw_bins[nszc] != 0) {
2432 ASSERT(nszc < mmu_page_sizes);
2435 * If page found then demote it.
2437 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2438 page_freelist_lock(mnode);
2439 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2442 * If pfnhi is not PFNNULL, look for large page below
2443 * pfnhi. PFNNULL signifies no pfn requirement.
2445 if (pp &&
2446 ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
2447 (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
2448 do {
2449 pp = pp->p_list.largepg.next;
2450 if (pp == firstpp) {
2451 pp = NULL;
2452 break;
2454 } while ((pfnhi != PFNNULL &&
2455 pp->p_pagenum >= pfnhi) ||
2456 (pfnlo != PFNNULL &&
2457 pp->p_pagenum < pfnlo));
2459 if (pfnhi != PFNNULL && pp != NULL)
2460 ASSERT(pp->p_pagenum < pfnhi);
2462 if (pfnlo != PFNNULL && pp != NULL)
2463 ASSERT(pp->p_pagenum >= pfnlo);
2465 if (pp) {
2466 uint_t ccolor = page_correct_color(szc, nszc,
2467 color, bin, plw->plw_ceq_mask[szc]);
2469 ASSERT(pp->p_szc == nszc);
2470 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2471 ret_pp = page_demote(mnode, pp->p_pagenum,
2472 pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
2473 if (ret_pp) {
2474 page_freelist_unlock(mnode);
2475 return (ret_pp);
2478 page_freelist_unlock(mnode);
2481 /* loop through next size bins */
2482 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
2483 plw->plw_bins[nszc]--;
2485 if (bin == sbin) {
2486 uchar_t nnszc = nszc + 1;
2488 /* we are done with this page size - check next */
2489 if (plw->plw_bins[nnszc] == 0)
2490 /* we have already checked next size bins */
2491 break;
2493 bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2494 if (bin_prev != INVALID_COLOR) {
2495 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2496 if (!((bin ^ bin_prev) &
2497 plw->plw_ceq_mask[nnszc]))
2498 break;
2500 ASSERT(nnszc < mmu_page_sizes);
2501 color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2502 nszc = nnszc;
2503 ASSERT(nszc < mmu_page_sizes);
2507 return (ret_pp);
2511 * Helper routine used only by the freelist code to lock
2512 * a page. If the page is a large page then it succeeds in
2513 * locking all the constituent pages or none at all.
2514 * Returns 1 on success, 0 on failure.
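/*
 * Typical caller pattern (sketch, not taken verbatim from any caller):
 *
 *	if (page_trylock_cons(pp, SE_EXCL)) {
 *		(every constituent page of pp is now held SE_EXCL)
 *	} else {
 *		(nothing is left locked; try another candidate page)
 *	}
 */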
2516 static int
2517 page_trylock_cons(page_t *pp, se_t se)
2519 page_t *tpp, *first_pp = pp;
2522 * Fail if can't lock first or only page.
2524 if (!page_trylock(pp, se)) {
2525 return (0);
2529 * PAGESIZE: common case.
2531 if (pp->p_szc == 0) {
2532 return (1);
2536 * Large page case.
2538 tpp = pp->p_next;
2539 while (tpp != pp) {
2540 if (!page_trylock(tpp, se)) {
2542 * On failure unlock what we have locked so far.
2543 * We want to avoid attempting to capture these
2544 * pages as the pcm mutex may be held which could
2545 * lead to a recursive mutex panic.
2547 while (first_pp != tpp) {
2548 page_unlock_nocapture(first_pp);
2549 first_pp = first_pp->p_next;
2551 return (0);
2553 tpp = tpp->p_next;
2555 return (1);
2559 * init context for walking page lists
2560 * Called when a page of the given szc is unavailable. Sets markers
2561 * for the beginning of the search to detect when search has
2562 * completed a full cycle. Sets flags for splitting larger pages
2563 * and coalescing smaller pages. Page walking proceeds until a page
2564 * of the desired equivalent color is found.
2566 void
2567 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2568 int use_ceq, page_list_walker_t *plw)
2570 uint_t nszc, ceq_mask, colors;
2571 uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2573 ASSERT(szc < mmu_page_sizes);
2574 colors = PAGE_GET_PAGECOLORS(szc);
2576 plw->plw_colors = colors;
2577 plw->plw_color_mask = colors - 1;
2578 plw->plw_bin_marker = plw->plw_bin0 = bin;
2579 plw->plw_bin_split_prev = bin;
2580 plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2583 * if vac aliasing is possible make sure lower order color
2584 * bits are never ignored
2586 if (vac_colors > 1)
2587 ceq &= 0xf0;
2590 * calculate the number of non-equivalent colors and
2591 * color equivalency mask
2593 plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2594 ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2595 ASSERT(plw->plw_ceq_dif > 0);
2596 plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
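/*
 * Worked example with hypothetical values (and vac_colors == 1, so the
 * low nibble of ceq survives): colors == 64 and colorequivszc[szc] ==
 * 0x11 ignore one high and one low color bit:
 *	plw_ceq_dif  = 64 >> (1 + 1) = 16
 *	plw_ceq_mask = (16 - 1) << 1 = 0x1e
 * so only color bits 1-4 have to match for two bins to be treated as
 * equivalent.
 */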
2598 if (flags & PG_MATCH_COLOR) {
2599 if (cpu_page_colors < 0) {
2601 * this is a heterogeneous machine with different CPUs
2602 * having different size e$ (not supported for ni2/rock
2604 uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2605 cpucolors = MAX(cpucolors, 1);
2606 ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2607 plw->plw_ceq_mask[szc] =
2608 MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2610 plw->plw_ceq_dif = 1;
2613 /* we can split pages in the freelist, but not the cachelist */
2614 if (can_split) {
2615 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2617 /* set next szc color masks and number of free list bins */
2618 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2619 plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2620 plw->plw_ceq_mask[szc]);
2621 plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2623 plw->plw_ceq_mask[nszc] = INVALID_MASK;
2624 plw->plw_bins[nszc] = 0;
2626 } else {
2627 ASSERT(szc == 0);
2628 plw->plw_do_split = 0;
2629 plw->plw_bins[1] = 0;
2630 plw->plw_ceq_mask[1] = INVALID_MASK;
2635 * set mark to flag where next split should occur
2637 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \
2638 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \
2639 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \
2640 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \
2641 plw->plw_split_next = \
2642 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \
2643 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
2644 plw->plw_split_next = \
2645 INC_MASKED(plw->plw_split_next, \
2646 neq_mask, plw->plw_color_mask); \
2650 uint_t
2651 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2653 uint_t neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2654 uint_t bin0_nsz, nbin_nsz, nbin0, nbin;
2655 uchar_t nszc = szc + 1;
2657 nbin = ADD_MASKED(bin,
2658 plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2660 if (plw->plw_do_split) {
2661 plw->plw_bin_split_prev = bin;
2662 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2663 plw->plw_do_split = 0;
2666 if (szc == 0) {
2667 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
2668 if (nbin == plw->plw_bin0 &&
2669 (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2670 nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2671 neq_mask, plw->plw_color_mask);
2672 plw->plw_bin_split_prev = plw->plw_bin0;
2675 if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2676 plw->plw_bin_marker =
2677 nbin = INC_MASKED(nbin, neq_mask,
2678 plw->plw_color_mask);
2679 plw->plw_bin_split_prev = plw->plw_bin0;
2681 * large pages all have the same vac color
2682 * so by now we should be done with next
2683 * size page splitting process
2685 ASSERT(plw->plw_bins[1] == 0);
2686 plw->plw_do_split = 0;
2687 return (nbin);
2690 } else {
2691 uint_t bin_jump = (vac_colors == 1) ?
2692 (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
2694 bin_jump &= ~(vac_colors - 1);
2696 nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
2697 plw->plw_color_mask);
2699 if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
2701 plw->plw_bin_marker = nbin = nbin0;
2703 if (plw->plw_bins[nszc] != 0) {
2705 * check if next page size bin is the
2706 * same as the next page size bin for
2707 * bin0
2709 nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
2710 nbin);
2711 bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
2712 plw->plw_bin0);
2714 if ((bin0_nsz ^ nbin_nsz) &
2715 plw->plw_ceq_mask[nszc])
2716 plw->plw_do_split = 1;
2718 return (nbin);
2723 if (plw->plw_bins[nszc] != 0) {
2724 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
2725 if (!((plw->plw_split_next ^ nbin_nsz) &
2726 plw->plw_ceq_mask[nszc]))
2727 plw->plw_do_split = 1;
2730 return (nbin);
2733 page_t *
2734 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
2735 uint_t flags)
2737 kmutex_t *pcm;
2738 page_t *pp, *first_pp;
2739 uint_t sbin;
2740 int plw_initialized;
2741 page_list_walker_t plw;
2743 ASSERT(szc < mmu_page_sizes);
2745 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
2747 MTYPE_START(mnode, mtype, flags);
2748 if (mtype < 0) { /* mnode does not have memory in mtype range */
2749 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
2750 return (NULL);
2752 try_again:
2754 plw_initialized = 0;
2755 plw.plw_ceq_dif = 1;
2758 * Only hold one freelist lock at a time, that way we
2759 * can start anywhere and not have to worry about lock
2760 * ordering.
2762 for (plw.plw_count = 0;
2763 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
2764 sbin = bin;
2765 do {
2766 if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
2767 goto bin_empty_1;
2769 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
2770 mutex_enter(pcm);
2771 pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
2772 if (pp == NULL)
2773 goto bin_empty_0;
2776 * These were set before the page
2777 * was put on the free list,
2778 * they must still be set.
2780 ASSERT(PP_ISFREE(pp));
2781 ASSERT(PP_ISAGED(pp));
2782 VERIFY(pp->p_object == NULL);
2783 ASSERT(pp->p_vnode == NULL);
2784 ASSERT(pp->p_offset == (uoff_t)-1);
2785 ASSERT(pp->p_szc == szc);
2786 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2789 * Walk down the hash chain. 4k/8k pages are linked
2790 * on p_next and p_prev fields. Large pages are a
2791 * contiguous group of constituent pages linked
2792 * together on their p_next and p_prev fields. The
2793 * large pages are linked together on the hash chain
2794 * using p_list.largepg of the base constituent page
2795 * of each large page.
2797 first_pp = pp;
2798 while (!page_trylock_cons(pp, SE_EXCL)) {
2799 if (szc == 0) {
2800 pp = pp->p_next;
2801 } else {
2802 pp = pp->p_list.largepg.next;
2805 ASSERT(PP_ISFREE(pp));
2806 ASSERT(PP_ISAGED(pp));
2807 VERIFY(pp->p_object == NULL);
2808 ASSERT(pp->p_vnode == NULL);
2809 ASSERT(pp->p_offset == (uoff_t)-1);
2810 ASSERT(pp->p_szc == szc);
2811 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2813 if (pp == first_pp)
2814 goto bin_empty_0;
2817 ASSERT(pp != NULL);
2818 ASSERT(mtype == PP_2_MTYPE(pp));
2819 ASSERT(pp->p_szc == szc);
2820 if (szc == 0) {
2821 page_sub(&PAGE_FREELISTS(mnode,
2822 szc, bin, mtype), pp);
2823 } else {
2824 page_lpsub(&PAGE_FREELISTS(mnode,
2825 szc, bin, mtype), pp);
2826 CHK_LPG(pp, szc);
2828 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
2830 if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
2831 panic("free page is not. pp %p", (void *)pp);
2832 mutex_exit(pcm);
2834 VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
2835 return (pp);
2837 bin_empty_0:
2838 mutex_exit(pcm);
2839 bin_empty_1:
2840 if (plw_initialized == 0) {
2841 page_list_walk_init(szc, flags, bin, 1, 1,
2842 &plw);
2843 plw_initialized = 1;
2844 ASSERT(plw.plw_colors <=
2845 PAGE_GET_PAGECOLORS(szc));
2846 ASSERT(plw.plw_colors > 0);
2847 ASSERT((plw.plw_colors &
2848 (plw.plw_colors - 1)) == 0);
2849 ASSERT(bin < plw.plw_colors);
2850 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
2852 /* calculate the next bin with equivalent color */
2853 bin = ADD_MASKED(bin, plw.plw_bin_step,
2854 plw.plw_ceq_mask[szc], plw.plw_color_mask);
2855 } while (sbin != bin);
2858 * color bins are all empty for a color match. Try to
2859 * satisfy the request by breaking up or coalescing
2860 * pages from a different size freelist of the correct
2861 * color that satisfies the ORIGINAL color requested.
2862 * If that fails then try pages of the same size but
2863 * different colors assuming we are not called with
2864 * PG_MATCH_COLOR.
2866 if (plw.plw_do_split &&
2867 (pp = page_freelist_split(szc, bin, mnode,
2868 mtype, PFNNULL, PFNNULL, &plw)) != NULL)
2869 return (pp);
2871 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
2872 bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL)
2873 return (pp);
2875 if (plw.plw_ceq_dif > 1)
2876 bin = page_list_walk_next_bin(szc, bin, &plw);
2879 /* if allowed, cycle through additional mtypes */
2880 MTYPE_NEXT(mnode, mtype, flags);
2881 if (mtype >= 0)
2882 goto try_again;
2884 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
2886 return (NULL);
2890 * Returns the count of free pages for 'pp' with size code 'szc'.
2891 * Note: This function does not return an exact value as the page freelist
2892 * locks are not held and thus the values in the page_counters may be
2893 * changing as we walk through the data.
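/*
 * Worked example with hypothetical region sizes: suppose an szc 2 region
 * is made up of 64 szc 1 regions of 8 pages each. If
 * PAGE_COUNTERS(mnode, 2, idx) == 10, ten of those szc 1 regions are
 * completely free and contribute 10 << PNUM_SHIFT(1) == 80 pages; the
 * descending loop below then adds the counters of the partially free
 * szc 1 regions (shifted by PNUM_SHIFT(0) == 0) to the running total.
 */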
2895 static int
2896 page_freecnt(int mnode, page_t *pp, uchar_t szc)
2898 pgcnt_t pgfree;
2899 pgcnt_t cnt;
2900 ssize_t r = szc; /* region size */
2901 ssize_t idx;
2902 int i;
2903 int full, range;
2905 /* Make sure pagenum passed in is aligned properly */
2906 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
2907 ASSERT(szc > 0);
2909 /* Prevent page_counters dynamic memory from being freed */
2910 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2911 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
2912 cnt = PAGE_COUNTERS(mnode, r, idx);
2913 pgfree = cnt << PNUM_SHIFT(r - 1);
2914 range = FULL_REGION_CNT(szc);
2916 /* Check for completely full region */
2917 if (cnt == range) {
2918 rw_exit(&page_ctrs_rwlock[mnode]);
2919 return (pgfree);
2922 while (--r > 0) {
2923 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
2924 full = FULL_REGION_CNT(r);
2925 for (i = 0; i < range; i++, idx++) {
2926 cnt = PAGE_COUNTERS(mnode, r, idx);
2928 * If cnt here is full, that means we have already
2929 * accounted for these pages earlier.
2931 if (cnt != full) {
2932 pgfree += (cnt << PNUM_SHIFT(r - 1));
2935 range *= full;
2937 rw_exit(&page_ctrs_rwlock[mnode]);
2938 return (pgfree);
2942 * Called from page_geti_contig_pages to exclusively lock constituent pages
2943 * starting from 'spp' for page size code 'szc'.
2945 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
2946 * region needs to be greater than or equal to the threshold.
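/*
 * Example of the threshold check (hypothetical numbers): with
 * PNUM_SIZE(szc) == 512 and ptcpthreshold == 4, the trylock pass below
 * is only attempted when page_freecnt() reports at least 512 / 4 == 128
 * constituent pages already free; otherwise the routine fails fast.
 */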
2948 static int
2949 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
2951 pgcnt_t pgcnt = PNUM_SIZE(szc);
2952 pgcnt_t pgfree, i;
2953 page_t *pp;
2955 VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
2958 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
2959 goto skipptcpcheck;
2961 * check if there are sufficient free pages available before attempting
2962 * to trylock. Count is approximate as page counters can change.
2964 pgfree = page_freecnt(mnode, spp, szc);
2966 /* attempt to trylock if there are sufficient already free pages */
2967 if (pgfree < pgcnt/ptcpthreshold) {
2968 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
2969 return (0);
2972 skipptcpcheck:
2974 for (i = 0; i < pgcnt; i++) {
2975 pp = &spp[i];
2976 if (!page_trylock(pp, SE_EXCL)) {
2977 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
2978 while (--i != (pgcnt_t)-1) {
2979 pp = &spp[i];
2980 ASSERT(PAGE_EXCL(pp));
2981 page_unlock_nocapture(pp);
2983 return (0);
2985 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
2986 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
2987 !PP_ISFREE(pp)) {
2988 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
2989 ASSERT(i == 0);
2990 page_unlock_nocapture(pp);
2991 return (0);
2995 * If a page has been marked non-relocatable or has been
2996 * explicitly locked in memory, we don't want to relocate it;
2997 * unlock the pages and fail the operation.
2999 if (PP_ISNORELOC(pp) ||
3000 pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
3001 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3002 while (i != (pgcnt_t)-1) {
3003 pp = &spp[i];
3004 ASSERT(PAGE_EXCL(pp));
3005 page_unlock_nocapture(pp);
3006 i--;
3008 return (0);
3011 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3012 return (1);
3016 * Claim large page pointed to by 'pp'. 'pp' is the starting set
3017 * of 'szc' constituent pages that had been locked exclusively previously.
3018 * Will attempt to relocate constituent pages in use.
3020 static page_t *
3021 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3023 spgcnt_t pgcnt, npgs, i;
3024 page_t *targpp, *rpp, *hpp;
3025 page_t *replpp = NULL;
3026 page_t *pplist = NULL;
3028 ASSERT(pp != NULL);
3030 pgcnt = page_get_pagecnt(szc);
3031 while (pgcnt) {
3032 ASSERT(PAGE_EXCL(pp));
3033 ASSERT(!PP_ISNORELOC(pp));
3034 if (PP_ISFREE(pp)) {
3036 * If this is a PG_FREE_LIST page then its
3037 * size code can change underneath us due to
3038 * page promotion or demotion. As an optimization
3039 * use page_list_sub_pages() instead of
3040 * page_list_sub().
3042 if (PP_ISAGED(pp)) {
3043 page_list_sub_pages(pp, szc);
3044 if (pp->p_szc == szc) {
3045 return (pp);
3047 ASSERT(pp->p_szc < szc);
3048 npgs = page_get_pagecnt(pp->p_szc);
3049 hpp = pp;
3050 for (i = 0; i < npgs; i++, pp++) {
3051 pp->p_szc = szc;
3053 page_list_concat(&pplist, &hpp);
3054 pgcnt -= npgs;
3055 continue;
3057 ASSERT(!PP_ISAGED(pp));
3058 ASSERT(pp->p_szc == 0);
3059 page_list_sub(pp, PG_CACHE_LIST);
3060 page_hashout(pp, false);
3061 PP_SETAGED(pp);
3062 pp->p_szc = szc;
3063 page_list_concat(&pplist, &pp);
3064 pp++;
3065 pgcnt--;
3066 continue;
3068 npgs = page_get_pagecnt(pp->p_szc);
3071 * page_create_wait freemem accounting done by caller of
3072 * page_get_freelist and not necessary to call it prior to
3073 * calling page_get_replacement_page.
3075 * page_get_replacement_page can call page_get_contig_pages
3076 * to acquire a large page (szc > 0); the replacement must be
3077 * smaller than the contig page size to avoid looping or
3078 * szc == 0 and PGI_PGCPSZC0 is set.
3080 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3081 replpp = page_get_replacement_page(pp, NULL, 0);
3082 if (replpp) {
3083 npgs = page_get_pagecnt(pp->p_szc);
3084 ASSERT(npgs <= pgcnt);
3085 targpp = pp;
3090 * If replacement is NULL or do_page_relocate fails, fail
3091 * coalescing of pages.
3093 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3094 &npgs, NULL) != 0)) {
3096 * Unlock un-processed target list
3098 while (pgcnt--) {
3099 ASSERT(PAGE_EXCL(pp));
3100 page_unlock_nocapture(pp);
3101 pp++;
3104 * Free the processed target list.
3106 while (pplist) {
3107 pp = pplist;
3108 page_sub(&pplist, pp);
3109 ASSERT(PAGE_EXCL(pp));
3110 ASSERT(pp->p_szc == szc);
3111 ASSERT(PP_ISFREE(pp));
3112 ASSERT(PP_ISAGED(pp));
3113 pp->p_szc = 0;
3114 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3115 page_unlock_nocapture(pp);
3118 if (replpp != NULL)
3119 page_free_replacement_page(replpp);
3121 return (NULL);
3123 ASSERT(pp == targpp);
3125 ASSERT(hpp = pp); /* That's right, it's an assignment */
3127 pp += npgs;
3128 pgcnt -= npgs;
3130 while (npgs--) {
3131 ASSERT(PAGE_EXCL(targpp));
3132 ASSERT(!PP_ISFREE(targpp));
3133 ASSERT(!PP_ISNORELOC(targpp));
3134 PP_SETFREE(targpp);
3135 ASSERT(PP_ISAGED(targpp));
3136 ASSERT(targpp->p_szc < szc || (szc == 0 &&
3137 (flags & PGI_PGCPSZC0)));
3138 targpp->p_szc = szc;
3139 targpp = targpp->p_next;
3141 rpp = replpp;
3142 ASSERT(rpp != NULL);
3143 page_sub(&replpp, rpp);
3144 ASSERT(PAGE_EXCL(rpp));
3145 ASSERT(!PP_ISFREE(rpp));
3146 page_unlock_nocapture(rpp);
3148 ASSERT(targpp == hpp);
3149 ASSERT(replpp == NULL);
3150 page_list_concat(&pplist, &targpp);
3152 CHK_LPG(pplist, szc);
3153 return (pplist);
3157 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3158 * page with size code 'szc'. Claiming such a page requires acquiring
3159 * exclusive locks on all constituent pages (page_trylock_contig_pages),
3160 * relocating pages in use and concatenating these constituent pages into a
3161 * large page.
3163 * The page lists do not have such a large page and page_freelist_split has
3164 * already failed to demote larger pages and/or coalesce smaller free pages.
3166 * 'flags' may specify PG_MATCH_COLOR which would limit the search to large
3167 * pages with the same color as 'bin'.
3169 * 'pfnflag' specifies the subset of the pfn range to search.
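/*
 * Worked example of the range trimming (hypothetical numbers): with
 * szcpgcnt == 512, a pfnlo of 0x10203 is rounded up with
 * P2ROUNDUP(0x10203, 512) to 0x10400, and inside each memseg the high
 * end is clipped with P2ALIGN(hi + 1, 512) - 1, so every candidate
 * examined is a whole, naturally aligned 512-page chunk.
 */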
3172 static page_t *
3173 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3174 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3176 struct memseg *mseg;
3177 pgcnt_t szcpgcnt = page_get_pagecnt(szc);
3178 pgcnt_t szcpgmask = szcpgcnt - 1;
3179 pfn_t randpfn;
3180 page_t *pp, *randpp, *endpp;
3181 uint_t colors, ceq_mask;
3182 uint_t color_mask __unused;
3183 pfn_t hi, lo;
3184 uint_t skip;
3185 MEM_NODE_ITERATOR_DECL(it);
3187 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3189 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3191 if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
3192 return (NULL);
3194 ASSERT(szc < mmu_page_sizes);
3196 colors = PAGE_GET_PAGECOLORS(szc);
3197 color_mask = colors - 1;
3198 if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3199 uchar_t ceq = colorequivszc[szc];
3200 uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3202 ASSERT(ceq_dif > 0);
3203 ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3204 } else {
3205 ceq_mask = 0;
3208 ASSERT(bin < colors);
3210 /* clear "non-significant" color bits */
3211 bin &= ceq_mask;
3214 * trim the pfn range to search based on pfnflag. pfnflag is set
3215 * when there have been previous page_get_contig_page failures to
3216 * limit the search.
3218 * The high bit in pfnflag specifies the number of 'slots' in the
3219 * pfn range and the remainder of pfnflag specifies which slot.
3220 * For example, a value of 1010b would mean the second slot of
3221 * the pfn range that has been divided into 8 slots.
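/*
 * Worked example: pfnflag == 0xa (1010b) gives
 *	slots  = 1 << (highbit(0xa) - 1) = 8
 *	slotid = 0xa & (8 - 1)           = 2
 * so with, say, szcpages == 800 and slotlen == howmany(800, 8) == 100,
 * only szc chunks 200-299 of the [pfnlo, pfnhi] range are searched.
 */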
3223 if (pfnflag > 1) {
3224 int slots = 1 << (highbit(pfnflag) - 1);
3225 int slotid = pfnflag & (slots - 1);
3226 pgcnt_t szcpages;
3227 int slotlen;
3229 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
3230 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3231 slotlen = howmany(szcpages, slots);
3232 /* skip if 'slotid' slot is empty */
3233 if (slotid * slotlen >= szcpages)
3234 return (NULL);
3235 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3236 ASSERT(pfnlo < pfnhi);
3237 if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3238 pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
3242 * This routine can be called recursively, so we shouldn't
3243 * acquire a reader lock if a write request is pending. This
3244 * could lead to a deadlock with the DR thread.
3246 * Returning NULL informs the caller that we could not get
3247 * a contig page with the required characteristics.
3250 if (!memsegs_trylock(0))
3251 return (NULL);
3254 * loop through memsegs to look for contig page candidates
3257 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3258 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3259 /* no overlap */
3260 continue;
3263 if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3264 /* mseg too small */
3265 continue;
3267 lo = MAX(pfnlo, mseg->pages_base);
3268 hi = MIN(pfnhi, (mseg->pages_end - 1));
3270 /* round to szcpgcnt boundaries */
3271 lo = P2ROUNDUP(lo, szcpgcnt);
3273 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3274 hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
3276 if (hi <= lo)
3277 continue;
3280 * set lo to point to the pfn for the desired bin. Large
3281 * page sizes may only have a single page color
3283 skip = szcpgcnt;
3284 if (ceq_mask > 0 || interleaved_mnodes) {
3285 /* set lo to point at appropriate color */
3286 if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3287 (interleaved_mnodes &&
3288 PFN_2_MEM_NODE(lo) != mnode)) {
3289 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3290 color_mask, &it);
3292 if (hi <= lo)
3293 /* mseg cannot satisfy color request */
3294 continue;
3297 /* randomly choose a point between lo and hi to begin search */
3299 randpfn = (pfn_t)GETTICK();
3300 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3301 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
3302 if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
3303 if (randpfn != (pfn_t)-1) {
3304 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3305 ceq_mask, color_mask, &it);
3307 if (randpfn >= hi) {
3308 randpfn = lo;
3309 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
3310 &it);
3313 randpp = mseg->pages + (randpfn - mseg->pages_base);
3315 ASSERT(randpp->p_pagenum == randpfn);
3317 pp = randpp;
3318 endpp = mseg->pages + (hi - mseg->pages_base) + 1;
3320 ASSERT(randpp + szcpgcnt <= endpp);
3322 do {
3323 ASSERT(!(pp->p_pagenum & szcpgmask));
3324 ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3326 if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3327 /* pages unlocked by page_claim on failure */
3328 if (page_claim_contig_pages(pp, szc, flags)) {
3329 memsegs_unlock(0);
3330 return (pp);
3334 if (ceq_mask == 0 && !interleaved_mnodes) {
3335 pp += skip;
3336 } else {
3337 pfn_t pfn = pp->p_pagenum;
3339 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3340 ceq_mask, color_mask, &it);
3341 if (pfn == (pfn_t)-1) {
3342 pp = endpp;
3343 } else {
3344 pp = mseg->pages +
3345 (pfn - mseg->pages_base);
3348 if (pp >= endpp) {
3349 /* start from the beginning */
3350 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3351 pp = mseg->pages + (lo - mseg->pages_base);
3352 ASSERT(pp->p_pagenum == lo);
3353 ASSERT(pp + szcpgcnt <= endpp);
3355 } while (pp != randpp);
3357 memsegs_unlock(0);
3358 return (NULL);
3363 * controlling routine that searches through physical memory in an attempt to
3364 * claim a large page, based on the input parameters, from
3365 * the page free lists.
3367 * calls page_geti_contig_pages with an initial pfn range from the mnode
3368 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3369 * that overlap with the kernel cage or do not match the requested page
3370 * color if PG_MATCH_COLOR is set. Since this search is very expensive,
3371 * page_geti_contig_pages may further limit the search range based on
3372 * previous failure counts (pgcpfailcnt[]).
3374 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3375 * pagesize page that satisfies mtype.
3377 page_t *
3378 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3379 uint_t flags)
3381 pfn_t pfnlo, pfnhi; /* contig pages pfn range */
3382 page_t *pp;
3383 pgcnt_t pfnflag = 0; /* no limit on search if 0 */
3385 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3387 /* no allocations from cage */
3388 flags |= PGI_NOCAGE;
3390 MTYPE_START(mnode, mtype, flags);
3391 if (mtype < 0) { /* mnode does not have memory in mtype range */
3392 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3393 return (NULL);
3396 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3398 /* do not limit search and ignore color if hi pri */
3400 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3401 pfnflag = pgcpfailcnt[szc];
3403 /* remove color match to improve chances */
3405 if (flags & PGI_PGCPHIPRI || pfnflag)
3406 flags &= ~PG_MATCH_COLOR;
3408 do {
3409 /* get pfn range based on mnode and mtype */
3410 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3412 ASSERT(pfnhi >= pfnlo);
3414 pp = page_geti_contig_pages(mnode, bin, szc, flags,
3415 pfnlo, pfnhi, pfnflag);
3417 if (pp != NULL) {
3418 pfnflag = pgcpfailcnt[szc];
3419 if (pfnflag) {
3420 /* double the search size */
3421 pgcpfailcnt[szc] = pfnflag >> 1;
3423 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3424 return (pp);
3426 MTYPE_NEXT(mnode, mtype, flags);
3427 } while (mtype >= 0);
3429 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3430 return (NULL);
3433 #if defined(__i386) || defined(__amd64)
3435 * Determine the likelihood of finding/coalescing a szc page.
3436 * Return 0 if the likelihood is small, otherwise return 1.
3438 * For now, be conservative and check only 1g pages and return 0
3439 * if there had been previous coalescing failures and the szc pages
3440 * needed to satisfy request would exhaust most of freemem.
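/*
 * Example of the conservative check (hypothetical numbers): for a 1g
 * size code needing page_get_pagecnt(szc) == 262144 base pages, with
 * throttlefree == 50000, freemem == 300000 and a nonzero
 * pgcpfailcnt[szc], 262144 + 50000 >= 300000 holds and the request is
 * denied (return 0).
 */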
3443 page_chk_freelist(uint_t szc)
3445 pgcnt_t pgcnt;
3447 if (szc <= 1)
3448 return (1);
3450 pgcnt = page_get_pagecnt(szc);
3451 if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
3452 VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
3453 return (0);
3455 VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
3456 return (1);
3458 #endif
3461 * Find the `best' page on the freelist for this (obj,off) (as,vaddr) pair.
3463 * Does its own locking and accounting.
3464 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3465 * pages of the proper color even if there are pages of a different color.
3467 * Finds a page, removes it, THEN locks it.
3470 /*ARGSUSED*/
3471 page_t *
3472 page_get_freelist(struct vmobject *obj, uoff_t off, struct seg *seg,
3473 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3475 struct as *as = seg->s_as;
3476 page_t *pp = NULL;
3477 ulong_t bin;
3478 uchar_t szc;
3479 int mnode;
3480 int mtype;
3481 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3482 lgrp_mnode_cookie_t lgrp_cookie;
3484 page_get_func = page_get_mnode_freelist;
3487 * If we aren't passed a specific lgroup, or passed a freed lgrp
3488 * assume we wish to allocate near to the current thread's home.
3490 if (!LGRP_EXISTS(lgrp))
3491 lgrp = lgrp_home_lgrp();
3493 flags &= ~PG_NORELOC;
3494 flags |= PGI_NOCAGE;
3496 MTYPE_INIT(mtype, obj->vnode, vaddr, flags, size);
3499 * Convert size to page size code.
3501 if ((szc = page_szc(size)) == (uchar_t)-1)
3502 panic("page_get_freelist: illegal page size request");
3503 ASSERT(szc < mmu_page_sizes);
3505 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3507 AS_2_BIN(as, seg, obj->vnode, vaddr, bin, szc);
3509 ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3512 * Try to get a local page first, but try remote if we can't
3513 * get a page of the right color.
3515 pgretry:
3516 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3517 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3518 pp = page_get_func(mnode, bin, mtype, szc, flags);
3519 if (pp != NULL) {
3520 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3521 DTRACE_PROBE4(page__get,
3522 lgrp_t *, lgrp,
3523 int, mnode,
3524 ulong_t, bin,
3525 uint_t, flags);
3526 return (pp);
3529 ASSERT(pp == NULL);
3532 * for non-SZC0 PAGESIZE requests, check cachelist before checking
3533 * remote free lists. Caller expected to call page_get_cachelist which
3534 * will check local cache lists and remote free lists.
3536 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3537 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3538 return (NULL);
3541 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3543 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3545 if (!(flags & PG_LOCAL)) {
3547 * Try to get a non-local freelist page.
3549 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3550 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3551 pp = page_get_func(mnode, bin, mtype, szc, flags);
3552 if (pp != NULL) {
3553 DTRACE_PROBE4(page__get,
3554 lgrp_t *, lgrp,
3555 int, mnode,
3556 ulong_t, bin,
3557 uint_t, flags);
3558 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3559 return (pp);
3562 ASSERT(pp == NULL);
3566 * when the cage is off, chances are page_get_contig_pages() will fail
3567 * to lock a large page chunk; therefore, when the cage is off it is not
3568 * called by default. This can be changed via /etc/system.
3570 * page_get_contig_pages() also called to acquire a base pagesize page
3571 * for page_create_get_something().
3573 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3574 (pg_lpgcreate_nocage || szc == 0) &&
3575 (page_get_func != page_get_contig_pages)) {
3577 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3578 page_get_func = page_get_contig_pages;
3579 goto pgretry;
3582 if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3583 page_get_func == page_get_contig_pages)
3584 SETPGCPFAILCNT(szc);
3586 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3587 return (NULL);
3591 * Find the `best' page on the cachelist for this (obj,off) (as,vaddr) pair.
3593 * Does its own locking.
3594 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3595 * pages of the proper color even if there are pages of a different color.
3596 * Otherwise, scan the bins for ones with pages. For each bin with pages,
3597 * try to lock one of them. If no page can be locked, try the
3598 * next bin. Return NULL if a page can not be found and locked.
3600 * Finds a page, tries to lock it, then removes it.
3603 /*ARGSUSED*/
3604 struct page *
3605 page_get_cachelist(struct vmobject *obj, uoff_t off, struct seg *seg,
3606 caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3608 page_t *pp;
3609 struct as *as = seg->s_as;
3610 ulong_t bin;
3611 int mnode;
3612 int mtype;
3613 lgrp_mnode_cookie_t lgrp_cookie;
3616 * If we aren't passed a specific lgroup, or passed a freed lgrp
3617 * assume we wish to allocate near to the current thread's home.
3619 if (!LGRP_EXISTS(lgrp))
3620 lgrp = lgrp_home_lgrp();
3622 flags &= ~PG_NORELOC;
3623 flags |= PGI_NOCAGE;
3625 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC)
3626 return (NULL);
3628 AS_2_BIN(as, seg, obj->vnode, vaddr, bin, 0);
3630 ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3632 MTYPE_INIT(mtype, obj->vnode, vaddr, flags, MMU_PAGESIZE);
3634 VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3637 * Try local cachelists first
3639 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3640 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3641 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3642 if (pp != NULL) {
3643 VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3644 DTRACE_PROBE4(page__get,
3645 lgrp_t *, lgrp,
3646 int, mnode,
3647 ulong_t, bin,
3648 uint_t, flags);
3649 return (pp);
3653 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3656 * Try freelists/cachelists that are farther away
3657 * This is our only chance to allocate remote pages for PAGESIZE
3658 * requests.
3660 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3661 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3662 pp = page_get_mnode_freelist(mnode, bin, mtype,
3663 0, flags);
3664 if (pp != NULL) {
3665 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3666 DTRACE_PROBE4(page__get,
3667 lgrp_t *, lgrp,
3668 int, mnode,
3669 ulong_t, bin,
3670 uint_t, flags);
3671 return (pp);
3673 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3674 if (pp != NULL) {
3675 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3676 DTRACE_PROBE4(page__get,
3677 lgrp_t *, lgrp,
3678 int, mnode,
3679 ulong_t, bin,
3680 uint_t, flags);
3681 return (pp);
3685 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3686 return (NULL);
3689 page_t *
3690 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3692 kmutex_t *pcm;
3693 page_t *pp, *first_pp;
3694 uint_t sbin;
3695 int plw_initialized;
3696 page_list_walker_t plw;
3698 VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3700 MTYPE_START(mnode, mtype, flags);
3701 if (mtype < 0) { /* mnode does not have memory in mtype range */
3702 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3703 return (NULL);
3706 try_again:
3708 plw_initialized = 0;
3709 plw.plw_ceq_dif = 1;
3712 * Only hold one cachelist lock at a time, that way we
3713 * can start anywhere and not have to worry about lock
3714 * ordering.
3717 for (plw.plw_count = 0;
3718 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
3719 sbin = bin;
3720 do {
3722 if (!PAGE_CACHELISTS(mnode, bin, mtype))
3723 goto bin_empty_1;
3724 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3725 mutex_enter(pcm);
3726 pp = PAGE_CACHELISTS(mnode, bin, mtype);
3727 if (pp == NULL)
3728 goto bin_empty_0;
3730 first_pp = pp;
3731 VERIFY(pp->p_object);
3732 ASSERT(pp->p_vnode);
3733 ASSERT(PP_ISAGED(pp) == 0);
3734 ASSERT(pp->p_szc == 0);
3735 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3736 while (!page_trylock(pp, SE_EXCL)) {
3737 pp = pp->p_next;
3738 ASSERT(pp->p_szc == 0);
3739 if (pp == first_pp) {
3741 * We have searched the complete list!
3742 * And all of them (might only be one)
3743 * are locked. This can happen since
3744 * these pages can also be found via
3745 * the hash list. When found via the
3746 * hash list, they are locked first,
3747 * then removed. We give up to let the
3748 * other thread run.
3750 pp = NULL;
3751 break;
3753 VERIFY(pp->p_object);
3754 ASSERT(pp->p_vnode);
3755 ASSERT(PP_ISFREE(pp));
3756 ASSERT(PP_ISAGED(pp) == 0);
3757 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
3758 mnode);
3761 if (pp) {
3762 page_t **ppp;
3764 * Found and locked a page.
3765 * Pull it off the list.
3767 ASSERT(mtype == PP_2_MTYPE(pp));
3768 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
3769 page_sub(ppp, pp);
3771 * Subtract counters before releasing pcm mutex
3772 * to avoid a race with page_freelist_coalesce
3773 * and page_freelist_split.
3775 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3776 mutex_exit(pcm);
3777 VERIFY(pp->p_object);
3778 ASSERT(pp->p_vnode);
3779 ASSERT(PP_ISAGED(pp) == 0);
3780 VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
3781 return (pp);
3783 bin_empty_0:
3784 mutex_exit(pcm);
3785 bin_empty_1:
3786 if (plw_initialized == 0) {
3787 page_list_walk_init(0, flags, bin, 0, 1, &plw);
3788 plw_initialized = 1;
3790 /* calculate the next bin with equivalent color */
3791 bin = ADD_MASKED(bin, plw.plw_bin_step,
3792 plw.plw_ceq_mask[0], plw.plw_color_mask);
3793 } while (sbin != bin);
3795 if (plw.plw_ceq_dif > 1)
3796 bin = page_list_walk_next_bin(0, bin, &plw);
3799 MTYPE_NEXT(mnode, mtype, flags);
3800 if (mtype >= 0)
3801 goto try_again;
3803 VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
3804 return (NULL);
3807 #ifdef DEBUG
3808 #define REPL_PAGE_STATS
3809 #endif /* DEBUG */
3811 #ifdef REPL_PAGE_STATS
3812 struct repl_page_stats {
3813 uint_t ngets;
3814 uint_t ngets_noreloc;
3815 uint_t npgr_noreloc;
3816 uint_t nnopage_first;
3817 uint_t nnopage;
3818 uint_t nhashout;
3819 uint_t nnofree;
3820 uint_t nnext_pp;
3821 } repl_page_stats;
3822 #define REPL_STAT_INCR(v) atomic_inc_32(&repl_page_stats.v)
3823 #else /* REPL_PAGE_STATS */
3824 #define REPL_STAT_INCR(v)
3825 #endif /* REPL_PAGE_STATS */
3827 int pgrppgcp;
3830 * The freemem accounting must be done by the caller.
3831 * First we try to get a replacement page of the same size as like_pp;
3832 * if that is not possible, then we just get a set of discontiguous
3833 * PAGESIZE pages.
3835 page_t *
3836 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
3837 uint_t pgrflags)
3839 page_t *like_pp;
3840 page_t *pp, *pplist;
3841 page_t *pl = NULL;
3842 ulong_t bin;
3843 int mnode, page_mnode;
3844 int szc;
3845 spgcnt_t npgs, pg_cnt;
3846 pfn_t pfnum;
3847 int mtype;
3848 int flags = 0;
3849 lgrp_mnode_cookie_t lgrp_cookie;
3850 lgrp_t *lgrp;
3852 REPL_STAT_INCR(ngets);
3853 like_pp = orig_like_pp;
3854 ASSERT(PAGE_EXCL(like_pp));
3856 szc = like_pp->p_szc;
3857 npgs = page_get_pagecnt(szc);
3859 * Now we reset like_pp to the base page_t.
3860 * That way, we won't walk past the end of this 'szc' page.
3862 pfnum = PFN_BASE(like_pp->p_pagenum, szc);
3863 like_pp = page_numtopp_nolock(pfnum);
3864 ASSERT(like_pp->p_szc == szc);
3866 VERIFY0(PP_ISNORELOC(like_pp));
3867 VERIFY0(pgrflags & PGR_NORELOC);
3870 * Kernel pages must always be replaced with the same size
3871 * pages, since we cannot properly handle demotion of kernel
3872 * pages.
3874 if (PP_ISKAS(like_pp))
3875 pgrflags |= PGR_SAMESZC;
3877 MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
3879 while (npgs) {
3880 pplist = NULL;
3881 for (;;) {
3882 pg_cnt = page_get_pagecnt(szc);
3883 bin = PP_2_BIN(like_pp);
3884 ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
3885 ASSERT(pg_cnt <= npgs);
3888 * If an lgroup was specified, try to get the
3889 * page from that lgroup.
3890 * NOTE: Must be careful with code below because
3891 * lgroup may disappear and reappear since there
3892 * is no locking for lgroup here.
3894 if (LGRP_EXISTS(lgrp_target)) {
3896 * Keep local variable for lgroup separate
3897 * from lgroup argument since this code should
3898 * only be exercised when lgroup argument
3899 * exists....
3901 lgrp = lgrp_target;
3903 /* Try the lgroup's freelists first */
3904 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
3905 LGRP_SRCH_LOCAL);
3906 while ((pplist == NULL) &&
3907 (mnode = lgrp_memnode_choose(&lgrp_cookie))
3908 != -1) {
3909 pplist =
3910 page_get_mnode_freelist(mnode, bin,
3911 mtype, szc, flags);
3915 * Now try its cachelists if this is a
3916 * small page. Don't need to do it for
3917 * larger ones since page_freelist_coalesce()
3918 * already failed.
3920 if (pplist != NULL || szc != 0)
3921 break;
3923 /* Now try its cachelists */
3924 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
3925 LGRP_SRCH_LOCAL);
3927 while ((pplist == NULL) &&
3928 (mnode = lgrp_memnode_choose(&lgrp_cookie))
3929 != -1) {
3930 pplist =
3931 page_get_mnode_cachelist(bin, flags,
3932 mnode, mtype);
3934 if (pplist != NULL) {
3935 page_hashout(pplist, false);
3936 PP_SETAGED(pplist);
3937 REPL_STAT_INCR(nhashout);
3938 break;
3940 /* Done looking in this lgroup. Bail out. */
3941 break;
3945 * No lgroup was specified (or the lgroup was removed by
3946 * DR), so just try to get the page as close to
3947 * like_pp's mnode as possible.
3948 * First try the local freelist...
3950 mnode = PP_2_MEM_NODE(like_pp);
3951 pplist = page_get_mnode_freelist(mnode, bin,
3952 mtype, szc, flags);
3953 if (pplist != NULL)
3954 break;
3956 REPL_STAT_INCR(nnofree);
3959 * ...then the local cachelist. Don't need to do it for
3960 * larger pages because page_freelist_coalesce() already
3961 * failed there anyway.
3963 if (szc == 0) {
3964 pplist = page_get_mnode_cachelist(bin, flags,
3965 mnode, mtype);
3966 if (pplist != NULL) {
3967 page_hashout(pplist, false);
3968 PP_SETAGED(pplist);
3969 REPL_STAT_INCR(nhashout);
3970 break;
3974 /* Now try remote freelists */
3975 page_mnode = mnode;
3976 lgrp =
3977 lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
3978 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
3979 LGRP_SRCH_HIER);
3980 while (pplist == NULL &&
3981 (mnode = lgrp_memnode_choose(&lgrp_cookie))
3982 != -1) {
3984 * Skip local mnode.
3986 if ((mnode == page_mnode) ||
3987 (mem_node_config[mnode].exists == 0))
3988 continue;
3990 pplist = page_get_mnode_freelist(mnode,
3991 bin, mtype, szc, flags);
3994 if (pplist != NULL)
3995 break;
3998 /* Now try remote cachelists */
3999 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4000 LGRP_SRCH_HIER);
4001 while (pplist == NULL && szc == 0) {
4002 mnode = lgrp_memnode_choose(&lgrp_cookie);
4003 if (mnode == -1)
4004 break;
4006 * Skip local mnode.
4008 if ((mnode == page_mnode) ||
4009 (mem_node_config[mnode].exists == 0))
4010 continue;
4012 pplist = page_get_mnode_cachelist(bin,
4013 flags, mnode, mtype);
4015 if (pplist != NULL) {
4016 page_hashout(pplist, false);
4017 PP_SETAGED(pplist);
4018 REPL_STAT_INCR(nhashout);
4019 break;
4024 * Break out of while loop under the following cases:
4025 * - If we successfully got a page.
4026 * - If pgrflags specified only returning a specific
4027 * page size and we could not find that page size.
4028 * - If we could not satisfy the request with PAGESIZE
4029 * or larger pages.
4031 if (pplist != NULL || szc == 0)
4032 break;
4034 if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4035 /* try to find contig page */
4037 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4038 LGRP_SRCH_HIER);
4040 while ((pplist == NULL) &&
4041 (mnode =
4042 lgrp_memnode_choose(&lgrp_cookie))
4043 != -1) {
4044 pplist = page_get_contig_pages(
4045 mnode, bin, mtype, szc,
4046 flags | PGI_PGCPHIPRI);
4048 break;
4052 * The correct thing to do here is try the next
4053 * page size down using szc--. Due to a bug
4054 * with the processing of HAT_RELOAD_SHARE
4055 * where the sfmmu_ttecnt arrays of all
4056 * hats sharing an ISM segment don't get updated,
4057 * using intermediate size pages for relocation
4058 * can lead to continuous page faults.
4060 szc = 0;
4063 if (pplist != NULL) {
4064 DTRACE_PROBE4(page__get,
4065 lgrp_t *, lgrp,
4066 int, mnode,
4067 ulong_t, bin,
4068 uint_t, flags);
4070 while (pplist != NULL && pg_cnt--) {
4071 ASSERT(pplist != NULL);
4072 pp = pplist;
4073 page_sub(&pplist, pp);
4074 PP_CLRFREE(pp);
4075 PP_CLRAGED(pp);
4076 page_list_concat(&pl, &pp);
4077 npgs--;
4078 like_pp = like_pp + 1;
4079 REPL_STAT_INCR(nnext_pp);
4081 ASSERT(pg_cnt == 0);
4082 } else {
4083 break;
4087 if (npgs) {
4089 * We were unable to allocate the necessary number
4090 * of pages.
4091 * We need to free up any pl.
4093 REPL_STAT_INCR(nnopage);
4094 page_free_replacement_page(pl);
4095 return (NULL);
4096 } else {
4097 return (pl);
4102 * demote a free large page to its constituent pages
4104 void
4105 page_demote_free_pages(page_t *pp)
4108 int mnode;
4110 ASSERT(pp != NULL);
4111 ASSERT(PAGE_LOCKED(pp));
4112 ASSERT(PP_ISFREE(pp));
4113 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4115 mnode = PP_2_MEM_NODE(pp);
4116 page_freelist_lock(mnode);
4117 if (pp->p_szc != 0) {
4118 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4119 pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4121 page_freelist_unlock(mnode);
4122 ASSERT(pp->p_szc == 0);
4126 * Factor in colorequiv to check additional 'equivalent' bins.
4127 * colorequiv may be set in /etc/system
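/*
 * Worked example (hypothetical tuning): colorequiv == 4 gives
 * sv_a = lowbit(4) - 1 == 2; for a page size with 64 hardware colors,
 * 64 >> 2 is nonzero, so colorequivszc[i] becomes 2 << 4 == 0x20,
 * i.e. the top two color bits are ignored by the page_get routines.
 */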
4129 void
4130 page_set_colorequiv_arr(void)
4132 if (colorequiv > 1) {
4133 int i;
4134 uint_t sv_a = lowbit(colorequiv) - 1;
4136 if (sv_a > 15)
4137 sv_a = 15;
4139 for (i = 0; i < MMU_PAGE_SIZES; i++) {
4140 uint_t colors;
4141 uint_t a = sv_a;
4143 if ((colors = hw_page_array[i].hp_colors) <= 1) {
4144 continue;
4146 while ((colors >> a) == 0)
4147 a--;
4148 if ((a << 4) > colorequivszc[i]) {
4149 colorequivszc[i] = (a << 4);