/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright 2012 Joyent, Inc. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */
/*
 * This file contains common functions to access and manage the page lists.
 * Many of these routines originated from platform dependent modules
 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
 * a platform independent manner.
 *
 * vm/vm_dep.h provides for platform specific support.
 */
#include <sys/types.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/sysmacros.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <sys/vmsystm.h>
#include <sys/memnode.h>
#include <vm/vm_dep.h>
#include <sys/mem_config.h>
#include <sys/callb.h>
#include <sys/dumphdr.h>
extern uint_t	vac_colors;

#define	MAX_PRAGMA_ALIGN	128

/* vm_cpu_data0 for the boot cpu before kmem is initialized */
#if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
#pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0)
#else
#pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0)
#endif
char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];
/*
 * number of page colors equivalent to requested color in page_get routines.
 * If set, keeps large pages intact longer and keeps MPO allocation
 * from the local mnode in favor of acquiring the 'correct' page color from
 * a demoted large page or from a remote mnode.
 */
/*
 * color equivalency mask for each page size.
 * Mask is computed based on cpu L2$ way sizes and colorequiv global.
 * High 4 bits determine the number of high order bits of the color to ignore.
 * Low 4 bits determine the number of low order bits of color to ignore (it's
 * only relevant for hashed index based page coloring).
 */
uchar_t colorequivszc[MMU_PAGE_SIZES];
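
/*
 * Illustrative sketch (not part of the original comment): with the nibble
 * encoding described above, a colorequivszc[] entry of 0x21 would mean
 * "ignore the 2 high order color bits and 1 low order color bit" when
 * deciding whether two colors are equivalent for that page size.
 */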
/*
 * if set, specifies the percentage of large pages that are free from within
 * a large page region before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available so that page_trylock_contig_pages
 * can be more selective.
 */
/*
 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
 * Enabled by default via pgcplimitsearch.
 *
 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
 * bound. This upper bound range guarantees:
 *	- all large page 'slots' will be searched over time
 *	- the minimum (1) large page candidates considered on each pgcp call
 *	- count doesn't wrap around to 0
 */
pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
int	pgcplimitsearch = 1;

#define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
#define	SETPGCPFAILCNT(szc)						\
	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
		pgcpfailcnt[szc] = PGCPFAILMAX / 2;
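
/*
 * Worked example (illustrative, not from the original source): highbit()
 * returns the 1-based position of the highest set bit, so PGCPFAILMAX is the
 * largest power of two that is <= physinstalled, which is always >= 1/2 of
 * installed memory.  E.g. for physinstalled == 0x180000 pages, highbit()
 * returns 21 and PGCPFAILMAX == 0x100000.  SETPGCPFAILCNT then lets
 * pgcpfailcnt[szc] climb to that bound and resets it to half the bound, so
 * the count never wraps to 0 and searches are never shut off completely.
 */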
#ifdef VM_STATS
struct vmm_vmstats_str	vmm_vmstats;
#endif	/* VM_STATS */

/* enable page_get_contig_pages */
#define	LPGCREATE	1
int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;

/*
 * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
 */
#define	PFNNULL		0

/* Flags involved in promotion and demotion routines */
#define	PC_FREE		0x1	/* put page on freelist */
#define	PC_ALLOC	0x2	/* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 */
#define	PC_NO_COLOR	(-1)

/* mtype value for page_promote to use when mtype does not matter */
#define	PC_MTYPE_ANY	(-1)
/*
 * page counters candidates info
 * See page_ctrs_cands comment below for more details.
 * fields are as follows:
 *	pcc_pages_free:		# pages which freelist coalesce can create
 *	pcc_color_free:		pointer to page free counts per color
 */
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;
	pgcnt_t	*pcc_color_free;
} pcc_info_t;
/*
 * On big machines it can take a long time to check page_counters
 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 * updated sum of all elements of the corresponding page_counters arrays.
 * page_freelist_coalesce() searches page_counters only if an appropriate
 * element of page_ctrs_cands array is greater than 0.
 *
 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
 */
pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), and region size (r)
 */
#define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
	int i;								\
	val = 0;							\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
	}								\
}

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), region size (r), and color (c)
 */
#define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
	int i;								\
	val = 0;							\
	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val +=							\
		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
	}								\
}
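
/*
 * Usage sketch (mirrors page_freelist_coalesce_all() below, shown here only
 * for illustration): the summary counters let callers skip scanning the much
 * larger page_counters arrays when nothing can be coalesced, e.g.
 *
 *	pgcnt_t cands;
 *	PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
 *	if (cands == 0)
 *		continue;	-- no candidate regions at size r, skip scan
 */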
/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 */
static kmutex_t	*ctr_mutex[NPC_MUTEX];
#define	PP_CTR_LOCK_INDX(pp)						\
	(((pp)->p_pagenum >>						\
	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
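
/*
 * Worked example (illustrative values, not from the original source):
 * assuming PAGE_BSZS_SHIFT(mmu_page_sizes - 1) == 8 (the largest page spans
 * 256 base pages) and NPC_MUTEX == 4, a page with p_pagenum 0x12345 maps to
 * lock index (0x12345 >> 8) & 3 == 3.  Every page within the same
 * largest-size region shares that index, so a single ctr_mutex serializes
 * all counter updates for that physical range.
 */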
#define	INVALID_COLOR	0xffffffff
#define	INVALID_MASK	0xffffffff

/*
 * Local function prototypes.
 */

void page_ctr_add(int, int, page_t *, int);
void page_ctr_add_internal(int, int, page_t *, int);
void page_ctr_sub(int, int, page_t *, int);
void page_ctr_sub_internal(int, int, page_t *, int);
void page_freelist_lock(int);
void page_freelist_unlock(int);
page_t *page_promote(int, pfn_t, uchar_t, int, int);
page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
page_t *page_freelist_split(uchar_t,
    uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
static int page_trylock_cons(page_t *pp, se_t se);
/*
 * The page_counters array below is used to keep track of free contiguous
 * physical memory. A hw_page_map_t will be allocated per mnode per szc.
 * This contains an array of counters, the size of the array, a shift value
 * used to convert a pagenum into a counter array index or vice versa, as
 * well as a cache of the last successful index to be promoted to a larger
 * page size. As an optimization, we keep track of the last successful index
 * to be promoted per page color for the given size region, and this is
 * allocated dynamically based upon the number of colors for a given
 * page size.
 *
 * Conceptually, the page counters are represented as:
 *
 *	page_counters[region_size][mnode]
 *
 *	region_size:	size code of a candidate larger page made up
 *			of contiguous free smaller pages.
 *
 *	page_counters[region_size][mnode].hpm_counters[index]:
 *		represents how many (region_size - 1) pages either
 *		exist or can be created within the given index range.
 *
 * Let's look at a sparc example:
 *	If we want to create a free 512k page, we look at region_size 2
 *	for the mnode we want. We calculate the index and look at a specific
 *	hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at
 *	this location, it means that 8 64k pages either exist or can be created
 *	from 8K pages in order to make a single free 512k page at the given
 *	index. Note that when a region is full, it will contribute to the
 *	counts in the region above it. Thus we will not know what page
 *	size the free pages will be which can be promoted to this new free
 *	page unless we look at all regions below the current region.
 */
/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic. The fields are as follows:
 *
 *	hpm_counters:	dynamically allocated array to hold counter data
 *	hpm_entries:	entries in hpm_counters
 *	hpm_shift:	shift for pnum/array index conv
 *	hpm_base:	PFN mapped to counter index 0
 *	hpm_color_current:	last index in counter array for this color at
 *				which we successfully created a large page
 */
typedef struct hw_page_map {
	hpmctr_t	*hpm_counters;
	size_t		hpm_entries;
	uint_t		hpm_shift;
	pfn_t		hpm_base;
	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
} hw_page_map_t;
/*
 * Element zero is not used, but is allocated for convenience.
 */
static hw_page_map_t *page_counters[MMU_PAGE_SIZES];

/*
 * Cached value of MNODE_RANGE_CNT(mnode).
 * This is a function call in x86.
 */
static int mnode_nranges[MAX_MEM_NODES];
static int mnode_maxmrange[MAX_MEM_NODES];
/*
 * The following macros are convenient ways to get access to the individual
 * elements of the page_counters arrays. They can be used on both
 * the left side and right side of equations.
 */
#define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])

#define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters)

#define	PAGE_COUNTERS_SHIFT(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_shift)

#define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_entries)

#define	PAGE_COUNTERS_BASE(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_base)

#define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)	\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])

#define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
	(page_counters[(rg_szc)][(mnode)].				\
	    hpm_color_current[(mrange)][(color)])

#define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
	    PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))

#define	IDX_TO_PNUM(mnode, rg_szc, index)			\
	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
	    ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
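
/*
 * Worked example (illustrative values, not from the original source):
 * with PAGE_COUNTERS_BASE == 0x80000 and PAGE_COUNTERS_SHIFT == 3 (regions
 * of 8 base pages), pfn 0x80017 maps to idx (0x80017 - 0x80000) >> 3 == 2,
 * and IDX_TO_PNUM(2) == 0x80010, the first pfn of that region.  The round
 * trip is only an identity for region-aligned pfns, which is why the
 * ASSERTs in page_ctrs_alloc()/page_ctrs_adjust() check index 0 and the
 * aligned base rather than arbitrary pfns.
 */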
/*
 * Protects the hpm_counters and hpm_color_current memory from changing while
 * looking at page counters information.
 * Grab the write lock to modify what these fields point at.
 * Grab the read lock to prevent any pointers from changing.
 * The write lock can not be held during memory allocation due to a possible
 * recursion deadlock with trying to grab the read lock while the
 * write lock is already held.
 */
krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
/*
 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
 */
void
cpu_vm_data_init(struct cpu *cp)
{
	if (cp == CPU0) {
		cp->cpu_vm_data = (void *)&vm_cpu_data0;
	} else {
		void	*kmptr;
		int	align;
		size_t	sz;

		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
		kmptr = kmem_zalloc(sz, KM_SLEEP);
		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
	}
}
void
cpu_vm_data_destroy(struct cpu *cp)
{
	if (cp->cpu_seqid && cp->cpu_vm_data) {
		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
	}
	cp->cpu_vm_data = NULL;
}
/*
 * page size to page size code
 */
int
page_szc(size_t pagesize)
{
	int	i = 0;

	while (hw_page_array[i].hp_size) {
		if (pagesize == hw_page_array[i].hp_size)
			return (i);
		i++;
	}
	return (-1);
}
/*
 * page size to page size code with the restriction that it be a supported
 * user page size. If it's not a supported user page size, -1 will be returned.
 */
int
page_szc_user_filtered(size_t pagesize)
{
	int szc = page_szc(pagesize);

	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
		return (szc);
	}
	return (-1);
}
/*
 * Return how many page sizes are available for the user to use. This is
 * what the hardware supports and not based upon how the OS implements the
 * support of different page sizes.
 *
 * If legacy is non-zero, return the number of pagesizes available to legacy
 * applications. The number of legacy page sizes might be less than the
 * exported user page sizes. This is to prevent legacy applications that
 * use the largest page size returned from getpagesizes(3c) from inadvertently
 * using the 'new' large pagesizes.
 */
uint_t
page_num_user_pagesizes(int legacy)
{
	if (legacy)
		return (mmu_legacy_page_sizes);
	return (mmu_exported_page_sizes);
}

uint_t
page_num_pagesizes(void)
{
	return (mmu_page_sizes);
}
/*
 * returns the count of the number of base pagesize pages associated with szc
 */
pgcnt_t
page_get_pagecnt(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecnt: out of range %d", szc);
	return (hw_page_array[szc].hp_pgcnt);
}

size_t
page_get_pagesize(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}
/*
 * Return the size of a page based upon the index passed in. An index of
 * zero refers to the smallest page size in the system, and as index increases
 * it refers to the next larger supported page size in the system.
 * Note that szc and userszc may not be the same due to unsupported szc's on
 * some systems.
 */
size_t
page_get_user_pagesize(uint_t userszc)
{
	uint_t szc = USERSZC_2_SZC(userszc);

	if (szc >= mmu_page_sizes)
		panic("page_get_user_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}

uint_t
page_get_shift(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_shift: out of range %d", szc);
	return (PAGE_GET_SHIFT(szc));
}

uint_t
page_get_pagecolors(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecolors: out of range %d", szc);
	return (PAGE_GET_PAGECOLORS(szc));
}
/*
 * this assigns the desired equivalent color after a split
 */
uint_t
page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
    uint_t ncolor, uint_t ceq_mask)
{
	ASSERT(szc < mmu_page_sizes);
	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));

	color &= ceq_mask;
	ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
	return (color | (ncolor & ~ceq_mask));
}
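
/*
 * Worked example (illustrative values, not from the original source, and
 * treating the platform specific PAGE_CONVERT_COLOR() as already applied):
 * with ceq_mask == 0x3, a masked requested color of 0x1 and a converted
 * parent color ncolor of 0xc, the result is 0x1 | (0xc & ~0x3) == 0xd:
 * the equivalence bits come from the requested color and the remaining
 * bits are inherited from the larger page being split.
 */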
/*
 * The interleaved_mnodes flag is set when mnodes overlap in
 * the physbase..physmax range, but have disjoint slices.
 * In this case hpm_counters is shared by all mnodes.
 * This flag is set dynamically by the platform.
 */
int interleaved_mnodes = 0;
/*
 * Called by startup().
 * Size up the per page size free list counters based on physmax
 * of each node and max_mem_nodes.
 *
 * If interleaved_mnodes is set we need to find the first mnode that
 * exists. hpm_counters for the first mnode will then be shared by
 * all other mnodes. If interleaved_mnodes is not set, just set
 * first=mnode each time. That means there will be no sharing.
 */
size_t
page_ctrs_sz(void)
{
	int	r;		/* region size */
	int	mnode;
	int	firstmn;	/* first mnode that exists */
	int	nranges;
	int	i;
	pfn_t	physbase;
	pfn_t	physmax;
	size_t	ctrs_sz = 0;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
	}

	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t	r_base;
		pgcnt_t r_align;

		if (mem_node_config[mnode].exists == 0)
			continue;

		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
		nranges = MNODE_RANGE_CNT(mnode);
		mnode_nranges[mnode] = nranges;
		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);

		/*
		 * determine size needed for page counter arrays with
		 * base aligned to large page size.
		 */
		for (r = 1; r < mmu_page_sizes; r++) {
			/* add in space for hpm_color_current */
			ctrs_sz += sizeof (size_t) *
			    colors_per_szc[r] * nranges;

			if (firstmn != mnode)
				continue;

			/* add in space for hpm_counters */
			r_align = page_get_pagecnt(r);
			r_base = physbase;
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(physmax - r_base + 1, r_align);

			/*
			 * Round up to always allocate on pointer sized
			 * boundaries.
			 */
			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
			    sizeof (hpmctr_t *));
		}
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* add in space for page_ctrs_cands and pcc_color_free */
	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
	    mmu_page_sizes * NPC_MUTEX;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {

		if (mem_node_config[mnode].exists == 0)
			continue;

		nranges = mnode_nranges[mnode];
		ctrs_sz += sizeof (pcc_info_t) * nranges *
		    mmu_page_sizes * NPC_MUTEX;
		for (r = 1; r < mmu_page_sizes; r++) {
			ctrs_sz += sizeof (pgcnt_t) * nranges *
			    colors_per_szc[r] * NPC_MUTEX;
		}
	}

	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));

	/* size for page list counts */
	PLCNT_SZ(ctrs_sz);

	/*
	 * add some slop for roundups. page_ctrs_alloc will roundup the start
	 * address of the counters to ecache_alignsize boundary for every
	 * memory node.
	 */
	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
}
caddr_t
page_ctrs_alloc(caddr_t alloc_base)
{
	int	mnode;
	int	mrange, nranges;
	int	r;		/* region size */
	int	i;
	int	firstmn;	/* first mnode that exists */
	pfn_t	physbase;
	pfn_t	physmax;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		page_counters[r] = (hw_page_map_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* page_ctrs_cands and pcc_color_free array */
	for (i = 0; i < NPC_MUTEX; i++) {
		for (r = 1; r < mmu_page_sizes; r++) {

			pcc_info_t *pi;

			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;

			for (mnode = 0; mnode < max_mem_nodes; mnode++) {

				if (mem_node_config[mnode].exists == 0)
					continue;

				nranges = mnode_nranges[mnode];

				pi = (pcc_info_t *)alloc_base;
				alloc_base += sizeof (pcc_info_t) * nranges;
				page_ctrs_cands[i][r][mnode] = pi;

				for (mrange = 0; mrange < nranges; mrange++) {
					pi->pcc_color_free =
					    (pgcnt_t *)alloc_base;
					alloc_base += sizeof (pgcnt_t) *
					    colors_per_szc[r];
					pi++;
				}
			}
		}
	}

	for (i = 0; i < NPC_MUTEX; i++) {
		ctr_mutex[i] = (kmutex_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
	}

	/* initialize page list counts */
	PLCNT_INIT(alloc_base);

	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t	r_base;
		pgcnt_t r_align;
		int	r_shift;
		int	nranges = mnode_nranges[mnode];

		if (mem_node_config[mnode].exists == 0)
			continue;

		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);

		for (r = 1; r < mmu_page_sizes; r++) {
			/*
			 * the page_counters base has to be aligned to the
			 * page count of page size code r otherwise the counts
			 * will cross large page boundaries.
			 */
			r_align = page_get_pagecnt(r);
			r_base = physbase;
			/* base needs to be aligned - lower to aligned value */
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(physmax - r_base + 1, r_align);
			r_shift = PAGE_BSZS_SHIFT(r);

			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
			PAGE_COUNTERS_BASE(mnode, r) = r_base;
			for (mrange = 0; mrange < nranges; mrange++) {
				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
				    r, mrange) = (size_t *)alloc_base;
				alloc_base += sizeof (size_t) *
				    colors_per_szc[r];
			}
			for (i = 0; i < colors_per_szc[r]; i++) {
				uint_t color_mask = colors_per_szc[r] - 1;
				pfn_t  pfnum = r_base;
				size_t idx;
				MEM_NODE_ITERATOR_DECL(it);

				MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
				if (pfnum == (pfn_t)-1) {
					idx = 0;
				} else {
					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
					    color_mask, color_mask, &it);
					idx = PNUM_TO_IDX(mnode, r, pfnum);
					idx = (idx >= r_pgcnt) ? 0 : idx;
				}
				for (mrange = 0; mrange < nranges; mrange++) {
					PAGE_COUNTERS_CURRENT_COLOR(mnode,
					    r, i, mrange) = idx;
				}
			}

			/* hpm_counters may be shared by all mnodes */
			if (firstmn == mnode) {
				PAGE_COUNTERS_COUNTERS(mnode, r) =
				    (hpmctr_t *)alloc_base;
				alloc_base +=
				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
				    sizeof (hpmctr_t *));
			} else {
				PAGE_COUNTERS_COUNTERS(mnode, r) =
				    PAGE_COUNTERS_COUNTERS(firstmn, r);
			}

			/*
			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
			 * satisfy the identity requirement.
			 * We should be able to go from one to the other
			 * and get consistent values.
			 */
			ASSERT(PNUM_TO_IDX(mnode, r,
			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
			ASSERT(IDX_TO_PNUM(mnode, r,
			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
		}
		/*
		 * Roundup the start address of the page_counters to
		 * cache aligned boundary for every memory node.
		 * page_ctrs_sz() has added some slop for these roundups.
		 */
		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
		    L2CACHE_ALIGN);
	}

	/* Initialize other page counter specific data structures. */
	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
	}

	return (alloc_base);
}
/*
 * Functions to adjust region counters for each size free list.
 * Caller is responsible to acquire the ctr_mutex lock if necessary and
 * thus can be called during startup without locks.
 */
void
page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
{
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;
	int		lckidx;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Increment the count of free pages for the current
	 * region. Continue looping up in region size incrementing
	 * count if the preceding region is full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));

		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
			break;
		} else {
			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
			    [MTYPE_2_MRANGE(mnode, root_mtype)];

			cand->pcc_pages_free++;
			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
		}
		r++;
	}
}

void
page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_add_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}
void
page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
{
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;
	int		lckidx;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Decrement the count of free pages for the current
	 * region. Continue looping up in region size decrementing
	 * count if the preceding region was full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);

		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
			break;
		} else {
			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
			    [MTYPE_2_MRANGE(mnode, root_mtype)];

			ASSERT(cand->pcc_pages_free != 0);
			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);

			cand->pcc_pages_free--;
			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
		}
		r++;
	}
}

void
page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_sub_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}
/*
 * Adjust page counters following a memory attach, since typically the
 * size of the array needs to change, and the PFN to counter index
 * mapping needs to change.
 *
 * It is possible this mnode did not exist at startup. In that case
 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
 * to change (a theoretical possibility on x86), which means pcc_color_free
 * arrays must be extended.
 */
int
page_ctrs_adjust(int mnode)
{
	pgcnt_t npgs;
	int	r;		/* region size */
	int	i;
	int	oldmnode;
	size_t	pcsz, old_csz;
	hpmctr_t *new_ctr, *old_ctr;
	pfn_t	oldbase, newbase;
	pfn_t	physbase, physmax;
	pgcnt_t	old_npgs;
	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
	size_t	size_cache[MMU_PAGE_SIZES];
	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	size_t	*old_color_array[MAX_MNODE_MRANGES];
	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
	pcc_info_t **cands_cache;
	pcc_info_t *old_pi, *pi;
	pgcnt_t *pgcntp;
	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
	int cands_cache_nranges;
	int old_maxmrange, new_maxmrange;
	int rc = 0;

	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
	    MMU_PAGE_SIZES, KM_NOSLEEP);
	if (cands_cache == NULL)
		return (ENOMEM);

	i = -1;
	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);

	newbase = physbase & ~PC_BASE_ALIGN_MASK;
	npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;

	/* prepare to free non-null pointers on the way out */
	cands_cache_nranges = nranges;
	bzero(ctr_cache, sizeof (ctr_cache));
	bzero(color_cache, sizeof (color_cache));

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (r = 0; r < mmu_page_sizes; r++) {
		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
	}

	/*
	 * Preallocate all of the new hpm_counters arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
		size_cache[r] = pcsz;
		ctr_cache[r] = kmem_zalloc(pcsz *
		    sizeof (hpmctr_t), KM_NOSLEEP);
		if (ctr_cache[r] == NULL) {
			rc = ENOMEM;
			goto cleanup;
		}
	}

	/*
	 * Preallocate all of the new color current arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (mrange = 0; mrange < nranges; mrange++) {
			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
			    colors_per_szc[r], KM_NOSLEEP);
			if (color_cache[r][mrange] == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
		}
	}

	/*
	 * Preallocate all of the new pcc_info_t arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
			    KM_NOSLEEP);
			if (pi == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
			cands_cache[i * MMU_PAGE_SIZES + r] = pi;

			for (mrange = 0; mrange < nranges; mrange++, pi++) {
				pgcntp = kmem_zalloc(colors_per_szc[r] *
				    sizeof (pgcnt_t), KM_NOSLEEP);
				if (pgcntp == NULL) {
					rc = ENOMEM;
					goto cleanup;
				}
				pi->pcc_color_free = pgcntp;
			}
		}
	}

	/*
	 * Grab the write lock to prevent others from walking these arrays
	 * while we are modifying them.
	 */
	PAGE_CTRS_WRITE_LOCK(mnode);

	/*
	 * For interleaved mnodes, find the first mnode
	 * with valid page counters since the current
	 * mnode may have just been added and not have
	 * valid page counters.
	 */
	if (interleaved_mnodes) {
		for (i = 0; i < max_mem_nodes; i++)
			if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
				break;
		ASSERT(i < max_mem_nodes);
		oldmnode = i;
	} else
		oldmnode = mnode;

	old_nranges = mnode_nranges[mnode];
	cands_cache_nranges = old_nranges;
	mnode_nranges[mnode] = nranges;
	old_maxmrange = mnode_maxmrange[mnode];
	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
	new_maxmrange = mnode_maxmrange[mnode];

	for (r = 1; r < mmu_page_sizes; r++) {
		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
		old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
		old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
		oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			old_color_array[mrange] =
			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
			    r, mrange);
		}

		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
		new_ctr = ctr_cache[r];
		ctr_cache[r] = NULL;
		if (old_ctr != NULL &&
		    (oldbase + old_npgs > newbase) &&
		    (newbase + npgs > oldbase)) {
			/*
			 * Map the intersection of the old and new
			 * counters into the new array.
			 */
			size_t offset;

			if (newbase > oldbase) {
				offset = (newbase - oldbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr + offset, new_ctr,
				    MIN(pcsz, (old_csz - offset)) *
				    sizeof (hpmctr_t));
			} else {
				offset = (oldbase - newbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr, new_ctr + offset,
				    MIN(pcsz - offset, old_csz) *
				    sizeof (hpmctr_t));
			}
		}

		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
		PAGE_COUNTERS_BASE(mnode, r) = newbase;

		/* update shared hpm_counters in other mnodes */
		if (interleaved_mnodes) {
			for (i = 0; i < max_mem_nodes; i++) {
				if ((i == mnode) ||
				    (mem_node_config[i].exists == 0))
					continue;
				ASSERT(
				    PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
				    PAGE_COUNTERS_COUNTERS(i, r) == NULL);
				PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
				PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
				PAGE_COUNTERS_BASE(i, r) = newbase;
			}
		}

		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
			    color_cache[r][mrange];
			color_cache[r][mrange] = NULL;
		}

		/*
		 * for now, just reset on these events as it's probably
		 * not worthwhile to try and optimize this.
		 */
		for (i = 0; i < colors_per_szc[r]; i++) {
			uint_t color_mask = colors_per_szc[r] - 1;
			int mlo = interleaved_mnodes ? 0 : mnode;
			int mhi = interleaved_mnodes ? max_mem_nodes :
			    (mnode + 1);
			int m;
			pfn_t  pfnum;
			size_t idx;
			MEM_NODE_ITERATOR_DECL(it);

			for (m = mlo; m < mhi; m++) {
				if (mem_node_config[m].exists == 0)
					continue;
				pfnum = newbase;
				MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
				if (pfnum == (pfn_t)-1) {
					idx = 0;
				} else {
					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
					    color_mask, color_mask, &it);
					idx = PNUM_TO_IDX(m, r, pfnum);
					idx = (idx < pcsz) ? idx : 0;
				}
				for (mrange = 0; mrange < nranges; mrange++) {
					if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
					    r, mrange) != NULL) {
						PAGE_COUNTERS_CURRENT_COLOR(m,
						    r, i, mrange) = idx;
					}
				}
			}
		}

		/* cache info for freeing out of the critical path */
		if ((caddr_t)old_ctr >= kernelheap &&
		    (caddr_t)old_ctr < ekernelheap) {
			ctr_cache[r] = old_ctr;
			size_cache[r] = old_csz;
		}
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			size_t *tmp = old_color_array[mrange];
			if ((caddr_t)tmp >= kernelheap &&
			    (caddr_t)tmp < ekernelheap) {
				color_cache[r][mrange] = tmp;
			}
		}
		/*
		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
		 * satisfy the identity requirement.
		 * We should be able to go from one to the other
		 * and get consistent values.
		 */
		ASSERT(PNUM_TO_IDX(mnode, r,
		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
		ASSERT(IDX_TO_PNUM(mnode, r,
		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);

		/* pcc_info_t and pcc_color_free */
		for (i = 0; i < NPC_MUTEX; i++) {
			pcc_info_t *epi;
			pcc_info_t *eold_pi;

			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			old_pi = page_ctrs_cands[i][r][mnode];
			page_ctrs_cands[i][r][mnode] = pi;
			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;

			/* preserve old pcc_color_free values, if any */
			if (old_pi == NULL)
				continue;

			/*
			 * when/if x86 does DR, must account for
			 * possible change in range index when
			 * preserving pcc_info
			 */
			epi = &pi[nranges];
			eold_pi = &old_pi[old_nranges];
			if (new_maxmrange > old_maxmrange) {
				pi += new_maxmrange - old_maxmrange;
			} else if (new_maxmrange < old_maxmrange) {
				old_pi += old_maxmrange - new_maxmrange;
			}
			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
				pcc_info_t tmp = *pi;
				*pi = *old_pi;
				*old_pi = tmp;
			}
		}
	}
	PAGE_CTRS_WRITE_UNLOCK(mnode);

	/*
	 * Now that we have dropped the write lock, it is safe to free all
	 * of the memory we have cached above.
	 * We come thru here to free memory when pre-alloc fails, and also to
	 * free old pointers which were recorded while locked.
	 */
cleanup:
	for (r = 1; r < mmu_page_sizes; r++) {
		if (ctr_cache[r] != NULL) {
			kmem_free(ctr_cache[r],
			    size_cache[r] * sizeof (hpmctr_t));
		}
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			if (color_cache[r][mrange] != NULL) {
				kmem_free(color_cache[r][mrange],
				    colors_per_szc[r] * sizeof (size_t));
			}
		}
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			if (pi == NULL)
				continue;
			nr = cands_cache_nranges;
			for (mrange = 0; mrange < nr; mrange++, pi++) {
				pgcntp = pi->pcc_color_free;
				if (pgcntp == NULL)
					continue;
				if ((caddr_t)pgcntp >= kernelheap &&
				    (caddr_t)pgcntp < ekernelheap) {
					kmem_free(pgcntp,
					    colors_per_szc[r] *
					    sizeof (pgcnt_t));
				}
			}
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			if ((caddr_t)pi >= kernelheap &&
			    (caddr_t)pi < ekernelheap) {
				kmem_free(pi, nr * sizeof (pcc_info_t));
			}
		}
	}

	kmem_free(cands_cache,
	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
	return (rc);
}
/*
 * Cleanup the hpm_counters field in the page counters
 * array.
 */
void
page_ctrs_cleanup(void)
{
	int r;	/* region size */
	int i;	/* mnode index */

	/*
	 * Get the page counters write lock while we are
	 * setting the page hpm_counters field to NULL
	 * for non-existent mnodes.
	 */
	for (i = 0; i < max_mem_nodes; i++) {
		PAGE_CTRS_WRITE_LOCK(i);
		if (mem_node_config[i].exists) {
			PAGE_CTRS_WRITE_UNLOCK(i);
			continue;
		}
		for (r = 1; r < mmu_page_sizes; r++) {
			PAGE_COUNTERS_COUNTERS(i, r) = NULL;
		}
		PAGE_CTRS_WRITE_UNLOCK(i);
	}
}
/*
 * confirm pp is a large page corresponding to szc
 */
void
chk_lpg(page_t *pp, uchar_t szc)
{
	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
	uint_t noreloc;

	if (npgs == 1) {
		ASSERT(pp->p_szc == 0);
		ASSERT(pp->p_next == pp);
		ASSERT(pp->p_prev == pp);
		return;
	}

	ASSERT(pp->p_list.largepg.next == pp || pp->p_list.largepg.next == NULL);
	ASSERT(pp->p_list.largepg.prev == pp || pp->p_list.largepg.prev == NULL);

	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
	ASSERT(pp->p_prev == (pp + (npgs - 1)));

	/*
	 * Check list of pages.
	 */
	noreloc = PP_ISNORELOC(pp);
	while (npgs--) {
		if (npgs != 0) {
			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
			ASSERT(pp->p_next == (pp + 1));
		}
		ASSERT(pp->p_szc == szc);
		ASSERT(PP_ISFREE(pp));
		ASSERT(PP_ISAGED(pp));
		ASSERT(pp->p_list.largepg.next == pp || pp->p_list.largepg.next == NULL);
		ASSERT(pp->p_list.largepg.prev == pp || pp->p_list.largepg.prev == NULL);
		VERIFY(pp->p_object == NULL);
		ASSERT(pp->p_vnode == NULL);
		ASSERT(PP_ISNORELOC(pp) == noreloc);

		pp = pp->p_next;
	}
}
void
page_freelist_lock(int mnode)
{
	int i;

	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_enter(FPC_MUTEX(mnode, i));
		mutex_enter(CPC_MUTEX(mnode, i));
	}
}

void
page_freelist_unlock(int mnode)
{
	int i;

	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_exit(FPC_MUTEX(mnode, i));
		mutex_exit(CPC_MUTEX(mnode, i));
	}
}
/*
 * add pp to the specified page list. Defaults to head of the page list
 * unless PG_LIST_TAIL is specified.
 */
void
page_list_add(page_t *pp, int flags)
{
	page_t		**ppp;
	kmutex_t	*pcm;
	uint_t		bin, mtype;
	int		mnode;

	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
	ASSERT(PP_ISFREE(pp));
	ASSERT(!hat_page_is_mapped(pp));
	ASSERT(hat_page_getshare(pp) == 0);

	/*
	 * Large pages should be freed via page_list_add_pages().
	 */
	ASSERT(pp->p_szc == 0);

	/*
	 * Don't need to lock the freelist first here
	 * because the page isn't on the freelist yet.
	 * This means p_szc can't change on us.
	 */

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		/*
		 * PG_LIST_ISINIT is set during system startup (ie. single
		 * threaded), add a page to the free list and add to the
		 * free region counters w/o any locking
		 */
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		/* inline version of page_add() */
		if (*ppp != NULL) {
			pp->p_next = *ppp;
			pp->p_prev = (*ppp)->p_prev;
			(*ppp)->p_prev = pp;
			pp->p_prev->p_next = pp;
		} else
			*ppp = pp;

		page_ctr_add_internal(mnode, mtype, pp, flags);
		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
	} else {
		pcm = PC_BIN_MUTEX(mnode, bin, flags);

		if (flags & PG_FREE_LIST) {
			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
			ASSERT(PP_ISAGED(pp));
			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
		} else {
			VM_STAT_ADD(vmm_vmstats.pladd_cache);
			VERIFY(pp->p_object);
			ASSERT(pp->p_vnode);
			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		}

		mutex_enter(pcm);
		page_add(ppp, pp);

		if (flags & PG_LIST_TAIL)
			*ppp = (*ppp)->p_next;
		/*
		 * Add counters before releasing pcm mutex to avoid a race with
		 * page_freelist_coalesce and page_freelist_split.
		 */
		page_ctr_add(mnode, mtype, pp, flags);
		mutex_exit(pcm);
	}

	/*
	 * It is up to the caller to unlock the page!
	 */
	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
}

void
page_list_noreloc_startup(page_t *pp)
{
	panic("page_list_noreloc_startup: should be here only for sparc");
}
void
page_list_add_pages(page_t *pp, int flags)
{
	kmutex_t *pcm;
	pgcnt_t	pgcnt;
	uint_t	bin, mtype, i;
	int	mnode;

	/* default to freelist/head */
	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);

	CHK_LPG(pp, pp->p_szc);
	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		ASSERT(pp->p_szc == mmu_page_sizes - 1);
		page_lpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		ASSERT(!PP_ISNORELOC(pp));
		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
	} else {

		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);

		mutex_enter(pcm);
		page_lpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
		mutex_exit(pcm);

		pgcnt = page_get_pagecnt(pp->p_szc);
		for (i = 0; i < pgcnt; i++, pp++)
			page_unlock_nocapture(pp);
	}
}
/*
 * During boot, need to demote a large page to base
 * pagesize pages for seg_kmem for use in boot_alloc()
 */
void
page_boot_demote(page_t *pp)
{
	ASSERT(pp->p_szc != 0);
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	(void) page_demote(PP_2_MEM_NODE(pp),
	    PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
	    PC_FREE);

	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);
}
/*
 * Take a particular page off of whatever freelist the page
 * is claimed to be on.
 *
 * NOTE: Only used for PAGESIZE pages.
 */
void
page_list_sub(page_t *pp, int flags)
{
	int		bin;
	uint_t		mtype;
	int		mnode;
	kmutex_t	*pcm;
	page_t		**ppp;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));

	/*
	 * The p_szc field can only be changed by page_promote()
	 * and page_demote(). Only free pages can be promoted and
	 * demoted and the free list MUST be locked during these
	 * operations. So to prevent a race in page_list_sub()
	 * between computing which bin of the freelist lock to
	 * grab and actually grabbing the lock we check again that
	 * the bin we locked is still the correct one. Notice that
	 * the p_szc field could have actually changed on us but
	 * if the bin happens to still be the same we are safe.
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, flags);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_FREE_LIST) {
		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
		ASSERT(PP_ISAGED(pp));
		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsub_cache);
		ASSERT(!PP_ISAGED(pp));
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Common PAGESIZE case.
	 *
	 * Note that we locked the freelist. This prevents
	 * any page promotion/demotion operations. Therefore
	 * the p_szc will not change until we drop pcm mutex.
	 */
	if (pp->p_szc == 0) {
		page_sub(ppp, pp);
		/*
		 * Subtract counters before releasing pcm mutex
		 * to avoid race with page_freelist_coalesce.
		 */
		page_ctr_sub(mnode, mtype, pp, flags);
		mutex_exit(pcm);

		return;
	}

	/*
	 * Large pages on the cache list are not supported.
	 */
	if (flags & PG_CACHE_LIST)
		panic("page_list_sub: large page on cachelist");

	/*
	 * Somebody wants this particular page which is part
	 * of a large page. In this case we just demote the page
	 * if it's on the freelist.
	 *
	 * We have to drop pcm before locking the entire freelist.
	 * Once we have re-locked the freelist check to make sure
	 * the page hasn't already been demoted or completely
	 * freed.
	 */
	mutex_exit(pcm);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		/*
		 * Large page is on freelist.
		 */
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
		    0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);

	/*
	 * Subtract counters before releasing pcm mutex
	 * to avoid race with page_freelist_coalesce.
	 */
	bin = PP_2_BIN(pp);
	mtype = PP_2_MTYPE(pp);
	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);

	page_sub(ppp, pp);
	page_ctr_sub(mnode, mtype, pp, flags);
	page_freelist_unlock(mnode);
}
void
page_list_sub_pages(page_t *pp, uint_t szc)
{
	kmutex_t *pcm;
	uint_t	bin, mtype;
	int	mnode;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	/*
	 * See comment in page_list_sub().
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}

	/*
	 * If we're called with a page larger than szc or it got
	 * promoted above szc before we locked the freelist then
	 * drop pcm and re-lock entire freelist. If page still larger
	 * than szc then demote it.
	 */
	if (pp->p_szc > szc) {
		mutex_exit(pcm);
		pcm = NULL;
		page_freelist_lock(mnode);
		if (pp->p_szc > szc) {
			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
			(void) page_demote(mnode,
			    PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
		}
		bin = PP_2_BIN(pp);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc <= szc);
	ASSERT(pp == PP_PAGEROOT(pp));

	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);

	mtype = PP_2_MTYPE(pp);
	if (pp->p_szc != 0) {
		page_lpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		CHK_LPG(pp, pp->p_szc);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
	}
	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

	if (pcm != NULL) {
		mutex_exit(pcm);
	} else {
		page_freelist_unlock(mnode);
	}
}
/*
 * Add the page to the front of a linked list of pages
 * using the p_next & p_prev pointers for the list.
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_add(page_t **ppp, page_t *pp)
{
	if (*ppp == NULL) {
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}
	*ppp = pp;
}

/*
 * Remove this page from a linked list of pages
 * using the p_next & p_prev pointers for the list.
 *
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_sub(page_t **ppp, page_t *pp)
{
	ASSERT(PP_ISFREE(pp));

	if (*ppp == NULL || pp == NULL)
		panic("mach_page_sub");

	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */

	if (*ppp == pp)
		*ppp = NULL;			/* page list is gone */
	else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}
	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
}
/*
 * Routine fsflush uses to gradually coalesce the free list into larger pages.
 */
void
page_promote_size(page_t *pp, uint_t cur_szc)
{
	pfn_t pfn;
	int mnode;
	int idx;
	int new_szc = cur_szc + 1;
	int full = FULL_REGION_CNT(new_szc);

	pfn = page_pptonum(pp);
	mnode = PFN_2_MEM_NODE(pfn);

	page_freelist_lock(mnode);

	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);

	page_freelist_unlock(mnode);
}

static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;
/*
 * Create a single larger page (of szc new_szc) from smaller contiguous pages
 * for the given mnode starting at pfnum. Pages involved are on the freelist
 * before the call and may be returned to the caller if requested, otherwise
 * they will be placed back on the freelist.
 * If flags is PC_ALLOC, then the large page will be returned to the user in
 * a state which is consistent with a page being taken off the freelist. If
 * we failed to lock the new large page, then we will return NULL to the
 * caller and put the large page on the freelist instead.
 * If flags is PC_FREE, then the large page will be placed on the freelist,
 * and NULL will be returned.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 *
 * RFE: For performance pass in pp instead of pfnum so
 *	we can avoid excessive calls to page_numtopp_nolock().
 *	This would depend on an assumption that all contiguous
 *	pages are in the same memseg so we can just add/dec
 *	pp.
 *
 * There is a potential but rare deadlock situation
 * for page promotion and demotion operations. The problem
 * is there are two paths into the freelist manager and
 * they have different lock orders:
 *
 * page_create()
 *	lock freelist
 *	page_lock(EXCL)
 *	unlock freelist
 *	return
 *	caller drops page_lock
 *
 * page_free() and page_reclaim()
 *	caller grabs page_lock(EXCL)
 *
 *	lock freelist
 *	unlock freelist
 *	drop page_lock
 *
 * What prevents a thread in page_create() from deadlocking
 * with a thread freeing or reclaiming the same page is the
 * page_trylock() in page_get_freelist(). If the trylock fails
 * it skips the page.
 *
 * The lock ordering for promotion and demotion is the same as
 * for page_create(). Since the same deadlock could occur during
 * page promotion and freeing or reclaiming of a page on the
 * cache list we might have to fail the operation and undo what
 * we have done so far. Again this is rare.
 */
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
{
	page_t		*pp, *pplist, *tpp, *start_pp;
	pgcnt_t		new_npgs, npgs;
	uint_t		bin;
	pgcnt_t		tmpnpgs, pages_left;
	uint_t		noreloc;
	int		which_list;

	/*
	 * General algorithm:
	 * Find the starting page
	 * Walk each page struct removing it from the freelist,
	 *  and linking it to all the other pages removed.
	 * Once all pages are off the freelist,
	 *  walk the list, modifying p_szc to new_szc and what
	 *  ever other info needs to be done to create a large free page.
	 * According to the flags, either return the page or put it
	 *  on the freelist.
	 */

	start_pp = page_numtopp_nolock(pfnum);
	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
	new_npgs = page_get_pagecnt(new_szc);
	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));

	/* don't return page of the wrong mtype */
	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
		return (NULL);

	/*
	 * Loop through smaller pages to confirm that all pages
	 * give the same result for PP_ISNORELOC().
	 * We can check this reliably here as the protocol for setting
	 * P_NORELOC requires pages to be taken off the free list first.
	 */
	noreloc = PP_ISNORELOC(start_pp);
	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
		if (noreloc != PP_ISNORELOC(pp)) {
			page_promote_noreloc_err++;
			page_promote_err++;
			return (NULL);
		}
	}

	pages_left = new_npgs;
	pplist = NULL;
	pp = start_pp;

	/* Loop around coalescing the smaller pages into a big page. */
	while (pages_left) {
		/*
		 * Remove from the freelist.
		 */
		ASSERT(PP_ISFREE(pp));
		bin = PP_2_BIN(pp);
		ASSERT(mnode == PP_2_MEM_NODE(pp));
		mtype = PP_2_MTYPE(pp);
		if (PP_ISAGED(pp)) {

			/*
			 * PG_FREE_LIST
			 */
			if (pp->p_szc) {
				page_lpsub(&PAGE_FREELISTS(mnode,
				    pp->p_szc, bin, mtype), pp);
			} else {
				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
				    bin, mtype), pp);
			}
			which_list = PG_FREE_LIST;
		} else {
			struct vmobject *obj;

			ASSERT(pp->p_szc == 0);

			/*
			 * PG_CACHE_LIST
			 *
			 * Since this page comes from the
			 * cachelist, we must destroy the
			 * vnode association.
			 */
			if (!page_trylock(pp, SE_EXCL)) {
				goto fail_promote;
			}

			obj = &pp->p_vnode->v_object;

			/*
			 * We need to be careful not to deadlock
			 * with another thread in page_lookup().
			 * The page_lookup() thread could be holding
			 * the same phm that we need if the two
			 * pages happen to hash to the same phm lock.
			 * At this point we have locked the entire
			 * freelist and page_lookup() could be trying
			 * to grab a freelist lock.
			 */
			if (!vmobject_trylock(obj)) {
				page_unlock_nocapture(pp);
				goto fail_promote;
			}

			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
			page_hashout(pp, true);
			vmobject_unlock(obj);
			page_unlock_nocapture(pp);
			which_list = PG_CACHE_LIST;
		}
		page_ctr_sub(mnode, mtype, pp, which_list);

		/*
		 * Concatenate the smaller page(s) onto
		 * the large page list.
		 */
		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
		pages_left -= npgs;
		tpp = pp;
		while (npgs--) {
			tpp->p_szc = new_szc;
			tpp = tpp->p_next;
		}
		page_list_concat(&pplist, &pp);
		pp += tmpnpgs;
	}
	CHK_LPG(pplist, new_szc);

	/*
	 * return the page to the user if requested
	 * in the properly locked state.
	 */
	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
		return (pplist);
	}

	/*
	 * Otherwise place the new large page on the freelist
	 */
	bin = PP_2_BIN(pplist);
	mnode = PP_2_MEM_NODE(pplist);
	mtype = PP_2_MTYPE(pplist);
	page_lpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);

	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
	return (NULL);

fail_promote:
	/*
	 * A thread must have still been freeing or
	 * reclaiming the page on the cachelist.
	 * To prevent a deadlock undo what we have
	 * done so far and return failure. This
	 * situation can only happen while promoting
	 * PAGESIZE pages.
	 */
	page_promote_err++;
	while (pplist) {
		pp = pplist;
		mach_page_sub(&pplist, pp);
		pp->p_szc = 0;
		bin = PP_2_BIN(pp);
		mtype = PP_2_MTYPE(pp);
		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
	}
	return (NULL);
}
/*
 * Break up a large page into smaller size pages.
 * Pages involved are on the freelist before the call and may
 * be returned to the caller if requested, otherwise they will
 * be placed back on the freelist.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 * If flags is not PC_ALLOC, the color argument is ignored, and thus
 * technically, any value may be passed in but PC_NO_COLOR is the standard
 * which should be followed for clarity's sake.
 * Returns a page whose pfn is < pfnmax
 */
page_t *
page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
    uchar_t new_szc, int color, int flags)
{
	page_t	*pp, *pplist, *npplist;
	pgcnt_t	npgs;
	uint_t	bin;
	uint_t	mtype;
	page_t	*ret_pp = NULL;

	ASSERT(cur_szc != 0);
	ASSERT(new_szc < cur_szc);

	pplist = page_numtopp_nolock(pfnum);
	ASSERT(pplist != NULL);

	ASSERT(pplist->p_szc == cur_szc);

	bin = PP_2_BIN(pplist);
	ASSERT(mnode == PP_2_MEM_NODE(pplist));
	mtype = PP_2_MTYPE(pplist);
	page_lpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);

	CHK_LPG(pplist, cur_szc);
	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);

	/*
	 * Number of PAGESIZE pages for smaller new_szc
	 * page.
	 */
	npgs = page_get_pagecnt(new_szc);

	while (pplist) {
		pp = pplist;

		ASSERT(pp->p_szc == cur_szc);

		/*
		 * We either break it up into PAGESIZE pages or larger.
		 */
		if (npgs == 1) {	/* PAGESIZE case */
			mach_page_sub(&pplist, pp);
			ASSERT(pp->p_szc == cur_szc);
			ASSERT(new_szc == 0);
			ASSERT(mnode == PP_2_MEM_NODE(pp));
			pp->p_szc = new_szc;
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) && (pfnmax == 0 ||
			    pp->p_pagenum < pfnmax) &&
			    page_trylock_cons(pp, SE_EXCL)) {
				ret_pp = pp;
			} else {
				mtype = PP_2_MTYPE(pp);
				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
				    mtype), pp);
				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
			}
		} else {
			page_t *try_to_return_this_page = NULL;
			int count = 0;

			/*
			 * Break down into smaller lists of pages.
			 */
			page_list_break(&pplist, &npplist, npgs);

			pp = pplist;
			do {
				ASSERT(pp->p_szc == cur_szc);
				/*
				 * Check whether all the pages in this list
				 * fit the request criteria.
				 */
				if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
					count++;
				}
				pp->p_szc = new_szc;
				pp = pp->p_next;
			} while (pp != pplist);

			if (count == npgs &&
			    (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
				try_to_return_this_page = pp;
			}

			CHK_LPG(pplist, new_szc);

			bin = PP_2_BIN(pplist);
			if (try_to_return_this_page)
				ASSERT(mnode ==
				    PP_2_MEM_NODE(try_to_return_this_page));
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) && try_to_return_this_page &&
			    page_trylock_cons(try_to_return_this_page,
			    SE_EXCL)) {
				ret_pp = try_to_return_this_page;
			} else {
				mtype = PP_2_MTYPE(pp);
				page_lpadd(&PAGE_FREELISTS(mnode, new_szc,
				    bin, mtype), pplist);

				page_ctr_add(mnode, mtype, pplist,
				    PG_FREE_LIST);
			}
			pplist = npplist;
		}
	}

	return (ret_pp);
}
= 0;
2133 * Coalesce free pages into a page of the given szc and color if possible.
2134 * Return the pointer to the page created, otherwise, return NULL.
2136 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2139 page_freelist_coalesce(int mnode
, uchar_t szc
, uint_t color
, uint_t ceq_mask
,
2140 int mtype
, pfn_t pfnhi
)
2142 int r
= szc
; /* region size */
2144 uint_t full
, bin
, color_mask
, wrap
= 0;
2145 pfn_t pfnum
, lo
, hi
;
2146 size_t len
, idx
, idx0
;
2147 pgcnt_t cands
= 0, szcpgcnt
= page_get_pagecnt(szc
);
2149 MEM_NODE_ITERATOR_DECL(it
);
2151 if (mpss_coalesce_disable
) {
2152 ASSERT(szc
< MMU_PAGE_SIZES
);
2153 VM_STAT_ADD(vmm_vmstats
.page_ctrs_coalesce
[szc
][0]);
2157 ASSERT(szc
< mmu_page_sizes
);
2158 color_mask
= PAGE_GET_PAGECOLORS(szc
) - 1;
2159 ASSERT(ceq_mask
<= color_mask
);
2160 ASSERT(color
<= color_mask
);
2163 /* Prevent page_counters dynamic memory from being freed */
2164 rw_enter(&page_ctrs_rwlock
[mnode
], RW_READER
);
2166 mrange
= MTYPE_2_MRANGE(mnode
, mtype
);
2167 ASSERT(mrange
< mnode_nranges
[mnode
]);
2168 VM_STAT_ADD(vmm_vmstats
.page_ctrs_coalesce
[r
][mrange
]);
2170 /* get pfn range for mtype */
2171 len
= PAGE_COUNTERS_ENTRIES(mnode
, r
);
2172 MNODETYPE_2_PFN(mnode
, mtype
, lo
, hi
);
2175 /* use lower limit if given */
2176 if (pfnhi
!= PFNNULL
&& pfnhi
< hi
)
2179 /* round to szcpgcnt boundaries */
2180 lo
= P2ROUNDUP(lo
, szcpgcnt
);
2181 MEM_NODE_ITERATOR_INIT(lo
, mnode
, szc
, &it
);
2182 if (lo
== (pfn_t
)-1) {
2183 rw_exit(&page_ctrs_rwlock
[mnode
]);
2186 hi
= hi
& ~(szcpgcnt
- 1);
2188 /* set lo to the closest pfn of the right color */
2189 if (((PFN_2_COLOR(lo
, szc
, &it
) ^ color
) & ceq_mask
) ||
2190 (interleaved_mnodes
&& PFN_2_MEM_NODE(lo
) != mnode
)) {
2191 PAGE_NEXT_PFN_FOR_COLOR(lo
, szc
, color
, ceq_mask
, color_mask
,
2196 rw_exit(&page_ctrs_rwlock
[mnode
]);
2200 full
= FULL_REGION_CNT(r
);
2202 /* calculate the number of page candidates and initial search index */
2204 idx0
= (size_t)(-1);
2208 PGCTRS_CANDS_GETVALUECOLOR(mnode
, mrange
, r
, bin
, acand
);
2210 idx
= PAGE_COUNTERS_CURRENT_COLOR(mnode
,
2212 idx0
= MIN(idx0
, idx
);
2215 bin
= ADD_MASKED(bin
, 1, ceq_mask
, color_mask
);
2216 } while (bin
!= color
);
2219 VM_STAT_ADD(vmm_vmstats
.page_ctrs_cands_skip
[r
][mrange
]);
2220 rw_exit(&page_ctrs_rwlock
[mnode
]);
2224 pfnum
= IDX_TO_PNUM(mnode
, r
, idx0
);
2225 if (pfnum
< lo
|| pfnum
>= hi
) {
2228 MEM_NODE_ITERATOR_INIT(pfnum
, mnode
, szc
, &it
);
2229 if (pfnum
== (pfn_t
)-1) {
2231 MEM_NODE_ITERATOR_INIT(pfnum
, mnode
, szc
, &it
);
2232 ASSERT(pfnum
!= (pfn_t
)-1);
2233 } else if ((PFN_2_COLOR(pfnum
, szc
, &it
) ^ color
) & ceq_mask
||
2234 (interleaved_mnodes
&& PFN_2_MEM_NODE(pfnum
) != mnode
)) {
2235 /* invalid color, get the closest correct pfn */
2236 PAGE_NEXT_PFN_FOR_COLOR(pfnum
, szc
, color
, ceq_mask
,
2240 MEM_NODE_ITERATOR_INIT(pfnum
, mnode
, szc
, &it
);
2245 /* set starting index */
2246 idx0
= PNUM_TO_IDX(mnode
, r
, pfnum
);
2250 for (idx
= idx0
; wrap
== 0 || (idx
< idx0
&& wrap
< 2); ) {
2253 if (PAGE_COUNTERS(mnode
, r
, idx
) != full
)
2257 * RFE: For performance maybe we can do something less
2258 * brutal than locking the entire freelist. So far
2259 * this doesn't seem to be a performance problem?
2261 page_freelist_lock(mnode
);
2262 if (PAGE_COUNTERS(mnode
, r
, idx
) == full
) {
2264 page_promote(mnode
, pfnum
, r
, PC_ALLOC
, mtype
);
2265 if (ret_pp
!= NULL
) {
2266 VM_STAT_ADD(vmm_vmstats
.pfc_coalok
[r
][mrange
]);
2267 PAGE_COUNTERS_CURRENT_COLOR(mnode
, r
,
2268 PFN_2_COLOR(pfnum
, szc
, &it
), mrange
) = idx
;
2269 page_freelist_unlock(mnode
);
2270 rw_exit(&page_ctrs_rwlock
[mnode
]);
2274 VM_STAT_ADD(vmm_vmstats
.page_ctrs_changed
[r
][mrange
]);
2277 page_freelist_unlock(mnode
);
2279 * No point looking for another page if we've
2280 * already tried all of the ones that
2281 * page_ctr_cands indicated. Stash off where we left
2283 * Note: this is not exact since we don't hold the
2284 * page_freelist_locks before we initially get the
2285 * value of cands for performance reasons, but should
2286 * be a decent approximation.
2289 PAGE_COUNTERS_CURRENT_COLOR(mnode
, r
, color
, mrange
) =
2294 PAGE_NEXT_PFN_FOR_COLOR(pfnum
, szc
, color
, ceq_mask
,
2296 idx
= PNUM_TO_IDX(mnode
, r
, pfnum
);
2297 if (idx
>= len
|| pfnum
>= hi
) {
2300 MEM_NODE_ITERATOR_INIT(pfnum
, mnode
, szc
, &it
);
2301 idx
= PNUM_TO_IDX(mnode
, r
, pfnum
);
2306 rw_exit(&page_ctrs_rwlock
[mnode
]);
2307 VM_STAT_ADD(vmm_vmstats
.page_ctrs_failed
[r
][mrange
]);
/*
 * For the given mnode, promote as many small pages to large pages as possible.
 * mnode can be -1, which means do them all
 */
void
page_freelist_coalesce_all(int mnode)
{
	int	r;		/* region size */
	int	idx, full;
	size_t	len;
	int	doall = interleaved_mnodes || mnode < 0;
	int	mlo = doall ? 0 : mnode;
	int	mhi = doall ? max_mem_nodes : (mnode + 1);

	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);

	if (mpss_coalesce_disable) {
		return;
	}

	/*
	 * Lock the entire freelist and coalesce what we can.
	 *
	 * Always promote to the largest page possible
	 * first to reduce the number of page promotions.
	 */
	for (mnode = mlo; mnode < mhi; mnode++) {
		rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
		page_freelist_lock(mnode);
	}
	for (r = mmu_page_sizes - 1; r > 0; r--) {
		for (mnode = mlo; mnode < mhi; mnode++) {
			pgcnt_t cands = 0;
			int mrange, nranges = mnode_nranges[mnode];

			for (mrange = 0; mrange < nranges; mrange++) {
				PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
				if (cands != 0)
					break;
			}
			if (cands == 0) {
				VM_STAT_ADD(vmm_vmstats.
				    page_ctrs_cands_skip_all);
				continue;
			}

			full = FULL_REGION_CNT(r);
			len = PAGE_COUNTERS_ENTRIES(mnode, r);

			for (idx = 0; idx < len; idx++) {
				if (PAGE_COUNTERS(mnode, r, idx) == full) {
					pfn_t pfnum =
					    IDX_TO_PNUM(mnode, r, idx);
					int tmnode = interleaved_mnodes ?
					    PFN_2_MEM_NODE(pfnum) : mnode;

					ASSERT(pfnum >=
					    mem_node_config[tmnode].physbase &&
					    pfnum <
					    mem_node_config[tmnode].physmax);

					(void) page_promote(tmnode,
					    pfnum, r, PC_FREE, PC_MTYPE_ANY);
				}
			}
			/* shared hpm_counters covers all mnodes, so we quit */
			if (interleaved_mnodes)
				break;
		}
	}
	for (mnode = mlo; mnode < mhi; mnode++) {
		page_freelist_unlock(mnode);
		rw_exit(&page_ctrs_rwlock[mnode]);
	}
}
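/*
 * Illustrative note (assumes typical x86 page sizes of 4K/2M/1G):
 * FULL_REGION_CNT(1) is 512, so a level-1 page_counters entry only reaches
 * 'full' when all 512 constituent 4K pages of that 2M region are free, at
 * which point the page_promote() call above can merge them into a single
 * szc-1 free page; the same relationship holds one level up for 1G regions.
 */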
/*
 * This is where all policies for moving pages around
 * to different page size free lists is implemented.
 * Returns 1 on success, 0 on failure.
 *
 * So far these are the priorities for this algorithm in descending
 * order:
 *
 *	1) When servicing a request try to do so with a free page
 *	   from next size up. Helps defer fragmentation as long
 *	   as possible.
 *
 *	2) Page coalesce on demand. Only when a freelist
 *	   larger than PAGESIZE is empty and step 1
 *	   will not work since all larger size lists are
 *	   also empty.
 *
 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
 */
page_t *
page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
    pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
{
	uchar_t nszc = szc + 1;
	uint_t	bin, sbin, bin_prev;
	page_t	*pp, *firstpp;
	page_t	*ret_pp = NULL;
	uint_t	color_mask;

	if (nszc == mmu_page_sizes)
		return (NULL);

	ASSERT(nszc < mmu_page_sizes);
	color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
	bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
	bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
	    PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);

	VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
	/*
	 * First try to break up a larger page to fill current size freelist.
	 */
	while (plw->plw_bins[nszc] != 0) {

		ASSERT(nszc < mmu_page_sizes);

		/*
		 * If page found then demote it.
		 */
		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
			page_freelist_lock(mnode);
			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);

			/*
			 * If pfnhi is not PFNNULL, look for large page below
			 * pfnhi. PFNNULL signifies no pfn requirement.
			 */
			if (pp &&
			    ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
			    (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
				do {
					pp = pp->p_list.largepg.next;
					if (pp == firstpp) {
						pp = NULL;
						break;
					}
				} while ((pfnhi != PFNNULL &&
				    pp->p_pagenum >= pfnhi) ||
				    (pfnlo != PFNNULL &&
				    pp->p_pagenum < pfnlo));

				if (pfnhi != PFNNULL && pp != NULL)
					ASSERT(pp->p_pagenum < pfnhi);

				if (pfnlo != PFNNULL && pp != NULL)
					ASSERT(pp->p_pagenum >= pfnlo);
			}
			if (pp) {
				uint_t ccolor = page_correct_color(szc, nszc,
				    color, bin, plw->plw_ceq_mask[szc]);

				ASSERT(pp->p_szc == nszc);
				VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
				ret_pp = page_demote(mnode, pp->p_pagenum,
				    pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
				if (ret_pp) {
					page_freelist_unlock(mnode);
					return (ret_pp);
				}
			}
			page_freelist_unlock(mnode);
		}

		/* loop through next size bins */
		bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
		plw->plw_bins[nszc]--;

		if (bin == sbin) {
			uchar_t nnszc = nszc + 1;

			/* we are done with this page size - check next */
			if (plw->plw_bins[nnszc] == 0)
				/* we have already checked next size bins */
				break;

			bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
			if (bin_prev != INVALID_COLOR) {
				bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
				if (!((bin ^ bin_prev) &
				    plw->plw_ceq_mask[nnszc]))
					break;
			}
			ASSERT(nnszc < mmu_page_sizes);
			color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
			nszc = nnszc;
			ASSERT(nszc < mmu_page_sizes);
		}
	}

	return (ret_pp);
}
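/*
 * Sketch of the demand-split path above (page sizes assumed, x86-style
 * 4K/2M): a request for a free 4K page of color c that finds its own bin
 * empty looks in the next-size (2M) freelist at bin PAGE_GET_NSZ_COLOR(0, c);
 * a 2M page found there is handed to page_demote(), which returns one locked
 * 4K page of the requested color and puts the remaining constituent pages
 * back on their freelists.
 */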
/*
 * Helper routine used only by the freelist code to lock
 * a page. If the page is a large page then it succeeds in
 * locking all the constituent pages or none at all.
 * Returns 1 on success, 0 on failure.
 */
static int
page_trylock_cons(page_t *pp, se_t se)
{
	page_t	*tpp, *first_pp = pp;

	/*
	 * Fail if can't lock first or only page.
	 */
	if (!page_trylock(pp, se)) {
		return (0);
	}

	/*
	 * PAGESIZE: common case.
	 */
	if (pp->p_szc == 0) {
		return (1);
	}

	/* large pages */
	tpp = pp->p_next;
	while (tpp != pp) {
		if (!page_trylock(tpp, se)) {
			/*
			 * On failure unlock what we have locked so far.
			 * We want to avoid attempting to capture these
			 * pages as the pcm mutex may be held which could
			 * lead to a recursive mutex panic.
			 */
			while (first_pp != tpp) {
				page_unlock_nocapture(first_pp);
				first_pp = first_pp->p_next;
			}
			return (0);
		}
		tpp = tpp->p_next;
	}
	return (1);
}
/*
 * init context for walking page lists
 * Called when a page of the given szc is unavailable. Sets markers
 * for the beginning of the search to detect when search has
 * completed a full cycle. Sets flags for splitting larger pages
 * and coalescing smaller pages. Page walking proceeds until a page
 * of the desired equivalent color is found.
 */
void
page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
    int use_ceq, page_list_walker_t *plw)
{
	uint_t	nszc, ceq_mask, colors;
	uchar_t	ceq = use_ceq ? colorequivszc[szc] : 0;

	ASSERT(szc < mmu_page_sizes);
	colors = PAGE_GET_PAGECOLORS(szc);

	plw->plw_colors = colors;
	plw->plw_color_mask = colors - 1;
	plw->plw_bin_marker = plw->plw_bin0 = bin;
	plw->plw_bin_split_prev = bin;
	plw->plw_bin_step = (szc == 0) ? vac_colors : 1;

	/*
	 * if vac aliasing is possible make sure lower order color
	 * bits are never ignored
	 */
	if (vac_colors > 1)
		ceq &= 0xf0;

	/*
	 * calculate the number of non-equivalent colors and
	 * color equivalency mask
	 */
	plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
	ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
	ASSERT(plw->plw_ceq_dif > 0);
	plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);

	if (flags & PG_MATCH_COLOR) {
		if (cpu_page_colors < 0) {
			/*
			 * this is a heterogeneous machine with different CPUs
			 * having different size e$ (not supported for ni2/rock
			 */
			uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
			cpucolors = MAX(cpucolors, 1);
			ceq_mask = plw->plw_color_mask & (cpucolors - 1);
			plw->plw_ceq_mask[szc] =
			    MIN(ceq_mask, plw->plw_ceq_mask[szc]);
		}
		plw->plw_ceq_dif = 1;
	}

	/* we can split pages in the freelist, but not the cachelist */
	if (can_split) {
		plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;

		/* set next szc color masks and number of free list bins */
		for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
			plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
			    plw->plw_ceq_mask[szc]);
			plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
		}
		plw->plw_ceq_mask[nszc] = INVALID_MASK;
		plw->plw_bins[nszc] = 0;
	} else {
		plw->plw_do_split = 0;
		plw->plw_bins[1] = 0;
		plw->plw_ceq_mask[1] = INVALID_MASK;
	}
}
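/*
 * Worked example for the colorequivszc encoding used above (values are
 * illustrative): with 32 colors at some szc and colorequivszc[szc] set to
 * 0x21 (ignore 2 high-order and 1 low-order color bit), the code computes
 * plw_ceq_dif = 32 >> (2 + 1) = 4 non-equivalent color groups, and
 * plw_ceq_mask[szc] = (4 - 1) << 1 = 0x6, i.e. only color bits 1-2 are
 * significant when deciding whether two bins are equivalent.
 */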
/*
 * set mark to flag where next split should occur
 */
#define	PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) {		\
	uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin);			\
	uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0);	\
	uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \
	plw->plw_split_next =						\
		INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask);	\
	if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
		plw->plw_split_next =					\
			INC_MASKED(plw->plw_split_next,			\
			    neq_mask, plw->plw_color_mask);		\
	}								\
}
uint_t
page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
{
	uint_t	neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
	uint_t	bin0_nsz, nbin_nsz, nbin0, nbin;
	uchar_t	nszc = szc + 1;

	nbin = ADD_MASKED(bin,
	    plw->plw_bin_step, neq_mask, plw->plw_color_mask);

	if (plw->plw_do_split) {
		plw->plw_bin_split_prev = bin;
		PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
		plw->plw_do_split = 0;
	}

	if (szc == 0) {
		if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
			if (nbin == plw->plw_bin0 &&
			    (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
				nbin = ADD_MASKED(nbin, plw->plw_bin_step,
				    neq_mask, plw->plw_color_mask);
				plw->plw_bin_split_prev = plw->plw_bin0;
			}

			if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
				plw->plw_bin_marker =
				    nbin = INC_MASKED(nbin, neq_mask,
				    plw->plw_color_mask);
				plw->plw_bin_split_prev = plw->plw_bin0;
				/*
				 * large pages all have the same vac color
				 * so by now we should be done with next
				 * size page splitting process
				 */
				ASSERT(plw->plw_bins[1] == 0);
				plw->plw_do_split = 0;
				return (nbin);
			}
		} else {
			uint_t	bin_jump = (vac_colors == 1) ?
			    (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;

			bin_jump &= ~(vac_colors - 1);

			nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
			    plw->plw_color_mask);

			if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {

				plw->plw_bin_marker = nbin = nbin0;

				if (plw->plw_bins[nszc] != 0) {
					/*
					 * check if next page size bin is the
					 * same as the next page size bin for
					 * bin0
					 */
					nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
					    nbin);
					bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
					    plw->plw_bin0);

					if ((bin0_nsz ^ nbin_nsz) &
					    plw->plw_ceq_mask[nszc])
						plw->plw_do_split = 1;
				}
				return (nbin);
			}
		}
	}

	if (plw->plw_bins[nszc] != 0) {
		nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
		if (!((plw->plw_split_next ^ nbin_nsz) &
		    plw->plw_ceq_mask[nszc]))
			plw->plw_do_split = 1;
	}

	return (nbin);
}
page_t *
page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
    uint_t flags)
{
	kmutex_t		*pcm;
	page_t			*pp, *first_pp;
	uint_t			sbin;
	int			plw_initialized;
	page_list_walker_t	plw;

	ASSERT(szc < mmu_page_sizes);

	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);

	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
		return (NULL);
	}
try_again:

	plw_initialized = 0;
	plw.plw_ceq_dif = 1;

	/*
	 * Only hold one freelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
	for (plw.plw_count = 0;
	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
		sbin = bin;
		do {
			if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
				goto bin_empty_1;

			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			if (pp == NULL)
				goto bin_empty_0;

			/*
			 * These were set before the page
			 * was put on the free list,
			 * they must still be set.
			 */
			ASSERT(PP_ISFREE(pp));
			ASSERT(PP_ISAGED(pp));
			VERIFY(pp->p_object == NULL);
			ASSERT(pp->p_vnode == NULL);
			ASSERT(pp->p_offset == (uoff_t)-1);
			ASSERT(pp->p_szc == szc);
			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

			/*
			 * Walk down the hash chain. 4k/8k pages are linked
			 * on p_next and p_prev fields. Large pages are a
			 * contiguous group of constituent pages linked
			 * together on their p_next and p_prev fields. The
			 * large pages are linked together on the hash chain
			 * using p_list.largepg of the base constituent page
			 * of each large page.
			 */
			first_pp = pp;
			while (!page_trylock_cons(pp, SE_EXCL)) {
				if (szc == 0) {
					pp = pp->p_next;
				} else {
					pp = pp->p_list.largepg.next;
				}

				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				VERIFY(pp->p_object == NULL);
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_offset == (uoff_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				if (pp == first_pp)
					goto bin_empty_0;
			}

			ASSERT(mtype == PP_2_MTYPE(pp));
			ASSERT(pp->p_szc == szc);
			if (szc == 0) {
				page_sub(&PAGE_FREELISTS(mnode,
				    szc, bin, mtype), pp);
			} else {
				page_lpsub(&PAGE_FREELISTS(mnode,
				    szc, bin, mtype), pp);
			}
			page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

			if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
				panic("free page is not. pp %p", (void *)pp);
			mutex_exit(pcm);

			VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
			return (pp);

bin_empty_0:
			mutex_exit(pcm);
bin_empty_1:
			if (plw_initialized == 0) {
				page_list_walk_init(szc, flags, bin, 1, 1,
				    &plw);
				plw_initialized = 1;
				ASSERT(plw.plw_colors <=
				    PAGE_GET_PAGECOLORS(szc));
				ASSERT(plw.plw_colors > 0);
				ASSERT((plw.plw_colors &
				    (plw.plw_colors - 1)) == 0);
				ASSERT(bin < plw.plw_colors);
				ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
			}
			/* calculate the next bin with equivalent color */
			bin = ADD_MASKED(bin, plw.plw_bin_step,
			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
		} while (sbin != bin);

		/*
		 * color bins are all empty if color match. Try and
		 * satisfy the request by breaking up or coalescing
		 * pages from a different size freelist of the correct
		 * color that satisfies the ORIGINAL color requested.
		 * If that fails then try pages of the same size but
		 * different colors assuming we are not called with
		 * PG_MATCH_COLOR.
		 */
		if (plw.plw_do_split &&
		    (pp = page_freelist_split(szc, bin, mnode,
		    mtype, PFNNULL, PFNNULL, &plw)) != NULL)
			return (pp);

		if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
		    bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL)
			return (pp);

		if (plw.plw_ceq_dif > 1)
			bin = page_list_walk_next_bin(szc, bin, &plw);
	}

	/* if allowed, cycle through additional mtypes */
	MTYPE_NEXT(mnode, mtype, flags);
	if (mtype >= 0)
		goto try_again;

	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);

	return (NULL);
}
/*
 * Returns the count of free pages for 'pp' with size code 'szc'.
 * Note: This function does not return an exact value as the page freelist
 * locks are not held and thus the values in the page_counters may be
 * changing as we walk through the data.
 */
static pgcnt_t
page_freecnt(int mnode, page_t *pp, uchar_t szc)
{
	ssize_t	r = szc;	/* region size */
	ssize_t	cnt;
	pgcnt_t	pgfree;
	pgcnt_t	range;
	pgcnt_t	full;
	size_t	idx;
	int	i;

	/* Make sure pagenum passed in is aligned properly */
	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
	ASSERT(szc > 0);

	/* Prevent page_counters dynamic memory from being freed */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
	cnt = PAGE_COUNTERS(mnode, r, idx);
	pgfree = cnt << PNUM_SHIFT(r - 1);
	range = FULL_REGION_CNT(szc);

	/* Check for completely full region */
	if (cnt == range) {
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (pgfree);
	}

	while (--r > 0) {
		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
		full = FULL_REGION_CNT(r);
		for (i = 0; i < range; i++, idx++) {
			cnt = PAGE_COUNTERS(mnode, r, idx);
			/*
			 * If cnt here is full, that means we have already
			 * accounted for these pages earlier.
			 */
			if (cnt != full) {
				pgfree += (cnt << PNUM_SHIFT(r - 1));
			}
		}
	}
	rw_exit(&page_ctrs_rwlock[mnode]);
	return (pgfree);
}
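/*
 * Rough example of the estimate above (x86 page sizes assumed): for a szc-2
 * (1G) region whose level-2 counter reads 5, five of its 512 constituent 2M
 * regions are completely free, contributing 5 << PNUM_SHIFT(1) = 2560 free
 * 4K pages; the level-1 pass then adds the counters of the partially free
 * 2M regions while skipping the full ones already counted.
 */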
/*
 * Called from page_geti_contig_pages to exclusively lock constituent pages
 * starting from 'spp' for page size code 'szc'.
 *
 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
 * region needs to be greater than or equal to the threshold.
 */
static int
page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
{
	pgcnt_t	pgcnt = PNUM_SIZE(szc);
	pgcnt_t	pgfree, i;
	page_t	*pp;

	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);

	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
		goto skipptcpcheck;
	/*
	 * check if there are sufficient free pages available before attempting
	 * to trylock. Count is approximate as page counters can change.
	 */
	pgfree = page_freecnt(mnode, spp, szc);

	/* attempt to trylock if there are sufficient already free pages */
	if (pgfree < pgcnt/ptcpthreshold) {
		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
		return (0);
	}

skipptcpcheck:

	for (i = 0; i < pgcnt; i++) {
		pp = &spp[i];
		if (!page_trylock(pp, SE_EXCL)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
			while (--i != (pgcnt_t)-1) {
				pp = &spp[i];
				ASSERT(PAGE_EXCL(pp));
				page_unlock_nocapture(pp);
			}
			return (0);
		}
		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
		    !PP_ISFREE(pp)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
			page_unlock_nocapture(pp);
			return (0);
		}

		/*
		 * If a page has been marked non-relocatable or has been
		 * explicitly locked in memory, we don't want to relocate it;
		 * unlock the pages and fail the operation.
		 */
		if (PP_ISNORELOC(pp) ||
		    pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
			while (i != (pgcnt_t)-1) {
				pp = &spp[i];
				ASSERT(PAGE_EXCL(pp));
				page_unlock_nocapture(pp);
				i--;
			}
			return (0);
		}
	}
	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
	return (1);
}
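/*
 * Illustrative numbers for the threshold check above (x86 sizes assumed):
 * for a szc-1 (2M) claim pgcnt is 512, so with ptcpthreshold set to 2 the
 * trylock pass is attempted only if page_freecnt() already reports at least
 * 512 / 2 = 256 of the constituent 4K pages free.
 */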
/*
 * Claim large page pointed to by 'pp'. 'pp' is the starting set
 * of 'szc' constituent pages that had been locked exclusively previously.
 * Will attempt to relocate constituent pages in use.
 */
static page_t *
page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
{
	spgcnt_t pgcnt, npgs, i;
	page_t *targpp, *rpp, *hpp;
	page_t *replpp = NULL;
	page_t *pplist = NULL;

	pgcnt = page_get_pagecnt(szc);
	while (pgcnt) {
		ASSERT(PAGE_EXCL(pp));
		ASSERT(!PP_ISNORELOC(pp));
		if (PP_ISFREE(pp)) {
			/*
			 * If this is a PG_FREE_LIST page then its
			 * size code can change underneath us due to
			 * page promotion or demotion. As an optimization
			 * use page_list_sub_pages() instead of
			 * page_list_sub().
			 */
			if (PP_ISAGED(pp)) {
				page_list_sub_pages(pp, szc);
				if (pp->p_szc == szc) {
					return (pp);
				}
				ASSERT(pp->p_szc < szc);
				npgs = page_get_pagecnt(pp->p_szc);
				hpp = pp;
				for (i = 0; i < npgs; i++, pp++) {
					pp->p_szc = szc;
				}
				page_list_concat(&pplist, &hpp);
				pgcnt -= npgs;
				continue;
			}
			ASSERT(!PP_ISAGED(pp));
			ASSERT(pp->p_szc == 0);
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, false);
			PP_SETAGED(pp);
			pp->p_szc = szc;
			page_list_concat(&pplist, &pp);
			pp++;
			pgcnt--;
			continue;
		}
		npgs = page_get_pagecnt(pp->p_szc);

		/*
		 * page_create_wait freemem accounting done by caller of
		 * page_get_freelist and not necessary to call it prior to
		 * calling page_get_replacement_page.
		 *
		 * page_get_replacement_page can call page_get_contig_pages
		 * to acquire a large page (szc > 0); the replacement must be
		 * smaller than the contig page size to avoid looping or
		 * szc == 0 and PGI_PGCPSZC0 is set.
		 */
		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
			replpp = page_get_replacement_page(pp, NULL, 0);
			if (replpp) {
				npgs = page_get_pagecnt(pp->p_szc);
				ASSERT(npgs <= pgcnt);
				targpp = pp;
			}
		}

		/*
		 * If replacement is NULL or do_page_relocate fails, fail
		 * coalescing of pages.
		 */
		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
		    &npgs, NULL) != 0)) {
			/*
			 * Unlock un-processed target list
			 */
			while (pgcnt--) {
				ASSERT(PAGE_EXCL(pp));
				page_unlock_nocapture(pp);
				pp++;
			}
			/*
			 * Free the processed target list.
			 */
			while (pplist) {
				pp = pplist;
				page_sub(&pplist, pp);
				ASSERT(PAGE_EXCL(pp));
				ASSERT(pp->p_szc == szc);
				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				pp->p_szc = 0;
				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
				page_unlock_nocapture(pp);
			}

			if (replpp != NULL)
				page_free_replacement_page(replpp);

			return (NULL);
		}
		ASSERT(pp == targpp);

		ASSERT(hpp = pp); /* That's right, it's an assignment */

		pp += npgs;
		pgcnt -= npgs;

		while (npgs--) {
			ASSERT(PAGE_EXCL(targpp));
			ASSERT(!PP_ISFREE(targpp));
			ASSERT(!PP_ISNORELOC(targpp));
			PP_SETFREE(targpp);
			ASSERT(PP_ISAGED(targpp));
			ASSERT(targpp->p_szc < szc || (szc == 0 &&
			    (flags & PGI_PGCPSZC0)));
			targpp->p_szc = szc;
			targpp = targpp->p_next;

			rpp = replpp;
			ASSERT(rpp != NULL);
			page_sub(&replpp, rpp);
			ASSERT(PAGE_EXCL(rpp));
			ASSERT(!PP_ISFREE(rpp));
			page_unlock_nocapture(rpp);
		}
		ASSERT(targpp == hpp);
		ASSERT(replpp == NULL);
		page_list_concat(&pplist, &targpp);
	}
	CHK_LPG(pplist, szc);
	return (pplist);
}
/*
 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
 * page with size code 'szc'. Claiming such a page requires acquiring
 * exclusive locks on all constituent pages (page_trylock_contig_pages),
 * relocating pages in use and concatenating these constituent pages into a
 * large page.
 *
 * The page lists do not have such a large page and page_freelist_split has
 * already failed to demote larger pages and/or coalesce smaller free pages.
 *
 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large
 * pages with the same color as 'bin'.
 *
 * 'pfnflag' specifies the subset of the pfn range to search.
 */
static page_t *
page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
    pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
{
	struct memseg *mseg;
	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
	pgcnt_t	szcpgmask = szcpgcnt - 1;
	pfn_t	randpfn;
	page_t *pp, *randpp, *endpp;
	uint_t colors, ceq_mask;
	uint_t color_mask __unused;
	pfn_t hi, lo;
	pgcnt_t skip;
	MEM_NODE_ITERATOR_DECL(it);

	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));

	pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);

	if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
		return (NULL);

	ASSERT(szc < mmu_page_sizes);

	colors = PAGE_GET_PAGECOLORS(szc);
	color_mask = colors - 1;
	if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
		uchar_t ceq = colorequivszc[szc];
		uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));

		ASSERT(ceq_dif > 0);
		ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
	} else {
		ceq_mask = 0;
	}

	ASSERT(bin < colors);

	/* clear "non-significant" color bits */
	bin &= ceq_mask;

	/*
	 * trim the pfn range to search based on pfnflag. pfnflag is set
	 * when there have been previous page_get_contig_page failures to
	 * limit the search.
	 *
	 * The high bit in pfnflag specifies the number of 'slots' in the
	 * pfn range and the remainder of pfnflag specifies which slot.
	 * For example, a value of 1010b would mean the second slot of
	 * the pfn range that has been divided into 8 slots.
	 */
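	/*
	 * Spelling out the arithmetic for the example above (values are
	 * illustrative): pfnflag = 1010b = 10, so slots =
	 * 1 << (highbit(10) - 1) = 8 and slotid = 10 & (8 - 1) = 2; the code
	 * below then confines the search to the slot beginning
	 * slotid * slotlen szc-sized pages into the [pfnlo, pfnhi] range.
	 */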
	if (pfnflag > 1) {
		int	slots = 1 << (highbit(pfnflag) - 1);
		int	slotid = pfnflag & (slots - 1);
		pgcnt_t	szcpages;
		int	slotlen;

		pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
		slotlen = howmany(szcpages, slots);
		/* skip if 'slotid' slot is empty */
		if (slotid * slotlen >= szcpages)
			return (NULL);
		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
		ASSERT(pfnlo < pfnhi);
		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
			pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
	}

	/*
	 * This routine can be called recursively so we shouldn't
	 * acquire a reader lock if a write request is pending. This
	 * could lead to a deadlock with the DR thread.
	 *
	 * Returning NULL informs the caller that we could not get
	 * a contig page with the required characteristics.
	 */
	if (!memsegs_trylock(0))
		return (NULL);

	/*
	 * loop through memsegs to look for contig page candidates
	 */
	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
			/* no overlap */
			continue;
		}

		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
			/* mseg too small */
			continue;

		lo = MAX(pfnlo, mseg->pages_base);
		hi = MIN(pfnhi, (mseg->pages_end - 1));

		/* round to szcpgcnt boundaries */
		lo = P2ROUNDUP(lo, szcpgcnt);
		MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
		hi = P2ALIGN((hi + 1), szcpgcnt) - 1;

		if (hi <= lo)
			continue;

		/*
		 * set lo to point to the pfn for the desired bin. Large
		 * page sizes may only have a single page color
		 */
		skip = szcpgcnt;
		if (ceq_mask > 0 || interleaved_mnodes) {
			/* set lo to point at appropriate color */
			if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
			    (interleaved_mnodes &&
			    PFN_2_MEM_NODE(lo) != mnode)) {
				PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
				    color_mask, &it);
			}
			if (hi <= lo)
				/* mseg cannot satisfy color request */
				continue;
		}

		/* randomly choose a point between lo and hi to begin search */

		randpfn = (pfn_t)GETTICK();
		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
		MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
		if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
			if (randpfn != (pfn_t)-1) {
				PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
				    ceq_mask, color_mask, &it);
			}
			if (randpfn >= hi) {
				randpfn = lo;
				MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
				    &it);
			}
		}
		randpp = mseg->pages + (randpfn - mseg->pages_base);

		ASSERT(randpp->p_pagenum == randpfn);

		pp = randpp;
		endpp = mseg->pages + (hi - mseg->pages_base) + 1;

		ASSERT(randpp + szcpgcnt <= endpp);

		do {
			ASSERT(!(pp->p_pagenum & szcpgmask));
			ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);

			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
				/* pages unlocked by page_claim on failure */
				if (page_claim_contig_pages(pp, szc, flags)) {
					memsegs_unlock(0);
					return (pp);
				}
			}

			if (ceq_mask == 0 && !interleaved_mnodes) {
				pp += skip;
			} else {
				pfn_t pfn = pp->p_pagenum;

				PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
				    ceq_mask, color_mask, &it);
				if (pfn == (pfn_t)-1) {
					pp = endpp;
				} else {
					pp = mseg->pages +
					    (pfn - mseg->pages_base);
				}
			}
			if (pp >= endpp) {
				/* start from the beginning */
				MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
				pp = mseg->pages + (lo - mseg->pages_base);
				ASSERT(pp->p_pagenum == lo);
				ASSERT(pp + szcpgcnt <= endpp);
			}
		} while (pp != randpp);
	}
	memsegs_unlock(0);
	return (NULL);
}
/*
 * controlling routine that searches through physical memory in an attempt to
 * claim a large page based on the input parameters.
 * on the page free lists.
 *
 * calls page_geti_contig_pages with an initial pfn range from the mnode
 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
 * that overlaps with the kernel cage or does not match the requested page
 * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
 * page_geti_contig_pages may further limit the search range based on
 * previous failure counts (pgcpfailcnt[]).
 *
 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
 * pagesize page that satisfies mtype.
 */
page_t *
page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
    uint_t flags)
{
	pfn_t		pfnlo, pfnhi;	/* contig pages pfn range */
	page_t		*pp;
	pgcnt_t		pfnflag = 0;	/* no limit on search if 0 */

	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);

	/* no allocations from cage */
	flags |= PGI_NOCAGE;

	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
		return (NULL);
	}

	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));

	/* do not limit search and ignore color if hi pri */

	if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
		pfnflag = pgcpfailcnt[szc];

	/* remove color match to improve chances */

	if (flags & PGI_PGCPHIPRI || pfnflag)
		flags &= ~PG_MATCH_COLOR;

	do {
		/* get pfn range based on mnode and mtype */
		MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);

		ASSERT(pfnhi >= pfnlo);

		pp = page_geti_contig_pages(mnode, bin, szc, flags,
		    pfnlo, pfnhi, pfnflag);

		if (pp != NULL) {
			pfnflag = pgcpfailcnt[szc];
			if (pfnflag) {
				/* double the search size */
				pgcpfailcnt[szc] = pfnflag >> 1;
			}
			VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
			return (pp);
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
	return (NULL);
}
#if defined(__i386) || defined(__amd64)
/*
 * Determine the likelihood of finding/coalescing a szc page.
 * Return 0 if the likelihood is small otherwise return 1.
 *
 * For now, be conservative and check only 1g pages and return 0
 * if there had been previous coalescing failures and the szc pages
 * needed to satisfy request would exhaust most of freemem.
 */
int
page_chk_freelist(uint_t szc)
{
	pgcnt_t		pgcnt;

	if (szc <= 1)
		return (1);

	pgcnt = page_get_pagecnt(szc);
	if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
		VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
		return (0);
	}
	VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
	return (1);
}
#endif
/*
 * Find the `best' page on the freelist for this (obj,off) (as,vaddr) pair.
 *
 * Does its own locking and accounting.
 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
 * pages of the proper color even if there are pages of a different color.
 *
 * Finds a page, removes it, THEN locks it.
 */
page_t *
page_get_freelist(struct vmobject *obj, uoff_t off, struct seg *seg,
    caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
{
	struct as	*as = seg->s_as;
	page_t		*pp = NULL;
	ulong_t		bin;
	uchar_t		szc;
	int		mnode;
	int		mtype;
	page_t		*(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
	lgrp_mnode_cookie_t	lgrp_cookie;

	page_get_func = page_get_mnode_freelist;

	/*
	 * If we aren't passed a specific lgroup, or passed a freed lgrp
	 * assume we wish to allocate near to the current thread's home.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	flags &= ~PG_NORELOC;
	flags |= PGI_NOCAGE;

	MTYPE_INIT(mtype, obj->vnode, vaddr, flags, size);

	/*
	 * Convert size to page size code.
	 */
	if ((szc = page_szc(size)) == (uchar_t)-1)
		panic("page_get_freelist: illegal page size request");
	ASSERT(szc < mmu_page_sizes);

	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);

	AS_2_BIN(as, seg, obj->vnode, vaddr, bin, szc);

	ASSERT(bin < PAGE_GET_PAGECOLORS(szc));

	/*
	 * Try to get a local page first, but try remote if we can't
	 * get a page of the right color.
	 */
pgretry:
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_func(mnode, bin, mtype, szc, flags);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
	}
	ASSERT(pp == NULL);

	/*
	 * for non-SZC0 PAGESIZE requests, check cachelist before checking
	 * remote free lists.  Caller expected to call page_get_cachelist which
	 * will check local cache lists and remote free lists.
	 */
	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
		return (NULL);
	}

	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));

	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);

	if (!(flags & PG_LOCAL)) {
		/*
		 * Try to get a non-local freelist page.
		 */
		LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
		while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
			pp = page_get_func(mnode, bin, mtype, szc, flags);
			if (pp != NULL) {
				DTRACE_PROBE4(page__get,
				    lgrp_t *, lgrp,
				    int, mnode,
				    ulong_t, bin,
				    uint_t, flags);
				VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
				return (pp);
			}
		}
		ASSERT(pp == NULL);
	}

	/*
	 * when the cage is off chances are page_get_contig_pages() will fail
	 * to lock a large page chunk therefore when the cage is off it's not
	 * called by default.  this can be changed via /etc/system.
	 *
	 * page_get_contig_pages() also called to acquire a base pagesize page
	 * for page_create_get_something().
	 */
	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
	    (pg_lpgcreate_nocage || szc == 0) &&
	    (page_get_func != page_get_contig_pages)) {

		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
		page_get_func = page_get_contig_pages;
		goto pgretry;
	}

	if (!(flags & PG_LOCAL) && pgcplimitsearch &&
	    page_get_func == page_get_contig_pages)
		SETPGCPFAILCNT(szc);

	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);

	return (NULL);
}
/*
 * Find the `best' page on the cachelist for this (obj,off) (as,vaddr) pair.
 *
 * Does its own locking.
 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
 * pages of the proper color even if there are pages of a different color.
 * Otherwise, scan the bins for ones with pages.  For each bin with pages,
 * try to lock one of them.  If no page can be locked, try the
 * next bin.  Return NULL if a page can not be found and locked.
 *
 * Finds a page, tries to lock it, then removes it.
 */
page_t *
page_get_cachelist(struct vmobject *obj, uoff_t off, struct seg *seg,
    caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
{
	page_t		*pp;
	struct as	*as = seg->s_as;
	ulong_t		bin;
	int		mnode;
	int		mtype;
	lgrp_mnode_cookie_t	lgrp_cookie;

	/*
	 * If we aren't passed a specific lgroup, or passed a freed lgrp
	 * assume we wish to allocate near to the current thread's home.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	flags &= ~PG_NORELOC;
	flags |= PGI_NOCAGE;

	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC)
		return (NULL);

	AS_2_BIN(as, seg, obj->vnode, vaddr, bin, 0);

	ASSERT(bin < PAGE_GET_PAGECOLORS(0));

	MTYPE_INIT(mtype, obj->vnode, vaddr, flags, MMU_PAGESIZE);

	VM_STAT_ADD(vmm_vmstats.pgc_alloc);

	/*
	 * Try local cachelists first
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
	}

	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);

	/*
	 * Try freelists/cachelists that are farther away
	 * This is our only chance to allocate remote pages for PAGESIZE
	 * requests.
	 */
	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_mnode_freelist(mnode, bin, mtype,
		    0, flags);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
	}

	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
	return (NULL);
}
page_t *
page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
{
	kmutex_t		*pcm;
	page_t			*pp, *first_pp;
	uint_t			sbin;
	int			plw_initialized;
	page_list_walker_t	plw;

	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);

	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
		return (NULL);
	}

try_again:

	plw_initialized = 0;
	plw.plw_ceq_dif = 1;

	/*
	 * Only hold one cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */

	for (plw.plw_count = 0;
	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
		sbin = bin;
		do {

			if (!PAGE_CACHELISTS(mnode, bin, mtype))
				goto bin_empty_1;

			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			if (pp == NULL)
				goto bin_empty_0;

			first_pp = pp;
			VERIFY(pp->p_object);
			ASSERT(pp->p_vnode);
			ASSERT(PP_ISAGED(pp) == 0);
			ASSERT(pp->p_szc == 0);
			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
			while (!page_trylock(pp, SE_EXCL)) {
				pp = pp->p_next;
				ASSERT(pp->p_szc == 0);
				if (pp == first_pp) {
					/*
					 * We have searched the complete list!
					 * And all of them (might only be one)
					 * are locked. This can happen since
					 * these pages can also be found via
					 * the hash list. When found via the
					 * hash list, they are locked first,
					 * then removed. We give up to let the
					 * other thread run.
					 */
					pp = NULL;
					break;
				}
				VERIFY(pp->p_object);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
				    mnode);
			}

			if (pp) {
				page_t	**ppp;
				/*
				 * Found and locked a page.
				 * Pull it off the list.
				 */
				ASSERT(mtype == PP_2_MTYPE(pp));
				ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
				page_sub(ppp, pp);
				/*
				 * Subtract counters before releasing pcm mutex
				 * to avoid a race with page_freelist_coalesce
				 * and page_freelist_split.
				 */
				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
				mutex_exit(pcm);
				VERIFY(pp->p_object);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
				return (pp);
			}
bin_empty_0:
			mutex_exit(pcm);
bin_empty_1:
			if (plw_initialized == 0) {
				page_list_walk_init(0, flags, bin, 0, 1, &plw);
				plw_initialized = 1;
			}
			/* calculate the next bin with equivalent color */
			bin = ADD_MASKED(bin, plw.plw_bin_step,
			    plw.plw_ceq_mask[0], plw.plw_color_mask);
		} while (sbin != bin);

		if (plw.plw_ceq_dif > 1)
			bin = page_list_walk_next_bin(0, bin, &plw);
	}

	MTYPE_NEXT(mnode, mtype, flags);
	if (mtype >= 0)
		goto try_again;

	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
	return (NULL);
}
#define	REPL_PAGE_STATS

#ifdef REPL_PAGE_STATS
struct repl_page_stats {
	uint_t	ngets;
	uint_t	ngets_noreloc;
	uint_t	npgr_noreloc;
	uint_t	nnopage_first;
	uint_t	nnopage;
	uint_t	nhashout;
	uint_t	nnofree;
	uint_t	nnext_pp;
} repl_page_stats;
#define	REPL_STAT_INCR(v)	atomic_inc_32(&repl_page_stats.v)
#else /* REPL_PAGE_STATS */
#define	REPL_STAT_INCR(v)
#endif /* REPL_PAGE_STATS */
/*
 * The freemem accounting must be done by the caller.
 * First we try to get a replacement page of the same size as like_pp,
 * if that is not possible, then we just get a set of discontiguous
 * PAGESIZE pages.
 */
page_t *
page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
    uint_t pgrflags)
{
	page_t		*like_pp;
	page_t		*pp, *pplist;
	page_t		*pl = NULL;
	ulong_t		bin;
	int		mnode, page_mnode;
	int		szc;
	spgcnt_t	npgs, pg_cnt;
	pfn_t		pfnum;
	int		mtype;
	int		flags = 0;
	lgrp_t		*lgrp;
	lgrp_mnode_cookie_t	lgrp_cookie;

	REPL_STAT_INCR(ngets);
	like_pp = orig_like_pp;
	ASSERT(PAGE_EXCL(like_pp));

	szc = like_pp->p_szc;
	npgs = page_get_pagecnt(szc);
	/*
	 * Now we reset like_pp to the base page_t.
	 * That way, we won't walk past the end of this 'szc' page.
	 */
	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
	like_pp = page_numtopp_nolock(pfnum);
	ASSERT(like_pp->p_szc == szc);

	VERIFY0(PP_ISNORELOC(like_pp));
	VERIFY0(pgrflags & PGR_NORELOC);

	/*
	 * Kernel pages must always be replaced with the same size
	 * pages, since we cannot properly handle demotion of kernel
	 * pages.
	 */
	if (PP_ISKAS(like_pp))
		pgrflags |= PGR_SAMESZC;

	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);

	while (npgs) {
		pplist = NULL;
		for (;;) {
			pg_cnt = page_get_pagecnt(szc);
			bin = PP_2_BIN(like_pp);
			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
			ASSERT(pg_cnt <= npgs);

			/*
			 * If an lgroup was specified, try to get the
			 * page from that lgroup.
			 * NOTE: Must be careful with code below because
			 *	 lgroup may disappear and reappear since there
			 *	 is no locking for lgroup here.
			 */
			if (LGRP_EXISTS(lgrp_target)) {
				/*
				 * Keep local variable for lgroup separate
				 * from lgroup argument since this code should
				 * only be exercised when lgroup argument
				 * exists.
				 */
				lgrp = lgrp_target;

				/* Try the lgroup's freelists first */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);
				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist =
					    page_get_mnode_freelist(mnode, bin,
					    mtype, szc, flags);
				}

				/*
				 * Now try it's cachelists if this is a
				 * small page. Don't need to do it for
				 * larger ones since page_freelist_coalesce()
				 * already failed.
				 */
				if (pplist != NULL || szc != 0)
					break;

				/* Now try it's cachelists */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);
				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist =
					    page_get_mnode_cachelist(bin, flags,
					    mnode, mtype);
				}
				if (pplist != NULL) {
					page_hashout(pplist, false);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
				/* Done looking in this lgroup. Bail out. */
				break;
			}

			/*
			 * No lgroup was specified (or lgroup was removed by
			 * DR, so just try to get the page as close to
			 * like_pp's mnode as possible.
			 * First try the local freelist...
			 */
			mnode = PP_2_MEM_NODE(like_pp);
			pplist = page_get_mnode_freelist(mnode, bin,
			    mtype, szc, flags);
			if (pplist != NULL)
				break;

			REPL_STAT_INCR(nnofree);

			/*
			 * ...then the local cachelist. Don't need to do it for
			 * larger pages cause page_freelist_coalesce() already
			 * failed there anyway.
			 */
			if (szc == 0) {
				pplist = page_get_mnode_cachelist(bin, flags,
				    mnode, mtype);
				if (pplist != NULL) {
					page_hashout(pplist, false);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/* Now try remote freelists */
			lgrp =
			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL &&
			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
			    != -1) {
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
			}

			if (pplist != NULL)
				break;

			/* Now try remote cachelists */
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL && szc == 0) {
				mnode = lgrp_memnode_choose(&lgrp_cookie);
				if (mnode == -1)
					break;
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_cachelist(bin,
				    flags, mnode, mtype);

				if (pplist != NULL) {
					page_hashout(pplist, false);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/*
			 * Break out of while loop under the following cases:
			 * - If we successfully got a page.
			 * - If pgrflags specified only returning a specific
			 *   page size and we could not find that page size.
			 * - If we could not satisfy the request with PAGESIZE
			 *   or larger pages.
			 */
			if (pplist != NULL || szc == 0)
				break;

			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
				/* try to find contig page */

				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_HIER);

				while ((pplist == NULL) &&
				    (mnode =
				    lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist = page_get_contig_pages(
					    mnode, bin, mtype, szc,
					    flags | PGI_PGCPHIPRI);
				}
				break;
			}

			/*
			 * The correct thing to do here is try the next
			 * page size down using szc--. Due to a bug
			 * with the processing of HAT_RELOAD_SHARE
			 * where the sfmmu_ttecnt arrays of all
			 * hats sharing an ISM segment don't get updated,
			 * using intermediate size pages for relocation
			 * can lead to continuous page faults.
			 */
			szc = 0;
		}

		if (pplist != NULL) {
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);

			while (pplist != NULL && pg_cnt--) {
				ASSERT(pplist != NULL);
				pp = pplist;
				page_sub(&pplist, pp);
				page_list_concat(&pl, &pp);
				npgs--;
				like_pp = like_pp + 1;
				REPL_STAT_INCR(nnext_pp);
			}
			ASSERT(pg_cnt == 0);
		} else {
			break;
		}
	}

	if (npgs) {
		/*
		 * We were unable to allocate the necessary number
		 * of pages.
		 * We need to free up any pl.
		 */
		REPL_STAT_INCR(nnopage);
		page_free_replacement_page(pl);
		return (NULL);
	} else {
		return (pl);
	}
}
/*
 * demote a free large page to its constituent pages
 */
void
page_demote_free_pages(page_t *pp)
{
	int mnode;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

	mnode = PP_2_MEM_NODE(pp);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
		    pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	page_freelist_unlock(mnode);
	ASSERT(pp->p_szc == 0);
}
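/*
 * Worked example for the conversion done by page_set_colorequiv_arr() below
 * (values are illustrative): with colorequiv set to 8 in /etc/system,
 * sv_a = lowbit(8) - 1 = 3.  For a page size with only 4 colors,
 * 4 >> 3 == 0, so 'a' is reduced to 2 and colorequivszc[i] becomes 0x20,
 * i.e. the 2 high-order color bits are ignored and all 4 colors of that
 * size are treated as equivalent.
 */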
/*
 * Factor in colorequiv to check additional 'equivalent' bins.
 * colorequiv may be set in /etc/system
 */
void
page_set_colorequiv_arr(void)
{
	if (colorequiv > 1) {
		int i;
		uint_t sv_a = lowbit(colorequiv) - 1;

		if (sv_a > 15)
			sv_a = 15;

		for (i = 0; i < MMU_PAGE_SIZES; i++) {
			uint_t colors;
			uint_t a = sv_a;

			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			if ((a << 4) > colorequivszc[i]) {
				colorequivszc[i] = (a