4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
26 * Copyright 2012 Joyent, Inc. All rights reserved.
29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
30 /* All Rights Reserved */
33 * Portions of this source code were derived from Berkeley 4.3 BSD
34 * under license from the Regents of the University of California.
39 * This file contains common functions to access and manage the page lists.
40 * Many of these routines originated from platform dependent modules
41 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function in
42 * a platform independent manner.
44 * vm/vm_dep.h provides for platform specific support.
47 #include <sys/types.h>
48 #include <sys/debug.h>
49 #include <sys/cmn_err.h>
50 #include <sys/systm.h>
51 #include <sys/atomic.h>
52 #include <sys/sysmacros.h>
55 #include <vm/seg_kmem.h>
56 #include <vm/seg_vn.h>
57 #include <sys/vmsystm.h>
58 #include <sys/memnode.h>
59 #include <vm/vm_dep.h>
61 #include <sys/mem_config.h>
62 #include <sys/callb.h>
63 #include <sys/mem_cage.h>
65 #include <sys/dumphdr.h>
68 extern uint_t vac_colors
;
70 #define MAX_PRAGMA_ALIGN 128
72 /* vm_cpu_data0 for the boot cpu before kmem is initialized */
74 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
75 #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0)
77 #pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0)
79 char vm_cpu_data0
[VM_CPU_DATA_PADSIZE
];
82 * number of page colors equivalent to requested color in page_get routines.
83 * If set, keeps large pages intact longer and keeps MPO allocation
84 * from the local mnode in favor of acquiring the 'correct' page color from
85 * a demoted large page or from a remote mnode.
90 * color equivalency mask for each page size.
91 * Mask is computed based on cpu L2$ way sizes and colorequiv global.
92 * High 4 bits determine the number of high order bits of the color to ignore.
93 * Low 4 bits determine the number of low order bits of color to ignore (it's only
94 * relevant for hashed index based page coloring).
96 uchar_t colorequivszc
[MMU_PAGE_SIZES
];
99 * if set, specifies the percentage of large pages that are free from within
100 * a large page region before attempting to lock those pages for
101 * page_get_contig_pages processing.
103 * Should be turned on when kpr is available when page_trylock_contig_pages
104 * can be more selective.
110 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
111 * Enabled by default via pgcplimitsearch.
113 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
114 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
115 * bound. This upper bound range guarantees:
116 * - all large page 'slots' will be searched over time
117 * - the minimum (1) large page candidates considered on each pgcp call
118 * - count doesn't wrap around to 0
120 pgcnt_t pgcpfailcnt
[MMU_PAGE_SIZES
];
121 int pgcplimitsearch
= 1;
123 #define PGCPFAILMAX (1 << (highbit(physinstalled) - 1))
124 #define SETPGCPFAILCNT(szc) \
125 if (++pgcpfailcnt[szc] >= PGCPFAILMAX) \
126 pgcpfailcnt[szc] = PGCPFAILMAX / 2;
129 struct vmm_vmstats_str vmm_vmstats
;
131 #endif /* VM_STATS */
136 /* enable page_get_contig_pages */
140 int pg_contig_disable
;
141 int pg_lpgcreate_nocage
= LPGCREATE
;
144 * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
148 /* Flags involved in promotion and demotion routines */
149 #define PC_FREE 0x1 /* put page on freelist */
150 #define PC_ALLOC 0x2 /* return page for allocation */
153 * Flag for page_demote to be used with PC_FREE to denote that we don't care
154 * what the color is as the color parameter to the function is ignored.
156 #define PC_NO_COLOR (-1)
158 /* mtype value for page_promote to use when mtype does not matter */
159 #define PC_MTYPE_ANY (-1)
162 * page counters candidates info
163 * See page_ctrs_cands comment below for more details.
164 * fields are as follows:
165 * pcc_pages_free: # pages which freelist coalesce can create
166 * pcc_color_free: pointer to page free counts per color
168 typedef struct pcc_info
{
169 pgcnt_t pcc_pages_free
;
170 pgcnt_t
*pcc_color_free
;
175 * On big machines it can take a long time to check page_counters
176 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
177 * updated sum of all elements of the corresponding page_counters arrays.
178 * page_freelist_coalesce() searches page_counters only if an appropriate
179 * element of page_ctrs_cands array is greater than 0.
181 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
183 pcc_info_t
**page_ctrs_cands
[NPC_MUTEX
][MMU_PAGE_SIZES
];
186 * Return in val the total number of free pages which can be created
187 * for the given mnode (m), mrange (g), and region size (r)
189 #define PGCTRS_CANDS_GETVALUE(m, g, r, val) { \
192 for (i = 0; i < NPC_MUTEX; i++) { \
193 val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free; \
198 * Return in val the total number of free pages which can be created
199 * for the given mnode (m), mrange (g), region size (r), and color (c)
201 #define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) { \
204 ASSERT((c) < PAGE_GET_PAGECOLORS(r)); \
205 for (i = 0; i < NPC_MUTEX; i++) { \
207 page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)]; \
212 * We can only allow a single thread to update a counter within the physical
213 * range of the largest supported page size. That is the finest granularity
214 * possible since the counter values are dependent on each other
215 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
216 * ctr_mutex lock index for a particular physical range.
218 static kmutex_t
*ctr_mutex
[NPC_MUTEX
];
220 #define PP_CTR_LOCK_INDX(pp) \
221 (((pp)->p_pagenum >> \
222 (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
224 #define INVALID_COLOR 0xffffffff
225 #define INVALID_MASK 0xffffffff
228 * Local functions prototypes.
231 void page_ctr_add(int, int, page_t
*, int);
232 void page_ctr_add_internal(int, int, page_t
*, int);
233 void page_ctr_sub(int, int, page_t
*, int);
234 void page_ctr_sub_internal(int, int, page_t
*, int);
235 void page_freelist_lock(int);
236 void page_freelist_unlock(int);
237 page_t
*page_promote(int, pfn_t
, uchar_t
, int, int);
238 page_t
*page_demote(int, pfn_t
, pfn_t
, uchar_t
, uchar_t
, int, int);
239 page_t
*page_freelist_split(uchar_t
,
240 uint_t
, int, int, pfn_t
, pfn_t
, page_list_walker_t
*);
241 page_t
*page_get_mnode_cachelist(uint_t
, uint_t
, int, int);
242 static int page_trylock_cons(page_t
*pp
, se_t se
);
245 * The page_counters array below is used to keep track of free contiguous
246 * physical memory. A hw_page_map_t will be allocated per mnode per szc.
247 * This contains an array of counters, the size of the array, a shift value
248 * used to convert a pagenum into a counter array index or vice versa, as
249 * well as a cache of the last successful index to be promoted to a larger
250 * page size. As an optimization, we keep track of the last successful index
251 * to be promoted per page color for the given size region, and this is
252 * allocated dynamically based upon the number of colors for a given
255 * Conceptually, the page counters are represented as:
257 * page_counters[region_size][mnode]
259 * region_size: size code of a candidate larger page made up
260 * of contiguous free smaller pages.
262 * page_counters[region_size][mnode].hpm_counters[index]:
263 * represents how many (region_size - 1) pages either
264 * exist or can be created within the given index range.
266 * Let's look at a sparc example:
267 * If we want to create a free 512k page, we look at region_size 2
268 * for the mnode we want. We calculate the index and look at a specific
269 * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at
270 * this location, it means that 8 64k pages either exist or can be created
271 * from 8K pages in order to make a single free 512k page at the given
272 * index. Note that when a region is full, it will contribute to the
273 * counts in the region above it. Thus we will not know what page
274 * size the free pages will be which can be promoted to this new free
275 * page unless we look at all regions below the current region.
279 * Note: hpmctr_t is defined in platform vm_dep.h
280 * hw_page_map_t contains all the information needed for the page_counters
281 * logic. The fields are as follows:
283 * hpm_counters: dynamically allocated array to hold counter data
284 * hpm_entries: entries in hpm_counters
285 * hpm_shift: shift for pnum/array index conv
286 * hpm_base: PFN mapped to counter index 0
287 * hpm_color_current: last index in counter array for this color at
288 * which we successfully created a large page
290 typedef struct hw_page_map
{
291 hpmctr_t
*hpm_counters
;
295 size_t *hpm_color_current
[MAX_MNODE_MRANGES
];
302 * Element zero is not used, but is allocated for convenience.
304 static hw_page_map_t
*page_counters
[MMU_PAGE_SIZES
];
307 * Cached value of MNODE_RANGE_CNT(mnode).
308 * This is a function call in x86.
310 static int mnode_nranges
[MAX_MEM_NODES
];
311 static int mnode_maxmrange
[MAX_MEM_NODES
];
314 * The following macros are convenient ways to get access to the individual
315 * elements of the page_counters arrays. They can be used on both
316 * the left side and right side of equations.
318 #define PAGE_COUNTERS(mnode, rg_szc, idx) \
319 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
321 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \
322 (page_counters[(rg_szc)][(mnode)].hpm_counters)
324 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \
325 (page_counters[(rg_szc)][(mnode)].hpm_shift)
327 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \
328 (page_counters[(rg_szc)][(mnode)].hpm_entries)
330 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \
331 (page_counters[(rg_szc)][(mnode)].hpm_base)
333 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g) \
334 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
336 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange) \
337 (page_counters[(rg_szc)][(mnode)]. \
338 hpm_color_current[(mrange)][(color)])
340 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \
341 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \
342 PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
344 #define IDX_TO_PNUM(mnode, rg_szc, index) \
345 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \
346 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
349 * Protects the hpm_counters and hpm_color_current memory from changing while
350 * looking at page counters information.
351 * Grab the write lock to modify what these fields point at.
352 * Grab the read lock to prevent any pointers from changing.
353 * The write lock can not be held during memory allocation due to a possible
354 * recursion deadlock with trying to grab the read lock while the
355 * write lock is already held.
357 krwlock_t page_ctrs_rwlock
[MAX_MEM_NODES
];
361 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
364 cpu_vm_data_init(struct cpu
*cp
)
367 cp
->cpu_vm_data
= (void *)&vm_cpu_data0
;
373 align
= (L2CACHE_ALIGN
) ? L2CACHE_ALIGN
: L2CACHE_ALIGN_MAX
;
374 sz
= P2ROUNDUP(sizeof (vm_cpu_data_t
), align
) + align
;
375 kmptr
= kmem_zalloc(sz
, KM_SLEEP
);
376 cp
->cpu_vm_data
= (void *) P2ROUNDUP((uintptr_t)kmptr
, align
);
377 ((vm_cpu_data_t
*)cp
->cpu_vm_data
)->vc_kmptr
= kmptr
;
378 ((vm_cpu_data_t
*)cp
->cpu_vm_data
)->vc_kmsize
= sz
;
386 cpu_vm_data_destroy(struct cpu
*cp
)
388 if (cp
->cpu_seqid
&& cp
->cpu_vm_data
) {
390 kmem_free(((vm_cpu_data_t
*)cp
->cpu_vm_data
)->vc_kmptr
,
391 ((vm_cpu_data_t
*)cp
->cpu_vm_data
)->vc_kmsize
);
393 cp
->cpu_vm_data
= NULL
;
398 * page size to page size code
401 page_szc(size_t pagesize
)
405 while (hw_page_array
[i
].hp_size
) {
406 if (pagesize
== hw_page_array
[i
].hp_size
)
414 * page size to page size code with the restriction that it be a supported
415 * user page size. If it's not a supported user page size, -1 will be returned.
418 page_szc_user_filtered(size_t pagesize
)
420 int szc
= page_szc(pagesize
);
421 if ((szc
!= -1) && (SZC_2_USERSZC(szc
) != -1)) {
428 * Return how many page sizes are available for the user to use. This is
429 * what the hardware supports and not based upon how the OS implements the
430 * support of different page sizes.
432 * If legacy is non-zero, return the number of pagesizes available to legacy
433 * applications. The number of legacy page sizes might be less than the
434 * exported user page sizes. This is to prevent legacy applications that
435 * use the largest page size returned from getpagesizes(3c) from inadvertently
436 * using the 'new' large pagesizes.
439 page_num_user_pagesizes(int legacy
)
442 return (mmu_legacy_page_sizes
);
443 return (mmu_exported_page_sizes
);
/*
 * Return the current value of the global mmu_page_sizes.
 * Takes no arguments and reads no other state.
 */
447 page_num_pagesizes(void)
449 return (mmu_page_sizes
);
453 * returns the count of the number of base pagesize pages associated with szc
456 page_get_pagecnt(uint_t szc
)
/* an out-of-range size code is treated as fatal: panic rather than index past hw_page_array */
458 if (szc
>= mmu_page_sizes
)
459 panic("page_get_pagecnt: out of range %d", szc
);
/* the count comes straight from the hp_pgcnt field of the szc entry */
460 return (hw_page_array
[szc
].hp_pgcnt
);
/*
 * Return the hp_size field of the hw_page_array entry for page size
 * code szc.  Panics if szc is not below mmu_page_sizes.
 */
464 page_get_pagesize(uint_t szc
)
466 if (szc
>= mmu_page_sizes
)
467 panic("page_get_pagesize: out of range %d", szc
);
468 return (hw_page_array
[szc
].hp_size
);
472 * Return the size of a page based upon the index passed in. An index of
473 * zero refers to the smallest page size in the system, and as index increases
474 * it refers to the next larger supported page size in the system.
475 * Note that szc and userszc may not be the same due to unsupported szc's on
479 page_get_user_pagesize(uint_t userszc
)
/* map the caller's user index to an internal size code via USERSZC_2_SZC */
481 uint_t szc
= USERSZC_2_SZC(userszc
);
/* the range check is applied to the translated szc, not to userszc itself */
483 if (szc
>= mmu_page_sizes
)
484 panic("page_get_user_pagesize: out of range %d", szc
);
485 return (hw_page_array
[szc
].hp_size
);
/*
 * Return PAGE_GET_SHIFT(szc) for page size code szc.
 * Panics if szc is not below mmu_page_sizes.
 */
489 page_get_shift(uint_t szc
)
491 if (szc
>= mmu_page_sizes
)
492 panic("page_get_shift: out of range %d", szc
);
493 return (PAGE_GET_SHIFT(szc
));
/*
 * Return PAGE_GET_PAGECOLORS(szc) for page size code szc.
 * Panics if szc is not below mmu_page_sizes.
 */
497 page_get_pagecolors(uint_t szc
)
499 if (szc
>= mmu_page_sizes
)
500 panic("page_get_pagecolors: out of range %d", szc
);
501 return (PAGE_GET_PAGECOLORS(szc
));
505 * this assigns the desired equivalent color after a split
508 page_correct_color(uchar_t szc
, uchar_t nszc
, uint_t color
,
509 uint_t ncolor
, uint_t ceq_mask
)
512 ASSERT(szc
< mmu_page_sizes
);
513 ASSERT(color
< PAGE_GET_PAGECOLORS(szc
));
514 ASSERT(ncolor
< PAGE_GET_PAGECOLORS(nszc
));
517 ncolor
= PAGE_CONVERT_COLOR(ncolor
, szc
, nszc
);
518 return (color
| (ncolor
& ~ceq_mask
));
522 * The interleaved_mnodes flag is set when mnodes overlap in
523 * the physbase..physmax range, but have disjoint slices.
524 * In this case hpm_counters is shared by all mnodes.
525 * This flag is set dynamically by the platform.
527 int interleaved_mnodes
= 0;
530 * Called by startup().
531 * Size up the per page size free list counters based on physmax
532 * of each node and max_mem_nodes.
534 * If interleaved_mnodes is set we need to find the first mnode that
535 * exists. hpm_counters for the first mnode will then be shared by
536 * all other mnodes. If interleaved_mnodes is not set, just set
537 * first=mnode each time. That means there will be no sharing.
542 int r
; /* region size */
544 int firstmn
; /* first mnode that exists */
550 pgcnt_t colors_per_szc
[MMU_PAGE_SIZES
];
553 * We need to determine how many page colors there are for each
554 * page size in order to allocate memory for any color specific
557 for (i
= 0; i
< mmu_page_sizes
; i
++) {
558 colors_per_szc
[i
] = PAGE_GET_PAGECOLORS(i
);
561 for (firstmn
= -1, mnode
= 0; mnode
< max_mem_nodes
; mnode
++) {
567 if (mem_node_config
[mnode
].exists
== 0)
570 HPM_COUNTERS_LIMITS(mnode
, physbase
, physmax
, firstmn
);
571 nranges
= MNODE_RANGE_CNT(mnode
);
572 mnode_nranges
[mnode
] = nranges
;
573 mnode_maxmrange
[mnode
] = MNODE_MAX_MRANGE(mnode
);
576 * determine size needed for page counter arrays with
577 * base aligned to large page size.
579 for (r
= 1; r
< mmu_page_sizes
; r
++) {
580 /* add in space for hpm_color_current */
581 ctrs_sz
+= sizeof (size_t) *
582 colors_per_szc
[r
] * nranges
;
584 if (firstmn
!= mnode
)
587 /* add in space for hpm_counters */
588 r_align
= page_get_pagecnt(r
);
590 r_base
&= ~(r_align
- 1);
591 r_pgcnt
= howmany(physmax
- r_base
+ 1, r_align
);
594 * Round up to always allocate on pointer sized
597 ctrs_sz
+= P2ROUNDUP((r_pgcnt
* sizeof (hpmctr_t
)),
598 sizeof (hpmctr_t
*));
602 for (r
= 1; r
< mmu_page_sizes
; r
++) {
603 ctrs_sz
+= (max_mem_nodes
* sizeof (hw_page_map_t
));
606 /* add in space for page_ctrs_cands and pcc_color_free */
607 ctrs_sz
+= sizeof (pcc_info_t
*) * max_mem_nodes
*
608 mmu_page_sizes
* NPC_MUTEX
;
610 for (mnode
= 0; mnode
< max_mem_nodes
; mnode
++) {
612 if (mem_node_config
[mnode
].exists
== 0)
615 nranges
= mnode_nranges
[mnode
];
616 ctrs_sz
+= sizeof (pcc_info_t
) * nranges
*
617 mmu_page_sizes
* NPC_MUTEX
;
618 for (r
= 1; r
< mmu_page_sizes
; r
++) {
619 ctrs_sz
+= sizeof (pgcnt_t
) * nranges
*
620 colors_per_szc
[r
] * NPC_MUTEX
;
625 ctrs_sz
+= (max_mem_nodes
* NPC_MUTEX
* sizeof (kmutex_t
));
627 /* size for page list counts */
631 * add some slop for roundups. page_ctrs_alloc will roundup the start
632 * address of the counters to ecache_alignsize boundary for every
635 return (ctrs_sz
+ max_mem_nodes
* L2CACHE_ALIGN
);
639 page_ctrs_alloc(caddr_t alloc_base
)
643 int r
; /* region size */
645 int firstmn
; /* first mnode that exists */
648 pgcnt_t colors_per_szc
[MMU_PAGE_SIZES
];
651 * We need to determine how many page colors there are for each
652 * page size in order to allocate memory for any color specific
655 for (i
= 0; i
< mmu_page_sizes
; i
++) {
656 colors_per_szc
[i
] = PAGE_GET_PAGECOLORS(i
);
659 for (r
= 1; r
< mmu_page_sizes
; r
++) {
660 page_counters
[r
] = (hw_page_map_t
*)alloc_base
;
661 alloc_base
+= (max_mem_nodes
* sizeof (hw_page_map_t
));
664 /* page_ctrs_cands and pcc_color_free array */
665 for (i
= 0; i
< NPC_MUTEX
; i
++) {
666 for (r
= 1; r
< mmu_page_sizes
; r
++) {
668 page_ctrs_cands
[i
][r
] = (pcc_info_t
**)alloc_base
;
669 alloc_base
+= sizeof (pcc_info_t
*) * max_mem_nodes
;
671 for (mnode
= 0; mnode
< max_mem_nodes
; mnode
++) {
674 if (mem_node_config
[mnode
].exists
== 0)
677 nranges
= mnode_nranges
[mnode
];
679 pi
= (pcc_info_t
*)alloc_base
;
680 alloc_base
+= sizeof (pcc_info_t
) * nranges
;
681 page_ctrs_cands
[i
][r
][mnode
] = pi
;
683 for (mrange
= 0; mrange
< nranges
; mrange
++) {
685 (pgcnt_t
*)alloc_base
;
686 alloc_base
+= sizeof (pgcnt_t
) *
695 for (i
= 0; i
< NPC_MUTEX
; i
++) {
696 ctr_mutex
[i
] = (kmutex_t
*)alloc_base
;
697 alloc_base
+= (max_mem_nodes
* sizeof (kmutex_t
));
700 /* initialize page list counts */
701 PLCNT_INIT(alloc_base
);
703 for (firstmn
= -1, mnode
= 0; mnode
< max_mem_nodes
; mnode
++) {
709 int nranges
= mnode_nranges
[mnode
];
711 if (mem_node_config
[mnode
].exists
== 0)
714 HPM_COUNTERS_LIMITS(mnode
, physbase
, physmax
, firstmn
);
716 for (r
= 1; r
< mmu_page_sizes
; r
++) {
718 * the page_counters base has to be aligned to the
719 * page count of page size code r otherwise the counts
720 * will cross large page boundaries.
722 r_align
= page_get_pagecnt(r
);
724 /* base needs to be aligned - lower to aligned value */
725 r_base
&= ~(r_align
- 1);
726 r_pgcnt
= howmany(physmax
- r_base
+ 1, r_align
);
727 r_shift
= PAGE_BSZS_SHIFT(r
);
729 PAGE_COUNTERS_SHIFT(mnode
, r
) = r_shift
;
730 PAGE_COUNTERS_ENTRIES(mnode
, r
) = r_pgcnt
;
731 PAGE_COUNTERS_BASE(mnode
, r
) = r_base
;
732 for (mrange
= 0; mrange
< nranges
; mrange
++) {
733 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode
,
734 r
, mrange
) = (size_t *)alloc_base
;
735 alloc_base
+= sizeof (size_t) *
738 for (i
= 0; i
< colors_per_szc
[r
]; i
++) {
739 uint_t color_mask
= colors_per_szc
[r
] - 1;
740 pfn_t pfnum
= r_base
;
743 MEM_NODE_ITERATOR_DECL(it
);
745 MEM_NODE_ITERATOR_INIT(pfnum
, mnode
, r
, &it
);
746 if (pfnum
== (pfn_t
)-1) {
749 PAGE_NEXT_PFN_FOR_COLOR(pfnum
, r
, i
,
750 color_mask
, color_mask
, &it
);
751 idx
= PNUM_TO_IDX(mnode
, r
, pfnum
);
752 idx
= (idx
>= r_pgcnt
) ? 0 : idx
;
754 for (mrange
= 0; mrange
< nranges
; mrange
++) {
755 PAGE_COUNTERS_CURRENT_COLOR(mnode
,
760 /* hpm_counters may be shared by all mnodes */
761 if (firstmn
== mnode
) {
762 PAGE_COUNTERS_COUNTERS(mnode
, r
) =
763 (hpmctr_t
*)alloc_base
;
765 P2ROUNDUP((sizeof (hpmctr_t
) * r_pgcnt
),
766 sizeof (hpmctr_t
*));
768 PAGE_COUNTERS_COUNTERS(mnode
, r
) =
769 PAGE_COUNTERS_COUNTERS(firstmn
, r
);
773 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
774 * satisfy the identity requirement.
775 * We should be able to go from one to the other
776 * and get consistent values.
778 ASSERT(PNUM_TO_IDX(mnode
, r
,
779 (IDX_TO_PNUM(mnode
, r
, 0))) == 0);
780 ASSERT(IDX_TO_PNUM(mnode
, r
,
781 (PNUM_TO_IDX(mnode
, r
, r_base
))) == r_base
);
784 * Roundup the start address of the page_counters to
785 * cache aligned boundary for every memory node.
786 * page_ctrs_sz() has added some slop for these roundups.
788 alloc_base
= (caddr_t
)P2ROUNDUP((uintptr_t)alloc_base
,
792 /* Initialize other page counter specific data structures. */
793 for (mnode
= 0; mnode
< MAX_MEM_NODES
; mnode
++) {
794 rw_init(&page_ctrs_rwlock
[mnode
], NULL
, RW_DEFAULT
, NULL
);
801 * Functions to adjust region counters for each size free list.
802 * Caller is responsible to acquire the ctr_mutex lock if necessary and
803 * thus can be called during startup without locks.
807 page_ctr_add_internal(int mnode
, int mtype
, page_t
*pp
, int flags
)
809 ssize_t r
; /* region size */
814 ASSERT(mnode
== PP_2_MEM_NODE(pp
));
815 ASSERT(mtype
== PP_2_MTYPE(pp
));
817 ASSERT(pp
->p_szc
< mmu_page_sizes
);
819 PLCNT_INCR(pp
, mnode
, mtype
, pp
->p_szc
, flags
);
821 /* no counter update needed for largest page size */
822 if (pp
->p_szc
>= mmu_page_sizes
- 1) {
827 pfnum
= pp
->p_pagenum
;
828 lckidx
= PP_CTR_LOCK_INDX(pp
);
831 * Increment the count of free pages for the current
832 * region. Continue looping up in region size incrementing
833 * count if the preceding region is full.
835 while (r
< mmu_page_sizes
) {
836 idx
= PNUM_TO_IDX(mnode
, r
, pfnum
);
838 ASSERT(idx
< PAGE_COUNTERS_ENTRIES(mnode
, r
));
839 ASSERT(PAGE_COUNTERS(mnode
, r
, idx
) < FULL_REGION_CNT(r
));
841 if (++PAGE_COUNTERS(mnode
, r
, idx
) != FULL_REGION_CNT(r
)) {
844 int root_mtype
= PP_2_MTYPE(PP_GROUPLEADER(pp
, r
));
845 pcc_info_t
*cand
= &page_ctrs_cands
[lckidx
][r
][mnode
]
846 [MTYPE_2_MRANGE(mnode
, root_mtype
)];
848 cand
->pcc_pages_free
++;
849 cand
->pcc_color_free
[PP_2_BIN_SZC(pp
, r
)]++;
856 page_ctr_add(int mnode
, int mtype
, page_t
*pp
, int flags
)
858 int lckidx
= PP_CTR_LOCK_INDX(pp
);
859 kmutex_t
*lock
= &ctr_mutex
[lckidx
][mnode
];
862 page_ctr_add_internal(mnode
, mtype
, pp
, flags
);
867 page_ctr_sub_internal(int mnode
, int mtype
, page_t
*pp
, int flags
)
870 ssize_t r
; /* region size */
874 ASSERT(mnode
== PP_2_MEM_NODE(pp
));
875 ASSERT(mtype
== PP_2_MTYPE(pp
));
877 ASSERT(pp
->p_szc
< mmu_page_sizes
);
879 PLCNT_DECR(pp
, mnode
, mtype
, pp
->p_szc
, flags
);
881 /* no counter update needed for largest page size */
882 if (pp
->p_szc
>= mmu_page_sizes
- 1) {
887 pfnum
= pp
->p_pagenum
;
888 lckidx
= PP_CTR_LOCK_INDX(pp
);
891 * Decrement the count of free pages for the current
892 * region. Continue looping up in region size decrementing
893 * count if the preceding region was full.
895 while (r
< mmu_page_sizes
) {
896 idx
= PNUM_TO_IDX(mnode
, r
, pfnum
);
898 ASSERT(idx
< PAGE_COUNTERS_ENTRIES(mnode
, r
));
899 ASSERT(PAGE_COUNTERS(mnode
, r
, idx
) > 0);
901 if (--PAGE_COUNTERS(mnode
, r
, idx
) != FULL_REGION_CNT(r
) - 1) {
904 int root_mtype
= PP_2_MTYPE(PP_GROUPLEADER(pp
, r
));
905 pcc_info_t
*cand
= &page_ctrs_cands
[lckidx
][r
][mnode
]
906 [MTYPE_2_MRANGE(mnode
, root_mtype
)];
908 ASSERT(cand
->pcc_pages_free
!= 0);
909 ASSERT(cand
->pcc_color_free
[PP_2_BIN_SZC(pp
, r
)] != 0);
911 cand
->pcc_pages_free
--;
912 cand
->pcc_color_free
[PP_2_BIN_SZC(pp
, r
)]--;
919 page_ctr_sub(int mnode
, int mtype
, page_t
*pp
, int flags
)
921 int lckidx
= PP_CTR_LOCK_INDX(pp
);
922 kmutex_t
*lock
= &ctr_mutex
[lckidx
][mnode
];
925 page_ctr_sub_internal(mnode
, mtype
, pp
, flags
);
930 * Adjust page counters following a memory attach, since typically the
931 * size of the array needs to change, and the PFN to counter index
932 * mapping needs to change.
934 * It is possible this mnode did not exist at startup. In that case
935 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
936 * to change (a theoretical possibility on x86), which means pcc_color_free
937 * arrays must be extended.
940 page_ctrs_adjust(int mnode
)
943 int r
; /* region size */
945 size_t pcsz
, old_csz
;
946 hpmctr_t
*new_ctr
, *old_ctr
;
947 pfn_t oldbase
, newbase
;
948 pfn_t physbase
, physmax
;
950 hpmctr_t
*ctr_cache
[MMU_PAGE_SIZES
];
951 size_t size_cache
[MMU_PAGE_SIZES
];
952 size_t *color_cache
[MMU_PAGE_SIZES
][MAX_MNODE_MRANGES
];
953 size_t *old_color_array
[MAX_MNODE_MRANGES
];
954 pgcnt_t colors_per_szc
[MMU_PAGE_SIZES
];
955 pcc_info_t
**cands_cache
;
956 pcc_info_t
*old_pi
, *pi
;
958 int nr
, old_nranges
, mrange
, nranges
= MNODE_RANGE_CNT(mnode
);
959 int cands_cache_nranges
;
960 int old_maxmrange
, new_maxmrange
;
964 cands_cache
= kmem_zalloc(sizeof (pcc_info_t
*) * NPC_MUTEX
*
965 MMU_PAGE_SIZES
, KM_NOSLEEP
);
966 if (cands_cache
== NULL
)
970 HPM_COUNTERS_LIMITS(mnode
, physbase
, physmax
, i
);
972 newbase
= physbase
& ~PC_BASE_ALIGN_MASK
;
973 npgs
= roundup(physmax
, PC_BASE_ALIGN
) - newbase
;
975 /* prepare to free non-null pointers on the way out */
976 cands_cache_nranges
= nranges
;
977 bzero(ctr_cache
, sizeof (ctr_cache
));
978 bzero(color_cache
, sizeof (color_cache
));
981 * We need to determine how many page colors there are for each
982 * page size in order to allocate memory for any color specific
985 for (r
= 0; r
< mmu_page_sizes
; r
++) {
986 colors_per_szc
[r
] = PAGE_GET_PAGECOLORS(r
);
990 * Preallocate all of the new hpm_counters arrays as we can't
991 * hold the page_ctrs_rwlock as a writer and allocate memory.
992 * If we can't allocate all of the arrays, undo our work so far
993 * and return failure.
995 for (r
= 1; r
< mmu_page_sizes
; r
++) {
996 pcsz
= npgs
>> PAGE_BSZS_SHIFT(r
);
997 size_cache
[r
] = pcsz
;
998 ctr_cache
[r
] = kmem_zalloc(pcsz
*
999 sizeof (hpmctr_t
), KM_NOSLEEP
);
1000 if (ctr_cache
[r
] == NULL
) {
1007 * Preallocate all of the new color current arrays as we can't
1008 * hold the page_ctrs_rwlock as a writer and allocate memory.
1009 * If we can't allocate all of the arrays, undo our work so far
1010 * and return failure.
1012 for (r
= 1; r
< mmu_page_sizes
; r
++) {
1013 for (mrange
= 0; mrange
< nranges
; mrange
++) {
1014 color_cache
[r
][mrange
] = kmem_zalloc(sizeof (size_t) *
1015 colors_per_szc
[r
], KM_NOSLEEP
);
1016 if (color_cache
[r
][mrange
] == NULL
) {
1024 * Preallocate all of the new pcc_info_t arrays as we can't
1025 * hold the page_ctrs_rwlock as a writer and allocate memory.
1026 * If we can't allocate all of the arrays, undo our work so far
1027 * and return failure.
1029 for (r
= 1; r
< mmu_page_sizes
; r
++) {
1030 for (i
= 0; i
< NPC_MUTEX
; i
++) {
1031 pi
= kmem_zalloc(nranges
* sizeof (pcc_info_t
),
1037 cands_cache
[i
* MMU_PAGE_SIZES
+ r
] = pi
;
1039 for (mrange
= 0; mrange
< nranges
; mrange
++, pi
++) {
1040 pgcntp
= kmem_zalloc(colors_per_szc
[r
] *
1041 sizeof (pgcnt_t
), KM_NOSLEEP
);
1042 if (pgcntp
== NULL
) {
1046 pi
->pcc_color_free
= pgcntp
;
1052 * Grab the write lock to prevent others from walking these arrays
1053 * while we are modifying them.
1055 PAGE_CTRS_WRITE_LOCK(mnode
);
1058 * For interleaved mnodes, find the first mnode
1059 * with valid page counters since the current
1060 * mnode may have just been added and not have
1061 * valid page counters.
1063 if (interleaved_mnodes
) {
1064 for (i
= 0; i
< max_mem_nodes
; i
++)
1065 if (PAGE_COUNTERS_COUNTERS(i
, 1) != NULL
)
1067 ASSERT(i
< max_mem_nodes
);
1072 old_nranges
= mnode_nranges
[mnode
];
1073 cands_cache_nranges
= old_nranges
;
1074 mnode_nranges
[mnode
] = nranges
;
1075 old_maxmrange
= mnode_maxmrange
[mnode
];
1076 mnode_maxmrange
[mnode
] = MNODE_MAX_MRANGE(mnode
);
1077 new_maxmrange
= mnode_maxmrange
[mnode
];
1079 for (r
= 1; r
< mmu_page_sizes
; r
++) {
1080 PAGE_COUNTERS_SHIFT(mnode
, r
) = PAGE_BSZS_SHIFT(r
);
1081 old_ctr
= PAGE_COUNTERS_COUNTERS(oldmnode
, r
);
1082 old_csz
= PAGE_COUNTERS_ENTRIES(oldmnode
, r
);
1083 oldbase
= PAGE_COUNTERS_BASE(oldmnode
, r
);
1084 old_npgs
= old_csz
<< PAGE_COUNTERS_SHIFT(oldmnode
, r
);
1085 for (mrange
= 0; mrange
< MAX_MNODE_MRANGES
; mrange
++) {
1086 old_color_array
[mrange
] =
1087 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode
,
1091 pcsz
= npgs
>> PAGE_COUNTERS_SHIFT(mnode
, r
);
1092 new_ctr
= ctr_cache
[r
];
1093 ctr_cache
[r
] = NULL
;
1094 if (old_ctr
!= NULL
&&
1095 (oldbase
+ old_npgs
> newbase
) &&
1096 (newbase
+ npgs
> oldbase
)) {
1098 * Map the intersection of the old and new
1099 * counters into the new array.
1102 if (newbase
> oldbase
) {
1103 offset
= (newbase
- oldbase
) >>
1104 PAGE_COUNTERS_SHIFT(mnode
, r
);
1105 bcopy(old_ctr
+ offset
, new_ctr
,
1106 MIN(pcsz
, (old_csz
- offset
)) *
1109 offset
= (oldbase
- newbase
) >>
1110 PAGE_COUNTERS_SHIFT(mnode
, r
);
1111 bcopy(old_ctr
, new_ctr
+ offset
,
1112 MIN(pcsz
- offset
, old_csz
) *
1117 PAGE_COUNTERS_COUNTERS(mnode
, r
) = new_ctr
;
1118 PAGE_COUNTERS_ENTRIES(mnode
, r
) = pcsz
;
1119 PAGE_COUNTERS_BASE(mnode
, r
) = newbase
;
1121 /* update shared hpm_counters in other mnodes */
1122 if (interleaved_mnodes
) {
1123 for (i
= 0; i
< max_mem_nodes
; i
++) {
1125 (mem_node_config
[i
].exists
== 0))
1128 PAGE_COUNTERS_COUNTERS(i
, r
) == old_ctr
||
1129 PAGE_COUNTERS_COUNTERS(i
, r
) == NULL
);
1130 PAGE_COUNTERS_COUNTERS(i
, r
) = new_ctr
;
1131 PAGE_COUNTERS_ENTRIES(i
, r
) = pcsz
;
1132 PAGE_COUNTERS_BASE(i
, r
) = newbase
;
1136 for (mrange
= 0; mrange
< MAX_MNODE_MRANGES
; mrange
++) {
1137 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode
, r
, mrange
) =
1138 color_cache
[r
][mrange
];
1139 color_cache
[r
][mrange
] = NULL
;
1142 * for now, just reset on these events as it's probably
1143 * not worthwhile to try and optimize this.
1145 for (i
= 0; i
< colors_per_szc
[r
]; i
++) {
1146 uint_t color_mask
= colors_per_szc
[r
] - 1;
1147 int mlo
= interleaved_mnodes
? 0 : mnode
;
1148 int mhi
= interleaved_mnodes
? max_mem_nodes
:
1153 MEM_NODE_ITERATOR_DECL(it
);
1155 for (m
= mlo
; m
< mhi
; m
++) {
1156 if (mem_node_config
[m
].exists
== 0)
1159 MEM_NODE_ITERATOR_INIT(pfnum
, m
, r
, &it
);
1160 if (pfnum
== (pfn_t
)-1) {
1163 PAGE_NEXT_PFN_FOR_COLOR(pfnum
, r
, i
,
1164 color_mask
, color_mask
, &it
);
1165 idx
= PNUM_TO_IDX(m
, r
, pfnum
);
1166 idx
= (idx
< pcsz
) ? idx
: 0;
1168 for (mrange
= 0; mrange
< nranges
; mrange
++) {
1169 if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m
,
1171 PAGE_COUNTERS_CURRENT_COLOR(m
,
1172 r
, i
, mrange
) = idx
;
1177 /* cache info for freeing out of the critical path */
1178 if ((caddr_t
)old_ctr
>= kernelheap
&&
1179 (caddr_t
)old_ctr
< ekernelheap
) {
1180 ctr_cache
[r
] = old_ctr
;
1181 size_cache
[r
] = old_csz
;
1183 for (mrange
= 0; mrange
< MAX_MNODE_MRANGES
; mrange
++) {
1184 size_t *tmp
= old_color_array
[mrange
];
1185 if ((caddr_t
)tmp
>= kernelheap
&&
1186 (caddr_t
)tmp
< ekernelheap
) {
1187 color_cache
[r
][mrange
] = tmp
;
1191 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
1192 * satisfy the identity requirement.
1193 * We should be able to go from one to the other
1194 * and get consistent values.
1196 ASSERT(PNUM_TO_IDX(mnode
, r
,
1197 (IDX_TO_PNUM(mnode
, r
, 0))) == 0);
1198 ASSERT(IDX_TO_PNUM(mnode
, r
,
1199 (PNUM_TO_IDX(mnode
, r
, newbase
))) == newbase
);
1201 /* pcc_info_t and pcc_color_free */
1202 for (i
= 0; i
< NPC_MUTEX
; i
++) {
1204 pcc_info_t
*eold_pi
;
1206 pi
= cands_cache
[i
* MMU_PAGE_SIZES
+ r
];
1207 old_pi
= page_ctrs_cands
[i
][r
][mnode
];
1208 page_ctrs_cands
[i
][r
][mnode
] = pi
;
1209 cands_cache
[i
* MMU_PAGE_SIZES
+ r
] = old_pi
;
1211 /* preserve old pcc_color_free values, if any */
1216 * when/if x86 does DR, must account for
1217 * possible change in range index when
1218 * preserving pcc_info
1221 eold_pi
= &old_pi
[old_nranges
];
1222 if (new_maxmrange
> old_maxmrange
) {
1223 pi
+= new_maxmrange
- old_maxmrange
;
1224 } else if (new_maxmrange
< old_maxmrange
) {
1225 old_pi
+= old_maxmrange
- new_maxmrange
;
1227 for (; pi
< epi
&& old_pi
< eold_pi
; pi
++, old_pi
++) {
1228 pcc_info_t tmp
= *pi
;
1234 PAGE_CTRS_WRITE_UNLOCK(mnode
);
1237 * Now that we have dropped the write lock, it is safe to free all
1238 * of the memory we have cached above.
1239 * We come thru here to free memory when pre-alloc fails, and also to
1240 * free old pointers which were recorded while locked.
1243 for (r
= 1; r
< mmu_page_sizes
; r
++) {
1244 if (ctr_cache
[r
] != NULL
) {
1245 kmem_free(ctr_cache
[r
],
1246 size_cache
[r
] * sizeof (hpmctr_t
));
1248 for (mrange
= 0; mrange
< MAX_MNODE_MRANGES
; mrange
++) {
1249 if (color_cache
[r
][mrange
] != NULL
) {
1250 kmem_free(color_cache
[r
][mrange
],
1251 colors_per_szc
[r
] * sizeof (size_t));
1254 for (i
= 0; i
< NPC_MUTEX
; i
++) {
1255 pi
= cands_cache
[i
* MMU_PAGE_SIZES
+ r
];
1258 nr
= cands_cache_nranges
;
1259 for (mrange
= 0; mrange
< nr
; mrange
++, pi
++) {
1260 pgcntp
= pi
->pcc_color_free
;
1263 if ((caddr_t
)pgcntp
>= kernelheap
&&
1264 (caddr_t
)pgcntp
< ekernelheap
) {
1270 pi
= cands_cache
[i
* MMU_PAGE_SIZES
+ r
];
1271 if ((caddr_t
)pi
>= kernelheap
&&
1272 (caddr_t
)pi
< ekernelheap
) {
1273 kmem_free(pi
, nr
* sizeof (pcc_info_t
));
1278 kmem_free(cands_cache
,
1279 sizeof (pcc_info_t
*) * NPC_MUTEX
* MMU_PAGE_SIZES
);
1284 * Cleanup the hpm_counters field in the page counters
/*
 * page_ctrs_cleanup: takes no arguments.
 *
 * Visits every memory node index i below max_mem_nodes while holding that
 * node's page counters write lock (PAGE_CTRS_WRITE_LOCK).  A node whose
 * mem_node_config[i].exists flag is set has its lock dropped again; the
 * statement that follows that unlock is not visible in this extract, so
 * presumably the node is then skipped -- TODO confirm.  For the other
 * nodes PAGE_COUNTERS_COUNTERS(i, r) is set to NULL for each region size
 * r and the lock is released afterwards.
 */
1288 page_ctrs_cleanup(void)
1290 int r
; /* region size */
1291 int i
; /* mnode index */
1294 * Get the page counters write lock while we are
1295 * setting the page hpm_counters field to NULL
1296 * for non-existent mnodes.
1298 for (i
= 0; i
< max_mem_nodes
; i
++) {
1299 PAGE_CTRS_WRITE_LOCK(i
);
1300 if (mem_node_config
[i
].exists
) {
1301 PAGE_CTRS_WRITE_UNLOCK(i
);
/* r starts at 1, so region size 0 is left untouched by this loop. */
1304 for (r
= 1; r
< mmu_page_sizes
; r
++) {
1305 PAGE_COUNTERS_COUNTERS(i
, r
) = NULL
;
1307 PAGE_CTRS_WRITE_UNLOCK(i
);
1314 * confirm pp is a large page corresponding to szc
/*
 * chk_lpg(pp, szc): consistency checks on a page list rooted at pp.
 *
 *   pp  - first page_t of the page being checked.
 *   szc - size code every constituent page is asserted to carry.
 *
 * The body consists of assertions only, as far as this extract shows.
 * npgs is derived from page_get_pagecnt(pp->p_szc).  One group of checks
 * covers a self-linked page with p_szc == 0; the other covers a large
 * page: largepg.next/prev equal to pp or NULL, p_pagenum aligned to npgs,
 * and p_prev pointing at pp + (npgs - 1).  The per-page checks require
 * consecutive pfns and consecutive page_t addresses, p_szc == szc, the
 * page to be free and aged, no p_object and no p_vnode, and the same
 * PP_ISNORELOC() value as the first page.
 *
 * NOTE(review): the conditions selecting between the two groups and the
 * loop header are not visible here -- confirm against the full file.
 * NOTE(review): the p_object check uses VERIFY rather than ASSERT, so it
 * presumably stays active in non-debug builds -- confirm.
 */
1317 chk_lpg(page_t
*pp
, uchar_t szc
)
1319 spgcnt_t npgs
= page_get_pagecnt(pp
->p_szc
);
1323 ASSERT(pp
->p_szc
== 0);
1324 ASSERT(pp
->p_next
== pp
);
1325 ASSERT(pp
->p_prev
== pp
);
1329 ASSERT(pp
->p_list
.largepg
.next
== pp
|| pp
->p_list
.largepg
.next
== NULL
);
1330 ASSERT(pp
->p_list
.largepg
.prev
== pp
|| pp
->p_list
.largepg
.prev
== NULL
);
1332 ASSERT(IS_P2ALIGNED(pp
->p_pagenum
, npgs
));
1333 ASSERT(pp
->p_pagenum
== (pp
->p_next
->p_pagenum
- 1));
1334 ASSERT(pp
->p_prev
->p_pagenum
== (pp
->p_pagenum
+ (npgs
- 1)));
1335 ASSERT(pp
->p_prev
== (pp
+ (npgs
- 1)));
1338 * Check list of pages.
1340 noreloc
= PP_ISNORELOC(pp
);
1343 ASSERT(pp
->p_pagenum
== pp
->p_next
->p_pagenum
- 1);
1344 ASSERT(pp
->p_next
== (pp
+ 1));
1346 ASSERT(pp
->p_szc
== szc
);
1347 ASSERT(PP_ISFREE(pp
));
1348 ASSERT(PP_ISAGED(pp
));
1349 ASSERT(pp
->p_list
.largepg
.next
== pp
|| pp
->p_list
.largepg
.next
== NULL
);
1350 ASSERT(pp
->p_list
.largepg
.prev
== pp
|| pp
->p_list
.largepg
.prev
== NULL
);
1351 VERIFY(pp
->p_object
== NULL
);
1352 ASSERT(pp
->p_vnode
== NULL
);
1353 ASSERT(PP_ISNORELOC(pp
) == noreloc
);
/*
 * page_freelist_lock(mnode): lock every freelist and cachelist bin mutex
 * of one memory node.
 *
 *   mnode - memory node whose mutexes are taken.
 *
 * For each index i from 0 up to NPC_MUTEX - 1 the FPC_MUTEX(mnode, i)
 * mutex is entered first and the CPC_MUTEX(mnode, i) mutex second.
 * Callers in this file pair it with page_freelist_unlock() around page
 * promotion and demotion.
 */
1361 page_freelist_lock(int mnode
)
1364 for (i
= 0; i
< NPC_MUTEX
; i
++) {
1365 mutex_enter(FPC_MUTEX(mnode
, i
));
1366 mutex_enter(CPC_MUTEX(mnode
, i
));
/*
 * page_freelist_unlock(mnode): release the mutexes taken by
 * page_freelist_lock() for the same memory node.
 *
 *   mnode - memory node whose mutexes are dropped.
 *
 * The mutexes are exited in the same ascending order in which they were
 * entered (FPC_MUTEX then CPC_MUTEX for each index below NPC_MUTEX).
 */
1371 page_freelist_unlock(int mnode
)
1374 for (i
= 0; i
< NPC_MUTEX
; i
++) {
1375 mutex_exit(FPC_MUTEX(mnode
, i
));
1376 mutex_exit(CPC_MUTEX(mnode
, i
));
1381 * add pp to the specified page list. Defaults to head of the page list
1382 * unless PG_LIST_TAIL is specified.
/*
 * page_list_add(pp, flags): summary of what the visible body shows.
 *
 *   pp    - page to add.  Asserted to be free, unmapped, to have a share
 *           count of 0 and p_szc == 0, and to be held PAGE_EXCL unless
 *           PG_LIST_ISINIT is set.
 *   flags - PG_LIST_ISINIT: insert into PAGE_FREELISTS(mnode, 0, bin,
 *           mtype) and update counters through page_ctr_add_internal().
 *           Otherwise the mutex from PC_BIN_MUTEX(mnode, bin, flags) is
 *           used; PG_FREE_LIST selects the free list (page asserted
 *           aged), the other branch selects PAGE_CACHELISTS and requires
 *           p_object and p_vnode to be set and p_offset to be page
 *           aligned.  PG_LIST_TAIL advances the list head to its p_next
 *           after the insertion.
 *
 * Counters are updated with page_ctr_add() in the locked path.  Under
 * __sparc, kcage_freemem_add(1) is called for PP_ISNORELOC() pages.  The
 * page lock is not released here; the final assertion re-checks it.
 *
 * NOTE(review): the computation of bin and the mutex enter/exit calls
 * are not visible in this extract -- confirm against the full file.
 */
1385 page_list_add(page_t
*pp
, int flags
)
1392 ASSERT(PAGE_EXCL(pp
) || (flags
& PG_LIST_ISINIT
));
1393 ASSERT(PP_ISFREE(pp
));
1394 ASSERT(!hat_page_is_mapped(pp
));
1395 ASSERT(hat_page_getshare(pp
) == 0);
1398 * Large pages should be freed via page_list_add_pages().
1400 ASSERT(pp
->p_szc
== 0);
1403 * Don't need to lock the freelist first here
1404 * because the page isn't on the freelist yet.
1405 * This means p_szc can't change on us.
1409 mnode
= PP_2_MEM_NODE(pp
);
1410 mtype
= PP_2_MTYPE(pp
);
1412 if (flags
& PG_LIST_ISINIT
) {
1414 * PG_LIST_ISINIT is set during system startup (ie. single
1415 * threaded), add a page to the free list and add to the
1416 * free region counters w/o any locking
1418 ppp
= &PAGE_FREELISTS(mnode
, 0, bin
, mtype
);
1420 /* inline version of page_add() */
1423 pp
->p_prev
= (*ppp
)->p_prev
;
1424 (*ppp
)->p_prev
= pp
;
1425 pp
->p_prev
->p_next
= pp
;
1429 page_ctr_add_internal(mnode
, mtype
, pp
, flags
);
1430 VM_STAT_ADD(vmm_vmstats
.pladd_free
[0]);
1432 pcm
= PC_BIN_MUTEX(mnode
, bin
, flags
);
1434 if (flags
& PG_FREE_LIST
) {
1435 VM_STAT_ADD(vmm_vmstats
.pladd_free
[0]);
1436 ASSERT(PP_ISAGED(pp
));
1437 ppp
= &PAGE_FREELISTS(mnode
, 0, bin
, mtype
);
1440 VM_STAT_ADD(vmm_vmstats
.pladd_cache
);
1441 VERIFY(pp
->p_object
);
1442 ASSERT(pp
->p_vnode
);
1443 ASSERT((pp
->p_offset
& PAGEOFFSET
) == 0);
1444 ppp
= &PAGE_CACHELISTS(mnode
, bin
, mtype
);
1449 if (flags
& PG_LIST_TAIL
)
1450 *ppp
= (*ppp
)->p_next
;
1452 * Add counters before releasing pcm mutex to avoid a race with
1453 * page_freelist_coalesce and page_freelist_split.
1455 page_ctr_add(mnode
, mtype
, pp
, flags
);
1460 #if defined(__sparc)
1461 if (PP_ISNORELOC(pp
)) {
1462 kcage_freemem_add(1);
1466 * It is up to the caller to unlock the page!
1468 ASSERT(PAGE_EXCL(pp
) || (flags
& PG_LIST_ISINIT
));
1474 * This routine is only used by kcage_init during system startup.
1475 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1476 * without the overhead of taking locks and updating counters.
/*
 * page_list_noreloc_startup(pp): visible steps, in order.
 *
 *   pp - page to move between lists.
 *
 * 1. page_boot_demote(pp) is called for a large page (the guarding
 *    condition is not visible here -- TODO confirm).
 * 2. The current list is located from PP_2_MEM_NODE() and PP_2_MTYPE();
 *    mtype is asserted to be MTYPE_RELOC and p_szc to be 0.  PP_ISAGED()
 *    selects PAGE_FREELISTS or PAGE_CACHELISTS and the matching
 *    PG_FREE_LIST or PG_CACHE_LIST flag.
 * 3. pp is unlinked by hand and page_ctr_sub_internal() is called.
 * 4. mtype is read again and asserted to be MTYPE_NORELOC, so the page
 *    is presumably marked no-reloc between steps 3 and 4 (that statement
 *    is not visible here -- TODO confirm).
 * 5. pp is linked into the list for the new mtype,
 *    page_ctr_add_internal() is called, and kcage_freemem is
 *    incremented with atomic_inc_ulong().
 */
1479 page_list_noreloc_startup(page_t
*pp
)
1488 * If this is a large page on the freelist then
1489 * break it up into smaller pages.
1492 page_boot_demote(pp
);
1495 * Get list page is currently on.
1498 mnode
= PP_2_MEM_NODE(pp
);
1499 mtype
= PP_2_MTYPE(pp
);
1500 ASSERT(mtype
== MTYPE_RELOC
);
1501 ASSERT(pp
->p_szc
== 0);
1503 if (PP_ISAGED(pp
)) {
1504 ppp
= &PAGE_FREELISTS(mnode
, 0, bin
, mtype
);
1505 flags
|= PG_FREE_LIST
;
1507 ppp
= &PAGE_CACHELISTS(mnode
, bin
, mtype
);
1508 flags
|= PG_CACHE_LIST
;
1511 ASSERT(*ppp
!= NULL
);
1514 * Delete page from current list.
1517 *ppp
= pp
->p_next
; /* go to next page */
1519 *ppp
= NULL
; /* page list is gone */
1521 pp
->p_prev
->p_next
= pp
->p_next
;
1522 pp
->p_next
->p_prev
= pp
->p_prev
;
1526 * Decrement page counters
1528 page_ctr_sub_internal(mnode
, mtype
, pp
, flags
);
1531 * Set no reloc for cage initted pages.
1535 mtype
= PP_2_MTYPE(pp
);
1536 ASSERT(mtype
== MTYPE_NORELOC
);
1539 * Get new list for page.
1541 if (PP_ISAGED(pp
)) {
1542 ppp
= &PAGE_FREELISTS(mnode
, 0, bin
, mtype
);
1544 ppp
= &PAGE_CACHELISTS(mnode
, bin
, mtype
);
1548 * Insert page on new list.
1552 pp
->p_next
= pp
->p_prev
= pp
;
1555 pp
->p_prev
= (*ppp
)->p_prev
;
1556 (*ppp
)->p_prev
= pp
;
1557 pp
->p_prev
->p_next
= pp
;
1561 * Increment page counters
1563 page_ctr_add_internal(mnode
, mtype
, pp
, flags
);
1566 * Update cage freemem counter
1568 atomic_inc_ulong(&kcage_freemem
);
/*
 * Stub variant of page_list_noreloc_startup(): it never uses pp and
 * panics unconditionally.  The panic text states that the routine is
 * only expected to run on sparc.
 * NOTE(review): the preprocessor conditional that selects between this
 * stub and the full implementation is not visible in this extract.
 */
1574 page_list_noreloc_startup(page_t
*pp
)
1576 panic("page_list_noreloc_startup: should be here only for sparc");
/*
 * page_list_add_pages(pp, flags): put a large page on the freelist.
 *
 *   pp    - root page of the large page; checked with CHK_LPG() against
 *           its own p_szc.
 *   flags - asserted to contain neither PG_CACHE_LIST nor PG_LIST_TAIL,
 *           so the page always goes to the head of a free list.
 *
 * With PG_LIST_ISINIT the page is asserted to have the largest size code
 * (mmu_page_sizes - 1) and to not be PP_ISNORELOC(); it is linked with
 * page_lpadd() and accounted with PLCNT_INCR().  Otherwise p_szc is
 * asserted to be non-zero and below mmu_page_sizes, the mutex from
 * PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST) is used, and the page is linked
 * with page_lpadd() and accounted with page_ctr_add().
 *
 * The visible tail calls kcage_freemem_add(pgcnt) for no-reloc pages
 * under __sparc and then page_unlock_nocapture() on each of the pgcnt
 * constituent pages.
 * NOTE(review): whether that tail belongs only to the non-init branch
 * cannot be determined from this extract -- confirm.
 */
1581 page_list_add_pages(page_t
*pp
, int flags
)
1585 uint_t bin
, mtype
, i
;
1588 /* default to freelist/head */
1589 ASSERT((flags
& (PG_CACHE_LIST
| PG_LIST_TAIL
)) == 0);
1591 CHK_LPG(pp
, pp
->p_szc
);
1592 VM_STAT_ADD(vmm_vmstats
.pladd_free
[pp
->p_szc
]);
1595 mnode
= PP_2_MEM_NODE(pp
);
1596 mtype
= PP_2_MTYPE(pp
);
1598 if (flags
& PG_LIST_ISINIT
) {
1599 ASSERT(pp
->p_szc
== mmu_page_sizes
- 1);
1600 page_lpadd(&PAGE_FREELISTS(mnode
, pp
->p_szc
, bin
, mtype
), pp
);
1601 ASSERT(!PP_ISNORELOC(pp
));
1602 PLCNT_INCR(pp
, mnode
, mtype
, pp
->p_szc
, flags
);
1605 ASSERT(pp
->p_szc
!= 0 && pp
->p_szc
< mmu_page_sizes
);
1607 pcm
= PC_BIN_MUTEX(mnode
, bin
, PG_FREE_LIST
);
1610 page_lpadd(&PAGE_FREELISTS(mnode
, pp
->p_szc
, bin
, mtype
), pp
);
1611 page_ctr_add(mnode
, mtype
, pp
, PG_FREE_LIST
);
1614 pgcnt
= page_get_pagecnt(pp
->p_szc
);
1615 #if defined(__sparc)
1616 if (PP_ISNORELOC(pp
))
1617 kcage_freemem_add(pgcnt
);
1619 for (i
= 0; i
< pgcnt
; i
++, pp
++)
1620 page_unlock_nocapture(pp
);
1625 * During boot, need to demote a large page to base
1626 * pagesize pages for seg_kmem for use in boot_alloc()
/*
 * page_boot_demote(pp): break the free large page containing pp down to
 * size code 0.
 *
 *   pp - a constituent page of a large page; asserted on entry to have
 *        p_szc != 0 and to be free and aged.
 *
 * On exit pp is asserted to still be free and aged, with p_szc == 0.
 */
1629 page_boot_demote(page_t
*pp
)
1631 ASSERT(pp
->p_szc
!= 0);
1632 ASSERT(PP_ISFREE(pp
));
1633 ASSERT(PP_ISAGED(pp
));
/*
 * Demote starting at the base pfn of the large page: pfnmax is 0, the
 * current size code is pp->p_szc, the new size code is 0 and no color is
 * requested.  The return value is discarded.  The final argument line is
 * not visible in this extract.
 */
1635 (void) page_demote(PP_2_MEM_NODE(pp
),
1636 PFN_BASE(pp
->p_pagenum
, pp
->p_szc
), 0, pp
->p_szc
, 0, PC_NO_COLOR
,
1639 ASSERT(PP_ISFREE(pp
));
1640 ASSERT(PP_ISAGED(pp
));
1641 ASSERT(pp
->p_szc
== 0);
1645 * Take a particular page off of whatever freelist the page
1646 * is claimed to be on.
1648 * NOTE: Only used for PAGESIZE pages.
/*
 * page_list_sub(pp, flags): summary of what the visible body shows.
 *
 *   pp    - page to remove; asserted to be held PAGE_EXCL and to be free.
 *   flags - PG_FREE_LIST selects PAGE_FREELISTS (page asserted aged);
 *           the other branch selects PAGE_CACHELISTS (page asserted not
 *           aged).  The flags also pick the mutex via PC_BIN_MUTEX().
 *
 * After the bin mutex is chosen, PP_2_BIN(pp) is compared with the bin
 * that was used, to detect a size code change that raced with the lock.
 * When p_szc is 0 the counters are adjusted with page_ctr_sub() and,
 * under __sparc, kcage_freemem_sub(1) is called for no-reloc pages.
 * A large page with PG_CACHE_LIST set causes a panic.  Otherwise the
 * whole freelist of the node is locked with page_freelist_lock(), the
 * large page is demoted to size code 0 with page_demote() if p_szc is
 * still non-zero, and pp is then removed and accounted the same way.
 *
 * NOTE(review): the statements that compute bin, enter and exit the
 * mutex, and unlink pp are not visible in this extract -- confirm.
 */
1651 page_list_sub(page_t
*pp
, int flags
)
1659 ASSERT(PAGE_EXCL(pp
));
1660 ASSERT(PP_ISFREE(pp
));
1663 * The p_szc field can only be changed by page_promote()
1664 * and page_demote(). Only free pages can be promoted and
1665 * demoted and the free list MUST be locked during these
1666 * operations. So to prevent a race in page_list_sub()
1667 * between computing which bin of the freelist lock to
1668 * grab and actually grabbing the lock we check again that
1669 * the bin we locked is still the correct one. Notice that
1670 * the p_szc field could have actually changed on us but
1671 * if the bin happens to still be the same we are safe.
1675 mnode
= PP_2_MEM_NODE(pp
);
1676 pcm
= PC_BIN_MUTEX(mnode
, bin
, flags
);
1678 if (PP_2_BIN(pp
) != bin
) {
1682 mtype
= PP_2_MTYPE(pp
);
1684 if (flags
& PG_FREE_LIST
) {
1685 VM_STAT_ADD(vmm_vmstats
.plsub_free
[0]);
1686 ASSERT(PP_ISAGED(pp
));
1687 ppp
= &PAGE_FREELISTS(mnode
, pp
->p_szc
, bin
, mtype
);
1689 VM_STAT_ADD(vmm_vmstats
.plsub_cache
);
1690 ASSERT(!PP_ISAGED(pp
));
1691 ppp
= &PAGE_CACHELISTS(mnode
, bin
, mtype
);
1695 * Common PAGESIZE case.
1697 * Note that we locked the freelist. This prevents
1698 * any page promotion/demotion operations. Therefore
1699 * the p_szc will not change until we drop pcm mutex.
1701 if (pp
->p_szc
== 0) {
1704 * Subtract counters before releasing pcm mutex
1705 * to avoid race with page_freelist_coalesce.
1707 page_ctr_sub(mnode
, mtype
, pp
, flags
);
1710 #if defined(__sparc)
1711 if (PP_ISNORELOC(pp
)) {
1712 kcage_freemem_sub(1);
1719 * Large pages on the cache list are not supported.
1721 if (flags
& PG_CACHE_LIST
)
1722 panic("page_list_sub: large page on cachelist");
1727 * Somebody wants this particular page which is part
1728 * of a large page. In this case we just demote the page
1729 * if it's on the freelist.
1731 * We have to drop pcm before locking the entire freelist.
1732 * Once we have re-locked the freelist check to make sure
1733 * the page hasn't already been demoted or completely
1737 page_freelist_lock(mnode
);
1738 if (pp
->p_szc
!= 0) {
1740 * Large page is on freelist.
1742 (void) page_demote(mnode
, PFN_BASE(pp
->p_pagenum
, pp
->p_szc
),
1743 0, pp
->p_szc
, 0, PC_NO_COLOR
, PC_FREE
);
1745 ASSERT(PP_ISFREE(pp
));
1746 ASSERT(PP_ISAGED(pp
));
1747 ASSERT(pp
->p_szc
== 0);
1750 * Subtract counters before releasing pcm mutex
1751 * to avoid race with page_freelist_coalesce.
1754 mtype
= PP_2_MTYPE(pp
);
1755 ppp
= &PAGE_FREELISTS(mnode
, pp
->p_szc
, bin
, mtype
);
1758 page_ctr_sub(mnode
, mtype
, pp
, flags
);
1759 page_freelist_unlock(mnode
);
1761 #if defined(__sparc)
1762 if (PP_ISNORELOC(pp
)) {
1763 kcage_freemem_sub(1);
/*
 * page_list_sub_pages(pp, szc): take a page of at most size code szc off
 * the freelist.
 *
 *   pp  - page to remove; asserted to be held PAGE_EXCL, free and aged.
 *   szc - largest size code the caller accepts.
 *
 * The bin mutex comes from PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST) and
 * PP_2_BIN(pp) is re-checked against the bin that was used.  When
 * pp->p_szc exceeds szc the whole freelist is locked and, if the page is
 * still too large, it is demoted to szc with page_demote().  Afterwards
 * pp is asserted to be free, aged, no larger than szc, and equal to
 * PP_PAGEROOT(pp).  Removal uses page_lpsub() for a non-zero size code
 * and page_sub() for size code 0, followed by page_ctr_sub().  Under
 * __sparc, kcage_freemem_sub() is called with the page count for
 * no-reloc pages.
 *
 * NOTE(review): the mutex enter/exit calls and the branch that unlocks
 * the bin mutex instead of the whole freelist are not visible in this
 * extract -- confirm against the full file.
 */
1769 page_list_sub_pages(page_t
*pp
, uint_t szc
)
1775 ASSERT(PAGE_EXCL(pp
));
1776 ASSERT(PP_ISFREE(pp
));
1777 ASSERT(PP_ISAGED(pp
));
1780 * See comment in page_list_sub().
1784 mnode
= PP_2_MEM_NODE(pp
);
1785 pcm
= PC_BIN_MUTEX(mnode
, bin
, PG_FREE_LIST
);
1787 if (PP_2_BIN(pp
) != bin
) {
1793 * If we're called with a page larger than szc or it got
1794 * promoted above szc before we locked the freelist then
1795 * drop pcm and re-lock entire freelist. If page still larger
1796 * than szc then demote it.
1798 if (pp
->p_szc
> szc
) {
1801 page_freelist_lock(mnode
);
1802 if (pp
->p_szc
> szc
) {
1803 VM_STAT_ADD(vmm_vmstats
.plsubpages_szcbig
);
1804 (void) page_demote(mnode
,
1805 PFN_BASE(pp
->p_pagenum
, pp
->p_szc
), 0,
1806 pp
->p_szc
, szc
, PC_NO_COLOR
, PC_FREE
);
1810 ASSERT(PP_ISFREE(pp
));
1811 ASSERT(PP_ISAGED(pp
));
1812 ASSERT(pp
->p_szc
<= szc
);
1813 ASSERT(pp
== PP_PAGEROOT(pp
));
1815 VM_STAT_ADD(vmm_vmstats
.plsub_free
[pp
->p_szc
]);
1817 mtype
= PP_2_MTYPE(pp
);
1818 if (pp
->p_szc
!= 0) {
1819 page_lpsub(&PAGE_FREELISTS(mnode
, pp
->p_szc
, bin
, mtype
), pp
);
1820 CHK_LPG(pp
, pp
->p_szc
);
1822 VM_STAT_ADD(vmm_vmstats
.plsubpages_szc0
);
1823 page_sub(&PAGE_FREELISTS(mnode
, pp
->p_szc
, bin
, mtype
), pp
);
1825 page_ctr_sub(mnode
, mtype
, pp
, PG_FREE_LIST
);
1830 page_freelist_unlock(mnode
);
1833 #if defined(__sparc)
1834 if (PP_ISNORELOC(pp
)) {
1837 pgcnt
= page_get_pagecnt(pp
->p_szc
);
1838 kcage_freemem_sub(pgcnt
);
1844 * Add the page to the front of a linked list of pages
1845 * using the p_next & p_prev pointers for the list.
1846 * The caller is responsible for protecting the list pointers.
/*
 * mach_page_add(ppp, pp): link pp into the circular list headed by *ppp.
 *
 *   ppp - address of the list head pointer.
 *   pp  - page to insert.
 *
 * Only the linkage statements are visible in this extract.  The branch
 * conditions, the assignment of pp->p_next in the non-empty case and the
 * final update of *ppp are missing, so the notes below describe the
 * visible statements only.
 */
1849 mach_page_add(page_t
**ppp
, page_t
*pp
)
/* Single-element case: pp becomes a ring that points at itself. */
1852 pp
->p_next
= pp
->p_prev
= pp
;
/*
 * Splice case: pp takes over the old tail as its predecessor, the head
 * points back at pp, and the old tail points forward at pp.
 */
1855 pp
->p_prev
= (*ppp
)->p_prev
;
1856 (*ppp
)->p_prev
= pp
;
1857 pp
->p_prev
->p_next
= pp
;
1863 * Remove this page from a linked list of pages
1864 * using the p_next & p_prev pointers for the list.
1866 * The caller is responsible for protecting the list pointers.
/*
 * mach_page_sub(ppp, pp): unlink pp from the circular list headed by
 * *ppp and leave pp as a self-linked list of one.
 *
 *   ppp - address of the list head pointer; panics if *ppp is NULL.
 *   pp  - page to remove; panics if NULL, asserted to be free.
 *
 * NOTE(review): the PP_ISFREE() assertion uses pp before the NULL check
 * below, so on builds where ASSERT is active a NULL pp would presumably
 * fault before the panic is reached -- confirm.
 * NOTE(review): the conditions guarding the two head updates are not
 * visible in this extract.
 */
1869 mach_page_sub(page_t
**ppp
, page_t
*pp
)
1871 ASSERT(PP_ISFREE(pp
));
1873 if (*ppp
== NULL
|| pp
== NULL
)
1874 panic("mach_page_sub");
1877 *ppp
= pp
->p_next
; /* go to next page */
1880 *ppp
= NULL
; /* page list is gone */
/* Bridge the neighbours of pp over it. */
1882 pp
->p_prev
->p_next
= pp
->p_next
;
1883 pp
->p_next
->p_prev
= pp
->p_prev
;
1885 pp
->p_prev
= pp
->p_next
= pp
; /* make pp a list of one */
1889 * Routine fsflush uses to gradually coalesce the free list into larger pages.
/*
 * page_promote_size(pp, cur_szc): try to promote the region that starts
 * at pp to the next larger size code.
 *
 *   pp      - page whose pfn names the candidate region.
 *   cur_szc - current size code; the target is cur_szc + 1.
 *
 * The whole freelist of the memory node is locked around the check and
 * the promotion.  Promotion is attempted only when the page counter for
 * the target region equals FULL_REGION_CNT(new_szc); the result of
 * page_promote() is discarded and PC_FREE leaves the new large page on
 * the freelist.
 *
 * NOTE(review): pfn is handed to page_promote() unchanged, and
 * page_promote() asserts that its pfnum is aligned to the page count of
 * new_szc, so callers presumably pass a suitably aligned pp -- confirm.
 */
1892 page_promote_size(page_t
*pp
, uint_t cur_szc
)
1897 int new_szc
= cur_szc
+ 1;
1898 int full
= FULL_REGION_CNT(new_szc
);
1900 pfn
= page_pptonum(pp
);
1901 mnode
= PFN_2_MEM_NODE(pfn
);
1903 page_freelist_lock(mnode
);
/* Re-read the counter under the freelist lock before promoting. */
1905 idx
= PNUM_TO_IDX(mnode
, new_szc
, pfn
);
1906 if (PAGE_COUNTERS(mnode
, new_szc
, idx
) == full
)
1907 (void) page_promote(mnode
, pfn
, new_szc
, PC_FREE
, PC_MTYPE_ANY
);
1909 page_freelist_unlock(mnode
);
/*
 * Failure counters for page_promote().
 * page_promote_noreloc_err is incremented when the constituent pages of
 * a candidate region disagree on PP_ISNORELOC().
 * NOTE(review): no increment of page_promote_err is visible in this
 * extract; presumably it counts the undo path taken when a cachelist
 * page cannot be locked -- confirm.
 */
1912 static uint_t page_promote_err
;
1913 static uint_t page_promote_noreloc_err
;
1916 * Create a single larger page (of szc new_szc) from smaller contiguous pages
1917 * for the given mnode starting at pfnum. Pages involved are on the freelist
1918 * before the call and may be returned to the caller if requested, otherwise
1919 * they will be placed back on the freelist.
1920 * If flags is PC_ALLOC, then the large page will be returned to the user in
1921 * a state which is consistent with a page being taken off the freelist. If
1922 * we failed to lock the new large page, then we will return NULL to the
1923 * caller and put the large page on the freelist instead.
1924 * If flags is PC_FREE, then the large page will be placed on the freelist,
1925 * and NULL will be returned.
1926 * The caller is responsible for locking the freelist as well as any other
1927 * accounting which needs to be done for a returned page.
1929 * RFE: For performance pass in pp instead of pfnum so
1930 * we can avoid excessive calls to page_numtopp_nolock().
1931 * This would depend on an assumption that all contiguous
1932 * pages are in the same memseg so we can just add/dec
1937 * There is a potential but rare deadlock situation
1938 * for page promotion and demotion operations. The problem
1939 * is there are two paths into the freelist manager and
1940 * they have different lock orders:
1947 * caller drops page_lock
1949 * page_free() and page_reclaim()
1950 * caller grabs page_lock(EXCL)
1956 * What prevents a thread in page_create() from deadlocking
1957 * with a thread freeing or reclaiming the same page is the
1958 * page_trylock() in page_get_freelist(). If the trylock fails
1959 * it skips the page.
1961 * The lock ordering for promotion and demotion is the same as
1962 * for page_create(). Since the same deadlock could occur during
1963 * page promotion and freeing or reclaiming of a page on the
1964 * cache list we might have to fail the operation and undo what
1965 * we have done so far. Again this is rare.
/*
 * Parameter summary for page_promote(), as far as the visible body shows:
 *
 *   mnode   - memory node; asserted equal to PP_2_MEM_NODE() of each
 *             constituent page while it is taken off its list.
 *   pfnum   - first pfn of the candidate region; looked up with
 *             page_numtopp_nolock() and asserted to be aligned to
 *             page_get_pagecnt(new_szc).
 *   new_szc - size code stored into p_szc of the constituent pages.
 *   flags   - with PC_ALLOC the new large page is handed back when
 *             page_trylock_cons() succeeds; otherwise it is linked onto
 *             PAGE_FREELISTS with page_lpadd() and accounted with
 *             page_ctr_add().
 *   mtype   - unless it is PC_MTYPE_ANY it is compared with
 *             PP_2_MTYPE() of the first page; a mismatch presumably
 *             fails the call (the statement after the test is not
 *             visible -- TODO confirm).  The parameter is overwritten
 *             inside the coalescing loop.
 *
 * Aged pages are removed from the free list.  Pages from the cache list
 * are locked with page_trylock(), their object is locked with
 * vmobject_trylock(), and page_hashout() drops the vnode association.
 * page_promote_noreloc_err is incremented when the constituent pages
 * disagree on PP_ISNORELOC().
 */
1968 page_promote(int mnode
, pfn_t pfnum
, uchar_t new_szc
, int flags
, int mtype
)
1970 page_t
*pp
, *pplist
, *tpp
, *start_pp
;
1971 pgcnt_t new_npgs
, npgs
;
1973 pgcnt_t tmpnpgs
, pages_left
;
1980 * General algorithm:
1981 * Find the starting page
1982 * Walk each page struct removing it from the freelist,
1983 * and linking it to all the other pages removed.
1984 * Once all pages are off the freelist,
1985 * walk the list, modifying p_szc to new_szc and whatever
1986 * other state needs to be updated to create a large free page.
1987 * According to the flags, either return the page or put it
1991 start_pp
= page_numtopp_nolock(pfnum
);
1992 ASSERT(start_pp
&& (start_pp
->p_pagenum
== pfnum
));
1993 new_npgs
= page_get_pagecnt(new_szc
);
1994 ASSERT(IS_P2ALIGNED(pfnum
, new_npgs
));
1996 /* don't return page of the wrong mtype */
1997 if (mtype
!= PC_MTYPE_ANY
&& mtype
!= PP_2_MTYPE(start_pp
))
2001 * Loop through smaller pages to confirm that all pages
2002 * give the same result for PP_ISNORELOC().
2003 * We can check this reliably here as the protocol for setting
2004 * P_NORELOC requires pages to be taken off the free list first.
2006 noreloc
= PP_ISNORELOC(start_pp
);
2007 for (pp
= start_pp
+ new_npgs
; --pp
> start_pp
; ) {
2008 if (noreloc
!= PP_ISNORELOC(pp
)) {
2009 page_promote_noreloc_err
++;
2015 pages_left
= new_npgs
;
2019 /* Loop around coalescing the smaller pages into a big page. */
2020 while (pages_left
) {
2022 * Remove from the freelist.
2024 ASSERT(PP_ISFREE(pp
));
2026 ASSERT(mnode
== PP_2_MEM_NODE(pp
));
2027 mtype
= PP_2_MTYPE(pp
);
2028 if (PP_ISAGED(pp
)) {
2034 page_lpsub(&PAGE_FREELISTS(mnode
,
2035 pp
->p_szc
, bin
, mtype
), pp
);
2037 mach_page_sub(&PAGE_FREELISTS(mnode
, 0,
2040 which_list
= PG_FREE_LIST
;
2042 struct vmobject
*obj
;
2044 ASSERT(pp
->p_szc
== 0);
2049 * Since this page comes from the
2050 * cachelist, we must destroy the
2051 * vnode association.
2053 if (!page_trylock(pp
, SE_EXCL
)) {
2057 obj
= &pp
->p_vnode
->v_object
;
2060 * We need to be careful not to deadlock
2061 * with another thread in page_lookup().
2062 * The page_lookup() thread could be holding
2063 * the same phm that we need if the two
2064 * pages happen to hash to the same phm lock.
2065 * At this point we have locked the entire
2066 * freelist and page_lookup() could be trying
2067 * to grab a freelist lock.
2069 if (!vmobject_trylock(obj
)) {
2070 page_unlock_nocapture(pp
);
2074 mach_page_sub(&PAGE_CACHELISTS(mnode
, bin
, mtype
), pp
);
2075 page_hashout(pp
, true);
2076 vmobject_unlock(obj
);
2078 page_unlock_nocapture(pp
);
2079 which_list
= PG_CACHE_LIST
;
2081 page_ctr_sub(mnode
, mtype
, pp
, which_list
);
2084 * Concatenate the smaller page(s) onto
2085 * the large page list.
2087 tmpnpgs
= npgs
= page_get_pagecnt(pp
->p_szc
);
2091 tpp
->p_szc
= new_szc
;
2094 page_list_concat(&pplist
, &pp
);
2097 CHK_LPG(pplist
, new_szc
);
2100 * return the page to the user if requested
2101 * in the properly locked state.
2103 if (flags
== PC_ALLOC
&& (page_trylock_cons(pplist
, SE_EXCL
))) {
2108 * Otherwise place the new large page on the freelist
2110 bin
= PP_2_BIN(pplist
);
2111 mnode
= PP_2_MEM_NODE(pplist
);
2112 mtype
= PP_2_MTYPE(pplist
);
2113 page_lpadd(&PAGE_FREELISTS(mnode
, new_szc
, bin
, mtype
), pplist
);
2115 page_ctr_add(mnode
, mtype
, pplist
, PG_FREE_LIST
);
2120 * A thread must have still been freeing or
2121 * reclaiming the page on the cachelist.
2122 * To prevent a deadlock undo what we have
2123 * done so far and return failure. This
2124 * situation can only happen while promoting
2130 mach_page_sub(&pplist
, pp
);
2133 mtype
= PP_2_MTYPE(pp
);
2134 mach_page_add(&PAGE_FREELISTS(mnode
, 0, bin
, mtype
), pp
);
2135 page_ctr_add(mnode
, mtype
, pp
, PG_FREE_LIST
);
2142 * Break up a large page into smaller size pages.
2143 * Pages involved are on the freelist before the call and may
2144 * be returned to the caller if requested, otherwise they will
2145 * be placed back on the freelist.
2146 * The caller is responsible for locking the freelist as well as any other
2147 * accounting which needs to be done for a returned page.
2148 * If flags is not PC_ALLOC, the color argument is ignored, and thus
2149 * technically, any value may be passed in but PC_NO_COLOR is the standard
2150 * which should be followed for clarity's sake.
2151 * Returns a page whose pfn is < pfnmax
/*
 * Parameter summary for page_demote(), as far as the visible body shows:
 *
 *   mnode   - memory node; asserted equal to PP_2_MEM_NODE(pplist).
 *   pfnum   - pfn looked up with page_numtopp_nolock() to find the large
 *             page; the result is asserted non-NULL with
 *             p_szc == cur_szc.
 *   pfnmax  - 0 means no upper bound; otherwise only pieces with
 *             p_pagenum < pfnmax are candidates for being returned.
 *   cur_szc - current size code; asserted non-zero.
 *   new_szc - target size code; asserted < cur_szc.
 *   color   - compared with the bin of each piece when flags is
 *             PC_ALLOC.
 *   flags   - with PC_ALLOC, at most one matching piece that can be
 *             locked with page_trylock_cons() is kept in ret_pp;
 *             presumably every other piece goes back onto
 *             PAGE_FREELISTS and is accounted with page_ctr_add().
 *
 * The large page is first unlinked with page_lpsub() and its counters
 * are dropped with page_ctr_sub().  When new_szc maps to a single base
 * page the pieces are handled one page at a time; otherwise the list is
 * cut into npgs-sized runs with page_list_break().  ret_pp starts out
 * NULL.
 */
2154 page_demote(int mnode
, pfn_t pfnum
, pfn_t pfnmax
, uchar_t cur_szc
,
2155 uchar_t new_szc
, int color
, int flags
)
2157 page_t
*pp
, *pplist
, *npplist
;
2161 page_t
*ret_pp
= NULL
;
2163 ASSERT(cur_szc
!= 0);
2164 ASSERT(new_szc
< cur_szc
);
2166 pplist
= page_numtopp_nolock(pfnum
);
2167 ASSERT(pplist
!= NULL
);
2169 ASSERT(pplist
->p_szc
== cur_szc
);
2171 bin
= PP_2_BIN(pplist
);
2172 ASSERT(mnode
== PP_2_MEM_NODE(pplist
));
2173 mtype
= PP_2_MTYPE(pplist
);
2174 page_lpsub(&PAGE_FREELISTS(mnode
, cur_szc
, bin
, mtype
), pplist
);
2176 CHK_LPG(pplist
, cur_szc
);
2177 page_ctr_sub(mnode
, mtype
, pplist
, PG_FREE_LIST
);
2180 * Number of PAGESIZE pages for smaller new_szc
2183 npgs
= page_get_pagecnt(new_szc
);
2188 ASSERT(pp
->p_szc
== cur_szc
);
2191 * We either break it up into PAGESIZE pages or larger.
2193 if (npgs
== 1) { /* PAGESIZE case */
2194 mach_page_sub(&pplist
, pp
);
2195 ASSERT(pp
->p_szc
== cur_szc
);
2196 ASSERT(new_szc
== 0);
2197 ASSERT(mnode
== PP_2_MEM_NODE(pp
));
2198 pp
->p_szc
= new_szc
;
2200 if ((bin
== color
) && (flags
== PC_ALLOC
) &&
2201 (ret_pp
== NULL
) && (pfnmax
== 0 ||
2202 pp
->p_pagenum
< pfnmax
) &&
2203 page_trylock_cons(pp
, SE_EXCL
)) {
2206 mtype
= PP_2_MTYPE(pp
);
2207 mach_page_add(&PAGE_FREELISTS(mnode
, 0, bin
,
2209 page_ctr_add(mnode
, mtype
, pp
, PG_FREE_LIST
);
2212 page_t
*try_to_return_this_page
= NULL
;
2216 * Break down into smaller lists of pages.
2218 page_list_break(&pplist
, &npplist
, npgs
);
2223 ASSERT(pp
->p_szc
== cur_szc
);
2225 * Check whether all the pages in this list
2226 * fit the request criteria.
2228 if (pfnmax
== 0 || pp
->p_pagenum
< pfnmax
) {
2231 pp
->p_szc
= new_szc
;
2235 if (count
== npgs
&&
2236 (pfnmax
== 0 || pp
->p_pagenum
< pfnmax
)) {
2237 try_to_return_this_page
= pp
;
2240 CHK_LPG(pplist
, new_szc
);
2242 bin
= PP_2_BIN(pplist
);
2243 if (try_to_return_this_page
)
2245 PP_2_MEM_NODE(try_to_return_this_page
));
2246 if ((bin
== color
) && (flags
== PC_ALLOC
) &&
2247 (ret_pp
== NULL
) && try_to_return_this_page
&&
2248 page_trylock_cons(try_to_return_this_page
,
2250 ret_pp
= try_to_return_this_page
;
2252 mtype
= PP_2_MTYPE(pp
);
2253 page_lpadd(&PAGE_FREELISTS(mnode
, new_szc
,
2254 bin
, mtype
), pplist
);
2256 page_ctr_add(mnode
, mtype
, pplist
,
/*
 * Tunable checked on entry to page_freelist_coalesce() and
 * page_freelist_coalesce_all().  Zero by default.
 * NOTE(review): the statements executed when it is non-zero are only
 * partly visible in this extract; presumably both routines return early
 * without coalescing -- confirm.
 */
2265 int mpss_coalesce_disable
= 0;
2268 * Coalesce free pages into a page of the given szc and color if possible.
2269 * Return the pointer to the page created, otherwise, return NULL.
2271 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
/*
 * Parameter summary for page_freelist_coalesce(), as far as the visible
 * body shows:
 *
 *   mnode    - memory node; page_ctrs_rwlock[mnode] is held as reader
 *              for the duration of the search.
 *   szc      - requested size code; asserted < mmu_page_sizes and used
 *              as the region index r.
 *   color    - requested color; asserted to be no larger than
 *              PAGE_GET_PAGECOLORS(szc) - 1.
 *   ceq_mask - color equivalence mask, asserted to be no larger than the
 *              same bound.  A pfn is skipped when
 *              (PFN_2_COLOR(pfn) ^ color) & ceq_mask is non-zero.
 *   mtype    - mapped to a range index with MTYPE_2_MRANGE() and passed
 *              on to page_promote().
 *   pfnhi    - compared against PFNNULL and the upper pfn of the mtype
 *              range.  NOTE(review): the existing header says non-zero
 *              while the code tests against PFNNULL -- confirm that
 *              PFNNULL is 0.
 *
 * Candidate regions are those whose PAGE_COUNTERS() entry equals
 * FULL_REGION_CNT(r).  The counter is read once without the freelist
 * lock and again after page_freelist_lock() before page_promote() is
 * called with PC_ALLOC.  On success the search position is stored in
 * PAGE_COUNTERS_CURRENT_COLOR(), both locks are dropped and, under
 * __sparc, kcage_freemem_sub() is called for no-reloc pages.
 */
2274 page_freelist_coalesce(int mnode
, uchar_t szc
, uint_t color
, uint_t ceq_mask
,
2275 int mtype
, pfn_t pfnhi
)
2277 int r
= szc
; /* region size */
2279 uint_t full
, bin
, color_mask
, wrap
= 0;
2280 pfn_t pfnum
, lo
, hi
;
2281 size_t len
, idx
, idx0
;
2282 pgcnt_t cands
= 0, szcpgcnt
= page_get_pagecnt(szc
);
2284 MEM_NODE_ITERATOR_DECL(it
);
2285 #if defined(__sparc)
2286 pfn_t pfnum0
, nlo
, nhi
;
2289 if (mpss_coalesce_disable
) {
2290 ASSERT(szc
< MMU_PAGE_SIZES
);
2291 VM_STAT_ADD(vmm_vmstats
.page_ctrs_coalesce
[szc
][0]);
2295 ASSERT(szc
< mmu_page_sizes
);
2296 color_mask
= PAGE_GET_PAGECOLORS(szc
) - 1;
2297 ASSERT(ceq_mask
<= color_mask
);
2298 ASSERT(color
<= color_mask
);
2301 /* Prevent page_counters dynamic memory from being freed */
2302 rw_enter(&page_ctrs_rwlock
[mnode
], RW_READER
);
2304 mrange
= MTYPE_2_MRANGE(mnode
, mtype
);
2305 ASSERT(mrange
< mnode_nranges
[mnode
]);
2306 VM_STAT_ADD(vmm_vmstats
.page_ctrs_coalesce
[r
][mrange
]);
2308 /* get pfn range for mtype */
2309 len
= PAGE_COUNTERS_ENTRIES(mnode
, r
);
2310 MNODETYPE_2_PFN(mnode
, mtype
, lo
, hi
);
2313 /* use lower limit if given */
2314 if (pfnhi
!= PFNNULL
&& pfnhi
< hi
)
2317 /* round to szcpgcnt boundaries */
2318 lo
= P2ROUNDUP(lo
, szcpgcnt
);
2319 MEM_NODE_ITERATOR_INIT(lo
, mnode
, szc
, &it
);
2320 if (lo
== (pfn_t
)-1) {
2321 rw_exit(&page_ctrs_rwlock
[mnode
]);
2324 hi
= hi
& ~(szcpgcnt
- 1);
2326 /* set lo to the closest pfn of the right color */
2327 if (((PFN_2_COLOR(lo
, szc
, &it
) ^ color
) & ceq_mask
) ||
2328 (interleaved_mnodes
&& PFN_2_MEM_NODE(lo
) != mnode
)) {
2329 PAGE_NEXT_PFN_FOR_COLOR(lo
, szc
, color
, ceq_mask
, color_mask
,
2334 rw_exit(&page_ctrs_rwlock
[mnode
]);
2338 full
= FULL_REGION_CNT(r
);
2340 /* calculate the number of page candidates and initial search index */
2342 idx0
= (size_t)(-1);
2346 PGCTRS_CANDS_GETVALUECOLOR(mnode
, mrange
, r
, bin
, acand
);
2348 idx
= PAGE_COUNTERS_CURRENT_COLOR(mnode
,
2350 idx0
= MIN(idx0
, idx
);
2353 bin
= ADD_MASKED(bin
, 1, ceq_mask
, color_mask
);
2354 } while (bin
!= color
);
2357 VM_STAT_ADD(vmm_vmstats
.page_ctrs_cands_skip
[r
][mrange
]);
2358 rw_exit(&page_ctrs_rwlock
[mnode
]);
2362 pfnum
= IDX_TO_PNUM(mnode
, r
, idx0
);
2363 if (pfnum
< lo
|| pfnum
>= hi
) {
2366 MEM_NODE_ITERATOR_INIT(pfnum
, mnode
, szc
, &it
);
2367 if (pfnum
== (pfn_t
)-1) {
2369 MEM_NODE_ITERATOR_INIT(pfnum
, mnode
, szc
, &it
);
2370 ASSERT(pfnum
!= (pfn_t
)-1);
2371 } else if ((PFN_2_COLOR(pfnum
, szc
, &it
) ^ color
) & ceq_mask
||
2372 (interleaved_mnodes
&& PFN_2_MEM_NODE(pfnum
) != mnode
)) {
2373 /* invalid color, get the closest correct pfn */
2374 PAGE_NEXT_PFN_FOR_COLOR(pfnum
, szc
, color
, ceq_mask
,
2378 MEM_NODE_ITERATOR_INIT(pfnum
, mnode
, szc
, &it
);
2383 /* set starting index */
2384 idx0
= PNUM_TO_IDX(mnode
, r
, pfnum
);
2387 #if defined(__sparc)
2388 pfnum0
= pfnum
; /* page corresponding to idx0 */
2389 nhi
= 0; /* search kcage ranges */
2392 for (idx
= idx0
; wrap
== 0 || (idx
< idx0
&& wrap
< 2); ) {
2394 #if defined(__sparc)
2396 * Find lowest intersection of kcage ranges and mnode.
2397 * MTYPE_NORELOC means look in the cage, otherwise outside.
2400 if (kcage_next_range(mtype
== MTYPE_NORELOC
, pfnum
,
2401 (wrap
== 0 ? hi
: pfnum0
), &nlo
, &nhi
))
2404 /* jump to the next page in the range */
2406 pfnum
= P2ROUNDUP(nlo
, szcpgcnt
);
2407 MEM_NODE_ITERATOR_INIT(pfnum
, mnode
, szc
, &it
);
2408 idx
= PNUM_TO_IDX(mnode
, r
, pfnum
);
2409 if (idx
>= len
|| pfnum
>= hi
)
2411 if ((PFN_2_COLOR(pfnum
, szc
, &it
) ^ color
) &
2414 if (interleaved_mnodes
&&
2415 PFN_2_MEM_NODE(pfnum
) != mnode
)
2421 if (PAGE_COUNTERS(mnode
, r
, idx
) != full
)
2425 * RFE: For performance maybe we can do something less
2426 * brutal than locking the entire freelist. So far
2427 * this doesn't seem to be a performance problem?
2429 page_freelist_lock(mnode
);
2430 if (PAGE_COUNTERS(mnode
, r
, idx
) == full
) {
2432 page_promote(mnode
, pfnum
, r
, PC_ALLOC
, mtype
);
2433 if (ret_pp
!= NULL
) {
2434 VM_STAT_ADD(vmm_vmstats
.pfc_coalok
[r
][mrange
]);
2435 PAGE_COUNTERS_CURRENT_COLOR(mnode
, r
,
2436 PFN_2_COLOR(pfnum
, szc
, &it
), mrange
) = idx
;
2437 page_freelist_unlock(mnode
);
2438 rw_exit(&page_ctrs_rwlock
[mnode
]);
2439 #if defined(__sparc)
2440 if (PP_ISNORELOC(ret_pp
)) {
2443 npgs
= page_get_pagecnt(ret_pp
->p_szc
);
2444 kcage_freemem_sub(npgs
);
2450 VM_STAT_ADD(vmm_vmstats
.page_ctrs_changed
[r
][mrange
]);
2453 page_freelist_unlock(mnode
);
2455 * No point looking for another page if we've
2456 * already tried all of the ones that
2457 * page_ctr_cands indicated. Stash off where we left
2459 * Note: this is not exact since we don't hold the
2460 * page_freelist_locks before we initially get the
2461 * value of cands for performance reasons, but should
2462 * be a decent approximation.
2465 PAGE_COUNTERS_CURRENT_COLOR(mnode
, r
, color
, mrange
) =
2470 PAGE_NEXT_PFN_FOR_COLOR(pfnum
, szc
, color
, ceq_mask
,
2472 idx
= PNUM_TO_IDX(mnode
, r
, pfnum
);
2473 if (idx
>= len
|| pfnum
>= hi
) {
2476 MEM_NODE_ITERATOR_INIT(pfnum
, mnode
, szc
, &it
);
2477 idx
= PNUM_TO_IDX(mnode
, r
, pfnum
);
2479 #if defined(__sparc)
2480 nhi
= 0; /* search kcage ranges */
2485 rw_exit(&page_ctrs_rwlock
[mnode
]);
2486 VM_STAT_ADD(vmm_vmstats
.page_ctrs_failed
[r
][mrange
]);
2491 * For the given mnode, promote as many small pages to large pages as possible.
2492 * mnode can be -1, which means do them all
/*
 * Parameter summary for page_freelist_coalesce_all(), as far as the
 * visible body shows:
 *
 *   mnode - memory node to coalesce.  All nodes from 0 up to
 *           max_mem_nodes - 1 are processed when mnode is negative or
 *           when interleaved_mnodes is set; otherwise only mnode itself.
 *           The parameter is reused as the loop variable.
 *
 * For every node in range, page_ctrs_rwlock is taken as reader and the
 * whole freelist is locked before any promotion starts.  Region sizes
 * are walked from mmu_page_sizes - 1 down to 1; each counter equal to
 * FULL_REGION_CNT(r) leads to page_promote() with PC_FREE and
 * PC_MTYPE_ANY, and the result is discarded.  With interleaved_mnodes
 * the target node is recomputed from the pfn with PFN_2_MEM_NODE() and
 * the inner node loop is left after one pass.  All locks are released at
 * the end.
 */
2495 page_freelist_coalesce_all(int mnode
)
2497 int r
; /* region size */
2500 int doall
= interleaved_mnodes
|| mnode
< 0;
2501 int mlo
= doall
? 0 : mnode
;
2502 int mhi
= doall
? max_mem_nodes
: (mnode
+ 1);
2504 VM_STAT_ADD(vmm_vmstats
.page_ctrs_coalesce_all
);
2506 if (mpss_coalesce_disable
) {
2511 * Lock the entire freelist and coalesce what we can.
2513 * Always promote to the largest page possible
2514 * first to reduce the number of page promotions.
2516 for (mnode
= mlo
; mnode
< mhi
; mnode
++) {
2517 rw_enter(&page_ctrs_rwlock
[mnode
], RW_READER
);
2518 page_freelist_lock(mnode
);
2520 for (r
= mmu_page_sizes
- 1; r
> 0; r
--) {
2521 for (mnode
= mlo
; mnode
< mhi
; mnode
++) {
2523 int mrange
, nranges
= mnode_nranges
[mnode
];
2525 for (mrange
= 0; mrange
< nranges
; mrange
++) {
2526 PGCTRS_CANDS_GETVALUE(mnode
, mrange
, r
, cands
);
2531 VM_STAT_ADD(vmm_vmstats
.
2532 page_ctrs_cands_skip_all
);
2536 full
= FULL_REGION_CNT(r
);
2537 len
= PAGE_COUNTERS_ENTRIES(mnode
, r
);
2539 for (idx
= 0; idx
< len
; idx
++) {
2540 if (PAGE_COUNTERS(mnode
, r
, idx
) == full
) {
2542 IDX_TO_PNUM(mnode
, r
, idx
);
2543 int tmnode
= interleaved_mnodes
?
2544 PFN_2_MEM_NODE(pfnum
) : mnode
;
2547 mem_node_config
[tmnode
].physbase
&&
2549 mem_node_config
[tmnode
].physmax
);
2551 (void) page_promote(tmnode
,
2552 pfnum
, r
, PC_FREE
, PC_MTYPE_ANY
);
2555 /* shared hpm_counters covers all mnodes, so we quit */
2556 if (interleaved_mnodes
)
2560 for (mnode
= mlo
; mnode
< mhi
; mnode
++) {
2561 page_freelist_unlock(mnode
);
2562 rw_exit(&page_ctrs_rwlock
[mnode
]);
2567 * This is where all policies for moving pages around
2568 * to different page size free lists are implemented.
2569 * Returns 1 on success, 0 on failure.
2571 * So far these are the priorities for this algorithm in descending
2574 * 1) When servicing a request try to do so with a free page
2575 * from next size up. Helps defer fragmentation as long
2578 * 2) Page coalesce on demand. Only when a freelist
2579 * larger than PAGESIZE is empty and step 1
2580 * will not work since all larger size lists are
2583 * If pfnhi is not PFNNULL, search for a large page whose pfn is below pfnhi; if pfnlo is not PFNNULL, its pfn must also be at least pfnlo.
2587 page_freelist_split(uchar_t szc
, uint_t color
, int mnode
, int mtype
,
2588 pfn_t pfnlo
, pfn_t pfnhi
, page_list_walker_t
*plw
)
2590 uchar_t nszc
= szc
+ 1;
2591 uint_t bin
, sbin
, bin_prev
;
2592 page_t
*pp
, *firstpp
;
2593 page_t
*ret_pp
= NULL
;
2596 if (nszc
== mmu_page_sizes
)
2599 ASSERT(nszc
< mmu_page_sizes
);
2600 color_mask
= PAGE_GET_PAGECOLORS(nszc
) - 1;
2601 bin
= sbin
= PAGE_GET_NSZ_COLOR(szc
, color
);
2602 bin_prev
= (plw
->plw_bin_split_prev
== color
) ? INVALID_COLOR
:
2603 PAGE_GET_NSZ_COLOR(szc
, plw
->plw_bin_split_prev
);
2605 VM_STAT_ADD(vmm_vmstats
.pfs_req
[szc
]);
2607 * First try to break up a larger page to fill current size freelist.
2609 while (plw
->plw_bins
[nszc
] != 0) {
2611 ASSERT(nszc
< mmu_page_sizes
);
2614 * If page found then demote it.
2616 if (PAGE_FREELISTS(mnode
, nszc
, bin
, mtype
)) {
2617 page_freelist_lock(mnode
);
2618 firstpp
= pp
= PAGE_FREELISTS(mnode
, nszc
, bin
, mtype
);
2621 * If pfnhi is not PFNNULL, look for large page below
2622 * pfnhi. PFNNULL signifies no pfn requirement.
2625 ((pfnhi
!= PFNNULL
&& pp
->p_pagenum
>= pfnhi
) ||
2626 (pfnlo
!= PFNNULL
&& pp
->p_pagenum
< pfnlo
))) {
2628 pp
= pp
->p_list
.largepg
.next
;
2629 if (pp
== firstpp
) {
2633 } while ((pfnhi
!= PFNNULL
&&
2634 pp
->p_pagenum
>= pfnhi
) ||
2635 (pfnlo
!= PFNNULL
&&
2636 pp
->p_pagenum
< pfnlo
));
2638 if (pfnhi
!= PFNNULL
&& pp
!= NULL
)
2639 ASSERT(pp
->p_pagenum
< pfnhi
);
2641 if (pfnlo
!= PFNNULL
&& pp
!= NULL
)
2642 ASSERT(pp
->p_pagenum
>= pfnlo
);
2645 uint_t ccolor
= page_correct_color(szc
, nszc
,
2646 color
, bin
, plw
->plw_ceq_mask
[szc
]);
2648 ASSERT(pp
->p_szc
== nszc
);
2649 VM_STAT_ADD(vmm_vmstats
.pfs_demote
[nszc
]);
2650 ret_pp
= page_demote(mnode
, pp
->p_pagenum
,
2651 pfnhi
, pp
->p_szc
, szc
, ccolor
, PC_ALLOC
);
2653 page_freelist_unlock(mnode
);
2654 #if defined(__sparc)
2655 if (PP_ISNORELOC(ret_pp
)) {
2658 npgs
= page_get_pagecnt(
2660 kcage_freemem_sub(npgs
);
2666 page_freelist_unlock(mnode
);
2669 /* loop through next size bins */
2670 bin
= ADD_MASKED(bin
, 1, plw
->plw_ceq_mask
[nszc
], color_mask
);
2671 plw
->plw_bins
[nszc
]--;
2674 uchar_t nnszc
= nszc
+ 1;
2676 /* we are done with this page size - check next */
2677 if (plw
->plw_bins
[nnszc
] == 0)
2678 /* we have already checked next size bins */
2681 bin
= sbin
= PAGE_GET_NSZ_COLOR(nszc
, bin
);
2682 if (bin_prev
!= INVALID_COLOR
) {
2683 bin_prev
= PAGE_GET_NSZ_COLOR(nszc
, bin_prev
);
2684 if (!((bin
^ bin_prev
) &
2685 plw
->plw_ceq_mask
[nnszc
]))
2688 ASSERT(nnszc
< mmu_page_sizes
);
2689 color_mask
= PAGE_GET_PAGECOLORS(nnszc
) - 1;
2691 ASSERT(nszc
< mmu_page_sizes
);
2699 * Helper routine used only by the freelist code to lock
2700 * a page. If the page is a large page then it succeeds in
2701 * locking all the constituent pages or none at all.
2702 * Returns 1 on success, 0 on failure.
2705 page_trylock_cons(page_t
*pp
, se_t se
)
2707 page_t
*tpp
, *first_pp
= pp
;
2710 * Fail if can't lock first or only page.
2712 if (!page_trylock(pp
, se
)) {
2717 * PAGESIZE: common case.
2719 if (pp
->p_szc
== 0) {
2728 if (!page_trylock(tpp
, se
)) {
2730 * On failure unlock what we have locked so far.
2731 * We want to avoid attempting to capture these
2732 * pages as the pcm mutex may be held which could
2733 * lead to a recursive mutex panic.
2735 while (first_pp
!= tpp
) {
2736 page_unlock_nocapture(first_pp
);
2737 first_pp
= first_pp
->p_next
;
2747 * init context for walking page lists
2748 * Called when a page of the given szc is unavailable. Sets markers
2749 * for the beginning of the search to detect when search has
2750 * completed a full cycle. Sets flags for splitting larger pages
2751 * and coalescing smaller pages. Page walking proceeds until a page
2752 * of the desired equivalent color is found.
2755 page_list_walk_init(uchar_t szc
, uint_t flags
, uint_t bin
, int can_split
,
2756 int use_ceq
, page_list_walker_t
*plw
)
2758 uint_t nszc
, ceq_mask
, colors
;
2759 uchar_t ceq
= use_ceq
? colorequivszc
[szc
] : 0;
2761 ASSERT(szc
< mmu_page_sizes
);
2762 colors
= PAGE_GET_PAGECOLORS(szc
);
2764 plw
->plw_colors
= colors
;
2765 plw
->plw_color_mask
= colors
- 1;
2766 plw
->plw_bin_marker
= plw
->plw_bin0
= bin
;
2767 plw
->plw_bin_split_prev
= bin
;
2768 plw
->plw_bin_step
= (szc
== 0) ? vac_colors
: 1;
2771 * if vac aliasing is possible make sure lower order color
2772 * bits are never ignored
2778 * calculate the number of non-equivalent colors and
2779 * color equivalency mask
2781 plw
->plw_ceq_dif
= colors
>> ((ceq
>> 4) + (ceq
& 0xf));
2782 ASSERT(szc
> 0 || plw
->plw_ceq_dif
>= vac_colors
);
2783 ASSERT(plw
->plw_ceq_dif
> 0);
2784 plw
->plw_ceq_mask
[szc
] = (plw
->plw_ceq_dif
- 1) << (ceq
& 0xf);
2786 if (flags
& PG_MATCH_COLOR
) {
2787 if (cpu_page_colors
< 0) {
2789 * this is a heterogeneous machine with different CPUs
2790 * having different size e$ (not supported for ni2/rock
2792 uint_t cpucolors
= CPUSETSIZE() >> PAGE_GET_SHIFT(szc
);
2793 cpucolors
= MAX(cpucolors
, 1);
2794 ceq_mask
= plw
->plw_color_mask
& (cpucolors
- 1);
2795 plw
->plw_ceq_mask
[szc
] =
2796 MIN(ceq_mask
, plw
->plw_ceq_mask
[szc
]);
2798 plw
->plw_ceq_dif
= 1;
2801 /* we can split pages in the freelist, but not the cachelist */
2803 plw
->plw_do_split
= (szc
+ 1 < mmu_page_sizes
) ? 1 : 0;
2805 /* set next szc color masks and number of free list bins */
2806 for (nszc
= szc
+ 1; nszc
< mmu_page_sizes
; nszc
++, szc
++) {
2807 plw
->plw_ceq_mask
[nszc
] = PAGE_GET_NSZ_MASK(szc
,
2808 plw
->plw_ceq_mask
[szc
]);
2809 plw
->plw_bins
[nszc
] = PAGE_GET_PAGECOLORS(nszc
);
2811 plw
->plw_ceq_mask
[nszc
] = INVALID_MASK
;
2812 plw
->plw_bins
[nszc
] = 0;
2816 plw
->plw_do_split
= 0;
2817 plw
->plw_bins
[1] = 0;
2818 plw
->plw_ceq_mask
[1] = INVALID_MASK
;
2823 * set mark to flag where next split should occur
2825 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \
2826 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \
2827 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \
2828 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \
2829 plw->plw_split_next = \
2830 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \
2831 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
2832 plw->plw_split_next = \
2833 INC_MASKED(plw->plw_split_next, \
2834 neq_mask, plw->plw_color_mask); \
2839 page_list_walk_next_bin(uchar_t szc
, uint_t bin
, page_list_walker_t
*plw
)
2841 uint_t neq_mask
= ~plw
->plw_ceq_mask
[szc
] & plw
->plw_color_mask
;
2842 uint_t bin0_nsz
, nbin_nsz
, nbin0
, nbin
;
2843 uchar_t nszc
= szc
+ 1;
2845 nbin
= ADD_MASKED(bin
,
2846 plw
->plw_bin_step
, neq_mask
, plw
->plw_color_mask
);
2848 if (plw
->plw_do_split
) {
2849 plw
->plw_bin_split_prev
= bin
;
2850 PAGE_SET_NEXT_SPLIT_MARKER(szc
, nszc
, bin
, plw
);
2851 plw
->plw_do_split
= 0;
2855 if (plw
->plw_count
!= 0 || plw
->plw_ceq_dif
== vac_colors
) {
2856 if (nbin
== plw
->plw_bin0
&&
2857 (vac_colors
== 1 || nbin
!= plw
->plw_bin_marker
)) {
2858 nbin
= ADD_MASKED(nbin
, plw
->plw_bin_step
,
2859 neq_mask
, plw
->plw_color_mask
);
2860 plw
->plw_bin_split_prev
= plw
->plw_bin0
;
2863 if (vac_colors
> 1 && nbin
== plw
->plw_bin_marker
) {
2864 plw
->plw_bin_marker
=
2865 nbin
= INC_MASKED(nbin
, neq_mask
,
2866 plw
->plw_color_mask
);
2867 plw
->plw_bin_split_prev
= plw
->plw_bin0
;
2869 * large pages all have the same vac color
2870 * so by now we should be done with next
2871 * size page splitting process
2873 ASSERT(plw
->plw_bins
[1] == 0);
2874 plw
->plw_do_split
= 0;
2879 uint_t bin_jump
= (vac_colors
== 1) ?
2880 (BIN_STEP
& ~3) - (plw
->plw_bin0
& 3) : BIN_STEP
;
2882 bin_jump
&= ~(vac_colors
- 1);
2884 nbin0
= ADD_MASKED(plw
->plw_bin0
, bin_jump
, neq_mask
,
2885 plw
->plw_color_mask
);
2887 if ((nbin0
^ plw
->plw_bin0
) & plw
->plw_ceq_mask
[szc
]) {
2889 plw
->plw_bin_marker
= nbin
= nbin0
;
2891 if (plw
->plw_bins
[nszc
] != 0) {
2893 * check if next page size bin is the
2894 * same as the next page size bin for
2897 nbin_nsz
= PAGE_GET_NSZ_COLOR(szc
,
2899 bin0_nsz
= PAGE_GET_NSZ_COLOR(szc
,
2902 if ((bin0_nsz
^ nbin_nsz
) &
2903 plw
->plw_ceq_mask
[nszc
])
2904 plw
->plw_do_split
= 1;
2911 if (plw
->plw_bins
[nszc
] != 0) {
2912 nbin_nsz
= PAGE_GET_NSZ_COLOR(szc
, nbin
);
2913 if (!((plw
->plw_split_next
^ nbin_nsz
) &
2914 plw
->plw_ceq_mask
[nszc
]))
2915 plw
->plw_do_split
= 1;
2922 page_get_mnode_freelist(int mnode
, uint_t bin
, int mtype
, uchar_t szc
,
2926 page_t
*pp
, *first_pp
;
2928 int plw_initialized
;
2929 page_list_walker_t plw
;
2931 ASSERT(szc
< mmu_page_sizes
);
2933 VM_STAT_ADD(vmm_vmstats
.pgmf_alloc
[szc
]);
2935 MTYPE_START(mnode
, mtype
, flags
);
2936 if (mtype
< 0) { /* mnode does not have memory in mtype range */
2937 VM_STAT_ADD(vmm_vmstats
.pgmf_allocempty
[szc
]);
2942 plw_initialized
= 0;
2943 plw
.plw_ceq_dif
= 1;
2946 * Only hold one freelist lock at a time, that way we
2947 * can start anywhere and not have to worry about lock
2950 for (plw
.plw_count
= 0;
2951 plw
.plw_count
< plw
.plw_ceq_dif
; plw
.plw_count
++) {
2954 if (!PAGE_FREELISTS(mnode
, szc
, bin
, mtype
))
2957 pcm
= PC_BIN_MUTEX(mnode
, bin
, PG_FREE_LIST
);
2959 pp
= PAGE_FREELISTS(mnode
, szc
, bin
, mtype
);
2964 * These were set before the page
2965 * was put on the free list,
2966 * they must still be set.
2968 ASSERT(PP_ISFREE(pp
));
2969 ASSERT(PP_ISAGED(pp
));
2970 VERIFY(pp
->p_object
== NULL
);
2971 ASSERT(pp
->p_vnode
== NULL
);
2972 ASSERT(pp
->p_offset
== (uoff_t
)-1);
2973 ASSERT(pp
->p_szc
== szc
);
2974 ASSERT(PFN_2_MEM_NODE(pp
->p_pagenum
) == mnode
);
2977 * Walk down the hash chain. 4k/8k pages are linked
2978 * on p_next and p_prev fields. Large pages are a
2979 * contiguous group of constituent pages linked
2980 * together on their p_next and p_prev fields. The
2981 * large pages are linked together on the hash chain
2982 * using p_list.largepg of the base constituent page
2983 * of each large page.
2986 while (!page_trylock_cons(pp
, SE_EXCL
)) {
2990 pp
= pp
->p_list
.largepg
.next
;
2993 ASSERT(PP_ISFREE(pp
));
2994 ASSERT(PP_ISAGED(pp
));
2995 VERIFY(pp
->p_object
== NULL
);
2996 ASSERT(pp
->p_vnode
== NULL
);
2997 ASSERT(pp
->p_offset
== (uoff_t
)-1);
2998 ASSERT(pp
->p_szc
== szc
);
2999 ASSERT(PFN_2_MEM_NODE(pp
->p_pagenum
) == mnode
);
3006 ASSERT(mtype
== PP_2_MTYPE(pp
));
3007 ASSERT(pp
->p_szc
== szc
);
3009 page_sub(&PAGE_FREELISTS(mnode
,
3010 szc
, bin
, mtype
), pp
);
3012 page_lpsub(&PAGE_FREELISTS(mnode
,
3013 szc
, bin
, mtype
), pp
);
3016 page_ctr_sub(mnode
, mtype
, pp
, PG_FREE_LIST
);
3018 if ((PP_ISFREE(pp
) == 0) || (PP_ISAGED(pp
) == 0))
3019 panic("free page is not. pp %p", (void *)pp
);
3022 #if defined(__sparc)
3023 ASSERT(!kcage_on
|| PP_ISNORELOC(pp
) ||
3024 (flags
& PG_NORELOC
) == 0);
3026 if (PP_ISNORELOC(pp
))
3027 kcage_freemem_sub(page_get_pagecnt(szc
));
3029 VM_STAT_ADD(vmm_vmstats
.pgmf_allocok
[szc
]);
3035 if (plw_initialized
== 0) {
3036 page_list_walk_init(szc
, flags
, bin
, 1, 1,
3038 plw_initialized
= 1;
3039 ASSERT(plw
.plw_colors
<=
3040 PAGE_GET_PAGECOLORS(szc
));
3041 ASSERT(plw
.plw_colors
> 0);
3042 ASSERT((plw
.plw_colors
&
3043 (plw
.plw_colors
- 1)) == 0);
3044 ASSERT(bin
< plw
.plw_colors
);
3045 ASSERT(plw
.plw_ceq_mask
[szc
] < plw
.plw_colors
);
3047 /* calculate the next bin with equivalent color */
3048 bin
= ADD_MASKED(bin
, plw
.plw_bin_step
,
3049 plw
.plw_ceq_mask
[szc
], plw
.plw_color_mask
);
3050 } while (sbin
!= bin
);
3053 * color bins are all empty if color match. Try and
3054 * satisfy the request by breaking up or coalescing
3055 * pages from a different size freelist of the correct
3056 * color that satisfies the ORIGINAL color requested.
3057 * If that fails then try pages of the same size but
3058 * different colors assuming we are not called with
3061 if (plw
.plw_do_split
&&
3062 (pp
= page_freelist_split(szc
, bin
, mnode
,
3063 mtype
, PFNNULL
, PFNNULL
, &plw
)) != NULL
)
3066 if (szc
> 0 && (pp
= page_freelist_coalesce(mnode
, szc
,
3067 bin
, plw
.plw_ceq_mask
[szc
], mtype
, PFNNULL
)) != NULL
)
3070 if (plw
.plw_ceq_dif
> 1)
3071 bin
= page_list_walk_next_bin(szc
, bin
, &plw
);
3074 /* if allowed, cycle through additional mtypes */
3075 MTYPE_NEXT(mnode
, mtype
, flags
);
3079 VM_STAT_ADD(vmm_vmstats
.pgmf_allocfailed
[szc
]);
3085 * Returns the count of free pages for 'pp' with size code 'szc'.
3086 * Note: This function does not return an exact value as the page freelist
3087 * locks are not held and thus the values in the page_counters may be
3088 * changing as we walk through the data.
3091 page_freecnt(int mnode
, page_t
*pp
, uchar_t szc
)
3095 ssize_t r
= szc
; /* region size */
3100 /* Make sure pagenum passed in is aligned properly */
3101 ASSERT((pp
->p_pagenum
& (PNUM_SIZE(szc
) - 1)) == 0);
3104 /* Prevent page_counters dynamic memory from being freed */
3105 rw_enter(&page_ctrs_rwlock
[mnode
], RW_READER
);
3106 idx
= PNUM_TO_IDX(mnode
, r
, pp
->p_pagenum
);
3107 cnt
= PAGE_COUNTERS(mnode
, r
, idx
);
3108 pgfree
= cnt
<< PNUM_SHIFT(r
- 1);
3109 range
= FULL_REGION_CNT(szc
);
3111 /* Check for completely full region */
3113 rw_exit(&page_ctrs_rwlock
[mnode
]);
3118 idx
= PNUM_TO_IDX(mnode
, r
, pp
->p_pagenum
);
3119 full
= FULL_REGION_CNT(r
);
3120 for (i
= 0; i
< range
; i
++, idx
++) {
3121 cnt
= PAGE_COUNTERS(mnode
, r
, idx
);
3123 * If cnt here is full, that means we have already
3124 * accounted for these pages earlier.
3127 pgfree
+= (cnt
<< PNUM_SHIFT(r
- 1));
3132 rw_exit(&page_ctrs_rwlock
[mnode
]);
3137 * Called from page_geti_contig_pages to exclusively lock constituent pages
3138 * starting from 'spp' for page size code 'szc'.
3140 * If 'ptcpthreshold' is set, the (approximate) number of already free pages in the 'szc'
3141 * region needs to be at least the region's page count divided by the threshold.
3144 page_trylock_contig_pages(int mnode
, page_t
*spp
, uchar_t szc
, int flags
)
3146 pgcnt_t pgcnt
= PNUM_SIZE(szc
);
3150 VM_STAT_ADD(vmm_vmstats
.ptcp
[szc
]);
3153 if ((ptcpthreshold
== 0) || (flags
& PGI_PGCPHIPRI
))
3156 * check if there are sufficient free pages available before attempting
3157 * to trylock. Count is approximate as page counters can change.
3159 pgfree
= page_freecnt(mnode
, spp
, szc
);
3161 /* attempt to trylock if there are sufficient already free pages */
3162 if (pgfree
< pgcnt
/ptcpthreshold
) {
3163 VM_STAT_ADD(vmm_vmstats
.ptcpfreethresh
[szc
]);
3169 for (i
= 0; i
< pgcnt
; i
++) {
3171 if (!page_trylock(pp
, SE_EXCL
)) {
3172 VM_STAT_ADD(vmm_vmstats
.ptcpfailexcl
[szc
]);
3173 while (--i
!= (pgcnt_t
)-1) {
3175 ASSERT(PAGE_EXCL(pp
));
3176 page_unlock_nocapture(pp
);
3180 ASSERT(spp
[i
].p_pagenum
== spp
->p_pagenum
+ i
);
3181 if ((pp
->p_szc
> szc
|| (szc
&& pp
->p_szc
== szc
)) &&
3183 VM_STAT_ADD(vmm_vmstats
.ptcpfailszc
[szc
]);
3185 page_unlock_nocapture(pp
);
3190 * If a page has been marked non-relocatable or has been
3191 * explicitly locked in memory, we don't want to relocate it;
3192 * unlock the pages and fail the operation.
3194 if (PP_ISNORELOC(pp
) ||
3195 pp
->p_lckcnt
!= 0 || pp
->p_cowcnt
!= 0) {
3196 VM_STAT_ADD(vmm_vmstats
.ptcpfailcage
[szc
]);
3197 while (i
!= (pgcnt_t
)-1) {
3199 ASSERT(PAGE_EXCL(pp
));
3200 page_unlock_nocapture(pp
);
3206 VM_STAT_ADD(vmm_vmstats
.ptcpok
[szc
]);
3211 * Claim large page pointed to by 'pp'. 'pp' is the starting set
3212 * of 'szc' constituent pages that had been locked exclusively previously.
3213 * Will attempt to relocate constituent pages in use.
3216 page_claim_contig_pages(page_t
*pp
, uchar_t szc
, int flags
)
3218 spgcnt_t pgcnt
, npgs
, i
;
3219 page_t
*targpp
, *rpp
, *hpp
;
3220 page_t
*replpp
= NULL
;
3221 page_t
*pplist
= NULL
;
3225 pgcnt
= page_get_pagecnt(szc
);
3227 ASSERT(PAGE_EXCL(pp
));
3228 ASSERT(!PP_ISNORELOC(pp
));
3229 if (PP_ISFREE(pp
)) {
3231 * If this is a PG_FREE_LIST page then its
3232 * size code can change underneath us due to
3233 * page promotion or demotion. As an optimization
3234 * use page_list_sub_pages() instead of
3237 if (PP_ISAGED(pp
)) {
3238 page_list_sub_pages(pp
, szc
);
3239 if (pp
->p_szc
== szc
) {
3242 ASSERT(pp
->p_szc
< szc
);
3243 npgs
= page_get_pagecnt(pp
->p_szc
);
3245 for (i
= 0; i
< npgs
; i
++, pp
++) {
3248 page_list_concat(&pplist
, &hpp
);
3252 ASSERT(!PP_ISAGED(pp
));
3253 ASSERT(pp
->p_szc
== 0);
3254 page_list_sub(pp
, PG_CACHE_LIST
);
3255 page_hashout(pp
, false);
3258 page_list_concat(&pplist
, &pp
);
3263 npgs
= page_get_pagecnt(pp
->p_szc
);
3266 * page_create_wait freemem accounting done by caller of
3267 * page_get_freelist and not necessary to call it prior to
3268 * calling page_get_replacement_page.
3270 * page_get_replacement_page can call page_get_contig_pages
3271 * to acquire a large page (szc > 0); the replacement must be
3272 * smaller than the contig page size to avoid looping or
3273 * szc == 0 and PGI_PGCPSZC0 is set.
3275 if (pp
->p_szc
< szc
|| (szc
== 0 && (flags
& PGI_PGCPSZC0
))) {
3276 replpp
= page_get_replacement_page(pp
, NULL
, 0);
3278 npgs
= page_get_pagecnt(pp
->p_szc
);
3279 ASSERT(npgs
<= pgcnt
);
3285 * If replacement is NULL or do_page_relocate fails, fail
3286 * coalescing of pages.
3288 if (replpp
== NULL
|| (do_page_relocate(&targpp
, &replpp
, 0,
3289 &npgs
, NULL
) != 0)) {
3291 * Unlock un-processed target list
3294 ASSERT(PAGE_EXCL(pp
));
3295 page_unlock_nocapture(pp
);
3299 * Free the processed target list.
3303 page_sub(&pplist
, pp
);
3304 ASSERT(PAGE_EXCL(pp
));
3305 ASSERT(pp
->p_szc
== szc
);
3306 ASSERT(PP_ISFREE(pp
));
3307 ASSERT(PP_ISAGED(pp
));
3309 page_list_add(pp
, PG_FREE_LIST
| PG_LIST_TAIL
);
3310 page_unlock_nocapture(pp
);
3314 page_free_replacement_page(replpp
);
3318 ASSERT(pp
== targpp
);
3320 ASSERT(hpp
= pp
); /* That's right, it's an assignment */
3326 ASSERT(PAGE_EXCL(targpp
));
3327 ASSERT(!PP_ISFREE(targpp
));
3328 ASSERT(!PP_ISNORELOC(targpp
));
3330 ASSERT(PP_ISAGED(targpp
));
3331 ASSERT(targpp
->p_szc
< szc
|| (szc
== 0 &&
3332 (flags
& PGI_PGCPSZC0
)));
3333 targpp
->p_szc
= szc
;
3334 targpp
= targpp
->p_next
;
3337 ASSERT(rpp
!= NULL
);
3338 page_sub(&replpp
, rpp
);
3339 ASSERT(PAGE_EXCL(rpp
));
3340 ASSERT(!PP_ISFREE(rpp
));
3341 page_unlock_nocapture(rpp
);
3343 ASSERT(targpp
== hpp
);
3344 ASSERT(replpp
== NULL
);
3345 page_list_concat(&pplist
, &targpp
);
3347 CHK_LPG(pplist
, szc
);
3352 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3353 * of 0 means nothing left after trim.
3356 trimkcage(struct memseg
*mseg
, pfn_t
*lo
, pfn_t
*hi
, pfn_t pfnlo
, pfn_t pfnhi
)
3362 if (PP_ISNORELOC(mseg
->pages
)) {
3363 if (PP_ISNORELOC(mseg
->epages
- 1) == 0) {
3365 /* lower part of this mseg inside kernel cage */
3366 decr
= kcage_current_pfn(&kcagepfn
);
3368 /* kernel cage may have transitioned past mseg */
3369 if (kcagepfn
>= mseg
->pages_base
&&
3370 kcagepfn
< mseg
->pages_end
) {
3372 *lo
= MAX(kcagepfn
, pfnlo
);
3373 *hi
= MIN(pfnhi
, (mseg
->pages_end
- 1));
3377 /* else entire mseg in the cage */
3379 if (PP_ISNORELOC(mseg
->epages
- 1)) {
3381 /* upper part of this mseg inside kernel cage */
3382 decr
= kcage_current_pfn(&kcagepfn
);
3384 /* kernel cage may have transitioned past mseg */
3385 if (kcagepfn
>= mseg
->pages_base
&&
3386 kcagepfn
< mseg
->pages_end
) {
3388 *hi
= MIN(kcagepfn
, pfnhi
);
3389 *lo
= MAX(pfnlo
, mseg
->pages_base
);
3393 /* entire mseg outside of kernel cage */
3394 *lo
= MAX(pfnlo
, mseg
->pages_base
);
3395 *hi
= MIN(pfnhi
, (mseg
->pages_end
- 1));
3403 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3404 * page with size code 'szc'. Claiming such a page requires acquiring
3405 * exclusive locks on all constituent pages (page_trylock_contig_pages),
3406 * relocating pages in use and concatenating these constituent pages into a
3409 * The page lists do not have such a large page and page_freelist_split has
3410 * already failed to demote larger pages and/or coalesce smaller free pages.
3412 * 'flags' may specify PG_MATCH_COLOR which would limit the search to large
3413 * pages with the same color as 'bin'.
3415 * 'pfnflag' specifies the subset of the pfn range to search.
3419 page_geti_contig_pages(int mnode
, uint_t bin
, uchar_t szc
, int flags
,
3420 pfn_t pfnlo
, pfn_t pfnhi
, pgcnt_t pfnflag
)
3422 struct memseg
*mseg
;
3423 pgcnt_t szcpgcnt
= page_get_pagecnt(szc
);
3424 pgcnt_t szcpgmask
= szcpgcnt
- 1;
3426 page_t
*pp
, *randpp
, *endpp
;
3427 uint_t colors
, ceq_mask
;
3431 MEM_NODE_ITERATOR_DECL(it
);
3433 ASSERT(szc
!= 0 || (flags
& PGI_PGCPSZC0
));
3435 pfnlo
= P2ROUNDUP(pfnlo
, szcpgcnt
);
3437 if ((pfnhi
- pfnlo
) + 1 < szcpgcnt
|| pfnlo
>= pfnhi
)
3440 ASSERT(szc
< mmu_page_sizes
);
3442 colors
= PAGE_GET_PAGECOLORS(szc
);
3443 color_mask
= colors
- 1;
3444 if ((colors
> 1) && (flags
& PG_MATCH_COLOR
)) {
3445 uchar_t ceq
= colorequivszc
[szc
];
3446 uint_t ceq_dif
= colors
>> ((ceq
>> 4) + (ceq
& 0xf));
3448 ASSERT(ceq_dif
> 0);
3449 ceq_mask
= (ceq_dif
- 1) << (ceq
& 0xf);
3454 ASSERT(bin
< colors
);
3456 /* clear "non-significant" color bits */
3460 * trim the pfn range to search based on pfnflag. pfnflag is set
3461 * when there have been previous page_get_contig_page failures to
3464 * The high bit in pfnflag specifies the number of 'slots' in the
3465 * pfn range and the remainder of pfnflag specifies which slot.
3466 * For example, a value of 1010b would mean slot 2 (slots are numbered from 0) of
3467 * the pfn range that has been divided into 8 slots.
3470 int slots
= 1 << (highbit(pfnflag
) - 1);
3471 int slotid
= pfnflag
& (slots
- 1);
3475 pfnhi
= P2ALIGN((pfnhi
+ 1), szcpgcnt
) - 1;
3476 szcpages
= ((pfnhi
- pfnlo
) + 1) / szcpgcnt
;
3477 slotlen
= howmany(szcpages
, slots
);
3478 /* skip if 'slotid' slot is empty */
3479 if (slotid
* slotlen
>= szcpages
)
3481 pfnlo
= pfnlo
+ (((slotid
* slotlen
) % szcpages
) * szcpgcnt
);
3482 ASSERT(pfnlo
< pfnhi
);
3483 if (pfnhi
> pfnlo
+ (slotlen
* szcpgcnt
))
3484 pfnhi
= pfnlo
+ (slotlen
* szcpgcnt
) - 1;
3488 * This routine can be called recursively so we shouldn't
3489 * acquire a reader lock if a write request is pending. This
3490 * could lead to a deadlock with the DR thread.
3492 * Returning NULL informs the caller that we could not get
3493 * a contig page with the required characteristics.
3496 if (!memsegs_trylock(0))
3500 * loop through memsegs to look for contig page candidates
3503 for (mseg
= memsegs
; mseg
!= NULL
; mseg
= mseg
->next
) {
3504 if (pfnhi
< mseg
->pages_base
|| pfnlo
>= mseg
->pages_end
) {
3509 if (mseg
->pages_end
- mseg
->pages_base
< szcpgcnt
)
3510 /* mseg too small */
3514 * trim off kernel cage pages from pfn range and check for
3515 * a trimmed pfn range returned that does not span the
3516 * desired large page size.
3519 if (trimkcage(mseg
, &lo
, &hi
, pfnlo
, pfnhi
) == 0 ||
3520 lo
>= hi
|| ((hi
- lo
) + 1) < szcpgcnt
)
3523 lo
= MAX(pfnlo
, mseg
->pages_base
);
3524 hi
= MIN(pfnhi
, (mseg
->pages_end
- 1));
3527 /* round to szcpgcnt boundaries */
3528 lo
= P2ROUNDUP(lo
, szcpgcnt
);
3530 MEM_NODE_ITERATOR_INIT(lo
, mnode
, szc
, &it
);
3531 hi
= P2ALIGN((hi
+ 1), szcpgcnt
) - 1;
3537 * set lo to point to the pfn for the desired bin. Large
3538 * page sizes may only have a single page color
3541 if (ceq_mask
> 0 || interleaved_mnodes
) {
3542 /* set lo to point at appropriate color */
3543 if (((PFN_2_COLOR(lo
, szc
, &it
) ^ bin
) & ceq_mask
) ||
3544 (interleaved_mnodes
&&
3545 PFN_2_MEM_NODE(lo
) != mnode
)) {
3546 PAGE_NEXT_PFN_FOR_COLOR(lo
, szc
, bin
, ceq_mask
,
3550 /* mseg cannot satisfy color request */
3554 /* randomly choose a point between lo and hi to begin search */
3556 randpfn
= (pfn_t
)GETTICK();
3557 randpfn
= ((randpfn
% (hi
- lo
)) + lo
) & ~(skip
- 1);
3558 MEM_NODE_ITERATOR_INIT(randpfn
, mnode
, szc
, &it
);
3559 if (ceq_mask
|| interleaved_mnodes
|| randpfn
== (pfn_t
)-1) {
3560 if (randpfn
!= (pfn_t
)-1) {
3561 PAGE_NEXT_PFN_FOR_COLOR(randpfn
, szc
, bin
,
3562 ceq_mask
, color_mask
, &it
);
3564 if (randpfn
>= hi
) {
3566 MEM_NODE_ITERATOR_INIT(randpfn
, mnode
, szc
,
3570 randpp
= mseg
->pages
+ (randpfn
- mseg
->pages_base
);
3572 ASSERT(randpp
->p_pagenum
== randpfn
);
3575 endpp
= mseg
->pages
+ (hi
- mseg
->pages_base
) + 1;
3577 ASSERT(randpp
+ szcpgcnt
<= endpp
);
3580 ASSERT(!(pp
->p_pagenum
& szcpgmask
));
3581 ASSERT(((PP_2_BIN(pp
) ^ bin
) & ceq_mask
) == 0);
3583 if (page_trylock_contig_pages(mnode
, pp
, szc
, flags
)) {
3584 /* pages unlocked by page_claim on failure */
3585 if (page_claim_contig_pages(pp
, szc
, flags
)) {
3591 if (ceq_mask
== 0 && !interleaved_mnodes
) {
3594 pfn_t pfn
= pp
->p_pagenum
;
3596 PAGE_NEXT_PFN_FOR_COLOR(pfn
, szc
, bin
,
3597 ceq_mask
, color_mask
, &it
);
3598 if (pfn
== (pfn_t
)-1) {
3602 (pfn
- mseg
->pages_base
);
3606 /* start from the beginning */
3607 MEM_NODE_ITERATOR_INIT(lo
, mnode
, szc
, &it
);
3608 pp
= mseg
->pages
+ (lo
- mseg
->pages_base
);
3609 ASSERT(pp
->p_pagenum
== lo
);
3610 ASSERT(pp
+ szcpgcnt
<= endpp
);
3612 } while (pp
!= randpp
);
3620 * controlling routine that searches through physical memory in an attempt to
3621 * claim a large page based on the input parameters.
3622 * on the page free lists.
3624 * calls page_geti_contig_pages with an initial pfn range from the mnode
3625 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3626 * that overlaps with the kernel cage or does not match the requested page
3627 * color if PG_MATCH_COLOR is set. Since this search is very expensive,
3628 * page_geti_contig_pages may further limit the search range based on
3629 * previous failure counts (pgcpfailcnt[]).
3631 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3632 * pagesize page that satisfies mtype.
3635 page_get_contig_pages(int mnode
, uint_t bin
, int mtype
, uchar_t szc
,
3638 pfn_t pfnlo
, pfnhi
; /* contig pages pfn range */
3640 pgcnt_t pfnflag
= 0; /* no limit on search if 0 */
3642 VM_STAT_ADD(vmm_vmstats
.pgcp_alloc
[szc
]);
3644 /* no allocations from cage */
3645 flags
|= PGI_NOCAGE
;
3647 MTYPE_START(mnode
, mtype
, flags
);
3648 if (mtype
< 0) { /* mnode does not have memory in mtype range */
3649 VM_STAT_ADD(vmm_vmstats
.pgcp_allocempty
[szc
]);
3653 ASSERT(szc
> 0 || (flags
& PGI_PGCPSZC0
));
3655 /* do not limit search and ignore color if hi pri */
3657 if (pgcplimitsearch
&& ((flags
& PGI_PGCPHIPRI
) == 0))
3658 pfnflag
= pgcpfailcnt
[szc
];
3660 /* remove color match to improve chances */
3662 if (flags
& PGI_PGCPHIPRI
|| pfnflag
)
3663 flags
&= ~PG_MATCH_COLOR
;
3666 /* get pfn range based on mnode and mtype */
3667 MNODETYPE_2_PFN(mnode
, mtype
, pfnlo
, pfnhi
);
3669 ASSERT(pfnhi
>= pfnlo
);
3671 pp
= page_geti_contig_pages(mnode
, bin
, szc
, flags
,
3672 pfnlo
, pfnhi
, pfnflag
);
3675 pfnflag
= pgcpfailcnt
[szc
];
3677 /* double the search size */
3678 pgcpfailcnt
[szc
] = pfnflag
>> 1;
3680 VM_STAT_ADD(vmm_vmstats
.pgcp_allocok
[szc
]);
3683 MTYPE_NEXT(mnode
, mtype
, flags
);
3684 } while (mtype
>= 0);
3686 VM_STAT_ADD(vmm_vmstats
.pgcp_allocfailed
[szc
]);
3690 #if defined(__i386) || defined(__amd64)
3692 * Determine the likelihood of finding/coalescing a szc page.
3693 * Return 0 if the likelihood is small otherwise return 1.
3695 * For now, be conservative and check only 1g pages and return 0
3696 * if there had been previous coalescing failures and the szc pages
3697 * needed to satisfy request would exhaust most of freemem.
3700 page_chk_freelist(uint_t szc
)
3707 pgcnt
= page_get_pagecnt(szc
);
3708 if (pgcpfailcnt
[szc
] && pgcnt
+ throttlefree
>= freemem
) {
3709 VM_STAT_ADD(vmm_vmstats
.pcf_deny
[szc
]);
3712 VM_STAT_ADD(vmm_vmstats
.pcf_allow
[szc
]);
3718 * Find the `best' page on the freelist for this (obj,off) (as,vaddr) pair.
3720 * Does its own locking and accounting.
3721 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3722 * pages of the proper color even if there are pages of a different color.
3724 * Finds a page, removes it, THEN locks it.
3729 page_get_freelist(struct vmobject
*obj
, uoff_t off
, struct seg
*seg
,
3730 caddr_t vaddr
, size_t size
, uint_t flags
, struct lgrp
*lgrp
)
3732 struct as
*as
= seg
->s_as
;
3738 page_t
*(*page_get_func
)(int, uint_t
, int, uchar_t
, uint_t
);
3739 lgrp_mnode_cookie_t lgrp_cookie
;
3741 page_get_func
= page_get_mnode_freelist
;
3744 * If we aren't passed a specific lgroup, or passed a freed lgrp
3745 * assume we wish to allocate near to the current thread's home.
3747 if (!LGRP_EXISTS(lgrp
))
3748 lgrp
= lgrp_home_lgrp();
3751 if ((flags
& (PG_NORELOC
| PG_PANIC
)) == PG_NORELOC
&&
3752 kcage_freemem
< kcage_throttlefree
+ btop(size
) &&
3753 curthread
!= kcage_cageout_thread
) {
3755 * Set a "reserve" of kcage_throttlefree pages for
3756 * PG_PANIC and cageout thread allocations.
3758 * Everybody else has to serialize in
3759 * page_create_get_something() to get a cage page, so
3760 * that we don't deadlock cageout!
3765 flags
&= ~PG_NORELOC
;
3766 flags
|= PGI_NOCAGE
;
3769 MTYPE_INIT(mtype
, obj
->vnode
, vaddr
, flags
, size
);
3772 * Convert size to page size code.
3774 if ((szc
= page_szc(size
)) == (uchar_t
)-1)
3775 panic("page_get_freelist: illegal page size request");
3776 ASSERT(szc
< mmu_page_sizes
);
3778 VM_STAT_ADD(vmm_vmstats
.pgf_alloc
[szc
]);
3780 AS_2_BIN(as
, seg
, obj
->vnode
, vaddr
, bin
, szc
);
3782 ASSERT(bin
< PAGE_GET_PAGECOLORS(szc
));
3785 * Try to get a local page first, but try remote if we can't
3786 * get a page of the right color.
3789 LGRP_MNODE_COOKIE_INIT(lgrp_cookie
, lgrp
, LGRP_SRCH_LOCAL
);
3790 while ((mnode
= lgrp_memnode_choose(&lgrp_cookie
)) >= 0) {
3791 pp
= page_get_func(mnode
, bin
, mtype
, szc
, flags
);
3793 VM_STAT_ADD(vmm_vmstats
.pgf_allocok
[szc
]);
3794 DTRACE_PROBE4(page__get
,
3805 * for non-SZC0 PAGESIZE requests, check cachelist before checking
3806 * remote free lists. Caller expected to call page_get_cachelist which
3807 * will check local cache lists and remote free lists.
3809 if (szc
== 0 && ((flags
& PGI_PGCPSZC0
) == 0)) {
3810 VM_STAT_ADD(vmm_vmstats
.pgf_allocdeferred
);
3814 ASSERT(szc
> 0 || (flags
& PGI_PGCPSZC0
));
3816 lgrp_stat_add(lgrp
->lgrp_id
, LGRP_NUM_ALLOC_FAIL
, 1);
3818 if (!(flags
& PG_LOCAL
)) {
3820 * Try to get a non-local freelist page.
3822 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie
);
3823 while ((mnode
= lgrp_memnode_choose(&lgrp_cookie
)) >= 0) {
3824 pp
= page_get_func(mnode
, bin
, mtype
, szc
, flags
);
3826 DTRACE_PROBE4(page__get
,
3831 VM_STAT_ADD(vmm_vmstats
.pgf_allocokrem
[szc
]);
3839 * when the cage is off chances are page_get_contig_pages() will fail
3840 * to lock a large page chunk therefore when the cage is off it's not
3841 * called by default. this can be changed via /etc/system.
3843 * page_get_contig_pages() also called to acquire a base pagesize page
3844 * for page_create_get_something().
3846 if (!(flags
& PG_NORELOC
) && (pg_contig_disable
== 0) &&
3847 (kcage_on
|| pg_lpgcreate_nocage
|| szc
== 0) &&
3848 (page_get_func
!= page_get_contig_pages
)) {
3850 VM_STAT_ADD(vmm_vmstats
.pgf_allocretry
[szc
]);
3851 page_get_func
= page_get_contig_pages
;
3855 if (!(flags
& PG_LOCAL
) && pgcplimitsearch
&&
3856 page_get_func
== page_get_contig_pages
)
3857 SETPGCPFAILCNT(szc
);
3859 VM_STAT_ADD(vmm_vmstats
.pgf_allocfailed
[szc
]);
3864 * Find the `best' page on the cachelist for this (obj,off) (as,vaddr) pair.
3866 * Does its own locking.
3867 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3868 * pages of the proper color even if there are pages of a different color.
3869 * Otherwise, scan the bins for ones with pages. For each bin with pages,
3870 * try to lock one of them. If no page can be locked, try the
3871 * next bin. Return NULL if a page can not be found and locked.
3873 * Finds a page, tries to lock it, then removes it.
3878 page_get_cachelist(struct vmobject
*obj
, uoff_t off
, struct seg
*seg
,
3879 caddr_t vaddr
, uint_t flags
, struct lgrp
*lgrp
)
3882 struct as
*as
= seg
->s_as
;
3886 lgrp_mnode_cookie_t lgrp_cookie
;
3889 * If we aren't passed a specific lgroup, or passed a freed lgrp,
3890 * assume we wish to allocate near to the current thread's home.
3892 if (!LGRP_EXISTS(lgrp
))
3893 lgrp
= lgrp_home_lgrp();
3896 flags
&= ~PG_NORELOC
;
3897 flags
|= PGI_NOCAGE
;
3900 if ((flags
& (PG_NORELOC
| PG_PANIC
| PG_PUSHPAGE
)) == PG_NORELOC
&&
3901 kcage_freemem
<= kcage_throttlefree
) {
3903 * Reserve kcage_throttlefree pages for critical kernel
3906 * Everybody else has to go to page_create_get_something()
3907 * to get a cage page, so we don't deadlock cageout.
3912 AS_2_BIN(as
, seg
, obj
->vnode
, vaddr
, bin
, 0);
3914 ASSERT(bin
< PAGE_GET_PAGECOLORS(0));
3916 MTYPE_INIT(mtype
, obj
->vnode
, vaddr
, flags
, MMU_PAGESIZE
);
3918 VM_STAT_ADD(vmm_vmstats
.pgc_alloc
);
3921 * Try local cachelists first
3923 LGRP_MNODE_COOKIE_INIT(lgrp_cookie
, lgrp
, LGRP_SRCH_LOCAL
);
3924 while ((mnode
= lgrp_memnode_choose(&lgrp_cookie
)) >= 0) {
3925 pp
= page_get_mnode_cachelist(bin
, flags
, mnode
, mtype
);
3927 VM_STAT_ADD(vmm_vmstats
.pgc_allocok
);
3928 DTRACE_PROBE4(page__get
,
3937 lgrp_stat_add(lgrp
->lgrp_id
, LGRP_NUM_ALLOC_FAIL
, 1);
3940 * Try freelists/cachelists that are farther away
3941 * This is our only chance to allocate remote pages for PAGESIZE
3944 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie
);
3945 while ((mnode
= lgrp_memnode_choose(&lgrp_cookie
)) >= 0) {
3946 pp
= page_get_mnode_freelist(mnode
, bin
, mtype
,
3949 VM_STAT_ADD(vmm_vmstats
.pgc_allocokdeferred
);
3950 DTRACE_PROBE4(page__get
,
3957 pp
= page_get_mnode_cachelist(bin
, flags
, mnode
, mtype
);
3959 VM_STAT_ADD(vmm_vmstats
.pgc_allocokrem
);
3960 DTRACE_PROBE4(page__get
,
3969 VM_STAT_ADD(vmm_vmstats
.pgc_allocfailed
);
3974 page_get_mnode_cachelist(uint_t bin
, uint_t flags
, int mnode
, int mtype
)
3977 page_t
*pp
, *first_pp
;
3979 int plw_initialized
;
3980 page_list_walker_t plw
;
3982 VM_STAT_ADD(vmm_vmstats
.pgmc_alloc
);
3984 MTYPE_START(mnode
, mtype
, flags
);
3985 if (mtype
< 0) { /* mnode does not have memory in mtype range */
3986 VM_STAT_ADD(vmm_vmstats
.pgmc_allocempty
);
3992 plw_initialized
= 0;
3993 plw
.plw_ceq_dif
= 1;
3996 * Only hold one cachelist lock at a time, that way we
3997 * can start anywhere and not have to worry about lock
4001 for (plw
.plw_count
= 0;
4002 plw
.plw_count
< plw
.plw_ceq_dif
; plw
.plw_count
++) {
4006 if (!PAGE_CACHELISTS(mnode
, bin
, mtype
))
4008 pcm
= PC_BIN_MUTEX(mnode
, bin
, PG_CACHE_LIST
);
4010 pp
= PAGE_CACHELISTS(mnode
, bin
, mtype
);
4015 VERIFY(pp
->p_object
);
4016 ASSERT(pp
->p_vnode
);
4017 ASSERT(PP_ISAGED(pp
) == 0);
4018 ASSERT(pp
->p_szc
== 0);
4019 ASSERT(PFN_2_MEM_NODE(pp
->p_pagenum
) == mnode
);
4020 while (!page_trylock(pp
, SE_EXCL
)) {
4022 ASSERT(pp
->p_szc
== 0);
4023 if (pp
== first_pp
) {
4025 * We have searched the complete list!
4026 * And all of them (might only be one)
4027 * are locked. This can happen since
4028 * these pages can also be found via
4029 * the hash list. When found via the
4030 * hash list, they are locked first,
4031 * then removed. We give up to let the
4037 VERIFY(pp
->p_object
);
4038 ASSERT(pp
->p_vnode
);
4039 ASSERT(PP_ISFREE(pp
));
4040 ASSERT(PP_ISAGED(pp
) == 0);
4041 ASSERT(PFN_2_MEM_NODE(pp
->p_pagenum
) ==
4048 * Found and locked a page.
4049 * Pull it off the list.
4051 ASSERT(mtype
== PP_2_MTYPE(pp
));
4052 ppp
= &PAGE_CACHELISTS(mnode
, bin
, mtype
);
4055 * Subtract counters before releasing pcm mutex
4056 * to avoid a race with page_freelist_coalesce
4057 * and page_freelist_split.
4059 page_ctr_sub(mnode
, mtype
, pp
, PG_CACHE_LIST
);
4061 VERIFY(pp
->p_object
);
4062 ASSERT(pp
->p_vnode
);
4063 ASSERT(PP_ISAGED(pp
) == 0);
4064 #if defined(__sparc)
4066 (flags
& PG_NORELOC
) == 0 ||
4068 if (PP_ISNORELOC(pp
)) {
4069 kcage_freemem_sub(1);
4072 VM_STAT_ADD(vmm_vmstats
. pgmc_allocok
);
4078 if (plw_initialized
== 0) {
4079 page_list_walk_init(0, flags
, bin
, 0, 1, &plw
);
4080 plw_initialized
= 1;
4082 /* calculate the next bin with equivalent color */
4083 bin
= ADD_MASKED(bin
, plw
.plw_bin_step
,
4084 plw
.plw_ceq_mask
[0], plw
.plw_color_mask
);
4085 } while (sbin
!= bin
);
4087 if (plw
.plw_ceq_dif
> 1)
4088 bin
= page_list_walk_next_bin(0, bin
, &plw
);
4091 MTYPE_NEXT(mnode
, mtype
, flags
);
4095 VM_STAT_ADD(vmm_vmstats
.pgmc_allocfailed
);
4100 #define REPL_PAGE_STATS
4103 #ifdef REPL_PAGE_STATS
4104 struct repl_page_stats
{
4106 uint_t ngets_noreloc
;
4107 uint_t npgr_noreloc
;
4108 uint_t nnopage_first
;
4114 #define REPL_STAT_INCR(v) atomic_inc_32(&repl_page_stats.v)
4115 #else /* REPL_PAGE_STATS */
4116 #define REPL_STAT_INCR(v)
4117 #endif /* REPL_PAGE_STATS */
4122 * The freemem accounting must be done by the caller.
4123 * First we try to get a replacement page of the same size as like_pp,
4124 * if that is not possible, then we just get a set of discontiguous
4128 page_get_replacement_page(page_t
*orig_like_pp
, struct lgrp
*lgrp_target
,
4132 page_t
*pp
, *pplist
;
4135 int mnode
, page_mnode
;
4137 spgcnt_t npgs
, pg_cnt
;
4141 lgrp_mnode_cookie_t lgrp_cookie
;
4144 REPL_STAT_INCR(ngets
);
4145 like_pp
= orig_like_pp
;
4146 ASSERT(PAGE_EXCL(like_pp
));
4148 szc
= like_pp
->p_szc
;
4149 npgs
= page_get_pagecnt(szc
);
4151 * Now we reset like_pp to the base page_t.
4152 * That way, we won't walk past the end of this 'szc' page.
4154 pfnum
= PFN_BASE(like_pp
->p_pagenum
, szc
);
4155 like_pp
= page_numtopp_nolock(pfnum
);
4156 ASSERT(like_pp
->p_szc
== szc
);
4158 if (PP_ISNORELOC(like_pp
)) {
4160 REPL_STAT_INCR(ngets_noreloc
);
4161 flags
= PGI_RELOCONLY
;
4162 } else if (pgrflags
& PGR_NORELOC
) {
4164 REPL_STAT_INCR(npgr_noreloc
);
4169 * Kernel pages must always be replaced with the same size
4170 * pages, since we cannot properly handle demotion of kernel
4173 if (PP_ISKAS(like_pp
))
4174 pgrflags
|= PGR_SAMESZC
;
4176 MTYPE_PGR_INIT(mtype
, flags
, like_pp
, page_mnode
, npgs
);
4181 pg_cnt
= page_get_pagecnt(szc
);
4182 bin
= PP_2_BIN(like_pp
);
4183 ASSERT(like_pp
->p_szc
== orig_like_pp
->p_szc
);
4184 ASSERT(pg_cnt
<= npgs
);
4187 * If an lgroup was specified, try to get the
4188 * page from that lgroup.
4189 * NOTE: Must be careful with code below because
4190 * lgroup may disappear and reappear since there
4191 * is no locking for lgroup here.
4193 if (LGRP_EXISTS(lgrp_target
)) {
4195 * Keep local variable for lgroup separate
4196 * from lgroup argument since this code should
4197 * only be exercised when lgroup argument
4202 /* Try the lgroup's freelists first */
4203 LGRP_MNODE_COOKIE_INIT(lgrp_cookie
, lgrp
,
4205 while ((pplist
== NULL
) &&
4206 (mnode
= lgrp_memnode_choose(&lgrp_cookie
))
4209 page_get_mnode_freelist(mnode
, bin
,
4214 * Now try its cachelists if this is a
4215 * small page. Don't need to do it for
4216 * larger ones since page_freelist_coalesce()
4219 if (pplist
!= NULL
|| szc
!= 0)
4222 /* Now try its cachelists */
4223 LGRP_MNODE_COOKIE_INIT(lgrp_cookie
, lgrp
,
4226 while ((pplist
== NULL
) &&
4227 (mnode
= lgrp_memnode_choose(&lgrp_cookie
))
4230 page_get_mnode_cachelist(bin
, flags
,
4233 if (pplist
!= NULL
) {
4234 page_hashout(pplist
, false);
4236 REPL_STAT_INCR(nhashout
);
4239 /* Done looking in this lgroup. Bail out. */
4244 * No lgroup was specified (or lgroup was removed by
4245 * DR), so just try to get the page as close to
4246 * like_pp's mnode as possible.
4247 * First try the local freelist...
4249 mnode
= PP_2_MEM_NODE(like_pp
);
4250 pplist
= page_get_mnode_freelist(mnode
, bin
,
4255 REPL_STAT_INCR(nnofree
);
4258 * ...then the local cachelist. Don't need to do it for
4259 * larger pages because page_freelist_coalesce() already
4260 * failed there anyway.
4263 pplist
= page_get_mnode_cachelist(bin
, flags
,
4265 if (pplist
!= NULL
) {
4266 page_hashout(pplist
, false);
4268 REPL_STAT_INCR(nhashout
);
4273 /* Now try remote freelists */
4276 lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode
));
4277 LGRP_MNODE_COOKIE_INIT(lgrp_cookie
, lgrp
,
4279 while (pplist
== NULL
&&
4280 (mnode
= lgrp_memnode_choose(&lgrp_cookie
))
4285 if ((mnode
== page_mnode
) ||
4286 (mem_node_config
[mnode
].exists
== 0))
4289 pplist
= page_get_mnode_freelist(mnode
,
4290 bin
, mtype
, szc
, flags
);
4297 /* Now try remote cachelists */
4298 LGRP_MNODE_COOKIE_INIT(lgrp_cookie
, lgrp
,
4300 while (pplist
== NULL
&& szc
== 0) {
4301 mnode
= lgrp_memnode_choose(&lgrp_cookie
);
4307 if ((mnode
== page_mnode
) ||
4308 (mem_node_config
[mnode
].exists
== 0))
4311 pplist
= page_get_mnode_cachelist(bin
,
4312 flags
, mnode
, mtype
);
4314 if (pplist
!= NULL
) {
4315 page_hashout(pplist
, false);
4317 REPL_STAT_INCR(nhashout
);
4323 * Break out of while loop under the following cases:
4324 * - If we successfully got a page.
4325 * - If pgrflags specified only returning a specific
4326 * page size and we could not find that page size.
4327 * - If we could not satisfy the request with PAGESIZE
4330 if (pplist
!= NULL
|| szc
== 0)
4333 if ((pgrflags
& PGR_SAMESZC
) || pgrppgcp
) {
4334 /* try to find contig page */
4336 LGRP_MNODE_COOKIE_INIT(lgrp_cookie
, lgrp
,
4339 while ((pplist
== NULL
) &&
4341 lgrp_memnode_choose(&lgrp_cookie
))
4343 pplist
= page_get_contig_pages(
4344 mnode
, bin
, mtype
, szc
,
4345 flags
| PGI_PGCPHIPRI
);
4351 * The correct thing to do here is try the next
4352 * page size down using szc--. Due to a bug
4353 * with the processing of HAT_RELOAD_SHARE
4354 * where the sfmmu_ttecnt arrays of all
4355 * hats sharing an ISM segment don't get updated,
4356 * using intermediate size pages for relocation
4357 * can lead to continuous page faults.
4362 if (pplist
!= NULL
) {
4363 DTRACE_PROBE4(page__get
,
4369 while (pplist
!= NULL
&& pg_cnt
--) {
4370 ASSERT(pplist
!= NULL
);
4372 page_sub(&pplist
, pp
);
4375 page_list_concat(&pl
, &pp
);
4377 like_pp
= like_pp
+ 1;
4378 REPL_STAT_INCR(nnext_pp
);
4380 ASSERT(pg_cnt
== 0);
4388 * We were unable to allocate the necessary number
4390 * We need to free up any pl.
4392 REPL_STAT_INCR(nnopage
);
4393 page_free_replacement_page(pl
);
4401 * Demote a free large page to its constituent pages
4404 page_demote_free_pages(page_t
*pp
)
4410 ASSERT(PAGE_LOCKED(pp
));
4411 ASSERT(PP_ISFREE(pp
));
4412 ASSERT(pp
->p_szc
!= 0 && pp
->p_szc
< mmu_page_sizes
);
4414 mnode
= PP_2_MEM_NODE(pp
);
4415 page_freelist_lock(mnode
);
4416 if (pp
->p_szc
!= 0) {
4417 (void) page_demote(mnode
, PFN_BASE(pp
->p_pagenum
,
4418 pp
->p_szc
), 0, pp
->p_szc
, 0, PC_NO_COLOR
, PC_FREE
);
4420 page_freelist_unlock(mnode
);
4421 ASSERT(pp
->p_szc
== 0);
4425 * Factor in colorequiv to check additional 'equivalent' bins.
4426 * colorequiv may be set in /etc/system
4429 page_set_colorequiv_arr(void)
4431 if (colorequiv
> 1) {
4433 uint_t sv_a
= lowbit(colorequiv
) - 1;
4438 for (i
= 0; i
< MMU_PAGE_SIZES
; i
++) {
4442 if ((colors
= hw_page_array
[i
].hp_colors
) <= 1) {
4445 while ((colors
>> a
) == 0)
4447 if ((a
<< 4) > colorequivszc
[i
]) {
4448 colorequivszc
[i
] = (a
<< 4);