4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2010, Intel Corporation.
26 * All rights reserved.
30 * UNIX machine dependent virtual memory support.
41 #include <sys/clock.h>
42 #include <vm/hat_pte.h>
43 #include <sys/param.h>
44 #include <sys/memnode.h>
47 * WARNING: vm_dep.h is included by files in common.
/* Read the current tick count; expands to a call to tsc_read(). */
#define	GETTICK()	(tsc_read())
52 * Do not use this function for obtaining clock tick. This
53 * is called by callers who do not need to have a guaranteed
54 * correct tick value. The proper routine to use is tsc_read().
57 extern u_longlong_t
randtick();
58 extern uint_t
page_create_update_flags_x86(uint_t
);
60 extern size_t plcnt_sz(size_t);
61 #define PLCNT_SZ(ctrs_sz) (ctrs_sz = plcnt_sz(ctrs_sz))
63 extern caddr_t
plcnt_init(caddr_t
);
64 #define PLCNT_INIT(addr) (addr = plcnt_init(addr))
66 extern void plcnt_inc_dec(page_t
*, int, int, long, int);
67 #define PLCNT_INCR(pp, mnode, mtype, szc, flags) \
68 plcnt_inc_dec(pp, mtype, szc, 1l << PAGE_BSZS_SHIFT(szc), flags)
69 #define PLCNT_DECR(pp, mnode, mtype, szc, flags) \
70 plcnt_inc_dec(pp, mtype, szc, \
71 (long)(ULONG_MAX << PAGE_BSZS_SHIFT(szc)), flags)
74 * macro to update page list max counts. no-op on x86.
76 #define PLCNT_XFER_NORELOC(pp)
79 * macro to modify the page list max counts when memory is added to
80 * the page lists during startup (add_physmem) or during a DR operation
81 * when memory is added (kphysm_add_memory_dynamic) or deleted
82 * (kphysm_del_cleanup).
84 #define PLCNT_MODIFY_MAX(pfn, cnt) mtype_modify_max(pfn, cnt)
86 extern int memrange_num(pfn_t
);
87 extern int pfn_2_mtype(pfn_t
);
88 extern int mtype_func(int, int, uint_t
);
89 extern void mtype_modify_max(pfn_t
, long);
90 extern int mnode_pgcnt(int);
91 extern int mnode_range_cnt(int);
94 * candidate counters in vm_pagelist.c are indexed by color and range
#define	NUM_MEM_RANGES		4	/* memory range types */
#define	MAX_MNODE_MRANGES	NUM_MEM_RANGES

/* Range count for a memory node, as reported by mnode_range_cnt(). */
#define	MNODE_RANGE_CNT(mnode)	mnode_range_cnt(mnode)

/* memrange_num() applied to the node's physbase from mem_node_config[]. */
#define	MNODE_MAX_MRANGE(mnode)						\
	memrange_num(mem_node_config[mnode].physbase)
102 * This was really badly defined, it implicitly uses mnode_maxmrange[]
103 * which is a static in vm_pagelist.c
extern int mtype_2_mrange(int);

/*
 * Subtracts mtype_2_mrange(mtype) from mnode_maxmrange[mnode].  The
 * macro names mnode_maxmrange[] directly, so that array must be visible
 * wherever the macro is expanded.
 */
#define	MTYPE_2_MRANGE(mnode, mtype)					\
	(mnode_maxmrange[mnode] - mtype_2_mrange(mtype))
110 * Per page size free lists. Allocated dynamically.
111 * dimensions [mtype][mmu_page_sizes][colors]
113 * mtype specifies a physical memory range with a unique mnode.
116 extern page_t
****page_freelists
;
118 #define PAGE_FREELISTS(mnode, szc, color, mtype) \
119 (*(page_freelists[mtype][szc] + (color)))
122 * For now there is only a single size cache list. Allocated dynamically.
123 * dimensions [mtype][colors]
125 * mtype specifies a physical memory range with a unique mnode.
127 extern page_t
***page_cachelists
;
129 #define PAGE_CACHELISTS(mnode, color, mtype) \
130 (*(page_cachelists[mtype] + (color)))
133 * There are mutexes for both the page freelist
134 * and the page cachelist. We want enough locks to make contention
135 * reasonable, but not too many -- otherwise page_freelist_lock() gets
136 * so expensive that it becomes the bottleneck!
141 extern kmutex_t
*fpc_mutex
[NPC_MUTEX
];
142 extern kmutex_t
*cpc_mutex
[NPC_MUTEX
];
144 extern page_t
*page_get_mnode_freelist(int, uint_t
, int, uchar_t
, uint_t
);
145 extern page_t
*page_get_mnode_cachelist(uint_t
, uint_t
, int, int);
147 /* mem node iterator is not used on x86 */
/*
 * Both macros below have empty replacement lists, so every use expands
 * to nothing and the arguments are discarded without being evaluated.
 */
148 #define MEM_NODE_ITERATOR_DECL(it)
149 #define MEM_NODE_ITERATOR_INIT(pfn, mnode, szc, it)
152 * interleaved_mnodes mode is never set on x86, therefore,
153 * simply return the limits of the given mnode, which then
154 * determines the length of hpm_counters array for the mnode.
156 #define HPM_COUNTERS_LIMITS(mnode, physbase, physmax, first) \
158 (physbase) = mem_node_config[(mnode)].physbase; \
159 (physmax) = mem_node_config[(mnode)].physmax; \
163 #define PAGE_CTRS_WRITE_LOCK(mnode) \
165 rw_enter(&page_ctrs_rwlock[(mnode)], RW_WRITER);\
166 page_freelist_lock(mnode); \
169 #define PAGE_CTRS_WRITE_UNLOCK(mnode) \
171 page_freelist_unlock(mnode); \
172 rw_exit(&page_ctrs_rwlock[(mnode)]); \
176 * macro to call page_ctrs_adjust() when memory is added
177 * during a DR operation.
179 #define PAGE_CTRS_ADJUST(pfn, cnt, rv) { \
180 spgcnt_t _cnt = (spgcnt_t)(cnt); \
183 pfn_t _pfn = (pfn); \
184 pfn_t _endpfn = _pfn + _cnt; \
185 while (_pfn < _endpfn) { \
186 _mn = PFN_2_MEM_NODE(_pfn); \
187 _np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - _pfn; \
189 if ((rv = page_ctrs_adjust(_mn)) != 0) \
/* Field accessors for the hw_page_array[] entry of page size code szc. */
#define	PAGE_GET_SHIFT(szc)		(hw_page_array[(szc)].hp_shift)
#define	PAGE_GET_PAGECOLORS(szc)	(hw_page_array[(szc)].hp_colors)
#define	PNUM_SIZE(szc)			(hw_page_array[(szc)].hp_pgcnt)

/* Difference between the hp_shift values of nszc and szc. */
#define	PAGE_GET_COLOR_SHIFT(szc, nszc)					\
	(PAGE_GET_SHIFT(nszc) - PAGE_GET_SHIFT(szc))

/* hp_shift of szc relative to entry 0 of hw_page_array[]. */
#define	PNUM_SHIFT(szc)			PAGE_GET_COLOR_SHIFT(0, (szc))

/* Shift ncolor left by the szc -> nszc shift difference. */
#define	PAGE_CONVERT_COLOR(ncolor, szc, nszc)				\
	((ncolor) << PAGE_GET_COLOR_SHIFT((szc), (nszc)))

/*
 * Mask pfn with page_colors_mask, then shift right by PNUM_SHIFT(szc).
 * The "it" argument is accepted but unused.
 */
#define	PFN_2_COLOR(pfn, szc, it)					\
	(((pfn) & page_colors_mask) >> PNUM_SHIFT(szc))
214 * This macro calculates the next sequential pfn with the specified
215 * color using color equivalency mask
217 #define PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask, it) \
219 uint_t pfn_shift = PAGE_BSZS_SHIFT(szc); \
220 pfn_t spfn = pfn >> pfn_shift; \
221 pfn_t stride = (ceq_mask) + 1; \
222 ASSERT(((color) & ~(ceq_mask)) == 0); \
223 ASSERT((((ceq_mask) + 1) & (ceq_mask)) == 0); \
224 if (((spfn ^ (color)) & (ceq_mask)) == 0) { \
225 pfn += stride << pfn_shift; \
227 pfn = (spfn & ~(pfn_t)(ceq_mask)) | (color); \
228 pfn = (pfn > spfn ? pfn : pfn + stride) << pfn_shift; \
/*
 * Scale a color equivalency mask, or a color, from szc to szc + 1 by
 * shifting it right by the difference between the two PAGE_GET_SHIFT()
 * values.
 */
#define	PAGE_GET_NSZ_MASK(szc, mask)					\
	((mask) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc)))

#define	PAGE_GET_NSZ_COLOR(szc, color)					\
	((color) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc)))
/*
 * Page-pointer and pfn helpers.  Every macro argument is parenthesized
 * before use so that expression arguments such as "pp + 1" or "a | b"
 * expand with the intended grouping.
 */

/* Find the bin for the given page if it was of size szc */
#define	PP_2_BIN_SZC(pp, szc)	(PFN_2_COLOR((pp)->p_pagenum, szc, NULL))

/* Bin for the page at its own p_szc. */
#define	PP_2_BIN(pp)		(PP_2_BIN_SZC(pp, (pp)->p_szc))

#define	PP_2_MEM_NODE(pp)	(PFN_2_MEM_NODE((pp)->p_pagenum))
#define	PP_2_MTYPE(pp)		(pfn_2_mtype((pp)->p_pagenum))
#define	PP_2_SZC(pp)		((pp)->p_szc)

/* 1 << PAGE_BSZS_SHIFT(szc); evaluated in int arithmetic. */
#define	SZCPAGES(szc)		(1 << PAGE_BSZS_SHIFT(szc))

/* pfnum with the low PAGE_BSZS_SHIFT(szc) bits cleared. */
#define	PFN_BASE(pfnum, szc)	((pfnum) & ~(SZCPAGES(szc) - 1))
253 * this structure is used for walking free page lists
254 * controls when to split large pages into smaller pages,
255 * and when to coalesce smaller pages into larger pages
257 typedef struct page_list_walker
{
258 uint_t plw_colors
; /* num of colors for szc */
259 uint_t plw_color_mask
; /* colors-1 */
260 uint_t plw_bin_step
; /* next bin: 1 or 2 */
261 uint_t plw_count
; /* loop count */
262 uint_t plw_bin0
; /* starting bin */
263 uint_t plw_bin_marker
; /* bin after initial jump */
264 uint_t plw_bin_split_prev
; /* last bin we tried to split */
265 uint_t plw_do_split
; /* set if OK to split */
266 uint_t plw_split_next
; /* next bin to split */
267 uint_t plw_ceq_dif
; /* number of different color groups */
269 uint_t plw_ceq_mask
[MMU_PAGE_SIZES
+ 1]; /* color equiv mask */
270 uint_t plw_bins
[MMU_PAGE_SIZES
+ 1]; /* num of bins */
271 } page_list_walker_t
;
273 void page_list_walk_init(uchar_t szc
, uint_t flags
, uint_t bin
,
274 int can_split
, int use_ceq
, page_list_walker_t
*plw
);
276 uint_t
page_list_walk_next_bin(uchar_t szc
, uint_t bin
,
277 page_list_walker_t
*plw
);
279 extern struct cpu cpus
[];
282 extern int mtype_init(vnode_t
*, caddr_t
, uint_t
*, size_t);
283 #define MTYPE_INIT(mtype, vp, vaddr, flags, pgsz) \
284 (mtype = mtype_init(vp, vaddr, &(flags), pgsz))
287 * macros to loop through the mtype range (page_get_mnode_{free,cache,any}list,
288 * and page_get_contig_pages)
290 * MTYPE_START sets the initial mtype. -1 if the mtype range specified does
293 * MTYPE_NEXT sets the next mtype. -1 if there are no more valid
294 * mtype in the range.
/* Replace mtype with the value mtype_func() returns for it. */
#define	MTYPE_START(mnode, mtype, flags)				\
	((mtype) = mtype_func(mnode, mtype, flags))
300 #define MTYPE_NEXT(mnode, mtype, flags) { \
301 if (flags & PGI_MT_RANGE) { \
302 mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT); \
308 extern int mtype_pgr_init(int *, page_t
*, int, pgcnt_t
);
309 #define MTYPE_PGR_INIT(mtype, flags, pp, mnode, pgcnt) \
310 (mtype = mtype_pgr_init(&flags, pp, mnode, pgcnt))
312 #define MNODE_PGCNT(mnode) mnode_pgcnt(mnode)
314 extern void mnodetype_2_pfn(int, int, pfn_t
*, pfn_t
*);
315 #define MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi) \
316 mnodetype_2_pfn(mnode, mtype, &pfnlo, &pfnhi)
/*
 * Select the mutex covering (mnode, bin): the fpc_mutex entry when
 * PG_FREE_LIST is set in flags, otherwise the cpc_mutex entry.  bin is
 * reduced with (NPC_MUTEX - 1) to pick the mutex array.
 *
 * flags is parenthesized before the test so that an expression argument
 * such as "a | b" is checked as a whole instead of being regrouped by
 * operator precedence.
 */
#define	PC_BIN_MUTEX(mnode, bin, flags)					\
	(((flags) & PG_FREE_LIST) ?					\
	    &fpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode] :		\
	    &cpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode])

/* Address of mutex i for mnode in the free / cache mutex arrays. */
#define	FPC_MUTEX(mnode, i)	(&fpc_mutex[i][mnode])
#define	CPC_MUTEX(mnode, i)	(&cpc_mutex[i][mnode])
326 #define CHK_LPG(pp, szc) chk_lpg(pp, szc)
327 extern void chk_lpg(page_t
*, uchar_t
);
329 #define CHK_LPG(pp, szc)
/*
 * LEVEL_SIZE(rg_szc) shifted right by LEVEL_SHIFT of the next smaller
 * size code.  rg_szc is parenthesized before the "- 1" so an expression
 * argument is decremented as a whole.
 */
#define	FULL_REGION_CNT(rg_szc)						\
	(LEVEL_SIZE(rg_szc) >> LEVEL_SHIFT((rg_szc) - 1))
/*
 * Step back from pp by the offset of its p_pagenum within an
 * SZCPAGES(szc)-sized group, yielding the first page_t of that group.
 */
#define	PP_GROUPLEADER(pp, szc)						\
	((pp) - (int)((pp)->p_pagenum & (SZCPAGES(szc) - 1)))

/* Root page for pp according to its own p_szc; pp itself when p_szc is 0. */
#define	PP_PAGEROOT(pp)							\
	((pp)->p_szc != 0 ? PP_GROUPLEADER((pp), (pp)->p_szc) : (pp))
344 * The counter base must be per page_counter element to prevent
345 * races when re-indexing, and the base page size element should
346 * be aligned on a boundary of the given region size.
348 * We also round up the number of pages spanned by the counters
349 * for a given region to PC_BASE_ALIGN in certain situations to simplify
350 * the coding for some non-performance critical routines.
/* 1 (as pfn_t) shifted by PAGE_BSZS_SHIFT of the last page size code. */
#define	PC_BASE_ALIGN							\
	((pfn_t)1 << PAGE_BSZS_SHIFT(MMU_PAGE_SIZES - 1))
/* Low-bit mask matching PC_BASE_ALIGN. */
#define	PC_BASE_ALIGN_MASK	(PC_BASE_ALIGN - 1)
357 * cpu/mmu-dependent vm variables
359 extern uint_t mmu_page_sizes
;
360 extern uint_t mmu_exported_page_sizes
;
362 * page sizes that legacy applications can see via getpagesizes(3c).
363 * Used to prevent legacy applications from inadvertently using the
364 * 'new' large pagesizes (1g and above).
366 extern uint_t mmu_legacy_page_sizes
;
/*
 * User and kernel page size codes are the same on x86, so both
 * conversions return their argument unchanged.
 */
#define	USERSZC_2_SZC(userszc)	(userszc)
#define	SZC_2_USERSZC(szc)	(szc)
373 * for hw_page_map_t, sized to hold the ratio of large page to base
374 * pagesize (1024 max)
/* Counter element type: a signed short. */
typedef short hpmctr_t;
379 * get the setsize of the current cpu - assume homogeneous for x86
extern int l2cache_sz;
extern int l2cache_linesz;
extern int l2cache_assoc;

#define	L2CACHE_ALIGN		l2cache_linesz
#define	L2CACHE_ALIGN_MAX	64

/*
 * l2cache_sz divided by l2cache_assoc; falls back to MMU_PAGESIZE when
 * l2cache_assoc is zero so the division is never by zero.
 */
#define	CPUSETSIZE()							\
	(l2cache_assoc ? (l2cache_sz / l2cache_assoc) : MMU_PAGESIZE)
389 * Return the log2(pagesize(szc) / MMU_PAGESIZE) --- or the shift count
390 * for the number of base pages in this pagesize
/* LEVEL_SHIFT(szc) expressed relative to MMU_PAGESHIFT. */
#define	PAGE_BSZS_SHIFT(szc)						\
	(LEVEL_SHIFT(szc) - MMU_PAGESHIFT)
395 * Internal PG_ flags.
#define	PGI_RELOCONLY	0x00010000	/* opposite of PG_NORELOC */
#define	PGI_NOCAGE	0x00020000	/* cage is disabled */
#define	PGI_PGCPHIPRI	0x00040000	/* page_get_contig_page pri alloc */
#define	PGI_PGCPSZC0	0x00080000	/* relocate base pagesize page */

/*
 * PGI range flags - should not overlap PGI flags
 */
#define	PGI_MT_RANGE0	0x01000000	/* mtype range to 0 */
#define	PGI_MT_RANGE16M	0x02000000	/* mtype range to 16m */
#define	PGI_MT_RANGE4G	0x04000000	/* mtype range to 4g */
#define	PGI_MT_NEXT	0x08000000	/* get next mtype */

/* Union of the three range selectors; PGI_MT_NEXT is not included. */
#define	PGI_MT_RANGE	(PGI_MT_RANGE0 | PGI_MT_RANGE16M | PGI_MT_RANGE4G)
413 * Maximum and default values for user heap, stack, private and shared
414 * anonymous memory, and user text and initialized data.
415 * Used by map_pgsz*() routines.
/* Large page size tunables; each is a size_t defined outside this header. */
extern size_t max_uheap_lpsize;
extern size_t default_uheap_lpsize;
extern size_t max_ustack_lpsize;
extern size_t default_ustack_lpsize;
extern size_t max_privmap_lpsize;
extern size_t max_uidata_lpsize;
extern size_t max_utext_lpsize;
extern size_t max_shm_lpsize;
extern size_t mcntl0_lpsize;
428 * Sanity control. Don't use large pages regardless of user
429 * settings if there's less than priv or shm_lpg_min_physmem memory installed.
430 * The units for this variable are 8K pages.
432 extern pgcnt_t privm_lpg_min_physmem
;
433 extern pgcnt_t shm_lpg_min_physmem
;
436 * hash as and addr to get a bin.
/*
 * Assigns bin from a hash of as and addr: (as >> 4) plus
 * (addr >> PAGESHIFT), masked with page_colors_mask, then shifted right
 * by the hp_shift of szc relative to entry 0.  seg and vp are accepted
 * but unused.  Expands to a bare assignment expression.
 */
#define	AS_2_BIN(as, seg, vp, addr, bin, szc)				\
	bin = (((((uintptr_t)(as) >> 4) +				\
	    ((uintptr_t)(addr) >> PAGESHIFT)) & page_colors_mask) >>	\
	    (hw_page_array[szc].hp_shift - hw_page_array[0].hp_shift))
445 * cpu private vm data - accessed thru CPU->cpu_vm_data
446 * vc_pnum_memseg: tracks last memseg visited in page_numtopp_nolock()
447 * vc_pnext_memseg: tracks last memseg visited in page_nextn()
448 * vc_kmptr: original unaligned kmem pointer for this vm_cpu_data_t
449 * vc_kmsize: original kmem size for this vm_cpu_data_t
453 struct memseg
*vc_pnum_memseg
;
454 struct memseg
*vc_pnext_memseg
;
/*
 * sizeof (vm_cpu_data_t) rounded up with P2ROUNDUP to L2CACHE_ALIGN_MAX,
 * the allocation size that lets a vm_cpu_data_t sit in its own cache line.
 */
#define	VM_CPU_DATA_PADSIZE	\
	(P2ROUNDUP(sizeof (vm_cpu_data_t), L2CACHE_ALIGN_MAX))
464 * When a bin is empty, and we can't satisfy a color request correctly,
465 * we scan. If we assume that the programs have reasonable spatial
466 * behavior, then it will not be a good idea to use the adjacent color.
467 * Using the adjacent color would result in virtually adjacent addresses
468 * mapping into the same spot in the cache. So, if we stumble across
469 * an empty bin, skip a bunch before looking. After the first skip,
470 * then just look one bin at a time so we don't miss our cache on
471 * every look. Be sure to check every bin. Page_create() will panic
474 * This also explains the `<=' in the for loops in both page_get_freelist()
475 * and page_get_cachelist(). Since we checked the target bin, skipped
476 * a bunch, then continued one at a time, we wind up checking the target bin
477 * twice to make sure we get all of the bins.
482 struct vmm_vmstats_str
{
483 ulong_t pgf_alloc
[MMU_PAGE_SIZES
]; /* page_get_freelist */
484 ulong_t pgf_allocok
[MMU_PAGE_SIZES
];
485 ulong_t pgf_allocokrem
[MMU_PAGE_SIZES
];
486 ulong_t pgf_allocfailed
[MMU_PAGE_SIZES
];
487 ulong_t pgf_allocdeferred
;
488 ulong_t pgf_allocretry
[MMU_PAGE_SIZES
];
489 ulong_t pgc_alloc
; /* page_get_cachelist */
491 ulong_t pgc_allocokrem
;
492 ulong_t pgc_allocokdeferred
;
493 ulong_t pgc_allocfailed
;
494 ulong_t pgcp_alloc
[MMU_PAGE_SIZES
]; /* page_get_contig_pages */
495 ulong_t pgcp_allocfailed
[MMU_PAGE_SIZES
];
496 ulong_t pgcp_allocempty
[MMU_PAGE_SIZES
];
497 ulong_t pgcp_allocok
[MMU_PAGE_SIZES
];
498 ulong_t ptcp
[MMU_PAGE_SIZES
]; /* page_trylock_contig_pages */
499 ulong_t ptcpfreethresh
[MMU_PAGE_SIZES
];
500 ulong_t ptcpfailexcl
[MMU_PAGE_SIZES
];
501 ulong_t ptcpfailszc
[MMU_PAGE_SIZES
];
502 ulong_t ptcpfailcage
[MMU_PAGE_SIZES
];
503 ulong_t ptcpok
[MMU_PAGE_SIZES
];
504 ulong_t pgmf_alloc
[MMU_PAGE_SIZES
]; /* page_get_mnode_freelist */
505 ulong_t pgmf_allocfailed
[MMU_PAGE_SIZES
];
506 ulong_t pgmf_allocempty
[MMU_PAGE_SIZES
];
507 ulong_t pgmf_allocok
[MMU_PAGE_SIZES
];
508 ulong_t pgmc_alloc
; /* page_get_mnode_cachelist */
509 ulong_t pgmc_allocfailed
;
510 ulong_t pgmc_allocempty
;
511 ulong_t pgmc_allocok
;
512 ulong_t pladd_free
[MMU_PAGE_SIZES
]; /* page_list_add/sub */
513 ulong_t plsub_free
[MMU_PAGE_SIZES
];
516 ulong_t plsubpages_szcbig
;
517 ulong_t plsubpages_szc0
;
518 ulong_t pfs_req
[MMU_PAGE_SIZES
]; /* page_freelist_split */
519 ulong_t pfs_demote
[MMU_PAGE_SIZES
];
520 ulong_t pfc_coalok
[MMU_PAGE_SIZES
][MAX_MNODE_MRANGES
];
521 ulong_t ppr_reloc
[MMU_PAGE_SIZES
]; /* page_relocate */
522 ulong_t ppr_relocnoroot
[MMU_PAGE_SIZES
];
523 ulong_t ppr_reloc_replnoroot
[MMU_PAGE_SIZES
];
524 ulong_t ppr_relocnolock
[MMU_PAGE_SIZES
];
525 ulong_t ppr_relocnomem
[MMU_PAGE_SIZES
];
526 ulong_t ppr_relocok
[MMU_PAGE_SIZES
];
527 ulong_t ppr_copyfail
;
528 /* page coalesce counter */
529 ulong_t page_ctrs_coalesce
[MMU_PAGE_SIZES
][MAX_MNODE_MRANGES
];
530 /* candidates useful */
531 ulong_t page_ctrs_cands_skip
[MMU_PAGE_SIZES
][MAX_MNODE_MRANGES
];
532 /* ctrs changed after locking */
533 ulong_t page_ctrs_changed
[MMU_PAGE_SIZES
][MAX_MNODE_MRANGES
];
534 /* page_freelist_coalesce failed */
535 ulong_t page_ctrs_failed
[MMU_PAGE_SIZES
][MAX_MNODE_MRANGES
];
536 ulong_t page_ctrs_coalesce_all
; /* page coalesce all counter */
537 ulong_t page_ctrs_cands_skip_all
; /* candidates useful for all func */
538 ulong_t restrict4gcnt
;
539 ulong_t unrestrict16mcnt
; /* non-DMA 16m allocs allowed */
540 ulong_t pgpanicalloc
; /* PG_PANIC allocation */
541 ulong_t pcf_deny
[MMU_PAGE_SIZES
]; /* page_chk_freelist */
542 ulong_t pcf_allow
[MMU_PAGE_SIZES
];
544 extern struct vmm_vmstats_str vmm_vmstats
;
545 #endif /* VM_STATS */
547 extern size_t page_ctrs_sz(void);
548 extern caddr_t
page_ctrs_alloc(caddr_t
);
549 extern void page_ctr_sub(int, int, page_t
*, int);
550 extern page_t
*page_freelist_split(uchar_t
,
551 uint_t
, int, int, pfn_t
, pfn_t
, page_list_walker_t
*);
552 extern page_t
*page_freelist_coalesce(int, uchar_t
, uint_t
, uint_t
, int,
554 extern void page_freelist_coalesce_all(int);
555 extern uint_t
page_get_pagecolors(uint_t
);
556 extern void pfnzero(pfn_t
, uint_t
, uint_t
);
562 #endif /* _VM_DEP_H */