4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2010, Intel Corporation.
26 * All rights reserved.
27 * Copyright 2019, Joyent, Inc.
30 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
31 /* All Rights Reserved */
34 * Portions of this source code were derived from Berkeley 4.3 BSD
35 * under license from the Regents of the University of California.
39 * UNIX machine dependent virtual memory support.
42 #include <sys/types.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
50 #include <sys/cpuvar.h>
55 #include <sys/vnode.h>
58 #include <sys/exechdr.h>
59 #include <sys/debug.h>
60 #include <sys/vmsystm.h>
62 #include <sys/dumphdr.h>
63 #include <sys/random.h>
68 #include <vm/seg_kp.h>
69 #include <vm/seg_vn.h>
71 #include <vm/seg_kmem.h>
72 #include <vm/seg_kpm.h>
73 #include <vm/vm_dep.h>
76 #include <sys/vm_machparam.h>
77 #include <sys/memlist.h>
78 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
79 #include <vm/hat_i86.h>
80 #include <sys/x86_archext.h>
81 #include <sys/elf_386.h>
82 #include <sys/cmn_err.h>
83 #include <sys/archsystm.h>
84 #include <sys/machsystm.h>
85 #include <sys/secflags.h>
87 #include <sys/vtrace.h>
88 #include <sys/ddidmareq.h>
89 #include <sys/promif.h>
90 #include <sys/memnode.h>
91 #include <sys/stack.h>
92 #include <util/qsort.h>
93 #include <sys/taskq.h>
97 #include <sys/hypervisor.h>
98 #include <sys/xen_mmu.h>
99 #include <sys/balloon_impl.h>
102 * domain 0 pages usable for DMA are pre-allocated and kept in
103 * distinct lists, ordered by increasing mfn.
105 static kmutex_t io_pool_lock;
106 static kmutex_t contig_list_lock;
107 static page_t *io_pool_4g;	/* pool for 32 bit dma limited devices */
108 static page_t *io_pool_16m;	/* pool for 24 bit dma limited legacy devices */
109 static long io_pool_cnt;
110 static long io_pool_cnt_max = 0;
111 #define DEFAULT_IO_POOL_MIN	128
112 static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
113 static long io_pool_cnt_lowater = 0;
114 static long io_pool_shrink_attempts;	/* how many times did we try to shrink */
115 static long io_pool_shrinks;	/* how many times did we really shrink */
116 static long io_pool_grows;	/* how many times did we grow */
117 static mfn_t start_mfn = 1;
118 static caddr_t io_pool_kva;	/* use to alloc pages when needed */
120 static int create_contig_pfnlist(uint_t);
123 * percentage of phys mem to hold in the i/o pool
125 #define DEFAULT_IO_POOL_PCT	2
126 static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
127 static void page_io_pool_sub(page_t **, page_t *, page_t *);
132 uint_t vac_colors = 1;
134 int largepagesupport = 0;
135 extern uint_t page_create_new;
136 extern uint_t page_create_exists;
137 extern uint_t page_create_putbacks;
139 * Allow users to disable the kernel's use of SSE.
141 extern int use_sse_pagecopy, use_sse_pagezero;
144 * combined memory ranges from mnode and memranges[] to manage single
145 * mnode/mtype dimension in the page lists.
151 	int mnr_memrange;	/* index into memranges[] */
152 	int mnr_next;		/* next lower PA mnoderange */
154 	/* maintain page list stats */
155 	pgcnt_t mnr_mt_clpgcnt;	/* cache list cnt */
156 	pgcnt_t mnr_mt_flpgcnt[MMU_PAGE_SIZES];	/* free list cnt per szc */
157 	pgcnt_t mnr_mt_totcnt;	/* sum of cache and free lists */
159 	struct mnr_mts {	/* mnode/mtype szc stats */
160 		pgcnt_t mnr_mts_pgcnt;
162 		pgcnt_t *mnr_mtsc_pgcnt;
167 #define MEMRANGEHI(mtype) \
168 	((mtype > 0) ? memranges[mtype - 1] - 1 : physmax)
169 #define MEMRANGELO(mtype) (memranges[mtype])
171 #define MTYPE_FREEMEM(mt) (mnoderanges[mt].mnr_mt_totcnt)
174 * As the PC architecture evolved, memory was clumped into several
175 * ranges for various historical I/O devices to do DMA.
178 * < 4Gig - PCI bus or drivers that don't understand PAE mode
180 * These are listed in reverse order, so that we can skip over unused
181 * ranges on machines with small memories.
183 * For now under the Hypervisor, we'll only ever have one memrange.
185 #define PFN_4GIG 0x100000
186 #define PFN_16MEG 0x1000
187 /* Indices into the memory range (arch_memranges) array. */
192 static pfn_t arch_memranges[NUM_MEM_RANGES] = {
193 	PFN_4GIG,	/* pfn range for 4G and above */
194 	0x80000,	/* pfn range for 2G-4G */
195 	PFN_16MEG,	/* pfn range for 16M-2G */
196 	0x00000,	/* pfn range for 0-16M */
198 pfn_t *memranges = &arch_memranges[0];
199 int nranges = NUM_MEM_RANGES;
202 * This combines mem_node_config and memranges into one data
203 * structure to be used for page list management.
205 static mnoderange_t *mnoderanges;
206 static int mnoderangecnt;
212 * 4g memory management variables for systems with more than 4g of memory:
214 * physical memory below 4g is required for 32bit dma devices and, currently,
215 * for kmem memory. On systems with more than 4g of memory, the pool of memory
216 * below 4g can be depleted without any paging activity given that there is
217 * likely to be sufficient memory above 4g.
219 * physmax4g is set true if the largest pfn is over 4g. The rest of the
220 * 4g memory management code is enabled only when physmax4g is true.
222 * maxmem4g is the count of the maximum number of pages on the page lists
223 * with physical addresses below 4g. It can be a lot less than 4g given that
224 * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
227 * freemem4g maintains the count of the number of available pages on the
228 * page lists with physical addresses below 4g.
230 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
231 * 6% (desfree4gshift = 4) of maxmem4g.
233 * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
234 * and the amount of physical memory above 4g is greater than freemem4g.
235 * In this case, page_get_* routines will restrict below 4g allocations
236 * for requests that don't specifically require it.
239 #define DESFREE4G (maxmem4g >> desfree4gshift)
241 #define RESTRICT4G_ALLOC \
242 (physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))
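/*
 * Editorial note (not in the original source): the (freemem4g << 1) <
 * freemem test is just freemem - freemem4g > freemem4g, i.e. the
 * restriction only engages while more free memory remains above 4g
 * than below it, so allocations can still fall back to below-4g pages
 * once the rest of memory is equally depleted.
 */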
244 static pgcnt_t	maxmem4g;
245 static pgcnt_t	freemem4g;
246 static int	physmax4g;
247 static int	desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
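/*
 * Editorial note: with the default desfree4gshift of 4, DESFREE4G is
 * maxmem4g >> 4, i.e. 1/16th (6.25%) of the below-4g pages; the "6%"
 * quoted in the block comment above is this value rounded down.
 */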
250 * 16m memory management:
252 * reserve some amount of physical memory below 16m for legacy devices.
254 * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
255 * 16m or if the 16m pool drops below DESFREE16M.
257 * In this case, general page allocations via page_get_{free,cache}list
258 * routines will be restricted from allocating from the 16m pool. Allocations
259 * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
260 * are not restricted.
263 #define FREEMEM16M MTYPE_FREEMEM(mtype16m)
264 #define DESFREE16M desfree16m
265 #define RESTRICT16M_ALLOC(freemem, pgcnt, flags) \
266 (mtype16m != -1 && (freemem != 0) && ((flags & PG_PANIC) == 0) && \
267 ((freemem >= (FREEMEM16M)) || \
268 (FREEMEM16M < (DESFREE16M + pgcnt))))
270 static pgcnt_t desfree16m = 0x380;
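/*
 * Editorial note: 0x380 pages is 896 pages, or 3.5MB with 4K pages,
 * kept as a deliberately small reserve given how scarce below-16m
 * memory is.
 */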
273 * This can be patched via /etc/system to allow old non-PAE aware device
274 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
276 int restricted_kmemalloc = 0;
281 ulong_t pga_notfullrange;
282 ulong_t pga_nulldmaattr;
284 ulong_t pga_allocfailed;
286 ulong_t pgma_allocok;
287 ulong_t pgma_allocfailed;
288 ulong_t pgma_allocempty;
292 uint_t mmu_page_sizes;
294 /* How many page sizes the users can see */
295 uint_t mmu_exported_page_sizes;
297 /* page sizes that legacy applications can see */
298 uint_t mmu_legacy_page_sizes;
301 * Number of pages in 1 GB. Don't enable automatic large pages if we have
302 * fewer than this many pages.
304 pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
305 pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
308 * Maximum and default segment size tunables for user private
309 * and shared anon memory, and user text and initialized data.
310 * These can be patched via /etc/system to allow large pages
311 * to be used for mapping application private and shared anon memory.
313 size_t mcntl0_lpsize = MMU_PAGESIZE;
314 size_t max_uheap_lpsize = MMU_PAGESIZE;
315 size_t default_uheap_lpsize = MMU_PAGESIZE;
316 size_t max_ustack_lpsize = MMU_PAGESIZE;
317 size_t default_ustack_lpsize = MMU_PAGESIZE;
318 size_t max_privmap_lpsize = MMU_PAGESIZE;
319 size_t max_uidata_lpsize = MMU_PAGESIZE;
320 size_t max_utext_lpsize = MMU_PAGESIZE;
321 size_t max_shm_lpsize = MMU_PAGESIZE;
325 * initialized by page_coloring_init().
328 uint_t page_colors_mask;
329 uint_t page_coloring_shift;
331 static uint_t l2_colors;
334 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
335 * and page_colors are calculated from the l2 cache n-way set size. Within a
336 * mnode range, the page freelist and cachelist are hashed into bins based on
337 * color. This makes it easier to search for a page within a specific memory
340 #define PAGE_COLORS_MIN 16
342 page_t ****page_freelists;
343 page_t ***page_cachelists;
347 * Used by page layer to know about page sizes
349 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
351 kmutex_t *fpc_mutex[NPC_MUTEX];
352 kmutex_t *cpc_mutex[NPC_MUTEX];
354 /* Lock to protect mnoderanges array for memory DR operations. */
355 static kmutex_t mnoderange_lock;
358 * Only let one thread at a time try to coalesce large pages, to
359 * prevent them from working against each other.
361 static kmutex_t	contig_lock;
362 #define CONTIG_LOCK() mutex_enter(&contig_lock);
363 #define CONTIG_UNLOCK() mutex_exit(&contig_lock);
365 #define PFN_16M (mmu_btop((uint64_t)0x1000000))
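/*
 * Editorial note: 0x1000000 bytes is 16MB, so PFN_16M evaluates to
 * pfn 0x1000 with 4K pages, matching the PFN_16MEG define earlier in
 * this file.
 */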
368 i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
374 	addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);
376 	for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
377 		pp = page_numtopp_nolock(pf);
379 			hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
380 			    prot | HAT_NOSYNC, HAT_LOAD_LOCK);
382 			hat_memload(kas.a_hat, addr, pp,
383 			    prot | HAT_NOSYNC, HAT_LOAD_LOCK);
391 * This routine is like page_numtopp, but accepts only free pages, which
392 * it allocates (unfrees) and returns with the exclusive lock held.
393 * It is used by machdep.c/dma_init() to find contiguous free pages.
396 page_numtopp_alloc(pfn_t pfnum)
401 	pp = page_numtopp_nolock(pfnum);
406 	if (!page_trylock(pp, SE_EXCL)) {
410 	if (page_pptonum(pp) != pfnum) {
415 	if (!PP_ISFREE(pp)) {
420 		page_demote_free_pages(pp);
425 	/* If associated with a vnode, destroy mappings */
429 		page_destroy_free(pp);
431 	if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
435 	if (page_pptonum(pp) != pfnum) {
441 	if (!PP_ISFREE(pp)) {
446 	if (!page_reclaim(pp, (kmutex_t *)NULL))
453 * Return the optimum page size for a given mapping
457 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
460 	size_t pgsz = MMU_PAGESIZE;
464 	ASSERT(maptype != MAPPGSZ_VA);
466 	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
467 		return (MMU_PAGESIZE);
473 	max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
474 	    MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
475 	if (max_lpsize == MMU_PAGESIZE) {
476 		return (MMU_PAGESIZE);
479 	len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
480 	    p->p_brksize - p->p_bssbase : p->p_stksize;
482 	len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
483 	    default_uheap_lpsize) : MAX(len, default_ustack_lpsize);
486 	 * use the page size that best fits len
488 	for (l = mmu.umax_page_level; l > 0; --l) {
489 		if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
492 		pgsz = LEVEL_SIZE(l);
497 	mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
499 	if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
500 		pgsz = hw_page_array[mszc].hp_size;
505 	for (l = mmu.umax_page_level; l > 0; --l) {
506 		if (len >= LEVEL_SIZE(l))
507 			return (LEVEL_SIZE(l));
509 	return (LEVEL_SIZE(0));
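/*
 * Editorial illustration (hypothetical sizes, not from the original
 * source): for a heap request where len works out to 3MB and
 * LEVEL_SIZE(1) is the 2MB large page, the final loop returns
 * LEVEL_SIZE(1) since 3MB >= 2MB, while a 1MB heap falls through to
 * LEVEL_SIZE(0), the base page size.
 */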
515 map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
518 	caddr_t eaddr = addr + size;
525 	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
529 	for (i = mmu_exported_page_sizes - 1; i > 0; i--) {
530 		pgsz = page_get_pagesize(i);
531 		if (pgsz > max_lpsize) {
534 		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
535 		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
536 		if (raddr < addr || raddr >= readdr) {
539 		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
543 		 * Set szcvec to the remaining page sizes.
545 		szcvec = ((1 << (i + 1)) - 1) & ~1;
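/*
 * Editorial note: ((1 << (i + 1)) - 1) & ~1 sets bits 1..i and clears
 * bit 0, e.g. i == 2 yields binary 110: size codes 1 and 2 are
 * advertised while the base page size is left implicit.
 */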
552 * Return a bit vector of large page size codes that
553 * can be used to map [addr, addr + len) region.
557 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
560 	size_t max_lpsize = mcntl0_lpsize;
562 	if (mmu.max_page_level == 0)
565 	if (flags & MAP_TEXT) {
567 		max_lpsize = max_utext_lpsize;
568 		return (map_szcvec(addr, size, off, max_lpsize,
569 		    shm_lpg_min_physmem));
571 	} else if (flags & MAP_INITDATA) {
573 		max_lpsize = max_uidata_lpsize;
574 		return (map_szcvec(addr, size, off, max_lpsize,
575 		    privm_lpg_min_physmem));
577 	} else if (type == MAPPGSZC_SHM) {
579 		max_lpsize = max_shm_lpsize;
580 		return (map_szcvec(addr, size, off, max_lpsize,
581 		    shm_lpg_min_physmem));
583 	} else if (type == MAPPGSZC_HEAP) {
585 		max_lpsize = max_uheap_lpsize;
586 		return (map_szcvec(addr, size, off, max_lpsize,
587 		    privm_lpg_min_physmem));
589 	} else if (type == MAPPGSZC_STACK) {
591 		max_lpsize = max_ustack_lpsize;
592 		return (map_szcvec(addr, size, off, max_lpsize,
593 		    privm_lpg_min_physmem));
597 	max_lpsize = max_privmap_lpsize;
598 	return (map_szcvec(addr, size, off, max_lpsize,
599 	    privm_lpg_min_physmem));
604 * Handle a pagefault.
609 	enum fault_type type,
624 	ASSERT_STACK_ALIGNED();
626 	if (INVALID_VADDR(addr))
629 	mapped_red = segkp_map_red();
642 	 * Dispatch pagefault.
644 	res = as_fault(hat, as, addr, 1, type, rw);
647 	 * If this isn't a potential unmapped hole in the user's
648 	 * UNIX data or stack segments, just return status info.
650 	if (res != FC_NOMAP || iskernel)
654 	 * Check to see if we happened to fault on a currently unmapped
655 	 * part of the UNIX data or stack segments. If so, create a zfod
656 	 * mapping there and then try calling the fault routine again.
661 	if (addr < base || addr >= base + len) {	/* data seg? */
662 		base = (caddr_t)p->p_usrstack - p->p_stksize;
664 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
665 			/* not in either UNIX data or stack segments */
672 	 * the rest of this function implements a 3.X 4.X 5.X compatibility
673 	 * This code is probably not needed anymore
675 	if (p->p_model == DATAMODEL_ILP32) {
677 		/* expand the gap to the page boundaries on each side */
678 		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
679 		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
680 		len = ea - (uintptr_t)base;
683 	if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
685 		err = as_map(as, base, len, segvn_create, zfod_argsp);
688 			res = FC_MAKE_ERR(err);
693 	 * This page is already mapped by another thread after
694 	 * we returned from as_fault() above. We just fall
695 	 * through to as_fault() below.
700 	res = as_fault(hat, as, addr, 1, F_INVAL, rw);
711 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
713 	struct proc *p = curproc;
714 	caddr_t userlimit = (flags & _MAP_LOW32) ?
715 	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;
717 	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
722 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
728 * The maximum amount a randomized mapping will be slewed. We should perhaps
729 * arrange things so these tunables can be separate for mmap, mmapobj, and
732 size_t aslr_max_map_skew
= 256 * 1024 * 1024; /* 256MB */
735 * map_addr_proc() is the routine called when the system is to
736 * choose an address for the user. We will pick an address
737 * range which is the highest available below userlimit.
739 * Every mapping will have a redzone of a single page on either side of
740 * the request. This is done to leave one page unmapped between segments.
741 * This is not required, but it's useful for the user because if their
742 * program strays across a segment boundary, it will catch a fault
743 * immediately making debugging a little easier. Currently the redzone
746 * addrp is a value/result parameter.
747 * On input it is a hint from the user to be used in a completely
748 * machine dependent fashion. We decide to completely ignore this hint.
749 * If MAP_ALIGN was specified, addrp contains the minimal alignment, which
750 * must be some "power of two" multiple of pagesize.
752 * On output it is NULL if no address can be found in the current
753 * process's address space or else an address that is currently
754 * not mapped for len bytes with a page of red zone on either side.
756 * vacalign is not needed on x86 (it's for virtually addressed caches)
769 	struct as *as = p->p_as;
775 	ASSERT32(userlimit == as->a_userlimit);
778 	if (p->p_model == DATAMODEL_NATIVE) {
779 		if (userlimit < as->a_userlimit) {
781 			 * This happens when a program wants to map
782 			 * something in a range that's accessible to a
783 			 * program in a smaller address space.  For example,
784 			 * a 64-bit program calling mmap32(2) to guarantee
785 			 * that the returned address is below 4Gbytes.
787 			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));
789 			if (userlimit > base)
790 				slen = userlimit - base;
797 			 * With the stack positioned at a higher address than
798 			 * the heap for 64-bit processes, it is necessary to be
799 			 * mindful of its location and potential size.
801 			 * Unallocated space above the top of the stack (that
802 			 * is, at a lower address) but still within the bounds
803 			 * of the stack limit should be considered unavailable.
805 			 * As the 64-bit stack guard is mapped in immediately
806 			 * adjacent to the stack limit boundary, this prevents
807 			 * new mappings from having accidentally dangerous
808 			 * proximity to the stack.
810 			slen = p->p_usrstack - base -
811 			    ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
814 		slen = userlimit - base;
817 	/* Make len be a multiple of PAGESIZE */
818 	len = (len + PAGEOFFSET) & PAGEMASK;
821 	 * figure out what the alignment should be
823 	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
825 	if (len <= ELF_386_MAXPGSZ) {
827 		 * Align virtual addresses to ensure that ELF shared libraries
828 		 * are mapped with the appropriate alignment constraints by
829 		 * the run-time linker.
831 		align_amount = ELF_386_MAXPGSZ;
834 		 * For 32-bit processes, only those which have specified
835 		 * MAP_ALIGN and an addr will be aligned on a larger page size.
836 		 * Not doing so can potentially waste up to 1G of process
839 		int lvl = (p->p_model == DATAMODEL_ILP32) ? 1 :
842 		while (lvl && len < LEVEL_SIZE(lvl))
845 		align_amount = LEVEL_SIZE(lvl);
847 	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
848 		align_amount = (uintptr_t)*addrp;
850 	ASSERT(ISP2(align_amount));
851 	ASSERT(align_amount == 0 || align_amount >= PAGESIZE);
853 	off = off & (align_amount - 1);
856 	 * Look for a large enough hole starting below userlimit.
857 	 * After finding it, use the upper part.
859 	if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
860 	    PAGESIZE, off) == 0) {
864 		 * addr is the highest possible address to use since we have
865 		 * a PAGESIZE redzone at the beginning and end.
867 		addr = base + slen - (PAGESIZE + len);
870 		 * Round address DOWN to the alignment amount and
872 		 * If addr is greater than as_addr, len would not be large
873 		 * enough to include the redzone, so we must adjust down
874 		 * by the alignment amount.
876 		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
877 		addr += (uintptr_t)off;
878 		if (addr > as_addr) {
879 			addr -= align_amount;
883 		 * If randomization is requested, slew the allocation
884 		 * backwards, within the same gap, by a random amount.
886 		if (flags & _MAP_RANDOMIZE) {
889 			(void) random_get_pseudo_bytes((uint8_t *)&slew,
892 			slew = slew % MIN(aslr_max_map_skew, (addr - base));
893 			addr -= P2ALIGN(slew, align_amount);
897 		ASSERT(addr + len < base + slen);
898 		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
902 		*addrp = NULL;	/* no more virtual space */
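/*
 * Editorial illustration (hypothetical numbers): if align_amount is
 * 4MB and the random slew comes out as 5MB, P2ALIGN(slew, 4MB) rounds
 * the slew down to 4MB, so the chosen address moves by a whole number
 * of aligned steps and keeps the required alignment.
 */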
906 int valid_va_range_aligned_wraparound;
909 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
910 * addresses at least "minlen" long, where the base of the range is at "off"
911 * phase from an "align" boundary and there is space for a "redzone"-sized
912 * redzone on either side of the range. On success, 1 is returned and *basep
913 * and *lenp are adjusted to describe the acceptable range (including
914 * the redzone). On failure, 0 is returned.
918 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
919     size_t align, size_t redzone, size_t off)
924 	ASSERT(align == 0 ? off == 0 : off < align);
926 	ASSERT(align == 0 || align >= PAGESIZE);
928 	lo = (uintptr_t)*basep;
930 	tot_len = minlen + 2 * redzone;	/* need at least this much space */
933 	 * If hi rolled over the top, try cutting back.
936 		*lenp = 0UL - lo - 1UL;
937 		/* See if this really happens. If so, then we figure out why */
938 		valid_va_range_aligned_wraparound++;
941 	if (*lenp < tot_len) {
946 	 * Deal with a possible hole in the address range between
947 	 * hole_start and hole_end that should never be mapped.
949 	if (lo < hole_start) {
950 		if (hi > hole_start) {
954 			/* lo < hole_start && hi >= hole_end */
957 				 * prefer lowest range
959 				if (hole_start - lo >= tot_len)
961 				else if (hi - hole_end >= tot_len)
967 				 * prefer highest range
969 				if (hi - hole_end >= tot_len)
971 				else if (hole_start - lo >= tot_len)
979 	/* lo >= hole_start */
986 	if (hi - lo < tot_len)
990 	uintptr_t tlo = lo + redzone;
991 	uintptr_t thi = hi - redzone;
992 	tlo = (uintptr_t)P2PHASEUP(tlo, align, off);
993 	if (tlo < lo + redzone) {
996 	if (thi < tlo || thi - tlo < minlen) {
1001 	*basep = (caddr_t)lo;
1007 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
1008 * addresses at least "minlen" long. On success, 1 is returned and *basep
1009 * and *lenp are adjusted to describe the acceptable range. On failure, 0
1013 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
1015 	return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
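/*
 * Editorial note: valid_va_range() is simply the aligned variant with
 * align, redzone, and off all zero, reducing it to a plain check for
 * a minlen-sized mappable range.
 */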
1019 * Default to forbidding the first 64k of address space. This protects most
1020 * reasonably sized structures from dereferences through NULL:
1023 uintptr_t forbidden_null_mapping_sz = 0x10000;
1026 * Determine whether [addr, addr+len] are valid user addresses.
1030 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
1033 	caddr_t eaddr = addr + len;
1035 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
1036 		return (RANGE_BADADDR);
1038 	if ((addr <= (caddr_t)forbidden_null_mapping_sz) &&
1039 	    as->a_proc != NULL &&
1040 	    secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP))
1041 		return (RANGE_BADADDR);
1044 	 * Check for the VA hole
1046 	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
1047 		return (RANGE_BADADDR);
1049 	return (RANGE_OKAY);
1053 * Return 1 if the page frame is onboard memory, else 0.
1056 pf_is_memory(pfn_t pf)
1058 	if (pfn_is_foreign(pf))
1060 	return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
1064 * return the memrange containing pfn
1067 memrange_num(pfn_t pfn)
1071 	for (n = 0; n < nranges - 1; ++n) {
1072 		if (pfn >= memranges[n])
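/*
 * Editorial illustration: arch_memranges is scanned from index 0
 * (the highest base) downward, so a pfn of 0x90000 (2.25G with 4K
 * pages) fails the >= PFN_4GIG test but satisfies >= 0x80000 and
 * lands in the 2G-4G memrange at index 1.
 */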
1079 * return the mnoderange containing pfn
1083 pfn_2_mtype(pfn_t pfn)
1090 	/* Always start from highest pfn and work our way down */
1091 	for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1092 		if (pfn >= mnoderanges[n].mnr_pfnlo) {
1102 * is_contigpage_free:
1103 * returns a page list of contiguous pages. It minimally has to return
1104 * minctg pages. Caller determines minctg based on the scatter-gather
1107 * pfnp is set to the next page frame to search on return.
1120 	page_t	*plist = NULL;
1123 	 * fail if pfn + minctg crosses a segment boundary.
1124 	 * Adjust for next starting pfn to begin at segment boundary.
1127 	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
1128 		*pfnp = roundup(*pfnp, pfnseg + 1);
1134 		pp = page_numtopp_nolock(pfn + i);
1135 		if ((pp == NULL) || IS_DUMP_PAGE(pp) ||
1136 		    (page_trylock(pp, SE_EXCL) == 0)) {
1140 		if (page_pptonum(pp) != pfn + i) {
1145 		if (!(PP_ISFREE(pp))) {
1151 		if (!PP_ISAGED(pp)) {
1152 			page_list_sub(pp, PG_CACHE_LIST);
1153 			page_hashout(pp, (kmutex_t *)NULL);
1155 			page_list_sub(pp, PG_FREE_LIST);
1160 		page_list_concat(&plist, &pp);
1163 		 * exit loop when pgcnt satisfied or segment boundary reached.
1166 	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));
1168 	*pfnp += i;		/* set to next pfn to search */
1176 	 * failure: minctg not satisfied.
1178 	 * if next request crosses segment boundary, set next pfn
1179 	 * to search from the segment boundary.
1181 	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
1182 		*pfnp = roundup(*pfnp, pfnseg + 1);
1184 	/* clean up any pages already allocated */
1188 		page_sub(&plist, pp);
1189 		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
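/*
 * Editorial illustration (hypothetical attribute): with dma_attr_seg
 * of 0xffffff (16MB segments), pfnseg is 0xfff; a start pfn of 0xffe
 * with minctg 4 gives (0xffe + 3) & 0xfff == 0x001, which is less
 * than 0xffe & 0xfff, flagging the segment crossing, and
 * roundup(0xffe, 0x1000) restarts the search at pfn 0x1000.
 */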
1200 * verify that pages being returned from allocator have correct DMA attribute
1203 #define check_dma(a, b, c) (void)(0)
1206 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
1208 	if (dma_attr == NULL)
1212 		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
1213 		    dma_attr->dma_attr_addr_lo)
1214 			panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp);
1215 		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
1216 		    dma_attr->dma_attr_addr_hi)
1217 			panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp);
1225 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
1231 	page_t	*pplist = NULL, *plist;
1233 	pgcnt_t	pfnalign = 0;
1234 	static pfn_t	startpfn;
1235 	static pgcnt_t	lastctgcnt;
1241 	lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
1242 	hi = mmu_btop(mattr->dma_attr_addr_hi);
1245 	sgllen = mattr->dma_attr_sgllen;
1246 	pfnseg = mmu_btop(mattr->dma_attr_seg);
1248 	align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
1249 	if (align > MMU_PAGESIZE)
1250 		pfnalign = mmu_btop(align);
1253 	 * in order to satisfy the request, must minimally
1254 	 * acquire minctg contiguous pages
1256 	minctg = howmany(*pgcnt, sgllen);
1261 	 * start from where last searched if the minctg >= lastctgcnt
1263 	if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
1269 	pfnseg = mmu.highest_pfn;
1272 	if (minctg < lastctgcnt)
1275 	lastctgcnt = minctg;
1277 	ASSERT(pfnseg + 1 >= (uint64_t)minctg);
1279 	/* conserve 16m memory - start search above 16m when possible */
1280 	if (hi > PFN_16M && startpfn < PFN_16M)
1285 		pfn = P2ROUNDUP(pfn, pfnalign);
1287 	while (pfn + minctg - 1 <= hi) {
1289 		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1291 			page_list_concat(&pplist, &plist);
1294 			 * return when contig pages no longer needed
1296 			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1299 				check_dma(mattr, pplist, *pgcnt);
1302 			minctg = howmany(*pgcnt, sgllen);
1305 			pfn = P2ROUNDUP(pfn, pfnalign);
1308 	/* cannot find contig pages in specified range */
1309 	if (startpfn == lo) {
1314 	/* did not start with lo previously */
1317 	pfn = P2ROUNDUP(pfn, pfnalign);
1319 	/* allow search to go above startpfn */
1320 	while (pfn < startpfn) {
1322 		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1323 		if (plist != NULL) {
1325 			page_list_concat(&pplist, &plist);
1329 			 * return when contig pages no longer needed
1331 			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1334 				check_dma(mattr, pplist, *pgcnt);
1337 			minctg = howmany(*pgcnt, sgllen);
1340 			pfn = P2ROUNDUP(pfn, pfnalign);
1348 * mnode_range_cnt() calculates the number of memory ranges for mnode and
1349 * memranges[]. Used to determine the size of page lists and mnoderanges.
1352 mnode_range_cnt(int mnode)
1361 	if (mem_node_config[mnode].exists != 0) {
1364 		/* find the memranges index below contained in mnode range */
1366 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1370 		 * increment mnode range counter when memranges or mnode
1371 		 * boundary is reached.
1374 		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1376 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1382 	ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
1388 mnoderange_cmp(const void *v1, const void *v2)
1390 	const mnoderange_t *m1 = v1;
1391 	const mnoderange_t *m2 = v2;
1393 	if (m1->mnr_pfnlo < m2->mnr_pfnlo)
1395 	return (m1->mnr_pfnlo > m2->mnr_pfnlo);
1399 mnode_range_setup(mnoderange_t *mnoderanges)
1405 	for (mnode = 0, nr_ranges = 0, mp = mnoderanges;
1406 	    mnode < max_mem_nodes; mnode++) {
1407 		ssize_t mri = nranges - 1;
1409 		if (mem_node_config[mnode].exists == 0)
1412 		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1415 		while (mri >= 0 && mem_node_config[mnode].physmax >=
1417 			mp->mnr_pfnlo = MAX(MEMRANGELO(mri),
1418 			    mem_node_config[mnode].physbase);
1419 			mp->mnr_pfnhi = MIN(MEMRANGEHI(mri),
1420 			    mem_node_config[mnode].physmax);
1421 			mp->mnr_mnode = mnode;
1422 			mp->mnr_memrange = mri;
1427 			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1435 	 * mnoderangecnt can be larger than nr_ranges when memory DR is
1436 	 * supposedly supported.
1438 	VERIFY3U(nr_ranges, <=, mnoderangecnt);
1440 	qsort(mnoderanges, nr_ranges, sizeof (mnoderange_t), mnoderange_cmp);
1443 	 * If some intrepid soul takes the axe to the memory DR code, we can
1444 	 * remove ->mnr_next altogether, as we just sorted by ->mnr_pfnlo order.
1446 	 * The VERIFY3U() above can be "==" then too.
1448 	for (size_t i = 1; i < nr_ranges; i++)
1449 		mnoderanges[i].mnr_next = i - 1;
1451 	mtypetop = nr_ranges - 1;
1452 	mtype16m = pfn_2_mtype(PFN_16MEG - 1); /* Can be -1 ... */
1454 	mtype4g = pfn_2_mtype(0xfffff);
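/*
 * Editorial note: 0xfffff is PFN_4GIG - 1, the last pfn below 4G, so
 * mtype4g identifies the mtype covering the top of below-4g memory.
 */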
1459 * Update mnoderanges for memory hot-add DR operations.
1462 mnode_range_add(int mnode)
1467 	extern	void membar_sync(void);
1469 	ASSERT(0 <= mnode && mnode < max_mem_nodes);
1470 	ASSERT(mem_node_config[mnode].exists);
1471 	start = mem_node_config[mnode].physbase;
1472 	end = mem_node_config[mnode].physmax;
1473 	ASSERT(start <= end);
1474 	mutex_enter(&mnoderange_lock);
1477 	/* Check whether it interleaves with other memory nodes. */
1478 	for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1479 		ASSERT(mnoderanges[n].mnr_exists);
1480 		if (mnoderanges[n].mnr_mnode == mnode)
1482 		ASSERT(start > mnoderanges[n].mnr_pfnhi ||
1483 		    end < mnoderanges[n].mnr_pfnlo);
1488 	while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1490 	while (mri >= 0 && mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1491 		/* Check whether mtype already exists. */
1492 		for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1493 			if (mnoderanges[n].mnr_mnode == mnode &&
1494 			    mnoderanges[n].mnr_memrange == mri) {
1495 				mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri),
1497 				mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri),
1503 		/* Add a new entry if it doesn't exist yet. */
1505 		/* Try to find an unused entry in mnoderanges array. */
1506 		for (n = 0; n < mnoderangecnt; n++) {
1507 			if (mnoderanges[n].mnr_exists == 0)
1510 		ASSERT(n < mnoderangecnt);
1511 		mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), start);
1512 		mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), end);
1513 		mnoderanges[n].mnr_mnode = mnode;
1514 		mnoderanges[n].mnr_memrange = mri;
1515 		mnoderanges[n].mnr_exists = 1;
1516 		/* Page 0 should always be present. */
1517 		for (prev = &mtypetop;
1518 		    mnoderanges[*prev].mnr_pfnlo > start;
1519 		    prev = &mnoderanges[*prev].mnr_next) {
1520 			ASSERT(mnoderanges[*prev].mnr_next >= 0);
1521 			ASSERT(mnoderanges[*prev].mnr_pfnlo > end);
1523 		mnoderanges[n].mnr_next = *prev;
1528 		if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1534 	mutex_exit(&mnoderange_lock);
1538 * Update mnoderanges for memory hot-removal DR operations.
1541 mnode_range_del(int mnode)
1543 	_NOTE(ARGUNUSED(mnode));
1544 	ASSERT(0 <= mnode && mnode < max_mem_nodes);
1545 	/* TODO: support deletion operation. */
1550 plat_slice_add(pfn_t start, pfn_t end)
1552 	mem_node_add_slice(start, end);
1553 	if (plat_dr_enabled()) {
1554 		mnode_range_add(PFN_2_MEM_NODE(start));
1559 plat_slice_del(pfn_t start, pfn_t end)
1561 	ASSERT(PFN_2_MEM_NODE(start) == PFN_2_MEM_NODE(end));
1562 	ASSERT(plat_dr_enabled());
1563 	mnode_range_del(PFN_2_MEM_NODE(start));
1564 	mem_node_del_slice(start, end);
1570 mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz)
1572 	int mtype = mtypetop;
1575 	if (RESTRICT4G_ALLOC) {
1576 		VM_STAT_ADD(vmm_vmstats.restrict4gcnt);
1577 		/* here only for > 4g systems */
1578 		*flags |= PGI_MT_RANGE4G;
1579 	} else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) {
1580 		*flags |= PGI_MT_RANGE16M;
1582 		VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1583 		VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc);
1584 		*flags |= PGI_MT_RANGE0;
1591 /* mtype init for page_get_replacement_page */
1594 mtype_pgr_init(int *flags, page_t *pp, pgcnt_t pgcnt)
1596 	int mtype = mtypetop;
1598 	if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) {
1599 		*flags |= PGI_MT_RANGE16M;
1601 		VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1602 		*flags |= PGI_MT_RANGE0;
1609 * Determine if the mnode range specified in mtype contains memory belonging
1610 * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains
1611 * the range from high pfn to 0, 16m or 4g.
1613 * Return first mnode range type index found otherwise return -1 if none found.
1616 mtype_func(int mnode, int mtype, uint_t flags)
1618 	if (flags & PGI_MT_RANGE) {
1619 		int	mnr_lim = MRI_0;
1621 		if (flags & PGI_MT_NEXT) {
1622 			mtype = mnoderanges[mtype].mnr_next;
1624 		if (flags & PGI_MT_RANGE4G)
1625 			mnr_lim = MRI_4G;	/* exclude 0-4g range */
1626 		else if (flags & PGI_MT_RANGE16M)
1627 			mnr_lim = MRI_16M;	/* exclude 0-16m range */
1628 		while (mtype != -1 &&
1629 		    mnoderanges[mtype].mnr_memrange <= mnr_lim) {
1630 			if (mnoderanges[mtype].mnr_mnode == mnode)
1632 			mtype = mnoderanges[mtype].mnr_next;
1634 	} else if (mnoderanges[mtype].mnr_mnode == mnode) {
1641 * Update the page list max counts with the pfn range specified by the
1645 mtype_modify_max(pfn_t startpfn, long cnt)
1649 	spgcnt_t scnt = (spgcnt_t)(cnt);
1650 	pgcnt_t acnt = ABS(scnt);
1651 	pfn_t endpfn = startpfn + acnt;
1658 	for (pfn = endpfn; pfn > startpfn; ) {
1659 		ASSERT(mtype != -1);
1660 		lo = mnoderanges[mtype].mnr_pfnlo;
1662 		if (startpfn >= lo) {
1663 			inc = pfn - startpfn;
1667 			if (mnoderanges[mtype].mnr_memrange != MRI_4G) {
1675 		mtype = mnoderanges[mtype].mnr_next;
1680 mtype_2_mrange(int mtype)
1682 	return (mnoderanges[mtype].mnr_memrange);
1686 mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi)
1688 	_NOTE(ARGUNUSED(mnode));
1689 	ASSERT(mnoderanges[mtype].mnr_mnode == mnode);
1690 	*pfnlo = mnoderanges[mtype].mnr_pfnlo;
1691 	*pfnhi = mnoderanges[mtype].mnr_pfnhi;
1695 plcnt_sz(size_t ctrs_sz)
1700 	ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes;
1701 	for (szc = 0; szc < mmu_page_sizes; szc++) {
1702 		colors = page_get_pagecolors(szc);
1703 		ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors;
1710 plcnt_init(caddr_t addr)
1713 	int	mt, szc, colors;
1715 	for (mt = 0; mt < mnoderangecnt; mt++) {
1716 		mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr;
1717 		addr += (sizeof (struct mnr_mts) * mmu_page_sizes);
1718 		for (szc = 0; szc < mmu_page_sizes; szc++) {
1719 			colors = page_get_pagecolors(szc);
1720 			mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors;
1721 			mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt =
1723 			addr += (sizeof (pgcnt_t) * colors);
1731 plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags)
1733 	_NOTE(ARGUNUSED(pp));
1735 	int	bin = PP_2_BIN(pp);
1737 	atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt);
1738 	atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin],
1741 	ASSERT(mtype == PP_2_MTYPE(pp));
1742 	if (physmax4g && mnoderanges[mtype].mnr_memrange != MRI_4G)
1743 		atomic_add_long(&freemem4g, cnt);
1744 	if (flags & PG_CACHE_LIST)
1745 		atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt);
1747 		atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt);
1748 	atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt);
1752 * Returns the free page count for mnode
1755 mnode_pgcnt(int mnode)
1757 	int	mtype = mtypetop;
1758 	int	flags = PGI_MT_RANGE0;
1761 	mtype = mtype_func(mnode, mtype, flags);
1763 	while (mtype != -1) {
1764 		pgcnt += MTYPE_FREEMEM(mtype);
1765 		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1771 * Initialize page coloring variables based on the l2 cache parameters.
1772 * Calculate and return memory needed for page coloring data structures.
1775 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
1777 	_NOTE(ARGUNUSED(l2_linesz));
1784 	 * Hypervisor domains currently don't have any concept of NUMA.
1785 	 * Hence we'll act like there is only 1 memrange.
1787 	i = memrange_num(1);
1790 	 * Reduce the memory ranges lists if we don't have large amounts
1791 	 * of memory. This avoids searching known empty free lists.
1792 	 * To support memory DR operations, we need to keep memory ranges
1793 	 * for possible memory hot-add operations.
1795 	if (plat_dr_physmax > physmax)
1796 		i = memrange_num(plat_dr_physmax);
1798 		i = memrange_num(physmax);
1799 	/* physmax greater than 4g */
1806 	ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES);
1808 	ASSERT(ISP2(l2_linesz));
1809 	ASSERT(l2_sz > MMU_PAGESIZE);
1811 	/* l2_assoc is 0 for fully associative l2 cache */
1813 	l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
1817 	ASSERT(ISP2(l2_colors));
1819 	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
1820 	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
1823 	 * cpu_page_colors is non-zero when a page color may be spread across
1826 	if (l2_colors < page_colors)
1827 		cpu_page_colors = l2_colors;
1829 	ASSERT(ISP2(page_colors));
1831 	page_colors_mask = page_colors - 1;
1833 	ASSERT(ISP2(CPUSETSIZE()));
1834 	page_coloring_shift = lowbit(CPUSETSIZE());
1836 	/* initialize number of colors per page size */
1837 	for (i = 0; i <= mmu.max_page_level; i++) {
1838 		hw_page_array[i].hp_size = LEVEL_SIZE(i);
1839 		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
1840 		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
1841 		hw_page_array[i].hp_colors = (page_colors_mask >>
1842 		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
1844 		colorequivszc[i] = 0;
1848 	 * The value of cpu_page_colors determines if additional color bins
1849 	 * need to be checked for a particular color in the page_get routines.
1851 	if (cpu_page_colors != 0) {
1853 		int a = lowbit(page_colors) - lowbit(cpu_page_colors);
1857 		for (i = 0; i <= mmu.max_page_level; i++) {
1858 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
1859 				colorequivszc[i] = 0;
1862 			while ((colors >> a) == 0)
1866 			/* higher 4 bits encodes color equiv mask */
1867 			colorequivszc[i] = (a << 4);
1871 	/* factor in colorequiv to check additional 'equivalent' bins. */
1872 	if (colorequiv > 1) {
1874 		int a = lowbit(colorequiv) - 1;
1878 		for (i = 0; i <= mmu.max_page_level; i++) {
1879 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
1882 			while ((colors >> a) == 0)
1884 			if ((a << 4) > colorequivszc[i]) {
1885 				colorequivszc[i] = (a << 4);
1890 	/* size for mnoderanges */
1891 	for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
1892 		mnoderangecnt += mnode_range_cnt(i);
1893 	if (plat_dr_support_memory()) {
1895 		 * Reserve enough space for memory DR operations.
1896 		 * Two extra mnoderanges for possible fragmentations,
1897 		 * one for the 2G boundary and the other for the 4G boundary.
1898 		 * We don't expect a memory board crossing the 16M boundary
1899 		 * for memory hot-add operations on x86 platforms.
1901 		mnoderangecnt += 2 + max_mem_nodes - lgrp_plat_node_cnt;
1903 	colorsz = mnoderangecnt * sizeof (mnoderange_t);
1905 	/* size for fpc_mutex and cpc_mutex */
1906 	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);
1908 	/* size of page_freelists */
1909 	colorsz += mnoderangecnt * sizeof (page_t ***);
1910 	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);
1912 	for (i = 0; i < mmu_page_sizes; i++) {
1913 		colors = page_get_pagecolors(i);
1914 		colorsz += mnoderangecnt * colors * sizeof (page_t *);
1917 	/* size of page_cachelists */
1918 	colorsz += mnoderangecnt * sizeof (page_t **);
1919 	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);
1925 * Called once at startup to configure page_coloring data structures and
1926 * does the 1st page_free()/page_freelist_add().
1929 page_coloring_setup(caddr_t pcmemaddr)
1938 	 * do page coloring setup
1942 	mnoderanges = (mnoderange_t *)addr;
1943 	addr += (mnoderangecnt * sizeof (mnoderange_t));
1945 	mnode_range_setup(mnoderanges);
1947 	for (k = 0; k < NPC_MUTEX; k++) {
1948 		fpc_mutex[k] = (kmutex_t *)addr;
1949 		addr += (max_mem_nodes * sizeof (kmutex_t));
1951 	for (k = 0; k < NPC_MUTEX; k++) {
1952 		cpc_mutex[k] = (kmutex_t *)addr;
1953 		addr += (max_mem_nodes * sizeof (kmutex_t));
1955 	page_freelists = (page_t ****)addr;
1956 	addr += (mnoderangecnt * sizeof (page_t ***));
1958 	page_cachelists = (page_t ***)addr;
1959 	addr += (mnoderangecnt * sizeof (page_t **));
1961 	for (i = 0; i < mnoderangecnt; i++) {
1962 		page_freelists[i] = (page_t ***)addr;
1963 		addr += (mmu_page_sizes * sizeof (page_t **));
1965 		for (j = 0; j < mmu_page_sizes; j++) {
1966 			colors = page_get_pagecolors(j);
1967 			page_freelists[i][j] = (page_t **)addr;
1968 			addr += (colors * sizeof (page_t *));
1970 		page_cachelists[i] = (page_t **)addr;
1971 		addr += (page_colors * sizeof (page_t *));
1977 * Give back 10% of the io_pool pages to the free list.
1978 * Don't shrink the pool below some absolute minimum.
1981 page_io_pool_shrink()
1984 	page_t	*pp, *pp_first, *pp_last, **curpool;
1988 	mutex_enter(&io_pool_lock);
1989 	io_pool_shrink_attempts++;	/* should be a kstat? */
1990 	retcnt = io_pool_cnt / 10;
1991 	if (io_pool_cnt - retcnt < io_pool_cnt_min)
1992 		retcnt = io_pool_cnt - io_pool_cnt_min;
1995 	io_pool_shrinks++;	/* should be a kstat? */
1996 	curpool = &io_pool_4g;
1999 	 * Loop through taking pages from the end of the list
2000 	 * (highest mfns) till amount to return reached.
2002 	for (pp = *curpool; pp && retcnt > 0; ) {
2003 		pp_first = pp_last = pp->p_prev;
2004 		if (pp_first == *curpool)
2008 		page_io_pool_sub(curpool, pp_first, pp_last);
2009 		if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn)
2011 		page_free(pp_first, 1);
2014 	if (retcnt != 0 && !bothpools) {
2016 		 * If not enough found in less constrained pool try the
2017 		 * more constrained one.
2019 		curpool = &io_pool_16m;
2024 	mutex_exit(&io_pool_lock);
2030 page_create_update_flags_x86(uint_t flags)
2034 	 * Check this is an urgent allocation and free pages are depleted.
2036 	if (!(flags & PG_WAIT) && freemem < desfree)
2037 		page_io_pool_shrink();
2040 	 * page_create_get_something may call this because 4g memory may be
2041 	 * depleted. Set flags to allow for relocation of base page below
2045 	flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
2052 bp_color(struct buf *bp)
2060 * Take pages out of an io_pool
2063 page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last)
2065 	if (*poolp == pp_first) {
2066 		*poolp = pp_last->p_next;
2067 		if (*poolp == pp_first)
2070 	pp_first->p_prev->p_next = pp_last->p_next;
2071 	pp_last->p_next->p_prev = pp_first->p_prev;
2072 	pp_first->p_prev = pp_last;
2073 	pp_last->p_next = pp_first;
2077 * Put a page on the io_pool list. The list is ordered by increasing MFN.
2080 page_io_pool_add(page_t **poolp, page_t *pp)
2083 	mfn_t	mfn = mfn_list[pp->p_pagenum];
2085 	if (*poolp == NULL) {
2093 	 * Since we try to take pages from the high end of the pool
2094 	 * chances are good that the pages to be put on the list will
2095 	 * go at or near the end of the list. So start at the end and
2098 	look = (*poolp)->p_prev;
2099 	while (mfn < mfn_list[look->p_pagenum]) {
2100 		look = look->p_prev;
2101 		if (look == (*poolp)->p_prev)
2102 			break;	/* backed all the way to front of list */
2105 	/* insert after look */
2107 	pp->p_next = look->p_next;
2108 	pp->p_next->p_prev = pp;
2110 	if (mfn < mfn_list[(*poolp)->p_pagenum]) {
2112 		 * we inserted a new first list element
2113 		 * adjust pool pointer to newly inserted element
2120 * Add a page to the io_pool. Setting the force flag will force the page
2121 * into the io_pool no matter what.
2124 add_page_to_pool(page_t *pp, int force)
2127 	page_t *freep = NULL;
2129 	mutex_enter(&io_pool_lock);
2131 	 * Always keep the scarce low memory pages
2133 	if (mfn_list[pp->p_pagenum] < PFN_16MEG) {
2135 		page_io_pool_add(&io_pool_16m, pp);
2138 	if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) {
2140 		page_io_pool_add(&io_pool_4g, pp);
2142 		highest = io_pool_4g->p_prev;
2143 		if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) {
2144 			page_io_pool_sub(&io_pool_4g, highest, highest);
2145 			page_io_pool_add(&io_pool_4g, pp);
2152 	mutex_exit(&io_pool_lock);
2154 		page_free(freep, 1);
2158 int contig_pfn_cnt;	/* no of pfns in the contig pfn list */
2159 int contig_pfn_max;	/* capacity of the contig pfn list */
2160 int next_alloc_pfn;	/* next position in list to start a contig search */
2161 int contig_pfnlist_updates;	/* pfn list update count */
2162 int contig_pfnlist_builds;	/* how many times have we (re)built list */
2163 int contig_pfnlist_buildfailed;	/* how many times has list build failed */
2164 int create_contig_pending;	/* nonzero means taskq creating contig list */
2165 pfn_t *contig_pfn_list = NULL;	/* list of contig pfns in ascending mfn order */
2168 * Function to use in sorting a list of pfns by their underlying mfns.
2171 mfn_compare(const void *pfnp1, const void *pfnp2)
2173 	mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1];
2174 	mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2];
2184 * Compact the contig_pfn_list by tossing all the non-contiguous
2185 * elements from the list.
2188 compact_contig_pfn_list(void)
2190 	pfn_t pfn, lapfn, prev_lapfn;
2195 	for (i = 0; i < contig_pfn_cnt - 1; i++) {
2196 		pfn = contig_pfn_list[i];
2197 		lapfn = contig_pfn_list[i + 1];
2198 		mfn = mfn_list[pfn];
2200 		 * See if next pfn is for a contig mfn
2202 		if (mfn_list[lapfn] != mfn + 1)
2205 		 * pfn and lookahead are both put in list
2206 		 * unless pfn is the previous lookahead.
2208 		if (pfn != prev_lapfn)
2209 			contig_pfn_list[newcnt++] = pfn;
2210 		contig_pfn_list[newcnt++] = lapfn;
2213 	for (i = newcnt; i < contig_pfn_cnt; i++)
2214 		contig_pfn_list[i] = 0;
2215 	contig_pfn_cnt = newcnt;
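/*
 * Editorial illustration: if successive entries map to mfns
 * 5, 6, 9, 12, 13, 14, the lookahead keeps the runs {5,6} and
 * {12,13,14} and drops the isolated 9, since only pfns adjacent in
 * mfn space can satisfy a contiguous allocation.
 */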
2220 call_create_contiglist(void *arg)
2222 	(void) create_contig_pfnlist(PG_WAIT);
2226 * Create list of freelist pfns that have underlying
2227 * contiguous mfns.  The list is kept in ascending mfn order.
2228 * returns 1 if list created else 0.
2231 create_contig_pfnlist(uint_t flags)
2237 	mutex_enter(&contig_list_lock);
2238 	if (contig_pfn_list != NULL)
2240 	contig_pfn_max = freemem + (freemem / 10);
2241 	contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t),
2242 	    (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP);
2243 	if (contig_pfn_list == NULL) {
2245 		 * If we could not create the contig list (because
2246 		 * we could not sleep for memory), dispatch a taskq that
2247 		 * can sleep to get the memory.
2249 		if (!create_contig_pending) {
2250 			if (taskq_dispatch(system_taskq, call_create_contiglist,
2251 			    NULL, TQ_NOSLEEP) != TASKQID_INVALID)
2252 				create_contig_pending = 1;
2254 		contig_pfnlist_buildfailed++;	/* count list build failures */
2258 	create_contig_pending = 0;
2259 	ASSERT(contig_pfn_cnt == 0);
2260 	for (pfn = 0; pfn < mfn_count; pfn++) {
2261 		pp = page_numtopp_nolock(pfn);
2262 		if (pp == NULL || !PP_ISFREE(pp))
2264 		contig_pfn_list[contig_pfn_cnt] = pfn;
2265 		if (++contig_pfn_cnt == contig_pfn_max)
2269 	 * Sanity check the new list.
2271 	if (contig_pfn_cnt < 2) { /* no contig pfns */
2273 		contig_pfnlist_buildfailed++;
2274 		kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t));
2275 		contig_pfn_list = NULL;
2280 	qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare);
2281 	compact_contig_pfn_list();
2283 	 * Make sure next search of the newly created contiguous pfn
2284 	 * list starts at the beginning of the list.
2287 	contig_pfnlist_builds++;	/* count list builds */
2289 	mutex_exit(&contig_list_lock);
2295 * Toss the current contig pfnlist. Someone is about to do a massive
2296 * update to pfn<->mfn mappings. So we have them destroy the list and lock
2297 * it till they are done with their update.
2300 clear_and_lock_contig_pfnlist()
2302 	pfn_t *listp = NULL;
2305 	mutex_enter(&contig_list_lock);
2306 	if (contig_pfn_list != NULL) {
2307 		listp = contig_pfn_list;
2308 		listsize = contig_pfn_max * sizeof (pfn_t);
2309 		contig_pfn_list = NULL;
2310 		contig_pfn_max = contig_pfn_cnt = 0;
2313 		kmem_free(listp, listsize);
2317 * Unlock the contig_pfn_list. The next attempted use of it will cause
2318 * it to be re-created.
2321 unlock_contig_pfnlist()
2323 	mutex_exit(&contig_list_lock);
2327 * Update the contiguous pfn list in response to a pfn <-> mfn reassignment
2330 update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn)
2332 	int probe_hi, probe_lo, probe_pos, insert_after, insert_point;
2337 	if (mutex_owner(&contig_list_lock) != curthread) {
2339 		mutex_enter(&contig_list_lock);
2341 	if (contig_pfn_list == NULL)
2343 	contig_pfnlist_updates++;
2345 	 * Find the pfn in the current list.  Use a binary chop to locate it.
2347 	probe_hi = contig_pfn_cnt - 1;
2349 	probe_pos = (probe_hi + probe_lo) / 2;
2350 	while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) {
2351 		if (probe_pos == probe_lo) { /* pfn not in list */
2355 		if (pfn_to_mfn(probe_pfn) <= oldmfn)
2356 			probe_lo = probe_pos;
2358 			probe_hi = probe_pos;
2359 		probe_pos = (probe_hi + probe_lo) / 2;
2361 	if (probe_pos >= 0) {
2363 		 * Remove pfn from list and ensure next alloc
2364 		 * position stays in bounds.
2366 		if (--contig_pfn_cnt <= next_alloc_pfn)
2368 		if (contig_pfn_cnt < 2) { /* no contig pfns */
2370 			kmem_free(contig_pfn_list,
2371 			    contig_pfn_max * sizeof (pfn_t));
2372 			contig_pfn_list = NULL;
2376 		ovbcopy(&contig_pfn_list[probe_pos + 1],
2377 		    &contig_pfn_list[probe_pos],
2378 		    (contig_pfn_cnt - probe_pos) * sizeof (pfn_t));
2380 	if (newmfn == MFN_INVALID)
2383 	 * Check if new mfn has adjacent mfns in the list
2385 	probe_hi = contig_pfn_cnt - 1;
2389 		probe_pos = (probe_hi + probe_lo) / 2;
2390 		probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]);
2391 		if (newmfn == probe_mfn + 1)
2392 			insert_after = probe_pos;
2393 		else if (newmfn == probe_mfn - 1)
2394 			insert_after = probe_pos - 1;
2395 		if (probe_pos == probe_lo)
2397 		if (probe_mfn <= newmfn)
2398 			probe_lo = probe_pos;
2400 			probe_hi = probe_pos;
2401 	} while (insert_after == -2);
2403 	 * If there is space in the list and there are adjacent mfns
2404 	 * insert the pfn in to its proper place in the list.
2406 	if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) {
2407 		insert_point = insert_after + 1;
2408 		ovbcopy(&contig_pfn_list[insert_point],
2409 		    &contig_pfn_list[insert_point + 1],
2410 		    (contig_pfn_cnt - insert_point) * sizeof (pfn_t));
2411 		contig_pfn_list[insert_point] = pfn;
2416 	mutex_exit(&contig_list_lock);
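/*
 * Editorial note (the initial value of insert_after is elided above):
 * the adjacency probe appears to use -2 as a "keep searching"
 * sentinel; finding newmfn == probe_mfn + 1 or probe_mfn - 1 turns it
 * into a list position, with -1 meaning insert at the head.
 */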
2420 * Called to (re-)populate the io_pool from the free page lists.
2423 populate_io_pool(void)
2430 	 * Figure out the bounds of the pool on first invocation.
2431 	 * We use a percentage of memory for the io pool size.
2432 	 * We allow that to shrink, but not to less than a fixed minimum
2434 	if (io_pool_cnt_max == 0) {
2435 		io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct);
2436 		io_pool_cnt_lowater = io_pool_cnt_max;
2438 		 * This is the first time in populate_io_pool, grab a va to use
2439 		 * when we need to allocate pages.
2441 		io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
2444 	 * If we are out of pages in the pool, then grow the size of the pool
2446 	if (io_pool_cnt == 0) {
2448 		 * Grow the max size of the io pool by 5%, but never more than
2449 		 * 25% of physical memory.
2451 		if (io_pool_cnt_max < physmem / 4)
2452 			io_pool_cnt_max += io_pool_cnt_max / 20;
2454 		io_pool_grows++;	/* should be a kstat? */
2457 	 * Get highest mfn on this platform, but limit to the 32 bit DMA max.
2459 	(void) mfn_to_pfn(start_mfn);
2460 	max_mfn = MIN(cached_max_mfn, PFN_4GIG);
2461 	for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) {
2462 		pfn = mfn_to_pfn(mfn);
2463 		if (pfn & PFN_IS_FOREIGN_MFN)
2466 		 * try to allocate it from free pages
2468 		pp = page_numtopp_alloc(pfn);
2472 		add_page_to_pool(pp, 1);
2473 		if (io_pool_cnt >= io_pool_cnt_max)
2477 	return (io_pool_cnt);
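/*
 * Editorial note: because io_pool_physmem_pct divides 100 with
 * integer arithmetic, the default of 2 yields physmem / 50, exactly
 * 2% of physical memory; a pct that does not divide 100 evenly (say
 * 3, giving physmem / 33) only approximates the requested percentage.
 */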
2481 * Destroy a page that was being used for DMA I/O. It may or
2482 * may not actually go back to the io_pool.
2485 page_destroy_io(page_t *pp)
2487 	mfn_t mfn = mfn_list[pp->p_pagenum];
2490 	 * When the page was alloc'd a reservation was made, release it now
2494 	 * Unload translations, if any, then hash out the
2495 	 * page to erase its identity.
2497 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2498 	page_hashout(pp, NULL);
2501 	 * If the page came from the free lists, just put it back to them.
2502 	 * DomU pages always go on the free lists as well.
2504 	if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) {
2509 	add_page_to_pool(pp, 0);
2513 long contig_searches;		/* count of times contig pages requested */
2514 long contig_search_restarts;	/* count of contig ranges tried */
2515 long contig_search_failed;	/* count of contig alloc failures */
2518 * Free partial page list
2521 free_partial_list(page_t **pplist)
2525 	while (*pplist != NULL) {
2527 		page_io_pool_sub(pplist, pp, pp);
2533 * Look thru the contiguous pfns that are not part of the io_pool for
2534 * contiguous free pages. Return a list of the found pages or NULL.
2537 find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg,
2540 	page_t *pp, *plist = NULL;
2541 	mfn_t mfn, prev_mfn, start_mfn;
2543 	int pages_needed, pages_requested;
2547 	 * create the contig pfn list if not already done
2550 	mutex_enter(&contig_list_lock);
2551 	if (contig_pfn_list == NULL) {
2552 		mutex_exit(&contig_list_lock);
2553 		if (!create_contig_pfnlist(flags)) {
2560 	 * Search contiguous pfn list for physically contiguous pages not in
2561 	 * the io_pool. Start the search where the last search left off.
2563 	pages_requested = pages_needed = npages;
2564 	search_start = next_alloc_pfn;
2565 	start_mfn = prev_mfn = 0;
2566 	while (pages_needed) {
2567 		pfn = contig_pfn_list[next_alloc_pfn];
2568 		mfn = pfn_to_mfn(pfn);
2570 		 * Check if mfn is first one or contig to previous one and
2571 		 * if page corresponding to mfn is free and that mfn
2572 		 * range is not crossing a segment boundary.
2574 		if ((prev_mfn == 0 || mfn == prev_mfn + 1) &&
2575 		    (pp = page_numtopp_alloc(pfn)) != NULL &&
2576 		    !((mfn & pfnseg) < (start_mfn & pfnseg))) {
2578 			page_io_pool_add(&plist, pp);
2580 			if (prev_mfn == 0) {
2582 				    mfn != P2ROUNDUP(mfn, pfnalign)) {
2584 					 * not properly aligned
2586 					contig_search_restarts++;
2587 					free_partial_list(&plist);
2588 					pages_needed = pages_requested;
2589 					start_mfn = prev_mfn = 0;
2596 			contig_search_restarts++;
2597 			free_partial_list(&plist);
2598 			pages_needed = pages_requested;
2599 			start_mfn = prev_mfn = 0;
2602 		if (++next_alloc_pfn == contig_pfn_cnt)
2604 		if (next_alloc_pfn == search_start)
2605 			break; /* all pfns searched */
2607 	mutex_exit(&contig_list_lock);
2609 	contig_search_failed++;
2611 	 * Failed to find enough contig pages.
2612 	 * free partial page list
2614 	free_partial_list(&plist);
/*
 * Search the reserved io pool pages for a page range with the
 * desired characteristics.
 */
static page_t *
page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg)
{
	page_t *pp_first, *pp_last;
	page_t *pp, **poolp;
	pgcnt_t nwanted, pfnalign;
	uint64_t pfnseg;
	mfn_t mfn, tmfn, hi_mfn, lo_mfn;
	int align, attempt = 0;

	if (minctg == 1)
		contig = 0;
	lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
	hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
	pfnseg = mmu_btop(mattr->dma_attr_seg);
	align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
	if (align > MMU_PAGESIZE)
		pfnalign = mmu_btop(align);
	else
		pfnalign = 0;

try_again:
	/*
	 * See if we want pages for a legacy device
	 */
	if (hi_mfn < PFN_16MEG)
		poolp = &io_pool_16m;
	else
		poolp = &io_pool_4g;
try_smaller:
	/*
	 * Take pages from I/O pool. We'll use pages from the highest
	 * MFN range possible.
	 */
	pp_first = pp_last = NULL;
	mutex_enter(&io_pool_lock);
	nwanted = minctg;
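	/*
	 * The pool lists are ordered by increasing MFN, so walk backwards
	 * from the tail (via p_prev) to hand out the highest usable MFNs
	 * first.
	 */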
	for (pp = *poolp; pp && nwanted > 0; ) {
		pp = pp->p_prev;

		/*
		 * skip pages above allowable range
		 */
		mfn = mfn_list[pp->p_pagenum];
		if (hi_mfn < mfn)
			goto skip;

		/*
		 * stop at pages below allowable range
		 */
		if (lo_mfn > mfn)
			break;
restart:
		if (pp_last == NULL) {
			/*
			 * Check alignment and segment of first page in run.
			 */
			tmfn = mfn - (minctg - 1);
			if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign))
				goto skip; /* not properly aligned */
			if ((mfn & pfnseg) < (tmfn & pfnseg))
				goto skip; /* crosses seg boundary */
			/*
			 * Start building page list
			 */
			pp_first = pp_last = pp;
			nwanted--;
		} else {
			/*
			 * check physical contiguity if required
			 */
			if (contig &&
			    mfn_list[pp_first->p_pagenum] != mfn + 1) {
				/*
				 * not a contiguous page, restart list.
				 */
				pp_last = NULL;
				nwanted = minctg;
				goto restart;
			} else { /* add page to list */
				pp_first = pp;
				nwanted--;
			}
		}
skip:
		if (pp == *poolp)
			break;
	}

	/*
	 * If we didn't find memory, try the more constrained pool, then
	 * sweep free pages into the DMA pool and try again.
	 */
	if (nwanted != 0) {
		mutex_exit(&io_pool_lock);
		/*
		 * If we were looking in the less constrained pool and
		 * didn't find pages, try the more constrained pool.
		 */
		if (poolp == &io_pool_4g) {
			poolp = &io_pool_16m;
			goto try_smaller;
		}
		if (++attempt < 4) {
			/*
			 * Grab some more io_pool pages
			 */
			(void) populate_io_pool();
			goto try_again; /* go around and retry */
		}
		return (NULL);
	}
	/*
	 * Found the pages, now snip them from the list
	 */
	page_io_pool_sub(poolp, pp_first, pp_last);
	io_pool_cnt -= minctg;
	/*
	 * reset low water mark
	 */
	if (io_pool_cnt < io_pool_cnt_lowater)
		io_pool_cnt_lowater = io_pool_cnt;
	mutex_exit(&io_pool_lock);
	return (pp_first);
}
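/*
 * Trade pages with the hypervisor for pages that satisfy the DMA address
 * limit in mattr and, when PG_PHYSCONTIG is requested, machine contiguity.
 * Pages are created on vp at off and handed to the balloon driver for
 * replacement; on failure everything is unwound and NULL is returned.
 */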
static page_t *
page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr,
    ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg)
{
	int kflags;
	int order, extra, extpages, i, contig, nbits, extents;
	page_t *pp, *expp, *pp_first = NULL, **pplist = NULL;
	mfn_t *mfnlist = NULL;

	contig = flags & PG_PHYSCONTIG;
	if (minctg == 1)
		contig = 0;
	flags &= ~PG_PHYSCONTIG;
	kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP;
	/*
	 * Hypervisor will allocate extents, if we want contig
	 * pages extent must be >= minctg
	 */
	if (contig) {
		order = highbit(minctg) - 1;
		if (minctg & ((1 << order) - 1))
			order++;
		extpages = 1 << order;
	} else {
		order = 0;
		extpages = minctg;
	}
	if (extpages > minctg) {
		extra = extpages - minctg;
		if (!page_resv(extra, kflags))
			return (NULL);
	}
	pplist = kmem_alloc(extpages * sizeof (page_t *), kflags);
	if (pplist == NULL)
		goto balloon_fail;
	mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags);
	if (mfnlist == NULL)
		goto balloon_fail;
	pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr);
	if (pp == NULL)
		goto balloon_fail;
	pp_first = pp;
	if (extpages > minctg) {
		/*
		 * fill out the rest of extent pages to swap
		 * with the hypervisor
		 */
		for (i = 0; i < extra; i++) {
			expp = page_create_va(vp,
			    (u_offset_t)(uintptr_t)io_pool_kva,
			    PAGESIZE, flags, &kvseg, io_pool_kva);
			if (expp == NULL)
				goto balloon_fail;
			(void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD);
			page_io_unlock(expp);
			page_hashout(expp, NULL);
			page_io_lock(expp);
			/*
			 * add page to end of list
			 */
			expp->p_prev = pp_first->p_prev;
			expp->p_next = pp_first;
			expp->p_prev->p_next = expp;
			pp_first->p_prev = expp;
		}
	}
	for (i = 0; i < extpages; i++) {
		pplist[i] = pp;
		pp = pp->p_next;
	}
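	/*
	 * Ask the balloon driver to replace these pages with pages whose
	 * machine addresses fit in nbits bits; order gives the extent size
	 * the hypervisor must supply for a contiguous request.
	 */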
	nbits = highbit(mattr->dma_attr_addr_hi);
	extents = contig ? 1 : minctg;
	if (balloon_replace_pages(extents, pplist, nbits, order,
	    mfnlist) != extents) {
		cmn_err(CE_NOTE, "request to hypervisor"
		    " for %d pages, maxaddr %" PRIx64 " failed",
		    extpages, mattr->dma_attr_addr_hi);
		goto balloon_fail;
	}

	kmem_free(pplist, extpages * sizeof (page_t *));
	kmem_free(mfnlist, extpages * sizeof (mfn_t));
	/*
	 * Return any excess pages to free list
	 */
	if (extpages > minctg) {
		for (i = 0; i < extra; i++) {
			pp = pp_first->p_prev;
			page_sub(&pp_first, pp);
			page_io_unlock(pp);
			page_unresv(1);
			page_free(pp, 1);
		}
	}

	check_dma(mattr, pp_first, minctg);
	return (pp_first);

balloon_fail:
	/*
	 * Return pages to free list and return failure
	 */
	while (pp_first != NULL) {
		pp = pp_first;
		page_sub(&pp_first, pp);
		page_io_unlock(pp);
		if (pp->p_vnode != NULL)
			page_hashout(pp, NULL);
		page_free(pp, 1);
	}
	if (pplist)
		kmem_free(pplist, extpages * sizeof (page_t *));
	if (mfnlist)
		kmem_free(mfnlist, extpages * sizeof (mfn_t));
	page_unresv(extpages - minctg);
	return (NULL);
}
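/*
 * Hand back the pages of a partially completed allocation: each page is
 * io-unlocked and released through page_destroy_io().
 */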
static void
return_partial_alloc(page_t *plist)
{
	page_t *pp;

	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		page_destroy_io(pp);
	}
}
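/*
 * Allocate pages that satisfy the DMA attributes in mattr, trying in turn
 * free contiguous ranges (find_contig_free()), the reserved io pools, and
 * finally a page exchange with the hypervisor.  *npagesp is updated with
 * the number of pages still owed to the caller.
 */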
static page_t *
page_get_contigpages(
	struct vnode	*vp,
	u_offset_t	off,
	int		*npagesp,
	uint_t		flags,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)
{
	mfn_t	max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
	page_t	*plist;	/* list to return */
	page_t	*pp, *mcpl;
	int	contig, anyaddr, npages, getone = 0;
	mfn_t	lo_mfn;
	mfn_t	hi_mfn;
	pgcnt_t	pfnalign = 0;
	int	align, sgllen;
	uint64_t pfnseg;
	pgcnt_t	minctg;

	npages = *npagesp;
	ASSERT(mattr != NULL);
	lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
	hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
	sgllen = mattr->dma_attr_sgllen;
	pfnseg = mmu_btop(mattr->dma_attr_seg);
	align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
	if (align > MMU_PAGESIZE)
		pfnalign = mmu_btop(align);

	contig = flags & PG_PHYSCONTIG;

	/*
	 * Clear the contig flag if only one page is needed.
	 */
	if (npages == 1) {
		getone = 1;
		contig = 0;
	}

	/*
	 * Check if any page in the system is fine.
	 */
	anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn;
	if (!contig && anyaddr && !pfnalign) {
		flags &= ~PG_PHYSCONTIG;
		plist = page_create_va(vp, off, npages * MMU_PAGESIZE,
		    flags, &kvseg, vaddr);
		if (plist != NULL) {
			*npagesp = 0;
			return (plist);
		}
	}
	plist = NULL;
	minctg = howmany(npages, sgllen);
	while (npages > sgllen || getone) {
		if (minctg > npages)
			minctg = npages;
		mcpl = NULL;
		/*
		 * We could want contig pages with no address range limits.
		 */
		if (anyaddr && contig) {
			/*
			 * Look for free contig pages to satisfy the request.
			 */
			mcpl = find_contig_free(minctg, flags, pfnseg,
			    pfnalign);
		}
		/*
		 * Try the reserved io pools next
		 */
		if (mcpl == NULL)
			mcpl = page_io_pool_alloc(mattr, contig, minctg);
		if (mcpl != NULL) {
			pp = mcpl;
			do {
				if (!page_hashin(pp, vp, off, NULL)) {
					panic("page_get_contigpages:"
					    " hashin failed"
					    " pp %p, vp %p, off %llx",
					    (void *)pp, (void *)vp, off);
				}
				off += MMU_PAGESIZE;
				page_io_lock(pp);
				page_set_props(pp, P_REF);
				pp = pp->p_next;
			} while (pp != mcpl);
		} else {
			/*
			 * Hypervisor exchange doesn't handle segment or
			 * alignment constraints
			 */
			if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi ||
			    pfnalign)
				goto fail;
			/*
			 * Try exchanging pages with the hypervisor
			 */
			mcpl = page_swap_with_hypervisor(vp, off, vaddr,
			    mattr, flags, minctg);
			if (mcpl == NULL)
				goto fail;
			off += minctg * MMU_PAGESIZE;
		}
		check_dma(mattr, mcpl, minctg);
		/*
		 * Here with a minctg run of contiguous pages, add them to the
		 * list we will return for this request.
		 */
		page_list_concat(&plist, &mcpl);
		npages -= minctg;
		if (getone)
			break;
	}
	*npagesp = npages;
	return (plist);
fail:
	return_partial_alloc(plist);
	return (NULL);
}
/*
 * Allocator for domain 0 I/O pages. We match the required
 * DMA attributes and contiguity constraints.
 */
/*ARGSUSED*/
page_t *
page_create_io(
	struct vnode	*vp,
	u_offset_t	off,
	uint_t		bytes,
	uint_t		flags,
	struct as	*as,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)
{
	page_t	*plist = NULL, *pp;
	int	npages = 0, contig, anyaddr, pages_req;
	mfn_t	lo_mfn;
	mfn_t	hi_mfn;
	pgcnt_t	pfnalign = 0;
	int	align;
	int	is_domu = 0;
	int	dummy, bytes_got;
	mfn_t	max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);

	ASSERT(mattr != NULL);
	lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
	hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
	align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
	if (align > MMU_PAGESIZE)
		pfnalign = mmu_btop(align);

	/*
	 * Clear the contig flag if only one page is needed or the scatter
	 * gather list length is >= npages.
	 */
	pages_req = npages = mmu_btopr(bytes);
	contig = (flags & PG_PHYSCONTIG);
	bytes = P2ROUNDUP(bytes, MMU_PAGESIZE);
	if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages)
		contig = 0;

	/*
	 * Check if any old page in the system is fine.
	 * DomU should always go down this path.
	 */
	is_domu = !DOMAIN_IS_INITDOMAIN(xen_info);
	anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign;
	if ((!contig && anyaddr) || is_domu) {
		flags &= ~PG_PHYSCONTIG;
		plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr);
		if (plist != NULL)
			return (plist);
		else if (is_domu)
			return (NULL); /* no memory available */
	}
	/*
	 * DomU should never reach here
	 */
	if (contig) {
		plist = page_get_contigpages(vp, off, &npages, flags, vaddr,
		    mattr);
		if (plist == NULL)
			goto fail;
		bytes_got = (pages_req - npages) << MMU_PAGESHIFT;
		vaddr += bytes_got;
		off += bytes_got;
		/*
		 * We now have all the contiguous pages we need, but
		 * we may still need additional non-contiguous pages.
		 */
	}
	/*
	 * now loop collecting the requested number of pages, these do
	 * not have to be contiguous pages but we will use the contig
	 * page alloc code to get the pages since it will honor any
	 * other constraints the pages may have.
	 */
	while (npages--) {
		dummy = 1;
		pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr);
		if (pp == NULL)
			goto fail;
		page_add(&plist, pp);
		vaddr += MMU_PAGESIZE;
		off += MMU_PAGESIZE;
	}
	return (plist);
fail:
	/*
	 * Failed to get enough pages, return ones we did get
	 */
	return_partial_alloc(plist);
	return (NULL);
}
/*
 * Lock and return the page with the highest mfn that we can find.  last_mfn
 * holds the last one found, so the next search can start from there.  We
 * also keep a counter so that we don't loop forever if the machine has no
 * free pages.
 *
 * This is called from the balloon thread to find pages to give away.  new_high
 * is used when new mfn's have been added to the system - we will reset our
 * search if the new mfn's are higher than our current search position.
 */
page_t *
page_get_high_mfn(mfn_t new_high)
{
	static mfn_t last_mfn = 0;
	pfn_t pfn;
	page_t *pp;
	ulong_t loop_count = 0;

	if (new_high > last_mfn)
		last_mfn = new_high;

	for (; loop_count < mfn_count; loop_count++, last_mfn--) {
		if (last_mfn == 0) {
			last_mfn = cached_max_mfn;
		}

		pfn = mfn_to_pfn(last_mfn);
		if (pfn & PFN_IS_FOREIGN_MFN)
			continue;

		/* See if the page is free.  If so, lock it. */
		pp = page_numtopp_alloc(pfn);
		if (pp == NULL)
			continue;

		ASSERT(PAGE_EXCL(pp));
		ASSERT(pp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(pp));
		return (pp);
	}
	return (NULL);
}
/*
 * get a page from any list with the given mnode
 */
static page_t *
page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
    int mnode, int mtype, ddi_dma_attr_t *dma_attr)
{
	kmutex_t		*pcm;
	int			i;
	page_t			*pp;
	page_t			*first_pp;
	uint64_t		pgaddr;
	ulong_t			bin;
	int			mtypestart;
	int			plw_initialized;
	page_list_walker_t	plw;

	VM_STAT_ADD(pga_vmstats.pgma_alloc);

	ASSERT((flags & PG_MATCH_COLOR) == 0);
	ASSERT(szc == 0);
	ASSERT(dma_attr != NULL);

	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {
		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
		return (NULL);
	}

	mtypestart = mtype;

	bin = origbin;

	/*
	 * check up to page_colors + 1 bins - origbin may be checked twice
	 * because of BIN_STEP skip
	 */
	do {
		plw_initialized = 0;

		for (plw.plw_count = 0;
		    plw.plw_count < page_colors; plw.plw_count++) {

			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
				goto nextfreebin;

			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			first_pp = pp;
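			/*
			 * Walk the bin: skip pages reserved for the crash
			 * dump and pages we cannot lock, and stop at the
			 * first page whose machine address fits the DMA
			 * range.
			 */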
			while (pp != NULL) {
				if (IS_DUMP_PAGE(pp) || page_trylock(pp,
				    SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp) {
						pp = NULL;
					}
					continue;
				}

				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_hash == NULL);
				ASSERT(pp->p_offset == (u_offset_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
				/* check if page within DMA attributes */
				pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;
			}
			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

				if ((PP_ISFREE(pp) == 0) ||
				    (PP_ISAGED(pp) == 0)) {
					cmn_err(CE_PANIC, "page %p is not free",
					    (void *)pp);
				}

				mutex_exit(pcm);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextfreebin:
			if (plw_initialized == 0) {
				page_list_walk_init(szc, 0, bin, 1, 0, &plw);
				ASSERT(plw.plw_ceq_dif == page_colors);
				plw_initialized = 1;
			}

			if (plw.plw_do_split) {
				pp = page_freelist_split(szc, bin, mnode,
				    mtype,
				    mmu_btop(dma_attr->dma_attr_addr_lo),
				    mmu_btop(dma_attr->dma_attr_addr_hi + 1),
				    &plw);
				if (pp != NULL) {
					check_dma(dma_attr, pp, 1);
					return (pp);
				}
			}

			bin = page_list_walk_next_bin(szc, bin, &plw);
		}

		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	/* failed to find a page in the freelist; try it in the cachelist */

	/* reset mtype start for cachelist search */
	mtype = mtypestart;
	ASSERT(mtype >= 0);

	/* start with the bin of matching color */
	bin = origbin;

	do {
		for (i = 0; i <= page_colors; i++) {
			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
				goto nextcachebin;
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (IS_DUMP_PAGE(pp) || page_trylock(pp,
				    SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp)
						pp = NULL;
					continue;
				}
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(pp->p_szc == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/* check if page within DMA attributes */
				pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;
			}

			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_CACHELISTS(mnode, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);

				mutex_exit(pcm);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextcachebin:
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
	return (NULL);
}
/*
 * This function is similar to page_get_freelist()/page_get_cachelist()
 * but it searches both the lists to find a page with the specified
 * color (or no color) and DMA attributes. The search is done in the
 * freelist first and then in the cache list within the highest memory
 * range (based on DMA attributes) before searching in the lower
 * memory ranges.
 *
 * Note: This function is called only by page_create_io().
 */
/*ARGSUSED*/
static page_t *
page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
    size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
{
	uint_t		bin;
	int		mtype;
	page_t		*pp;
	int		n;
	int		m;
	int		szc;
	int		fullrange;
	int		mnode;
	int		local_failed_stat = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;

	VM_STAT_ADD(pga_vmstats.pga_alloc);

	/* only base pagesize currently supported */
	if (size != MMU_PAGESIZE)
		return (NULL);

	/*
	 * If we're passed a specific lgroup, we use it.  Otherwise,
	 * assume first-touch placement is desired.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	AS_2_BIN(as, seg, vp, vaddr, bin, 0);

	/*
	 * Only hold one freelist or cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
	if (dma_attr == NULL) {
		n = mtype16m;
		m = mtypetop;
		fullrange = 1;
		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
	} else {
		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);

		/*
		 * We can guarantee alignment only for page boundary.
		 */
		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
			return (NULL);

		/* Sanity check the dma_attr */
		if (pfnlo > pfnhi)
			return (NULL);

		n = pfn_2_mtype(pfnlo);
		m = pfn_2_mtype(pfnhi);

		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
	}
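	/*
	 * fullrange means the DMA limits cover whole mnode ranges, so the
	 * regular freelist/cachelist allocators can be used; otherwise every
	 * candidate page must be range-checked in page_get_mnode_anylist().
	 */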
	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);

	szc = 0;

	/* cycling thru mtype handled by RANGE0 if n == mtype16m */
	if (n == mtype16m) {
		flags |= PGI_MT_RANGE0;
		n = m;
	}

	/*
	 * Try local memory node first, but try remote if we can't
	 * get a page of the right color.
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		/*
		 * allocate pages from high pfn to low.
		 */
		mtype = m;
		do {
			if (fullrange != 0) {
				pp = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
				if (pp == NULL) {
					pp = page_get_mnode_cachelist(
					    bin, flags, mnode, mtype);
				}
			} else {
				pp = page_get_mnode_anylist(bin, szc,
				    flags, mnode, mtype, dma_attr);
			}
			if (pp != NULL) {
				VM_STAT_ADD(pga_vmstats.pga_allocok);
				check_dma(dma_attr, pp, 1);
				return (pp);
			}
		} while (mtype != n &&
		    (mtype = mnoderanges[mtype].mnr_next) != -1);
		if (!local_failed_stat) {
			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
			local_failed_stat = 1;
		}
	}
	VM_STAT_ADD(pga_vmstats.pga_allocfailed);

	return (NULL);
}
/*
 * This function is a copy of page_create_va() with an additional
 * argument 'mattr' that specifies DMA memory requirements to
 * the page list functions. This function is used by the segkmem
 * allocator so it is only to create new pages (i.e PG_EXCL is
 * set).
 *
 * Note: This interface is currently used by x86 PSM only and is
 *	 not fully specified so the commitment level is only for
 *	 private interface specific to x86. This interface uses PSM
 *	 specific page_get_anylist() interface.
 */
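/*
 * Search the page hash bucket at 'index' for a page matching (vp, off);
 * on exit 'pp' points to the page found, or is NULL.
 */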
#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
}
page_t *
page_create_io(
	struct vnode	*vp,
	u_offset_t	off,
	uint_t		bytes,
	uint_t		flags,
	struct as	*as,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
{
	page_t		*plist = NULL;
	uint_t		plist_len = 0;
	pgcnt_t		npages;
	page_t		*npp = NULL;
	uint_t		pages_req;
	page_t		*pp;
	kmutex_t	*phm = NULL;
	uint_t		index;

	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
	    "page_create_start:vp %p off %llx bytes %u flags %x",
	    vp, off, bytes, flags);

	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);

	pages_req = npages = mmu_btopr(bytes);

	/*
	 * Do the freemem and pcf accounting.
	 */
	if (!page_create_wait(npages, flags)) {
		return (NULL);
	}

	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
	    "page_create_success:vp %p off %llx", vp, off);

	/*
	 * If satisfying this request has left us with too little
	 * memory, start the wheels turning to get some back.  The
	 * first clause of the test prevents waking up the pageout
	 * daemon in situations where it would decide that there's
	 * not enough memory.
	 */
	if (nscan < desscan && freemem < minfree) {
		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
		    "pageout_cv_signal:freemem %ld", freemem);
		WAKE_PAGEOUT_SCANNER(page__create__io);
	}

	if (flags & PG_PHYSCONTIG) {

		plist = page_get_contigpage(&npages, mattr, 1);
		if (plist == NULL) {
			page_create_putback(npages);
			return (NULL);
		}

		pp = plist;

		do {
			if (!page_hashin(pp, vp, off, NULL)) {
				panic("pg_creat_io: hashin failed %p %p %llx",
				    (void *)pp, (void *)vp, off);
			}
			VM_STAT_ADD(page_create_new);
			off += MMU_PAGESIZE;
			PP_CLRFREE(pp);
			PP_CLRAGED(pp);
			page_set_props(pp, P_REF);
			pp = pp->p_next;
		} while (pp != plist);

		if (!npages) {
			check_dma(mattr, plist, pages_req);
			return (plist);
		} else {
			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
		}

		/*
		 * fall-thru:
		 *
		 * page_get_contigpage returns when npages <= sgllen.
		 * Grab the rest of the non-contig pages below from anylist.
		 */
	}

	/*
	 * Loop around collecting the requested number of pages.
	 * Most of the time, we have to `create' a new page. With
	 * this in mind, pull the page off the free list before
	 * getting the hash lock.  This will minimize the hash
	 * lock hold time, nesting, and the like.  If it turns
	 * out we don't need the page, we put it back at the end.
	 */
	while (npages--) {
		phm = NULL;

		index = PAGE_HASH_FUNC(vp, off);
top:
		ASSERT(phm == NULL);
		ASSERT(index == PAGE_HASH_FUNC(vp, off));
		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));

		if (npp == NULL) {
			/*
			 * Try to get the page of any color either from
			 * the freelist or from the cache list.
			 */
			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
			    flags & ~PG_MATCH_COLOR, mattr, NULL);
			if (npp == NULL) {
				if (mattr == NULL) {
					/*
					 * Not looking for a special page;
					 * panic!
					 */
					panic("no page found %d", (int)npages);
				}
				/*
				 * No page found! This can happen
				 * if we are looking for a page
				 * within a specific memory range
				 * for DMA purposes. If PG_WAIT is
				 * specified then we wait for a
				 * while and then try again. The
				 * wait could be forever if we
				 * don't get the page(s) we need.
				 *
				 * Note: XXX We really need a mechanism
				 * to wait for pages in the desired
				 * range. For now, we wait for any
				 * pages and see if we can use it.
				 */

				if ((mattr != NULL) && (flags & PG_WAIT)) {
					delay(10);
					goto top;
				}
				goto fail; /* undo accounting stuff */
			}

			if (PP_ISAGED(npp) == 0) {
				/*
				 * Since this page came from the
				 * cachelist, we must destroy the
				 * old vnode association.
				 */
				page_hashout(npp, (kmutex_t *)NULL);
			}
		}

		/*
		 * We own this page!
		 */
		ASSERT(PAGE_EXCL(npp));
		ASSERT(npp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(npp));
		PP_CLRFREE(npp);
		PP_CLRAGED(npp);

		/*
		 * Here we have a page in our hot little mits and are
		 * just waiting to stuff it on the appropriate lists.
		 * Get the mutex and check to see if it really does
		 * not exist.
		 */
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		PAGE_HASH_SEARCH(index, pp, vp, off);
		if (pp == NULL) {
			VM_STAT_ADD(page_create_new);
			pp = npp;
			npp = NULL;
			if (!page_hashin(pp, vp, off, phm)) {
				/*
				 * Since we hold the page hash mutex and
				 * just searched for this page, page_hashin
				 * had better not fail.  If it does, that
				 * means somethread did not follow the
				 * page hash mutex rules.  Panic now and
				 * get it over with.  As usual, go down
				 * holding all the locks.
				 */
				ASSERT(MUTEX_HELD(phm));
				panic("page_create: hashin fail %p %p %llx %p",
				    (void *)pp, (void *)vp, off, (void *)phm);
			}
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;

			/*
			 * Hat layer locking need not be done to set
			 * the following bits since the page is not hashed
			 * and was on the free list (i.e., had no mappings).
			 *
			 * Set the reference bit to protect
			 * against immediate pageout
			 *
			 * XXXmh modify freelist code to set reference
			 * bit so we don't have to do it here.
			 */
			page_set_props(pp, P_REF);
		} else {
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;

			/*
			 * NOTE: This should not happen for pages associated
			 * with kernel vnode 'kvp'.
			 */
			/* XX64 - to debug why this happens! */
			ASSERT(!VN_ISKAS(vp));
			if (VN_ISKAS(vp))
				cmn_err(CE_NOTE,
				    "page_create: page not expected "
				    "in hash list for kernel vnode - pp 0x%p",
				    (void *)pp);
			VM_STAT_ADD(page_create_exists);
			goto fail;
		}

		/*
		 * Got a page!  It is locked.  Acquire the i/o
		 * lock since we are going to use the p_next and
		 * p_prev fields to link the requested pages together.
		 */
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
		off += MMU_PAGESIZE;
		vaddr += MMU_PAGESIZE;
	}

	check_dma(mattr, plist, pages_req);
	return (plist);

fail:
	if (npp != NULL) {
		/*
		 * Did not need this page after all.
		 * Put it back on the free list.
		 */
		VM_STAT_ADD(page_create_putbacks);
		PP_SETFREE(npp);
		PP_SETAGED(npp);
		npp->p_offset = (u_offset_t)-1;
		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
		page_unlock(npp);
	}

	/*
	 * Give up the pages we already got.
	 */
	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		plist_len++;
		/*LINTED: constant in conditional ctx*/
		VN_DISPOSE(pp, B_INVAL, 0, kcred);
	}

	/*
	 * VN_DISPOSE does freemem accounting for the pages in plist
	 * by calling page_free. So, we need to undo the pcf accounting
	 * for only the remaining pages.
	 */
	VM_STAT_ADD(page_create_putbacks);
	page_create_putback(pages_req - plist_len);

	return (NULL);
}
/*
 * Copy the data from the physical page represented by "frompp" to
 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
 * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
 * level and no one sleeps with an active mapping there.
 *
 * Note that the ref/mod bits in the page_t's are not affected by
 * this operation, hence it is up to the caller to update them appropriately.
 */
int
ppcopy(page_t *frompp, page_t *topp)
{
	caddr_t		pp_addr1;
	caddr_t		pp_addr2;
	hat_mempte_t	pte1;
	hat_mempte_t	pte2;
	label_t		ljb;
	int		ret = 1;

	ASSERT_STACK_ALIGNED();
	ASSERT(PAGE_LOCKED(frompp));
	ASSERT(PAGE_LOCKED(topp));

	if (kpm_enable) {
		pp_addr1 = hat_kpm_page2va(frompp, 0);
		pp_addr2 = hat_kpm_page2va(topp, 0);
		kpreempt_disable();
	} else {
		/*
		 * disable pre-emption so that CPU can't change
		 */
		kpreempt_disable();

		pp_addr1 = CPU->cpu_caddr1;
		pp_addr2 = CPU->cpu_caddr2;
		pte1 = CPU->cpu_caddr1pte;
		pte2 = CPU->cpu_caddr2pte;

		mutex_enter(&CPU->cpu_ppaddr_mutex);

		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (on_fault(&ljb)) {
		ret = 0;
		goto faulted;
	}
	if (use_sse_pagecopy)
#ifdef __xpv
		page_copy_no_xmm(pp_addr2, pp_addr1);
#else
		hwblkpagecopy(pp_addr1, pp_addr2);
#endif
	else
		bcopy(pp_addr1, pp_addr2, PAGESIZE);

	no_fault();
faulted:
	if (!kpm_enable) {
#ifdef __xpv
		/*
		 * We can't leave unused mappings laying about under the
		 * hypervisor, so blow them away.
		 */
		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0,
		    UVMF_INVLPG | UVMF_LOCAL) < 0)
			panic("HYPERVISOR_update_va_mapping() failed");
		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
		    UVMF_INVLPG | UVMF_LOCAL) < 0)
			panic("HYPERVISOR_update_va_mapping() failed");
#endif
		mutex_exit(&CPU->cpu_ppaddr_mutex);
	}
	kpreempt_enable();
	return (ret);
}
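/*
 * Zero the physical page represented by "pp" from off to off + len,
 * by way of pfnzero() on its page frame number.
 */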
void
pagezero(page_t *pp, uint_t off, uint_t len)
{
	ASSERT(PAGE_LOCKED(pp));
	pfnzero(page_pptonum(pp), off, len);
}
/*
 * Zero the physical page from off to off + len given by pfn
 * without changing the reference and modified bits of page.
 *
 * We use this using CPU private page address #2, see ppcopy() for more info.
 * pfnzero() must not be called at interrupt level.
 */
void
pfnzero(pfn_t pfn, uint_t off, uint_t len)
{
	caddr_t		pp_addr2;
	hat_mempte_t	pte2;
	kmutex_t	*ppaddr_mutex = NULL;

	ASSERT_STACK_ALIGNED();
	ASSERT(len <= MMU_PAGESIZE);
	ASSERT(off <= MMU_PAGESIZE);
	ASSERT(off + len <= MMU_PAGESIZE);

	if (kpm_enable && !pfn_is_foreign(pfn)) {
		pp_addr2 = hat_kpm_pfn2va(pfn);
		kpreempt_disable();
	} else {
		kpreempt_disable();

		pp_addr2 = CPU->cpu_caddr2;
		pte2 = CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(pfn, pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagezero) {
#ifdef __xpv
		uint_t rem;

		/*
		 * zero a byte at a time until properly aligned for
		 * block_zero_no_xmm().
		 */
		while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0)
			pp_addr2[off++] = 0;

		/*
		 * Now use faster block_zero_no_xmm() for any range
		 * that is properly aligned and sized.
		 */
		rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN));
		len -= rem;
		if (len != 0) {
			block_zero_no_xmm(pp_addr2 + off, len);
			off += len;
		}

		/*
		 * zero remainder with byte stores.
		 */
		while (rem-- > 0)
			pp_addr2[off++] = 0;
#else
		hwblkclr(pp_addr2 + off, len);
#endif
	} else {
		bzero(pp_addr2 + off, len);
	}

	if (!kpm_enable || pfn_is_foreign(pfn)) {
#ifdef __xpv
		/*
		 * On the hypervisor this page might get used for a page
		 * table before any intervening change to this mapping,
		 * so blow it away.
		 */
		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
		    UVMF_INVLPG) < 0)
			panic("HYPERVISOR_update_va_mapping() failed");
#endif
		mutex_exit(ppaddr_mutex);
	}

	kpreempt_enable();
}
/*
 * Platform-dependent page scrub call.
 */
void
pagescrub(page_t *pp, uint_t off, uint_t len)
{
	/*
	 * For now, we rely on the fact that pagezero() will
	 * always clear UEs.
	 */
	pagezero(pp, off, len);
}
/*
 * set up two private addresses for use on a given CPU for use in ppcopy()
 */
void
setup_vaddr_for_ppcopy(struct cpu *cpup)
{
	void *addr;
	hat_mempte_t pte_pa;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte_pa = hat_mempte_setup(addr);
	cpup->cpu_caddr1 = addr;
	cpup->cpu_caddr1pte = pte_pa;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte_pa = hat_mempte_setup(addr);
	cpup->cpu_caddr2 = addr;
	cpup->cpu_caddr2pte = pte_pa;

	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
}
/*
 * Undo setup_vaddr_for_ppcopy
 */
void
teardown_vaddr_for_ppcopy(struct cpu *cpup)
{
	mutex_destroy(&cpup->cpu_ppaddr_mutex);

	hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
	cpup->cpu_caddr2pte = 0;
	vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
	cpup->cpu_caddr2 = 0;

	hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
	cpup->cpu_caddr1pte = 0;
	vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
	cpup->cpu_caddr1 = 0;
}
/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
 */
void
dcache_flushall(void)
{}
/*
 * Allocate a memory page.  The argument 'seed' can be any pseudo-random
 * number to vary where the pages come from.  This is quite a hacked up
 * method -- it works for now, but really needs to be fixed up a bit.
 *
 * We currently use page_create_va() on the kvp with fake offsets,
 * segments and virt address.  This is pretty bogus, but was copied from the
 * old hat_i86.c code.  A better approach would be to specify either mnode
 * random or mnode local and takes a page from whatever color has the MOST
 * available - this would have a minimal impact on page coloring.
 */
page_t *
page_get_physical(uintptr_t seed)
{
	page_t *pp;
	u_offset_t offset;
	static struct seg tmpseg;
	static uintptr_t ctr = 0;

	/*
	 * This code is gross, we really need a simpler page allocator.
	 *
	 * We need to assign an offset for the page to call page_create_va()
	 * To avoid conflicts with other pages, we get creative with the offset.
	 * For 32 bits, we need an offset > 4Gig
	 * For 64 bits, need an offset somewhere in the VA hole.
	 */
	offset = seed;
	if (offset > kernelbase)
		offset -= kernelbase;
	offset <<= MMU_PAGESHIFT;
	offset += mmu.hole_start;	/* something in VA hole */

	if (page_resv(1, KM_NOSLEEP) == 0)
		return (NULL);

#ifdef	DEBUG
	pp = page_exists(&kvp, offset);
	if (pp != NULL)
		panic("page already exists %p", (void *)pp);
#endif

	pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
	    &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));	/* changing VA usage */