4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2010, Intel Corporation.
26 * All rights reserved.
29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
30 /* All Rights Reserved */
33 * Portions of this source code were derived from Berkeley 4.3 BSD
34 * under license from the Regents of the University of California.
38 * UNIX machine dependent virtual memory support.
41 #include <sys/types.h>
42 #include <sys/param.h>
43 #include <sys/systm.h>
49 #include <sys/cpuvar.h>
54 #include <sys/vnode.h>
57 #include <sys/exechdr.h>
58 #include <sys/debug.h>
59 #include <sys/vmsystm.h>
61 #include <sys/dumphdr.h>
62 #include <sys/random.h>
67 #include <vm/seg_kp.h>
68 #include <vm/seg_vn.h>
70 #include <vm/seg_kmem.h>
71 #include <vm/seg_kpm.h>
72 #include <vm/vm_dep.h>
75 #include <sys/vm_machparam.h>
76 #include <sys/memlist.h>
77 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
78 #include <vm/hat_i86.h>
79 #include <sys/x86_archext.h>
80 #include <sys/elf_386.h>
81 #include <sys/cmn_err.h>
82 #include <sys/archsystm.h>
83 #include <sys/machsystm.h>
84 #include <sys/secflags.h>
86 #include <sys/vtrace.h>
87 #include <sys/ddidmareq.h>
88 #include <sys/promif.h>
89 #include <sys/memnode.h>
90 #include <sys/stack.h>
91 #include <util/qsort.h>
92 #include <sys/taskq.h>
96 #include <sys/hypervisor.h>
97 #include <sys/xen_mmu.h>
98 #include <sys/balloon_impl.h>
101 * domain 0 pages usable for DMA are kept pre-allocated and kept in
102 * distinct lists, ordered by increasing mfn.
104 static kmutex_t io_pool_lock
;
105 static kmutex_t contig_list_lock
;
106 static page_t
*io_pool_4g
; /* pool for 32 bit dma limited devices */
107 static page_t
*io_pool_16m
; /* pool for 24 bit dma limited legacy devices */
108 static long io_pool_cnt
;
109 static long io_pool_cnt_max
= 0;
110 #define DEFAULT_IO_POOL_MIN 128
111 static long io_pool_cnt_min
= DEFAULT_IO_POOL_MIN
;
112 static long io_pool_cnt_lowater
= 0;
113 static long io_pool_shrink_attempts
; /* how many times did we try to shrink */
114 static long io_pool_shrinks
; /* how many times did we really shrink */
115 static long io_pool_grows
; /* how many times did we grow */
116 static mfn_t start_mfn
= 1;
117 static caddr_t io_pool_kva
; /* use to alloc pages when needed */
119 static int create_contig_pfnlist(uint_t
);
122 * percentage of phys mem to hold in the i/o pool
124 #define DEFAULT_IO_POOL_PCT 2
125 static long io_pool_physmem_pct
= DEFAULT_IO_POOL_PCT
;
126 static void page_io_pool_sub(page_t
**, page_t
*, page_t
*);
131 uint_t vac_colors
= 1;
133 int largepagesupport
= 0;
134 extern uint_t page_create_new
;
135 extern uint_t page_create_exists
;
136 extern uint_t page_create_putbacks
;
138 * Allow users to disable the kernel's use of SSE.
140 extern int use_sse_pagecopy
, use_sse_pagezero
;
143 * combined memory ranges from mnode and memranges[] to manage single
144 * mnode/mtype dimension in the page lists.
150 int mnr_memrange
; /* index into memranges[] */
151 int mnr_next
; /* next lower PA mnoderange */
153 /* maintain page list stats */
154 pgcnt_t mnr_mt_clpgcnt
; /* cache list cnt */
155 pgcnt_t mnr_mt_flpgcnt
[MMU_PAGE_SIZES
]; /* free list cnt per szc */
156 pgcnt_t mnr_mt_totcnt
; /* sum of cache and free lists */
158 struct mnr_mts
{ /* mnode/mtype szc stats */
159 pgcnt_t mnr_mts_pgcnt
;
161 pgcnt_t
*mnr_mtsc_pgcnt
;
166 #define MEMRANGEHI(mtype) \
167 ((mtype > 0) ? memranges[mtype - 1] - 1: physmax)
168 #define MEMRANGELO(mtype) (memranges[mtype])
170 #define MTYPE_FREEMEM(mt) (mnoderanges[mt].mnr_mt_totcnt)
173 * As the PC architecture evolved memory up was clumped into several
174 * ranges for various historical I/O devices to do DMA.
177 * < 4Gig - PCI bus or drivers that don't understand PAE mode
179 * These are listed in reverse order, so that we can skip over unused
180 * ranges on machines with small memories.
182 * For now under the Hypervisor, we'll only ever have one memrange.
184 #define PFN_4GIG 0x100000
185 #define PFN_16MEG 0x1000
186 /* Indices into the memory range (arch_memranges) array. */
191 static pfn_t arch_memranges
[NUM_MEM_RANGES
] = {
192 PFN_4GIG
, /* pfn range for 4G and above */
193 0x80000, /* pfn range for 2G-4G */
194 PFN_16MEG
, /* pfn range for 16M-2G */
195 0x00000, /* pfn range for 0-16M */
197 pfn_t
*memranges
= &arch_memranges
[0];
198 int nranges
= NUM_MEM_RANGES
;
201 * This combines mem_node_config and memranges into one data
202 * structure to be used for page list management.
204 mnoderange_t
*mnoderanges
;
208 int mtypetop
; /* index of highest pfn'ed mnoderange */
211 * 4g memory management variables for systems with more than 4g of memory:
213 * physical memory below 4g is required for 32bit dma devices and, currently,
214 * for kmem memory. On systems with more than 4g of memory, the pool of memory
215 * below 4g can be depleted without any paging activity given that there is
216 * likely to be sufficient memory above 4g.
218 * physmax4g is set true if the largest pfn is over 4g. The rest of the
219 * 4g memory management code is enabled only when physmax4g is true.
221 * maxmem4g is the count of the maximum number of pages on the page lists
222 * with physical addresses below 4g. It can be a lot less then 4g given that
223 * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
226 * freemem4g maintains the count of the number of available pages on the
227 * page lists with physical addresses below 4g.
229 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
230 * 6% (desfree4gshift = 4) of maxmem4g.
232 * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
233 * and the amount of physical memory above 4g is greater than freemem4g.
234 * In this case, page_get_* routines will restrict below 4g allocations
235 * for requests that don't specifically require it.
238 #define DESFREE4G (maxmem4g >> desfree4gshift)
240 #define RESTRICT4G_ALLOC \
241 (physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))
243 static pgcnt_t maxmem4g
;
244 static pgcnt_t freemem4g
;
245 static int physmax4g
;
246 static int desfree4gshift
= 4; /* maxmem4g shift to derive DESFREE4G */
249 * 16m memory management:
251 * reserve some amount of physical memory below 16m for legacy devices.
253 * RESTRICT16M_ALLOC returns true if an there are sufficient free pages above
254 * 16m or if the 16m pool drops below DESFREE16M.
256 * In this case, general page allocations via page_get_{free,cache}list
257 * routines will be restricted from allocating from the 16m pool. Allocations
258 * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
259 * are not restricted.
262 #define FREEMEM16M MTYPE_FREEMEM(mtype16m)
263 #define DESFREE16M desfree16m
264 #define RESTRICT16M_ALLOC(freemem, pgcnt, flags) \
265 ((freemem != 0) && ((flags & PG_PANIC) == 0) && \
266 ((freemem >= (FREEMEM16M)) || \
267 (FREEMEM16M < (DESFREE16M + pgcnt))))
269 static pgcnt_t desfree16m
= 0x380;
272 * This can be patched via /etc/system to allow old non-PAE aware device
273 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
275 int restricted_kmemalloc
= 0;
280 ulong_t pga_notfullrange
;
281 ulong_t pga_nulldmaattr
;
283 ulong_t pga_allocfailed
;
285 ulong_t pgma_allocok
;
286 ulong_t pgma_allocfailed
;
287 ulong_t pgma_allocempty
;
291 uint_t mmu_page_sizes
;
293 /* How many page sizes the users can see */
294 uint_t mmu_exported_page_sizes
;
296 /* page sizes that legacy applications can see */
297 uint_t mmu_legacy_page_sizes
;
300 * Number of pages in 1 GB. Don't enable automatic large pages if we have
301 * fewer than this many pages.
303 pgcnt_t shm_lpg_min_physmem
= 1 << (30 - MMU_PAGESHIFT
);
304 pgcnt_t privm_lpg_min_physmem
= 1 << (30 - MMU_PAGESHIFT
);
307 * Maximum and default segment size tunables for user private
308 * and shared anon memory, and user text and initialized data.
309 * These can be patched via /etc/system to allow large pages
310 * to be used for mapping application private and shared anon memory.
312 size_t mcntl0_lpsize
= MMU_PAGESIZE
;
313 size_t max_uheap_lpsize
= MMU_PAGESIZE
;
314 size_t default_uheap_lpsize
= MMU_PAGESIZE
;
315 size_t max_ustack_lpsize
= MMU_PAGESIZE
;
316 size_t default_ustack_lpsize
= MMU_PAGESIZE
;
317 size_t max_privmap_lpsize
= MMU_PAGESIZE
;
318 size_t max_uidata_lpsize
= MMU_PAGESIZE
;
319 size_t max_utext_lpsize
= MMU_PAGESIZE
;
320 size_t max_shm_lpsize
= MMU_PAGESIZE
;
324 * initialized by page_coloring_init().
327 uint_t page_colors_mask
;
328 uint_t page_coloring_shift
;
330 static uint_t l2_colors
;
333 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
334 * and page_colors are calculated from the l2 cache n-way set size. Within a
335 * mnode range, the page freelist and cachelist are hashed into bins based on
336 * color. This makes it easier to search for a page within a specific memory
339 #define PAGE_COLORS_MIN 16
341 page_t
****page_freelists
;
342 page_t
***page_cachelists
;
346 * Used by page layer to know about page sizes
348 hw_pagesize_t hw_page_array
[MAX_NUM_LEVEL
+ 1];
350 kmutex_t
*fpc_mutex
[NPC_MUTEX
];
351 kmutex_t
*cpc_mutex
[NPC_MUTEX
];
353 /* Lock to protect mnoderanges array for memory DR operations. */
354 static kmutex_t mnoderange_lock
;
357 * Only let one thread at a time try to coalesce large pages, to
358 * prevent them from working against each other.
360 static kmutex_t contig_lock
;
361 #define CONTIG_LOCK() mutex_enter(&contig_lock);
362 #define CONTIG_UNLOCK() mutex_exit(&contig_lock);
364 #define PFN_16M (mmu_btop((uint64_t)0x1000000))
367 * Return the optimum page size for a given mapping
371 map_pgsz(int maptype
, struct proc
*p
, caddr_t addr
, size_t len
, int memcntl
)
374 size_t pgsz
= MMU_PAGESIZE
;
378 ASSERT(maptype
!= MAPPGSZ_VA
);
380 if (maptype
!= MAPPGSZ_ISM
&& physmem
< privm_lpg_min_physmem
) {
381 return (MMU_PAGESIZE
);
387 max_lpsize
= memcntl
? mcntl0_lpsize
: (maptype
==
388 MAPPGSZ_HEAP
? max_uheap_lpsize
: max_ustack_lpsize
);
389 if (max_lpsize
== MMU_PAGESIZE
) {
390 return (MMU_PAGESIZE
);
393 len
= (maptype
== MAPPGSZ_HEAP
) ? p
->p_brkbase
+
394 p
->p_brksize
- p
->p_bssbase
: p
->p_stksize
;
396 len
= (maptype
== MAPPGSZ_HEAP
) ? MAX(len
,
397 default_uheap_lpsize
) : MAX(len
, default_ustack_lpsize
);
400 * use the pages size that best fits len
402 for (l
= mmu
.umax_page_level
; l
> 0; --l
) {
403 if (LEVEL_SIZE(l
) > max_lpsize
|| len
< LEVEL_SIZE(l
)) {
406 pgsz
= LEVEL_SIZE(l
);
411 mszc
= (maptype
== MAPPGSZ_HEAP
? p
->p_brkpageszc
:
413 if (addr
== 0 && (pgsz
< hw_page_array
[mszc
].hp_size
)) {
414 pgsz
= hw_page_array
[mszc
].hp_size
;
419 for (l
= mmu
.umax_page_level
; l
> 0; --l
) {
420 if (len
>= LEVEL_SIZE(l
))
421 return (LEVEL_SIZE(l
));
423 return (LEVEL_SIZE(0));
429 map_szcvec(caddr_t addr
, size_t size
, uintptr_t off
, size_t max_lpsize
,
432 caddr_t eaddr
= addr
+ size
;
439 if (physmem
< min_physmem
|| max_lpsize
<= MMU_PAGESIZE
) {
443 for (i
= mmu_exported_page_sizes
- 1; i
> 0; i
--) {
444 pgsz
= page_get_pagesize(i
);
445 if (pgsz
> max_lpsize
) {
448 raddr
= (caddr_t
)P2ROUNDUP((uintptr_t)addr
, pgsz
);
449 readdr
= (caddr_t
)P2ALIGN((uintptr_t)eaddr
, pgsz
);
450 if (raddr
< addr
|| raddr
>= readdr
) {
453 if (P2PHASE((uintptr_t)addr
^ off
, pgsz
)) {
457 * Set szcvec to the remaining page sizes.
459 szcvec
= ((1 << (i
+ 1)) - 1) & ~1;
466 * Return a bit vector of large page size codes that
467 * can be used to map [addr, addr + len) region.
471 map_pgszcvec(caddr_t addr
, size_t size
, uintptr_t off
, int flags
, int type
,
474 size_t max_lpsize
= mcntl0_lpsize
;
476 if (mmu
.max_page_level
== 0)
479 if (flags
& MAP_TEXT
) {
481 max_lpsize
= max_utext_lpsize
;
482 return (map_szcvec(addr
, size
, off
, max_lpsize
,
483 shm_lpg_min_physmem
));
485 } else if (flags
& MAP_INITDATA
) {
487 max_lpsize
= max_uidata_lpsize
;
488 return (map_szcvec(addr
, size
, off
, max_lpsize
,
489 privm_lpg_min_physmem
));
491 } else if (type
== MAPPGSZC_SHM
) {
493 max_lpsize
= max_shm_lpsize
;
494 return (map_szcvec(addr
, size
, off
, max_lpsize
,
495 shm_lpg_min_physmem
));
497 } else if (type
== MAPPGSZC_HEAP
) {
499 max_lpsize
= max_uheap_lpsize
;
500 return (map_szcvec(addr
, size
, off
, max_lpsize
,
501 privm_lpg_min_physmem
));
503 } else if (type
== MAPPGSZC_STACK
) {
505 max_lpsize
= max_ustack_lpsize
;
506 return (map_szcvec(addr
, size
, off
, max_lpsize
,
507 privm_lpg_min_physmem
));
511 max_lpsize
= max_privmap_lpsize
;
512 return (map_szcvec(addr
, size
, off
, max_lpsize
,
513 privm_lpg_min_physmem
));
518 * Handle a pagefault.
523 enum fault_type type
,
538 ASSERT_STACK_ALIGNED();
540 if (INVALID_VADDR(addr
))
543 mapped_red
= segkp_map_red();
556 * Dispatch pagefault.
558 res
= as_fault(hat
, as
, addr
, 1, type
, rw
);
561 * If this isn't a potential unmapped hole in the user's
562 * UNIX data or stack segments, just return status info.
564 if (res
!= FC_NOMAP
|| iskernel
)
568 * Check to see if we happened to faulted on a currently unmapped
569 * part of the UNIX data or stack segments. If so, create a zfod
570 * mapping there and then try calling the fault routine again.
575 if (addr
< base
|| addr
>= base
+ len
) { /* data seg? */
576 base
= (caddr_t
)p
->p_usrstack
- p
->p_stksize
;
578 if (addr
< base
|| addr
>= p
->p_usrstack
) { /* stack seg? */
579 /* not in either UNIX data or stack segments */
586 * the rest of this function implements a 3.X 4.X 5.X compatibility
587 * This code is probably not needed anymore
589 if (p
->p_model
== DATAMODEL_ILP32
) {
591 /* expand the gap to the page boundaries on each side */
592 ea
= P2ROUNDUP((uintptr_t)base
+ len
, MMU_PAGESIZE
);
593 base
= (caddr_t
)P2ALIGN((uintptr_t)base
, MMU_PAGESIZE
);
594 len
= ea
- (uintptr_t)base
;
597 if (as_gap(as
, MMU_PAGESIZE
, &base
, &len
, AH_CONTAIN
, addr
) ==
599 err
= as_map(as
, base
, len
, segvn_create
, zfod_argsp
);
602 res
= FC_MAKE_ERR(err
);
607 * This page is already mapped by another thread after
608 * we returned from as_fault() above. We just fall
609 * through as_fault() below.
614 res
= as_fault(hat
, as
, addr
, 1, F_INVAL
, rw
);
625 map_addr(caddr_t
*addrp
, size_t len
, offset_t off
, int vacalign
, uint_t flags
)
627 struct proc
*p
= curproc
;
628 caddr_t userlimit
= (flags
& _MAP_LOW32
) ?
629 (caddr_t
)_userlimit32
: p
->p_as
->a_userlimit
;
631 map_addr_proc(addrp
, len
, off
, vacalign
, userlimit
, curproc
, flags
);
636 map_addr_vacalign_check(caddr_t addr
, u_offset_t off
)
642 * The maximum amount a randomized mapping will be slewed. We should perhaps
643 * arrange things so these tunables can be separate for mmap, mmapobj, and
646 size_t aslr_max_map_skew
= 256 * 1024 * 1024; /* 256MB */
649 * map_addr_proc() is the routine called when the system is to
650 * choose an address for the user. We will pick an address
651 * range which is the highest available below userlimit.
653 * Every mapping will have a redzone of a single page on either side of
654 * the request. This is done to leave one page unmapped between segments.
655 * This is not required, but it's useful for the user because if their
656 * program strays across a segment boundary, it will catch a fault
657 * immediately making debugging a little easier. Currently the redzone
660 * addrp is a value/result parameter.
661 * On input it is a hint from the user to be used in a completely
662 * machine dependent fashion. We decide to completely ignore this hint.
663 * If MAP_ALIGN was specified, addrp contains the minimal alignment, which
664 * must be some "power of two" multiple of pagesize.
666 * On output it is NULL if no address can be found in the current
667 * processes address space or else an address that is currently
668 * not mapped for len bytes with a page of red zone on either side.
670 * vacalign is not needed on x86 (it's for viturally addressed caches)
683 struct as
*as
= p
->p_as
;
689 ASSERT32(userlimit
== as
->a_userlimit
);
694 * XX64 Yes, this needs more work.
696 if (p
->p_model
== DATAMODEL_NATIVE
) {
697 if (userlimit
< as
->a_userlimit
) {
699 * This happens when a program wants to map
700 * something in a range that's accessible to a
701 * program in a smaller address space. For example,
702 * a 64-bit program calling mmap32(2) to guarantee
703 * that the returned address is below 4Gbytes.
705 ASSERT((uintptr_t)userlimit
< ADDRESS_C(0xffffffff));
707 if (userlimit
> base
)
708 slen
= userlimit
- base
;
715 * XX64 This layout is probably wrong .. but in
716 * the event we make the amd64 address space look
717 * like sparcv9 i.e. with the stack -above- the
718 * heap, this bit of code might even be correct.
720 slen
= p
->p_usrstack
- base
-
721 ((p
->p_stk_ctl
+ PAGEOFFSET
) & PAGEMASK
);
725 slen
= userlimit
- base
;
727 /* Make len be a multiple of PAGESIZE */
728 len
= (len
+ PAGEOFFSET
) & PAGEMASK
;
731 * figure out what the alignment should be
733 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
735 if (len
<= ELF_386_MAXPGSZ
) {
737 * Align virtual addresses to ensure that ELF shared libraries
738 * are mapped with the appropriate alignment constraints by
739 * the run-time linker.
741 align_amount
= ELF_386_MAXPGSZ
;
744 * For 32-bit processes, only those which have specified
745 * MAP_ALIGN and an addr will be aligned on a larger page size.
746 * Not doing so can potentially waste up to 1G of process
749 int lvl
= (p
->p_model
== DATAMODEL_ILP32
) ? 1 :
752 while (lvl
&& len
< LEVEL_SIZE(lvl
))
755 align_amount
= LEVEL_SIZE(lvl
);
757 if ((flags
& MAP_ALIGN
) && ((uintptr_t)*addrp
> align_amount
))
758 align_amount
= (uintptr_t)*addrp
;
760 ASSERT(ISP2(align_amount
));
761 ASSERT(align_amount
== 0 || align_amount
>= PAGESIZE
);
763 off
= off
& (align_amount
- 1);
766 * Look for a large enough hole starting below userlimit.
767 * After finding it, use the upper part.
769 if (as_gap_aligned(as
, len
, &base
, &slen
, AH_HI
, NULL
, align_amount
,
770 PAGESIZE
, off
) == 0) {
774 * addr is the highest possible address to use since we have
775 * a PAGESIZE redzone at the beginning and end.
777 addr
= base
+ slen
- (PAGESIZE
+ len
);
780 * Round address DOWN to the alignment amount and
782 * If addr is greater than as_addr, len would not be large
783 * enough to include the redzone, so we must adjust down
784 * by the alignment amount.
786 addr
= (caddr_t
)((uintptr_t)addr
& (~(align_amount
- 1)));
787 addr
+= (uintptr_t)off
;
788 if (addr
> as_addr
) {
789 addr
-= align_amount
;
793 * If randomization is requested, slew the allocation
794 * backwards, within the same gap, by a random amount.
796 if (flags
& _MAP_RANDOMIZE
) {
799 (void) random_get_pseudo_bytes((uint8_t *)&slew
,
802 slew
= slew
% MIN(aslr_max_map_skew
, (addr
- base
));
803 addr
-= P2ALIGN(slew
, align_amount
);
807 ASSERT(addr
+ len
< base
+ slen
);
808 ASSERT(((uintptr_t)addr
& (align_amount
- 1)) ==
812 *addrp
= NULL
; /* no more virtual space */
816 int valid_va_range_aligned_wraparound
;
819 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
820 * addresses at least "minlen" long, where the base of the range is at "off"
821 * phase from an "align" boundary and there is space for a "redzone"-sized
822 * redzone on either side of the range. On success, 1 is returned and *basep
823 * and *lenp are adjusted to describe the acceptable range (including
824 * the redzone). On failure, 0 is returned.
828 valid_va_range_aligned(caddr_t
*basep
, size_t *lenp
, size_t minlen
, int dir
,
829 size_t align
, size_t redzone
, size_t off
)
834 ASSERT(align
== 0 ? off
== 0 : off
< align
);
836 ASSERT(align
== 0 || align
>= PAGESIZE
);
838 lo
= (uintptr_t)*basep
;
840 tot_len
= minlen
+ 2 * redzone
; /* need at least this much space */
843 * If hi rolled over the top, try cutting back.
846 *lenp
= 0UL - lo
- 1UL;
847 /* See if this really happens. If so, then we figure out why */
848 valid_va_range_aligned_wraparound
++;
851 if (*lenp
< tot_len
) {
857 * Deal with a possible hole in the address range between
858 * hole_start and hole_end that should never be mapped.
860 if (lo
< hole_start
) {
861 if (hi
> hole_start
) {
865 /* lo < hole_start && hi >= hole_end */
868 * prefer lowest range
870 if (hole_start
- lo
>= tot_len
)
872 else if (hi
- hole_end
>= tot_len
)
878 * prefer highest range
880 if (hi
- hole_end
>= tot_len
)
882 else if (hole_start
- lo
>= tot_len
)
890 /* lo >= hole_start */
898 if (hi
- lo
< tot_len
)
902 uintptr_t tlo
= lo
+ redzone
;
903 uintptr_t thi
= hi
- redzone
;
904 tlo
= (uintptr_t)P2PHASEUP(tlo
, align
, off
);
905 if (tlo
< lo
+ redzone
) {
908 if (thi
< tlo
|| thi
- tlo
< minlen
) {
913 *basep
= (caddr_t
)lo
;
919 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
920 * addresses at least "minlen" long. On success, 1 is returned and *basep
921 * and *lenp are adjusted to describe the acceptable range. On failure, 0
925 valid_va_range(caddr_t
*basep
, size_t *lenp
, size_t minlen
, int dir
)
927 return (valid_va_range_aligned(basep
, lenp
, minlen
, dir
, 0, 0, 0));
931 * Default to forbidding the first 64k of address space. This protects most
932 * reasonably sized structures from dereferences through NULL:
935 uintptr_t forbidden_null_mapping_sz
= 0x10000;
938 * Determine whether [addr, addr+len] are valid user addresses.
942 valid_usr_range(caddr_t addr
, size_t len
, uint_t prot
, struct as
*as
,
945 caddr_t eaddr
= addr
+ len
;
947 if (eaddr
<= addr
|| addr
>= userlimit
|| eaddr
> userlimit
)
948 return (RANGE_BADADDR
);
950 if ((addr
<= (caddr_t
)forbidden_null_mapping_sz
) &&
951 secflag_enabled(as
->a_proc
, PROC_SEC_FORBIDNULLMAP
))
952 return (RANGE_BADADDR
);
956 * Check for the VA hole
958 if (eaddr
> (caddr_t
)hole_start
&& addr
< (caddr_t
)hole_end
)
959 return (RANGE_BADADDR
);
966 * Return 1 if the page frame is onboard memory, else 0.
969 pf_is_memory(pfn_t pf
)
971 if (pfn_is_foreign(pf
))
973 return (address_in_memlist(phys_install
, pfn_to_pa(pf
), 1));
977 * return the memrange containing pfn
980 memrange_num(pfn_t pfn
)
984 for (n
= 0; n
< nranges
- 1; ++n
) {
985 if (pfn
>= memranges
[n
])
992 * return the mnoderange containing pfn
996 pfn_2_mtype(pfn_t pfn
)
1003 /* Always start from highest pfn and work our way down */
1004 for (n
= mtypetop
; n
!= -1; n
= mnoderanges
[n
].mnr_next
) {
1005 if (pfn
>= mnoderanges
[n
].mnr_pfnlo
) {
1015 * is_contigpage_free:
1016 * returns a page list of contiguous pages. It minimally has to return
1017 * minctg pages. Caller determines minctg based on the scatter-gather
1020 * pfnp is set to the next page frame to search on return.
1033 page_t
*plist
= NULL
;
1036 * fail if pfn + minctg crosses a segment boundary.
1037 * Adjust for next starting pfn to begin at segment boundary.
1040 if (((*pfnp
+ minctg
- 1) & pfnseg
) < (*pfnp
& pfnseg
)) {
1041 *pfnp
= roundup(*pfnp
, pfnseg
+ 1);
1047 pp
= page_numtopp_nolock(pfn
+ i
);
1048 if ((pp
== NULL
) || IS_DUMP_PAGE(pp
) ||
1049 (page_trylock(pp
, SE_EXCL
) == 0)) {
1053 if (page_pptonum(pp
) != pfn
+ i
) {
1058 if (!(PP_ISFREE(pp
))) {
1064 if (!PP_ISAGED(pp
)) {
1065 page_list_sub(pp
, PG_CACHE_LIST
);
1066 page_hashout(pp
, (kmutex_t
*)NULL
);
1068 page_list_sub(pp
, PG_FREE_LIST
);
1073 page_list_concat(&plist
, &pp
);
1076 * exit loop when pgcnt satisfied or segment boundary reached.
1079 } while ((++i
< *pgcnt
) && ((pfn
+ i
) & pfnseg
));
1081 *pfnp
+= i
; /* set to next pfn to search */
1089 * failure: minctg not satisfied.
1091 * if next request crosses segment boundary, set next pfn
1092 * to search from the segment boundary.
1094 if (((*pfnp
+ minctg
- 1) & pfnseg
) < (*pfnp
& pfnseg
))
1095 *pfnp
= roundup(*pfnp
, pfnseg
+ 1);
1097 /* clean up any pages already allocated */
1101 page_sub(&plist
, pp
);
1102 page_list_add(pp
, PG_FREE_LIST
| PG_LIST_TAIL
);
1113 * verify that pages being returned from allocator have correct DMA attribute
1116 #define check_dma(a, b, c) (void)(0)
1119 check_dma(ddi_dma_attr_t
*dma_attr
, page_t
*pp
, int cnt
)
1121 if (dma_attr
== NULL
)
1125 if (pa_to_ma(pfn_to_pa(pp
->p_pagenum
)) <
1126 dma_attr
->dma_attr_addr_lo
)
1127 panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp
);
1128 if (pa_to_ma(pfn_to_pa(pp
->p_pagenum
)) >=
1129 dma_attr
->dma_attr_addr_hi
)
1130 panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp
);
1138 page_get_contigpage(pgcnt_t
*pgcnt
, ddi_dma_attr_t
*mattr
, int iolock
)
1144 page_t
*pplist
= NULL
, *plist
;
1146 pgcnt_t pfnalign
= 0;
1147 static pfn_t startpfn
;
1148 static pgcnt_t lastctgcnt
;
1154 lo
= mmu_btop((mattr
->dma_attr_addr_lo
+ MMU_PAGEOFFSET
));
1155 hi
= mmu_btop(mattr
->dma_attr_addr_hi
);
1158 sgllen
= mattr
->dma_attr_sgllen
;
1159 pfnseg
= mmu_btop(mattr
->dma_attr_seg
);
1161 align
= maxbit(mattr
->dma_attr_align
, mattr
->dma_attr_minxfer
);
1162 if (align
> MMU_PAGESIZE
)
1163 pfnalign
= mmu_btop(align
);
1166 * in order to satisfy the request, must minimally
1167 * acquire minctg contiguous pages
1169 minctg
= howmany(*pgcnt
, sgllen
);
1174 * start from where last searched if the minctg >= lastctgcnt
1176 if (minctg
< lastctgcnt
|| startpfn
< lo
|| startpfn
> hi
)
1182 pfnseg
= mmu
.highest_pfn
;
1185 if (minctg
< lastctgcnt
)
1188 lastctgcnt
= minctg
;
1190 ASSERT(pfnseg
+ 1 >= (uint64_t)minctg
);
1192 /* conserve 16m memory - start search above 16m when possible */
1193 if (hi
> PFN_16M
&& startpfn
< PFN_16M
)
1198 pfn
= P2ROUNDUP(pfn
, pfnalign
);
1200 while (pfn
+ minctg
- 1 <= hi
) {
1202 plist
= is_contigpage_free(&pfn
, pgcnt
, minctg
, pfnseg
, iolock
);
1204 page_list_concat(&pplist
, &plist
);
1207 * return when contig pages no longer needed
1209 if (!*pgcnt
|| ((*pgcnt
<= sgllen
) && !pfnalign
)) {
1212 check_dma(mattr
, pplist
, *pgcnt
);
1215 minctg
= howmany(*pgcnt
, sgllen
);
1218 pfn
= P2ROUNDUP(pfn
, pfnalign
);
1221 /* cannot find contig pages in specified range */
1222 if (startpfn
== lo
) {
1227 /* did not start with lo previously */
1230 pfn
= P2ROUNDUP(pfn
, pfnalign
);
1232 /* allow search to go above startpfn */
1233 while (pfn
< startpfn
) {
1235 plist
= is_contigpage_free(&pfn
, pgcnt
, minctg
, pfnseg
, iolock
);
1236 if (plist
!= NULL
) {
1238 page_list_concat(&pplist
, &plist
);
1242 * return when contig pages no longer needed
1244 if (!*pgcnt
|| ((*pgcnt
<= sgllen
) && !pfnalign
)) {
1247 check_dma(mattr
, pplist
, *pgcnt
);
1250 minctg
= howmany(*pgcnt
, sgllen
);
1253 pfn
= P2ROUNDUP(pfn
, pfnalign
);
1261 * mnode_range_cnt() calculates the number of memory ranges for mnode and
1262 * memranges[]. Used to determine the size of page lists and mnoderanges.
1265 mnode_range_cnt(int mnode
)
1274 if (mem_node_config
[mnode
].exists
!= 0) {
1277 /* find the memranges index below contained in mnode range */
1279 while (MEMRANGEHI(mri
) < mem_node_config
[mnode
].physbase
)
1283 * increment mnode range counter when memranges or mnode
1284 * boundary is reached.
1287 mem_node_config
[mnode
].physmax
>= MEMRANGELO(mri
)) {
1289 if (mem_node_config
[mnode
].physmax
> MEMRANGEHI(mri
))
1295 ASSERT(mnrcnt
<= MAX_MNODE_MRANGES
);
1301 * mnode_range_setup() initializes mnoderanges.
1304 mnode_range_setup(mnoderange_t
*mnoderanges
)
1306 mnoderange_t
*mp
= mnoderanges
;
1308 int mindex
= 0; /* current index into mnoderanges array */
1313 for (mnode
= 0; mnode
< max_mem_nodes
; mnode
++) {
1314 if (mem_node_config
[mnode
].exists
== 0)
1319 while (MEMRANGEHI(mri
) < mem_node_config
[mnode
].physbase
)
1322 while (mri
>= 0 && mem_node_config
[mnode
].physmax
>=
1324 mnoderanges
->mnr_pfnlo
= MAX(MEMRANGELO(mri
),
1325 mem_node_config
[mnode
].physbase
);
1326 mnoderanges
->mnr_pfnhi
= MIN(MEMRANGEHI(mri
),
1327 mem_node_config
[mnode
].physmax
);
1328 mnoderanges
->mnr_mnode
= mnode
;
1329 mnoderanges
->mnr_memrange
= mri
;
1330 mnoderanges
->mnr_exists
= 1;
1333 if (mem_node_config
[mnode
].physmax
> MEMRANGEHI(mri
))
1341 * For now do a simple sort of the mnoderanges array to fill in
1342 * the mnr_next fields. Since mindex is expected to be relatively
1343 * small, using a simple O(N^2) algorithm.
1345 for (i
= 0; i
< mindex
; i
++) {
1346 if (mp
[i
].mnr_pfnlo
== 0) /* find lowest */
1352 mp
[last
].mnr_next
= -1;
1353 for (i
= 0; i
< mindex
- 1; i
++) {
1354 hipfn
= (pfn_t
)(-1);
1356 /* find next highest mnode range */
1357 for (j
= 0; j
< mindex
; j
++) {
1358 if (mp
[j
].mnr_pfnlo
> mp
[last
].mnr_pfnlo
&&
1359 mp
[j
].mnr_pfnlo
< hipfn
) {
1360 hipfn
= mp
[j
].mnr_pfnlo
;
1364 mp
[hi
].mnr_next
= last
;
1372 * Update mnoderanges for memory hot-add DR operations.
1375 mnode_range_add(int mnode
)
1380 extern void membar_sync(void);
1382 ASSERT(0 <= mnode
&& mnode
< max_mem_nodes
);
1383 ASSERT(mem_node_config
[mnode
].exists
);
1384 start
= mem_node_config
[mnode
].physbase
;
1385 end
= mem_node_config
[mnode
].physmax
;
1386 ASSERT(start
<= end
);
1387 mutex_enter(&mnoderange_lock
);
1390 /* Check whether it interleaves with other memory nodes. */
1391 for (n
= mtypetop
; n
!= -1; n
= mnoderanges
[n
].mnr_next
) {
1392 ASSERT(mnoderanges
[n
].mnr_exists
);
1393 if (mnoderanges
[n
].mnr_mnode
== mnode
)
1395 ASSERT(start
> mnoderanges
[n
].mnr_pfnhi
||
1396 end
< mnoderanges
[n
].mnr_pfnlo
);
1401 while (MEMRANGEHI(mri
) < mem_node_config
[mnode
].physbase
)
1403 while (mri
>= 0 && mem_node_config
[mnode
].physmax
>= MEMRANGELO(mri
)) {
1404 /* Check whether mtype already exists. */
1405 for (n
= mtypetop
; n
!= -1; n
= mnoderanges
[n
].mnr_next
) {
1406 if (mnoderanges
[n
].mnr_mnode
== mnode
&&
1407 mnoderanges
[n
].mnr_memrange
== mri
) {
1408 mnoderanges
[n
].mnr_pfnlo
= MAX(MEMRANGELO(mri
),
1410 mnoderanges
[n
].mnr_pfnhi
= MIN(MEMRANGEHI(mri
),
1416 /* Add a new entry if it doesn't exist yet. */
1418 /* Try to find an unused entry in mnoderanges array. */
1419 for (n
= 0; n
< mnoderangecnt
; n
++) {
1420 if (mnoderanges
[n
].mnr_exists
== 0)
1423 ASSERT(n
< mnoderangecnt
);
1424 mnoderanges
[n
].mnr_pfnlo
= MAX(MEMRANGELO(mri
), start
);
1425 mnoderanges
[n
].mnr_pfnhi
= MIN(MEMRANGEHI(mri
), end
);
1426 mnoderanges
[n
].mnr_mnode
= mnode
;
1427 mnoderanges
[n
].mnr_memrange
= mri
;
1428 mnoderanges
[n
].mnr_exists
= 1;
1429 /* Page 0 should always be present. */
1430 for (prev
= &mtypetop
;
1431 mnoderanges
[*prev
].mnr_pfnlo
> start
;
1432 prev
= &mnoderanges
[*prev
].mnr_next
) {
1433 ASSERT(mnoderanges
[*prev
].mnr_next
>= 0);
1434 ASSERT(mnoderanges
[*prev
].mnr_pfnlo
> end
);
1436 mnoderanges
[n
].mnr_next
= *prev
;
1441 if (mem_node_config
[mnode
].physmax
> MEMRANGEHI(mri
))
1447 mutex_exit(&mnoderange_lock
);
1451 * Update mnoderanges for memory hot-removal DR operations.
1454 mnode_range_del(int mnode
)
1456 _NOTE(ARGUNUSED(mnode
));
1457 ASSERT(0 <= mnode
&& mnode
< max_mem_nodes
);
1458 /* TODO: support deletion operation. */
1463 plat_slice_add(pfn_t start
, pfn_t end
)
1465 mem_node_add_slice(start
, end
);
1466 if (plat_dr_enabled()) {
1467 mnode_range_add(PFN_2_MEM_NODE(start
));
1472 plat_slice_del(pfn_t start
, pfn_t end
)
1474 ASSERT(PFN_2_MEM_NODE(start
) == PFN_2_MEM_NODE(end
));
1475 ASSERT(plat_dr_enabled());
1476 mnode_range_del(PFN_2_MEM_NODE(start
));
1477 mem_node_del_slice(start
, end
);
1483 mtype_init(vnode_t
*vp
, caddr_t vaddr
, uint_t
*flags
, size_t pgsz
)
1485 int mtype
= mtypetop
;
1490 * set the mtype range
1491 * - kmem requests need to be below 4g if restricted_kmemalloc is set.
1492 * - for non kmem requests, set range to above 4g if memory below 4g
1495 if (restricted_kmemalloc
&& VN_ISKAS(vp
) &&
1496 (caddr_t
)(vaddr
) >= kernelheap
&&
1497 (caddr_t
)(vaddr
) < ekernelheap
) {
1500 if (RESTRICT16M_ALLOC(freemem4g
- btop(pgsz
),
1501 btop(pgsz
), *flags
)) {
1502 *flags
|= PGI_MT_RANGE16M
;
1504 VM_STAT_ADD(vmm_vmstats
.unrestrict16mcnt
);
1505 VM_STAT_COND_ADD((*flags
& PG_PANIC
),
1506 vmm_vmstats
.pgpanicalloc
);
1507 *flags
|= PGI_MT_RANGE0
;
1513 if (RESTRICT4G_ALLOC
) {
1514 VM_STAT_ADD(vmm_vmstats
.restrict4gcnt
);
1515 /* here only for > 4g systems */
1516 *flags
|= PGI_MT_RANGE4G
;
1517 } else if (RESTRICT16M_ALLOC(freemem
, btop(pgsz
), *flags
)) {
1518 *flags
|= PGI_MT_RANGE16M
;
1520 VM_STAT_ADD(vmm_vmstats
.unrestrict16mcnt
);
1521 VM_STAT_COND_ADD((*flags
& PG_PANIC
), vmm_vmstats
.pgpanicalloc
);
1522 *flags
|= PGI_MT_RANGE0
;
1529 /* mtype init for page_get_replacement_page */
1532 mtype_pgr_init(int *flags
, page_t
*pp
, int mnode
, pgcnt_t pgcnt
)
1534 int mtype
= mtypetop
;
1536 if (RESTRICT16M_ALLOC(freemem
, pgcnt
, *flags
)) {
1537 *flags
|= PGI_MT_RANGE16M
;
1539 VM_STAT_ADD(vmm_vmstats
.unrestrict16mcnt
);
1540 *flags
|= PGI_MT_RANGE0
;
1547 * Determine if the mnode range specified in mtype contains memory belonging
1548 * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains
1549 * the range from high pfn to 0, 16m or 4g.
1551 * Return first mnode range type index found otherwise return -1 if none found.
1554 mtype_func(int mnode
, int mtype
, uint_t flags
)
1556 if (flags
& PGI_MT_RANGE
) {
1557 int mnr_lim
= MRI_0
;
1559 if (flags
& PGI_MT_NEXT
) {
1560 mtype
= mnoderanges
[mtype
].mnr_next
;
1562 if (flags
& PGI_MT_RANGE4G
)
1563 mnr_lim
= MRI_4G
; /* exclude 0-4g range */
1564 else if (flags
& PGI_MT_RANGE16M
)
1565 mnr_lim
= MRI_16M
; /* exclude 0-16m range */
1566 while (mtype
!= -1 &&
1567 mnoderanges
[mtype
].mnr_memrange
<= mnr_lim
) {
1568 if (mnoderanges
[mtype
].mnr_mnode
== mnode
)
1570 mtype
= mnoderanges
[mtype
].mnr_next
;
1572 } else if (mnoderanges
[mtype
].mnr_mnode
== mnode
) {
1579 * Update the page list max counts with the pfn range specified by the
1583 mtype_modify_max(pfn_t startpfn
, long cnt
)
1587 spgcnt_t scnt
= (spgcnt_t
)(cnt
);
1588 pgcnt_t acnt
= ABS(scnt
);
1589 pfn_t endpfn
= startpfn
+ acnt
;
1596 for (pfn
= endpfn
; pfn
> startpfn
; ) {
1597 ASSERT(mtype
!= -1);
1598 lo
= mnoderanges
[mtype
].mnr_pfnlo
;
1600 if (startpfn
>= lo
) {
1601 inc
= pfn
- startpfn
;
1605 if (mnoderanges
[mtype
].mnr_memrange
!= MRI_4G
) {
1613 mtype
= mnoderanges
[mtype
].mnr_next
;
1618 mtype_2_mrange(int mtype
)
1620 return (mnoderanges
[mtype
].mnr_memrange
);
1624 mnodetype_2_pfn(int mnode
, int mtype
, pfn_t
*pfnlo
, pfn_t
*pfnhi
)
1626 _NOTE(ARGUNUSED(mnode
));
1627 ASSERT(mnoderanges
[mtype
].mnr_mnode
== mnode
);
1628 *pfnlo
= mnoderanges
[mtype
].mnr_pfnlo
;
1629 *pfnhi
= mnoderanges
[mtype
].mnr_pfnhi
;
1633 plcnt_sz(size_t ctrs_sz
)
1638 ctrs_sz
+= mnoderangecnt
* sizeof (struct mnr_mts
) * mmu_page_sizes
;
1639 for (szc
= 0; szc
< mmu_page_sizes
; szc
++) {
1640 colors
= page_get_pagecolors(szc
);
1641 ctrs_sz
+= mnoderangecnt
* sizeof (pgcnt_t
) * colors
;
1648 plcnt_init(caddr_t addr
)
1651 int mt
, szc
, colors
;
1653 for (mt
= 0; mt
< mnoderangecnt
; mt
++) {
1654 mnoderanges
[mt
].mnr_mts
= (struct mnr_mts
*)addr
;
1655 addr
+= (sizeof (struct mnr_mts
) * mmu_page_sizes
);
1656 for (szc
= 0; szc
< mmu_page_sizes
; szc
++) {
1657 colors
= page_get_pagecolors(szc
);
1658 mnoderanges
[mt
].mnr_mts
[szc
].mnr_mts_colors
= colors
;
1659 mnoderanges
[mt
].mnr_mts
[szc
].mnr_mtsc_pgcnt
=
1661 addr
+= (sizeof (pgcnt_t
) * colors
);
1669 plcnt_inc_dec(page_t
*pp
, int mtype
, int szc
, long cnt
, int flags
)
1671 _NOTE(ARGUNUSED(pp
));
1673 int bin
= PP_2_BIN(pp
);
1675 atomic_add_long(&mnoderanges
[mtype
].mnr_mts
[szc
].mnr_mts_pgcnt
, cnt
);
1676 atomic_add_long(&mnoderanges
[mtype
].mnr_mts
[szc
].mnr_mtsc_pgcnt
[bin
],
1679 ASSERT(mtype
== PP_2_MTYPE(pp
));
1680 if (physmax4g
&& mnoderanges
[mtype
].mnr_memrange
!= MRI_4G
)
1681 atomic_add_long(&freemem4g
, cnt
);
1682 if (flags
& PG_CACHE_LIST
)
1683 atomic_add_long(&mnoderanges
[mtype
].mnr_mt_clpgcnt
, cnt
);
1685 atomic_add_long(&mnoderanges
[mtype
].mnr_mt_flpgcnt
[szc
], cnt
);
1686 atomic_add_long(&mnoderanges
[mtype
].mnr_mt_totcnt
, cnt
);
1690 * Returns the free page count for mnode
1693 mnode_pgcnt(int mnode
)
1695 int mtype
= mtypetop
;
1696 int flags
= PGI_MT_RANGE0
;
1699 mtype
= mtype_func(mnode
, mtype
, flags
);
1701 while (mtype
!= -1) {
1702 pgcnt
+= MTYPE_FREEMEM(mtype
);
1703 mtype
= mtype_func(mnode
, mtype
, flags
| PGI_MT_NEXT
);
1709 * Initialize page coloring variables based on the l2 cache parameters.
1710 * Calculate and return memory needed for page coloring data structures.
1713 page_coloring_init(uint_t l2_sz
, int l2_linesz
, int l2_assoc
)
1715 _NOTE(ARGUNUSED(l2_linesz
));
1722 * Hypervisor domains currently don't have any concept of NUMA.
1723 * Hence we'll act like there is only 1 memrange.
1725 i
= memrange_num(1);
1728 * Reduce the memory ranges lists if we don't have large amounts
1729 * of memory. This avoids searching known empty free lists.
1730 * To support memory DR operations, we need to keep memory ranges
1731 * for possible memory hot-add operations.
1733 if (plat_dr_physmax
> physmax
)
1734 i
= memrange_num(plat_dr_physmax
);
1736 i
= memrange_num(physmax
);
1739 restricted_kmemalloc
= 0;
1741 /* physmax greater than 4g */
1748 ASSERT(mmu_page_sizes
<= MMU_PAGE_SIZES
);
1750 ASSERT(ISP2(l2_linesz
));
1751 ASSERT(l2_sz
> MMU_PAGESIZE
);
1753 /* l2_assoc is 0 for fully associative l2 cache */
1755 l2_colors
= MAX(1, l2_sz
/ (l2_assoc
* MMU_PAGESIZE
));
1759 ASSERT(ISP2(l2_colors
));
1761 /* for scalability, configure at least PAGE_COLORS_MIN color bins */
1762 page_colors
= MAX(l2_colors
, PAGE_COLORS_MIN
);
1765 * cpu_page_colors is non-zero when a page color may be spread across
1768 if (l2_colors
< page_colors
)
1769 cpu_page_colors
= l2_colors
;
1771 ASSERT(ISP2(page_colors
));
1773 page_colors_mask
= page_colors
- 1;
1775 ASSERT(ISP2(CPUSETSIZE()));
1776 page_coloring_shift
= lowbit(CPUSETSIZE());
1778 /* initialize number of colors per page size */
1779 for (i
= 0; i
<= mmu
.max_page_level
; i
++) {
1780 hw_page_array
[i
].hp_size
= LEVEL_SIZE(i
);
1781 hw_page_array
[i
].hp_shift
= LEVEL_SHIFT(i
);
1782 hw_page_array
[i
].hp_pgcnt
= LEVEL_SIZE(i
) >> LEVEL_SHIFT(0);
1783 hw_page_array
[i
].hp_colors
= (page_colors_mask
>>
1784 (hw_page_array
[i
].hp_shift
- hw_page_array
[0].hp_shift
))
1786 colorequivszc
[i
] = 0;
1790 * The value of cpu_page_colors determines if additional color bins
1791 * need to be checked for a particular color in the page_get routines.
1793 if (cpu_page_colors
!= 0) {
1795 int a
= lowbit(page_colors
) - lowbit(cpu_page_colors
);
1799 for (i
= 0; i
<= mmu
.max_page_level
; i
++) {
1800 if ((colors
= hw_page_array
[i
].hp_colors
) <= 1) {
1801 colorequivszc
[i
] = 0;
1804 while ((colors
>> a
) == 0)
1808 /* higher 4 bits encodes color equiv mask */
1809 colorequivszc
[i
] = (a
<< 4);
1813 /* factor in colorequiv to check additional 'equivalent' bins. */
1814 if (colorequiv
> 1) {
1816 int a
= lowbit(colorequiv
) - 1;
1820 for (i
= 0; i
<= mmu
.max_page_level
; i
++) {
1821 if ((colors
= hw_page_array
[i
].hp_colors
) <= 1) {
1824 while ((colors
>> a
) == 0)
1826 if ((a
<< 4) > colorequivszc
[i
]) {
1827 colorequivszc
[i
] = (a
<< 4);
1832 /* size for mnoderanges */
1833 for (mnoderangecnt
= 0, i
= 0; i
< max_mem_nodes
; i
++)
1834 mnoderangecnt
+= mnode_range_cnt(i
);
1835 if (plat_dr_support_memory()) {
1837 * Reserve enough space for memory DR operations.
1838 * Two extra mnoderanges for possbile fragmentations,
1839 * one for the 2G boundary and the other for the 4G boundary.
1840 * We don't expect a memory board crossing the 16M boundary
1841 * for memory hot-add operations on x86 platforms.
1843 mnoderangecnt
+= 2 + max_mem_nodes
- lgrp_plat_node_cnt
;
1845 colorsz
= mnoderangecnt
* sizeof (mnoderange_t
);
1847 /* size for fpc_mutex and cpc_mutex */
1848 colorsz
+= (2 * max_mem_nodes
* sizeof (kmutex_t
) * NPC_MUTEX
);
1850 /* size of page_freelists */
1851 colorsz
+= mnoderangecnt
* sizeof (page_t
***);
1852 colorsz
+= mnoderangecnt
* mmu_page_sizes
* sizeof (page_t
**);
1854 for (i
= 0; i
< mmu_page_sizes
; i
++) {
1855 colors
= page_get_pagecolors(i
);
1856 colorsz
+= mnoderangecnt
* colors
* sizeof (page_t
*);
1859 /* size of page_cachelists */
1860 colorsz
+= mnoderangecnt
* sizeof (page_t
**);
1861 colorsz
+= mnoderangecnt
* page_colors
* sizeof (page_t
*);
1867 * Called once at startup to configure page_coloring data structures and
1868 * does the 1st page_free()/page_freelist_add().
1871 page_coloring_setup(caddr_t pcmemaddr
)
1880 * do page coloring setup
1884 mnoderanges
= (mnoderange_t
*)addr
;
1885 addr
+= (mnoderangecnt
* sizeof (mnoderange_t
));
1887 mnode_range_setup(mnoderanges
);
1890 mtype4g
= pfn_2_mtype(0xfffff);
1892 for (k
= 0; k
< NPC_MUTEX
; k
++) {
1893 fpc_mutex
[k
] = (kmutex_t
*)addr
;
1894 addr
+= (max_mem_nodes
* sizeof (kmutex_t
));
1896 for (k
= 0; k
< NPC_MUTEX
; k
++) {
1897 cpc_mutex
[k
] = (kmutex_t
*)addr
;
1898 addr
+= (max_mem_nodes
* sizeof (kmutex_t
));
1900 page_freelists
= (page_t
****)addr
;
1901 addr
+= (mnoderangecnt
* sizeof (page_t
***));
1903 page_cachelists
= (page_t
***)addr
;
1904 addr
+= (mnoderangecnt
* sizeof (page_t
**));
1906 for (i
= 0; i
< mnoderangecnt
; i
++) {
1907 page_freelists
[i
] = (page_t
***)addr
;
1908 addr
+= (mmu_page_sizes
* sizeof (page_t
**));
1910 for (j
= 0; j
< mmu_page_sizes
; j
++) {
1911 colors
= page_get_pagecolors(j
);
1912 page_freelists
[i
][j
] = (page_t
**)addr
;
1913 addr
+= (colors
* sizeof (page_t
*));
1915 page_cachelists
[i
] = (page_t
**)addr
;
1916 addr
+= (page_colors
* sizeof (page_t
*));
1922 * Give back 10% of the io_pool pages to the free list.
1923 * Don't shrink the pool below some absolute minimum.
1926 page_io_pool_shrink()
1929 page_t
*pp
, *pp_first
, *pp_last
, **curpool
;
1933 mutex_enter(&io_pool_lock
);
1934 io_pool_shrink_attempts
++; /* should be a kstat? */
1935 retcnt
= io_pool_cnt
/ 10;
1936 if (io_pool_cnt
- retcnt
< io_pool_cnt_min
)
1937 retcnt
= io_pool_cnt
- io_pool_cnt_min
;
1940 io_pool_shrinks
++; /* should be a kstat? */
1941 curpool
= &io_pool_4g
;
1944 * Loop through taking pages from the end of the list
1945 * (highest mfns) till amount to return reached.
1947 for (pp
= *curpool
; pp
&& retcnt
> 0; ) {
1948 pp_first
= pp_last
= pp
->p_prev
;
1949 if (pp_first
== *curpool
)
1953 page_io_pool_sub(curpool
, pp_first
, pp_last
);
1954 if ((mfn
= pfn_to_mfn(pp
->p_pagenum
)) < start_mfn
)
1956 page_free(pp_first
, 1);
1959 if (retcnt
!= 0 && !bothpools
) {
1961 * If not enough found in less constrained pool try the
1962 * more constrained one.
1964 curpool
= &io_pool_16m
;
1969 mutex_exit(&io_pool_lock
);
1975 page_create_update_flags_x86(uint_t flags
)
1979 * Check this is an urgent allocation and free pages are depleted.
1981 if (!(flags
& PG_WAIT
) && freemem
< desfree
)
1982 page_io_pool_shrink();
1985 * page_create_get_something may call this because 4g memory may be
1986 * depleted. Set flags to allow for relocation of base page below
1990 flags
|= (PGI_PGCPSZC0
| PGI_PGCPHIPRI
);
1997 bp_color(struct buf
*bp
)
2005 * Take pages out of an io_pool
2008 page_io_pool_sub(page_t
**poolp
, page_t
*pp_first
, page_t
*pp_last
)
2010 if (*poolp
== pp_first
) {
2011 *poolp
= pp_last
->p_next
;
2012 if (*poolp
== pp_first
)
2015 pp_first
->p_prev
->p_next
= pp_last
->p_next
;
2016 pp_last
->p_next
->p_prev
= pp_first
->p_prev
;
2017 pp_first
->p_prev
= pp_last
;
2018 pp_last
->p_next
= pp_first
;
2022 * Put a page on the io_pool list. The list is ordered by increasing MFN.
2025 page_io_pool_add(page_t
**poolp
, page_t
*pp
)
2028 mfn_t mfn
= mfn_list
[pp
->p_pagenum
];
2030 if (*poolp
== NULL
) {
2038 * Since we try to take pages from the high end of the pool
2039 * chances are good that the pages to be put on the list will
2040 * go at or near the end of the list. so start at the end and
2043 look
= (*poolp
)->p_prev
;
2044 while (mfn
< mfn_list
[look
->p_pagenum
]) {
2045 look
= look
->p_prev
;
2046 if (look
== (*poolp
)->p_prev
)
2047 break; /* backed all the way to front of list */
2050 /* insert after look */
2052 pp
->p_next
= look
->p_next
;
2053 pp
->p_next
->p_prev
= pp
;
2055 if (mfn
< mfn_list
[(*poolp
)->p_pagenum
]) {
2057 * we inserted a new first list element
2058 * adjust pool pointer to newly inserted element
2065 * Add a page to the io_pool. Setting the force flag will force the page
2066 * into the io_pool no matter what.
2069 add_page_to_pool(page_t
*pp
, int force
)
2072 page_t
*freep
= NULL
;
2074 mutex_enter(&io_pool_lock
);
2076 * Always keep the scarce low memory pages
2078 if (mfn_list
[pp
->p_pagenum
] < PFN_16MEG
) {
2080 page_io_pool_add(&io_pool_16m
, pp
);
2083 if (io_pool_cnt
< io_pool_cnt_max
|| force
|| io_pool_4g
== NULL
) {
2085 page_io_pool_add(&io_pool_4g
, pp
);
2087 highest
= io_pool_4g
->p_prev
;
2088 if (mfn_list
[pp
->p_pagenum
] < mfn_list
[highest
->p_pagenum
]) {
2089 page_io_pool_sub(&io_pool_4g
, highest
, highest
);
2090 page_io_pool_add(&io_pool_4g
, pp
);
2097 mutex_exit(&io_pool_lock
);
2099 page_free(freep
, 1);
2103 int contig_pfn_cnt
; /* no of pfns in the contig pfn list */
2104 int contig_pfn_max
; /* capacity of the contig pfn list */
2105 int next_alloc_pfn
; /* next position in list to start a contig search */
2106 int contig_pfnlist_updates
; /* pfn list update count */
2107 int contig_pfnlist_builds
; /* how many times have we (re)built list */
2108 int contig_pfnlist_buildfailed
; /* how many times has list build failed */
2109 int create_contig_pending
; /* nonzero means taskq creating contig list */
2110 pfn_t
*contig_pfn_list
= NULL
; /* list of contig pfns in ascending mfn order */
2113 * Function to use in sorting a list of pfns by their underlying mfns.
2116 mfn_compare(const void *pfnp1
, const void *pfnp2
)
2118 mfn_t mfn1
= mfn_list
[*(pfn_t
*)pfnp1
];
2119 mfn_t mfn2
= mfn_list
[*(pfn_t
*)pfnp2
];
2129 * Compact the contig_pfn_list by tossing all the non-contiguous
2130 * elements from the list.
2133 compact_contig_pfn_list(void)
2135 pfn_t pfn
, lapfn
, prev_lapfn
;
2140 for (i
= 0; i
< contig_pfn_cnt
- 1; i
++) {
2141 pfn
= contig_pfn_list
[i
];
2142 lapfn
= contig_pfn_list
[i
+ 1];
2143 mfn
= mfn_list
[pfn
];
2145 * See if next pfn is for a contig mfn
2147 if (mfn_list
[lapfn
] != mfn
+ 1)
2150 * pfn and lookahead are both put in list
2151 * unless pfn is the previous lookahead.
2153 if (pfn
!= prev_lapfn
)
2154 contig_pfn_list
[newcnt
++] = pfn
;
2155 contig_pfn_list
[newcnt
++] = lapfn
;
2158 for (i
= newcnt
; i
< contig_pfn_cnt
; i
++)
2159 contig_pfn_list
[i
] = 0;
2160 contig_pfn_cnt
= newcnt
;
2165 call_create_contiglist(void *arg
)
2167 (void) create_contig_pfnlist(PG_WAIT
);
2171 * Create list of freelist pfns that have underlying
2172 * contiguous mfns. The list is kept in ascending mfn order.
2173 * returns 1 if list created else 0.
2176 create_contig_pfnlist(uint_t flags
)
2182 mutex_enter(&contig_list_lock
);
2183 if (contig_pfn_list
!= NULL
)
2185 contig_pfn_max
= freemem
+ (freemem
/ 10);
2186 contig_pfn_list
= kmem_zalloc(contig_pfn_max
* sizeof (pfn_t
),
2187 (flags
& PG_WAIT
) ? KM_SLEEP
: KM_NOSLEEP
);
2188 if (contig_pfn_list
== NULL
) {
2190 * If we could not create the contig list (because
2191 * we could not sleep for memory). Dispatch a taskq that can
2192 * sleep to get the memory.
2194 if (!create_contig_pending
) {
2195 if (taskq_dispatch(system_taskq
, call_create_contiglist
,
2196 NULL
, TQ_NOSLEEP
) != NULL
)
2197 create_contig_pending
= 1;
2199 contig_pfnlist_buildfailed
++; /* count list build failures */
2203 create_contig_pending
= 0;
2204 ASSERT(contig_pfn_cnt
== 0);
2205 for (pfn
= 0; pfn
< mfn_count
; pfn
++) {
2206 pp
= page_numtopp_nolock(pfn
);
2207 if (pp
== NULL
|| !PP_ISFREE(pp
))
2209 contig_pfn_list
[contig_pfn_cnt
] = pfn
;
2210 if (++contig_pfn_cnt
== contig_pfn_max
)
2214 * Sanity check the new list.
2216 if (contig_pfn_cnt
< 2) { /* no contig pfns */
2218 contig_pfnlist_buildfailed
++;
2219 kmem_free(contig_pfn_list
, contig_pfn_max
* sizeof (pfn_t
));
2220 contig_pfn_list
= NULL
;
2225 qsort(contig_pfn_list
, contig_pfn_cnt
, sizeof (pfn_t
), mfn_compare
);
2226 compact_contig_pfn_list();
2228 * Make sure next search of the newly created contiguous pfn
2229 * list starts at the beginning of the list.
2232 contig_pfnlist_builds
++; /* count list builds */
2234 mutex_exit(&contig_list_lock
);
2240 * Toss the current contig pfnlist. Someone is about to do a massive
2241 * update to pfn<->mfn mappings. So we have them destroy the list and lock
2242 * it till they are done with their update.
2245 clear_and_lock_contig_pfnlist()
2247 pfn_t
*listp
= NULL
;
2250 mutex_enter(&contig_list_lock
);
2251 if (contig_pfn_list
!= NULL
) {
2252 listp
= contig_pfn_list
;
2253 listsize
= contig_pfn_max
* sizeof (pfn_t
);
2254 contig_pfn_list
= NULL
;
2255 contig_pfn_max
= contig_pfn_cnt
= 0;
2258 kmem_free(listp
, listsize
);
2262 * Unlock the contig_pfn_list. The next attempted use of it will cause
2263 * it to be re-created.
2266 unlock_contig_pfnlist()
2268 mutex_exit(&contig_list_lock
);
2272 * Update the contiguous pfn list in response to a pfn <-> mfn reassignment
2275 update_contig_pfnlist(pfn_t pfn
, mfn_t oldmfn
, mfn_t newmfn
)
2277 int probe_hi
, probe_lo
, probe_pos
, insert_after
, insert_point
;
2282 if (mutex_owner(&contig_list_lock
) != curthread
) {
2284 mutex_enter(&contig_list_lock
);
2286 if (contig_pfn_list
== NULL
)
2288 contig_pfnlist_updates
++;
2290 * Find the pfn in the current list. Use a binary chop to locate it.
2292 probe_hi
= contig_pfn_cnt
- 1;
2294 probe_pos
= (probe_hi
+ probe_lo
) / 2;
2295 while ((probe_pfn
= contig_pfn_list
[probe_pos
]) != pfn
) {
2296 if (probe_pos
== probe_lo
) { /* pfn not in list */
2300 if (pfn_to_mfn(probe_pfn
) <= oldmfn
)
2301 probe_lo
= probe_pos
;
2303 probe_hi
= probe_pos
;
2304 probe_pos
= (probe_hi
+ probe_lo
) / 2;
2306 if (probe_pos
>= 0) {
2308 * Remove pfn from list and ensure next alloc
2309 * position stays in bounds.
2311 if (--contig_pfn_cnt
<= next_alloc_pfn
)
2313 if (contig_pfn_cnt
< 2) { /* no contig pfns */
2315 kmem_free(contig_pfn_list
,
2316 contig_pfn_max
* sizeof (pfn_t
));
2317 contig_pfn_list
= NULL
;
2321 ovbcopy(&contig_pfn_list
[probe_pos
+ 1],
2322 &contig_pfn_list
[probe_pos
],
2323 (contig_pfn_cnt
- probe_pos
) * sizeof (pfn_t
));
2325 if (newmfn
== MFN_INVALID
)
2328 * Check if new mfn has adjacent mfns in the list
2330 probe_hi
= contig_pfn_cnt
- 1;
2334 probe_pos
= (probe_hi
+ probe_lo
) / 2;
2335 probe_mfn
= pfn_to_mfn(contig_pfn_list
[probe_pos
]);
2336 if (newmfn
== probe_mfn
+ 1)
2337 insert_after
= probe_pos
;
2338 else if (newmfn
== probe_mfn
- 1)
2339 insert_after
= probe_pos
- 1;
2340 if (probe_pos
== probe_lo
)
2342 if (probe_mfn
<= newmfn
)
2343 probe_lo
= probe_pos
;
2345 probe_hi
= probe_pos
;
2346 } while (insert_after
== -2);
2348 * If there is space in the list and there are adjacent mfns
2349 * insert the pfn in to its proper place in the list.
2351 if (insert_after
!= -2 && contig_pfn_cnt
+ 1 <= contig_pfn_max
) {
2352 insert_point
= insert_after
+ 1;
2353 ovbcopy(&contig_pfn_list
[insert_point
],
2354 &contig_pfn_list
[insert_point
+ 1],
2355 (contig_pfn_cnt
- insert_point
) * sizeof (pfn_t
));
2356 contig_pfn_list
[insert_point
] = pfn
;
2361 mutex_exit(&contig_list_lock
);
2365 * Called to (re-)populate the io_pool from the free page lists.
2368 populate_io_pool(void)
2375 * Figure out the bounds of the pool on first invocation.
2376 * We use a percentage of memory for the io pool size.
2377 * we allow that to shrink, but not to less than a fixed minimum
2379 if (io_pool_cnt_max
== 0) {
2380 io_pool_cnt_max
= physmem
/ (100 / io_pool_physmem_pct
);
2381 io_pool_cnt_lowater
= io_pool_cnt_max
;
2383 * This is the first time in populate_io_pool, grab a va to use
2384 * when we need to allocate pages.
2386 io_pool_kva
= vmem_alloc(heap_arena
, PAGESIZE
, VM_SLEEP
);
2389 * If we are out of pages in the pool, then grow the size of the pool
2391 if (io_pool_cnt
== 0) {
2393 * Grow the max size of the io pool by 5%, but never more than
2394 * 25% of physical memory.
2396 if (io_pool_cnt_max
< physmem
/ 4)
2397 io_pool_cnt_max
+= io_pool_cnt_max
/ 20;
2399 io_pool_grows
++; /* should be a kstat? */
2402 * Get highest mfn on this platform, but limit to the 32 bit DMA max.
2404 (void) mfn_to_pfn(start_mfn
);
2405 max_mfn
= MIN(cached_max_mfn
, PFN_4GIG
);
2406 for (mfn
= start_mfn
; mfn
< max_mfn
; start_mfn
= ++mfn
) {
2407 pfn
= mfn_to_pfn(mfn
);
2408 if (pfn
& PFN_IS_FOREIGN_MFN
)
2411 * try to allocate it from free pages
2413 pp
= page_numtopp_alloc(pfn
);
2417 add_page_to_pool(pp
, 1);
2418 if (io_pool_cnt
>= io_pool_cnt_max
)
2422 return (io_pool_cnt
);
2426 * Destroy a page that was being used for DMA I/O. It may or
2427 * may not actually go back to the io_pool.
2430 page_destroy_io(page_t
*pp
)
2432 mfn_t mfn
= mfn_list
[pp
->p_pagenum
];
2435 * When the page was alloc'd a reservation was made, release it now
2439 * Unload translations, if any, then hash out the
2440 * page to erase its identity.
2442 (void) hat_pageunload(pp
, HAT_FORCE_PGUNLOAD
);
2443 page_hashout(pp
, NULL
);
2446 * If the page came from the free lists, just put it back to them.
2447 * DomU pages always go on the free lists as well.
2449 if (!DOMAIN_IS_INITDOMAIN(xen_info
) || mfn
>= PFN_4GIG
) {
2454 add_page_to_pool(pp
, 0);
2458 long contig_searches
; /* count of times contig pages requested */
2459 long contig_search_restarts
; /* count of contig ranges tried */
2460 long contig_search_failed
; /* count of contig alloc failures */
2463 * Free partial page list
2466 free_partial_list(page_t
**pplist
)
2470 while (*pplist
!= NULL
) {
2472 page_io_pool_sub(pplist
, pp
, pp
);
2478 * Look thru the contiguous pfns that are not part of the io_pool for
2479 * contiguous free pages. Return a list of the found pages or NULL.
2482 find_contig_free(uint_t npages
, uint_t flags
, uint64_t pfnseg
,
2485 page_t
*pp
, *plist
= NULL
;
2486 mfn_t mfn
, prev_mfn
, start_mfn
;
2488 int pages_needed
, pages_requested
;
2492 * create the contig pfn list if not already done
2495 mutex_enter(&contig_list_lock
);
2496 if (contig_pfn_list
== NULL
) {
2497 mutex_exit(&contig_list_lock
);
2498 if (!create_contig_pfnlist(flags
)) {
2505 * Search contiguous pfn list for physically contiguous pages not in
2506 * the io_pool. Start the search where the last search left off.
2508 pages_requested
= pages_needed
= npages
;
2509 search_start
= next_alloc_pfn
;
2510 start_mfn
= prev_mfn
= 0;
2511 while (pages_needed
) {
2512 pfn
= contig_pfn_list
[next_alloc_pfn
];
2513 mfn
= pfn_to_mfn(pfn
);
2515 * Check if mfn is first one or contig to previous one and
2516 * if page corresponding to mfn is free and that mfn
2517 * range is not crossing a segment boundary.
2519 if ((prev_mfn
== 0 || mfn
== prev_mfn
+ 1) &&
2520 (pp
= page_numtopp_alloc(pfn
)) != NULL
&&
2521 !((mfn
& pfnseg
) < (start_mfn
& pfnseg
))) {
2523 page_io_pool_add(&plist
, pp
);
2525 if (prev_mfn
== 0) {
2527 mfn
!= P2ROUNDUP(mfn
, pfnalign
)) {
2529 * not properly aligned
2531 contig_search_restarts
++;
2532 free_partial_list(&plist
);
2533 pages_needed
= pages_requested
;
2534 start_mfn
= prev_mfn
= 0;
2541 contig_search_restarts
++;
2542 free_partial_list(&plist
);
2543 pages_needed
= pages_requested
;
2544 start_mfn
= prev_mfn
= 0;
2547 if (++next_alloc_pfn
== contig_pfn_cnt
)
2549 if (next_alloc_pfn
== search_start
)
2550 break; /* all pfns searched */
2552 mutex_exit(&contig_list_lock
);
2554 contig_search_failed
++;
2556 * Failed to find enough contig pages.
2557 * free partial page list
2559 free_partial_list(&plist
);
/*
 * Search the reserved io pool pages for a page range with the
 * desired characteristics.
 */
static page_t *
page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg)
{
	page_t *pp_first, *pp_last;
	page_t *pp, **poolp;
	pgcnt_t nwanted, pfnalign;
	uint64_t pfnseg;
	mfn_t mfn, tmfn, hi_mfn, lo_mfn;
	int align, attempt = 0;

	lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
	hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
	pfnseg = mmu_btop(mattr->dma_attr_seg);
	align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
	if (align > MMU_PAGESIZE)
		pfnalign = mmu_btop(align);
	else
		pfnalign = 0;

try_again:
	/*
	 * See if we want pages for a legacy device
	 */
	if (hi_mfn < PFN_16MEG)
		poolp = &io_pool_16m;
	else
		poolp = &io_pool_4g;
try_smaller:
	/*
	 * Take pages from I/O pool. We'll use pages from the highest
	 * MFN range possible.
	 */
	pp_first = pp_last = NULL;
	mutex_enter(&io_pool_lock);
	nwanted = minctg;
	for (pp = *poolp; pp && nwanted > 0; ) {
		pp = pp->p_prev;

		/*
		 * skip pages above allowable range
		 */
		mfn = mfn_list[pp->p_pagenum];
		if (hi_mfn < mfn)
			goto skip;

		/*
		 * stop at pages below allowable range
		 */
		if (lo_mfn > mfn)
			break;
restart:
		if (pp_last == NULL) {
			/*
			 * Check alignment
			 */
			tmfn = mfn - (minctg - 1);
			if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign))
				goto skip; /* not properly aligned */
			/*
			 * Check segment
			 */
			if ((mfn & pfnseg) < (tmfn & pfnseg))
				goto skip; /* crosses seg boundary */
			/*
			 * Start building page list
			 */
			pp_first = pp_last = pp;
			nwanted--;
		} else {
			/*
			 * check physical contiguity if required
			 */
			if (contig &&
			    mfn_list[pp_first->p_pagenum] != mfn + 1) {
				/*
				 * not a contiguous page, restart list.
				 */
				pp_last = NULL;
				nwanted = minctg;
				goto restart;
			} else { /* add page to list */
				pp_first = pp;
				nwanted--;
			}
		}
skip:
		if (pp == *poolp)
			break;
	}

	/*
	 * If we didn't find memory, try the more constrained pool, then
	 * sweep free pages into the DMA pool and try again.
	 */
	if (nwanted != 0) {
		mutex_exit(&io_pool_lock);
		/*
		 * If we were looking in the less constrained pool and
		 * didn't find pages, try the more constrained pool.
		 */
		if (poolp == &io_pool_4g) {
			poolp = &io_pool_16m;
			goto try_smaller;
		}
		kmem_reap();
		if (++attempt < 4) {
			/*
			 * Grab some more io_pool pages
			 */
			(void) populate_io_pool();
			goto try_again; /* go around and retry */
		}
		return (NULL);
	}
	/*
	 * Found the pages, now snip them from the list
	 */
	page_io_pool_sub(poolp, pp_first, pp_last);
	io_pool_cnt -= minctg;
	/*
	 * reset low water mark
	 */
	if (io_pool_cnt < io_pool_cnt_lowater)
		io_pool_cnt_lowater = io_pool_cnt;
	mutex_exit(&io_pool_lock);
	return (pp_first);
}
static page_t *
page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr,
    ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg)
{
	int order, extra, extpages, i, contig, nbits, extents;
	page_t *pp, *expp, *pp_first = NULL, **pplist = NULL;
	int kflags;
	mfn_t *mfnlist = NULL;

	contig = flags & PG_PHYSCONTIG;
	if (minctg == 1)
		contig = 0;
	flags &= ~PG_PHYSCONTIG;
	kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP;
	/*
	 * Hypervisor will allocate extents, if we want contig
	 * pages extent must be >= minctg
	 */
	if (contig) {
		order = highbit(minctg) - 1;
		if (minctg & ((1 << order) - 1))
			order++;
		extpages = 1 << order;
	} else {
		order = 0;
		extpages = minctg;
	}
	if (extpages > minctg) {
		extra = extpages - minctg;
		if (!page_resv(extra, kflags))
			return (NULL);
	}
	pplist = kmem_alloc(extpages * sizeof (page_t *), kflags);
	if (pplist == NULL)
		goto balloon_fail;
	mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags);
	if (mfnlist == NULL)
		goto balloon_fail;
	pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr);
	if (pp == NULL)
		goto balloon_fail;
	pp_first = pp;
	if (extpages > minctg) {
		/*
		 * fill out the rest of extent pages to swap
		 * with the hypervisor
		 */
		for (i = 0; i < extra; i++) {
			expp = page_create_va(vp,
			    (u_offset_t)(uintptr_t)io_pool_kva,
			    PAGESIZE, flags, &kvseg, io_pool_kva);
			if (expp == NULL)
				goto balloon_fail;
			(void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD);
			page_io_unlock(expp);
			page_hashout(expp, NULL);
			/*
			 * add page to end of list
			 */
			expp->p_prev = pp_first->p_prev;
			expp->p_next = pp_first;
			expp->p_prev->p_next = expp;
			pp_first->p_prev = expp;
		}
	}
	for (i = 0; i < extpages; i++) {
		pplist[i] = pp;
		pp = pp->p_next;
	}
	nbits = highbit(mattr->dma_attr_addr_hi);
	extents = contig ? 1 : minctg;
	if (balloon_replace_pages(extents, pplist, nbits, order,
	    mfnlist) != extents) {
		cmn_err(CE_NOTE, "request to hypervisor"
		    " for %d pages, maxaddr %" PRIx64 " failed",
		    extpages, mattr->dma_attr_addr_hi);
		goto balloon_fail;
	}
	kmem_free(pplist, extpages * sizeof (page_t *));
	kmem_free(mfnlist, extpages * sizeof (mfn_t));
	/*
	 * Return any excess pages to free list
	 */
	if (extpages > minctg) {
		for (i = 0; i < extra; i++) {
			pp = pp_first->p_prev;
			page_sub(&pp_first, pp);
			page_io_unlock(pp);
			page_unresv(1);
			page_free(pp, 1);
		}
	}
	return (pp_first);

balloon_fail:
	/*
	 * Return pages to free list and return failure
	 */
	while (pp_first != NULL) {
		pp = pp_first;
		page_sub(&pp_first, pp);
		page_io_unlock(pp);
		if (pp->p_vnode != NULL)
			page_hashout(pp, NULL);
		page_free(pp, 1);
	}
	if (pplist)
		kmem_free(pplist, extpages * sizeof (page_t *));
	if (mfnlist)
		kmem_free(mfnlist, extpages * sizeof (mfn_t));
	page_unresv(extpages - minctg);
	return (NULL);
}
static void
return_partial_alloc(page_t *plist)
{
	page_t *pp;

	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		page_destroy_io(pp);
	}
}
static page_t *
page_get_contigpages(
	struct vnode	*vp,
	u_offset_t	off,
	int		*npagesp,
	uint_t		flags,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)
{
	mfn_t	max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
	page_t	*plist;	/* list to return */
	page_t	*pp, *mcpl;
	int	contig, anyaddr, npages, getone = 0;
	mfn_t	lo_mfn;
	mfn_t	hi_mfn;
	pgcnt_t	pfnalign = 0;
	int	align, sgllen;
	uint64_t pfnseg;
	pgcnt_t	minctg;

	npages = *npagesp;
	ASSERT(mattr != NULL);
	lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
	hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
	sgllen = mattr->dma_attr_sgllen;
	pfnseg = mmu_btop(mattr->dma_attr_seg);
	align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
	if (align > MMU_PAGESIZE)
		pfnalign = mmu_btop(align);

	contig = flags & PG_PHYSCONTIG;
	if (npages == -1) {
		getone = 1;
		npages = 1;
	} else {
		/*
		 * Clear the contig flag if only one page is needed.
		 */
		if (npages == 1) {
			getone = 1;
			contig = 0;
		}
	}

	/*
	 * Check if any page in the system is fine.
	 */
	anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn;
	if (!contig && anyaddr && !pfnalign) {
		flags &= ~PG_PHYSCONTIG;
		plist = page_create_va(vp, off, npages * MMU_PAGESIZE,
		    flags, &kvseg, vaddr);
		if (plist != NULL) {
			*npagesp = 0;
			return (plist);
		}
	}
	plist = NULL;
	minctg = howmany(npages, sgllen);
	while (npages > sgllen || getone) {
		if (minctg > npages)
			minctg = npages;
		mcpl = NULL;
		/*
		 * We could want contig pages with no address range limits.
		 */
		if (anyaddr && contig) {
			/*
			 * Look for free contig pages to satisfy the request.
			 */
			mcpl = find_contig_free(minctg, flags, pfnseg,
			    pfnalign);
		}
		/*
		 * Try the reserved io pools next
		 */
		if (mcpl == NULL)
			mcpl = page_io_pool_alloc(mattr, contig, minctg);
		if (mcpl != NULL) {
			pp = mcpl;
			do {
				if (!page_hashin(pp, vp, off, NULL)) {
					panic("page_get_contigpages:"
					    " hashin failed"
					    " pp %p, vp %p, off %llx",
					    (void *)pp, (void *)vp, off);
				}
				off += MMU_PAGESIZE;
				PP_CLRFREE(pp);
				PP_CLRAGED(pp);
				page_set_props(pp, P_REF);
				page_io_lock(pp);
				pp = pp->p_next;
			} while (pp != mcpl);
		} else {
			/*
			 * Hypervisor exchange doesn't handle segment or
			 * alignment constraints
			 */
			if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi ||
			    pfnalign)
				goto fail;
			/*
			 * Try exchanging pages with the hypervisor
			 */
			mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr,
			    flags, minctg);
			if (mcpl == NULL)
				goto fail;
			off += minctg * MMU_PAGESIZE;
		}
		check_dma(mattr, mcpl, minctg);
		/*
		 * Here with a minctg run of contiguous pages, add them to the
		 * list we will return for this request.
		 */
		page_list_concat(&plist, &mcpl);
		npages -= minctg;
		*npagesp = npages;
		if (getone)
			break;
	}
	return (plist);
fail:
	return_partial_alloc(plist);
	return (NULL);
}
/*
 * Allocator for domain 0 I/O pages. We match the required
 * DMA attributes and contiguity constraints.
 */
/*ARGSUSED*/
page_t *
page_create_io(
	struct vnode	*vp,
	u_offset_t	off,
	uint_t		bytes,
	uint_t		flags,
	struct as	*as,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)
{
	page_t	*plist = NULL, *pp;
	int	npages = 0, contig, anyaddr, pages_req;
	mfn_t	lo_mfn;
	mfn_t	hi_mfn;
	pgcnt_t	pfnalign = 0;
	int	align;
	int	is_domu = 0;
	int	dummy, bytes_got;
	mfn_t	max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);

	ASSERT(mattr != NULL);
	lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
	hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
	align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
	if (align > MMU_PAGESIZE)
		pfnalign = mmu_btop(align);

	/*
	 * Clear the contig flag if only one page is needed or the scatter
	 * gather list length is >= npages.
	 */
	pages_req = npages = mmu_btopr(bytes);
	contig = (flags & PG_PHYSCONTIG);
	bytes = P2ROUNDUP(bytes, MMU_PAGESIZE);
	if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages)
		contig = 0;

	/*
	 * Check if any old page in the system is fine.
	 * DomU should always go down this path.
	 */
	is_domu = !DOMAIN_IS_INITDOMAIN(xen_info);
	anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign;
	if ((!contig && anyaddr) || is_domu) {
		flags &= ~PG_PHYSCONTIG;
		plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr);
		if (plist != NULL)
			return (plist);
		else if (is_domu)
			return (NULL); /* no memory available */
	}
	/*
	 * DomU should never reach here
	 */
	if (contig) {
		plist = page_get_contigpages(vp, off, &npages, flags, vaddr,
		    mattr);
		if (plist == NULL)
			goto fail;
		bytes_got = (pages_req - npages) << MMU_PAGESHIFT;
		vaddr += bytes_got;
		off += bytes_got;
		/*
		 * We now have all the contiguous pages we need, but
		 * we may still need additional non-contiguous pages.
		 */
	}
	/*
	 * now loop collecting the requested number of pages, these do
	 * not have to be contiguous pages but we will use the contig
	 * page alloc code to get the pages since it will honor any
	 * other constraints the pages may have.
	 */
	while (npages--) {
		dummy = -1;
		pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr);
		if (pp == NULL)
			goto fail;
		page_add(&plist, pp);
		vaddr += MMU_PAGESIZE;
		off += MMU_PAGESIZE;
	}
	return (plist);
fail:
	/*
	 * Failed to get enough pages, return ones we did get
	 */
	return_partial_alloc(plist);
	return (NULL);
}
/*
 * Lock and return the page with the highest mfn that we can find.  last_mfn
 * holds the last one found, so the next search can start from there.  We
 * also keep a counter so that we don't loop forever if the machine has no
 * free pages.
 *
 * This is called from the balloon thread to find pages to give away.  new_high
 * is used when new mfn's have been added to the system - we will reset our
 * search if the new mfn's are higher than our current search position.
 */
page_t *
page_get_high_mfn(mfn_t new_high)
{
	static mfn_t last_mfn = 0;
	pfn_t pfn;
	page_t *pp;
	ulong_t loop_count = 0;

	if (new_high > last_mfn)
		last_mfn = new_high;

	for (; loop_count < mfn_count; loop_count++, last_mfn--) {
		if (last_mfn == 0) {
			last_mfn = cached_max_mfn;
		}

		pfn = mfn_to_pfn(last_mfn);
		if (pfn & PFN_IS_FOREIGN_MFN)
			continue;

		/* See if the page is free.  If so, lock it. */
		pp = page_numtopp_alloc(pfn);
		if (pp == NULL)
			continue;
		PP_CLRFREE(pp);

		ASSERT(PAGE_EXCL(pp));
		ASSERT(pp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(pp));
		last_mfn--;
		return (pp);
	}
	return (NULL);
}
#else /* !__xpv */

/*
 * get a page from any list with the given mnode
 */
static page_t *
page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
    int mnode, int mtype, ddi_dma_attr_t *dma_attr)
{
	kmutex_t		*pcm;
	int			i;
	page_t			*pp;
	page_t			*first_pp;
	uint64_t		pgaddr;
	ulong_t			bin;
	int			mtypestart;
	int			plw_initialized;
	page_list_walker_t	plw;

	VM_STAT_ADD(pga_vmstats.pgma_alloc);

	ASSERT((flags & PG_MATCH_COLOR) == 0);
	ASSERT(szc == 0);
	ASSERT(dma_attr != NULL);

	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {
		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
		return (NULL);
	}

	mtypestart = mtype;

	bin = origbin;

	/*
	 * check up to page_colors + 1 bins - origbin may be checked twice
	 * because of BIN_STEP skip
	 */
	do {
		plw_initialized = 0;

		for (plw.plw_count = 0;
		    plw.plw_count < page_colors; plw.plw_count++) {

			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
				goto nextfreebin;

			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (IS_DUMP_PAGE(pp) || page_trylock(pp,
				    SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp)
						pp = NULL;
					continue;
				}

				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_hash == NULL);
				ASSERT(pp->p_offset == (u_offset_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
				/* check if page within DMA attributes */
				pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi))
					break;

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;
			}

			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

				if ((PP_ISFREE(pp) == 0) ||
				    (PP_ISAGED(pp) == 0)) {
					cmn_err(CE_PANIC, "page %p is not free",
					    (void *)pp);
				}

				mutex_exit(pcm);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextfreebin:
			if (plw_initialized == 0) {
				page_list_walk_init(szc, 0, bin, 1, 0, &plw);
				ASSERT(plw.plw_ceq_dif == page_colors);
				plw_initialized = 1;
			}

			if (plw.plw_do_split) {
				pp = page_freelist_split(szc, bin, mnode,
				    mtype,
				    mmu_btop(dma_attr->dma_attr_addr_lo),
				    mmu_btop(dma_attr->dma_attr_addr_hi + 1),
				    &plw);
				if (pp != NULL) {
					check_dma(dma_attr, pp, 1);
					return (pp);
				}
			}

			bin = page_list_walk_next_bin(szc, bin, &plw);
		}

		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	/* failed to find a page in the freelist; try it in the cachelist */

	/* reset mtype start for cachelist search */
	mtype = mtypestart;
	ASSERT(mtype >= 0);

	/* start with the bin of matching color */
	bin = origbin;

	do {
		for (i = 0; i <= page_colors; i++) {
			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
				goto nextcachebin;
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (IS_DUMP_PAGE(pp) || page_trylock(pp,
				    SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp)
						pp = NULL;
					continue;
				}
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(pp->p_szc == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/* check if page within DMA attributes */
				pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi))
					break;

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;
			}

			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_CACHELISTS(mnode, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);

				mutex_exit(pcm);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextcachebin:
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
	return (NULL);
}
/*
 * This function is similar to page_get_freelist()/page_get_cachelist()
 * but it searches both the lists to find a page with the specified
 * color (or no color) and DMA attributes. The search is done in the
 * freelist first and then in the cache list within the highest memory
 * range (based on DMA attributes) before searching in the lower
 * memory ranges.
 *
 * Note: This function is called only by page_create_io().
 */
/*ARGSUSED*/
static page_t *
page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
    size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t	*lgrp)
{
	uint_t		bin;
	int		mtype;
	page_t		*pp;
	int		n;
	int		m;
	int		szc;
	int		fullrange;
	int		mnode;
	int		local_failed_stat = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;

	VM_STAT_ADD(pga_vmstats.pga_alloc);

	/* only base pagesize currently supported */
	if (size != MMU_PAGESIZE)
		return (NULL);

	/*
	 * If we're passed a specific lgroup, we use it.  Otherwise,
	 * assume first-touch placement is desired.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin, 0);

	/*
	 * Only hold one freelist or cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
	if (dma_attr == NULL) {
		n = mtype16m;
		m = mtypetop;
		fullrange = 1;
		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
	} else {
		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);

		/*
		 * We can guarantee alignment only for page boundary.
		 */
		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
			return (NULL);

		/* Sanity check the dma_attr */
		if (pfnlo > pfnhi)
			return (NULL);

		n = pfn_2_mtype(pfnlo);
		m = pfn_2_mtype(pfnhi);

		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
	}
	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);

	szc = 0;

	/* cycling through mtype handled by RANGE0 if n == mtype16m */
	if (n == mtype16m) {
		flags |= PGI_MT_RANGE0;
		n = m;
	}

	/*
	 * Try local memory node first, but try remote if we can't
	 * get a page of the right color.
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		/*
		 * allocate pages from high pfn to low.
		 */
		mtype = m;
		do {
			if (fullrange != 0) {
				pp = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
				if (pp == NULL) {
					pp = page_get_mnode_cachelist(
					    bin, flags, mnode, mtype);
				}
			} else {
				pp = page_get_mnode_anylist(bin, szc,
				    flags, mnode, mtype, dma_attr);
			}
			if (pp != NULL) {
				VM_STAT_ADD(pga_vmstats.pga_allocok);
				check_dma(dma_attr, pp, 1);
				return (pp);
			}
		} while (mtype != n &&
		    (mtype = mnoderanges[mtype].mnr_next) != -1);
		if (!local_failed_stat) {
			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
			local_failed_stat = 1;
		}
	}
	VM_STAT_ADD(pga_vmstats.pga_allocfailed);

	return (NULL);
}
/*
 * This function is a copy of page_create_va() with an additional
 * argument 'mattr' that specifies DMA memory requirements to
 * the page list functions. This function is used by the segkmem
 * allocator so it is only to create new pages (i.e. PG_EXCL is
 * set).
 *
 * Note: This interface is currently used by x86 PSM only and is
 *	 not fully specified so the commitment level is only for
 *	 private interface specific to x86. This interface uses PSM
 *	 specific page_get_anylist() interface.
 */

#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
}


page_t *
page_create_io(
	struct vnode	*vp,
	u_offset_t	off,
	uint_t		bytes,
	uint_t		flags,
	struct as	*as,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
{
	page_t		*plist = NULL;
	uint_t		plist_len = 0;
	pgcnt_t		npages;
	page_t		*npp = NULL;
	uint_t		pages_req;
	page_t		*pp;
	kmutex_t	*phm = NULL;
	uint_t		index;

	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
	    "page_create_start:vp %p off %llx bytes %u flags %x",
	    vp, off, bytes, flags);

	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);

	pages_req = npages = mmu_btopr(bytes);

	/*
	 * Do the freemem and pcf accounting.
	 */
	if (!page_create_wait(npages, flags)) {
		return (NULL);
	}

	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
	    "page_create_success:vp %p off %llx", vp, off);

	/*
	 * If satisfying this request has left us with too little
	 * memory, start the wheels turning to get some back.  The
	 * first clause of the test prevents waking up the pageout
	 * daemon in situations where it would decide that there's
	 * not enough free memory, but then go back to sleep.
	 */
	if (nscan < desscan && freemem < minfree) {
		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
		    "pageout_cv_signal:freemem %ld", freemem);
		cv_signal(&proc_pageout->p_cv);
	}

	if (flags & PG_PHYSCONTIG) {

		plist = page_get_contigpage(&npages, mattr, 1);
		if (plist == NULL) {
			page_create_putback(npages);
			return (NULL);
		}

		pp = plist;

		do {
			if (!page_hashin(pp, vp, off, NULL)) {
				panic("pg_creat_io: hashin failed %p %p %llx",
				    (void *)pp, (void *)vp, off);
			}
			VM_STAT_ADD(page_create_new);
			off += MMU_PAGESIZE;
			PP_CLRFREE(pp);
			PP_CLRAGED(pp);
			page_set_props(pp, P_REF);
			pp = pp->p_next;
		} while (pp != plist);

		if (!npages) {
			check_dma(mattr, plist, pages_req);
			return (plist);
		} else {
			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
		}

		/*
		 * fall-thru:
		 *
		 * page_get_contigpage returns when npages <= sgllen.
		 * Grab the rest of the non-contig pages below from anylist.
		 */
	}

	/*
	 * Loop around collecting the requested number of pages.
	 * Most of the time, we have to `create' a new page. With
	 * this in mind, pull the page off the free list before
	 * getting the hash lock.  This will minimize the hash
	 * lock hold time, nesting, and the like.  If it turns
	 * out we don't need the page, we put it back at the end.
	 */
	while (npages--) {
		phm = NULL;

		index = PAGE_HASH_FUNC(vp, off);
top:
		ASSERT(phm == NULL);
		ASSERT(index == PAGE_HASH_FUNC(vp, off));
		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));

		if (npp == NULL) {
			/*
			 * Try to get the page of any color either from
			 * the freelist or from the cache list.
			 */
			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
			    flags & ~PG_MATCH_COLOR, mattr, NULL);
			if (npp == NULL) {
				if (mattr == NULL) {
					/*
					 * Not looking for a special page;
					 * panic!
					 */
					panic("no page found %d", (int)npages);
				}
				/*
				 * No page found! This can happen
				 * if we are looking for a page
				 * within a specific memory range
				 * for DMA purposes. If PG_WAIT is
				 * specified then we wait for a
				 * while and then try again. The
				 * wait could be forever if we
				 * don't get the page(s) we need.
				 *
				 * Note: XXX We really need a mechanism
				 * to wait for pages in the desired
				 * range. For now, we wait for any
				 * pages and see if we can use it.
				 */
				if ((mattr != NULL) && (flags & PG_WAIT)) {
					delay(10);
					goto top;
				}
				goto fail; /* undo accounting stuff */
			}

			if (PP_ISAGED(npp) == 0) {
				/*
				 * Since this page came from the
				 * cachelist, we must destroy the
				 * old vnode association.
				 */
				page_hashout(npp, (kmutex_t *)NULL);
			}
		}

		/*
		 * We own this page!
		 */
		ASSERT(PAGE_EXCL(npp));
		ASSERT(npp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(npp));
		PP_CLRFREE(npp);
		PP_CLRAGED(npp);

		/*
		 * Here we have a page in our hot little mits and are
		 * just waiting to stuff it on the appropriate lists.
		 * Get the mutex and check to see if it really does
		 * not exist.
		 */
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		PAGE_HASH_SEARCH(index, pp, vp, off);
		if (pp == NULL) {
			VM_STAT_ADD(page_create_new);
			pp = npp;
			npp = NULL;
			if (!page_hashin(pp, vp, off, phm)) {
				/*
				 * Since we hold the page hash mutex and
				 * just searched for this page, page_hashin
				 * had better not fail.  If it does, that
				 * means some thread did not follow the
				 * page hash mutex rules.  Panic now and
				 * get it over with.  As usual, go down
				 * holding all the locks.
				 */
				ASSERT(MUTEX_HELD(phm));
				panic("page_create: hashin fail %p %p %llx %p",
				    (void *)pp, (void *)vp, off, (void *)phm);
			}
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;

			/*
			 * Hat layer locking need not be done to set
			 * the following bits since the page is not hashed
			 * and was on the free list (i.e., had no mappings).
			 *
			 * Set the reference bit to protect
			 * against immediate pageout
			 *
			 * XXXmh modify freelist code to set reference
			 * bit so we don't have to do it here.
			 */
			page_set_props(pp, P_REF);
		} else {
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;
			/*
			 * NOTE: This should not happen for pages associated
			 * with kernel vnode 'kvp'.
			 */
			/* XX64 - to debug why this happens! */
			ASSERT(!VN_ISKAS(vp));
			if (VN_ISKAS(vp))
				cmn_err(CE_NOTE,
				    "page_create: page not expected "
				    "in hash list for kernel vnode - pp 0x%p",
				    (void *)pp);
			VM_STAT_ADD(page_create_exists);
			goto fail;
		}

		/*
		 * Got a page!  It is locked.  Acquire the i/o
		 * lock since we are going to use the p_next and
		 * p_prev fields to link the requested pages together.
		 */
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
		off += MMU_PAGESIZE;
		vaddr += MMU_PAGESIZE;
	}

	check_dma(mattr, plist, pages_req);
	return (plist);

fail:
	if (npp != NULL) {
		/*
		 * Did not need this page after all.
		 * Put it back on the free list.
		 */
		VM_STAT_ADD(page_create_putbacks);
		PP_SETFREE(npp);
		PP_SETAGED(npp);
		npp->p_offset = (u_offset_t)-1;
		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
		page_unlock(npp);
	}

	/*
	 * Give up the pages we already got.
	 */
	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		plist_len++;
		/*LINTED: constant in conditional ctx*/
		VN_DISPOSE(pp, B_INVAL, 0, kcred);
	}

	/*
	 * VN_DISPOSE does freemem accounting for the pages in plist
	 * by calling page_free. So, we need to undo the pcf accounting
	 * for only the remaining pages.
	 */
	VM_STAT_ADD(page_create_putbacks);
	page_create_putback(pages_req - plist_len);

	return (NULL);
}
#endif /* !__xpv */
/*
 * Copy the data from the physical page represented by "frompp" to
 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
 * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
 * level and no one sleeps with an active mapping there.
 *
 * Note that the ref/mod bits in the page_t's are not affected by
 * this operation, hence it is up to the caller to update them appropriately.
 */
int
ppcopy(page_t *frompp, page_t *topp)
{
	caddr_t		pp_addr1;
	caddr_t		pp_addr2;
	hat_mempte_t	pte1;
	hat_mempte_t	pte2;
	kmutex_t	*ppaddr_mutex;
	label_t		ljb;
	int		ret = 1;

	ASSERT_STACK_ALIGNED();
	ASSERT(PAGE_LOCKED(frompp));
	ASSERT(PAGE_LOCKED(topp));

	if (kpm_enable) {
		pp_addr1 = hat_kpm_page2va(frompp, 0);
		pp_addr2 = hat_kpm_page2va(topp, 0);
		kpreempt_disable();
	} else {
		/*
		 * disable pre-emption so that CPU can't change
		 */
		kpreempt_disable();

		pp_addr1 = CPU->cpu_caddr1;
		pp_addr2 = CPU->cpu_caddr2;
		pte1 = CPU->cpu_caddr1pte;
		pte2 = CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (on_fault(&ljb)) {
		ret = 0;
		goto faulted;
	}
	if (use_sse_pagecopy)
#ifdef __xpv
		page_copy_no_xmm(pp_addr2, pp_addr1);
#else
		hwblkpagecopy(pp_addr1, pp_addr2);
#endif
	else
		bcopy(pp_addr1, pp_addr2, PAGESIZE);

	no_fault();
faulted:
	if (!kpm_enable) {
#ifdef __xpv
		/*
		 * We can't leave unused mappings laying about under the
		 * hypervisor, so blow them away.
		 */
		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0,
		    UVMF_INVLPG | UVMF_LOCAL) < 0)
			panic("HYPERVISOR_update_va_mapping() failed");
		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
		    UVMF_INVLPG | UVMF_LOCAL) < 0)
			panic("HYPERVISOR_update_va_mapping() failed");
#endif
		mutex_exit(ppaddr_mutex);
	}
	kpreempt_enable();
	return (ret);
}
/*
 * Zero the physical page from off to off + len given by pp
 * without changing the reference and modified bits of page.
 */
void
pagezero(page_t *pp, uint_t off, uint_t len)
{
	ASSERT(PAGE_LOCKED(pp));
	pfnzero(page_pptonum(pp), off, len);
}

/*
 * Zero the physical page from off to off + len given by pfn
 * without changing the reference and modified bits of page.
 *
 * This uses CPU private page address #2; see ppcopy() for more info.
 * pfnzero() must not be called at interrupt level.
 */
void
pfnzero(pfn_t pfn, uint_t off, uint_t len)
{
	caddr_t		pp_addr2;
	hat_mempte_t	pte2;
	kmutex_t	*ppaddr_mutex = NULL;

	ASSERT_STACK_ALIGNED();
	ASSERT(len <= MMU_PAGESIZE);
	ASSERT(off <= MMU_PAGESIZE);
	ASSERT(off + len <= MMU_PAGESIZE);

	if (kpm_enable && !pfn_is_foreign(pfn)) {
		pp_addr2 = hat_kpm_pfn2va(pfn);
		kpreempt_disable();
	} else {
		kpreempt_disable();

		pp_addr2 = CPU->cpu_caddr2;
		pte2 = CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(pfn, pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagezero) {
#ifdef __xpv
		uint_t rem;

		/*
		 * zero a byte at a time until properly aligned for
		 * block_zero_no_xmm().
		 */
		while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0)
			pp_addr2[off++] = 0;

		/*
		 * Now use faster block_zero_no_xmm() for any range
		 * that is properly aligned and sized.
		 */
		rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN));
		len -= rem;
		if (len != 0) {
			block_zero_no_xmm(pp_addr2 + off, len);
			off += len;
		}

		/*
		 * zero remainder with byte stores.
		 */
		while (rem-- > 0)
			pp_addr2[off++] = 0;
#else
		hwblkclr(pp_addr2 + off, len);
#endif
	} else {
		bzero(pp_addr2 + off, len);
	}

	if (!kpm_enable || pfn_is_foreign(pfn)) {
#ifdef __xpv
		/*
		 * On the hypervisor this page might get used for a page
		 * table before any intervening change to this mapping,
		 * so blow it away.
		 */
		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
		    UVMF_INVLPG) < 0)
			panic("HYPERVISOR_update_va_mapping() failed");
#endif
		mutex_exit(ppaddr_mutex);
	}

	kpreempt_enable();
}
/*
 * Platform-dependent page scrub call.
 */
void
pagescrub(page_t *pp, uint_t off, uint_t len)
{
	/*
	 * For now, we rely on the fact that pagezero() will
	 * always clear UEs.
	 */
	pagezero(pp, off, len);
}
/*
 * set up two private addresses for use on a given CPU for use in ppcopy()
 */
void
setup_vaddr_for_ppcopy(struct cpu *cpup)
{
	void *addr;
	hat_mempte_t pte_pa;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte_pa = hat_mempte_setup(addr);
	cpup->cpu_caddr1 = addr;
	cpup->cpu_caddr1pte = pte_pa;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte_pa = hat_mempte_setup(addr);
	cpup->cpu_caddr2 = addr;
	cpup->cpu_caddr2pte = pte_pa;

	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
}
/*
 * Undo setup_vaddr_for_ppcopy
 */
void
teardown_vaddr_for_ppcopy(struct cpu *cpup)
{
	mutex_destroy(&cpup->cpu_ppaddr_mutex);

	hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
	cpup->cpu_caddr2pte = 0;
	vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
	cpup->cpu_caddr2 = 0;

	hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
	cpup->cpu_caddr1pte = 0;
	vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
	cpup->cpu_caddr1 = 0;
}
/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
 */
void
dcache_flushall()
{}
/*
 * Allocate a memory page.  The argument 'seed' can be any pseudo-random
 * number to vary where the pages come from.  This is quite a hacked up
 * method -- it works for now, but really needs to be fixed up a bit.
 *
 * We currently use page_create_va() on the kvp with fake offsets,
 * segments and virt address.  This is pretty bogus, but was copied from the
 * old hat_i86.c code.  A better approach would be to specify either mnode
 * random or mnode local and takes a page from whatever color has the MOST
 * available - this would have a minimal impact on page coloring.
 */
page_t *
page_get_physical(uintptr_t seed)
{
	page_t *pp;
	u_offset_t offset;
	static struct seg tmpseg;
	static uintptr_t ctr = 0;

	/*
	 * This code is gross, we really need a simpler page allocator.
	 *
	 * We need to assign an offset for the page to call page_create_va()
	 * To avoid conflicts with other pages, we get creative with the offset.
	 * For 32 bits, we need an offset > 4Gig
	 * For 64 bits, need an offset somewhere in the VA hole.
	 */
	offset = seed;
	if (offset > kernelbase)
		offset -= kernelbase;
	offset <<= MMU_PAGESHIFT;
#if defined(__amd64)
	offset += mmu.hole_start;	/* something in VA hole */
#else
	offset += 1ULL << 40;		/* something > 4 Gig */
#endif

	if (page_resv(1, KM_NOSLEEP) == 0)
		return (NULL);

#ifdef	DEBUG
	pp = page_exists(&kvp, offset);
	if (pp != NULL)
		panic("page already exists %p", (void *)pp);
#endif

	pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
	    &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));	/* changing VA usage */