4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2012 DEY Storage Systems, Inc. All rights reserved.
25 * Copyright 2017 Nexenta Systems, Inc.
26 * Copyright (c) 2018 Joyent, Inc.
27 * Copyright (c) 2015 by Delphix. All rights reserved.
30 * Copyright (c) 2010, Intel Corporation.
31 * All rights reserved.
34 #include <sys/types.h>
35 #include <sys/t_lock.h>
36 #include <sys/param.h>
37 #include <sys/sysmacros.h>
38 #include <sys/signal.h>
39 #include <sys/systm.h>
44 #include <sys/avintr.h>
45 #include <sys/autoconf.h>
47 #include <sys/class.h>
48 #include <sys/bitmap.h>
50 #include <sys/privregs.h>
56 #include <sys/kstat.h>
58 #include <sys/reboot.h>
61 #include <sys/vnode.h>
64 #include <sys/procfs.h>
67 #include <sys/cmn_err.h>
68 #include <sys/utsname.h>
69 #include <sys/debug.h>
72 #include <sys/dumphdr.h>
73 #include <sys/bootconf.h>
74 #include <sys/memlist_plat.h>
75 #include <sys/varargs.h>
76 #include <sys/promif.h>
77 #include <sys/modctl.h>
79 #include <sys/sunddi.h>
80 #include <sys/sunndi.h>
81 #include <sys/ndi_impldefs.h>
82 #include <sys/ddidmareq.h>
84 #include <sys/regset.h>
85 #include <sys/clock.h>
88 #include <sys/stack.h>
91 #include <vm/kboot_mmu.h>
96 #include <vm/seg_dev.h>
97 #include <vm/seg_kmem.h>
98 #include <vm/seg_kpm.h>
99 #include <vm/seg_map.h>
100 #include <vm/seg_vn.h>
101 #include <vm/seg_kp.h>
102 #include <sys/memnode.h>
103 #include <vm/vm_dep.h>
104 #include <sys/thread.h>
105 #include <sys/sysconf.h>
106 #include <sys/vm_machparam.h>
107 #include <sys/archsystm.h>
108 #include <sys/machsystm.h>
110 #include <vm/hat_i86.h>
111 #include <sys/pmem.h>
112 #include <sys/smp_impldefs.h>
113 #include <sys/x86_archext.h>
114 #include <sys/cpuvar.h>
115 #include <sys/segments.h>
116 #include <sys/clconf.h>
117 #include <sys/kobj.h>
118 #include <sys/kobj_lex.h>
119 #include <sys/cpc_impl.h>
120 #include <sys/cpu_module.h>
121 #include <sys/smbios.h>
122 #include <sys/debug_info.h>
123 #include <sys/bootinfo.h>
124 #include <sys/ddi_periodic.h>
125 #include <sys/systeminfo.h>
126 #include <sys/multiboot.h>
127 #include <sys/ramdisk.h>
131 #include <sys/hypervisor.h>
132 #include <sys/xen_mmu.h>
133 #include <sys/evtchn_impl.h>
134 #include <sys/gnttab.h>
135 #include <sys/xpv_panic.h>
136 #include <xen/sys/xenbus_comms.h>
137 #include <xen/public/physdev.h>
139 extern void xen_late_startup(void);
141 struct xen_evt_data cpu0_evt_data
;
144 #include <sys/memlist_impl.h>
146 extern void mem_config_init(void);
149 extern void progressbar_init(void);
150 extern void brand_init(void);
151 extern void pcf_init(void);
152 extern void pg_init(void);
153 extern void ssp_init(void);
155 extern int size_pse_array(pgcnt_t
, int);
157 #if defined(_SOFT_HOSTID)
161 static int32_t set_soft_hostid(void);
162 static char hostid_file
[] = "/etc/hostid";
166 void *gfx_devinfo_list
;
168 #if defined(__amd64) && !defined(__xpv)
169 extern void immu_startup(void);
173 * XXX make declaration below "static" when drivers no longer use this
176 extern caddr_t p0_va
; /* Virtual address for accessing physical page 0 */
181 extern int segkp_fromheap
;
183 static void kvm_init(void);
184 static void startup_init(void);
185 static void startup_memlist(void);
186 static void startup_kmem(void);
187 static void startup_modules(void);
188 static void startup_vm(void);
189 static void startup_end(void);
190 static void layout_kernel_va(void);
193 * Declare these as initialized data so we can patch them.
198 * Due to virtual address space limitations running in 32 bit mode, restrict
199 * the amount of physical memory configured to a max of PHYSMEM pages (16g).
201 * If the physical max memory size of 64g were allowed to be configured, the
202 * size of user virtual address space will be less than 1g. A limited user
203 * address space greatly reduces the range of applications that can run.
205 * If more physical memory than PHYSMEM is required, users should preferably
206 * run in 64 bit mode which has far looser virtual address space limitations.
208 * If 64 bit mode is not available (as in IA32) and/or more physical memory
209 * than PHYSMEM is required in 32 bit mode, physmem can be set to the desired
210 * value or to 0 (to configure all available memory) via eeprom(1M). kernelbase
211 * should also be carefully tuned to balance out the need of the user
212 * application while minimizing the risk of kernel heap exhaustion due to
213 * kernelbase being set too high.
215 #define PHYSMEM 0x400000
220 * For now we can handle memory with physical addresses up to about
221 * 64 Terabytes. This keeps the kernel above the VA hole, leaving roughly
222 * half the VA space for seg_kpm. When systems get bigger than 64TB this
223 * code will need revisiting. There is an implicit assumption that there
224 * are no *huge* holes in the physical address space too.
226 #define TERABYTE (1ul << 40)
227 #define PHYSMEM_MAX64 mmu_btop(64 * TERABYTE)
228 #define PHYSMEM PHYSMEM_MAX64
229 #define AMD64_VA_HOLE_END 0xFFFF800000000000ul
233 pgcnt_t physmem
= PHYSMEM
;
234 pgcnt_t obp_pages
; /* Memory used by PROM for its text and data */
237 int kobj_file_bufsize
; /* set in /etc/system */
239 /* Global variables for MP support. Used in mp_startup */
240 caddr_t rm_platter_va
= 0;
241 uint32_t rm_platter_pa
;
243 int auto_lpg_disable
= 1;
246 * Some CPUs have holes in the middle of the 64-bit virtual address range.
248 uintptr_t hole_start
, hole_end
;
255 static int kpm_desired
;
257 static uintptr_t segkpm_base
= (uintptr_t)SEGKPM_BASE
;
261 * Configuration parameters set at boot time.
264 caddr_t econtig
; /* end of first block of contiguous kernel */
266 struct bootops
*bootops
= 0; /* passed in from boot */
267 struct bootops
**bootopsp
;
268 struct boot_syscalls
*sysp
; /* passed in from boot */
270 char bootblock_fstype
[16];
272 char kern_bootargs
[OBP_MAXPATHLEN
];
273 char kern_bootfile
[OBP_MAXPATHLEN
];
276 * ZFS zio segment. This allows us to exclude large portions of ZFS data that
277 * gets cached in kmem caches on the heap. If this is set to zero, we allocate
278 * zio buffers from their own segment, otherwise they are allocated from the
279 * heap. The optimization of allocating zio buffers from their own segment is
280 * only valid on 64-bit kernels.
283 int segzio_fromheap
= 0;
285 int segzio_fromheap
= 1;
289 * Give folks an escape hatch for disabling SMAP via kmdb. Doesn't work
292 int disable_smap
= 0;
295 * new memory fragmentations are possible in startup() due to BOP_ALLOCs. this
296 * depends on number of BOP_ALLOC calls made and requested size, memory size
297 * combination and whether boot.bin memory needs to be freed.
299 #define POSS_NEW_FRAGMENTS 12
304 long page_hashsz
; /* Size of page hash table (power of two) */
305 unsigned int page_hashsz_shift
; /* log2(page_hashsz) */
306 struct page
*pp_base
; /* Base of initial system page struct array */
307 struct page
**page_hash
; /* Page hash table */
308 pad_mutex_t
*pse_mutex
; /* Locks protecting pp->p_selock */
309 size_t pse_table_size
; /* Number of mutexes in pse_mutex[] */
310 int pse_shift
; /* log2(pse_table_size) */
311 struct seg ktextseg
; /* Segment used for kernel executable image */
312 struct seg kvalloc
; /* Segment used for "valloc" mapping */
313 struct seg kpseg
; /* Segment used for pageable kernel virt mem */
314 struct seg kmapseg
; /* Segment used for generic kernel mappings */
315 struct seg kdebugseg
; /* Segment used for the kernel debugger */
317 struct seg
*segkmap
= &kmapseg
; /* Kernel generic mapping segment */
318 static struct seg
*segmap
= &kmapseg
; /* easier to use name for in here */
320 struct seg
*segkp
= &kpseg
; /* Pageable kernel virtual memory segment */
323 struct seg kvseg_core
; /* Segment used for the core heap */
324 struct seg kpmseg
; /* Segment used for physical mapping */
325 struct seg
*segkpm
= &kpmseg
; /* 64bit kernel physical mapping segment */
327 struct seg
*segkpm
= NULL
; /* Unused on IA32 */
330 caddr_t segkp_base
; /* Base address of segkp */
331 caddr_t segzio_base
; /* Base address of segzio */
333 pgcnt_t segkpsize
= btop(SEGKPDEFSIZE
); /* size of segkp segment in pages */
335 pgcnt_t segkpsize
= 0;
337 pgcnt_t segziosize
= 0; /* size of zio segment in pages */
340 * A static DR page_t VA map is reserved that can map the page structures
341 * for a domain's entire RA space. The pages that back this space are
342 * dynamically allocated and need not be physically contiguous. The DR
343 * map size is derived from KPM size.
344 * This mechanism isn't used by x86 yet, so just stubs here.
346 int ppvm_enable
= 0; /* Static virtual map for page structs */
347 page_t
*ppvm_base
= NULL
; /* Base of page struct map */
348 pgcnt_t ppvm_size
= 0; /* Size of page struct map */
351 * VA range available to the debugger
353 const caddr_t kdi_segdebugbase
= (const caddr_t
)SEGDEBUGBASE
;
354 const size_t kdi_segdebugsize
= SEGDEBUGSIZE
;
356 struct memseg
*memseg_base
;
357 struct vnode unused_pages_vp
;
359 #define FOURGB 0x100000000LL
361 struct memlist
*memlist
;
363 caddr_t s_text
; /* start of kernel text segment */
364 caddr_t e_text
; /* end of kernel text segment */
365 caddr_t s_data
; /* start of kernel data segment */
366 caddr_t e_data
; /* end of kernel data segment */
367 caddr_t modtext
; /* start of loadable module text reserved */
368 caddr_t e_modtext
; /* end of loadable module text reserved */
369 caddr_t moddata
; /* start of loadable module data reserved */
370 caddr_t e_moddata
; /* end of loadable module data reserved */
372 struct memlist
*phys_install
; /* Total installed physical memory */
373 struct memlist
*phys_avail
; /* Total available physical memory */
374 struct memlist
*bios_rsvd
; /* Bios reserved memory */
377 * kphysm_init returns the number of pages that were processed
379 static pgcnt_t
kphysm_init(page_t
*, pgcnt_t
);
381 #define IO_PROP_SIZE 64 /* device property size */
384 * a couple useful roundup macros
/*
 * Address/size round-up helpers.  Each macro casts its argument to
 * uintptr_t, rounds it up with P2ROUNDUP() to the next multiple of the
 * stated alignment, and yields a uintptr_t.  The argument is evaluated
 * through P2ROUNDUP(), so avoid arguments with side effects.
 * NOTE(review): P2ROUNDUP() presumably requires a power-of-two alignment;
 * confirm against <sys/sysmacros.h>.
 */

/* Round up to the base page size, MMU_PAGESIZE. */
386 #define ROUND_UP_PAGE(x) \
387 ((uintptr_t)P2ROUNDUP((uintptr_t)(x), (uintptr_t)MMU_PAGESIZE))
/* Round up to mmu.level_size[1], the size mapped by a level 1 entry. */
388 #define ROUND_UP_LPAGE(x) \
389 ((uintptr_t)P2ROUNDUP((uintptr_t)(x), mmu.level_size[1]))
/* Round up to a FOUR_MEG boundary. */
390 #define ROUND_UP_4MEG(x) \
391 ((uintptr_t)P2ROUNDUP((uintptr_t)(x), (uintptr_t)FOUR_MEG))
/* Round up to mmu.level_size[mmu.max_level], the top page table level. */
392 #define ROUND_UP_TOPLEVEL(x) \
393 ((uintptr_t)P2ROUNDUP((uintptr_t)(x), mmu.level_size[mmu.max_level]))
396 * 32-bit Kernel's Virtual memory layout.
397 * +-----------------------+
399 * 0xFFC00000 -|-----------------------|- ARGSBASE
401 * 0xFF800000 -|-----------------------|- SEGDEBUGBASE
403 * 0xFEC00000 -|-----------------------|
405 * 0xFE800000 -|-----------------------|- KERNEL_TEXT (0xFB400000 on Xen)
406 * |--- GDT ---|- GDT page (GDT_VA)
407 * |--- debug info ---|- debug info (DEBUG_INFO_VA)
409 * | page_t structures |
410 * | memsegs, memlists, |
411 * | page hash, etc. |
412 * --- -|-----------------------|- ekernelheap, valloc_base (floating)
413 * | | (segkp is just an arena in the heap)
418 * --- -|-----------------------|- kernelheap (floating)
420 * 0xC3002000 -|-----------------------|- segmap_start (floating)
422 * 0xC3000000 -|-----------------------|- kernelbase / userlimit (floating)
424 * | Shared objects | \/
428 * |-----------------------|
430 * 0x08048000 -|-----------------------|
434 * 0x00000000 +-----------------------+
437 * 64-bit Kernel's Virtual memory layout. (assuming 64 bit app)
438 * +-----------------------+
440 * 0xFFFFFFFF.FFC00000 |-----------------------|- ARGSBASE
442 * 0xFFFFFFFF.FF800000 |-----------------------|- SEGDEBUGBASE
444 * +-----------------------+
446 * 0xFFFFFFFF.FBC00000 |-----------------------|
448 * 0xFFFFFFFF.FB800000 |-----------------------|- KERNEL_TEXT
449 * |--- debug info ---|- debug info (DEBUG_INFO_VA)
450 * |--- GDT ---|- GDT page (GDT_VA)
451 * |--- IDT ---|- IDT page (IDT_VA)
452 * |--- LDT ---|- LDT pages (LDT_VA)
454 * | Core heap | (used for loadable modules)
455 * 0xFFFFFFFF.C0000000 |-----------------------|- core_base / ekernelheap
458 * 0xFFFFFXXX.XXX00000 |-----------------------|- kernelheap (floating)
460 * 0xFFFFFXXX.XXX00000 |-----------------------|- segmap_start (floating)
461 * | device mappings |
462 * 0xFFFFFXXX.XXX00000 |-----------------------|- toxic_addr (floating)
464 * 0xFFFFFXXX.XXX00000 |-----------------------|- segzio_base (floating)
466 * --- |-----------------------|- segkp_base (floating)
467 * | page_t structures | valloc_base + valloc_sz
468 * | memsegs, memlists, |
469 * | page hash, etc. |
470 * 0xFFFFFF00.00000000 |-----------------------|- valloc_base (lower if >256GB)
472 * 0xFFFFFE00.00000000 |-----------------------|
474 * 0xFFFFFD80.00000000 |-----------------------|- KERNELBASE (lower if >256GB)
475 * | User stack |- User space memory
477 * | shared objects, etc | (grows downwards)
480 * 0xFFFF8000.00000000 |-----------------------|
482 * | VA Hole / unused |
484 * 0x00008000.00000000 |-----------------------|
488 * | user heap | (grows upwards)
491 * |-----------------------|
493 * 0x00000000.04000000 |-----------------------|
495 * 0x00000000.00000000 +-----------------------+
497 * A 32 bit app on the 64 bit kernel sees the same layout as on the 32 bit
498 * kernel, except that userlimit is raised to 0xfe000000
502 * valloc_base: start of the kernel's memory management/tracking data
503 * structures. This region contains page_t structures for
504 * physical memory, memsegs, memlists, and the page hash.
506 * core_base: start of the kernel's "core" heap area on 64-bit systems.
507 * This area is intended to be used for global data as well as for module
508 * text/data that does not fit into the nucleus pages. The core heap is
509 * restricted to a 2GB range, allowing every address within it to be
510 * accessed using rip-relative addressing
512 * ekernelheap: end of kernelheap and start of segmap.
514 * kernelheap: start of kernel heap. On 32-bit systems, this starts right
515 * above a red zone that separates the user's address space from the
516 * kernel's. On 64-bit systems, it sits above segkp and segkpm.
518 * segmap_start: start of segmap. The length of segmap can be modified
519 * through eeprom. The default length is 16MB on 32-bit systems and 64MB
522 * kernelbase: On a 32-bit kernel the default value of 0xd4000000 will be
523 * decreased by 2X the size required for page_t. This allows the kernel
524 * heap to grow in size with physical memory. With sizeof(page_t) == 80
525 * bytes, the following shows the values of kernelbase and kernel heap
526 * sizes for different memory configurations (assuming default segmap and
529 * mem size for kernelbase kernel heap
531 * ---- --------- ---------- -----------
532 * 1gb 0x01400000 0xd1800000 684MB
533 * 2gb 0x02800000 0xcf000000 704MB
534 * 4gb 0x05000000 0xca000000 744MB
535 * 6gb 0x07800000 0xc5000000 784MB
536 * 8gb 0x0a000000 0xc0000000 824MB
537 * 16gb 0x14000000 0xac000000 984MB
538 * 32gb 0x28000000 0x84000000 1304MB
539 * 64gb 0x50000000 0x34000000 1944MB (*)
541 * kernelbase is less than the abi minimum of 0xc0000000 for memory
542 * configurations above 8gb.
544 * (*) support for memory configurations above 32gb will require manual tuning
545 * of kernelbase to balance out the need of user applications.
548 /* real-time-clock initialization parameters */
549 extern time_t process_rtc_config_file(void);
551 uintptr_t kernelbase
;
552 uintptr_t postbootkernelbase
; /* not set till boot loader is gone */
553 uintptr_t eprom_kernelbase
;
555 uintptr_t segmap_start
;
559 size_t core_size
; /* size of "core" heap */
560 uintptr_t core_base
; /* base address of "core" heap */
563 * List of bootstrap pages. We mark these as allocated in startup.
564 * release_bootstrap() will free them when we're completely done with
567 static page_t
*bootpages
;
570 * boot time pages that have a vnode from the ramdisk will keep that forever.
572 static page_t
*rd_pages
;
577 static page_t
*lower_pages
= NULL
;
578 static int lower_pages_count
= 0;
580 struct system_hardware system_hardware
;
583 * Enable some debugging messages concerning memory usage...
586 print_memlist(char *title
, struct memlist
*mp
)
588 prom_printf("MEMLIST: %s:\n", title
);
590 prom_printf("\tAddress 0x%" PRIx64
", size 0x%" PRIx64
"\n",
591 mp
->ml_address
, mp
->ml_size
);
597 * XX64 need a comment here.. are these just default values, surely
598 * we read the "cpuid" type information to figure this out.
600 int l2cache_sz
= 0x80000;
601 int l2cache_linesz
= 0x40;
602 int l2cache_assoc
= 1;
604 static size_t textrepl_min_gb
= 10;
607 * on 64 bit we use a predefined VA range for mapping devices in the kernel
608 * on 32 bit the mappings are intermixed in the heap, so we use a bit map
612 vmem_t
*device_arena
;
613 uintptr_t toxic_addr
= (uintptr_t)NULL
;
614 size_t toxic_size
= 1024 * 1024 * 1024; /* Sparc uses 1 gig too */
618 ulong_t
*toxic_bit_map
; /* one bit for each 4k of VA in heap_arena */
619 size_t toxic_bit_map_len
= 0; /* in bits */
624 * Simple boot time debug facilities
626 static char *prm_dbg_str
[] = {
627 "%s:%d: '%s' is 0x%x\n",
628 "%s:%d: '%s' is 0x%llx\n"
/*
 * PRM_DEBUG(q): when prom_debug is non-zero, print the file name
 * ("startup.c"), the current line, the text of expression q and its value
 * via prom_printf().  The format string is chosen from prm_dbg_str[] by
 * sizeof (q) >> 3, i.e. index 0 for operands smaller than 8 bytes and
 * index 1 for 8-byte operands.
 *
 * Both macros expand to a bare "if" statement that already ends in a
 * semicolon, so do not use them as the unbraced body of an if/else.
 */
633 #define PRM_DEBUG(q) if (prom_debug) \
634 prom_printf(prm_dbg_str[sizeof (q) >> 3], "startup.c", __LINE__, #q, q);
/*
 * PRM_POINT(q): when prom_debug is non-zero, print the file name, the
 * current line and the string q as a progress marker.
 */
635 #define PRM_POINT(q) if (prom_debug) \
636 prom_printf("%s:%d: %s\n", "startup.c", __LINE__, q);
639 * This structure is used to keep track of the initial allocations
640 * done in startup_memlist(). The value of NUM_ALLOCATIONS needs to
641 * be >= the number of ADD_TO_ALLOCATIONS() executed in the code.
643 #define NUM_ALLOCATIONS 8
644 int num_allocations
= 0;
648 } allocations
[NUM_ALLOCATIONS
];
649 size_t valloc_sz
= 0;
650 uintptr_t valloc_base
;
652 #define ADD_TO_ALLOCATIONS(ptr, size) { \
653 size = ROUND_UP_PAGE(size); \
654 if (num_allocations == NUM_ALLOCATIONS) \
655 panic("too many ADD_TO_ALLOCATIONS()"); \
656 allocations[num_allocations].al_ptr = (void**)&ptr; \
657 allocations[num_allocations].al_size = size; \
663 * Allocate all the initial memory needed by the page allocator.
666 perform_allocations(void)
672 PRM_DEBUG(valloc_base
);
673 PRM_DEBUG(valloc_sz
);
674 valloc_align
= mmu
.level_size
[mmu
.max_page_level
> 0];
675 mem
= BOP_ALLOC(bootops
, (caddr_t
)valloc_base
, valloc_sz
, valloc_align
);
676 if (mem
!= (caddr_t
)valloc_base
)
677 panic("BOP_ALLOC() failed");
678 bzero(mem
, valloc_sz
);
679 for (i
= 0; i
< num_allocations
; ++i
) {
680 *allocations
[i
].al_ptr
= (void *)mem
;
681 mem
+= allocations
[i
].al_size
;
686 * Set up and enable SMAP now before we start other CPUs, but after the kernel's
687 * VM has been set up so we can use hot_patch_kernel_text().
689 * We can only patch 1, 2, or 4 bytes, but not three bytes. So instead, we
690 * replace the four byte word at the patch point. See uts/intel/ia32/ml/copy.s
691 * for more information on what's going on here.
701 extern int _smap_enable_patch_count
;
702 extern int _smap_disable_patch_count
;
704 if (disable_smap
!= 0)
705 remove_x86_feature(x86_featureset
, X86FSET_SMAP
);
707 if (is_x86_feature(x86_featureset
, X86FSET_SMAP
) == B_FALSE
)
710 for (i
= 0; i
< _smap_enable_patch_count
; i
++) {
713 VERIFY3U(i
, <, _smap_enable_patch_count
);
714 VERIFY(snprintf(sym
, sizeof (sym
), "_smap_enable_patch_%d", i
) <
716 instp
= (uint8_t *)(void *)kobj_getelfsym(sym
, NULL
, &sizep
);
718 inst
= (instp
[3] << 24) | (SMAP_CLAC_INSTR
& 0x00ffffff);
719 hot_patch_kernel_text((caddr_t
)instp
, inst
, 4);
722 for (i
= 0; i
< _smap_disable_patch_count
; i
++) {
725 VERIFY(snprintf(sym
, sizeof (sym
), "_smap_disable_patch_%d",
727 instp
= (uint8_t *)(void *)kobj_getelfsym(sym
, NULL
, &sizep
);
729 inst
= (instp
[3] << 24) | (SMAP_STAC_INSTR
& 0x00ffffff);
730 hot_patch_kernel_text((caddr_t
)instp
, inst
, 4);
733 hot_patch_kernel_text((caddr_t
)smap_enable
, SMAP_CLAC_INSTR
, 4);
734 hot_patch_kernel_text((caddr_t
)smap_disable
, SMAP_STAC_INSTR
, 4);
735 setcr4(getcr4() | CR4_SMAP
);
740 * Our world looks like this at startup time.
742 * In a 32-bit OS, boot loads the kernel text at 0xfe800000 and kernel data
743 * at 0xfec00000. On a 64-bit OS, kernel text and data are loaded at
744 * 0xffffffff.fb800000 and 0xffffffff.fbc00000 respectively. Those
745 * addresses are fixed in the binary at link time.
748 * unix/genunix/krtld/module text loads.
751 * unix/genunix/krtld/module data loads.
753 * Machine-dependent startup code
759 extern void startup_pci_bios(void);
761 extern cpuset_t cpu_ready_set
;
764 * Make sure that nobody tries to use segkpm until we have
765 * initialized it properly.
771 CPUSET_ONLY(cpu_ready_set
, 0); /* cpu 0 is boot cpu */
773 #if defined(__xpv) /* XXPV fix me! */
775 extern int segvn_use_regions
;
776 segvn_use_regions
= 0;
783 startup_xen_version();
790 * Note we need to do this even on fast reboot in order to access
791 * the irq routing table (used for pci labels).
807 PRM_POINT("startup_init() starting...");
810 * Complete the extraction of cpuid data
814 (void) check_boot_version(BOP_GETVERSION(bootops
));
817 * Check for prom_debug in boot environment
819 if (BOP_GETPROPLEN(bootops
, "prom_debug") >= 0) {
821 PRM_POINT("prom_debug found in boot enviroment");
825 * Collect node, cpu and memory configuration information.
827 get_system_configuration();
830 * Halt if this is an unsupported processor.
832 if (x86_type
== X86_TYPE_486
|| x86_type
== X86_TYPE_CYRIX_486
) {
833 printf("\n486 processor (\"%s\") detected.\n",
835 halt("This processor is not supported by this release "
839 PRM_POINT("startup_init() done");
843 * Callback for copy_memlist_filter() to filter nucleus, kadb/kmdb, (ie.
844 * everything mapped above KERNEL_TEXT) pages from phys_avail. Note it
845 * also filters out physical page zero. There is some reliance on the
846 * boot loader allocating only a few contiguous physical memory chunks.
849 avail_filter(uint64_t *addr
, uint64_t *size
)
861 prom_printf("\tFilter: in: a=%" PRIx64
", s=%" PRIx64
"\n",
865 * page zero is required for BIOS.. never make it available
868 *addr
+= MMU_PAGESIZE
;
869 *size
-= MMU_PAGESIZE
;
873 * First we trim from the front of the range. Since kbm_probe()
874 * walks ranges in virtual order, but addr/size are physical, we need
875 * to walk the list until no changes are seen. This deals with the case
876 * where page "p" is mapped at v, page "p + PAGESIZE" is mapped at w
881 for (va
= KERNEL_TEXT
;
882 *size
> 0 && kbm_probe(&va
, &len
, &pfn
, &prot
) != 0;
886 pfn_addr
= pfn_to_pa(pfn
);
887 pfn_eaddr
= pfn_addr
+ len
;
889 if (pfn_addr
<= *addr
&& pfn_eaddr
> *addr
) {
891 while (*size
> 0 && len
> 0) {
892 *addr
+= MMU_PAGESIZE
;
893 *size
-= MMU_PAGESIZE
;
898 if (change
&& prom_debug
)
899 prom_printf("\t\ttrim: a=%" PRIx64
", s=%" PRIx64
"\n",
904 * Trim pages from the end of the range.
906 for (va
= KERNEL_TEXT
;
907 *size
> 0 && kbm_probe(&va
, &len
, &pfn
, &prot
) != 0;
911 pfn_addr
= pfn_to_pa(pfn
);
913 if (pfn_addr
>= *addr
&& pfn_addr
< *addr
+ *size
)
914 *size
= pfn_addr
- *addr
;
918 prom_printf("\tFilter out: a=%" PRIx64
", s=%" PRIx64
"\n",
925 struct segkpm_crargs b
;
928 * These variables were all designed for sfmmu in which segkpm is
929 * mapped using a single pagesize - either 8KB or 4MB. On x86, we
930 * might use 2+ page sizes on a single machine, so none of these
931 * variables have a single correct value. They are set up as if we
932 * always use a 4KB pagesize, which should do no harm. In the long
933 * run, we should get rid of KPM's assumption that only a single
936 kpm_pgshft
= MMU_PAGESHIFT
;
937 kpm_pgsz
= MMU_PAGESIZE
;
938 kpm_pgoff
= MMU_PAGEOFFSET
;
941 ASSERT(((uintptr_t)kpm_vbase
& (kpm_pgsz
- 1)) == 0);
943 PRM_POINT("about to create segkpm");
944 rw_enter(&kas
.a_lock
, RW_WRITER
);
946 if (seg_attach(&kas
, kpm_vbase
, kpm_size
, segkpm
) < 0)
947 panic("cannot attach segkpm");
949 b
.prot
= PROT_READ
| PROT_WRITE
;
952 if (segkpm_create(segkpm
, (caddr_t
)&b
) != 0)
953 panic("segkpm_create segkpm");
955 rw_exit(&kas
.a_lock
);
960 * As the KPM was disabled while setting up the system, go back and fix
961 * CPU zero's access to its user page table. This is a bit gross, but
962 * we have a chicken and egg problem otherwise.
964 ASSERT(CPU
->cpu_hat_info
->hci_user_l3ptes
== NULL
);
965 CPU
->cpu_hat_info
->hci_user_l3ptes
=
966 (x86pte_t
*)hat_kpm_mapin_pfn(CPU
->cpu_hat_info
->hci_user_l3pfn
);
970 * The debug info page provides enough information to allow external
971 * inspectors (e.g. when running under a hypervisor) to bootstrap
972 * themselves into allowing full-blown kernel debugging.
975 init_debug_info(void)
981 ASSERT(sizeof (debug_info_t
) < MMU_PAGESIZE
);
984 mem
= BOP_ALLOC(bootops
, (caddr_t
)DEBUG_INFO_VA
, MMU_PAGESIZE
,
987 if (mem
!= (caddr_t
)DEBUG_INFO_VA
)
988 panic("BOP_ALLOC() failed");
989 bzero(mem
, MMU_PAGESIZE
);
991 di
= (debug_info_t
*)mem
;
993 di
->di_magic
= DEBUG_INFO_MAGIC
;
994 di
->di_version
= DEBUG_INFO_VERSION
;
995 di
->di_modules
= (uintptr_t)&modules
;
996 di
->di_s_text
= (uintptr_t)s_text
;
997 di
->di_e_text
= (uintptr_t)e_text
;
998 di
->di_s_data
= (uintptr_t)s_data
;
999 di
->di_e_data
= (uintptr_t)e_data
;
1000 di
->di_hat_htable_off
= offsetof(hat_t
, hat_htable
);
1001 di
->di_ht_pfn_off
= offsetof(htable_t
, ht_pfn
);
1005 * Build the memlists and other kernel essential memory system data structures.
1006 * This is everything at valloc_base.
1009 startup_memlist(void)
1020 pfn_t rsvd_high_pfn
;
1022 size_t rsvdmemlist_sz
;
1024 caddr_t pagecolor_mem
;
1025 size_t pagecolor_memsz
;
1026 caddr_t page_ctrs_mem
;
1027 size_t page_ctrs_size
;
1028 size_t pse_table_alloc_size
;
1029 struct memlist
*current
;
1030 extern void startup_build_mem_nodes(struct memlist
*);
1032 /* XX64 fix these - they should be in include files */
1033 extern size_t page_coloring_init(uint_t
, int, int);
1034 extern void page_coloring_setup(caddr_t
);
1036 PRM_POINT("startup_memlist() starting...");
1039 * Use leftover large page nucleus text/data space for loadable modules.
1040 * Use at most MODTEXT/MODDATA.
1042 len
= kbm_nucleus_size
;
1043 ASSERT(len
> MMU_PAGESIZE
);
1045 moddata
= (caddr_t
)ROUND_UP_PAGE(e_data
);
1046 e_moddata
= (caddr_t
)P2ROUNDUP((uintptr_t)e_data
, (uintptr_t)len
);
1047 if (e_moddata
- moddata
> MODDATA
)
1048 e_moddata
= moddata
+ MODDATA
;
1050 modtext
= (caddr_t
)ROUND_UP_PAGE(e_text
);
1051 e_modtext
= (caddr_t
)P2ROUNDUP((uintptr_t)e_text
, (uintptr_t)len
);
1052 if (e_modtext
- modtext
> MODTEXT
)
1053 e_modtext
= modtext
+ MODTEXT
;
1055 econtig
= e_moddata
;
1058 PRM_DEBUG(e_modtext
);
1060 PRM_DEBUG(e_moddata
);
1064 * Examine the boot loader physical memory map to find out:
1065 * - total memory in system - physinstalled
1066 * - the max physical address - physmax
1067 * - the number of discontiguous segments of memory.
1070 print_memlist("boot physinstalled",
1071 bootops
->boot_mem
->physinstalled
);
1072 installed_top_size_ex(bootops
->boot_mem
->physinstalled
, &physmax
,
1073 &physinstalled
, &memblocks
);
1075 PRM_DEBUG(physinstalled
);
1076 PRM_DEBUG(memblocks
);
1079 * Compute maximum physical address for memory DR operations.
1080 * Memory DR operations are unsupported on xpv or 32bit OSes.
1083 if (plat_dr_support_memory()) {
1084 if (plat_dr_physmax
== 0) {
1085 uint_t pabits
= UINT_MAX
;
1087 cpuid_get_addrsize(CPU
, &pabits
, NULL
);
1088 plat_dr_physmax
= btop(1ULL << pabits
);
1090 if (plat_dr_physmax
> PHYSMEM_MAX64
)
1091 plat_dr_physmax
= PHYSMEM_MAX64
;
1094 plat_dr_physmax
= 0;
1097 * Examine the bios reserved memory to find out:
1098 * - the number of discontiguous segments of memory.
1101 print_memlist("boot reserved mem",
1102 bootops
->boot_mem
->rsvdmem
);
1103 installed_top_size_ex(bootops
->boot_mem
->rsvdmem
, &rsvd_high_pfn
,
1104 &rsvd_pgcnt
, &rsvdmemblocks
);
1105 PRM_DEBUG(rsvd_high_pfn
);
1106 PRM_DEBUG(rsvd_pgcnt
);
1107 PRM_DEBUG(rsvdmemblocks
);
1110 * Initialize hat's mmu parameters.
1111 * Check for enforce-prot-exec in boot environment. It's used to
1112 * enable/disable support for the page table entry NX bit.
1113 * The default is to enforce PROT_EXEC on processors that support NX.
1114 * Boot seems to round up the "len", but 8 seems to be big enough.
1120 * physmax is lowered if there is more memory than can be
1121 * physically addressed in 32 bit (PAE/non-PAE) modes.
1124 if (PFN_ABOVE64G(physmax
)) {
1125 physinstalled
-= (physmax
- (PFN_64G
- 1));
1126 physmax
= PFN_64G
- 1;
1129 if (PFN_ABOVE4G(physmax
)) {
1130 physinstalled
-= (physmax
- (PFN_4G
- 1));
1131 physmax
= PFN_4G
- 1;
1136 startup_build_mem_nodes(bootops
->boot_mem
->physinstalled
);
1138 if (BOP_GETPROPLEN(bootops
, "enforce-prot-exec") >= 0) {
1139 int len
= BOP_GETPROPLEN(bootops
, "enforce-prot-exec");
1143 (void) BOP_GETPROP(bootops
, "enforce-prot-exec", value
);
1145 (void) strcpy(value
, "");
1146 if (strcmp(value
, "off") == 0)
1149 PRM_DEBUG(mmu
.pt_nx
);
1152 * We will need page_t's for every page in the system, except for
1153 * memory mapped at or above the start of the kernel text segment.
1155 * pages above e_modtext are attributed to kernel debugger (obp_pages)
1157 npages
= physinstalled
- 1; /* avail_filter() skips page 0, so "- 1" */
1160 while (kbm_probe(&va
, &len
, &pfn
, &prot
) != 0) {
1161 npages
-= len
>> MMU_PAGESHIFT
;
1162 if (va
>= (uintptr_t)e_moddata
)
1163 obp_pages
+= len
>> MMU_PAGESHIFT
;
1167 PRM_DEBUG(obp_pages
);
1170 * If physmem is patched to be non-zero, use it instead of the computed
1171 * value unless it is larger than the actual amount of memory on hand.
1173 if (physmem
== 0 || physmem
> npages
) {
1175 } else if (physmem
< npages
) {
1176 orig_npages
= npages
;
1182 * We now compute the sizes of all the initial allocations for
1183 * structures the kernel needs in order to do kmem_alloc(). These
1189 * page coloring data structs
1191 memseg_sz
= sizeof (struct memseg
) * (memblocks
+ POSS_NEW_FRAGMENTS
);
1192 ADD_TO_ALLOCATIONS(memseg_base
, memseg_sz
);
1193 PRM_DEBUG(memseg_sz
);
1196 * Reserve space for memlists. There's no real good way to know exactly
1197 * how much room we'll need, but this should be a good upper bound.
1199 memlist_sz
= ROUND_UP_PAGE(2 * sizeof (struct memlist
) *
1200 (memblocks
+ POSS_NEW_FRAGMENTS
));
1201 ADD_TO_ALLOCATIONS(memlist
, memlist_sz
);
1202 PRM_DEBUG(memlist_sz
);
1205 * Reserve space for bios reserved memlists.
1207 rsvdmemlist_sz
= ROUND_UP_PAGE(2 * sizeof (struct memlist
) *
1208 (rsvdmemblocks
+ POSS_NEW_FRAGMENTS
));
1209 ADD_TO_ALLOCATIONS(bios_rsvd
, rsvdmemlist_sz
);
1210 PRM_DEBUG(rsvdmemlist_sz
);
1213 ASSERT(P2SAMEHIGHBIT((1 << PP_SHIFT
), sizeof (struct page
)));
1215 * The page structure hash table size is a power of 2
1216 * such that the average hash chain length is PAGE_HASHAVELEN.
1218 page_hashsz
= npages
/ PAGE_HASHAVELEN
;
1219 page_hashsz_shift
= highbit(page_hashsz
);
1220 page_hashsz
= 1 << page_hashsz_shift
;
1221 pagehash_sz
= sizeof (struct page
*) * page_hashsz
;
1222 ADD_TO_ALLOCATIONS(page_hash
, pagehash_sz
);
1223 PRM_DEBUG(pagehash_sz
);
1226 * Set aside room for the page structures themselves.
1229 pp_sz
= sizeof (struct page
) * npages
;
1230 ADD_TO_ALLOCATIONS(pp_base
, pp_sz
);
1234 * determine l2 cache info and memory size for page coloring
1236 (void) getl2cacheinfo(CPU
,
1237 &l2cache_sz
, &l2cache_linesz
, &l2cache_assoc
);
1239 page_coloring_init(l2cache_sz
, l2cache_linesz
, l2cache_assoc
);
1240 ADD_TO_ALLOCATIONS(pagecolor_mem
, pagecolor_memsz
);
1241 PRM_DEBUG(pagecolor_memsz
);
1243 page_ctrs_size
= page_ctrs_sz();
1244 ADD_TO_ALLOCATIONS(page_ctrs_mem
, page_ctrs_size
);
1245 PRM_DEBUG(page_ctrs_size
);
1248 * Allocate the array that protects pp->p_selock.
1250 pse_shift
= size_pse_array(physmem
, max_ncpus
);
1251 pse_table_size
= 1 << pse_shift
;
1252 pse_table_alloc_size
= pse_table_size
* sizeof (pad_mutex_t
);
1253 ADD_TO_ALLOCATIONS(pse_mutex
, pse_table_alloc_size
);
1255 #if defined(__amd64)
1256 valloc_sz
= ROUND_UP_LPAGE(valloc_sz
);
1257 valloc_base
= VALLOC_BASE
;
1260 * The default values of VALLOC_BASE and SEGKPM_BASE should work
1261 * for values of physmax up to 256GB (1/4 TB). They need adjusting when
1262 * memory is at addresses above 256GB. When adjusted, segkpm_base must
1263 * be aligned on KERNEL_REDZONE_SIZE boundary (span of top level pte).
1265 * In the general case (>256GB), we use (4 * physmem) for the
1266 * kernel's virtual addresses, which is divided approximately
1268 * - 1 * physmem for segkpm
1269 * - 1.5 * physmem for segzio
1270 * - 1.5 * physmem for heap
1271 * Total: 4.0 * physmem
1273 * Note that the segzio and heap sizes are more than physmem so that
1274 * VA fragmentation does not prevent either of them from being
1275 * able to use nearly all of physmem. The value of 1.5x is determined
1276 * experimentally and may need to change if the workload changes.
1278 if (physmax
+ 1 > mmu_btop(TERABYTE
/ 4) ||
1279 plat_dr_physmax
> mmu_btop(TERABYTE
/ 4)) {
1280 uint64_t kpm_resv_amount
= mmu_ptob(physmax
+ 1);
1282 if (kpm_resv_amount
< mmu_ptob(plat_dr_physmax
)) {
1283 kpm_resv_amount
= mmu_ptob(plat_dr_physmax
);
1287 * This is what actually controls the KVA : UVA split.
1288 * The kernel uses high VA, and this is lowering the
1289 * boundary, thus increasing the amount of VA for the kernel.
1290 * This gives the kernel 4 * (amount of physical memory) VA.
1292 * The maximum VA is UINT64_MAX and we are using
1293 * 64-bit 2's complement math, so e.g. if you have 512GB
1294 * of memory, segkpm_base = -(4 * 512GB) == -2TB ==
1295 * UINT64_MAX - 2TB (approximately). So the kernel's
1296 * VA is [UINT64_MAX-2TB to UINT64_MAX].
1298 segkpm_base
= -(P2ROUNDUP((4 * kpm_resv_amount
),
1299 KERNEL_REDZONE_SIZE
));
1301 /* make sure we leave some space for user apps above hole */
1302 segkpm_base
= MAX(segkpm_base
, AMD64_VA_HOLE_END
+ TERABYTE
);
1303 if (segkpm_base
> SEGKPM_BASE
)
1304 segkpm_base
= SEGKPM_BASE
;
1305 PRM_DEBUG(segkpm_base
);
1307 valloc_base
= segkpm_base
+ P2ROUNDUP(kpm_resv_amount
, ONE_GIG
);
1308 if (valloc_base
< segkpm_base
)
1309 panic("not enough kernel VA to support memory size");
1310 PRM_DEBUG(valloc_base
);
1313 valloc_base
= (uintptr_t)(MISC_VA_BASE
- valloc_sz
);
1314 valloc_base
= P2ALIGN(valloc_base
, mmu
.level_size
[1]);
1315 PRM_DEBUG(valloc_base
);
1319 * do all the initial allocations
1321 perform_allocations();
1324 * Build phys_install and phys_avail in kernel memspace.
1325 * - phys_install should be all memory in the system.
1326 * - phys_avail is phys_install minus any memory mapped before this
1327 * point above KERNEL_TEXT.
1329 current
= phys_install
= memlist
;
1330 copy_memlist_filter(bootops
->boot_mem
->physinstalled
, ¤t
, NULL
);
1331 if ((caddr_t
)current
> (caddr_t
)memlist
+ memlist_sz
)
1332 panic("physinstalled was too big!");
1334 print_memlist("phys_install", phys_install
);
1336 phys_avail
= current
;
1337 PRM_POINT("Building phys_avail:\n");
1338 copy_memlist_filter(bootops
->boot_mem
->physinstalled
, ¤t
,
1340 if ((caddr_t
)current
> (caddr_t
)memlist
+ memlist_sz
)
1341 panic("physavail was too big!");
1343 print_memlist("phys_avail", phys_avail
);
1346 * Free unused memlist items, which may be used by memory DR driver
1349 if ((caddr_t
)current
< (caddr_t
)memlist
+ memlist_sz
) {
1350 memlist_free_block((caddr_t
)current
,
1351 (caddr_t
)memlist
+ memlist_sz
- (caddr_t
)current
);
1356 * Build bios reserved memspace
1358 current
= bios_rsvd
;
1359 copy_memlist_filter(bootops
->boot_mem
->rsvdmem
, ¤t
, NULL
);
1360 if ((caddr_t
)current
> (caddr_t
)bios_rsvd
+ rsvdmemlist_sz
)
1361 panic("bios_rsvd was too big!");
1363 print_memlist("bios_rsvd", bios_rsvd
);
1366 * Free unused memlist items, which may be used by memory DR driver
1369 if ((caddr_t
)current
< (caddr_t
)bios_rsvd
+ rsvdmemlist_sz
) {
1370 memlist_free_block((caddr_t
)current
,
1371 (caddr_t
)bios_rsvd
+ rsvdmemlist_sz
- (caddr_t
)current
);
1376 * setup page coloring
1378 page_coloring_setup(pagecolor_mem
);
1379 page_lock_init(); /* currently a no-op */
1382 * free page list counters
1384 (void) page_ctrs_alloc(page_ctrs_mem
);
1387 * Size the pcf array based on the number of cpus in the box at
1394 * Initialize the page structures from the memory lists.
1396 availrmem_initial
= availrmem
= freemem
= 0;
1397 PRM_POINT("Calling kphysm_init()...");
1398 npages
= kphysm_init(pp_base
, npages
);
1399 PRM_POINT("kphysm_init() done");
1405 * Now that page_t's have been initialized, remove all the
1406 * initial allocation pages from the kernel free page lists.
1408 boot_mapin((caddr_t
)valloc_base
, valloc_sz
);
1409 boot_mapin((caddr_t
)MISC_VA_BASE
, MISC_VA_SIZE
);
1410 PRM_POINT("startup_memlist() done");
1412 PRM_DEBUG(valloc_sz
);
1414 #if defined(__amd64)
1415 if ((availrmem
>> (30 - MMU_PAGESHIFT
)) >=
1416 textrepl_min_gb
&& l2cache_sz
<= 2 << 20) {
1417 extern size_t textrepl_size_thresh
;
1418 textrepl_size_thresh
= (16 << 20) - 1;
1424 * Layout the kernel's part of address space and initialize kmem allocator.
1429 extern void page_set_colorequiv_arr(void);
1431 extern uint64_t kpti_kbase
;
1434 PRM_POINT("startup_kmem() starting...");
1436 #if defined(__amd64)
1437 if (eprom_kernelbase
&& eprom_kernelbase
!= KERNELBASE
)
1438 cmn_err(CE_NOTE
, "!kernelbase cannot be changed on 64-bit "
1440 kernelbase
= segkpm_base
- KERNEL_REDZONE_SIZE
;
1441 core_base
= (uintptr_t)COREHEAP_BASE
;
1442 core_size
= (size_t)MISC_VA_BASE
- COREHEAP_BASE
;
1445 * We configure kernelbase based on:
1447 * 1. user specified kernelbase via eeprom command. Value cannot exceed
1448 * KERNELBASE_MAX. we large page align eprom_kernelbase
1450 * 2. Default to KERNELBASE and adjust to 2X less the size for page_t.
1451 * On large memory systems we must lower kernelbase to allow
1452 * enough room for page_t's for all of memory.
1454 * The value set here, might be changed a little later.
1456 if (eprom_kernelbase
) {
1457 kernelbase
= eprom_kernelbase
& mmu
.level_mask
[1];
1458 if (kernelbase
> KERNELBASE_MAX
)
1459 kernelbase
= KERNELBASE_MAX
;
1461 kernelbase
= (uintptr_t)KERNELBASE
;
1462 kernelbase
-= ROUND_UP_4MEG(2 * valloc_sz
);
1464 ASSERT((kernelbase
& mmu
.level_offset
[1]) == 0);
1465 core_base
= valloc_base
;
1469 PRM_DEBUG(core_base
);
1470 PRM_DEBUG(core_size
);
1471 PRM_DEBUG(kernelbase
);
1477 ekernelheap
= (char *)core_base
;
1478 PRM_DEBUG(ekernelheap
);
1481 * Now that we know the real value of kernelbase,
1482 * update variables that were initialized with a value of
1483 * KERNELBASE (in common/conf/param.c).
1485 * XXX The problem with this sort of hackery is that the
1486 * compiler just may feel like putting the const declarations
1487 * (in param.c) into the .text section. Perhaps they should
1488 * just be declared as variables there?
1491 *(uintptr_t *)&_kernelbase
= kernelbase
;
1492 *(uintptr_t *)&_userlimit
= kernelbase
;
1493 #if defined(__amd64)
1494 *(uintptr_t *)&_userlimit
-= KERNELBASE
- USERLIMIT
;
1496 kpti_kbase
= kernelbase
;
1499 *(uintptr_t *)&_userlimit32
= _userlimit
;
1501 PRM_DEBUG(_kernelbase
);
1502 PRM_DEBUG(_userlimit
);
1503 PRM_DEBUG(_userlimit32
);
1505 /* We have to re-do this now that we've modified _userlimit. */
1506 mmu_calc_user_slots();
1512 * If segmap is too large we can push the bottom of the kernel heap
1513 * higher than the base. Or worse, it could exceed the top of the
1514 * VA space entirely, causing it to wrap around.
1516 if (kernelheap
>= ekernelheap
|| (uintptr_t)kernelheap
< kernelbase
)
1517 panic("too little address space available for kernelheap,"
1518 " use eeprom for lower kernelbase or smaller segmapsize");
1522 * Initialize the kernel heap. Note 3rd argument must be > 1st.
1524 kernelheap_init(kernelheap
, ekernelheap
,
1525 kernelheap
+ MMU_PAGESIZE
,
1526 (void *)core_base
, (void *)(core_base
+ core_size
));
1530 * Link pending events struct into cpu struct
1532 CPU
->cpu_m
.mcpu_evt_pend
= &cpu0_evt_data
;
1535 * Initialize kernel memory allocator.
1540 * Factor in colorequiv to check additional 'equivalent' bins
1542 page_set_colorequiv_arr();
1545 * print this out early so that we know what's going on
1547 print_x86_featureset(x86_featureset
);
1550 * Initialize bp_mapin().
1552 bp_init(MMU_PAGESIZE
, HAT_STORECACHING_OK
);
1555 * orig_npages is non-zero if physmem has been configured for less
1556 * than the available memory.
1559 cmn_err(CE_WARN
, "!%slimiting physmem to 0x%lx of 0x%lx pages",
1560 (npages
== PHYSMEM
? "Due to virtual address space " : ""),
1561 npages
, orig_npages
);
1564 if (eprom_kernelbase
&& (eprom_kernelbase
!= kernelbase
))
1565 cmn_err(CE_WARN
, "kernelbase value, User specified 0x%lx, "
1566 "System using 0x%lx",
1567 (uintptr_t)eprom_kernelbase
, (uintptr_t)kernelbase
);
1570 #ifdef KERNELBASE_ABI_MIN
1571 if (kernelbase
< (uintptr_t)KERNELBASE_ABI_MIN
) {
1572 cmn_err(CE_NOTE
, "!kernelbase set to 0x%lx, system is not "
1573 "i386 ABI compliant.", (uintptr_t)kernelbase
);
1578 if (plat_dr_support_memory()) {
1583 * Some of the xen start information has to be relocated up
1584 * into the kernel's permanent address space.
1586 PRM_POINT("calling xen_relocate_start_info()");
1587 xen_relocate_start_info();
1588 PRM_POINT("xen_relocate_start_info() done");
1591 * (Update the vcpu pointer in our cpu structure to point into
1592 * the relocated shared info.)
1594 CPU
->cpu_m
.mcpu_vcpu_info
=
1595 &HYPERVISOR_shared_info
->vcpu_info
[CPU
->cpu_id
];
1598 PRM_POINT("startup_kmem() done");
1603 * If we have detected that we are running in an HVM environment, we need
1604 * to prepend the PV driver directory to the module search path.
1606 #define HVM_MOD_DIR "/platform/i86hvm/kernel"
1608 update_default_path()
1610 char *current
, *newpath
;
1614 * We are about to resync with krtld. krtld will reset its
1615 * internal module search path iff Solaris has set default_path.
1616 * We want to be sure we're prepending this new directory to the
1617 * right search path.
1619 current
= (default_path
== NULL
) ? kobj_module_path
: default_path
;
1621 newlen
= strlen(HVM_MOD_DIR
) + strlen(current
) + 2;
1622 newpath
= kmem_alloc(newlen
, KM_SLEEP
);
1623 (void) strcpy(newpath
, HVM_MOD_DIR
);
1624 (void) strcat(newpath
, " ");
1625 (void) strcat(newpath
, current
);
1627 default_path
= newpath
;
1632 startup_modules(void)
1635 extern void prom_setup(void);
1641 PRM_POINT("startup_modules() starting...");
1645 * Initialize ten-micro second timer so that drivers will
1646 * not get short changed in their init phase. This was
1647 * not getting called until clkinit which, on fast cpu's
1648 * caused the drv_usecwait to be way too short.
1652 if ((get_hwenv() & HW_XEN_HVM
) != 0)
1653 update_default_path();
1657 * Read the GMT lag from /etc/rtc_config.
1659 sgmtl(process_rtc_config_file());
1662 * Calculate default settings of system parameters based upon
1663 * maxusers, yet allow to be overridden via the /etc/system file.
1670 * Initialize system parameters.
1675 * Initialize the default brands
1680 * maxmem is the amount of physical memory we're playing with.
1685 * Initialize segment management stuff.
1689 if (modload("fs", "specfs") == -1)
1690 halt("Can't load specfs");
1692 if (modload("fs", "devfs") == -1)
1693 halt("Can't load devfs");
1695 if (modload("fs", "dev") == -1)
1696 halt("Can't load dev");
1698 if (modload("fs", "procfs") == -1)
1699 halt("Can't load procfs");
1701 (void) modloadonly("sys", "lbl_edition");
1705 /* Read cluster configuration data. */
1711 (void) xs_early_init();
1715 * Create a kernel device tree. First, create rootnex and
1716 * then invoke bus specific code to probe devices.
1721 if (DOMAIN_IS_INITDOMAIN(xen_info
))
1725 smbios_system_t smsys
;
1726 smbios_info_t sminfo
;
1729 * Load the System Management BIOS into the global ksmbios
1730 * handle, if an SMBIOS is present on this system.
1731 * Also set "si-hw-provider" property, if not already set.
1733 ksmbios
= smbios_open(NULL
, SMB_VERSION
, ksmbios_flags
, NULL
);
1734 if (ksmbios
!= NULL
&&
1735 ((smid
= smbios_info_system(ksmbios
, &smsys
)) != SMB_ERR
) &&
1736 (smbios_info_common(ksmbios
, smid
, &sminfo
)) != SMB_ERR
) {
1737 mfg
= (char *)sminfo
.smbi_manufacturer
;
1738 if (BOP_GETPROPLEN(bootops
, "si-hw-provider") < 0) {
1739 extern char hw_provider
[];
1741 for (i
= 0; i
< SYS_NMLN
; i
++) {
1742 if (isprint(mfg
[i
]))
1743 hw_provider
[i
] = mfg
[i
];
1745 hw_provider
[i
] = '\0';
1749 hw_provider
[SYS_NMLN
- 1] = '\0';
1756 * Originally clconf_init() apparently needed the hostid. But
1757 * this no longer appears to be true - it uses its own nodeid.
1758 * By placing the hostid logic here, we are able to make use of
1761 if ((h
= set_soft_hostid()) == HW_INVALID_HOSTID
) {
1762 cmn_err(CE_WARN
, "Unable to set hostid");
1764 for (v
= h
, cnt
= 0; cnt
< 10; cnt
++) {
1765 d
[cnt
] = (char)(v
% 10);
1770 for (cp
= hw_serial
; cnt
>= 0; cnt
--)
1771 *cp
++ = d
[cnt
] + '0';
1776 * Set up the CPU module subsystem for the boot cpu in the native
1777 * case, and all physical cpu resource in the xpv dom0 case.
1778 * Modifies the device tree, so this must be done after
1783 * If paravirtualized and on dom0 then we initialize all physical
1784 * cpu handles now; if paravirtualized on a domU then do not
1787 if (DOMAIN_IS_INITDOMAIN(xen_info
)) {
1788 xen_mc_lcpu_cookie_t cpi
;
1790 for (cpi
= xen_physcpu_next(NULL
); cpi
!= NULL
;
1791 cpi
= xen_physcpu_next(cpi
)) {
1792 if ((hdl
= cmi_init(CMI_HDL_SOLARIS_xVM_MCA
,
1793 xen_physcpu_chipid(cpi
), xen_physcpu_coreid(cpi
),
1794 xen_physcpu_strandid(cpi
))) != NULL
&&
1795 is_x86_feature(x86_featureset
, X86FSET_MCA
))
1801 * Initialize a handle for the boot cpu - others will initialize
1804 if ((hdl
= cmi_init(CMI_HDL_NATIVE
, cmi_ntv_hwchipid(CPU
),
1805 cmi_ntv_hwcoreid(CPU
), cmi_ntv_hwstrandid(CPU
))) != NULL
) {
1806 if (is_x86_feature(x86_featureset
, X86FSET_MCA
))
1808 CPU
->cpu_m
.mcpu_cmi_hdl
= hdl
;
1813 * Fake a prom tree such that /dev/openprom continues to work
1815 PRM_POINT("startup_modules: calling prom_setup...");
1817 PRM_POINT("startup_modules: done");
1820 * Load all platform specific modules
1822 PRM_POINT("startup_modules: calling psm_modload...");
1825 PRM_POINT("startup_modules() done");
1829 * claim a "setaside" boot page for use in the kernel
1832 boot_claim_page(pfn_t pfn
)
1836 pp
= page_numtopp_nolock(pfn
);
1839 if (PP_ISBOOTPAGES(pp
)) {
1840 if (pp
->p_next
!= NULL
)
1841 pp
->p_next
->p_prev
= pp
->p_prev
;
1842 if (pp
->p_prev
== NULL
)
1843 bootpages
= pp
->p_next
;
1845 pp
->p_prev
->p_next
= pp
->p_next
;
1848 * htable_attach() expects a base pagesize page
1851 page_boot_demote(pp
);
1852 pp
= page_numtopp(pfn
, SE_EXCL
);
1858 * Walk through the pagetables looking for pages mapped in by boot. If the
1859 * setaside flag is set the pages are expected to be returned to the
1860 * kernel later in boot, so we add them to the bootpages list.
1863 protect_boot_range(uintptr_t low
, uintptr_t high
, int setaside
)
1870 pgcnt_t boot_protect_cnt
= 0;
1872 while (kbm_probe(&va
, &len
, &pfn
, &prot
) != 0 && va
< high
) {
1873 if (va
+ len
>= high
)
1874 panic("0x%lx byte mapping at 0x%p exceeds boot's "
1875 "legal range.", len
, (void *)va
);
1878 pp
= page_numtopp_alloc(pfn
);
1881 panic("Unexpected mapping by boot. "
1882 "addr=%p pfn=%lx\n",
1885 pp
->p_next
= bootpages
;
1887 PP_SETBOOTPAGES(pp
);
1888 if (bootpages
!= NULL
) {
1889 bootpages
->p_prev
= pp
;
1896 len
-= MMU_PAGESIZE
;
1900 PRM_DEBUG(boot_protect_cnt
);
1907 layout_kernel_va(void)
1909 PRM_POINT("layout_kernel_va() starting...");
1911 * Establish the final size of the kernel's heap, size of segmap,
1915 #if defined(__amd64)
1917 kpm_vbase
= (caddr_t
)segkpm_base
;
1918 if (physmax
+ 1 < plat_dr_physmax
) {
1919 kpm_size
= ROUND_UP_LPAGE(mmu_ptob(plat_dr_physmax
));
1921 kpm_size
= ROUND_UP_LPAGE(mmu_ptob(physmax
+ 1));
1923 if ((uintptr_t)kpm_vbase
+ kpm_size
> (uintptr_t)valloc_base
)
1924 panic("not enough room for kpm!");
1925 PRM_DEBUG(kpm_size
);
1926 PRM_DEBUG(kpm_vbase
);
1929 * By default we create a seg_kp in 64 bit kernels, it's a little
1930 * faster to access than embedding it in the heap.
1932 segkp_base
= (caddr_t
)valloc_base
+ valloc_sz
;
1933 if (!segkp_fromheap
) {
1934 size_t sz
= mmu_ptob(segkpsize
);
1937 * determine size of segkp
1939 if (sz
< SEGKPMINSIZE
|| sz
> SEGKPMAXSIZE
) {
1941 cmn_err(CE_WARN
, "!Illegal value for segkpsize. "
1942 "segkpsize has been reset to %ld pages",
1945 sz
= MIN(sz
, MAX(SEGKPMINSIZE
, mmu_ptob(physmem
)));
1947 segkpsize
= mmu_btop(ROUND_UP_LPAGE(sz
));
1949 PRM_DEBUG(segkp_base
);
1950 PRM_DEBUG(segkpsize
);
1953 * segzio is used for ZFS cached data. It uses a distinct VA
1954 * segment (from kernel heap) so that we can easily tell not to
1955 * include it in kernel crash dumps on 64 bit kernels. The trick is
1956 * to give it lots of VA, but not constrain the kernel heap.
1957 * We can use 1.5x physmem for segzio, leaving approximately
1958 * another 1.5x physmem for heap. See also the comment in
1959 * startup_memlist().
1961 segzio_base
= segkp_base
+ mmu_ptob(segkpsize
);
1962 if (segzio_fromheap
) {
1965 size_t physmem_size
= mmu_ptob(physmem
);
1966 size_t size
= (segziosize
== 0) ?
1967 physmem_size
* 3 / 2 : mmu_ptob(segziosize
);
1969 if (size
< SEGZIOMINSIZE
)
1970 size
= SEGZIOMINSIZE
;
1971 segziosize
= mmu_btop(ROUND_UP_LPAGE(size
));
1973 PRM_DEBUG(segziosize
);
1974 PRM_DEBUG(segzio_base
);
1977 * Put the range of VA for device mappings next, kmdb knows to not
1978 * grep in this range of addresses.
1981 ROUND_UP_LPAGE((uintptr_t)segzio_base
+ mmu_ptob(segziosize
));
1982 PRM_DEBUG(toxic_addr
);
1983 segmap_start
= ROUND_UP_LPAGE(toxic_addr
+ toxic_size
);
1985 segmap_start
= ROUND_UP_LPAGE(kernelbase
);
1987 PRM_DEBUG(segmap_start
);
1990 * Users can change segmapsize through eeprom. If the variable
1991 * is tuned through eeprom, there is no upper bound on the
1994 segmapsize
= MAX(ROUND_UP_LPAGE(segmapsize
), SEGMAPDEFAULT
);
1998 * 32-bit systems don't have segkpm or segkp, so segmap appears at
1999 * the bottom of the kernel's address range. Set aside space for a
2000 * small red zone just below the start of segmap.
2002 segmap_start
+= KERNEL_REDZONE_SIZE
;
2003 segmapsize
-= KERNEL_REDZONE_SIZE
;
2006 PRM_DEBUG(segmap_start
);
2007 PRM_DEBUG(segmapsize
);
2008 kernelheap
= (caddr_t
)ROUND_UP_LPAGE(segmap_start
+ segmapsize
);
2009 PRM_DEBUG(kernelheap
);
2010 PRM_POINT("layout_kernel_va() done...");
2014 * Finish initializing the VM system, now that we are no longer
2015 * relying on the boot time memory allocators.
2020 struct segmap_crargs a
;
2022 extern int use_brk_lpg
, use_stk_lpg
;
2024 PRM_POINT("startup_vm() starting...");
2027 * Initialize the hat layer.
2032 * Do final allocations of HAT data structures that need to
2033 * be allocated before quiescing the boot loader.
2035 PRM_POINT("Calling hat_kern_alloc()...");
2036 hat_kern_alloc((caddr_t
)segmap_start
, segmapsize
, ekernelheap
);
2037 PRM_POINT("hat_kern_alloc() done");
2041 * Setup Page Attribute Table
2047 * The next two loops are done in distinct steps in order
2048 * to be sure that any page that is doubly mapped (both above
2049 * KERNEL_TEXT and below kernelbase) is dealt with correctly.
2050 * Note this may never happen, but it might someday.
2053 PRM_POINT("Protecting boot pages");
2056 * Protect any pages mapped above KERNEL_TEXT that somehow have
2057 * page_t's. This can only happen if something weird allocated
2058 * in this range (like kadb/kmdb).
2060 protect_boot_range(KERNEL_TEXT
, (uintptr_t)-1, 0);
2063 * Before we can take over memory allocation/mapping from the boot
2064 * loader we must remove from our free page lists any boot allocated
2065 * pages that stay mapped until release_bootstrap().
2067 protect_boot_range(0, kernelbase
, 1);
2071 * Switch to running on regular HAT (not boot_mmu)
2073 PRM_POINT("Calling hat_kern_setup()...");
2077 * It is no longer safe to call BOP_ALLOC(), so make sure we don't.
2081 PRM_POINT("hat_kern_setup() done");
2083 hat_cpu_online(CPU
);
2086 * Initialize VM system
2088 PRM_POINT("Calling kvm_init()...");
2090 PRM_POINT("kvm_init() done");
2093 * Tell kmdb that the VM system is now working
2095 if (boothowto
& RB_DEBUG
)
2100 * Populate the I/O pool on domain 0
2102 if (DOMAIN_IS_INITDOMAIN(xen_info
)) {
2103 extern long populate_io_pool(void);
2104 long init_io_pool_cnt
;
2106 PRM_POINT("Populating reserve I/O page pool");
2107 init_io_pool_cnt
= populate_io_pool();
2108 PRM_DEBUG(init_io_pool_cnt
);
2112 * Mangle the brand string etc.
2116 #if defined(__amd64)
2119 * Create the device arena for toxic (to dtrace/kmdb) mappings.
2121 device_arena
= vmem_create("device", (void *)toxic_addr
,
2122 toxic_size
, MMU_PAGESIZE
, NULL
, NULL
, NULL
, 0, VM_SLEEP
);
2127 * allocate the bit map that tracks toxic pages
2129 toxic_bit_map_len
= btop((ulong_t
)(valloc_base
- kernelbase
));
2130 PRM_DEBUG(toxic_bit_map_len
);
2132 kmem_zalloc(BT_SIZEOFMAP(toxic_bit_map_len
), KM_NOSLEEP
);
2133 ASSERT(toxic_bit_map
!= NULL
);
2134 PRM_DEBUG(toxic_bit_map
);
2140 * Now that we've got more VA, as well as the ability to allocate from
2141 * it, tell the debugger.
2143 if (boothowto
& RB_DEBUG
)
2144 kdi_dvec_memavail();
2148 * Map page pfn=0 for drivers, such as kd, that need to pick up
2149 * parameters left there by controllers/BIOS.
2151 PRM_POINT("setup up p0_va");
2152 p0_va
= i86devmap(0, 1, PROT_READ
);
2156 cmn_err(CE_CONT
, "?mem = %luK (0x%lx)\n",
2157 physinstalled
<< (MMU_PAGESHIFT
- 10), ptob(physinstalled
));
2160 * disable automatic large pages for small memory systems or
2161 * when the disable flag is set.
2163 * Do not yet consider page sizes larger than 2m/4m.
2165 if (!auto_lpg_disable
&& mmu
.max_page_level
> 0) {
2166 max_uheap_lpsize
= LEVEL_SIZE(1);
2167 max_ustack_lpsize
= LEVEL_SIZE(1);
2168 max_privmap_lpsize
= LEVEL_SIZE(1);
2169 max_uidata_lpsize
= LEVEL_SIZE(1);
2170 max_utext_lpsize
= LEVEL_SIZE(1);
2171 max_shm_lpsize
= LEVEL_SIZE(1);
2173 if (physmem
< privm_lpg_min_physmem
|| mmu
.max_page_level
== 0 ||
2178 mcntl0_lpsize
= LEVEL_SIZE(mmu
.umax_page_level
);
2180 PRM_POINT("Calling hat_init_finish()...");
2182 PRM_POINT("hat_init_finish() done");
2185 * Initialize the segkp segment type.
2187 rw_enter(&kas
.a_lock
, RW_WRITER
);
2188 PRM_POINT("Attaching segkp");
2189 if (segkp_fromheap
) {
2191 } else if (seg_attach(&kas
, (caddr_t
)segkp_base
, mmu_ptob(segkpsize
),
2193 panic("startup: cannot attach segkp");
2196 PRM_POINT("Doing segkp_create()");
2197 if (segkp_create(segkp
) != 0) {
2198 panic("startup: segkp_create failed");
2202 rw_exit(&kas
.a_lock
);
2212 * Now create segmap segment.
2214 rw_enter(&kas
.a_lock
, RW_WRITER
);
2215 if (seg_attach(&kas
, (caddr_t
)segmap_start
, segmapsize
, segmap
) < 0) {
2216 panic("cannot attach segmap");
2221 a
.prot
= PROT_READ
| PROT_WRITE
;
2223 a
.nfreelist
= segmapfreelists
;
2225 if (segmap_create(segmap
, (caddr_t
)&a
) != 0)
2226 panic("segmap_create segmap");
2227 rw_exit(&kas
.a_lock
);
2229 setup_vaddr_for_ppcopy(CPU
);
2233 if (DOMAIN_IS_INITDOMAIN(xen_info
))
2237 PRM_POINT("startup_vm() done");
2241 * Load a tod module for the non-standard tod part found on this system.
2244 load_tod_module(char *todmod
)
2246 if (modload("tod", todmod
) == -1)
2247 halt("Can't load TOD module");
2254 extern void setx86isalist(void);
2255 extern void cpu_event_init(void);
2257 PRM_POINT("startup_end() starting...");
2260 * Perform tasks that get done after most of the VM
2261 * initialization has been done but before the clock
2262 * and other devices get started.
2267 * Perform CPC initialization for this CPU.
2272 * Initialize cpu event framework.
2276 #if defined(OPTERON_WORKAROUND_6323525)
2277 if (opteron_workaround_6323525
)
2278 patch_workaround_6323525();
2281 * If needed, load TOD module now so that ddi_get_time(9F) etc. work
2282 * (For now, "needed" is defined as set tod_module_name in /etc/system)
2284 if (tod_module_name
!= NULL
) {
2285 PRM_POINT("load_tod_module()");
2286 load_tod_module(tod_module_name
);
2291 * Forceload interposing TOD module for the hypervisor.
2293 PRM_POINT("load_tod_module()");
2294 load_tod_module("xpvtod");
2298 * Configure the system.
2300 PRM_POINT("Calling configure()...");
2301 configure(); /* set up devices */
2302 PRM_POINT("configure() done");
2305 * We can now setup for XSAVE because fpu_probe is done in configure().
2307 if (fp_save_mech
== FP_XSAVE
) {
2308 xsave_setup_msr(CPU
);
2312 * Set the isa_list string to the defined instruction sets we
2316 cpu_intr_alloc(CPU
, NINTR_THREADS
);
2320 * We're done with bootops. We don't unmap the bootstrap yet because
2321 * we're still using bootsvcs.
2323 PRM_POINT("NULLing out bootops");
2324 *bootopsp
= (struct bootops
*)NULL
;
2325 bootops
= (struct bootops
*)NULL
;
2328 ec_init_debug_irq();
2334 * Intel IOMMU has been setup/initialized in ddi_impl.c
2340 * Now that we're no longer going to drop into real mode for a BIOS call
2341 * via bootops, we can enable PCID (which requires CR0.PG).
2346 PRM_POINT("Enabling interrupts");
2350 ASSERT(CPU
->cpu_m
.mcpu_vcpu_info
->evtchn_upcall_mask
== 0);
2354 (void) add_avsoftintr((void *)&softlevel1_hdl
, 1, softlevel1
,
2355 "softlevel1", NULL
, NULL
); /* XXX to be moved later */
2358 * Register software interrupt handlers for ddi_periodic_add(9F).
2359 * Software interrupts up to the level 10 are supported.
2361 for (i
= DDI_IPL_1
; i
<= DDI_IPL_10
; i
++) {
2362 (void) add_avsoftintr((void *)&softlevel_hdl
[i
-1], i
,
2363 (avfunc
)ddi_periodic_softintr
, "ddi_periodic",
2364 (caddr_t
)(uintptr_t)i
, NULL
);
2368 if (modload("drv", "amd_iommu") < 0) {
2369 PRM_POINT("No AMD IOMMU present\n");
2370 } else if (ddi_hold_installed_driver(ddi_name_to_major(
2371 "amd_iommu")) == NULL
) {
2372 prom_printf("ERROR: failed to attach AMD IOMMU\n");
2375 post_startup_cpu_fixups();
2377 PRM_POINT("startup_end() done");
2381 * Don't remove the following 2 variables. They are necessary
2382 * for reading the hostid from the legacy file (/kernel/misc/sysinit).
2384 char *_hs1107
= hw_serial
;
2390 extern void cpupm_init(cpu_t
*);
2391 extern void cpu_event_init_cpu(cpu_t
*);
2394 * Set the system wide, processor-specific flags to be passed
2395 * to userland via the aux vector for performance hints and
2396 * instruction set extensions.
2401 if (DOMAIN_IS_INITDOMAIN(xen_info
))
2408 * Startup the memory scrubber.
2409 * XXPV This should be running somewhere ..
2411 if ((get_hwenv() & HW_VIRTUAL
) == 0)
2417 * Complete CPU module initialization
2422 * Perform forceloading tasks for /etc/system.
2424 (void) mod_sysctl(SYS_FORCELOAD
, NULL
);
2427 * ON4.0: Force /proc module in until clock interrupt handle fixed
2428 * ON4.0: This must be fixed or restated in /etc/systems.
2430 (void) modload("fs", "procfs");
2432 (void) i_ddi_attach_hw_nodes("pit_beep");
2436 * Check for required functional Floating Point hardware,
2437 * unless FP hardware explicitly disabled.
2439 if (fpu_exists
&& (fpu_pentium_fdivbug
|| fp_kind
== FP_NO
))
2440 halt("No working FP hardware found");
2445 cpu_event_init_cpu(CPU
);
2447 (void) mach_cpu_create_device_node(CPU
, NULL
);
2453 pp_in_range(page_t
*pp
, uint64_t low_addr
, uint64_t high_addr
)
2455 return ((pp
->p_pagenum
>= btop(low_addr
)) &&
2456 (pp
->p_pagenum
< btopr(high_addr
)));
2460 pp_in_module(page_t
*pp
, const rd_existing_t
*modranges
)
2464 for (i
= 0; modranges
[i
].phys
!= 0; i
++) {
2465 if (pp_in_range(pp
, modranges
[i
].phys
,
2466 modranges
[i
].phys
+ modranges
[i
].size
))
2474 release_bootstrap(void)
2476 int root_is_ramdisk
;
2478 extern void kobj_boot_unmountroot(void);
2479 extern dev_t rootdev
;
2482 rd_existing_t
*modranges
;
2488 * Save the bootfs module ranges so that we can reserve them below
2489 * for the real bootfs.
2491 modranges
= kmem_alloc(sizeof (rd_existing_t
) * MAX_BOOT_MODULES
,
2493 for (i
= 0; ; i
++) {
2494 uint64_t start
, size
;
2496 modranges
[i
].phys
= 0;
2498 (void) snprintf(propname
, sizeof (propname
),
2499 "module-addr-%u", i
);
2500 if (do_bsys_getproplen(NULL
, propname
) <= 0)
2502 (void) do_bsys_getprop(NULL
, propname
, &start
);
2504 (void) snprintf(propname
, sizeof (propname
),
2505 "module-size-%u", i
);
2506 if (do_bsys_getproplen(NULL
, propname
) <= 0)
2508 (void) do_bsys_getprop(NULL
, propname
, &size
);
2510 modranges
[i
].phys
= start
;
2511 modranges
[i
].size
= size
;
2514 /* unmount boot ramdisk and release kmem usage */
2515 kobj_boot_unmountroot();
2518 * We're finished using the boot loader so free its pages.
2520 PRM_POINT("Unmapping lower boot pages");
2522 clear_boot_mappings(0, _userlimit
);
2524 postbootkernelbase
= kernelbase
;
2527 * If root isn't on ramdisk, destroy the hardcoded
2528 * ramdisk node now and release the memory. Else,
2529 * ramdisk memory is kept in rd_pages.
2531 root_is_ramdisk
= (getmajor(rootdev
) == ddi_name_to_major("ramdisk"));
2532 if (!root_is_ramdisk
) {
2533 dev_info_t
*dip
= ddi_find_devinfo("ramdisk", -1, 0);
2534 ASSERT(dip
&& ddi_get_parent(dip
) == ddi_root_node());
2535 ndi_rele_devi(dip
); /* held from ddi_find_devinfo */
2536 (void) ddi_remove_child(dip
, 0);
2539 PRM_POINT("Releasing boot pages");
2541 extern uint64_t ramdisk_start
, ramdisk_end
;
2543 bootpages
= pp
->p_next
;
2546 /* Keep pages for the lower 64K */
2547 if (pp_in_range(pp
, 0, 0x40000)) {
2548 pp
->p_next
= lower_pages
;
2550 lower_pages_count
++;
2554 if (root_is_ramdisk
&& pp_in_range(pp
, ramdisk_start
,
2555 ramdisk_end
) || pp_in_module(pp
, modranges
)) {
2556 pp
->p_next
= rd_pages
;
2560 pp
->p_next
= (struct page
*)0;
2561 pp
->p_prev
= (struct page
*)0;
2562 PP_CLRBOOTPAGES(pp
);
2565 PRM_POINT("Boot pages released");
2567 kmem_free(modranges
, sizeof (rd_existing_t
) * 99);
2570 /* XXPV -- note this following bunch of code needs to be revisited in Xen 3.0 */
2572 * Find 1 page below 1 MB so that other processors can boot up or
2573 * so that any processor can resume.
2574 * Make sure it has a kernel VA as well as a 1:1 mapping.
2575 * We should have just free'd one up.
2579 * 0x10 pages is 64K. Leave the bottom 64K alone
2582 for (pfn
= 0x10; pfn
< btop(1*1024*1024); pfn
++) {
2583 if (page_numtopp_alloc(pfn
) == NULL
)
2585 rm_platter_va
= i86devmap(pfn
, 1,
2586 PROT_READ
| PROT_WRITE
| PROT_EXEC
);
2587 rm_platter_pa
= ptob(pfn
);
2590 if (pfn
== btop(1*1024*1024) && use_mp
)
2591 panic("No page below 1M available for starting "
2592 "other processors or for resuming from system-suspend");
2597 * Initialize the platform-specific parts of a page_t.
2600 add_physmem_cb(page_t
*pp
, pfn_t pnum
)
2602 pp
->p_pagenum
= pnum
;
2603 pp
->p_mapping
= NULL
;
/*
 * kphysm_init(): walks the phys_avail memlist (while npages is non-zero)
 * and, for each usable range, fills in a memseg and hands the pages to
 * add_physmem().
 *
 * Steps visible in this extract, per memlist entry:
 *  - round addr up to a page boundary and shrink size to match;
 *  - clip the range so that it does not extend past physmax;
 *  - drop or trim the part of the range below ddiphysmin;
 *  - when mnode_xwa > 1, work out the memory nodes of the first and last
 *    pfn and (per the surviving comment) limit num to the starting node;
 *  - record pages, epages, pages_base and pages_end in cur_memseg and
 *    push it onto the head of the memsegs list;
 *  - call add_physmem() and add num to availrmem_initial.
 * Returns pages_done.
 *
 * NOTE(review): the embedded original line numbers show gaps (for example
 * 2654-2663 and 2741-2746), so the function header, the computation of
 * num, the updates of pages_done, npages and cur_memseg, and the statements
 * controlled by several if-conditions are not visible here -- confirm
 * against the full file before relying on this summary.
 */
2610 * kphysm_init() initializes physical memory.
2617 struct memlist
*pmem
;
2618 struct memseg
*cur_memseg
;
2622 pgcnt_t pages_done
= 0;
2625 extern pfn_t ddiphysmin
;
2626 extern int mnode_xwa
;
2629 ASSERT(page_hash
!= NULL
&& page_hashsz
!= 0);
2631 cur_memseg
= memseg_base
;
2632 for (pmem
= phys_avail
; pmem
&& npages
; pmem
= pmem
->ml_next
) {
2634 * In a 32 bit kernel can't use higher memory if we're
2635 * not booting in PAE mode. This check takes care of that.
2637 addr
= pmem
->ml_address
;
2638 size
= pmem
->ml_size
;
2639 if (btop(addr
) > physmax
)
2643 * align addr and size - they may not be at page boundaries
2645 if ((addr
& MMU_PAGEOFFSET
) != 0) {
2646 addr
+= MMU_PAGEOFFSET
;
2647 addr
&= ~(uint64_t)MMU_PAGEOFFSET
;
2648 size
-= addr
- pmem
->ml_address
;
2651 /* only process pages below or equal to physmax */
2652 if ((btop(addr
+ size
) - 1) > physmax
)
2653 size
= ptob(physmax
- btop(addr
) + 1);
/* NOTE(review): num is used below but its assignment is not in this extract. */
2664 base_pfn
= btop(addr
);
2667 prom_printf("MEMSEG addr=0x%" PRIx64
2668 " pgs=0x%lx pfn 0x%lx-0x%lx\n",
2669 addr
, num
, base_pfn
, base_pfn
+ num
);
2672 * Ignore pages below ddiphysmin to simplify ddi memory
2673 * allocation with non-zero addr_lo requests.
2675 if (base_pfn
< ddiphysmin
) {
2676 if (base_pfn
+ num
<= ddiphysmin
)
2678 pp
+= (ddiphysmin
- base_pfn
);
2679 num
-= (ddiphysmin
- base_pfn
);
2680 base_pfn
= ddiphysmin
;
2684 * mnode_xwa is greater than 1 when large pages regions can
2685 * cross memory node boundaries. To prevent the formation
2686 * of these large pages, configure the memsegs based on the
2687 * memory node ranges which had been made non-contiguous.
2689 if (mnode_xwa
> 1) {
2691 end_pfn
= base_pfn
+ num
- 1;
2692 ms
= PFN_2_MEM_NODE(base_pfn
);
2693 me
= PFN_2_MEM_NODE(end_pfn
);
2697 * current range spans more than 1 memory node.
2698 * Set num to only the pfn range in the start
/* NOTE(review): original line 2702, which completes the next statement, is missing here. */
2701 num
= mem_node_config
[ms
].physmax
- base_pfn
2703 ASSERT(end_pfn
> mem_node_config
[ms
].physmax
);
2709 * Build the memsegs entry
2711 cur_memseg
->pages
= pp
;
2712 cur_memseg
->epages
= pp
+ num
;
2713 cur_memseg
->pages_base
= base_pfn
;
2714 cur_memseg
->pages_end
= base_pfn
+ num
;
2717 * Insert into memseg list in decreasing pfn range
2718 * order. Low memory is typically more fragmented such
2719 * that this ordering keeps the larger ranges at the
2720 * front of the list for code that searches memseg.
2721 * This ASSERTS that the memsegs coming in from boot
2722 * are in increasing physical address order and not
2725 if (memsegs
!= NULL
) {
2726 ASSERT(cur_memseg
->pages_base
>=
2727 memsegs
->pages_end
);
2728 cur_memseg
->next
= memsegs
;
2730 memsegs
= cur_memseg
;
2733 * add_physmem() initializes the PSM part of the page
2734 * struct by calling the PSM back with add_physmem_cb().
2735 * In addition it coalesces pages into larger pages as
2736 * it initializes them.
2738 add_physmem(pp
, num
, base_pfn
);
2740 availrmem_initial
+= num
;
2747 /* process next memory node range */
2749 base_pfn
= mem_node_config
[ms
].physbase
;
2750 num
= MIN(mem_node_config
[ms
].physmax
,
2751 end_pfn
) - base_pfn
+ 1;
2755 PRM_DEBUG(availrmem_initial
);
2756 PRM_DEBUG(availrmem
);
2759 return (pages_done
);
/*
 * Kernel VM initialization: attaches the kernel segments to the kernel
 * address space (kas) while holding kas.a_lock as writer, then adjusts
 * protections once the lock has been dropped.
 *
 * Segments attached in this extract: ktextseg (s_text up to e_moddata),
 * kvalloc (valloc_base, valloc_sz), kvseg (kernelheap up to ekernelheap),
 * kvseg_core when core_size > 0, kzioseg when segziosize > 0 (followed by
 * segkmem_zio_init() over the same range), and kdebugseg.
 *
 * Protection changes: as_setprot() with protection 0 over
 * KERNEL_REDZONE_SIZE bytes at kernelbase, then READ|WRITE|EXEC over
 * s_text..e_modtext and over s_data..e_moddata.
 *
 * NOTE(review): the function header and the final argument lines of two
 * seg_attach() calls (original lines 2789 and 2796) are missing from this
 * extract.
 */
2763 * Kernel VM initialization.
2768 ASSERT((((uintptr_t)s_text
) & MMU_PAGEOFFSET
) == 0);
2771 * Put the kernel segments in kernel address space.
2773 rw_enter(&kas
.a_lock
, RW_WRITER
);
2776 (void) seg_attach(&kas
, s_text
, e_moddata
- s_text
, &ktextseg
);
2777 (void) segkmem_create(&ktextseg
);
2779 (void) seg_attach(&kas
, (caddr_t
)valloc_base
, valloc_sz
, &kvalloc
);
2780 (void) segkmem_create(&kvalloc
);
2782 (void) seg_attach(&kas
, kernelheap
,
2783 ekernelheap
- kernelheap
, &kvseg
);
2784 (void) segkmem_create(&kvseg
);
2786 if (core_size
> 0) {
2787 PRM_POINT("attaching kvseg_core");
2788 (void) seg_attach(&kas
, (caddr_t
)core_base
, core_size
,
2790 (void) segkmem_create(&kvseg_core
);
2793 if (segziosize
> 0) {
2794 PRM_POINT("attaching segzio");
2795 (void) seg_attach(&kas
, segzio_base
, mmu_ptob(segziosize
),
2797 (void) segkmem_zio_create(&kzioseg
);
2799 /* create zio area covering new segment */
2800 segkmem_zio_init(segzio_base
, mmu_ptob(segziosize
));
2803 (void) seg_attach(&kas
, kdi_segdebugbase
, kdi_segdebugsize
, &kdebugseg
);
2804 (void) segkmem_create(&kdebugseg
);
2806 rw_exit(&kas
.a_lock
);
2809 * Ensure that the red zone at kernelbase is never accessible.
2811 PRM_POINT("protecting redzone");
2812 (void) as_setprot(&kas
, (caddr_t
)kernelbase
, KERNEL_REDZONE_SIZE
, 0);
2815 * Make the text writable so that it can be hot patched by DTrace.
2817 (void) as_setprot(&kas
, s_text
, e_modtext
- s_text
,
2818 PROT_READ
| PROT_WRITE
| PROT_EXEC
);
2821 * Make data writable until end.
2823 (void) as_setprot(&kas
, s_data
, e_moddata
- s_data
,
2824 PROT_READ
| PROT_WRITE
| PROT_EXEC
);
/*
 * PAT (page attribute table) programming sequence.
 * pat_attr_reg holds the value written to the REG_PAT MSR and starts out
 * as PAT_DEFAULT_ATTRIBUTE.
 * Visible steps: test for X86FSET_PAT, read %cr0 into cr0 and cr0_orig,
 * clear CR4_PGE when it is set in cr4, write pat_attr_reg with wrmsr(),
 * then clear CR4_PGE again when it is set in cr4.
 * NOTE(review): the function header, the statement controlled by the
 * feature test, the cr0 cache-disable and restore statements, the read of
 * cr4 and whatever follows each setcr4() call are not present in this
 * extract (gaps in the embedded line numbers), so the exact flush sequence
 * cannot be confirmed from here.
 */
2829 * Solaris adds an entry for Write Combining caching to the PAT
2831 static uint64_t pat_attr_reg
= PAT_DEFAULT_ATTRIBUTE
;
2836 ulong_t cr0
, cr0_orig
, cr4
;
2838 if (!is_x86_feature(x86_featureset
, X86FSET_PAT
))
2840 cr0_orig
= cr0
= getcr0();
2843 /* disable caching and flush all caches and TLBs */
2848 if (cr4
& CR4_PGE
) {
2849 setcr4(cr4
& ~(ulong_t
)CR4_PGE
);
2855 /* add our entry to the PAT */
2856 wrmsr(REG_PAT
, pat_attr_reg
);
2858 /* flush TLBs and cache again, then reenable cr0 caching */
2859 if (cr4
& CR4_PGE
) {
2860 setcr4(cr4
& ~(ulong_t
)CR4_PGE
);
/*
 * Soft hostid support, guarded by _SOFT_HOSTID.
 * This span declares the external tenmicrodata counter, a file-local
 * atoi() prototype, the smbios_broken_uuid tunable (default 0) and
 * smbios_uuid_blacklist, a table of 16-byte UUIDs listed as known bad.
 * NOTE(review): the table comment speaks of lower 32-bit values, but each
 * row here holds a full 16-byte UUID -- confirm which is intended.
 * NOTE(review): brace-only lines and the label of the last (all-zero) row
 * are missing from this extract.
 */
2871 #if defined(_SOFT_HOSTID)
2873 * On platforms that do not have a hardware serial number, attempt
2874 * to set one based on the contents of /etc/hostid. If this file does
2875 * not exist, assume that we are to generate a new hostid and set
2876 * it in the kernel, for subsequent saving by a userland process
2877 * once the system is up and the root filesystem is mounted r/w.
2879 * In order to gracefully support upgrade on OpenSolaris, if
2880 * /etc/hostid does not exist, we will attempt to get a serial number
2881 * using the legacy method (/kernel/misc/sysinit).
2883 * If that isn't present, we attempt to use an SMBIOS UUID, which is
2884 * a hardware serial number. Note that we don't automatically trust
2885 * all SMBIOS UUIDs (some older platforms are defective and ship duplicate
2886 * UUIDs in violation of the standard), we check against a blacklist.
2888 * In an attempt to make the hostid less prone to abuse
2889 * (for license circumvention, etc), we store it in /etc/hostid
2892 extern volatile unsigned long tenmicrodata
;
2893 static int atoi(char *);
2896 * Set this to non-zero in /etc/system if you think your SMBIOS returns a
2897 * UUID that is not unique. (Also report it so that the smbios_uuid_blacklist
2898 * array can be updated.)
2900 int smbios_broken_uuid
= 0;
2903 * List of known bad UUIDs. This is just the lower 32-bit values, since
2904 * that's what we use for the host id. If your hostid falls here, you need
2905 * to contact your hardware OEM for a fix for your BIOS.
2907 static unsigned char
2908 smbios_uuid_blacklist
[][16] = {
2910 { /* Reported bad UUID (Google search) */
2911 0x00, 0x02, 0x00, 0x03, 0x00, 0x04, 0x00, 0x05,
2912 0x00, 0x06, 0x00, 0x07, 0x00, 0x08, 0x00, 0x09,
2914 { /* Known bad DELL UUID */
2915 0x4C, 0x4C, 0x45, 0x44, 0x00, 0x00, 0x20, 0x10,
2916 0x80, 0x20, 0x80, 0xC0, 0x4F, 0x20, 0x20, 0x20,
2918 { /* Uninitialized flash */
2919 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
2920 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
2923 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
2924 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
2929 uuid_to_hostid(const uint8_t *uuid
)
2932 * Although the UUIDs are 128-bits, they may not distribute entropy
2933 * evenly. We would like to use SHA or MD5, but those are located
2934 * in loadable modules and not available this early in boot. As we
2935 * don't need the values to be cryptographically strong, we just
2936 * generate 32-bit vaue by xor'ing the various sequences together,
2937 * which ensures that the entire UUID contributes to the hostid.
2941 /* first check against the blacklist */
2942 for (int i
= 0; i
< (sizeof (smbios_uuid_blacklist
) / 16); i
++) {
2943 if (bcmp(smbios_uuid_blacklist
[0], uuid
, 16) == 0) {
2944 cmn_err(CE_CONT
, "?Broken SMBIOS UUID. "
2945 "Contact BIOS manufacturer for repair.\n");
2946 return ((int32_t)HW_INVALID_HOSTID
);
2950 for (int i
= 0; i
< 16; i
++)
2951 id
^= ((uuid
[i
]) << (8 * (i
% sizeof (id
))));
2953 /* Make sure return value is positive */
2954 return (id
& 0x7fffffff);
/*
 * set_soft_hostid(): chooses the host id, starting from HW_INVALID_HOSTID.
 *
 * When hostid_file cannot be opened, the fallbacks visible here are:
 *  1. modload of misc/sysinit and, if hw_serial is non-empty, atoi() of
 *     hw_serial, followed by modunload;
 *  2. if hostid is still invalid, smbios_broken_uuid is 0, ksmbios is
 *     non-NULL and smbios_info_system() reports a UUID of at least 16
 *     bytes, uuid_to_hostid() of that UUID;
 *  3. if hostid is still invalid, tsc (or tenmicrodata when tsc is 0)
 *     masked with 0x0CFFFFF.
 * When the file opens, a token is read with kobj_lex() into tokbuf, passed
 * through an un-rot47 loop, parsed with kobj_getvalue() (a warning is
 * logged on a bad value) and stored as hostid; a warning is logged if
 * hostid is still HW_INVALID_HOSTID, and the file is closed.
 *
 * NOTE(review): several original lines (local declarations, the read of
 * tsc, the rot47 loop body, else keywords and the return statement) are
 * absent from this extract; the nesting of the steps above is inferred
 * from the surviving comments -- confirm against the full file.
 */
2958 set_soft_hostid(void)
2961 char tokbuf
[MAXNAMELEN
];
2966 int32_t hostid
= (int32_t)HW_INVALID_HOSTID
;
2969 smbios_system_t smsys
;
2972 * If /etc/hostid file not found, we'd like to get a pseudo
2973 * random number to use at the hostid. A nice way to do this
2974 * is to read the real time clock. To remain xen-compatible,
2975 * we can't poke the real hardware, so we use tsc_read() to
2976 * read the real time clock. However, there is an ominous
2977 * warning in tsc_read that says it can return zero, so we
2978 * deal with that possibility by falling back to using the
2979 * (hopefully random enough) value in tenmicrodata.
2982 if ((file
= kobj_open_file(hostid_file
)) == (struct _buf
*)-1) {
2984 * hostid file not found - try to load sysinit module
2985 * and see if it has a nonzero hostid value...use that
2986 * instead of generating a new hostid here if so.
2988 if ((i
= modload("misc", "sysinit")) != -1) {
2989 if (strlen(hw_serial
) > 0)
2990 hostid
= (int32_t)atoi(hw_serial
);
2991 (void) modunload(i
);
2995 * We try to use the SMBIOS UUID. But not if it is blacklisted
2998 if ((hostid
== HW_INVALID_HOSTID
) &&
2999 (smbios_broken_uuid
== 0) &&
3000 (ksmbios
!= NULL
) &&
3001 (smbios_info_system(ksmbios
, &smsys
) != SMB_ERR
) &&
3002 (smsys
.smbs_uuidlen
>= 16)) {
3003 hostid
= uuid_to_hostid(smsys
.smbs_uuid
);
3007 * Generate a "random" hostid using the clock. These
3008 * hostids will change on each boot if the value is not
3009 * saved to a persistent /etc/hostid file.
3011 if (hostid
== HW_INVALID_HOSTID
) {
3013 if (tsc
== 0) /* tsc_read can return zero sometimes */
3014 hostid
= (int32_t)tenmicrodata
& 0x0CFFFFF;
3016 hostid
= (int32_t)tsc
& 0x0CFFFFF;
3019 /* hostid file found */
3021 token
= kobj_lex(file
, tokbuf
, sizeof (tokbuf
));
3028 kobj_find_eol(file
);
3032 * un-rot47 - obviously this
3033 * nonsense is ascii-specific
3035 for (c
= (unsigned char *)tokbuf
;
3044 * now we should have a real number
3047 if (kobj_getvalue(tokbuf
, &tmp
) != 0)
3048 kobj_file_err(CE_WARN
, file
,
3049 "Bad value %s for hostid",
3052 hostid
= (int32_t)tmp
;
3066 if (hostid
== HW_INVALID_HOSTID
) /* didn't find a hostid */
3067 kobj_file_err(CE_WARN
, file
,
3068 "hostid missing or corrupt");
3070 kobj_close_file(file
);
3073 * hostid is now the value read from /etc/hostid, or the
3074 * new hostid we generated in this routine or HW_INVALID_HOSTID if not
/*
 * Decimal accumulation step: multiplies the running value i by 10 and adds
 * the digit value of the character at p, then advances p.
 * NOTE(review): presumably the body of the file-local atoi() prototyped
 * earlier in this section; the function header, loop condition and return
 * statement are not present in this extract.
 */
3086 i
= 10 * i
+ (*p
++ - '0');
3091 #endif /* _SOFT_HOSTID */
/*
 * get_system_configuration(): reads boot properties with BOP_GETPROPLEN()
 * and BOP_GETPROP() into prop and parses them with kobj_getvalue().
 *
 *  nodes, cpus_pernode -> system_hardware.hd_nodes and hd_cpus_per_node.
 *      If either property is longer than prop, cannot be read or parsed,
 *      or nodes exceeds MAXNODES, hd_nodes is 1 and hd_cpus_per_node is 0.
 *  kernelbase      -> eprom_kernelbase (NULL on failure)
 *  segmapsize      -> segmapsize (SEGMAPDEFAULT on failure)
 *  segmapfreelists -> segmapfreelists (0 on failure)
 *
 * NOTE(review): the declaration of prop and the else keywords between the
 * failure and success assignments are not present in this extract, so the
 * size of prop cannot be confirmed from here.
 */
3094 get_system_configuration(void)
3097 u_longlong_t nodes_ll
, cpus_pernode_ll
, lvalue
;
3099 if (BOP_GETPROPLEN(bootops
, "nodes") > sizeof (prop
) ||
3100 BOP_GETPROP(bootops
, "nodes", prop
) < 0 ||
3101 kobj_getvalue(prop
, &nodes_ll
) == -1 ||
3102 nodes_ll
> MAXNODES
||
3103 BOP_GETPROPLEN(bootops
, "cpus_pernode") > sizeof (prop
) ||
3104 BOP_GETPROP(bootops
, "cpus_pernode", prop
) < 0 ||
3105 kobj_getvalue(prop
, &cpus_pernode_ll
) == -1) {
3106 system_hardware
.hd_nodes
= 1;
3107 system_hardware
.hd_cpus_per_node
= 0;
3109 system_hardware
.hd_nodes
= (int)nodes_ll
;
3110 system_hardware
.hd_cpus_per_node
= (int)cpus_pernode_ll
;
3113 if (BOP_GETPROPLEN(bootops
, "kernelbase") > sizeof (prop
) ||
3114 BOP_GETPROP(bootops
, "kernelbase", prop
) < 0 ||
3115 kobj_getvalue(prop
, &lvalue
) == -1)
3116 eprom_kernelbase
= NULL
;
3118 eprom_kernelbase
= (uintptr_t)lvalue
;
3120 if (BOP_GETPROPLEN(bootops
, "segmapsize") > sizeof (prop
) ||
3121 BOP_GETPROP(bootops
, "segmapsize", prop
) < 0 ||
3122 kobj_getvalue(prop
, &lvalue
) == -1)
3123 segmapsize
= SEGMAPDEFAULT
;
3125 segmapsize
= (uintptr_t)lvalue
;
3127 if (BOP_GETPROPLEN(bootops
, "segmapfreelists") > sizeof (prop
) ||
3128 BOP_GETPROP(bootops
, "segmapfreelists", prop
) < 0 ||
3129 kobj_getvalue(prop
, &lvalue
) == -1)
3130 segmapfreelists
= 0; /* use segmap driver default */
3132 segmapfreelists
= (int)lvalue
;
3134 /* physmem used to be here, but moved much earlier to fakebop.c */
/*
 * Inserts the caller-supplied memlist node new, describing the range from
 * start up to end (start + len), into the list reached through memlistp.
 * Visible logic: new->ml_address is set to start; for the current entry
 * cur, if cur->ml_address is at or beyond end then new takes over
 * cur->ml_prev; otherwise an ASSERT checks that cur ends at or before
 * start, and when cur is the last entry new->ml_next is set to NULL.
 * memlistp is advanced to &cur->ml_next while walking.
 * NOTE(review): the function name line, the start and len parameters, the
 * loop construct and several link-update statements are missing from this
 * extract -- confirm the full insertion sequence against the full file.
 */
3138 * Add to a memory list.
3139 * start = start of new memory segment
3140 * len = length of new memory segment in bytes
3141 * new = pointer to a new struct memlist
3142 * memlistp = memory list to which to add segment.
3148 struct memlist
*new,
3149 struct memlist
**memlistp
)
3151 struct memlist
*cur
;
3152 uint64_t end
= start
+ len
;
3154 new->ml_address
= start
;
3160 if (cur
->ml_address
>= end
) {
3163 new->ml_prev
= cur
->ml_prev
;
3167 ASSERT(cur
->ml_address
+ cur
->ml_size
<= start
);
3168 if (cur
->ml_next
== NULL
) {
3171 new->ml_next
= NULL
;
3174 memlistp
= &cur
->ml_next
;
3180 kobj_vmem_init(vmem_t
**text_arena
, vmem_t
**data_arena
)
3182 size_t tsize
= e_modtext
- modtext
;
3183 size_t dsize
= e_moddata
- moddata
;
3185 *text_arena
= vmem_create("module_text", tsize
? modtext
: NULL
, tsize
,
3186 1, segkmem_alloc
, segkmem_free
, heaptext_arena
, 0, VM_SLEEP
);
3187 *data_arena
= vmem_create("module_data", dsize
? moddata
: NULL
, dsize
,
3188 1, segkmem_alloc
, segkmem_free
, heap32_arena
, 0, VM_SLEEP
);
3192 kobj_text_alloc(vmem_t
*arena
, size_t size
)
3194 return (vmem_alloc(arena
, size
, VM_SLEEP
| VM_BESTFIT
));
/*
 * kobj_texthole_alloc(): panics unconditionally; addr and size are not
 * used by the visible code.
 * NOTE(review): original lines 3202-3203 are absent from this extract.
 */
3199 kobj_texthole_alloc(caddr_t addr
, size_t size
)
3201 panic("unexpected call to kobj_texthole_alloc()");
/*
 * kobj_texthole_free(): panics unconditionally; addr and size are not
 * used by the visible code.
 * NOTE(review): original line 3211 is absent from this extract.
 */
3208 kobj_texthole_free(caddr_t addr
, size_t size
)
3210 panic("unexpected call to kobj_texthole_free()");
/*
 * Builds the isa_list string.
 * A scratch buffer of TBUFSIZE bytes is allocated with KM_SLEEP; under
 * __amd64 it starts with the amd64 entry.  Vendor-specific names follow:
 * for the Intel and AMD cases, pentium_pro (with +mmx variants when
 * X86FSET_MMX is present) if X86FSET_CMOV is present; for the Cyrix case,
 * pentium (with +mmx variants) if X86FSET_CPUID is present.  The entries
 * i486 i386 i86 are then appended, the result is copied into an exactly
 * sized buffer stored in isa_list, and the scratch buffer is freed.
 * NOTE(review): the function header, the local declarations, the heads of
 * the strcat calls whose arguments appear on original lines 3253-3254 and
 * 3265-3266, and any break or fall-through statements are missing from
 * this extract -- confirm the control flow against the full file.
 */
3214 * This is called just after configure() in startup().
3216 * The ISALIST concept is a bit hopeless on Intel, because
3217 * there's no guarantee of an ever-more-capable processor
3218 * given that various parts of the instruction set may appear
3219 * and disappear between different implementations.
3221 * While it would be possible to correct it and even enhance
3222 * it somewhat, the explicit hardware capability bitmask allows
3225 * So, we just leave this alone.
3232 extern char *isa_list
;
3234 #define TBUFSIZE 1024
3236 tp
= kmem_alloc(TBUFSIZE
, KM_SLEEP
);
3239 #if defined(__amd64)
3240 (void) strcpy(tp
, "amd64 ");
3243 switch (x86_vendor
) {
3244 case X86_VENDOR_Intel
:
3245 case X86_VENDOR_AMD
:
3247 if (is_x86_feature(x86_featureset
, X86FSET_CMOV
)) {
3249 * Pentium Pro or later
3251 (void) strcat(tp
, "pentium_pro");
3253 is_x86_feature(x86_featureset
, X86FSET_MMX
) ?
3254 "+mmx pentium_pro " : " ");
3257 case X86_VENDOR_Cyrix
:
3259 * The Cyrix 6x86 does not have any Pentium features
3260 * accessible while not at privilege level 0.
3262 if (is_x86_feature(x86_featureset
, X86FSET_CPUID
)) {
3263 (void) strcat(tp
, "pentium");
3265 is_x86_feature(x86_featureset
, X86FSET_MMX
) ?
3266 "+mmx pentium " : " ");
3272 (void) strcat(tp
, "i486 i386 i86");
3273 len
= strlen(tp
) + 1; /* account for NULL at end of string */
3274 isa_list
= strcpy(kmem_alloc(len
, KM_SLEEP
), tp
);
3275 kmem_free(tp
, TBUFSIZE
);
3284 device_arena_alloc(size_t size
, int vm_flag
)
3286 return (vmem_alloc(device_arena
, size
, vm_flag
));
3290 device_arena_free(void *vaddr
, size_t size
)
3292 vmem_free(device_arena
, vaddr
, size
);
/*
 * device_arena_alloc() variant that allocates from heap_arena and marks
 * the pages of the allocation in toxic_bit_map.
 * The allocation is asserted to lie between kernelbase and valloc_base.
 * The page indexes start and end are computed relative to kernelbase
 * (end is the index of the last byte), both are asserted to be below
 * toxic_bit_map_len, and BT_ATOMIC_SET() is applied inside a
 * while (start <= end) loop.
 * NOTE(review): the local declarations, any handling of a NULL return from
 * vmem_alloc(), the loop increment and the return statement are not
 * present in this extract.
 */
3298 device_arena_alloc(size_t size
, int vm_flag
)
3305 vaddr
= vmem_alloc(heap_arena
, size
, vm_flag
);
3309 v
= (uintptr_t)vaddr
;
3310 ASSERT(v
>= kernelbase
);
3311 ASSERT(v
+ size
<= valloc_base
);
3313 start
= btop(v
- kernelbase
);
3314 end
= btop(v
+ size
- 1 - kernelbase
);
3315 ASSERT(start
< toxic_bit_map_len
);
3316 ASSERT(end
< toxic_bit_map_len
);
3318 while (start
<= end
) {
3319 BT_ATOMIC_SET(toxic_bit_map
, start
);
/*
 * device_arena_free() variant matching the heap_arena allocator: clears
 * the toxic_bit_map bits for the pages of the range and then returns the
 * range to heap_arena with vmem_free().
 * The range is asserted to lie between kernelbase and valloc_base, the
 * page indexes start and end are asserted to be below toxic_bit_map_len,
 * and each bit is asserted to be set before BT_ATOMIC_CLEAR() is applied.
 * NOTE(review): the declarations of start and end and the loop increment
 * are not present in this extract.
 */
3326 device_arena_free(void *vaddr
, size_t size
)
3328 uintptr_t v
= (uintptr_t)vaddr
;
3332 ASSERT(v
>= kernelbase
);
3333 ASSERT(v
+ size
<= valloc_base
);
3335 start
= btop(v
- kernelbase
);
3336 end
= btop(v
+ size
- 1 - kernelbase
);
3337 ASSERT(start
< toxic_bit_map_len
);
3338 ASSERT(end
< toxic_bit_map_len
);
3340 while (start
<= end
) {
3341 ASSERT(BT_TEST(toxic_bit_map
, start
) != 0);
3342 BT_ATOMIC_CLEAR(toxic_bit_map
, start
);
3345 vmem_free(heap_arena
, vaddr
, size
);
3349 * returns 1st address in range that is in device arena, or NULL
3350 * if len is not NULL it returns the length of the toxic range
3353 device_arena_contains(void *vaddr
, size_t size
, size_t *len
)
3355 uintptr_t v
= (uintptr_t)vaddr
;
3356 uintptr_t eaddr
= v
+ size
;
3361 * if called very early by kmdb, just return NULL
3363 if (toxic_bit_map
== NULL
)
3367 * First check if we're completely outside the bitmap range.
3369 if (v
>= valloc_base
|| eaddr
< kernelbase
)
3373 * Trim ends of search to look at only what the bitmap covers.
3377 start
= btop(v
- kernelbase
);
3378 end
= btop(eaddr
- kernelbase
);
3379 if (end
>= toxic_bit_map_len
)
3380 end
= toxic_bit_map_len
;
3382 if (bt_range(toxic_bit_map
, &start
, &end
, end
) == 0)
3385 v
= kernelbase
+ ptob(start
);
3387 *len
= ptob(end
- start
);