2 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
3 * Copyright (c) 1992 Terrence R. Lambert.
4 * Copyright (c) 2003 Peter Wemm.
5 * Copyright (c) 2008-2017 The DragonFly Project.
8 * This code is derived from software contributed to Berkeley by
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the University of
22 * California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
40 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
43 //#include "use_npx.h"
48 #include "opt_msgbuf.h"
51 #include <sys/param.h>
52 #include <sys/systm.h>
53 #include <sys/sysproto.h>
54 #include <sys/signalvar.h>
55 #include <sys/kernel.h>
56 #include <sys/linker.h>
57 #include <sys/malloc.h>
61 #include <sys/reboot.h>
63 #include <sys/msgbuf.h>
64 #include <sys/sysent.h>
65 #include <sys/sysctl.h>
66 #include <sys/vmmeter.h>
68 #include <sys/usched.h>
71 #include <sys/ctype.h>
72 #include <sys/serialize.h>
73 #include <sys/systimer.h>
76 #include <vm/vm_param.h>
78 #include <vm/vm_kern.h>
79 #include <vm/vm_object.h>
80 #include <vm/vm_page.h>
81 #include <vm/vm_map.h>
82 #include <vm/vm_pager.h>
83 #include <vm/vm_extern.h>
85 #include <sys/thread2.h>
86 #include <sys/mplock2.h>
87 #include <sys/mutex2.h>
97 #include <machine/cpu.h>
98 #include <machine/clock.h>
99 #include <machine/specialreg.h>
101 #include <machine/bootinfo.h>
103 #include <machine/md_var.h>
104 #include <machine/metadata.h>
105 #include <machine/pc/bios.h>
106 #include <machine/pcb_ext.h> /* pcb.h included via sys/user.h */
107 #include <machine/globaldata.h> /* CPU_prvspace */
108 #include <machine/smp.h>
109 #include <machine/cputypes.h>
110 #include <machine/intr_machdep.h>
111 #include <machine/framebuffer.h>
114 #include <bus/isa/isa_device.h>
116 #include <machine_base/isa/isa_intr.h>
117 #include <bus/isa/rtc.h>
118 #include <sys/random.h>
119 #include <sys/ptrace.h>
120 #include <machine/sigframe.h>
122 #include <sys/machintr.h>
123 #include <machine_base/icu/icu_abi.h>
124 #include <machine_base/icu/elcr_var.h>
125 #include <machine_base/apic/lapic.h>
126 #include <machine_base/apic/ioapic.h>
127 #include <machine_base/apic/ioapic_abi.h>
128 #include <machine/mptable.h>
130 #define PHYSMAP_ENTRIES 10
132 extern u_int64_t
hammer_time(u_int64_t
, u_int64_t
);
134 extern void printcpuinfo(void); /* XXX header file */
135 extern void identify_cpu(void);
137 extern void finishidentcpu(void);
139 extern void panicifcpuunsupported(void);
141 static void cpu_startup(void *);
142 static void pic_finish(void *);
143 static void cpu_finish(void *);
145 static void set_fpregs_xmm(struct save87
*, struct savexmm
*);
146 static void fill_fpregs_xmm(struct savexmm
*, struct save87
*);
147 static void init_locks(void);
149 extern void pcpu_timer_always(struct intrframe
*);
151 SYSINIT(cpu
, SI_BOOT2_START_CPU
, SI_ORDER_FIRST
, cpu_startup
, NULL
);
152 SYSINIT(pic_finish
, SI_BOOT2_FINISH_PIC
, SI_ORDER_FIRST
, pic_finish
, NULL
);
153 SYSINIT(cpu_finish
, SI_BOOT2_FINISH_CPU
, SI_ORDER_FIRST
, cpu_finish
, NULL
);
156 extern vm_offset_t ksym_start
, ksym_end
;
159 struct privatespace CPU_prvspace_bsp
__aligned(4096);
160 struct privatespace
*CPU_prvspace
[MAXCPU
] = { &CPU_prvspace_bsp
};
162 vm_paddr_t efi_systbl_phys
;
163 int _udatasel
, _ucodesel
, _ucode32sel
;
165 int64_t tsc_offsets
[MAXCPU
];
166 cpumask_t smp_idleinvl_mask
;
167 cpumask_t smp_idleinvl_reqs
;
169 static int cpu_mwait_halt_global
; /* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */
171 #if defined(SWTCH_OPTIM_STATS)
172 extern int swtch_optim_stats
;
173 SYSCTL_INT(_debug
, OID_AUTO
, swtch_optim_stats
,
174 CTLFLAG_RD
, &swtch_optim_stats
, 0, "");
175 SYSCTL_INT(_debug
, OID_AUTO
, tlb_flush_count
,
176 CTLFLAG_RD
, &tlb_flush_count
, 0, "");
178 SYSCTL_INT(_hw
, OID_AUTO
, cpu_mwait_halt
,
179 CTLFLAG_RD
, &cpu_mwait_halt_global
, 0, "");
180 SYSCTL_INT(_hw
, OID_AUTO
, cpu_mwait_spin
, CTLFLAG_RD
, &cpu_mwait_spin
, 0,
181 "monitor/mwait target state");
183 #define CPU_MWAIT_HAS_CX \
184 ((cpu_feature2 & CPUID2_MON) && \
185 (cpu_mwait_feature & CPUID_MWAIT_EXT))
187 #define CPU_MWAIT_CX_NAMELEN 16
189 #define CPU_MWAIT_C1 1
190 #define CPU_MWAIT_C2 2
191 #define CPU_MWAIT_C3 3
192 #define CPU_MWAIT_CX_MAX 8
194 #define CPU_MWAIT_HINT_AUTO -1 /* C1 and C2 */
195 #define CPU_MWAIT_HINT_AUTODEEP -2 /* C3+ */
197 SYSCTL_NODE(_machdep
, OID_AUTO
, mwait
, CTLFLAG_RW
, 0, "MWAIT features");
198 SYSCTL_NODE(_machdep_mwait
, OID_AUTO
, CX
, CTLFLAG_RW
, 0, "MWAIT Cx settings");
200 struct cpu_mwait_cx
{
203 struct sysctl_ctx_list sysctl_ctx
;
204 struct sysctl_oid
*sysctl_tree
;
206 static struct cpu_mwait_cx cpu_mwait_cx_info
[CPU_MWAIT_CX_MAX
];
207 static char cpu_mwait_cx_supported
[256];
209 static int cpu_mwait_c1_hints_cnt
;
210 static int cpu_mwait_hints_cnt
;
211 static int *cpu_mwait_hints
;
213 static int cpu_mwait_deep_hints_cnt
;
214 static int *cpu_mwait_deep_hints
;
216 #define CPU_IDLE_REPEAT_DEFAULT 750
218 static u_int cpu_idle_repeat
= CPU_IDLE_REPEAT_DEFAULT
;
219 static u_long cpu_idle_repeat_max
= CPU_IDLE_REPEAT_DEFAULT
;
220 static u_int cpu_mwait_repeat_shift
= 1;
222 #define CPU_MWAIT_C3_PREAMBLE_BM_ARB 0x1
223 #define CPU_MWAIT_C3_PREAMBLE_BM_STS 0x2
225 static int cpu_mwait_c3_preamble
=
226 CPU_MWAIT_C3_PREAMBLE_BM_ARB
|
227 CPU_MWAIT_C3_PREAMBLE_BM_STS
;
229 SYSCTL_STRING(_machdep_mwait_CX
, OID_AUTO
, supported
, CTLFLAG_RD
,
230 cpu_mwait_cx_supported
, 0, "MWAIT supported C states");
231 SYSCTL_INT(_machdep_mwait_CX
, OID_AUTO
, c3_preamble
, CTLFLAG_RD
,
232 &cpu_mwait_c3_preamble
, 0, "C3+ preamble mask");
234 static int cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS
,
236 static int cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS
);
237 static int cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS
);
238 static int cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS
);
240 SYSCTL_PROC(_machdep_mwait_CX
, OID_AUTO
, idle
, CTLTYPE_STRING
|CTLFLAG_RW
,
241 NULL
, 0, cpu_mwait_cx_idle_sysctl
, "A", "");
242 SYSCTL_PROC(_machdep_mwait_CX
, OID_AUTO
, spin
, CTLTYPE_STRING
|CTLFLAG_RW
,
243 NULL
, 0, cpu_mwait_cx_spin_sysctl
, "A", "");
244 SYSCTL_UINT(_machdep_mwait_CX
, OID_AUTO
, repeat_shift
, CTLFLAG_RW
,
245 &cpu_mwait_repeat_shift
, 0, "");
249 u_long ebda_addr
= 0;
251 int imcr_present
= 0;
253 int naps
= 0; /* # of Applications processors */
258 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS
)
260 u_long pmem
= ctob(physmem
);
263 error
= sysctl_handle_long(oidp
, &pmem
, 0, req
);
268 SYSCTL_PROC(_hw
, HW_PHYSMEM
, physmem
, CTLTYPE_ULONG
|CTLFLAG_RD
,
269 0, 0, sysctl_hw_physmem
, "LU",
270 "Total system memory in bytes (number of pages * page size)");
273 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS
)
275 u_long usermem
= ctob(physmem
- vmstats
.v_wire_count
);
278 error
= sysctl_handle_long(oidp
, &usermem
, 0, req
);
283 SYSCTL_PROC(_hw
, HW_USERMEM
, usermem
, CTLTYPE_ULONG
|CTLFLAG_RD
,
284 0, 0, sysctl_hw_usermem
, "LU", "");
287 sysctl_hw_availpages(SYSCTL_HANDLER_ARGS
)
292 availpages
= x86_64_btop(avail_end
- avail_start
);
293 error
= sysctl_handle_long(oidp
, &availpages
, 0, req
);
298 SYSCTL_PROC(_hw
, OID_AUTO
, availpages
, CTLTYPE_ULONG
|CTLFLAG_RD
,
299 0, 0, sysctl_hw_availpages
, "LU", "");
305 * The number of PHYSMAP entries must be one less than the number of
306 * PHYSSEG entries because the PHYSMAP entry that spans the largest
307 * physical address that is accessible by ISA DMA is split into two
310 vm_phystable_t phys_avail
[VM_PHYSSEG_MAX
+ 1];
311 vm_phystable_t dump_avail
[VM_PHYSSEG_MAX
+ 1];
313 /* must be 1 less so 0 0 can signal end of chunks */
314 #define PHYS_AVAIL_ARRAY_END (NELEM(phys_avail) - 1)
315 #define DUMP_AVAIL_ARRAY_END (NELEM(dump_avail) - 1)
317 static vm_offset_t buffer_sva
, buffer_eva
;
318 vm_offset_t clean_sva
, clean_eva
;
319 static vm_offset_t pager_sva
, pager_eva
;
320 static struct trapframe proc0_tf
;
323 cpu_startup(void *dummy
)
327 vm_offset_t firstaddr
;
330 * Good {morning,afternoon,evening,night}.
332 kprintf("%s", version
);
335 panicifcpuunsupported();
336 kprintf("real memory = %ju (%ju MB)\n",
338 (intmax_t)Realmem
/ 1024 / 1024);
340 * Display any holes after the first chunk of extended memory.
345 kprintf("Physical memory chunk(s):\n");
346 for (indx
= 0; phys_avail
[indx
].phys_end
!= 0; ++indx
) {
349 size1
= phys_avail
[indx
].phys_end
-
350 phys_avail
[indx
].phys_beg
;
352 kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
353 (intmax_t)phys_avail
[indx
].phys_beg
,
354 (intmax_t)phys_avail
[indx
].phys_end
- 1,
356 (intmax_t)(size1
/ PAGE_SIZE
));
361 * Allocate space for system data structures.
362 * The first available kernel virtual address is in "v".
363 * As pages of kernel virtual memory are allocated, "v" is incremented.
364 * As pages of memory are allocated and cleared,
365 * "firstaddr" is incremented.
366 * An index into the kernel page table corresponding to the
367 * virtual memory address maintained in "v" is kept in "mapaddr".
371 * Make two passes. The first pass calculates how much memory is
372 * needed and allocates it. The second pass assigns virtual
373 * addresses to the various data structures.
377 v
= (caddr_t
)firstaddr
;
379 #define valloc(name, type, num) \
380 (name) = (type *)v; v = (caddr_t)((name)+(num))
381 #define valloclim(name, type, num, lim) \
382 (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
385 * The nominal buffer size (and minimum KVA allocation) is MAXBSIZE.
386 * For the first 64MB of ram nominally allocate sufficient buffers to
387 * cover 1/4 of our ram. Beyond the first 64MB allocate additional
388 * buffers to cover 1/20 of our ram over 64MB. When auto-sizing
389 * the buffer cache we limit the eventual kva reservation to
392 * factor represents the 1/4 x ram conversion.
395 long factor
= 4 * NBUFCALCSIZE
/ 1024;
396 long kbytes
= physmem
* (PAGE_SIZE
/ 1024);
400 nbuf
+= min((kbytes
- 4096) / factor
, 65536 / factor
);
402 nbuf
+= (kbytes
- 65536) * 2 / (factor
* 5);
403 if (maxbcache
&& nbuf
> maxbcache
/ NBUFCALCSIZE
)
404 nbuf
= maxbcache
/ NBUFCALCSIZE
;
408 * Do not allow the buffer_map to be more then 1/2 the size of the
411 if (nbuf
> (virtual_end
- virtual_start
+
412 virtual2_end
- virtual2_start
) / (MAXBSIZE
* 2)) {
413 nbuf
= (virtual_end
- virtual_start
+
414 virtual2_end
- virtual2_start
) / (MAXBSIZE
* 2);
415 kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf
);
419 * Do not allow the buffer_map to use more than 50% of available
420 * physical-equivalent memory. Since the VM pages which back
421 * individual buffers are typically wired, having too many bufs
422 * can prevent the system from paging properly.
424 if (nbuf
> physmem
* PAGE_SIZE
/ (NBUFCALCSIZE
* 2)) {
425 nbuf
= physmem
* PAGE_SIZE
/ (NBUFCALCSIZE
* 2);
426 kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf
);
430 * Do not allow the sizeof(struct buf) * nbuf to exceed 1/4 of
431 * the valloc space which is just the virtual_end - virtual_start
432 * section. This is typically ~2GB regardless of the amount of
433 * memory, so we use 500MB as a metric.
435 * This is because we use valloc() to allocate the buf header array.
437 * NOTE: buffer space in bytes is limited by vfs.*bufspace sysctls.
439 if (nbuf
> (virtual_end
- virtual_start
) / sizeof(struct buf
) / 4) {
440 nbuf
= (virtual_end
- virtual_start
) /
441 sizeof(struct buf
) / 2;
442 kprintf("Warning: nbufs capped at %ld due to "
443 "valloc considerations\n",
447 nswbuf_mem
= lmax(lmin(nbuf
/ 32, 512), 8);
449 if (nswbuf_mem
< NSWBUF_MIN
)
450 nswbuf_mem
= NSWBUF_MIN
;
452 nswbuf_kva
= lmax(lmin(nbuf
/ 4, 512), 16);
454 if (nswbuf_kva
< NSWBUF_MIN
)
455 nswbuf_kva
= NSWBUF_MIN
;
458 valloc(swbuf_mem
, struct buf
, nswbuf_mem
);
459 valloc(swbuf_kva
, struct buf
, nswbuf_kva
);
460 valloc(buf
, struct buf
, nbuf
);
463 * End of first pass, size has been calculated so allocate memory
465 if (firstaddr
== 0) {
466 size
= (vm_size_t
)(v
- firstaddr
);
467 firstaddr
= kmem_alloc(&kernel_map
, round_page(size
),
470 panic("startup: no room for tables");
475 * End of second pass, addresses have been assigned
477 * nbuf is an int, make sure we don't overflow the field.
479 * On 64-bit systems we always reserve maximal allocations for
480 * buffer cache buffers and there are no fragmentation issues,
481 * so the KVA segment does not have to be excessively oversized.
483 if ((vm_size_t
)(v
- firstaddr
) != size
)
484 panic("startup: table size inconsistency");
486 kmem_suballoc(&kernel_map
, &clean_map
, &clean_sva
, &clean_eva
,
487 ((vm_offset_t
)(nbuf
+ 16) * MAXBSIZE
) +
488 ((nswbuf_mem
+ nswbuf_kva
) * MAXPHYS
) + pager_map_size
);
489 kmem_suballoc(&clean_map
, &buffer_map
, &buffer_sva
, &buffer_eva
,
490 ((vm_offset_t
)(nbuf
+ 16) * MAXBSIZE
));
491 buffer_map
.system_map
= 1;
492 kmem_suballoc(&clean_map
, &pager_map
, &pager_sva
, &pager_eva
,
493 ((vm_offset_t
)(nswbuf_mem
+ nswbuf_kva
) * MAXPHYS
) +
495 pager_map
.system_map
= 1;
496 kprintf("avail memory = %ju (%ju MB)\n",
497 (uintmax_t)ptoa(vmstats
.v_free_count
+ vmstats
.v_dma_pages
),
498 (uintmax_t)ptoa(vmstats
.v_free_count
+ vmstats
.v_dma_pages
) /
502 struct cpu_idle_stat
{
510 u_long mwait_cx
[CPU_MWAIT_CX_MAX
];
513 #define CPU_IDLE_STAT_HALT -1
514 #define CPU_IDLE_STAT_SPIN -2
516 static struct cpu_idle_stat cpu_idle_stats
[MAXCPU
];
519 sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS
)
521 int idx
= arg2
, cpu
, error
;
524 if (idx
== CPU_IDLE_STAT_HALT
) {
525 for (cpu
= 0; cpu
< ncpus
; ++cpu
)
526 val
+= cpu_idle_stats
[cpu
].halt
;
527 } else if (idx
== CPU_IDLE_STAT_SPIN
) {
528 for (cpu
= 0; cpu
< ncpus
; ++cpu
)
529 val
+= cpu_idle_stats
[cpu
].spin
;
531 KASSERT(idx
>= 0 && idx
< CPU_MWAIT_CX_MAX
,
532 ("invalid index %d", idx
));
533 for (cpu
= 0; cpu
< ncpus
; ++cpu
)
534 val
+= cpu_idle_stats
[cpu
].mwait_cx
[idx
];
537 error
= sysctl_handle_quad(oidp
, &val
, 0, req
);
538 if (error
|| req
->newptr
== NULL
)
541 if (idx
== CPU_IDLE_STAT_HALT
) {
542 for (cpu
= 0; cpu
< ncpus
; ++cpu
)
543 cpu_idle_stats
[cpu
].halt
= 0;
544 cpu_idle_stats
[0].halt
= val
;
545 } else if (idx
== CPU_IDLE_STAT_SPIN
) {
546 for (cpu
= 0; cpu
< ncpus
; ++cpu
)
547 cpu_idle_stats
[cpu
].spin
= 0;
548 cpu_idle_stats
[0].spin
= val
;
550 KASSERT(idx
>= 0 && idx
< CPU_MWAIT_CX_MAX
,
551 ("invalid index %d", idx
));
552 for (cpu
= 0; cpu
< ncpus
; ++cpu
)
553 cpu_idle_stats
[cpu
].mwait_cx
[idx
] = 0;
554 cpu_idle_stats
[0].mwait_cx
[idx
] = val
;
560 cpu_mwait_attach(void)
565 if (!CPU_MWAIT_HAS_CX
)
568 if (cpu_vendor_id
== CPU_VENDOR_INTEL
&&
569 (CPUID_TO_FAMILY(cpu_id
) > 0xf ||
570 (CPUID_TO_FAMILY(cpu_id
) == 0x6 &&
571 CPUID_TO_MODEL(cpu_id
) >= 0xf))) {
575 * Pentium dual-core, Core 2 and beyond do not need any
576 * additional activities to enter deep C-state, i.e. C3(+).
578 cpu_mwait_cx_no_bmarb();
580 TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts
);
582 cpu_mwait_cx_no_bmsts();
585 sbuf_new(&sb
, cpu_mwait_cx_supported
,
586 sizeof(cpu_mwait_cx_supported
), SBUF_FIXEDLEN
);
588 for (i
= 0; i
< CPU_MWAIT_CX_MAX
; ++i
) {
589 struct cpu_mwait_cx
*cx
= &cpu_mwait_cx_info
[i
];
592 ksnprintf(cx
->name
, sizeof(cx
->name
), "C%d", i
);
594 sysctl_ctx_init(&cx
->sysctl_ctx
);
595 cx
->sysctl_tree
= SYSCTL_ADD_NODE(&cx
->sysctl_ctx
,
596 SYSCTL_STATIC_CHILDREN(_machdep_mwait
), OID_AUTO
,
597 cx
->name
, CTLFLAG_RW
, NULL
, "Cx control/info");
598 if (cx
->sysctl_tree
== NULL
)
601 cx
->subcnt
= CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu
, i
);
602 SYSCTL_ADD_INT(&cx
->sysctl_ctx
,
603 SYSCTL_CHILDREN(cx
->sysctl_tree
), OID_AUTO
,
604 "subcnt", CTLFLAG_RD
, &cx
->subcnt
, 0,
606 SYSCTL_ADD_PROC(&cx
->sysctl_ctx
,
607 SYSCTL_CHILDREN(cx
->sysctl_tree
), OID_AUTO
,
608 "entered", (CTLTYPE_QUAD
| CTLFLAG_RW
), 0,
609 i
, sysctl_cpu_idle_cnt
, "Q", "# of times entered");
611 for (sub
= 0; sub
< cx
->subcnt
; ++sub
)
612 sbuf_printf(&sb
, "C%d/%d ", i
, sub
);
620 cpu_mwait_c1_hints_cnt
= cpu_mwait_cx_info
[CPU_MWAIT_C1
].subcnt
;
621 for (i
= CPU_MWAIT_C1
; i
< CPU_MWAIT_C3
; ++i
)
622 cpu_mwait_hints_cnt
+= cpu_mwait_cx_info
[i
].subcnt
;
623 cpu_mwait_hints
= kmalloc(sizeof(int) * cpu_mwait_hints_cnt
,
627 for (i
= CPU_MWAIT_C1
; i
< CPU_MWAIT_C3
; ++i
) {
630 subcnt
= cpu_mwait_cx_info
[i
].subcnt
;
631 for (j
= 0; j
< subcnt
; ++j
) {
632 KASSERT(hint_idx
< cpu_mwait_hints_cnt
,
633 ("invalid mwait hint index %d", hint_idx
));
634 cpu_mwait_hints
[hint_idx
] = MWAIT_EAX_HINT(i
, j
);
638 KASSERT(hint_idx
== cpu_mwait_hints_cnt
,
639 ("mwait hint count %d != index %d",
640 cpu_mwait_hints_cnt
, hint_idx
));
643 kprintf("MWAIT hints (%d C1 hints):\n", cpu_mwait_c1_hints_cnt
);
644 for (i
= 0; i
< cpu_mwait_hints_cnt
; ++i
) {
645 int hint
= cpu_mwait_hints
[i
];
647 kprintf(" C%d/%d hint 0x%04x\n",
648 MWAIT_EAX_TO_CX(hint
), MWAIT_EAX_TO_CX_SUB(hint
),
656 for (i
= CPU_MWAIT_C1
; i
< CPU_MWAIT_CX_MAX
; ++i
)
657 cpu_mwait_deep_hints_cnt
+= cpu_mwait_cx_info
[i
].subcnt
;
658 cpu_mwait_deep_hints
= kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt
,
662 for (i
= CPU_MWAIT_C1
; i
< CPU_MWAIT_CX_MAX
; ++i
) {
665 subcnt
= cpu_mwait_cx_info
[i
].subcnt
;
666 for (j
= 0; j
< subcnt
; ++j
) {
667 KASSERT(hint_idx
< cpu_mwait_deep_hints_cnt
,
668 ("invalid mwait deep hint index %d", hint_idx
));
669 cpu_mwait_deep_hints
[hint_idx
] = MWAIT_EAX_HINT(i
, j
);
673 KASSERT(hint_idx
== cpu_mwait_deep_hints_cnt
,
674 ("mwait deep hint count %d != index %d",
675 cpu_mwait_deep_hints_cnt
, hint_idx
));
678 kprintf("MWAIT deep hints:\n");
679 for (i
= 0; i
< cpu_mwait_deep_hints_cnt
; ++i
) {
680 int hint
= cpu_mwait_deep_hints
[i
];
682 kprintf(" C%d/%d hint 0x%04x\n",
683 MWAIT_EAX_TO_CX(hint
), MWAIT_EAX_TO_CX_SUB(hint
),
687 cpu_idle_repeat_max
= 256 * cpu_mwait_deep_hints_cnt
;
689 for (i
= 0; i
< ncpus
; ++i
) {
692 ksnprintf(name
, sizeof(name
), "idle%d", i
);
693 SYSCTL_ADD_PROC(NULL
,
694 SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX
), OID_AUTO
,
695 name
, (CTLTYPE_STRING
| CTLFLAG_RW
), &cpu_idle_stats
[i
],
696 0, cpu_mwait_cx_pcpu_idle_sysctl
, "A", "");
701 cpu_finish(void *dummy __unused
)
708 pic_finish(void *dummy __unused
)
710 /* Log ELCR information */
713 /* Log MPTABLE information */
714 mptable_pci_int_dump();
717 MachIntrABI
.finalize();
721 * Send an interrupt to process.
723 * Stack is set up to allow sigcode stored
724 * at top to call routine, followed by kcall
725 * to sigreturn routine below. After sigreturn
726 * resets the signal mask, the stack, and the
727 * frame pointer, it returns to the user
731 sendsig(sig_t catcher
, int sig
, sigset_t
*mask
, u_long code
)
733 struct lwp
*lp
= curthread
->td_lwp
;
734 struct proc
*p
= lp
->lwp_proc
;
735 struct trapframe
*regs
;
736 struct sigacts
*psp
= p
->p_sigacts
;
737 struct sigframe sf
, *sfp
;
741 regs
= lp
->lwp_md
.md_regs
;
742 oonstack
= (lp
->lwp_sigstk
.ss_flags
& SS_ONSTACK
) ? 1 : 0;
744 /* Save user context */
745 bzero(&sf
, sizeof(struct sigframe
));
746 sf
.sf_uc
.uc_sigmask
= *mask
;
747 sf
.sf_uc
.uc_stack
= lp
->lwp_sigstk
;
748 sf
.sf_uc
.uc_mcontext
.mc_onstack
= oonstack
;
749 KKASSERT(__offsetof(struct trapframe
, tf_rdi
) == 0);
750 bcopy(regs
, &sf
.sf_uc
.uc_mcontext
.mc_rdi
, sizeof(struct trapframe
));
752 /* Make the size of the saved context visible to userland */
753 sf
.sf_uc
.uc_mcontext
.mc_len
= sizeof(sf
.sf_uc
.uc_mcontext
);
755 /* Allocate and validate space for the signal handler context. */
756 if ((lp
->lwp_flags
& LWP_ALTSTACK
) != 0 && !oonstack
&&
757 SIGISMEMBER(psp
->ps_sigonstack
, sig
)) {
758 sp
= (char *)(lp
->lwp_sigstk
.ss_sp
+ lp
->lwp_sigstk
.ss_size
-
759 sizeof(struct sigframe
));
760 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
762 /* We take red zone into account */
763 sp
= (char *)regs
->tf_rsp
- sizeof(struct sigframe
) - 128;
767 * XXX AVX needs 64-byte alignment but sigframe has other fields and
768 * the embedded ucontext is not at the front, so aligning this won't
769 * help us. Fortunately we bcopy in/out of the sigframe, so the
772 * The problem though is if userland winds up trying to use the
775 sfp
= (struct sigframe
*)((intptr_t)sp
& ~(intptr_t)0xF);
777 /* Translate the signal is appropriate */
778 if (p
->p_sysent
->sv_sigtbl
) {
779 if (sig
<= p
->p_sysent
->sv_sigsize
)
780 sig
= p
->p_sysent
->sv_sigtbl
[_SIG_IDX(sig
)];
784 * Build the argument list for the signal handler.
786 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
788 regs
->tf_rdi
= sig
; /* argument 1 */
789 regs
->tf_rdx
= (register_t
)&sfp
->sf_uc
; /* argument 3 */
791 if (SIGISMEMBER(psp
->ps_siginfo
, sig
)) {
793 * Signal handler installed with SA_SIGINFO.
795 * action(signo, siginfo, ucontext)
797 regs
->tf_rsi
= (register_t
)&sfp
->sf_si
; /* argument 2 */
798 regs
->tf_rcx
= (register_t
)regs
->tf_addr
; /* argument 4 */
799 sf
.sf_ahu
.sf_action
= (__siginfohandler_t
*)catcher
;
801 /* fill siginfo structure */
802 sf
.sf_si
.si_signo
= sig
;
803 sf
.sf_si
.si_code
= code
;
804 sf
.sf_si
.si_addr
= (void *)regs
->tf_addr
;
807 * Old FreeBSD-style arguments.
809 * handler (signo, code, [uc], addr)
811 regs
->tf_rsi
= (register_t
)code
; /* argument 2 */
812 regs
->tf_rcx
= (register_t
)regs
->tf_addr
; /* argument 4 */
813 sf
.sf_ahu
.sf_handler
= catcher
;
817 * If we're a vm86 process, we want to save the segment registers.
818 * We also change eflags to be our emulated eflags, not the actual
822 if (regs
->tf_eflags
& PSL_VM
) {
823 struct trapframe_vm86
*tf
= (struct trapframe_vm86
*)regs
;
824 struct vm86_kernel
*vm86
= &lp
->lwp_thread
->td_pcb
->pcb_ext
->ext_vm86
;
826 sf
.sf_uc
.uc_mcontext
.mc_gs
= tf
->tf_vm86_gs
;
827 sf
.sf_uc
.uc_mcontext
.mc_fs
= tf
->tf_vm86_fs
;
828 sf
.sf_uc
.uc_mcontext
.mc_es
= tf
->tf_vm86_es
;
829 sf
.sf_uc
.uc_mcontext
.mc_ds
= tf
->tf_vm86_ds
;
831 if (vm86
->vm86_has_vme
== 0)
832 sf
.sf_uc
.uc_mcontext
.mc_eflags
=
833 (tf
->tf_eflags
& ~(PSL_VIF
| PSL_VIP
)) |
834 (vm86
->vm86_eflags
& (PSL_VIF
| PSL_VIP
));
837 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
838 * syscalls made by the signal handler. This just avoids
839 * wasting time for our lazy fixup of such faults. PSL_NT
840 * does nothing in vm86 mode, but vm86 programs can set it
841 * almost legitimately in probes for old cpu types.
843 tf
->tf_eflags
&= ~(PSL_VM
| PSL_NT
| PSL_VIF
| PSL_VIP
);
848 * Save the FPU state and reinit the FP unit
850 npxpush(&sf
.sf_uc
.uc_mcontext
);
853 * Copy the sigframe out to the user's stack.
855 if (copyout(&sf
, sfp
, sizeof(struct sigframe
)) != 0) {
857 * Something is wrong with the stack pointer.
858 * ...Kill the process.
863 regs
->tf_rsp
= (register_t
)sfp
;
864 regs
->tf_rip
= trunc_page64(PS_STRINGS
- *(p
->p_sysent
->sv_szsigcode
));
865 regs
->tf_rip
-= SZSIGCODE_EXTRA_BYTES
;
868 * x86 abi specifies that the direction flag must be cleared
871 regs
->tf_rflags
&= ~(PSL_T
| PSL_D
);
874 * 64 bit mode has a code and stack selector but
875 * no data or extra selector. %fs and %gs are not
878 regs
->tf_cs
= _ucodesel
;
879 regs
->tf_ss
= _udatasel
;
884 * Sanitize the trapframe for a virtual kernel passing control to a custom
885 * VM context. Remove any items that would otherwise create a privilage
888 * XXX at the moment we allow userland to set the resume flag. Is this a
892 cpu_sanitize_frame(struct trapframe
*frame
)
894 frame
->tf_cs
= _ucodesel
;
895 frame
->tf_ss
= _udatasel
;
896 /* XXX VM (8086) mode not supported? */
897 frame
->tf_rflags
&= (PSL_RF
| PSL_USERCHANGE
| PSL_VM_UNSUPP
);
898 frame
->tf_rflags
|= PSL_RESERVED_DEFAULT
| PSL_I
;
904 * Sanitize the tls so loading the descriptor does not blow up
905 * on us. For x86_64 we don't have to do anything.
908 cpu_sanitize_tls(struct savetls
*tls
)
914 * sigreturn(ucontext_t *sigcntxp)
916 * System call to cleanup state after a signal
917 * has been taken. Reset signal mask and
918 * stack state from context left by sendsig (above).
919 * Return to previous pc and psl as specified by
920 * context left by sendsig. Check carefully to
921 * make sure that the user has not modified the
922 * state to gain improper privileges.
926 #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
927 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
930 sys_sigreturn(struct sigreturn_args
*uap
)
932 struct lwp
*lp
= curthread
->td_lwp
;
933 struct trapframe
*regs
;
941 * We have to copy the information into kernel space so userland
942 * can't modify it while we are sniffing it.
944 regs
= lp
->lwp_md
.md_regs
;
945 error
= copyin(uap
->sigcntxp
, &uc
, sizeof(uc
));
949 rflags
= ucp
->uc_mcontext
.mc_rflags
;
951 /* VM (8086) mode not supported */
952 rflags
&= ~PSL_VM_UNSUPP
;
955 if (eflags
& PSL_VM
) {
956 struct trapframe_vm86
*tf
= (struct trapframe_vm86
*)regs
;
957 struct vm86_kernel
*vm86
;
960 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
961 * set up the vm86 area, and we can't enter vm86 mode.
963 if (lp
->lwp_thread
->td_pcb
->pcb_ext
== 0)
965 vm86
= &lp
->lwp_thread
->td_pcb
->pcb_ext
->ext_vm86
;
966 if (vm86
->vm86_inited
== 0)
969 /* go back to user mode if both flags are set */
970 if ((eflags
& PSL_VIP
) && (eflags
& PSL_VIF
))
971 trapsignal(lp
, SIGBUS
, 0);
973 if (vm86
->vm86_has_vme
) {
974 eflags
= (tf
->tf_eflags
& ~VME_USERCHANGE
) |
975 (eflags
& VME_USERCHANGE
) | PSL_VM
;
977 vm86
->vm86_eflags
= eflags
; /* save VIF, VIP */
978 eflags
= (tf
->tf_eflags
& ~VM_USERCHANGE
) |
979 (eflags
& VM_USERCHANGE
) | PSL_VM
;
981 bcopy(&ucp
->uc_mcontext
.mc_gs
, tf
, sizeof(struct trapframe
));
982 tf
->tf_eflags
= eflags
;
983 tf
->tf_vm86_ds
= tf
->tf_ds
;
984 tf
->tf_vm86_es
= tf
->tf_es
;
985 tf
->tf_vm86_fs
= tf
->tf_fs
;
986 tf
->tf_vm86_gs
= tf
->tf_gs
;
987 tf
->tf_ds
= _udatasel
;
988 tf
->tf_es
= _udatasel
;
989 tf
->tf_fs
= _udatasel
;
990 tf
->tf_gs
= _udatasel
;
995 * Don't allow users to change privileged or reserved flags.
998 * XXX do allow users to change the privileged flag PSL_RF.
999 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers
1000 * should sometimes set it there too. tf_eflags is kept in
1001 * the signal context during signal handling and there is no
1002 * other place to remember it, so the PSL_RF bit may be
1003 * corrupted by the signal handler without us knowing.
1004 * Corruption of the PSL_RF bit at worst causes one more or
1005 * one less debugger trap, so allowing it is fairly harmless.
1007 if (!EFL_SECURE(rflags
& ~PSL_RF
, regs
->tf_rflags
& ~PSL_RF
)) {
1008 kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags
);
1013 * Don't allow users to load a valid privileged %cs. Let the
1014 * hardware check for invalid selectors, excess privilege in
1015 * other selectors, invalid %eip's and invalid %esp's.
1017 cs
= ucp
->uc_mcontext
.mc_cs
;
1018 if (!CS_SECURE(cs
)) {
1019 kprintf("sigreturn: cs = 0x%x\n", cs
);
1020 trapsignal(lp
, SIGBUS
, T_PROTFLT
);
1023 bcopy(&ucp
->uc_mcontext
.mc_rdi
, regs
, sizeof(struct trapframe
));
1027 * Restore the FPU state from the frame
1030 npxpop(&ucp
->uc_mcontext
);
1032 if (ucp
->uc_mcontext
.mc_onstack
& 1)
1033 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
1035 lp
->lwp_sigstk
.ss_flags
&= ~SS_ONSTACK
;
1037 lp
->lwp_sigmask
= ucp
->uc_sigmask
;
1038 SIG_CANTMASK(lp
->lwp_sigmask
);
1041 return(EJUSTRETURN
);
1045 * Machine dependent boot() routine
1047 * I haven't seen anything to put here yet
1048 * Possibly some stuff might be grafted back here from boot()
1056 * Shutdown the CPU as much as possible
1062 __asm__
__volatile("hlt");
1066 * cpu_idle() represents the idle LWKT. You cannot return from this function
1067 * (unless you want to blow things up!). Instead we look for runnable threads
1068 * and loop or halt as appropriate. Giant is not held on entry to the thread.
1070 * The main loop is entered with a critical section held, we must release
1071 * the critical section before doing anything else. lwkt_switch() will
1072 * check for pending interrupts due to entering and exiting its own
1075 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
1076 * However, there are cases where the idlethread will be entered with
1077 * the possibility that no IPI will occur and in such cases
1078 * lwkt_switch() sets TDF_IDLE_NOHLT.
1080 * NOTE: cpu_idle_repeat determines how many entries into the idle thread
1081 * must occur before it starts using ACPI halt.
1083 * NOTE: Value overridden in hammer_time().
1085 static int cpu_idle_hlt
= 2;
1086 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_hlt
, CTLFLAG_RW
,
1087 &cpu_idle_hlt
, 0, "Idle loop HLT enable");
1088 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_repeat
, CTLFLAG_RW
,
1089 &cpu_idle_repeat
, 0, "Idle entries before acpi hlt");
1091 SYSCTL_PROC(_machdep
, OID_AUTO
, cpu_idle_hltcnt
, (CTLTYPE_QUAD
| CTLFLAG_RW
),
1092 0, CPU_IDLE_STAT_HALT
, sysctl_cpu_idle_cnt
, "Q", "Idle loop entry halts");
1093 SYSCTL_PROC(_machdep
, OID_AUTO
, cpu_idle_spincnt
, (CTLTYPE_QUAD
| CTLFLAG_RW
),
1094 0, CPU_IDLE_STAT_SPIN
, sysctl_cpu_idle_cnt
, "Q", "Idle loop entry spins");
1097 cpu_idle_default_hook(void)
1100 * We must guarentee that hlt is exactly the instruction
1101 * following the sti.
1103 __asm
__volatile("sti; hlt");
1106 /* Other subsystems (e.g., ACPI) can hook this later. */
1107 void (*cpu_idle_hook
)(void) = cpu_idle_default_hook
;
1110 cpu_mwait_cx_hint(struct cpu_idle_stat
*stat
)
1119 idx
= (stat
->repeat
+ stat
->repeat_last
+ stat
->repeat_delta
) >>
1120 cpu_mwait_repeat_shift
;
1121 if (idx
>= cpu_mwait_c1_hints_cnt
) {
1122 /* Step up faster, once we walked through all C1 states */
1123 stat
->repeat_delta
+= 1 << (cpu_mwait_repeat_shift
+ 1);
1125 if (hint
== CPU_MWAIT_HINT_AUTODEEP
) {
1126 if (idx
>= cpu_mwait_deep_hints_cnt
)
1127 idx
= cpu_mwait_deep_hints_cnt
- 1;
1128 hint
= cpu_mwait_deep_hints
[idx
];
1130 if (idx
>= cpu_mwait_hints_cnt
)
1131 idx
= cpu_mwait_hints_cnt
- 1;
1132 hint
= cpu_mwait_hints
[idx
];
1135 cx_idx
= MWAIT_EAX_TO_CX(hint
);
1136 if (cx_idx
>= 0 && cx_idx
< CPU_MWAIT_CX_MAX
)
1137 stat
->mwait_cx
[cx_idx
]++;
1144 globaldata_t gd
= mycpu
;
1145 struct cpu_idle_stat
*stat
= &cpu_idle_stats
[gd
->gd_cpuid
];
1146 struct thread
*td __debugvar
= gd
->gd_curthread
;
1150 stat
->repeat
= stat
->repeat_last
= cpu_idle_repeat_max
;
1153 KKASSERT(td
->td_critcount
== 0);
1157 * See if there are any LWKTs ready to go.
1162 * When halting inside a cli we must check for reqflags
1163 * races, particularly [re]schedule requests. Running
1164 * splz() does the job.
1167 * 0 Never halt, just spin
1169 * 1 Always use MONITOR/MWAIT if avail, HLT
1172 * Better default for modern (Haswell+) Intel
1175 * 2 Use HLT/MONITOR/MWAIT up to a point and then
1176 * use the ACPI halt (default). This is a hybrid
1177 * approach. See machdep.cpu_idle_repeat.
1179 * Better default for modern AMD cpus and older
1182 * 3 Always use the ACPI halt. This typically
1183 * eats the least amount of power but the cpu
1184 * will be slow waking up. Slows down e.g.
1185 * compiles and other pipe/event oriented stuff.
1187 * Usually the best default for AMD cpus.
1193 * NOTE: Interrupts are enabled and we are not in a critical
1196 * NOTE: Preemptions do not reset gd_idle_repeat. Also we
1197 * don't bother capping gd_idle_repeat, it is ok if
1200 * Implement optimized invltlb operations when halted
1201 * in idle. By setting the bit in smp_idleinvl_mask
1202 * we inform other cpus that they can set _reqs to
1203 * request an invltlb. Current the code to do that
1204 * sets the bits in _reqs anyway, but then check _mask
1205 * to determine if they can assume the invltlb will execute.
1207 * A critical section is required to ensure that interrupts
1208 * do not fully run until after we've had a chance to execute
1211 if (gd
->gd_idle_repeat
== 0) {
1212 stat
->repeat
= (stat
->repeat
+ stat
->repeat_last
) >> 1;
1213 if (stat
->repeat
> cpu_idle_repeat_max
)
1214 stat
->repeat
= cpu_idle_repeat_max
;
1215 stat
->repeat_last
= 0;
1216 stat
->repeat_delta
= 0;
1218 ++stat
->repeat_last
;
1220 ++gd
->gd_idle_repeat
;
1221 reqflags
= gd
->gd_reqflags
;
1222 quick
= (cpu_idle_hlt
== 1) ||
1223 (cpu_idle_hlt
== 2 &&
1224 gd
->gd_idle_repeat
< cpu_idle_repeat
);
1226 if (quick
&& (cpu_mi_feature
& CPU_MI_MONITOR
) &&
1227 (reqflags
& RQF_IDLECHECK_WK_MASK
) == 0) {
1230 ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask
, gd
->gd_cpuid
);
1231 cpu_mmw_pause_int(&gd
->gd_reqflags
, reqflags
,
1232 cpu_mwait_cx_hint(stat
), 0);
1234 ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask
, gd
->gd_cpuid
);
1235 if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs
,
1241 } else if (cpu_idle_hlt
) {
1242 __asm
__volatile("cli");
1245 ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask
, gd
->gd_cpuid
);
1246 if ((gd
->gd_reqflags
& RQF_IDLECHECK_WK_MASK
) == 0) {
1247 if (cpu_idle_hlt
== 5) {
1248 __asm
__volatile("sti");
1249 } else if (quick
|| cpu_idle_hlt
== 4) {
1250 cpu_idle_default_hook();
1255 __asm
__volatile("sti");
1257 ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask
, gd
->gd_cpuid
);
1258 if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs
,
1266 __asm
__volatile("sti");
1275 * Called in a loop indirectly via Xcpustop
1278 cpu_smp_stopped(void)
1280 globaldata_t gd
= mycpu
;
1281 volatile __uint64_t
*ptr
;
1284 ptr
= CPUMASK_ADDR(started_cpus
, gd
->gd_cpuid
);
1286 if ((ovalue
& CPUMASK_SIMPLE(gd
->gd_cpuid
& 63)) == 0) {
1287 if (cpu_mi_feature
& CPU_MI_MONITOR
) {
1288 if (cpu_mwait_hints
) {
1289 cpu_mmw_pause_long(__DEVOLATILE(void *, ptr
),
1291 cpu_mwait_hints
[CPU_MWAIT_C1
], 0);
1293 cpu_mmw_pause_long(__DEVOLATILE(void *, ptr
),
1297 cpu_halt(); /* depend on lapic timer */
1303 * This routine is called if a spinlock has been held through the
1304 * exponential backoff period and is seriously contested. On a real cpu
1308 cpu_spinlock_contested(void)
1314 * Clear registers on exec
1317 exec_setregs(u_long entry
, u_long stack
, u_long ps_strings
)
1319 struct thread
*td
= curthread
;
1320 struct lwp
*lp
= td
->td_lwp
;
1321 struct pcb
*pcb
= td
->td_pcb
;
1322 struct trapframe
*regs
= lp
->lwp_md
.md_regs
;
1327 bzero((char *)regs
, sizeof(struct trapframe
));
1328 regs
->tf_rip
= entry
;
1329 regs
->tf_rsp
= ((stack
- 8) & ~0xFul
) + 8; /* align the stack */
1330 regs
->tf_rdi
= stack
; /* argv */
1331 regs
->tf_rflags
= PSL_USER
| (regs
->tf_rflags
& PSL_T
);
1332 regs
->tf_ss
= _udatasel
;
1333 regs
->tf_cs
= _ucodesel
;
1334 regs
->tf_rbx
= ps_strings
;
1337 * Reset the hardware debug registers if they were in use.
1338 * They won't have any meaning for the newly exec'd process.
1340 if (pcb
->pcb_flags
& PCB_DBREGS
) {
1346 pcb
->pcb_dr7
= 0; /* JG set bit 10? */
1347 if (pcb
== td
->td_pcb
) {
1349 * Clear the debug registers on the running
1350 * CPU, otherwise they will end up affecting
1351 * the next process we switch to.
1355 pcb
->pcb_flags
&= ~PCB_DBREGS
;
1359 * Initialize the math emulator (if any) for the current process.
1360 * Actually, just clear the bit that says that the emulator has
1361 * been initialized. Initialization is delayed until the process
1362 * traps to the emulator (if it is done at all) mainly because
1363 * emulators don't provide an entry point for initialization.
1365 pcb
->pcb_flags
&= ~FP_SOFTFP
;
1368 * NOTE: do not set CR0_TS here. npxinit() must do it after clearing
1369 * gd_npxthread. Otherwise a preemptive interrupt thread
1370 * may panic in npxdna().
1373 load_cr0(rcr0() | CR0_MP
);
1376 * NOTE: The MSR values must be correct so we can return to
1377 * userland. gd_user_fs/gs must be correct so the switch
1378 * code knows what the current MSR values are.
1380 pcb
->pcb_fsbase
= 0; /* Values loaded from PCB on switch */
1381 pcb
->pcb_gsbase
= 0;
1382 mdcpu
->gd_user_fs
= 0; /* Cache of current MSR values */
1383 mdcpu
->gd_user_gs
= 0;
1384 wrmsr(MSR_FSBASE
, 0); /* Set MSR values for return to userland */
1385 wrmsr(MSR_KGSBASE
, 0);
1387 /* Initialize the npx (if any) for the current process. */
1391 pcb
->pcb_ds
= _udatasel
;
1392 pcb
->pcb_es
= _udatasel
;
1393 pcb
->pcb_fs
= _udatasel
;
1394 pcb
->pcb_gs
= _udatasel
;
1403 cr0
|= CR0_NE
; /* Done by npxinit() */
1404 cr0
|= CR0_MP
| CR0_TS
; /* Done at every execve() too. */
1405 cr0
|= CR0_WP
| CR0_AM
;
1411 sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS
)
1414 error
= sysctl_handle_int(oidp
, oidp
->oid_arg1
, oidp
->oid_arg2
,
1416 if (!error
&& req
->newptr
)
1421 SYSCTL_PROC(_machdep
, CPU_ADJKERNTZ
, adjkerntz
, CTLTYPE_INT
|CTLFLAG_RW
,
1422 &adjkerntz
, 0, sysctl_machdep_adjkerntz
, "I", "");
1424 SYSCTL_INT(_machdep
, CPU_DISRTCSET
, disable_rtc_set
,
1425 CTLFLAG_RW
, &disable_rtc_set
, 0, "");
1428 SYSCTL_STRUCT(_machdep
, CPU_BOOTINFO
, bootinfo
,
1429 CTLFLAG_RD
, &bootinfo
, bootinfo
, "");
1432 SYSCTL_INT(_machdep
, CPU_WALLCLOCK
, wall_cmos_clock
,
1433 CTLFLAG_RW
, &wall_cmos_clock
, 0, "");
1436 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS
)
1438 struct efi_map_header
*efihdr
;
1442 kmdp
= preload_search_by_type("elf kernel");
1444 kmdp
= preload_search_by_type("elf64 kernel");
1445 efihdr
= (struct efi_map_header
*)preload_search_info(kmdp
,
1446 MODINFO_METADATA
| MODINFOMD_EFI_MAP
);
1449 efisize
= *((uint32_t *)efihdr
- 1);
1450 return (SYSCTL_OUT(req
, efihdr
, efisize
));
1452 SYSCTL_PROC(_machdep
, OID_AUTO
, efi_map
, CTLTYPE_OPAQUE
|CTLFLAG_RD
, NULL
, 0,
1453 efi_map_sysctl_handler
, "S,efi_map_header", "Raw EFI Memory Map");
1456 * Initialize 386 and configure to run kernel
1460 * Initialize segments & interrupt table
1464 struct user_segment_descriptor gdt
[NGDT
* MAXCPU
]; /* global descriptor table */
1465 struct gate_descriptor idt_arr
[MAXCPU
][NIDT
];
1467 union descriptor ldt
[NLDT
]; /* local descriptor table */
1470 /* table descriptors - used to load tables by cpu */
1471 struct region_descriptor r_gdt
;
1472 struct region_descriptor r_idt_arr
[MAXCPU
];
1474 /* JG proc0paddr is a virtual address */
1477 char proc0paddr_buff
[LWKT_THREAD_STACK
];
1480 /* software prototypes -- in more palatable form */
1481 struct soft_segment_descriptor gdt_segs
[] = {
1482 /* GNULL_SEL 0 Null Descriptor */
1483 { 0x0, /* segment base address */
1485 0, /* segment type */
1486 0, /* segment descriptor priority level */
1487 0, /* segment descriptor present */
1489 0, /* default 32 vs 16 bit size */
1490 0 /* limit granularity (byte/page units)*/ },
1491 /* GCODE_SEL 1 Code Descriptor for kernel */
1492 { 0x0, /* segment base address */
1493 0xfffff, /* length - all address space */
1494 SDT_MEMERA
, /* segment type */
1495 SEL_KPL
, /* segment descriptor priority level */
1496 1, /* segment descriptor present */
1498 0, /* default 32 vs 16 bit size */
1499 1 /* limit granularity (byte/page units)*/ },
1500 /* GDATA_SEL 2 Data Descriptor for kernel */
1501 { 0x0, /* segment base address */
1502 0xfffff, /* length - all address space */
1503 SDT_MEMRWA
, /* segment type */
1504 SEL_KPL
, /* segment descriptor priority level */
1505 1, /* segment descriptor present */
1507 0, /* default 32 vs 16 bit size */
1508 1 /* limit granularity (byte/page units)*/ },
1509 /* GUCODE32_SEL 3 32 bit Code Descriptor for user */
1510 { 0x0, /* segment base address */
1511 0xfffff, /* length - all address space */
1512 SDT_MEMERA
, /* segment type */
1513 SEL_UPL
, /* segment descriptor priority level */
1514 1, /* segment descriptor present */
1516 1, /* default 32 vs 16 bit size */
1517 1 /* limit granularity (byte/page units)*/ },
1518 /* GUDATA_SEL 4 32/64 bit Data Descriptor for user */
1519 { 0x0, /* segment base address */
1520 0xfffff, /* length - all address space */
1521 SDT_MEMRWA
, /* segment type */
1522 SEL_UPL
, /* segment descriptor priority level */
1523 1, /* segment descriptor present */
1525 1, /* default 32 vs 16 bit size */
1526 1 /* limit granularity (byte/page units)*/ },
1527 /* GUCODE_SEL 5 64 bit Code Descriptor for user */
1528 { 0x0, /* segment base address */
1529 0xfffff, /* length - all address space */
1530 SDT_MEMERA
, /* segment type */
1531 SEL_UPL
, /* segment descriptor priority level */
1532 1, /* segment descriptor present */
1534 0, /* default 32 vs 16 bit size */
1535 1 /* limit granularity (byte/page units)*/ },
1536 /* GPROC0_SEL 6 Proc 0 Tss Descriptor */
1538 0x0, /* segment base address */
1539 sizeof(struct x86_64tss
)-1,/* length - all address space */
1540 SDT_SYSTSS
, /* segment type */
1541 SEL_KPL
, /* segment descriptor priority level */
1542 1, /* segment descriptor present */
1544 0, /* unused - default 32 vs 16 bit size */
1545 0 /* limit granularity (byte/page units)*/ },
1546 /* Actually, the TSS is a system descriptor which is double size */
1547 { 0x0, /* segment base address */
1549 0, /* segment type */
1550 0, /* segment descriptor priority level */
1551 0, /* segment descriptor present */
1553 0, /* default 32 vs 16 bit size */
1554 0 /* limit granularity (byte/page units)*/ },
1555 /* GUGS32_SEL 8 32 bit GS Descriptor for user */
1556 { 0x0, /* segment base address */
1557 0xfffff, /* length - all address space */
1558 SDT_MEMRWA
, /* segment type */
1559 SEL_UPL
, /* segment descriptor priority level */
1560 1, /* segment descriptor present */
1562 1, /* default 32 vs 16 bit size */
1563 1 /* limit granularity (byte/page units)*/ },
1567 setidt_global(int idx
, inthand_t
*func
, int typ
, int dpl
, int ist
)
1571 for (cpu
= 0; cpu
< MAXCPU
; ++cpu
) {
1572 struct gate_descriptor
*ip
= &idt_arr
[cpu
][idx
];
1574 ip
->gd_looffset
= (uintptr_t)func
;
1575 ip
->gd_selector
= GSEL(GCODE_SEL
, SEL_KPL
);
1581 ip
->gd_hioffset
= ((uintptr_t)func
)>>16 ;
1586 setidt(int idx
, inthand_t
*func
, int typ
, int dpl
, int ist
, int cpu
)
1588 struct gate_descriptor
*ip
;
1590 KASSERT(cpu
>= 0 && cpu
< ncpus
, ("invalid cpu %d", cpu
));
1592 ip
= &idt_arr
[cpu
][idx
];
1593 ip
->gd_looffset
= (uintptr_t)func
;
1594 ip
->gd_selector
= GSEL(GCODE_SEL
, SEL_KPL
);
1600 ip
->gd_hioffset
= ((uintptr_t)func
)>>16 ;
1603 #define IDTVEC(name) __CONCAT(X,name)
1606 IDTVEC(div
), IDTVEC(dbg
), IDTVEC(nmi
), IDTVEC(bpt
), IDTVEC(ofl
),
1607 IDTVEC(bnd
), IDTVEC(ill
), IDTVEC(dna
), IDTVEC(fpusegm
),
1608 IDTVEC(tss
), IDTVEC(missing
), IDTVEC(stk
), IDTVEC(prot
),
1609 IDTVEC(page
), IDTVEC(mchk
), IDTVEC(rsvd
), IDTVEC(fpu
), IDTVEC(align
),
1610 IDTVEC(xmm
), IDTVEC(dblfault
),
1611 IDTVEC(fast_syscall
), IDTVEC(fast_syscall32
);
1614 sdtossd(struct user_segment_descriptor
*sd
, struct soft_segment_descriptor
*ssd
)
1616 ssd
->ssd_base
= (sd
->sd_hibase
<< 24) | sd
->sd_lobase
;
1617 ssd
->ssd_limit
= (sd
->sd_hilimit
<< 16) | sd
->sd_lolimit
;
1618 ssd
->ssd_type
= sd
->sd_type
;
1619 ssd
->ssd_dpl
= sd
->sd_dpl
;
1620 ssd
->ssd_p
= sd
->sd_p
;
1621 ssd
->ssd_def32
= sd
->sd_def32
;
1622 ssd
->ssd_gran
= sd
->sd_gran
;
1626 ssdtosd(struct soft_segment_descriptor
*ssd
, struct user_segment_descriptor
*sd
)
1629 sd
->sd_lobase
= (ssd
->ssd_base
) & 0xffffff;
1630 sd
->sd_hibase
= (ssd
->ssd_base
>> 24) & 0xff;
1631 sd
->sd_lolimit
= (ssd
->ssd_limit
) & 0xffff;
1632 sd
->sd_hilimit
= (ssd
->ssd_limit
>> 16) & 0xf;
1633 sd
->sd_type
= ssd
->ssd_type
;
1634 sd
->sd_dpl
= ssd
->ssd_dpl
;
1635 sd
->sd_p
= ssd
->ssd_p
;
1636 sd
->sd_long
= ssd
->ssd_long
;
1637 sd
->sd_def32
= ssd
->ssd_def32
;
1638 sd
->sd_gran
= ssd
->ssd_gran
;
1642 ssdtosyssd(struct soft_segment_descriptor
*ssd
,
1643 struct system_segment_descriptor
*sd
)
1646 sd
->sd_lobase
= (ssd
->ssd_base
) & 0xffffff;
1647 sd
->sd_hibase
= (ssd
->ssd_base
>> 24) & 0xfffffffffful
;
1648 sd
->sd_lolimit
= (ssd
->ssd_limit
) & 0xffff;
1649 sd
->sd_hilimit
= (ssd
->ssd_limit
>> 16) & 0xf;
1650 sd
->sd_type
= ssd
->ssd_type
;
1651 sd
->sd_dpl
= ssd
->ssd_dpl
;
1652 sd
->sd_p
= ssd
->ssd_p
;
1653 sd
->sd_gran
= ssd
->ssd_gran
;
1657 * Populate the (physmap) array with base/bound pairs describing the
1658 * available physical memory in the system, then test this memory and
1659 * build the phys_avail array describing the actually-available memory.
1661 * If we cannot accurately determine the physical memory map, then use
1662 * value from the 0xE801 call, and failing that, the RTC.
1664 * Total memory size may be set by the kernel environment variable
1665 * hw.physmem or the compile-time define MAXMEM.
1667 * Memory is aligned to PHYSMAP_ALIGN which must be a multiple
1668 * of PAGE_SIZE. This also greatly reduces the memory test time
1669 * which would otherwise be excessive on machines with > 8G of ram.
1671 * XXX first should be vm_paddr_t.
1674 #define PHYSMAP_ALIGN (vm_paddr_t)(128 * 1024)
1675 #define PHYSMAP_ALIGN_MASK (vm_paddr_t)(PHYSMAP_ALIGN - 1)
1676 #define PHYSMAP_SIZE VM_PHYSSEG_MAX
1678 vm_paddr_t physmap
[PHYSMAP_SIZE
];
1679 struct bios_smap
*smapbase
, *smap
, *smapend
;
1680 struct efi_map_header
*efihdrbase
;
1683 #define PHYSMAP_HANDWAVE (vm_paddr_t)(2 * 1024 * 1024)
1684 #define PHYSMAP_HANDWAVE_MASK (PHYSMAP_HANDWAVE - 1)
1687 add_smap_entries(int *physmap_idx
)
1691 smapsize
= *((u_int32_t
*)smapbase
- 1);
1692 smapend
= (struct bios_smap
*)((uintptr_t)smapbase
+ smapsize
);
1694 for (smap
= smapbase
; smap
< smapend
; smap
++) {
1695 if (boothowto
& RB_VERBOSE
)
1696 kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
1697 smap
->type
, smap
->base
, smap
->length
);
1699 if (smap
->type
!= SMAP_TYPE_MEMORY
)
1702 if (smap
->length
== 0)
1705 for (i
= 0; i
<= *physmap_idx
; i
+= 2) {
1706 if (smap
->base
< physmap
[i
+ 1]) {
1707 if (boothowto
& RB_VERBOSE
) {
1708 kprintf("Overlapping or non-monotonic "
1709 "memory region, ignoring "
1715 if (i
<= *physmap_idx
)
1718 Realmem
+= smap
->length
;
1720 if (smap
->base
== physmap
[*physmap_idx
+ 1]) {
1721 physmap
[*physmap_idx
+ 1] += smap
->length
;
1726 if (*physmap_idx
== PHYSMAP_SIZE
) {
1727 kprintf("Too many segments in the physical "
1728 "address map, giving up\n");
1731 physmap
[*physmap_idx
] = smap
->base
;
1732 physmap
[*physmap_idx
+ 1] = smap
->base
+ smap
->length
;
1737 add_efi_map_entries(int *physmap_idx
)
1739 struct efi_md
*map
, *p
;
1744 static const char *types
[] = {
1750 "RuntimeServicesCode",
1751 "RuntimeServicesData",
1752 "ConventionalMemory",
1754 "ACPIReclaimMemory",
1757 "MemoryMappedIOPortSpace",
1762 * Memory map data provided by UEFI via the GetMemoryMap
1763 * Boot Services API.
1765 efisz
= (sizeof(struct efi_map_header
) + 0xf) & ~0xf;
1766 map
= (struct efi_md
*)((uint8_t *)efihdrbase
+ efisz
);
1768 if (efihdrbase
->descriptor_size
== 0)
1770 ndesc
= efihdrbase
->memory_size
/ efihdrbase
->descriptor_size
;
1772 if (boothowto
& RB_VERBOSE
)
1773 kprintf("%23s %12s %12s %8s %4s\n",
1774 "Type", "Physical", "Virtual", "#Pages", "Attr");
1776 for (i
= 0, p
= map
; i
< ndesc
; i
++,
1777 p
= efi_next_descriptor(p
, efihdrbase
->descriptor_size
)) {
1778 if (boothowto
& RB_VERBOSE
) {
1779 if (p
->md_type
<= EFI_MD_TYPE_PALCODE
)
1780 type
= types
[p
->md_type
];
1783 kprintf("%23s %012lx %12p %08lx ", type
, p
->md_phys
,
1784 p
->md_virt
, p
->md_pages
);
1785 if (p
->md_attr
& EFI_MD_ATTR_UC
)
1787 if (p
->md_attr
& EFI_MD_ATTR_WC
)
1789 if (p
->md_attr
& EFI_MD_ATTR_WT
)
1791 if (p
->md_attr
& EFI_MD_ATTR_WB
)
1793 if (p
->md_attr
& EFI_MD_ATTR_UCE
)
1795 if (p
->md_attr
& EFI_MD_ATTR_WP
)
1797 if (p
->md_attr
& EFI_MD_ATTR_RP
)
1799 if (p
->md_attr
& EFI_MD_ATTR_XP
)
1801 if (p
->md_attr
& EFI_MD_ATTR_RT
)
1806 switch (p
->md_type
) {
1807 case EFI_MD_TYPE_CODE
:
1808 case EFI_MD_TYPE_DATA
:
1809 case EFI_MD_TYPE_BS_CODE
:
1810 case EFI_MD_TYPE_BS_DATA
:
1811 case EFI_MD_TYPE_FREE
:
1813 * We're allowed to use any entry with these types.
1820 Realmem
+= p
->md_pages
* PAGE_SIZE
;
1822 if (p
->md_phys
== physmap
[*physmap_idx
+ 1]) {
1823 physmap
[*physmap_idx
+ 1] += p
->md_pages
* PAGE_SIZE
;
1828 if (*physmap_idx
== PHYSMAP_SIZE
) {
1829 kprintf("Too many segments in the physical "
1830 "address map, giving up\n");
1833 physmap
[*physmap_idx
] = p
->md_phys
;
1834 physmap
[*physmap_idx
+ 1] = p
->md_phys
+ p
->md_pages
* PAGE_SIZE
;
1838 struct fb_info efi_fb_info
;
1839 static int have_efi_framebuffer
= 0;
1842 efi_fb_init_vaddr(int direct_map
)
1845 vm_offset_t addr
, v
;
1847 v
= efi_fb_info
.vaddr
;
1848 sz
= efi_fb_info
.stride
* efi_fb_info
.height
;
1851 addr
= PHYS_TO_DMAP(efi_fb_info
.paddr
);
1852 if (addr
>= DMAP_MIN_ADDRESS
&& addr
+ sz
< DMAP_MAX_ADDRESS
)
1853 efi_fb_info
.vaddr
= addr
;
1855 efi_fb_info
.vaddr
= (vm_offset_t
)pmap_mapdev_attr(
1856 efi_fb_info
.paddr
, sz
, PAT_WRITE_COMBINING
);
1861 efifb_color_depth(struct efi_fb
*efifb
)
1866 mask
= efifb
->fb_mask_red
| efifb
->fb_mask_green
|
1867 efifb
->fb_mask_blue
| efifb
->fb_mask_reserved
;
1870 for (depth
= 1; mask
!= 1; depth
++)
1876 probe_efi_fb(int early
)
1878 struct efi_fb
*efifb
;
1882 if (have_efi_framebuffer
) {
1884 (efi_fb_info
.vaddr
== 0 ||
1885 efi_fb_info
.vaddr
== PHYS_TO_DMAP(efi_fb_info
.paddr
)))
1886 efi_fb_init_vaddr(0);
1890 kmdp
= preload_search_by_type("elf kernel");
1892 kmdp
= preload_search_by_type("elf64 kernel");
1893 efifb
= (struct efi_fb
*)preload_search_info(kmdp
,
1894 MODINFO_METADATA
| MODINFOMD_EFI_FB
);
1898 depth
= efifb_color_depth(efifb
);
1900 * Our bootloader should already notice, when we won't be able to
1901 * use the UEFI framebuffer.
1903 if (depth
!= 24 && depth
!= 32)
1906 have_efi_framebuffer
= 1;
1908 efi_fb_info
.is_vga_boot_display
= 1;
1909 efi_fb_info
.width
= efifb
->fb_width
;
1910 efi_fb_info
.height
= efifb
->fb_height
;
1911 efi_fb_info
.depth
= depth
;
1912 efi_fb_info
.stride
= efifb
->fb_stride
* (depth
/ 8);
1913 efi_fb_info
.paddr
= efifb
->fb_addr
;
1915 efi_fb_info
.vaddr
= 0;
1917 efi_fb_init_vaddr(0);
1919 efi_fb_info
.fbops
.fb_set_par
= NULL
;
1920 efi_fb_info
.fbops
.fb_blank
= NULL
;
1921 efi_fb_info
.fbops
.fb_debug_enter
= NULL
;
1922 efi_fb_info
.device
= NULL
;
1928 efifb_startup(void *arg
)
1933 SYSINIT(efi_fb_info
, SI_BOOT1_POST
, SI_ORDER_FIRST
, efifb_startup
, NULL
);
1936 getmemsize(caddr_t kmdp
, u_int64_t first
)
1938 int off
, physmap_idx
, pa_indx
, da_indx
;
1941 vm_paddr_t msgbuf_size
;
1942 u_long physmem_tunable
;
1944 quad_t dcons_addr
, dcons_size
;
1946 bzero(physmap
, sizeof(physmap
));
1950 * get memory map from INT 15:E820, kindly supplied by the loader.
1952 * subr_module.c says:
1953 * "Consumer may safely assume that size value precedes data."
1954 * ie: an int32_t immediately precedes smap.
1956 efihdrbase
= (struct efi_map_header
*)preload_search_info(kmdp
,
1957 MODINFO_METADATA
| MODINFOMD_EFI_MAP
);
1958 smapbase
= (struct bios_smap
*)preload_search_info(kmdp
,
1959 MODINFO_METADATA
| MODINFOMD_SMAP
);
1960 if (smapbase
== NULL
&& efihdrbase
== NULL
)
1961 panic("No BIOS smap or EFI map info from loader!");
1963 if (efihdrbase
== NULL
)
1964 add_smap_entries(&physmap_idx
);
1966 add_efi_map_entries(&physmap_idx
);
1968 base_memory
= physmap
[1] / 1024;
1969 /* make hole for AP bootstrap code */
1970 physmap
[1] = mp_bootaddress(base_memory
);
1972 /* Save EBDA address, if any */
1973 ebda_addr
= (u_long
)(*(u_short
*)(KERNBASE
+ 0x40e));
1977 * Maxmem isn't the "maximum memory", it's one larger than the
1978 * highest page of the physical address space. It should be
1979 * called something like "Maxphyspage". We may adjust this
1980 * based on ``hw.physmem'' and the results of the memory test.
1982 Maxmem
= atop(physmap
[physmap_idx
+ 1]);
1985 Maxmem
= MAXMEM
/ 4;
1988 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable
))
1989 Maxmem
= atop(physmem_tunable
);
1992 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
1995 if (Maxmem
> atop(physmap
[physmap_idx
+ 1]))
1996 Maxmem
= atop(physmap
[physmap_idx
+ 1]);
1999 * Blowing out the DMAP will blow up the system.
2001 if (Maxmem
> atop(DMAP_MAX_ADDRESS
- DMAP_MIN_ADDRESS
)) {
2002 kprintf("Limiting Maxmem due to DMAP size\n");
2003 Maxmem
= atop(DMAP_MAX_ADDRESS
- DMAP_MIN_ADDRESS
);
2006 if (atop(physmap
[physmap_idx
+ 1]) != Maxmem
&&
2007 (boothowto
& RB_VERBOSE
)) {
2008 kprintf("Physical memory use set to %ldK\n", Maxmem
* 4);
2012 * Call pmap initialization to make new kernel address space
2016 pmap_bootstrap(&first
);
2017 physmap
[0] = PAGE_SIZE
;
2020 * Align the physmap to PHYSMAP_ALIGN and cut out anything
2023 for (i
= j
= 0; i
<= physmap_idx
; i
+= 2) {
2024 if (physmap
[i
+1] > ptoa(Maxmem
))
2025 physmap
[i
+1] = ptoa(Maxmem
);
2026 physmap
[i
] = (physmap
[i
] + PHYSMAP_ALIGN_MASK
) &
2027 ~PHYSMAP_ALIGN_MASK
;
2028 physmap
[i
+1] = physmap
[i
+1] & ~PHYSMAP_ALIGN_MASK
;
2030 physmap
[j
] = physmap
[i
];
2031 physmap
[j
+1] = physmap
[i
+1];
2033 if (physmap
[i
] < physmap
[i
+1])
2036 physmap_idx
= j
- 2;
2039 * Align anything else used in the validation loop.
2041 * Also make sure that our 2MB kernel text+data+bss mappings
2042 * do not overlap potentially allocatable space.
2044 first
= (first
+ PHYSMAP_ALIGN_MASK
) & ~PHYSMAP_ALIGN_MASK
;
2047 * Size up each available chunk of physical memory.
2051 phys_avail
[pa_indx
].phys_beg
= physmap
[0];
2052 phys_avail
[pa_indx
].phys_end
= physmap
[0];
2053 dump_avail
[da_indx
].phys_beg
= 0;
2054 dump_avail
[da_indx
].phys_end
= physmap
[0];
2058 * Get dcons buffer address
2060 if (kgetenv_quad("dcons.addr", &dcons_addr
) == 0 ||
2061 kgetenv_quad("dcons.size", &dcons_size
) == 0)
2065 * Validate the physical memory. The physical memory segments
2066 * have already been aligned to PHYSMAP_ALIGN which is a multiple
2069 * We no longer perform an exhaustive memory test. Instead we
2070 * simply test the first and last word in each physmap[]
2073 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
2077 end
= physmap
[i
+ 1];
2079 for (pa
= physmap
[i
]; pa
< end
; pa
+= incr
) {
2081 volatile uint64_t *ptr
= (uint64_t *)CADDR1
;
2087 * Calculate incr. Just test the first and
2088 * last page in each physmap[] segment.
2090 if (pa
== end
- PAGE_SIZE
)
2093 incr
= end
- pa
- PAGE_SIZE
;
2096 * Make sure we don't skip blacked out areas.
2098 if (pa
< 0x200000 && 0x200000 < end
) {
2099 incr
= 0x200000 - pa
;
2101 if (dcons_addr
> 0 &&
2104 incr
= dcons_addr
- pa
;
2108 * Block out kernel memory as not available.
2110 if (pa
>= 0x200000 && pa
< first
) {
2112 if (pa
+ incr
> end
)
2118 * Block out the dcons buffer if it exists.
2120 if (dcons_addr
> 0 &&
2121 pa
>= trunc_page(dcons_addr
) &&
2122 pa
< dcons_addr
+ dcons_size
) {
2123 incr
= dcons_addr
+ dcons_size
- pa
;
2124 incr
= (incr
+ PAGE_MASK
) &
2125 ~(vm_paddr_t
)PAGE_MASK
;
2126 if (pa
+ incr
> end
)
2134 * Map the page non-cacheable for the memory
2138 kernel_pmap
.pmap_bits
[PG_V_IDX
] |
2139 kernel_pmap
.pmap_bits
[PG_RW_IDX
] |
2140 kernel_pmap
.pmap_bits
[PG_N_IDX
];
2141 cpu_invlpg(__DEVOLATILE(void *, ptr
));
2145 * Save original value for restoration later.
2150 * Test for alternating 1's and 0's
2152 *ptr
= 0xaaaaaaaaaaaaaaaaLLU
;
2154 if (*ptr
!= 0xaaaaaaaaaaaaaaaaLLU
)
2157 * Test for alternating 0's and 1's
2159 *ptr
= 0x5555555555555555LLU
;
2161 if (*ptr
!= 0x5555555555555555LLU
)
2166 *ptr
= 0xffffffffffffffffLLU
;
2168 if (*ptr
!= 0xffffffffffffffffLLU
)
			/*
			 * Restore original value.
			 */
			*ptr = tmp;

			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;

			/*
			 * Collapse page address into phys_avail[].  Do a
			 * continuation of the current phys_avail[] index
			 * when possible.
			 */
			if (phys_avail[pa_indx].phys_end == pa) {
				phys_avail[pa_indx].phys_end += incr;
			} else if (phys_avail[pa_indx].phys_beg ==
				   phys_avail[pa_indx].phys_end) {
				/*
				 * Current phys_avail is completely empty,
				 * reuse the index.
				 */
				phys_avail[pa_indx].phys_beg = pa;
				phys_avail[pa_indx].phys_end = pa + incr;
			} else {
				/*
				 * Allocate next phys_avail index.
				 */
				++pa_indx;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					kprintf(
		"Too many holes in the physical address space, giving up\n");
					--pa_indx;
					break;
				}
				phys_avail[pa_indx].phys_beg = pa;
				phys_avail[pa_indx].phys_end = pa + incr;
			}
			physmem += incr / PAGE_SIZE;

			/*
			 * pa available for dumping
			 */
			if (dump_avail[da_indx].phys_end == pa) {
				dump_avail[da_indx].phys_end += incr;
			} else {
				++da_indx;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					--da_indx;
				}
				dump_avail[da_indx].phys_beg = pa;
				dump_avail[da_indx].phys_end = pa + incr;
			}
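
			/*
			 * Editorial note (added commentary): a page whose
			 * address equals the current phys_end simply extends
			 * the run, so contiguous good memory stays in one
			 * phys_avail[] entry; a hole forces a new index,
			 * which is why PHYS_AVAIL_ARRAY_END is checked.
			 */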
		}
	}

	/*
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;

	while (phys_avail[pa_indx].phys_beg + PHYSMAP_ALIGN + msgbuf_size >=
	       phys_avail[pa_indx].phys_end) {
		physmem -= atop(phys_avail[pa_indx].phys_end -
				phys_avail[pa_indx].phys_beg);
		phys_avail[pa_indx].phys_beg = 0;
		phys_avail[pa_indx].phys_end = 0;
		--pa_indx;
	}

	Maxmem = atop(phys_avail[pa_indx].phys_end);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx].phys_end -= msgbuf_size;

	avail_end = phys_avail[pa_indx].phys_end;

	/* Map the message buffer. */
	for (off = 0; off < msgbuf_size; off += PAGE_SIZE) {
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
	}

	/* Try to get EFI framebuffer working as early as possible */
	if (have_efi_framebuffer)
		efi_fb_init_vaddr(1);
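
	/*
	 * Editorial note (added commentary): the kernel message buffer is
	 * carved off the top of the last usable phys_avail[] chunk, so
	 * avail_end is left pointing at the base of that reserved region and
	 * the pmap_kenter() loop above wires msgbufp to those physical pages.
	 */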
struct machintr_abi MachIntrABI;

/*
 *  7	Device Not Available (x87)
 *  9	Coprocessor Segment overrun (unsupported, reserved)
 * 11	Segment not present
 * 13	General Protection
 * 16	x87 FP Exception pending
 * 17	Alignment Check
 * 19	SIMD floating point
 * 32-255	INTn/external sources
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x, cpu;
	int metadata_missing, off;
	struct mdglobaldata *gd;
	u_int64_t msr;

	/*
	 * Prevent lowering of the ipl if we call tsleep() early.
	 */
	gd = &CPU_prvspace[0]->mdglobaldata;
	bzero(gd, sizeof(*gd));
	/*
	 * Note: on both UP and SMP curthread must be set non-NULL
	 * early in the boot sequence because the system assumes
	 * that 'curthread' is never NULL.
	 */
	gd->mi.gd_curthread = &thread0;
	thread0.td_gd = &gd->mi;

	atdevbase = ISA_HOLE_START + PTOV_OFFSET;
	metadata_missing = 0;
	if (bootinfo.bi_modulep) {
		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}
	if (bootinfo.bi_envp)
		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;

	preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
	preload_bootstrap_relocate(PTOV_OFFSET);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	if (boothowto & RB_VERBOSE)
		bootverbose++;

	/*
	 * Default MachIntrABI to ICU
	 */
	MachIntrABI = MachIntrABI_ICU;
	/*
	 * start with one cpu.  Note: with one cpu, ncpus_fit_mask remain 0.
	 */
	ncpus = 1;

	/* Init basic tunables, hz etc */
	init_param1();

	/*
	 * make gdt memory segments
	 */
	gdt_segs[GPROC0_SEL].ssd_base =
	    (uintptr_t) &CPU_prvspace[0]->mdglobaldata.gd_common_tss;

	gd->mi.gd_prvspace = CPU_prvspace[0];
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	mi_gdinit(&gd->mi, 0);
	cpu_gdinit(gd, 0);
	proc0paddr = proc0paddr_buff;
	mi_proc0init(&gd->mi, proc0paddr);
	safepri = TDPRI_MAX;
	/* spinlocks and the BGL */
	init_locks();

	for (x = 0; x < NIDT; x++)
		setidt_global(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
	setidt_global(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
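
	/*
	 * Editorial note (added commentary): the final argument to
	 * setidt_global() selects the Interrupt Stack Table slot.  NMI and
	 * double-fault are installed with IST 1 so they always run on the
	 * dedicated stack programmed into tss_ist1 below, while the other
	 * vectors (0) stay on whatever kernel stack is current.
	 */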
	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
		r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
	}

	lidt(&r_idt_arr[0]);
	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

	if (metadata_missing)
		kprintf("WARNING: loader(8) metadata is missing!\n");

	/*
	 * Initialize IRQ mapping
	 *
	 * NOTE: SHOULD be after elcr_probe()
	 */
	MachIntrABI_ICU.initmap();
	MachIntrABI_IOAPIC.initmap();

	if (boothowto & RB_KDB)
		Debugger("Boot flags requested debugger");
	identify_cpu();		/* Final stage of CPU initialization */
	initializecpu(0);	/* Initialize CPU registers */

	/*
	 * On modern Intel cpus, haswell or later, cpu_idle_hlt=1 is better
	 * because the cpu does significant power management in MWAIT
	 * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
	 *
	 * On modern AMD cpus cpu_idle_hlt=3 is better, because the cpu does
	 * significant power management only when using ACPI halt mode.
	 *
	 * On older AMD or Intel cpus, cpu_idle_hlt=2 is better because ACPI
	 * is needed to reduce power consumption, but wakeup times are often
	 * longer.
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    CPUID_TO_MODEL(cpu_id) >= 0x3C) {	/* Haswell or later */
		cpu_idle_hlt = 1;
	}
	if (cpu_vendor_id == CPU_VENDOR_AMD) {
		if (CPUID_TO_FAMILY(cpu_id) >= 0x17) {
			/* Ryzen or later */
			cpu_idle_hlt = 3;
		} else if (CPUID_TO_FAMILY(cpu_id) >= 0x14) {
			/* Bobcat or later */
			cpu_idle_hlt = 3;
		}
	}

	TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
	TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
	TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
	TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);
	/*
	 * Some of the virtual machines do not work w/ I/O APIC
	 * enabled.  If the user does not explicitly enable or
	 * disable the I/O APIC (ioapic_enable < 0), then we
	 * disable I/O APIC on all virtual machines.
	 *
	 * NOTE:
	 * This must be done after identify_cpu(), which sets
	 * 'cpu_feature2'.
	 */
	if (ioapic_enable < 0) {
		if (cpu_feature2 & CPUID2_VMM)
			ioapic_enable = 0;
		else
			ioapic_enable = 1;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	gd->gd_common_tss.tss_rsp0 =
	    (register_t)(thread0.td_kstack +
			 KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb));
	/* Ensure the stack is aligned to 16 bytes */
	gd->gd_common_tss.tss_rsp0 &= ~(register_t)0xF;

	/* double fault stack */
	gd->gd_common_tss.tss_ist1 =
	    (long)&gd->mi.gd_prvspace->idlestack[
		sizeof(gd->mi.gd_prvspace->idlestack)];

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	gd->gd_common_tss.tss_iobase = sizeof(struct x86_64tss);

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gd->gd_tss_gdt = &gdt[GPROC0_SEL];
	gd->gd_common_tssd = *gd->gd_tss_gdt;
	ltr(gsel_tss);
	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL);
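
	/*
	 * Editorial note (added commentary): MSR_STAR holds the segment
	 * selector bases used by syscall/sysret; bits 47:32 supply the
	 * kernel CS/SS loaded on syscall and bits 63:48 the user base
	 * consumed by sysret, hence the << 32 and << 48 shifts above.
	 * MSR_SF_MASK lists the RFLAGS bits cleared on kernel entry.
	 */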
	getmemsize(kmdp, physfree);
	init_param2(physmem);
	/* now running on new page tables, configured, and u/iom is accessible */

	/* Map the message buffer. */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);

	msgbufinit(msgbufp, MSGBUF_SIZE);
	/* transfer to user mode */
	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_pcb->pcb_cr3 = KPML4phys;
	thread0.td_pcb->pcb_ext = NULL;
	lwp0.lwp_md.md_regs = &proc0_tf;	/* XXX needed? */

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}
/*
 * Initialize machine-dependent portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
 */
void
cpu_gdinit(struct mdglobaldata *gd, int cpu)
{
	if (cpu)
		gd->mi.gd_curthread = &gd->mi.gd_idlethread;

	lwkt_init_thread(&gd->mi.gd_idlethread,
			 gd->mi.gd_prvspace->idlestack,
			 sizeof(gd->mi.gd_prvspace->idlestack),
			 0, &gd->mi);
	lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
	gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
	gd->mi.gd_idlethread.td_sp -= sizeof(void *);
	*(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
}
/*
 * We only have to check for DMAP bounds, the globaldata space is
 * actually part of the kernel_map so we don't have to waste time
 * checking CPU_prvspace[*].
 */
int
is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
{
	if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
	    eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
		return (TRUE);
	}
	if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
		return (TRUE);
	return (FALSE);
}
struct globaldata *
globaldata_find(int cpu)
{
	KKASSERT(cpu >= 0 && cpu < ncpus);
	return(&CPU_prvspace[cpu]->mdglobaldata.mi);
}
/*
 * This path should be safe from the SYSRET issue because only stopped threads
 * can have their %rip adjusted this way (and all heavy weight thread switches
 * clear QUICKREF and thus do not use SYSRET).  However, the code path is
 * convoluted so add a safety by forcing %rip to be canonical.
 */
int
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
	if (addr & 0x0000800000000000LLU)
		lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
	else
		lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
	return (0);
}
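
/*
 * Editorial note (added commentary): x86-64 requires bits 63:48 of a virtual
 * address to be copies of bit 47 ("canonical" form), so the code above
 * sign-extends the new %rip from bit 47: the upper 16 bits are set when
 * bit 47 is 1 and cleared otherwise, preventing a general-protection fault
 * when the value is later loaded into %rip.
 */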
int
ptrace_single_step(struct lwp *lp)
{
	lp->lwp_md.md_regs->tf_rflags |= PSL_T;
	return (0);
}
int
fill_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	if ((tp = lp->lwp_md.md_regs) == NULL)
		return EINVAL;
	bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
	return (0);
}
int
set_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
	return (0);
}
static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}
static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}
int
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
	if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
		return EINVAL;
	if (cpu_fxsr) {
		fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
				(struct save87 *)fpregs);
		return (0);
	}
	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
	return (0);
}
int
set_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
	if (cpu_fxsr) {
		set_fpregs_xmm((struct save87 *)fpregs,
			       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
		return (0);
	}
	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
	return (0);
}
int
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (lp == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[4] = rdr4();
		dbregs->dr[5] = rdr5();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
		return (0);
	}
	if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL)
		return EINVAL;
	dbregs->dr[0] = pcb->pcb_dr0;
	dbregs->dr[1] = pcb->pcb_dr1;
	dbregs->dr[2] = pcb->pcb_dr2;
	dbregs->dr[3] = pcb->pcb_dr3;
	dbregs->dr[6] = pcb->pcb_dr6;
	dbregs->dr[7] = pcb->pcb_dr7;
	return (0);
}
int
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	if (lp == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr4(dbregs->dr[4]);
		load_dr5(dbregs->dr[5]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		struct pcb *pcb;
		struct ucred *ucred;
		int i;
		uint64_t mask1, mask2;
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected trap.
		 */
		/* JG this loop looks unreadable */
		/* Check 4 2-bit fields for invalid patterns.
		 * These fields are R/Wi, for i = 0..3
		 */
		/* Is 10 in LENi allowed when running in compatibility mode? */
		/* Pattern 10 in R/Wi might be used to indicate
		 * breakpoint on I/O. Further analysis should be
		 * carried to decide if it is safe and useful to
		 * provide access to that capability
		 */
		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
		     i++, mask1 <<= 4, mask2 <<= 4)
			if ((dbregs->dr[7] & mask1) == mask2)
				return (EINVAL);
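
		/*
		 * Editorial note (added commentary): each debug register i
		 * owns a 4-bit control field in DR7 starting at bit 16 + 4*i
		 * (two R/Wi bits plus two LENi bits).  mask1 = 0x3<<16
		 * isolates R/W0 and mask2 = 0x2<<16 is the "10" encoding the
		 * comment above warns about; shifting both masks left by 4
		 * per iteration walks R/W1..R/W3.
		 */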
		pcb = lp->lwp_thread->td_pcb;
		ucred = lp->lwp_proc->p_ucred;
		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * the process's address space, unless, perhaps, we were
		 * called by root.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */
		if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
			if (dbregs->dr[7] & 0x3) {
				/* dr0 is enabled */
				if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr[7] & (0x3<<2)) {
				/* dr1 is enabled */
				if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr[7] & (0x3<<4)) {
				/* dr2 is enabled */
				if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr[7] & (0x3<<6)) {
				/* dr3 is enabled */
				if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
		}
		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		pcb->pcb_flags |= PCB_DBREGS;
	}

	return (0);
}
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int64_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0xff) == 0) {
		/*
		 * all GE and LE bits in the dr7 register are zero,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers
		 */
		return 0;
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0xf;

	if (bp == 0) {
		/*
		 * None of the breakpoint bits are set meaning this
		 * trap was not caused by any of the debug registers
		 */
		return 0;
	}

	/*
	 * at least one of the breakpoints were hit, check to see
	 * which ones and if any of them are user space addresses
	 */
	if (bp & 0x01)
		addr[nbp++] = (caddr_t)rdr0();
	if (bp & 0x02)
		addr[nbp++] = (caddr_t)rdr1();
	if (bp & 0x04)
		addr[nbp++] = (caddr_t)rdr2();
	if (bp & 0x08)
		addr[nbp++] = (caddr_t)rdr3();

	for (i = 0; i < nbp; i++) {
		if (addr[i] <
		    (caddr_t)VM_MAX_USER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}
void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}
/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called inside DDB.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */

/* silence compiler warnings */
u_char inb(u_int);
void outb(u_int, u_char);

u_char
inb(u_int port)
{
	u_char data;
	/*
	 * We use %%dx and not %1 here because i/o is done at %dx and not at
	 * %edx, while gcc generates inferior code (movw instead of movl)
	 * if we tell it to load (u_short) port.
	 */
	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
	return (data);
}

void
outb(u_int port, u_char data)
{
	u_char al;
	/*
	 * Use an unnecessary assignment to help gcc's register allocator.
	 * This makes a large difference for gcc-1.40 and a tiny difference
	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
	 * best results.  gcc-2.6.0 can't handle this.
	 */
	al = data;
	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
}
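
/*
 * Editorial note (added commentary): in the inline assembly the "a"
 * constraint pins the data byte to %al and "d" pins the port number to %dx,
 * matching the operand forms the inb/outb instructions require; "=a" (data)
 * marks %al as the output of the inb.
 */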
/*
 * initialize all the SMP locks
 */

/* critical region when masking or unmasking interrupts */
struct spinlock_deprecated imen_spinlock;

/* lock region used by kernel profiling */
struct spinlock_deprecated mcount_spinlock;

/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct spinlock_deprecated com_spinlock;

/* lock regions around the clock hardware */
struct spinlock_deprecated clock_spinlock;

static void
init_locks(void)
{
	/*
	 * Get the initial mplock with a count of 1 for the BSP.
	 * This uses a LOGICAL cpu ID, ie BSP == 0.
	 */
	cpu_get_initial_mplock();

	spin_init_deprecated(&mcount_spinlock);
	spin_init_deprecated(&imen_spinlock);
	spin_init_deprecated(&com_spinlock);
	spin_init_deprecated(&clock_spinlock);

	/* our token pool needs to work early */
	lwkt_token_pool_init();
}
boolean_t
cpu_mwait_hint_valid(uint32_t hint)
{
	int cx_idx, sub;

	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= CPU_MWAIT_CX_MAX)
		return FALSE;

	sub = MWAIT_EAX_TO_CX_SUB(hint);
	if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return FALSE;

	return TRUE;
}
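
/*
 * Editorial note (added commentary): an MWAIT hint encodes the target
 * C-state in the high nibble of EAX and the sub-state in the low nibble
 * (the usual MWAIT extension layout), which is what the MWAIT_EAX_TO_CX()
 * and MWAIT_EAX_TO_CX_SUB() macros above decode before the table bounds
 * checks.
 */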
void
cpu_mwait_cx_no_bmsts(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
}

void
cpu_mwait_cx_no_bmarb(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
}
static int
cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
{
	int old_cx_idx, sub = 0;

	if (hint >= 0) {
		old_cx_idx = MWAIT_EAX_TO_CX(hint);
		sub = MWAIT_EAX_TO_CX_SUB(hint);
	} else if (hint == CPU_MWAIT_HINT_AUTO) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
	} else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
	} else {
		old_cx_idx = CPU_MWAIT_CX_MAX;
	}

	if (!CPU_MWAIT_HAS_CX)
		strlcpy(name, "NONE", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
		strlcpy(name, "AUTO", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
		strlcpy(name, "AUTODEEP", namelen);
	else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
	    sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
		strlcpy(name, "INVALID", namelen);
	else
		ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);

	return old_cx_idx;
}
static int
cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
{
	int cx_idx, sub, hint;
	char *ptr, *start;

	if (allow_auto && strcmp(name, "AUTO") == 0) {
		hint = CPU_MWAIT_HINT_AUTO;
		cx_idx = CPU_MWAIT_C2;
		goto done;
	}
	if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
		hint = CPU_MWAIT_HINT_AUTODEEP;
		cx_idx = CPU_MWAIT_C3;
		goto done;
	}

	if (strlen(name) < 4 || toupper(name[0]) != 'C')
		return -1;

	start = &name[1];
	cx_idx = strtol(start, &ptr, 10);
	if (ptr == start || *ptr != '/')
		return -1;
	if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
		return -1;

	start = ptr + 1;
	sub = strtol(start, &ptr, 10);
	if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return -1;

	hint = MWAIT_EAX_HINT(cx_idx, sub);
done:
	*hint0 = hint;
	return cx_idx;
}
static int
cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
{
	int error;

	if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)
		return EOPNOTSUPP;

	if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
		error = cputimer_intr_powersave_addreq();
		if (error)
			return error;
	} else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
		cputimer_intr_powersave_remreq();
	}
	return 0;
}
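
/*
 * Editorial note (added commentary): entering C3 or deeper can stop the
 * per-cpu timer hardware on many parts, so transitions across the C3
 * boundary add or remove a request for a powersave-capable interrupt
 * cputimer via cputimer_intr_powersave_addreq()/remreq(); the
 * cpu_mwait_c3_preamble check refuses C3-or-deeper selections while the
 * required bus-master preamble handling is still outstanding.
 */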
static int
cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
    boolean_t allow_auto)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	hint = *hint0;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
	    allow_auto);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error)
		return error;

	*hint0 = hint;
	return 0;
}
*stat
, const char *cx_name
)
3192 int error
, cx_idx
, old_cx_idx
, hint
;
3193 char name
[CPU_MWAIT_CX_NAMELEN
];
3195 KASSERT(CPU_MWAIT_HAS_CX
, ("cpu does not support mwait CX extension"));
3198 old_cx_idx
= cpu_mwait_cx_hint2name(hint
, name
, sizeof(name
), TRUE
);
3200 strlcpy(name
, cx_name
, sizeof(name
));
3201 cx_idx
= cpu_mwait_cx_name2hint(name
, &hint
, TRUE
);
3205 error
= cpu_mwait_cx_transit(old_cx_idx
, cx_idx
);
static int
cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	int hint = cpu_mwait_halt_global;
	int error, cx_idx, cpu;
	char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];

	cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	/* Save name for later per-cpu CX configuration */
	strlcpy(cx_name, name, sizeof(cx_name));

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	/* Change per-cpu CX configuration */
	for (cpu = 0; cpu < ncpus; ++cpu) {
		error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
		if (error)
			return error;
	}

	cpu_mwait_halt_global = hint;
	return 0;
}
static int
cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct cpu_idle_stat *stat = arg1;
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &stat->hint, TRUE);
	return error;
}

static int
cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &cpu_mwait_spin, FALSE);
	return error;
}
/*
 * This manual debugging code is called unconditionally from Xtimer
 * (the per-cpu timer interrupt), whether the current thread is in a
 * critical section or not, and can be useful in tracking down lockups.
 *
 * NOTE: MANUAL DEBUG CODE
 */
static int saveticks[SMP_MAXCPU];
static int savecounts[SMP_MAXCPU];

void
pcpu_timer_always(struct intrframe *frame)
{
	globaldata_t gd = mycpu;
	int cpu = gd->gd_cpuid;
	char buf[64];
	short *gptr;
	int i;

	gptr = (short *)0xFFFFFFFF800b8000 + 80 * cpu;
	*gptr = ((*gptr + 1) & 0x00FF) | 0x0700;
	++gptr;
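
	/*
	 * Editorial note (added commentary): 0xFFFFFFFF800b8000 is the
	 * legacy VGA text buffer (physical 0xb8000) seen through the
	 * kernel's high mapping.  Each 16-bit cell holds a character and an
	 * attribute byte (0x07 = grey on black); with 80 cells per row every
	 * cpu gets its own row, and bumping the low byte of the first cell
	 * produces a visible spinner even when the console is wedged.
	 */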
	ksnprintf(buf, sizeof(buf), " %p %16s %d %16s ",
	    (void *)frame->if_rip, gd->gd_curthread->td_comm, ticks,
	    gd->gd_infomsg);
	for (i = 0; buf[i]; ++i) {
		gptr[i] = 0x0700 | (unsigned char)buf[i];
	}

	if (saveticks[gd->gd_cpuid] != ticks) {
		saveticks[gd->gd_cpuid] = ticks;
		savecounts[gd->gd_cpuid] = 0;
	}
	++savecounts[gd->gd_cpuid];
	if (savecounts[gd->gd_cpuid] > 2000 && panicstr == NULL) {
		panic("cpud %d panicing on ticks failure",
		      gd->gd_cpuid);
	}
	for (i = 0; i < ncpus; ++i) {
		int delta;

		if (saveticks[i] && panicstr == NULL) {
			delta = saveticks[i] - ticks;
			if (delta < -10 || delta > 10) {
				panic("cpu %d panicing on cpu %d watchdog",