2 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
3 * Copyright (c) 1992 Terrence R. Lambert.
4 * Copyright (c) 2003 Peter Wemm.
5 * Copyright (c) 2008 The DragonFly Project.
8 * This code is derived from software contributed to Berkeley by
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the University of
22 * California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
40 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
43 //#include "use_npx.h"
47 #include "opt_directio.h"
49 #include "opt_msgbuf.h"
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/sysproto.h>
55 #include <sys/signalvar.h>
56 #include <sys/kernel.h>
57 #include <sys/linker.h>
58 #include <sys/malloc.h>
62 #include <sys/reboot.h>
64 #include <sys/msgbuf.h>
65 #include <sys/sysent.h>
66 #include <sys/sysctl.h>
67 #include <sys/vmmeter.h>
69 #include <sys/usched.h>
72 #include <sys/ctype.h>
73 #include <sys/serialize.h>
74 #include <sys/systimer.h>
77 #include <vm/vm_param.h>
79 #include <vm/vm_kern.h>
80 #include <vm/vm_object.h>
81 #include <vm/vm_page.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_pager.h>
84 #include <vm/vm_extern.h>
86 #include <sys/thread2.h>
87 #include <sys/mplock2.h>
88 #include <sys/mutex2.h>
98 #include <machine/cpu.h>
99 #include <machine/clock.h>
100 #include <machine/specialreg.h>
102 #include <machine/bootinfo.h>
104 #include <machine/md_var.h>
105 #include <machine/metadata.h>
106 #include <machine/pc/bios.h>
107 #include <machine/pcb_ext.h> /* pcb.h included via sys/user.h */
108 #include <machine/globaldata.h> /* CPU_prvspace */
109 #include <machine/smp.h>
110 #include <machine/cputypes.h>
111 #include <machine/intr_machdep.h>
112 #include <machine/framebuffer.h>
115 #include <bus/isa/isa_device.h>
117 #include <machine_base/isa/isa_intr.h>
118 #include <bus/isa/rtc.h>
119 #include <sys/random.h>
120 #include <sys/ptrace.h>
121 #include <machine/sigframe.h>
123 #include <sys/machintr.h>
124 #include <machine_base/icu/icu_abi.h>
125 #include <machine_base/icu/elcr_var.h>
126 #include <machine_base/apic/lapic.h>
127 #include <machine_base/apic/ioapic.h>
128 #include <machine_base/apic/ioapic_abi.h>
129 #include <machine/mptable.h>
131 #define PHYSMAP_ENTRIES 10
133 extern u_int64_t
hammer_time(u_int64_t
, u_int64_t
);
135 extern void printcpuinfo(void); /* XXX header file */
136 extern void identify_cpu(void);
138 extern void finishidentcpu(void);
140 extern void panicifcpuunsupported(void);
142 static void cpu_startup(void *);
143 static void pic_finish(void *);
144 static void cpu_finish(void *);
146 static void set_fpregs_xmm(struct save87
*, struct savexmm
*);
147 static void fill_fpregs_xmm(struct savexmm
*, struct save87
*);
148 static void init_locks(void);
150 extern void pcpu_timer_always(struct intrframe
*);
152 SYSINIT(cpu
, SI_BOOT2_START_CPU
, SI_ORDER_FIRST
, cpu_startup
, NULL
);
153 SYSINIT(pic_finish
, SI_BOOT2_FINISH_PIC
, SI_ORDER_FIRST
, pic_finish
, NULL
);
154 SYSINIT(cpu_finish
, SI_BOOT2_FINISH_CPU
, SI_ORDER_FIRST
, cpu_finish
, NULL
);
157 extern vm_offset_t ksym_start
, ksym_end
;
160 struct privatespace CPU_prvspace_bsp
__aligned(4096);
161 struct privatespace
*CPU_prvspace
[MAXCPU
] = { &CPU_prvspace_bsp
};
163 vm_paddr_t efi_systbl_phys
;
164 int _udatasel
, _ucodesel
, _ucode32sel
;
166 int64_t tsc_offsets
[MAXCPU
];
167 cpumask_t smp_idleinvl_mask
;
168 cpumask_t smp_idleinvl_reqs
;
170 static int cpu_mwait_halt_global
; /* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */
172 #if defined(SWTCH_OPTIM_STATS)
173 extern int swtch_optim_stats
;
174 SYSCTL_INT(_debug
, OID_AUTO
, swtch_optim_stats
,
175 CTLFLAG_RD
, &swtch_optim_stats
, 0, "");
176 SYSCTL_INT(_debug
, OID_AUTO
, tlb_flush_count
,
177 CTLFLAG_RD
, &tlb_flush_count
, 0, "");
179 SYSCTL_INT(_hw
, OID_AUTO
, cpu_mwait_halt
,
180 CTLFLAG_RD
, &cpu_mwait_halt_global
, 0, "");
181 SYSCTL_INT(_hw
, OID_AUTO
, cpu_mwait_spin
, CTLFLAG_RD
, &cpu_mwait_spin
, 0,
182 "monitor/mwait target state");
184 #define CPU_MWAIT_HAS_CX \
185 ((cpu_feature2 & CPUID2_MON) && \
186 (cpu_mwait_feature & CPUID_MWAIT_EXT))
188 #define CPU_MWAIT_CX_NAMELEN 16
190 #define CPU_MWAIT_C1 1
191 #define CPU_MWAIT_C2 2
192 #define CPU_MWAIT_C3 3
193 #define CPU_MWAIT_CX_MAX 8
195 #define CPU_MWAIT_HINT_AUTO -1 /* C1 and C2 */
196 #define CPU_MWAIT_HINT_AUTODEEP -2 /* C3+ */
198 SYSCTL_NODE(_machdep
, OID_AUTO
, mwait
, CTLFLAG_RW
, 0, "MWAIT features");
199 SYSCTL_NODE(_machdep_mwait
, OID_AUTO
, CX
, CTLFLAG_RW
, 0, "MWAIT Cx settings");
201 struct cpu_mwait_cx
{
204 struct sysctl_ctx_list sysctl_ctx
;
205 struct sysctl_oid
*sysctl_tree
;
207 static struct cpu_mwait_cx cpu_mwait_cx_info
[CPU_MWAIT_CX_MAX
];
208 static char cpu_mwait_cx_supported
[256];
210 static int cpu_mwait_c1_hints_cnt
;
211 static int cpu_mwait_hints_cnt
;
212 static int *cpu_mwait_hints
;
214 static int cpu_mwait_deep_hints_cnt
;
215 static int *cpu_mwait_deep_hints
;
217 #define CPU_IDLE_REPEAT_DEFAULT 750
219 static u_int cpu_idle_repeat
= CPU_IDLE_REPEAT_DEFAULT
;
220 static u_long cpu_idle_repeat_max
= CPU_IDLE_REPEAT_DEFAULT
;
221 static u_int cpu_mwait_repeat_shift
= 1;
223 #define CPU_MWAIT_C3_PREAMBLE_BM_ARB 0x1
224 #define CPU_MWAIT_C3_PREAMBLE_BM_STS 0x2
226 static int cpu_mwait_c3_preamble
=
227 CPU_MWAIT_C3_PREAMBLE_BM_ARB
|
228 CPU_MWAIT_C3_PREAMBLE_BM_STS
;
230 SYSCTL_STRING(_machdep_mwait_CX
, OID_AUTO
, supported
, CTLFLAG_RD
,
231 cpu_mwait_cx_supported
, 0, "MWAIT supported C states");
232 SYSCTL_INT(_machdep_mwait_CX
, OID_AUTO
, c3_preamble
, CTLFLAG_RD
,
233 &cpu_mwait_c3_preamble
, 0, "C3+ preamble mask");
235 static int cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS
,
237 static int cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS
);
238 static int cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS
);
239 static int cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS
);
241 SYSCTL_PROC(_machdep_mwait_CX
, OID_AUTO
, idle
, CTLTYPE_STRING
|CTLFLAG_RW
,
242 NULL
, 0, cpu_mwait_cx_idle_sysctl
, "A", "");
243 SYSCTL_PROC(_machdep_mwait_CX
, OID_AUTO
, spin
, CTLTYPE_STRING
|CTLFLAG_RW
,
244 NULL
, 0, cpu_mwait_cx_spin_sysctl
, "A", "");
245 SYSCTL_UINT(_machdep_mwait_CX
, OID_AUTO
, repeat_shift
, CTLFLAG_RW
,
246 &cpu_mwait_repeat_shift
, 0, "");
250 u_long ebda_addr
= 0;
252 int imcr_present
= 0;
254 int naps
= 0; /* # of Applications processors */
257 struct mtx dt_lock
; /* lock for GDT and LDT */
260 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS
)
262 u_long pmem
= ctob(physmem
);
264 int error
= sysctl_handle_long(oidp
, &pmem
, 0, req
);
268 SYSCTL_PROC(_hw
, HW_PHYSMEM
, physmem
, CTLTYPE_ULONG
|CTLFLAG_RD
,
269 0, 0, sysctl_hw_physmem
, "LU", "Total system memory in bytes (number of pages * page size)");
272 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS
)
274 int error
= sysctl_handle_int(oidp
, 0,
275 ctob(physmem
- vmstats
.v_wire_count
), req
);
279 SYSCTL_PROC(_hw
, HW_USERMEM
, usermem
, CTLTYPE_INT
|CTLFLAG_RD
,
280 0, 0, sysctl_hw_usermem
, "IU", "");
283 sysctl_hw_availpages(SYSCTL_HANDLER_ARGS
)
285 int error
= sysctl_handle_int(oidp
, 0,
286 x86_64_btop(avail_end
- avail_start
), req
);
290 SYSCTL_PROC(_hw
, OID_AUTO
, availpages
, CTLTYPE_INT
|CTLFLAG_RD
,
291 0, 0, sysctl_hw_availpages
, "I", "");
297 * The number of PHYSMAP entries must be one less than the number of
298 * PHYSSEG entries because the PHYSMAP entry that spans the largest
299 * physical address that is accessible by ISA DMA is split into two
302 vm_phystable_t phys_avail
[VM_PHYSSEG_MAX
+ 1];
303 vm_phystable_t dump_avail
[VM_PHYSSEG_MAX
+ 1];
305 /* must be 1 less so 0 0 can signal end of chunks */
306 #define PHYS_AVAIL_ARRAY_END (NELEM(phys_avail) - 1)
307 #define DUMP_AVAIL_ARRAY_END (NELEM(dump_avail) - 1)
309 static vm_offset_t buffer_sva
, buffer_eva
;
310 vm_offset_t clean_sva
, clean_eva
;
311 static vm_offset_t pager_sva
, pager_eva
;
312 static struct trapframe proc0_tf
;
315 cpu_startup(void *dummy
)
319 vm_offset_t firstaddr
;
322 * Good {morning,afternoon,evening,night}.
324 kprintf("%s", version
);
327 panicifcpuunsupported();
328 kprintf("real memory = %ju (%ju MB)\n",
330 (intmax_t)Realmem
/ 1024 / 1024);
332 * Display any holes after the first chunk of extended memory.
337 kprintf("Physical memory chunk(s):\n");
338 for (indx
= 0; phys_avail
[indx
].phys_end
!= 0; ++indx
) {
341 size1
= phys_avail
[indx
].phys_end
-
342 phys_avail
[indx
].phys_beg
;
344 kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
345 (intmax_t)phys_avail
[indx
].phys_beg
,
346 (intmax_t)phys_avail
[indx
].phys_end
- 1,
348 (intmax_t)(size1
/ PAGE_SIZE
));
353 * Allocate space for system data structures.
354 * The first available kernel virtual address is in "v".
355 * As pages of kernel virtual memory are allocated, "v" is incremented.
356 * As pages of memory are allocated and cleared,
357 * "firstaddr" is incremented.
358 * An index into the kernel page table corresponding to the
359 * virtual memory address maintained in "v" is kept in "mapaddr".
363 * Make two passes. The first pass calculates how much memory is
364 * needed and allocates it. The second pass assigns virtual
365 * addresses to the various data structures.
369 v
= (caddr_t
)firstaddr
;
371 #define valloc(name, type, num) \
372 (name) = (type *)v; v = (caddr_t)((name)+(num))
373 #define valloclim(name, type, num, lim) \
374 (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
377 * The nominal buffer size (and minimum KVA allocation) is MAXBSIZE.
378 * For the first 64MB of ram nominally allocate sufficient buffers to
379 * cover 1/4 of our ram. Beyond the first 64MB allocate additional
380 * buffers to cover 1/20 of our ram over 64MB. When auto-sizing
381 * the buffer cache we limit the eventual kva reservation to
384 * factor represents the 1/4 x ram conversion.
387 long factor
= 4 * NBUFCALCSIZE
/ 1024;
388 long kbytes
= physmem
* (PAGE_SIZE
/ 1024);
392 nbuf
+= min((kbytes
- 4096) / factor
, 65536 / factor
);
394 nbuf
+= (kbytes
- 65536) * 2 / (factor
* 5);
395 if (maxbcache
&& nbuf
> maxbcache
/ NBUFCALCSIZE
)
396 nbuf
= maxbcache
/ NBUFCALCSIZE
;
400 * Do not allow the buffer_map to be more then 1/2 the size of the
403 if (nbuf
> (virtual_end
- virtual_start
+
404 virtual2_end
- virtual2_start
) / (MAXBSIZE
* 2)) {
405 nbuf
= (virtual_end
- virtual_start
+
406 virtual2_end
- virtual2_start
) / (MAXBSIZE
* 2);
407 kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf
);
411 * Do not allow the buffer_map to use more than 50% of available
412 * physical-equivalent memory. Since the VM pages which back
413 * individual buffers are typically wired, having too many bufs
414 * can prevent the system from paging properly.
416 if (nbuf
> physmem
* PAGE_SIZE
/ (NBUFCALCSIZE
* 2)) {
417 nbuf
= physmem
* PAGE_SIZE
/ (NBUFCALCSIZE
* 2);
418 kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf
);
422 * Do not allow the sizeof(struct buf) * nbuf to exceed half of
423 * the valloc space which is just the virtual_end - virtual_start
424 * section. We use valloc() to allocate the buf header array.
426 if (nbuf
> (virtual_end
- virtual_start
) / sizeof(struct buf
) / 2) {
427 nbuf
= (virtual_end
- virtual_start
) /
428 sizeof(struct buf
) / 2;
429 kprintf("Warning: nbufs capped at %ld due to valloc "
430 "considerations\n", nbuf
);
433 nswbuf_mem
= lmax(lmin(nbuf
/ 32, 512), 8);
435 if (nswbuf_mem
< NSWBUF_MIN
)
436 nswbuf_mem
= NSWBUF_MIN
;
438 nswbuf_kva
= lmax(lmin(nbuf
/ 4, 512), 16);
440 if (nswbuf_kva
< NSWBUF_MIN
)
441 nswbuf_kva
= NSWBUF_MIN
;
444 valloc(swbuf_mem
, struct buf
, nswbuf_mem
);
445 valloc(swbuf_kva
, struct buf
, nswbuf_kva
);
446 valloc(buf
, struct buf
, nbuf
);
449 * End of first pass, size has been calculated so allocate memory
451 if (firstaddr
== 0) {
452 size
= (vm_size_t
)(v
- firstaddr
);
453 firstaddr
= kmem_alloc(&kernel_map
, round_page(size
),
456 panic("startup: no room for tables");
461 * End of second pass, addresses have been assigned
463 * nbuf is an int, make sure we don't overflow the field.
465 * On 64-bit systems we always reserve maximal allocations for
466 * buffer cache buffers and there are no fragmentation issues,
467 * so the KVA segment does not have to be excessively oversized.
469 if ((vm_size_t
)(v
- firstaddr
) != size
)
470 panic("startup: table size inconsistency");
472 kmem_suballoc(&kernel_map
, &clean_map
, &clean_sva
, &clean_eva
,
473 ((vm_offset_t
)(nbuf
+ 16) * MAXBSIZE
) +
474 ((nswbuf_mem
+ nswbuf_kva
) * MAXPHYS
) + pager_map_size
);
475 kmem_suballoc(&clean_map
, &buffer_map
, &buffer_sva
, &buffer_eva
,
476 ((vm_offset_t
)(nbuf
+ 16) * MAXBSIZE
));
477 buffer_map
.system_map
= 1;
478 kmem_suballoc(&clean_map
, &pager_map
, &pager_sva
, &pager_eva
,
479 ((vm_offset_t
)(nswbuf_mem
+ nswbuf_kva
) * MAXPHYS
) +
481 pager_map
.system_map
= 1;
482 kprintf("avail memory = %ju (%ju MB)\n",
483 (uintmax_t)ptoa(vmstats
.v_free_count
+ vmstats
.v_dma_pages
),
484 (uintmax_t)ptoa(vmstats
.v_free_count
+ vmstats
.v_dma_pages
) /
488 struct cpu_idle_stat
{
496 u_long mwait_cx
[CPU_MWAIT_CX_MAX
];
499 #define CPU_IDLE_STAT_HALT -1
500 #define CPU_IDLE_STAT_SPIN -2
502 static struct cpu_idle_stat cpu_idle_stats
[MAXCPU
];
505 sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS
)
507 int idx
= arg2
, cpu
, error
;
510 if (idx
== CPU_IDLE_STAT_HALT
) {
511 for (cpu
= 0; cpu
< ncpus
; ++cpu
)
512 val
+= cpu_idle_stats
[cpu
].halt
;
513 } else if (idx
== CPU_IDLE_STAT_SPIN
) {
514 for (cpu
= 0; cpu
< ncpus
; ++cpu
)
515 val
+= cpu_idle_stats
[cpu
].spin
;
517 KASSERT(idx
>= 0 && idx
< CPU_MWAIT_CX_MAX
,
518 ("invalid index %d", idx
));
519 for (cpu
= 0; cpu
< ncpus
; ++cpu
)
520 val
+= cpu_idle_stats
[cpu
].mwait_cx
[idx
];
523 error
= sysctl_handle_quad(oidp
, &val
, 0, req
);
524 if (error
|| req
->newptr
== NULL
)
527 if (idx
== CPU_IDLE_STAT_HALT
) {
528 for (cpu
= 0; cpu
< ncpus
; ++cpu
)
529 cpu_idle_stats
[cpu
].halt
= 0;
530 cpu_idle_stats
[0].halt
= val
;
531 } else if (idx
== CPU_IDLE_STAT_SPIN
) {
532 for (cpu
= 0; cpu
< ncpus
; ++cpu
)
533 cpu_idle_stats
[cpu
].spin
= 0;
534 cpu_idle_stats
[0].spin
= val
;
536 KASSERT(idx
>= 0 && idx
< CPU_MWAIT_CX_MAX
,
537 ("invalid index %d", idx
));
538 for (cpu
= 0; cpu
< ncpus
; ++cpu
)
539 cpu_idle_stats
[cpu
].mwait_cx
[idx
] = 0;
540 cpu_idle_stats
[0].mwait_cx
[idx
] = val
;
546 cpu_mwait_attach(void)
551 if (!CPU_MWAIT_HAS_CX
)
554 if (cpu_vendor_id
== CPU_VENDOR_INTEL
&&
555 (CPUID_TO_FAMILY(cpu_id
) > 0xf ||
556 (CPUID_TO_FAMILY(cpu_id
) == 0x6 &&
557 CPUID_TO_MODEL(cpu_id
) >= 0xf))) {
561 * Pentium dual-core, Core 2 and beyond do not need any
562 * additional activities to enter deep C-state, i.e. C3(+).
564 cpu_mwait_cx_no_bmarb();
566 TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts
);
568 cpu_mwait_cx_no_bmsts();
571 sbuf_new(&sb
, cpu_mwait_cx_supported
,
572 sizeof(cpu_mwait_cx_supported
), SBUF_FIXEDLEN
);
574 for (i
= 0; i
< CPU_MWAIT_CX_MAX
; ++i
) {
575 struct cpu_mwait_cx
*cx
= &cpu_mwait_cx_info
[i
];
578 ksnprintf(cx
->name
, sizeof(cx
->name
), "C%d", i
);
580 sysctl_ctx_init(&cx
->sysctl_ctx
);
581 cx
->sysctl_tree
= SYSCTL_ADD_NODE(&cx
->sysctl_ctx
,
582 SYSCTL_STATIC_CHILDREN(_machdep_mwait
), OID_AUTO
,
583 cx
->name
, CTLFLAG_RW
, NULL
, "Cx control/info");
584 if (cx
->sysctl_tree
== NULL
)
587 cx
->subcnt
= CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu
, i
);
588 SYSCTL_ADD_INT(&cx
->sysctl_ctx
,
589 SYSCTL_CHILDREN(cx
->sysctl_tree
), OID_AUTO
,
590 "subcnt", CTLFLAG_RD
, &cx
->subcnt
, 0,
592 SYSCTL_ADD_PROC(&cx
->sysctl_ctx
,
593 SYSCTL_CHILDREN(cx
->sysctl_tree
), OID_AUTO
,
594 "entered", (CTLTYPE_QUAD
| CTLFLAG_RW
), 0,
595 i
, sysctl_cpu_idle_cnt
, "Q", "# of times entered");
597 for (sub
= 0; sub
< cx
->subcnt
; ++sub
)
598 sbuf_printf(&sb
, "C%d/%d ", i
, sub
);
606 cpu_mwait_c1_hints_cnt
= cpu_mwait_cx_info
[CPU_MWAIT_C1
].subcnt
;
607 for (i
= CPU_MWAIT_C1
; i
< CPU_MWAIT_C3
; ++i
)
608 cpu_mwait_hints_cnt
+= cpu_mwait_cx_info
[i
].subcnt
;
609 cpu_mwait_hints
= kmalloc(sizeof(int) * cpu_mwait_hints_cnt
,
613 for (i
= CPU_MWAIT_C1
; i
< CPU_MWAIT_C3
; ++i
) {
616 subcnt
= cpu_mwait_cx_info
[i
].subcnt
;
617 for (j
= 0; j
< subcnt
; ++j
) {
618 KASSERT(hint_idx
< cpu_mwait_hints_cnt
,
619 ("invalid mwait hint index %d", hint_idx
));
620 cpu_mwait_hints
[hint_idx
] = MWAIT_EAX_HINT(i
, j
);
624 KASSERT(hint_idx
== cpu_mwait_hints_cnt
,
625 ("mwait hint count %d != index %d",
626 cpu_mwait_hints_cnt
, hint_idx
));
629 kprintf("MWAIT hints (%d C1 hints):\n", cpu_mwait_c1_hints_cnt
);
630 for (i
= 0; i
< cpu_mwait_hints_cnt
; ++i
) {
631 int hint
= cpu_mwait_hints
[i
];
633 kprintf(" C%d/%d hint 0x%04x\n",
634 MWAIT_EAX_TO_CX(hint
), MWAIT_EAX_TO_CX_SUB(hint
),
642 for (i
= CPU_MWAIT_C1
; i
< CPU_MWAIT_CX_MAX
; ++i
)
643 cpu_mwait_deep_hints_cnt
+= cpu_mwait_cx_info
[i
].subcnt
;
644 cpu_mwait_deep_hints
= kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt
,
648 for (i
= CPU_MWAIT_C1
; i
< CPU_MWAIT_CX_MAX
; ++i
) {
651 subcnt
= cpu_mwait_cx_info
[i
].subcnt
;
652 for (j
= 0; j
< subcnt
; ++j
) {
653 KASSERT(hint_idx
< cpu_mwait_deep_hints_cnt
,
654 ("invalid mwait deep hint index %d", hint_idx
));
655 cpu_mwait_deep_hints
[hint_idx
] = MWAIT_EAX_HINT(i
, j
);
659 KASSERT(hint_idx
== cpu_mwait_deep_hints_cnt
,
660 ("mwait deep hint count %d != index %d",
661 cpu_mwait_deep_hints_cnt
, hint_idx
));
664 kprintf("MWAIT deep hints:\n");
665 for (i
= 0; i
< cpu_mwait_deep_hints_cnt
; ++i
) {
666 int hint
= cpu_mwait_deep_hints
[i
];
668 kprintf(" C%d/%d hint 0x%04x\n",
669 MWAIT_EAX_TO_CX(hint
), MWAIT_EAX_TO_CX_SUB(hint
),
673 cpu_idle_repeat_max
= 256 * cpu_mwait_deep_hints_cnt
;
675 for (i
= 0; i
< ncpus
; ++i
) {
678 ksnprintf(name
, sizeof(name
), "idle%d", i
);
679 SYSCTL_ADD_PROC(NULL
,
680 SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX
), OID_AUTO
,
681 name
, (CTLTYPE_STRING
| CTLFLAG_RW
), &cpu_idle_stats
[i
],
682 0, cpu_mwait_cx_pcpu_idle_sysctl
, "A", "");
687 cpu_finish(void *dummy __unused
)
694 pic_finish(void *dummy __unused
)
696 /* Log ELCR information */
699 /* Log MPTABLE information */
700 mptable_pci_int_dump();
703 MachIntrABI
.finalize();
707 * Send an interrupt to process.
709 * Stack is set up to allow sigcode stored
710 * at top to call routine, followed by kcall
711 * to sigreturn routine below. After sigreturn
712 * resets the signal mask, the stack, and the
713 * frame pointer, it returns to the user
717 sendsig(sig_t catcher
, int sig
, sigset_t
*mask
, u_long code
)
719 struct lwp
*lp
= curthread
->td_lwp
;
720 struct proc
*p
= lp
->lwp_proc
;
721 struct trapframe
*regs
;
722 struct sigacts
*psp
= p
->p_sigacts
;
723 struct sigframe sf
, *sfp
;
727 regs
= lp
->lwp_md
.md_regs
;
728 oonstack
= (lp
->lwp_sigstk
.ss_flags
& SS_ONSTACK
) ? 1 : 0;
730 /* Save user context */
731 bzero(&sf
, sizeof(struct sigframe
));
732 sf
.sf_uc
.uc_sigmask
= *mask
;
733 sf
.sf_uc
.uc_stack
= lp
->lwp_sigstk
;
734 sf
.sf_uc
.uc_mcontext
.mc_onstack
= oonstack
;
735 KKASSERT(__offsetof(struct trapframe
, tf_rdi
) == 0);
736 bcopy(regs
, &sf
.sf_uc
.uc_mcontext
.mc_rdi
, sizeof(struct trapframe
));
738 /* Make the size of the saved context visible to userland */
739 sf
.sf_uc
.uc_mcontext
.mc_len
= sizeof(sf
.sf_uc
.uc_mcontext
);
741 /* Allocate and validate space for the signal handler context. */
742 if ((lp
->lwp_flags
& LWP_ALTSTACK
) != 0 && !oonstack
&&
743 SIGISMEMBER(psp
->ps_sigonstack
, sig
)) {
744 sp
= (char *)(lp
->lwp_sigstk
.ss_sp
+ lp
->lwp_sigstk
.ss_size
-
745 sizeof(struct sigframe
));
746 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
748 /* We take red zone into account */
749 sp
= (char *)regs
->tf_rsp
- sizeof(struct sigframe
) - 128;
753 * XXX AVX needs 64-byte alignment but sigframe has other fields and
754 * the embedded ucontext is not at the front, so aligning this won't
755 * help us. Fortunately we bcopy in/out of the sigframe, so the
758 * The problem though is if userland winds up trying to use the
761 sfp
= (struct sigframe
*)((intptr_t)sp
& ~(intptr_t)0xF);
763 /* Translate the signal is appropriate */
764 if (p
->p_sysent
->sv_sigtbl
) {
765 if (sig
<= p
->p_sysent
->sv_sigsize
)
766 sig
= p
->p_sysent
->sv_sigtbl
[_SIG_IDX(sig
)];
770 * Build the argument list for the signal handler.
772 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
774 regs
->tf_rdi
= sig
; /* argument 1 */
775 regs
->tf_rdx
= (register_t
)&sfp
->sf_uc
; /* argument 3 */
777 if (SIGISMEMBER(psp
->ps_siginfo
, sig
)) {
779 * Signal handler installed with SA_SIGINFO.
781 * action(signo, siginfo, ucontext)
783 regs
->tf_rsi
= (register_t
)&sfp
->sf_si
; /* argument 2 */
784 regs
->tf_rcx
= (register_t
)regs
->tf_addr
; /* argument 4 */
785 sf
.sf_ahu
.sf_action
= (__siginfohandler_t
*)catcher
;
787 /* fill siginfo structure */
788 sf
.sf_si
.si_signo
= sig
;
789 sf
.sf_si
.si_code
= code
;
790 sf
.sf_si
.si_addr
= (void *)regs
->tf_addr
;
793 * Old FreeBSD-style arguments.
795 * handler (signo, code, [uc], addr)
797 regs
->tf_rsi
= (register_t
)code
; /* argument 2 */
798 regs
->tf_rcx
= (register_t
)regs
->tf_addr
; /* argument 4 */
799 sf
.sf_ahu
.sf_handler
= catcher
;
803 * If we're a vm86 process, we want to save the segment registers.
804 * We also change eflags to be our emulated eflags, not the actual
808 if (regs
->tf_eflags
& PSL_VM
) {
809 struct trapframe_vm86
*tf
= (struct trapframe_vm86
*)regs
;
810 struct vm86_kernel
*vm86
= &lp
->lwp_thread
->td_pcb
->pcb_ext
->ext_vm86
;
812 sf
.sf_uc
.uc_mcontext
.mc_gs
= tf
->tf_vm86_gs
;
813 sf
.sf_uc
.uc_mcontext
.mc_fs
= tf
->tf_vm86_fs
;
814 sf
.sf_uc
.uc_mcontext
.mc_es
= tf
->tf_vm86_es
;
815 sf
.sf_uc
.uc_mcontext
.mc_ds
= tf
->tf_vm86_ds
;
817 if (vm86
->vm86_has_vme
== 0)
818 sf
.sf_uc
.uc_mcontext
.mc_eflags
=
819 (tf
->tf_eflags
& ~(PSL_VIF
| PSL_VIP
)) |
820 (vm86
->vm86_eflags
& (PSL_VIF
| PSL_VIP
));
823 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
824 * syscalls made by the signal handler. This just avoids
825 * wasting time for our lazy fixup of such faults. PSL_NT
826 * does nothing in vm86 mode, but vm86 programs can set it
827 * almost legitimately in probes for old cpu types.
829 tf
->tf_eflags
&= ~(PSL_VM
| PSL_NT
| PSL_VIF
| PSL_VIP
);
834 * Save the FPU state and reinit the FP unit
836 npxpush(&sf
.sf_uc
.uc_mcontext
);
839 * Copy the sigframe out to the user's stack.
841 if (copyout(&sf
, sfp
, sizeof(struct sigframe
)) != 0) {
843 * Something is wrong with the stack pointer.
844 * ...Kill the process.
849 regs
->tf_rsp
= (register_t
)sfp
;
850 regs
->tf_rip
= PS_STRINGS
- *(p
->p_sysent
->sv_szsigcode
);
853 * i386 abi specifies that the direction flag must be cleared
856 regs
->tf_rflags
&= ~(PSL_T
|PSL_D
);
859 * 64 bit mode has a code and stack selector but
860 * no data or extra selector. %fs and %gs are not
863 regs
->tf_cs
= _ucodesel
;
864 regs
->tf_ss
= _udatasel
;
869 * Sanitize the trapframe for a virtual kernel passing control to a custom
870 * VM context. Remove any items that would otherwise create a privilage
873 * XXX at the moment we allow userland to set the resume flag. Is this a
877 cpu_sanitize_frame(struct trapframe
*frame
)
879 frame
->tf_cs
= _ucodesel
;
880 frame
->tf_ss
= _udatasel
;
881 /* XXX VM (8086) mode not supported? */
882 frame
->tf_rflags
&= (PSL_RF
| PSL_USERCHANGE
| PSL_VM_UNSUPP
);
883 frame
->tf_rflags
|= PSL_RESERVED_DEFAULT
| PSL_I
;
889 * Sanitize the tls so loading the descriptor does not blow up
890 * on us. For x86_64 we don't have to do anything.
893 cpu_sanitize_tls(struct savetls
*tls
)
899 * sigreturn(ucontext_t *sigcntxp)
901 * System call to cleanup state after a signal
902 * has been taken. Reset signal mask and
903 * stack state from context left by sendsig (above).
904 * Return to previous pc and psl as specified by
905 * context left by sendsig. Check carefully to
906 * make sure that the user has not modified the
907 * state to gain improper privileges.
911 #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
912 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
915 sys_sigreturn(struct sigreturn_args
*uap
)
917 struct lwp
*lp
= curthread
->td_lwp
;
918 struct trapframe
*regs
;
926 * We have to copy the information into kernel space so userland
927 * can't modify it while we are sniffing it.
929 regs
= lp
->lwp_md
.md_regs
;
930 error
= copyin(uap
->sigcntxp
, &uc
, sizeof(uc
));
934 rflags
= ucp
->uc_mcontext
.mc_rflags
;
936 /* VM (8086) mode not supported */
937 rflags
&= ~PSL_VM_UNSUPP
;
940 if (eflags
& PSL_VM
) {
941 struct trapframe_vm86
*tf
= (struct trapframe_vm86
*)regs
;
942 struct vm86_kernel
*vm86
;
945 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
946 * set up the vm86 area, and we can't enter vm86 mode.
948 if (lp
->lwp_thread
->td_pcb
->pcb_ext
== 0)
950 vm86
= &lp
->lwp_thread
->td_pcb
->pcb_ext
->ext_vm86
;
951 if (vm86
->vm86_inited
== 0)
954 /* go back to user mode if both flags are set */
955 if ((eflags
& PSL_VIP
) && (eflags
& PSL_VIF
))
956 trapsignal(lp
, SIGBUS
, 0);
958 if (vm86
->vm86_has_vme
) {
959 eflags
= (tf
->tf_eflags
& ~VME_USERCHANGE
) |
960 (eflags
& VME_USERCHANGE
) | PSL_VM
;
962 vm86
->vm86_eflags
= eflags
; /* save VIF, VIP */
963 eflags
= (tf
->tf_eflags
& ~VM_USERCHANGE
) |
964 (eflags
& VM_USERCHANGE
) | PSL_VM
;
966 bcopy(&ucp
->uc_mcontext
.mc_gs
, tf
, sizeof(struct trapframe
));
967 tf
->tf_eflags
= eflags
;
968 tf
->tf_vm86_ds
= tf
->tf_ds
;
969 tf
->tf_vm86_es
= tf
->tf_es
;
970 tf
->tf_vm86_fs
= tf
->tf_fs
;
971 tf
->tf_vm86_gs
= tf
->tf_gs
;
972 tf
->tf_ds
= _udatasel
;
973 tf
->tf_es
= _udatasel
;
974 tf
->tf_fs
= _udatasel
;
975 tf
->tf_gs
= _udatasel
;
980 * Don't allow users to change privileged or reserved flags.
983 * XXX do allow users to change the privileged flag PSL_RF.
984 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers
985 * should sometimes set it there too. tf_eflags is kept in
986 * the signal context during signal handling and there is no
987 * other place to remember it, so the PSL_RF bit may be
988 * corrupted by the signal handler without us knowing.
989 * Corruption of the PSL_RF bit at worst causes one more or
990 * one less debugger trap, so allowing it is fairly harmless.
992 if (!EFL_SECURE(rflags
& ~PSL_RF
, regs
->tf_rflags
& ~PSL_RF
)) {
993 kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags
);
998 * Don't allow users to load a valid privileged %cs. Let the
999 * hardware check for invalid selectors, excess privilege in
1000 * other selectors, invalid %eip's and invalid %esp's.
1002 cs
= ucp
->uc_mcontext
.mc_cs
;
1003 if (!CS_SECURE(cs
)) {
1004 kprintf("sigreturn: cs = 0x%x\n", cs
);
1005 trapsignal(lp
, SIGBUS
, T_PROTFLT
);
1008 bcopy(&ucp
->uc_mcontext
.mc_rdi
, regs
, sizeof(struct trapframe
));
1012 * Restore the FPU state from the frame
1015 npxpop(&ucp
->uc_mcontext
);
1017 if (ucp
->uc_mcontext
.mc_onstack
& 1)
1018 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
1020 lp
->lwp_sigstk
.ss_flags
&= ~SS_ONSTACK
;
1022 lp
->lwp_sigmask
= ucp
->uc_sigmask
;
1023 SIG_CANTMASK(lp
->lwp_sigmask
);
1026 return(EJUSTRETURN
);
1030 * Machine dependent boot() routine
1032 * I haven't seen anything to put here yet
1033 * Possibly some stuff might be grafted back here from boot()
1041 * Shutdown the CPU as much as possible
1047 __asm__
__volatile("hlt");
1051 * cpu_idle() represents the idle LWKT. You cannot return from this function
1052 * (unless you want to blow things up!). Instead we look for runnable threads
1053 * and loop or halt as appropriate. Giant is not held on entry to the thread.
1055 * The main loop is entered with a critical section held, we must release
1056 * the critical section before doing anything else. lwkt_switch() will
1057 * check for pending interrupts due to entering and exiting its own
1060 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
1061 * However, there are cases where the idlethread will be entered with
1062 * the possibility that no IPI will occur and in such cases
1063 * lwkt_switch() sets TDF_IDLE_NOHLT.
1065 * NOTE: cpu_idle_repeat determines how many entries into the idle thread
1066 * must occur before it starts using ACPI halt.
1068 * NOTE: Value overridden in hammer_time().
1070 static int cpu_idle_hlt
= 2;
1071 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_hlt
, CTLFLAG_RW
,
1072 &cpu_idle_hlt
, 0, "Idle loop HLT enable");
1073 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_repeat
, CTLFLAG_RW
,
1074 &cpu_idle_repeat
, 0, "Idle entries before acpi hlt");
1076 SYSCTL_PROC(_machdep
, OID_AUTO
, cpu_idle_hltcnt
, (CTLTYPE_QUAD
| CTLFLAG_RW
),
1077 0, CPU_IDLE_STAT_HALT
, sysctl_cpu_idle_cnt
, "Q", "Idle loop entry halts");
1078 SYSCTL_PROC(_machdep
, OID_AUTO
, cpu_idle_spincnt
, (CTLTYPE_QUAD
| CTLFLAG_RW
),
1079 0, CPU_IDLE_STAT_SPIN
, sysctl_cpu_idle_cnt
, "Q", "Idle loop entry spins");
1082 cpu_idle_default_hook(void)
1085 * We must guarentee that hlt is exactly the instruction
1086 * following the sti.
1088 __asm
__volatile("sti; hlt");
1091 /* Other subsystems (e.g., ACPI) can hook this later. */
1092 void (*cpu_idle_hook
)(void) = cpu_idle_default_hook
;
1095 cpu_mwait_cx_hint(struct cpu_idle_stat
*stat
)
1104 idx
= (stat
->repeat
+ stat
->repeat_last
+ stat
->repeat_delta
) >>
1105 cpu_mwait_repeat_shift
;
1106 if (idx
>= cpu_mwait_c1_hints_cnt
) {
1107 /* Step up faster, once we walked through all C1 states */
1108 stat
->repeat_delta
+= 1 << (cpu_mwait_repeat_shift
+ 1);
1110 if (hint
== CPU_MWAIT_HINT_AUTODEEP
) {
1111 if (idx
>= cpu_mwait_deep_hints_cnt
)
1112 idx
= cpu_mwait_deep_hints_cnt
- 1;
1113 hint
= cpu_mwait_deep_hints
[idx
];
1115 if (idx
>= cpu_mwait_hints_cnt
)
1116 idx
= cpu_mwait_hints_cnt
- 1;
1117 hint
= cpu_mwait_hints
[idx
];
1120 cx_idx
= MWAIT_EAX_TO_CX(hint
);
1121 if (cx_idx
>= 0 && cx_idx
< CPU_MWAIT_CX_MAX
)
1122 stat
->mwait_cx
[cx_idx
]++;
1129 globaldata_t gd
= mycpu
;
1130 struct cpu_idle_stat
*stat
= &cpu_idle_stats
[gd
->gd_cpuid
];
1131 struct thread
*td __debugvar
= gd
->gd_curthread
;
1135 stat
->repeat
= stat
->repeat_last
= cpu_idle_repeat_max
;
1138 KKASSERT(td
->td_critcount
== 0);
1142 * See if there are any LWKTs ready to go.
1147 * When halting inside a cli we must check for reqflags
1148 * races, particularly [re]schedule requests. Running
1149 * splz() does the job.
1152 * 0 Never halt, just spin
1154 * 1 Always use HLT (or MONITOR/MWAIT if avail).
1156 * Better default for modern (Haswell+) Intel
1159 * 2 Use HLT/MONITOR/MWAIT up to a point and then
1160 * use the ACPI halt (default). This is a hybrid
1161 * approach. See machdep.cpu_idle_repeat.
1163 * Better default for modern AMD cpus and older
1166 * 3 Always use the ACPI halt. This typically
1167 * eats the least amount of power but the cpu
1168 * will be slow waking up. Slows down e.g.
1169 * compiles and other pipe/event oriented stuff.
1173 * NOTE: Interrupts are enabled and we are not in a critical
1176 * NOTE: Preemptions do not reset gd_idle_repeat. Also we
1177 * don't bother capping gd_idle_repeat, it is ok if
1180 * Implement optimized invltlb operations when halted
1181 * in idle. By setting the bit in smp_idleinvl_mask
1182 * we inform other cpus that they can set _reqs to
1183 * request an invltlb. Current the code to do that
1184 * sets the bits in _reqs anyway, but then check _mask
1185 * to determine if they can assume the invltlb will execute.
1187 * A critical section is required to ensure that interrupts
1188 * do not fully run until after we've had a chance to execute
1191 if (gd
->gd_idle_repeat
== 0) {
1192 stat
->repeat
= (stat
->repeat
+ stat
->repeat_last
) >> 1;
1193 if (stat
->repeat
> cpu_idle_repeat_max
)
1194 stat
->repeat
= cpu_idle_repeat_max
;
1195 stat
->repeat_last
= 0;
1196 stat
->repeat_delta
= 0;
1198 ++stat
->repeat_last
;
1200 ++gd
->gd_idle_repeat
;
1201 reqflags
= gd
->gd_reqflags
;
1202 quick
= (cpu_idle_hlt
== 1) ||
1203 (cpu_idle_hlt
< 3 &&
1204 gd
->gd_idle_repeat
< cpu_idle_repeat
);
1206 if (quick
&& (cpu_mi_feature
& CPU_MI_MONITOR
) &&
1207 (reqflags
& RQF_IDLECHECK_WK_MASK
) == 0) {
1210 ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask
, gd
->gd_cpuid
);
1211 cpu_mmw_pause_int(&gd
->gd_reqflags
, reqflags
,
1212 cpu_mwait_cx_hint(stat
), 0);
1214 ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask
, gd
->gd_cpuid
);
1215 if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs
,
1221 } else if (cpu_idle_hlt
) {
1222 __asm
__volatile("cli");
1225 ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask
, gd
->gd_cpuid
);
1226 if ((gd
->gd_reqflags
& RQF_IDLECHECK_WK_MASK
) == 0) {
1228 cpu_idle_default_hook();
1232 __asm
__volatile("sti");
1234 ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask
, gd
->gd_cpuid
);
1235 if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs
,
1243 __asm
__volatile("sti");
1250 * Called in a loop indirectly via Xcpustop
1253 cpu_smp_stopped(void)
1255 globaldata_t gd
= mycpu
;
1256 volatile __uint64_t
*ptr
;
1259 ptr
= CPUMASK_ADDR(started_cpus
, gd
->gd_cpuid
);
1261 if ((ovalue
& CPUMASK_SIMPLE(gd
->gd_cpuid
& 63)) == 0) {
1262 if (cpu_mi_feature
& CPU_MI_MONITOR
) {
1263 cpu_mmw_pause_long(__DEVOLATILE(void *, ptr
), ovalue
,
1264 cpu_mwait_hints
[CPU_MWAIT_C1
], 0);
1266 cpu_halt(); /* depend on lapic timer */
1272 * This routine is called if a spinlock has been held through the
1273 * exponential backoff period and is seriously contested. On a real cpu
1277 cpu_spinlock_contested(void)
1283 * Clear registers on exec
1286 exec_setregs(u_long entry
, u_long stack
, u_long ps_strings
)
1288 struct thread
*td
= curthread
;
1289 struct lwp
*lp
= td
->td_lwp
;
1290 struct pcb
*pcb
= td
->td_pcb
;
1291 struct trapframe
*regs
= lp
->lwp_md
.md_regs
;
1293 /* was i386_user_cleanup() in NetBSD */
1297 bzero((char *)regs
, sizeof(struct trapframe
));
1298 regs
->tf_rip
= entry
;
1299 regs
->tf_rsp
= ((stack
- 8) & ~0xFul
) + 8; /* align the stack */
1300 regs
->tf_rdi
= stack
; /* argv */
1301 regs
->tf_rflags
= PSL_USER
| (regs
->tf_rflags
& PSL_T
);
1302 regs
->tf_ss
= _udatasel
;
1303 regs
->tf_cs
= _ucodesel
;
1304 regs
->tf_rbx
= ps_strings
;
1307 * Reset the hardware debug registers if they were in use.
1308 * They won't have any meaning for the newly exec'd process.
1310 if (pcb
->pcb_flags
& PCB_DBREGS
) {
1316 pcb
->pcb_dr7
= 0; /* JG set bit 10? */
1317 if (pcb
== td
->td_pcb
) {
1319 * Clear the debug registers on the running
1320 * CPU, otherwise they will end up affecting
1321 * the next process we switch to.
1325 pcb
->pcb_flags
&= ~PCB_DBREGS
;
1329 * Initialize the math emulator (if any) for the current process.
1330 * Actually, just clear the bit that says that the emulator has
1331 * been initialized. Initialization is delayed until the process
1332 * traps to the emulator (if it is done at all) mainly because
1333 * emulators don't provide an entry point for initialization.
1335 pcb
->pcb_flags
&= ~FP_SOFTFP
;
1338 * NOTE: do not set CR0_TS here. npxinit() must do it after clearing
1339 * gd_npxthread. Otherwise a preemptive interrupt thread
1340 * may panic in npxdna().
1343 load_cr0(rcr0() | CR0_MP
);
1346 * NOTE: The MSR values must be correct so we can return to
1347 * userland. gd_user_fs/gs must be correct so the switch
1348 * code knows what the current MSR values are.
1350 pcb
->pcb_fsbase
= 0; /* Values loaded from PCB on switch */
1351 pcb
->pcb_gsbase
= 0;
1352 mdcpu
->gd_user_fs
= 0; /* Cache of current MSR values */
1353 mdcpu
->gd_user_gs
= 0;
1354 wrmsr(MSR_FSBASE
, 0); /* Set MSR values for return to userland */
1355 wrmsr(MSR_KGSBASE
, 0);
1357 /* Initialize the npx (if any) for the current process. */
1361 pcb
->pcb_ds
= _udatasel
;
1362 pcb
->pcb_es
= _udatasel
;
1363 pcb
->pcb_fs
= _udatasel
;
1364 pcb
->pcb_gs
= _udatasel
;
1373 cr0
|= CR0_NE
; /* Done by npxinit() */
1374 cr0
|= CR0_MP
| CR0_TS
; /* Done at every execve() too. */
1375 cr0
|= CR0_WP
| CR0_AM
;
1381 sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS
)
1384 error
= sysctl_handle_int(oidp
, oidp
->oid_arg1
, oidp
->oid_arg2
,
1386 if (!error
&& req
->newptr
)
1391 SYSCTL_PROC(_machdep
, CPU_ADJKERNTZ
, adjkerntz
, CTLTYPE_INT
|CTLFLAG_RW
,
1392 &adjkerntz
, 0, sysctl_machdep_adjkerntz
, "I", "");
1394 SYSCTL_INT(_machdep
, CPU_DISRTCSET
, disable_rtc_set
,
1395 CTLFLAG_RW
, &disable_rtc_set
, 0, "");
1398 SYSCTL_STRUCT(_machdep
, CPU_BOOTINFO
, bootinfo
,
1399 CTLFLAG_RD
, &bootinfo
, bootinfo
, "");
1402 SYSCTL_INT(_machdep
, CPU_WALLCLOCK
, wall_cmos_clock
,
1403 CTLFLAG_RW
, &wall_cmos_clock
, 0, "");
1405 extern u_long bootdev
; /* not a cdev_t - encoding is different */
1406 SYSCTL_ULONG(_machdep
, OID_AUTO
, guessed_bootdev
,
1407 CTLFLAG_RD
, &bootdev
, 0, "Boot device (not in cdev_t format)");
1410 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS
)
1412 struct efi_map_header
*efihdr
;
1416 kmdp
= preload_search_by_type("elf kernel");
1418 kmdp
= preload_search_by_type("elf64 kernel");
1419 efihdr
= (struct efi_map_header
*)preload_search_info(kmdp
,
1420 MODINFO_METADATA
| MODINFOMD_EFI_MAP
);
1423 efisize
= *((uint32_t *)efihdr
- 1);
1424 return (SYSCTL_OUT(req
, efihdr
, efisize
));
1426 SYSCTL_PROC(_machdep
, OID_AUTO
, efi_map
, CTLTYPE_OPAQUE
|CTLFLAG_RD
, NULL
, 0,
1427 efi_map_sysctl_handler
, "S,efi_map_header", "Raw EFI Memory Map");
1430 * Initialize 386 and configure to run kernel
1434 * Initialize segments & interrupt table
1438 struct user_segment_descriptor gdt
[NGDT
* MAXCPU
]; /* global descriptor table */
1439 struct gate_descriptor idt_arr
[MAXCPU
][NIDT
];
1441 union descriptor ldt
[NLDT
]; /* local descriptor table */
1444 /* table descriptors - used to load tables by cpu */
1445 struct region_descriptor r_gdt
;
1446 struct region_descriptor r_idt_arr
[MAXCPU
];
1448 /* JG proc0paddr is a virtual address */
1451 char proc0paddr_buff
[LWKT_THREAD_STACK
];
1454 /* software prototypes -- in more palatable form */
1455 struct soft_segment_descriptor gdt_segs
[] = {
1456 /* GNULL_SEL 0 Null Descriptor */
1457 { 0x0, /* segment base address */
1459 0, /* segment type */
1460 0, /* segment descriptor priority level */
1461 0, /* segment descriptor present */
1463 0, /* default 32 vs 16 bit size */
1464 0 /* limit granularity (byte/page units)*/ },
1465 /* GCODE_SEL 1 Code Descriptor for kernel */
1466 { 0x0, /* segment base address */
1467 0xfffff, /* length - all address space */
1468 SDT_MEMERA
, /* segment type */
1469 SEL_KPL
, /* segment descriptor priority level */
1470 1, /* segment descriptor present */
1472 0, /* default 32 vs 16 bit size */
1473 1 /* limit granularity (byte/page units)*/ },
1474 /* GDATA_SEL 2 Data Descriptor for kernel */
1475 { 0x0, /* segment base address */
1476 0xfffff, /* length - all address space */
1477 SDT_MEMRWA
, /* segment type */
1478 SEL_KPL
, /* segment descriptor priority level */
1479 1, /* segment descriptor present */
1481 0, /* default 32 vs 16 bit size */
1482 1 /* limit granularity (byte/page units)*/ },
1483 /* GUCODE32_SEL 3 32 bit Code Descriptor for user */
1484 { 0x0, /* segment base address */
1485 0xfffff, /* length - all address space */
1486 SDT_MEMERA
, /* segment type */
1487 SEL_UPL
, /* segment descriptor priority level */
1488 1, /* segment descriptor present */
1490 1, /* default 32 vs 16 bit size */
1491 1 /* limit granularity (byte/page units)*/ },
1492 /* GUDATA_SEL 4 32/64 bit Data Descriptor for user */
1493 { 0x0, /* segment base address */
1494 0xfffff, /* length - all address space */
1495 SDT_MEMRWA
, /* segment type */
1496 SEL_UPL
, /* segment descriptor priority level */
1497 1, /* segment descriptor present */
1499 1, /* default 32 vs 16 bit size */
1500 1 /* limit granularity (byte/page units)*/ },
1501 /* GUCODE_SEL 5 64 bit Code Descriptor for user */
1502 { 0x0, /* segment base address */
1503 0xfffff, /* length - all address space */
1504 SDT_MEMERA
, /* segment type */
1505 SEL_UPL
, /* segment descriptor priority level */
1506 1, /* segment descriptor present */
1508 0, /* default 32 vs 16 bit size */
1509 1 /* limit granularity (byte/page units)*/ },
1510 /* GPROC0_SEL 6 Proc 0 Tss Descriptor */
1512 0x0, /* segment base address */
1513 sizeof(struct x86_64tss
)-1,/* length - all address space */
1514 SDT_SYSTSS
, /* segment type */
1515 SEL_KPL
, /* segment descriptor priority level */
1516 1, /* segment descriptor present */
1518 0, /* unused - default 32 vs 16 bit size */
1519 0 /* limit granularity (byte/page units)*/ },
1520 /* Actually, the TSS is a system descriptor which is double size */
1521 { 0x0, /* segment base address */
1523 0, /* segment type */
1524 0, /* segment descriptor priority level */
1525 0, /* segment descriptor present */
1527 0, /* default 32 vs 16 bit size */
1528 0 /* limit granularity (byte/page units)*/ },
1529 /* GUGS32_SEL 8 32 bit GS Descriptor for user */
1530 { 0x0, /* segment base address */
1531 0xfffff, /* length - all address space */
1532 SDT_MEMRWA
, /* segment type */
1533 SEL_UPL
, /* segment descriptor priority level */
1534 1, /* segment descriptor present */
1536 1, /* default 32 vs 16 bit size */
1537 1 /* limit granularity (byte/page units)*/ },
1541 setidt_global(int idx
, inthand_t
*func
, int typ
, int dpl
, int ist
)
1545 for (cpu
= 0; cpu
< MAXCPU
; ++cpu
) {
1546 struct gate_descriptor
*ip
= &idt_arr
[cpu
][idx
];
1548 ip
->gd_looffset
= (uintptr_t)func
;
1549 ip
->gd_selector
= GSEL(GCODE_SEL
, SEL_KPL
);
1555 ip
->gd_hioffset
= ((uintptr_t)func
)>>16 ;
1560 setidt(int idx
, inthand_t
*func
, int typ
, int dpl
, int ist
, int cpu
)
1562 struct gate_descriptor
*ip
;
1564 KASSERT(cpu
>= 0 && cpu
< ncpus
, ("invalid cpu %d", cpu
));
1566 ip
= &idt_arr
[cpu
][idx
];
1567 ip
->gd_looffset
= (uintptr_t)func
;
1568 ip
->gd_selector
= GSEL(GCODE_SEL
, SEL_KPL
);
1574 ip
->gd_hioffset
= ((uintptr_t)func
)>>16 ;
1577 #define IDTVEC(name) __CONCAT(X,name)
1580 IDTVEC(div
), IDTVEC(dbg
), IDTVEC(nmi
), IDTVEC(bpt
), IDTVEC(ofl
),
1581 IDTVEC(bnd
), IDTVEC(ill
), IDTVEC(dna
), IDTVEC(fpusegm
),
1582 IDTVEC(tss
), IDTVEC(missing
), IDTVEC(stk
), IDTVEC(prot
),
1583 IDTVEC(page
), IDTVEC(mchk
), IDTVEC(rsvd
), IDTVEC(fpu
), IDTVEC(align
),
1584 IDTVEC(xmm
), IDTVEC(dblfault
),
1585 IDTVEC(fast_syscall
), IDTVEC(fast_syscall32
);
1588 sdtossd(struct user_segment_descriptor
*sd
, struct soft_segment_descriptor
*ssd
)
1590 ssd
->ssd_base
= (sd
->sd_hibase
<< 24) | sd
->sd_lobase
;
1591 ssd
->ssd_limit
= (sd
->sd_hilimit
<< 16) | sd
->sd_lolimit
;
1592 ssd
->ssd_type
= sd
->sd_type
;
1593 ssd
->ssd_dpl
= sd
->sd_dpl
;
1594 ssd
->ssd_p
= sd
->sd_p
;
1595 ssd
->ssd_def32
= sd
->sd_def32
;
1596 ssd
->ssd_gran
= sd
->sd_gran
;
1600 ssdtosd(struct soft_segment_descriptor
*ssd
, struct user_segment_descriptor
*sd
)
1603 sd
->sd_lobase
= (ssd
->ssd_base
) & 0xffffff;
1604 sd
->sd_hibase
= (ssd
->ssd_base
>> 24) & 0xff;
1605 sd
->sd_lolimit
= (ssd
->ssd_limit
) & 0xffff;
1606 sd
->sd_hilimit
= (ssd
->ssd_limit
>> 16) & 0xf;
1607 sd
->sd_type
= ssd
->ssd_type
;
1608 sd
->sd_dpl
= ssd
->ssd_dpl
;
1609 sd
->sd_p
= ssd
->ssd_p
;
1610 sd
->sd_long
= ssd
->ssd_long
;
1611 sd
->sd_def32
= ssd
->ssd_def32
;
1612 sd
->sd_gran
= ssd
->ssd_gran
;
1616 ssdtosyssd(struct soft_segment_descriptor
*ssd
,
1617 struct system_segment_descriptor
*sd
)
1620 sd
->sd_lobase
= (ssd
->ssd_base
) & 0xffffff;
1621 sd
->sd_hibase
= (ssd
->ssd_base
>> 24) & 0xfffffffffful
;
1622 sd
->sd_lolimit
= (ssd
->ssd_limit
) & 0xffff;
1623 sd
->sd_hilimit
= (ssd
->ssd_limit
>> 16) & 0xf;
1624 sd
->sd_type
= ssd
->ssd_type
;
1625 sd
->sd_dpl
= ssd
->ssd_dpl
;
1626 sd
->sd_p
= ssd
->ssd_p
;
1627 sd
->sd_gran
= ssd
->ssd_gran
;
1631 * Populate the (physmap) array with base/bound pairs describing the
1632 * available physical memory in the system, then test this memory and
1633 * build the phys_avail array describing the actually-available memory.
1635 * If we cannot accurately determine the physical memory map, then use
1636 * value from the 0xE801 call, and failing that, the RTC.
1638 * Total memory size may be set by the kernel environment variable
1639 * hw.physmem or the compile-time define MAXMEM.
1641 * Memory is aligned to PHYSMAP_ALIGN which must be a multiple
1642 * of PAGE_SIZE. This also greatly reduces the memory test time
1643 * which would otherwise be excessive on machines with > 8G of ram.
1645 * XXX first should be vm_paddr_t.
1648 #define PHYSMAP_ALIGN (vm_paddr_t)(128 * 1024)
1649 #define PHYSMAP_ALIGN_MASK (vm_paddr_t)(PHYSMAP_ALIGN - 1)
1650 #define PHYSMAP_SIZE VM_PHYSSEG_MAX
1652 vm_paddr_t physmap
[PHYSMAP_SIZE
];
1653 struct bios_smap
*smapbase
, *smap
, *smapend
;
1654 struct efi_map_header
*efihdrbase
;
1657 #define PHYSMAP_HANDWAVE (vm_paddr_t)(2 * 1024 * 1024)
1658 #define PHYSMAP_HANDWAVE_MASK (PHYSMAP_HANDWAVE - 1)
1661 add_smap_entries(int *physmap_idx
)
1665 smapsize
= *((u_int32_t
*)smapbase
- 1);
1666 smapend
= (struct bios_smap
*)((uintptr_t)smapbase
+ smapsize
);
1668 for (smap
= smapbase
; smap
< smapend
; smap
++) {
1669 if (boothowto
& RB_VERBOSE
)
1670 kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
1671 smap
->type
, smap
->base
, smap
->length
);
1673 if (smap
->type
!= SMAP_TYPE_MEMORY
)
1676 if (smap
->length
== 0)
1679 for (i
= 0; i
<= *physmap_idx
; i
+= 2) {
1680 if (smap
->base
< physmap
[i
+ 1]) {
1681 if (boothowto
& RB_VERBOSE
) {
1682 kprintf("Overlapping or non-monotonic "
1683 "memory region, ignoring "
1689 if (i
<= *physmap_idx
)
1692 Realmem
+= smap
->length
;
1694 if (smap
->base
== physmap
[*physmap_idx
+ 1]) {
1695 physmap
[*physmap_idx
+ 1] += smap
->length
;
1700 if (*physmap_idx
== PHYSMAP_SIZE
) {
1701 kprintf("Too many segments in the physical "
1702 "address map, giving up\n");
1705 physmap
[*physmap_idx
] = smap
->base
;
1706 physmap
[*physmap_idx
+ 1] = smap
->base
+ smap
->length
;
1711 add_efi_map_entries(int *physmap_idx
)
1713 struct efi_md
*map
, *p
;
1718 static const char *types
[] = {
1724 "RuntimeServicesCode",
1725 "RuntimeServicesData",
1726 "ConventionalMemory",
1728 "ACPIReclaimMemory",
1731 "MemoryMappedIOPortSpace",
1736 * Memory map data provided by UEFI via the GetMemoryMap
1737 * Boot Services API.
1739 efisz
= (sizeof(struct efi_map_header
) + 0xf) & ~0xf;
1740 map
= (struct efi_md
*)((uint8_t *)efihdrbase
+ efisz
);
1742 if (efihdrbase
->descriptor_size
== 0)
1744 ndesc
= efihdrbase
->memory_size
/ efihdrbase
->descriptor_size
;
1746 if (boothowto
& RB_VERBOSE
)
1747 kprintf("%23s %12s %12s %8s %4s\n",
1748 "Type", "Physical", "Virtual", "#Pages", "Attr");
1750 for (i
= 0, p
= map
; i
< ndesc
; i
++,
1751 p
= efi_next_descriptor(p
, efihdrbase
->descriptor_size
)) {
1752 if (boothowto
& RB_VERBOSE
) {
1753 if (p
->md_type
<= EFI_MD_TYPE_PALCODE
)
1754 type
= types
[p
->md_type
];
1757 kprintf("%23s %012lx %12p %08lx ", type
, p
->md_phys
,
1758 p
->md_virt
, p
->md_pages
);
1759 if (p
->md_attr
& EFI_MD_ATTR_UC
)
1761 if (p
->md_attr
& EFI_MD_ATTR_WC
)
1763 if (p
->md_attr
& EFI_MD_ATTR_WT
)
1765 if (p
->md_attr
& EFI_MD_ATTR_WB
)
1767 if (p
->md_attr
& EFI_MD_ATTR_UCE
)
1769 if (p
->md_attr
& EFI_MD_ATTR_WP
)
1771 if (p
->md_attr
& EFI_MD_ATTR_RP
)
1773 if (p
->md_attr
& EFI_MD_ATTR_XP
)
1775 if (p
->md_attr
& EFI_MD_ATTR_RT
)
1780 switch (p
->md_type
) {
1781 case EFI_MD_TYPE_CODE
:
1782 case EFI_MD_TYPE_DATA
:
1783 case EFI_MD_TYPE_BS_CODE
:
1784 case EFI_MD_TYPE_BS_DATA
:
1785 case EFI_MD_TYPE_FREE
:
1787 * We're allowed to use any entry with these types.
1794 Realmem
+= p
->md_pages
* PAGE_SIZE
;
1796 if (p
->md_phys
== physmap
[*physmap_idx
+ 1]) {
1797 physmap
[*physmap_idx
+ 1] += p
->md_pages
* PAGE_SIZE
;
1802 if (*physmap_idx
== PHYSMAP_SIZE
) {
1803 kprintf("Too many segments in the physical "
1804 "address map, giving up\n");
1807 physmap
[*physmap_idx
] = p
->md_phys
;
1808 physmap
[*physmap_idx
+ 1] = p
->md_phys
+ p
->md_pages
* PAGE_SIZE
;
1812 struct fb_info efi_fb_info
;
1813 static int have_efi_framebuffer
= 0;
1816 efi_fb_init_vaddr(int direct_map
)
1819 vm_offset_t addr
, v
;
1821 v
= efi_fb_info
.vaddr
;
1822 sz
= efi_fb_info
.stride
* efi_fb_info
.height
;
1825 addr
= PHYS_TO_DMAP(efi_fb_info
.paddr
);
1826 if (addr
>= DMAP_MIN_ADDRESS
&& addr
+ sz
< DMAP_MAX_ADDRESS
)
1827 efi_fb_info
.vaddr
= addr
;
1829 efi_fb_info
.vaddr
= (vm_offset_t
)pmap_mapdev_attr(
1830 efi_fb_info
.paddr
, sz
, PAT_WRITE_COMBINING
);
1835 probe_efi_fb(int early
)
1837 struct efi_fb
*efifb
;
1840 if (have_efi_framebuffer
) {
1842 (efi_fb_info
.vaddr
== 0 ||
1843 efi_fb_info
.vaddr
== PHYS_TO_DMAP(efi_fb_info
.paddr
)))
1844 efi_fb_init_vaddr(0);
1848 kmdp
= preload_search_by_type("elf kernel");
1850 kmdp
= preload_search_by_type("elf64 kernel");
1851 efifb
= (struct efi_fb
*)preload_search_info(kmdp
,
1852 MODINFO_METADATA
| MODINFOMD_EFI_FB
);
1856 have_efi_framebuffer
= 1;
1858 efi_fb_info
.is_vga_boot_display
= 1;
1859 efi_fb_info
.width
= efifb
->fb_width
;
1860 efi_fb_info
.height
= efifb
->fb_height
;
1861 efi_fb_info
.stride
= efifb
->fb_stride
* 4;
1862 efi_fb_info
.depth
= 32;
1863 efi_fb_info
.paddr
= efifb
->fb_addr
;
1865 efi_fb_info
.vaddr
= 0;
1867 efi_fb_init_vaddr(0);
1869 efi_fb_info
.fbops
.fb_set_par
= NULL
;
1870 efi_fb_info
.fbops
.fb_blank
= NULL
;
1871 efi_fb_info
.fbops
.fb_debug_enter
= NULL
;
1872 efi_fb_info
.device
= NULL
;
1878 efifb_startup(void *arg
)
1883 SYSINIT(efi_fb_info
, SI_BOOT1_POST
, SI_ORDER_FIRST
, efifb_startup
, NULL
);
1886 getmemsize(caddr_t kmdp
, u_int64_t first
)
1888 int off
, physmap_idx
, pa_indx
, da_indx
;
1891 vm_paddr_t msgbuf_size
;
1892 u_long physmem_tunable
;
1894 quad_t dcons_addr
, dcons_size
;
1896 bzero(physmap
, sizeof(physmap
));
1900 * get memory map from INT 15:E820, kindly supplied by the loader.
1902 * subr_module.c says:
1903 * "Consumer may safely assume that size value precedes data."
1904 * ie: an int32_t immediately precedes smap.
1906 efihdrbase
= (struct efi_map_header
*)preload_search_info(kmdp
,
1907 MODINFO_METADATA
| MODINFOMD_EFI_MAP
);
1908 smapbase
= (struct bios_smap
*)preload_search_info(kmdp
,
1909 MODINFO_METADATA
| MODINFOMD_SMAP
);
1910 if (smapbase
== NULL
&& efihdrbase
== NULL
)
1911 panic("No BIOS smap or EFI map info from loader!");
1913 if (efihdrbase
== NULL
)
1914 add_smap_entries(&physmap_idx
);
1916 add_efi_map_entries(&physmap_idx
);
1918 base_memory
= physmap
[1] / 1024;
1919 /* make hole for AP bootstrap code */
1920 physmap
[1] = mp_bootaddress(base_memory
);
1922 /* Save EBDA address, if any */
1923 ebda_addr
= (u_long
)(*(u_short
*)(KERNBASE
+ 0x40e));
1927 * Maxmem isn't the "maximum memory", it's one larger than the
1928 * highest page of the physical address space. It should be
1929 * called something like "Maxphyspage". We may adjust this
1930 * based on ``hw.physmem'' and the results of the memory test.
1932 Maxmem
= atop(physmap
[physmap_idx
+ 1]);
1935 Maxmem
= MAXMEM
/ 4;
1938 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable
))
1939 Maxmem
= atop(physmem_tunable
);
1942 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
1945 if (Maxmem
> atop(physmap
[physmap_idx
+ 1]))
1946 Maxmem
= atop(physmap
[physmap_idx
+ 1]);
1949 * Blowing out the DMAP will blow up the system.
1951 if (Maxmem
> atop(DMAP_MAX_ADDRESS
- DMAP_MIN_ADDRESS
)) {
1952 kprintf("Limiting Maxmem due to DMAP size\n");
1953 Maxmem
= atop(DMAP_MAX_ADDRESS
- DMAP_MIN_ADDRESS
);
1956 if (atop(physmap
[physmap_idx
+ 1]) != Maxmem
&&
1957 (boothowto
& RB_VERBOSE
)) {
1958 kprintf("Physical memory use set to %ldK\n", Maxmem
* 4);
1962 * Call pmap initialization to make new kernel address space
1966 pmap_bootstrap(&first
);
1967 physmap
[0] = PAGE_SIZE
;
1970 * Align the physmap to PHYSMAP_ALIGN and cut out anything
1973 for (i
= j
= 0; i
<= physmap_idx
; i
+= 2) {
1974 if (physmap
[i
+1] > ptoa(Maxmem
))
1975 physmap
[i
+1] = ptoa(Maxmem
);
1976 physmap
[i
] = (physmap
[i
] + PHYSMAP_ALIGN_MASK
) &
1977 ~PHYSMAP_ALIGN_MASK
;
1978 physmap
[i
+1] = physmap
[i
+1] & ~PHYSMAP_ALIGN_MASK
;
1980 physmap
[j
] = physmap
[i
];
1981 physmap
[j
+1] = physmap
[i
+1];
1983 if (physmap
[i
] < physmap
[i
+1])
1986 physmap_idx
= j
- 2;
1989 * Align anything else used in the validation loop.
1991 first
= (first
+ PHYSMAP_ALIGN_MASK
) & ~PHYSMAP_ALIGN_MASK
;
1994 * Size up each available chunk of physical memory.
1998 phys_avail
[pa_indx
].phys_beg
= physmap
[0];
1999 phys_avail
[pa_indx
].phys_end
= physmap
[0];
2000 dump_avail
[da_indx
].phys_beg
= 0;
2001 dump_avail
[da_indx
].phys_end
= physmap
[0];
2005 * Get dcons buffer address
2007 if (kgetenv_quad("dcons.addr", &dcons_addr
) == 0 ||
2008 kgetenv_quad("dcons.size", &dcons_size
) == 0)
2012 * Validate the physical memory. The physical memory segments
2013 * have already been aligned to PHYSMAP_ALIGN which is a multiple
2016 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
2018 vm_paddr_t incr
= PHYSMAP_ALIGN
;
2020 end
= physmap
[i
+ 1];
2022 for (pa
= physmap
[i
]; pa
< end
; pa
+= incr
) {
2024 volatile uint64_t *ptr
= (uint64_t *)CADDR1
;
2027 incr
= PHYSMAP_ALIGN
;
2031 * block out kernel memory as not available.
2033 if (pa
>= 0x200000 && pa
< first
)
2037 * block out dcons buffer
2040 && pa
>= trunc_page(dcons_addr
)
2041 && pa
< dcons_addr
+ dcons_size
) {
2048 * Always test the first and last block supplied in
2049 * the map entry, but it just takes too long to run
2050 * the test these days and we already have to skip
2051 * pages. Handwave it on PHYSMAP_HANDWAVE boundaries.
2053 if (pa
!= physmap
[i
]) {
2054 vm_paddr_t bytes
= end
- pa
;
2055 if ((pa
& PHYSMAP_HANDWAVE_MASK
) == 0 &&
2056 bytes
>= PHYSMAP_HANDWAVE
+ PHYSMAP_ALIGN
) {
2057 incr
= PHYSMAP_HANDWAVE
;
2063 * map page into kernel: valid, read/write,non-cacheable
2066 kernel_pmap
.pmap_bits
[PG_V_IDX
] |
2067 kernel_pmap
.pmap_bits
[PG_RW_IDX
] |
2068 kernel_pmap
.pmap_bits
[PG_N_IDX
];
2069 cpu_invlpg(__DEVOLATILE(void *, ptr
));
2074 * Test for alternating 1's and 0's
2076 *ptr
= 0xaaaaaaaaaaaaaaaaLLU
;
2078 if (*ptr
!= 0xaaaaaaaaaaaaaaaaLLU
)
2081 * Test for alternating 0's and 1's
2083 *ptr
= 0x5555555555555555LLU
;
2085 if (*ptr
!= 0x5555555555555555LLU
)
2090 *ptr
= 0xffffffffffffffffLLU
;
2092 if (*ptr
!= 0xffffffffffffffffLLU
)
2102 * Restore original value.
2108 * Adjust array of valid/good pages.
2110 if (page_bad
== TRUE
)
2114 * If this good page is a continuation of the
2115 * previous set of good pages, then just increase
2116 * the end pointer. Otherwise start a new chunk.
2117 * Note that "end" points one higher than end,
2118 * making the range >= start and < end.
2119 * If we're also doing a speculative memory
2120 * test and we at or past the end, bump up Maxmem
2121 * so that we keep going. The first bad page
2122 * will terminate the loop.
2124 if (phys_avail
[pa_indx
].phys_end
== pa
) {
2125 phys_avail
[pa_indx
].phys_end
+= incr
;
2128 if (pa_indx
== PHYS_AVAIL_ARRAY_END
) {
2130 "Too many holes in the physical address space, giving up\n");
2135 phys_avail
[pa_indx
].phys_beg
= pa
;
2136 phys_avail
[pa_indx
].phys_end
= pa
+ incr
;
2138 physmem
+= incr
/ PAGE_SIZE
;
2140 if (dump_avail
[da_indx
].phys_end
== pa
) {
2141 dump_avail
[da_indx
].phys_end
+= incr
;
2144 if (da_indx
== DUMP_AVAIL_ARRAY_END
) {
2148 dump_avail
[da_indx
].phys_beg
= pa
;
2149 dump_avail
[da_indx
].phys_end
= pa
+ incr
;
	/*
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;

	while (phys_avail[pa_indx].phys_beg + PHYSMAP_ALIGN + msgbuf_size >=
	       phys_avail[pa_indx].phys_end) {
		physmem -= atop(phys_avail[pa_indx].phys_end -
				phys_avail[pa_indx].phys_beg);
		phys_avail[pa_indx].phys_beg = 0;
		phys_avail[pa_indx].phys_end = 0;
		--pa_indx;
	}

	Maxmem = atop(phys_avail[pa_indx].phys_end);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx].phys_end -= msgbuf_size;

	avail_end = phys_avail[pa_indx].phys_end;

	/* Map the message buffer. */
	for (off = 0; off < msgbuf_size; off += PAGE_SIZE) {
		pmap_kenter((vm_offset_t)msgbufp + off,
			    avail_end + off);
	}

	/* Try to get EFI framebuffer working as early as possible */
	if (have_efi_framebuffer)
		efi_fb_init_vaddr(1);
}
struct machintr_abi MachIntrABI;
/*
 * IDT VECTORS:
 *	0	Divide by zero
 *	1	Debug
 *	2	NMI
 *	3	BreakPoint
 *	4	OverFlow
 *	5	Bound-Range
 *	6	Invalid OpCode
 *	7	Device Not Available (x87)
 *	8	Double-Fault
 *	9	Coprocessor Segment overrun (unsupported, reserved)
 *	10	Invalid-TSS
 *	11	Segment not present
 *	12	Stack
 *	13	General Protection
 *	14	Page Fault
 *	15	Reserved
 *	16	x87 FP Exception pending
 *	17	Alignment Check
 *	18	Machine Check
 *	19	SIMD floating point
 *	20-31	reserved
 *	32-255	INTn/external sources
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x, cpu;
	int metadata_missing, off;
	struct mdglobaldata *gd;
	u_int64_t msr;
	/*
	 * Prevent lowering of the ipl if we call tsleep() early.
	 */
	gd = &CPU_prvspace[0]->mdglobaldata;
	bzero(gd, sizeof(*gd));
	/*
	 * Note: on both UP and SMP curthread must be set non-NULL
	 * early in the boot sequence because the system assumes
	 * that 'curthread' is never NULL.
	 */
	gd->mi.gd_curthread = &thread0;
	thread0.td_gd = &gd->mi;
	atdevbase = ISA_HOLE_START + PTOV_OFFSET;
	metadata_missing = 0;
	if (bootinfo.bi_modulep) {
		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}
	if (bootinfo.bi_envp)
		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
	preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
	preload_bootstrap_relocate(PTOV_OFFSET);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
	if (boothowto & RB_VERBOSE)
		bootverbose++;
	/*
	 * Default MachIntrABI to ICU
	 */
	MachIntrABI = MachIntrABI_ICU;
	/*
	 * start with one cpu.  Note: with one cpu, ncpus2_shift, ncpus2_mask,
	 * and ncpus_fit_mask remain 0.
	 */
	ncpus = 1;
	ncpus2 = 1;
	ncpus_fit = 1;
	/* Init basic tunables, hz etc */
	init_param1();
	/*
	 * make gdt memory segments
	 */
	gdt_segs[GPROC0_SEL].ssd_base =
	    (uintptr_t) &CPU_prvspace[0]->mdglobaldata.gd_common_tss;

	gd->mi.gd_prvspace = CPU_prvspace[0];

	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
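	/*
	 * (In long mode the TSS descriptor is 16 bytes and occupies two
	 * 8-byte GDT slots, which is why the loop above skips GPROC0_SEL
	 * and GPROC0_SEL + 1 and ssdtosyssd() is used for the TSS entry.)
	 */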
	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
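	/*
	 * (While in the kernel, GSBASE points at the per-cpu globaldata;
	 * MSR_KGSBASE holds the user value that the swapgs instruction
	 * exchanges in on kernel entry/exit.)
	 */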
	mi_gdinit(&gd->mi, 0);
	cpu_gdinit(gd, 0);
	proc0paddr = proc0paddr_buff;
	mi_proc0init(&gd->mi, proc0paddr);
	safepri = TDPRI_MAX;
	/* spinlocks and the BGL */
	init_locks();

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt_global(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
	setidt_global(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
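	/*
	 * (The final setidt_global() argument is the IST index: 0 runs
	 * the handler on the current kernel stack, while NMI and
	 * double-fault use IST1 so they always get a known-good stack.)
	 */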
	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
		r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
	}

	lidt(&r_idt_arr[0]);
	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

	if (metadata_missing)
		kprintf("WARNING: loader(8) metadata is missing!\n");
	/*
	 * Initialize IRQ mapping
	 *
	 * NOTE:
	 * SHOULD be after elcr_probe()
	 */
	MachIntrABI_ICU.initmap();
	MachIntrABI_IOAPIC.initmap();
	if (boothowto & RB_KDB)
		Debugger("Boot flags requested debugger");
#if JG
	finishidentcpu();	/* Final stage of CPU initialization */
	setidt(6, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(13, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
#endif
	identify_cpu();		/* Final stage of CPU initialization */
	initializecpu(0);	/* Initialize CPU registers */
	/*
	 * On modern Intel cpus, Haswell or later, cpu_idle_hlt=1 is better
	 * because the cpu does significant power management in MWAIT
	 * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
	 *
	 * On modern AMD cpus cpu_idle_hlt=3 is better, because the cpu does
	 * significant power management in HLT or ACPI (but cpu_idle_hlt=1
	 * would try to use MWAIT).
	 *
	 * On older AMD or Intel cpus, cpu_idle_hlt=2 is better because ACPI
	 * is needed to reduce power consumption, but wakeup times are often
	 * longer.
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    CPUID_TO_MODEL(cpu_id) >= 0x3C) {	/* Haswell or later */
		cpu_idle_hlt = 1;
	}
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    CPUID_TO_FAMILY(cpu_id) >= 0x14) {	/* Bobcat or later */
		cpu_idle_hlt = 3;
	}
	TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
	TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
	TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
	TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);
	/*
	 * Some of the virtual machines do not work w/ I/O APIC
	 * enabled.  If the user does not explicitly enable or
	 * disable the I/O APIC (ioapic_enable < 0), then we
	 * disable I/O APIC on all virtual machines.
	 *
	 * NOTE:
	 * This must be done after identify_cpu(), which sets
	 * 'cpu_feature2'.
	 */
	if (ioapic_enable < 0) {
		if (cpu_feature2 & CPUID2_VMM)
			ioapic_enable = 0;
		else
			ioapic_enable = 1;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	gd->gd_common_tss.tss_rsp0 =
	    (register_t)(thread0.td_kstack +
			 KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb));
	/* Ensure the stack is aligned to 16 bytes */
	gd->gd_common_tss.tss_rsp0 &= ~(register_t)0xF;
	/* double fault stack */
	gd->gd_common_tss.tss_ist1 =
	    (long)&gd->mi.gd_prvspace->idlestack[
		sizeof(gd->mi.gd_prvspace->idlestack)];
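	/*
	 * (The IST1 pointer is one element past the end of idlestack[];
	 * x86 stacks grow down, so this is the initial top of stack for
	 * the double-fault handler.)
	 */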
	/* Set the IO permission bitmap (empty due to tss seg limit) */
	gd->gd_common_tss.tss_iobase = sizeof(struct x86_64tss);
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gd->gd_tss_gdt = &gdt[GPROC0_SEL];
	gd->gd_common_tssd = *gd->gd_tss_gdt;
	ltr(gsel_tss);
	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL);
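	/*
	 * (MSR_STAR bits 47:32 hold the kernel CS/SS selector base used
	 * by SYSCALL and bits 63:48 the user selector base used by
	 * SYSRET; MSR_SF_MASK lists the rflags bits cleared on SYSCALL
	 * entry, e.g. PSL_I so the handler starts with interrupts
	 * disabled.)
	 */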
	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */
	/* Map the message buffer. */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);

	msgbufinit(msgbufp, MSGBUF_SIZE);
	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_pcb->pcb_cr3 = KPML4phys;
	thread0.td_pcb->pcb_ext = NULL;
	lwp0.lwp_md.md_regs = &proc0_tf;	/* XXX needed? */

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}
/*
 * Initialize machine-dependent portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
 */
void
cpu_gdinit(struct mdglobaldata *gd, int cpu)
{
	if (cpu)
		gd->mi.gd_curthread = &gd->mi.gd_idlethread;

	lwkt_init_thread(&gd->mi.gd_idlethread,
			 gd->mi.gd_prvspace->idlestack,
			 sizeof(gd->mi.gd_prvspace->idlestack),
			 0, &gd->mi);
	lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
	gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
	gd->mi.gd_idlethread.td_sp -= sizeof(void *);
	*(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
}
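/*
 * (Seeding td_sp with cpu_idle_restore gives the idle thread a fake
 * "return address": the first cpu_lwkt_switch() into the thread pops
 * that word and effectively returns into cpu_idle_restore, which
 * bootstraps the idle loop.)
 */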
/*
 * We only have to check for DMAP bounds, the globaldata space is
 * actually part of the kernel_map so we don't have to waste time
 * checking CPU_prvspace[*].
 */
int
is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
{
#if 0
	if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
	    eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
		return (TRUE);
	}
#endif
	if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
		return (TRUE);
	return (FALSE);
}
struct globaldata *
globaldata_find(int cpu)
{
	KKASSERT(cpu >= 0 && cpu < ncpus);
	return(&CPU_prvspace[cpu]->mdglobaldata.mi);
}
/*
 * This path should be safe from the SYSRET issue because only stopped
 * threads can have their %rip adjusted this way (and all heavy weight
 * thread switches clear QUICKREF and thus do not use SYSRET).  However,
 * the code path is convoluted so add a safety by forcing %rip to be
 * canonical.
 */
int
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
	if (addr & 0x0000800000000000LLU)
		lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
	else
		lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
	return (0);
}
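/*
 * (A canonical x86_64 address replicates bit 47 through bits 63:48,
 * which is exactly what the sign-extension above enforces.)
 */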
int
ptrace_single_step(struct lwp *lp)
{
	lp->lwp_md.md_regs->tf_rflags |= PSL_T;
	return (0);
}
int
fill_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	if ((tp = lp->lwp_md.md_regs) == NULL)
		return EINVAL;
	bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
	return (0);
}
int
set_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
	return (0);
}

static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}

static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}
int
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
	if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
		return EINVAL;
#ifndef CPU_DISABLE_SSE
	fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
			(struct save87 *)fpregs);
	return (0);
#else
	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
	return (0);
#endif
}
int
set_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
#ifndef CPU_DISABLE_SSE
	set_fpregs_xmm((struct save87 *)fpregs,
		       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
	return (0);
#else
	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
	return (0);
#endif
}
int
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (lp == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[4] = rdr4();
		dbregs->dr[5] = rdr5();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
		return (0);
	}
	if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL)
		return EINVAL;
	dbregs->dr[0] = pcb->pcb_dr0;
	dbregs->dr[1] = pcb->pcb_dr1;
	dbregs->dr[2] = pcb->pcb_dr2;
	dbregs->dr[3] = pcb->pcb_dr3;
	dbregs->dr[4] = 0;
	dbregs->dr[5] = 0;
	dbregs->dr[6] = pcb->pcb_dr6;
	dbregs->dr[7] = pcb->pcb_dr7;
	return (0);
}
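/*
 * (DR4 and DR5 are reserved aliases of DR6/DR7 when CR4.DE is clear;
 * they are not tracked in the pcb, hence the zeros above.)
 */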
int
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	if (lp == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr4(dbregs->dr[4]);
		load_dr5(dbregs->dr[5]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		struct pcb *pcb;
		struct ucred *ucred;
		int i;
		uint64_t mask1, mask2;

		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected crash or reboot.
		 */
		/* JG this loop looks unreadable */
		/*
		 * Check 4 2-bit fields for invalid patterns.
		 * These fields are R/Wi, for i = 0..3
		 */
		/* Is 10 in LENi allowed when running in compatibility mode? */
		/*
		 * Pattern 10 in R/Wi might be used to indicate
		 * breakpoint on I/O.  Further analysis should be
		 * carried out to decide if it is safe and useful to
		 * provide access to that capability.
		 */
		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
		     i++, mask1 <<= 4, mask2 <<= 4)
			if ((dbregs->dr[7] & mask1) == mask2)
				return (EINVAL);
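		/*
		 * (dr7 bits 16-31 hold a 2-bit R/W field followed by a
		 * 2-bit LEN field for each of the four breakpoints;
		 * mask1/mask2 walk the R/W fields in steps of 4 bits and
		 * reject the pattern 10, which selects I/O breakpoints
		 * and is only defined when CR4.DE is set.)
		 */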
		pcb = lp->lwp_thread->td_pcb;
		ucred = lp->lwp_proc->p_ucred;

		/*
		 * Don't let a process set a breakpoint that is not within
		 * the process's address space.  If a process could do this,
		 * it could halt the system by setting a breakpoint in the
		 * kernel (if ddb was enabled).  Thus, we need to check to
		 * make sure that no breakpoints are being enabled for
		 * addresses outside the process's address space, unless,
		 * perhaps, we were called by ddb?
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */
		if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
			if (dbregs->dr[7] & 0x3) {
				/* dr0 is enabled */
				if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<2)) {
				/* dr1 is enabled */
				if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<4)) {
				/* dr2 is enabled */
				if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<6)) {
				/* dr3 is enabled */
				if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		pcb->pcb_flags |= PCB_DBREGS;
	}

	return (0);
}
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int64_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0xff) == 0) {
		/*
		 * all GE and LE bits in the dr7 register are zero,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers
		 */
		return 0;
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0xf;

	if (bp == 0) {
		/*
		 * None of the breakpoint bits are set meaning this
		 * trap was not caused by any of the debug registers
		 */
		return 0;
	}

	/*
	 * at least one of the breakpoints was hit, check to see
	 * which ones and if any of them are user space addresses
	 */
	if (bp & 0x01)
		addr[nbp++] = (caddr_t)rdr0();
	if (bp & 0x02)
		addr[nbp++] = (caddr_t)rdr1();
	if (bp & 0x04)
		addr[nbp++] = (caddr_t)rdr2();
	if (bp & 0x08)
		addr[nbp++] = (caddr_t)rdr3();

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}
void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}
#ifdef DDB

/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called inside DDB.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */

#undef inb
#undef outb
/* silence compiler warnings */
u_char inb(u_int);
void outb(u_int, u_char);

u_char
inb(u_int port)
{
	u_char	data;
	/*
	 * We use %%dx and not %1 here because i/o is done at %dx and not at
	 * %edx, while gcc generates inferior code (movw instead of movl)
	 * if we tell it to load (u_short) port.
	 */
	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
	return (data);
}
void
outb(u_int port, u_char data)
{
	u_char	al;
	/*
	 * Use an unnecessary assignment to help gcc's register allocator.
	 * This makes a large difference for gcc-1.40 and a tiny difference
	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
	 * best results.  gcc-2.6.0 can't handle this.
	 */
	al = data;
	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
}

#endif /* DDB */
/*
 * initialize all the SMP locks
 */

/* critical region when masking or unmasking interrupts */
struct spinlock_deprecated imen_spinlock;

/* lock region used by kernel profiling */
struct spinlock_deprecated mcount_spinlock;

/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct spinlock_deprecated com_spinlock;

/* lock regions around the clock hardware */
struct spinlock_deprecated clock_spinlock;
static void
init_locks(void)
{
	/*
	 * Get the initial mplock with a count of 1 for the BSP.
	 * This uses a LOGICAL cpu ID, ie BSP == 0.
	 */
	cpu_get_initial_mplock();

	/* DEPRECATED */
	spin_init_deprecated(&mcount_spinlock);
	spin_init_deprecated(&imen_spinlock);
	spin_init_deprecated(&com_spinlock);
	spin_init_deprecated(&clock_spinlock);

	/* our token pool needs to work early */
	lwkt_token_pool_init();
}
boolean_t
cpu_mwait_hint_valid(uint32_t hint)
{
	int cx_idx, sub;

	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= CPU_MWAIT_CX_MAX)
		return FALSE;

	sub = MWAIT_EAX_TO_CX_SUB(hint);
	if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return FALSE;

	return TRUE;
}
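/*
 * (The hint follows the MWAIT convention: the upper nibble of EAX
 * selects the target C-state and the lower nibble the sub-state,
 * which is what the MWAIT_EAX_TO_CX* macros decode.)
 */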
void
cpu_mwait_cx_no_bmsts(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
}

void
cpu_mwait_cx_no_bmarb(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
}
static int
cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
{
	int old_cx_idx, sub = 0;

	if (hint >= 0) {
		old_cx_idx = MWAIT_EAX_TO_CX(hint);
		sub = MWAIT_EAX_TO_CX_SUB(hint);
	} else if (hint == CPU_MWAIT_HINT_AUTO) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
	} else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
	} else {
		old_cx_idx = CPU_MWAIT_CX_MAX;
	}

	if (!CPU_MWAIT_HAS_CX)
		strlcpy(name, "NONE", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
		strlcpy(name, "AUTO", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
		strlcpy(name, "AUTODEEP", namelen);
	else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
	    sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
		strlcpy(name, "INVALID", namelen);
	else
		ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);

	return old_cx_idx;
}
static int
cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
{
	int cx_idx, sub, hint;
	char *ptr, *start;

	if (allow_auto && strcmp(name, "AUTO") == 0) {
		hint = CPU_MWAIT_HINT_AUTO;
		cx_idx = CPU_MWAIT_C2;
		goto done;
	}
	if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
		hint = CPU_MWAIT_HINT_AUTODEEP;
		cx_idx = CPU_MWAIT_C3;
		goto done;
	}

	if (strlen(name) < 4 || toupper(name[0]) != 'C')
		return -1;
	start = &name[1];
	ptr = NULL;

	cx_idx = strtol(start, &ptr, 10);
	if (ptr == start || *ptr != '/')
		return -1;
	if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
		return -1;

	start = ptr + 1;
	ptr = NULL;

	sub = strtol(start, &ptr, 10);
	if (ptr == start || *ptr != '\0')
		return -1;
	if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return -1;

	hint = MWAIT_EAX_HINT(cx_idx, sub);
done:
	*hint0 = hint;
	return cx_idx;
}
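/*
 * (Accepted names are "AUTO"/"AUTODEEP" when allow_auto is TRUE, or
 * "C<idx>/<sub>", e.g. "C1/0"; on success the eax hint is returned
 * through hint0 and the C-state index is the return value, -1 on
 * parse failure.)
 */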
static int
cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
{
	if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)
		return EOPNOTSUPP;
	if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
		int error;

		error = cputimer_intr_powersave_addreq();
		if (error)
			return error;
	} else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
		cputimer_intr_powersave_remreq();
	}
	return 0;
}
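/*
 * (Deep C-states can stop the per-cpu LAPIC timer, so entering C3 or
 * deeper first requests an always-running interrupt source from the
 * cputimer code; dropping back below C3 releases that request.)
 */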
static int
cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
    boolean_t allow_auto)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	hint = *hint0;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
	    allow_auto);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error)
		return error;

	*hint0 = hint;
	return 0;
}
static int
cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension"));

	hint = stat->hint;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	strlcpy(name, cx_name, sizeof(name));
	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error)
		return error;

	stat->hint = hint;
	return 0;
}
static int
cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	int hint = cpu_mwait_halt_global;
	int error, cx_idx, cpu;
	char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];

	cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	/* Save name for later per-cpu CX configuration */
	strlcpy(cx_name, name, sizeof(cx_name));

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	/* Change per-cpu CX configuration */
	for (cpu = 0; cpu < ncpus; ++cpu) {
		error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
		if (error)
			return error;
	}

	cpu_mwait_halt_global = hint;
	return 0;
}
static int
cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct cpu_idle_stat *stat = arg1;
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &stat->hint, TRUE);
	return error;
}

static int
cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &cpu_mwait_spin, FALSE);
	return error;
}
/*
 * This manual debugging code is called unconditionally from Xtimer
 * (the per-cpu timer interrupt, whether the current thread is in a
 * critical section or not) and can be useful in tracking down lockups.
 *
 * NOTE: MANUAL DEBUG CODE
 */
static int saveticks[SMP_MAXCPU];
static int savecounts[SMP_MAXCPU];
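/*
 * (The gptr writes below poke a per-cpu status line directly into the
 * VGA text buffer: 0xFFFFFFFF800b8000 is the kernel mapping of physical
 * 0xb8000, each row is 80 two-byte cells, and 0x0700 is the grey-on-
 * black attribute byte.)
 */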
void
pcpu_timer_always(struct intrframe *frame)
{
	globaldata_t gd = mycpu;
	int cpu = gd->gd_cpuid;
	char buf[64];
	short *gptr;
	int i;

	if (cpu <= 20) {
		gptr = (short *)0xFFFFFFFF800b8000 + 80 * cpu;
		*gptr = ((*gptr + 1) & 0x00FF) | 0x0700;
		++gptr;

		ksnprintf(buf, sizeof(buf), " %p %16s %d %16s ",
		    (void *)frame->if_rip, gd->gd_curthread->td_comm,
		    ticks, "");
		for (i = 0; buf[i]; ++i) {
			gptr[i] = 0x0700 | (unsigned char)buf[i];
		}
	}

	if (saveticks[gd->gd_cpuid] != ticks) {
		saveticks[gd->gd_cpuid] = ticks;
		savecounts[gd->gd_cpuid] = 0;
	}
	++savecounts[gd->gd_cpuid];
	if (savecounts[gd->gd_cpuid] > 2000 && panicstr == NULL) {
		panic("cpu %d panicking on ticks failure",
		    gd->gd_cpuid);
	}
	for (i = 0; i < ncpus; ++i) {
		int delta;

		if (saveticks[i] && panicstr == NULL) {
			delta = saveticks[i] - ticks;
			if (delta < -10 || delta > 10) {
				panic("cpu %d panicking on cpu %d watchdog",
				    gd->gd_cpuid, i);