/*
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 2008 The DragonFly Project.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
 */
//#include "use_npx.h"
#include "opt_directio.h"
#include "opt_msgbuf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/reboot.h>
#include <sys/msgbuf.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/usched.h>
#include <sys/ctype.h>
#include <sys/serialize.h>
#include <sys/systimer.h>

#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>
#include <sys/mutex2.h>

#include <machine/cpu.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#include <machine/bootinfo.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb_ext.h>		/* pcb.h included via sys/user.h */
#include <machine/globaldata.h>		/* CPU_prvspace */
#include <machine/smp.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <machine/framebuffer.h>

#include <bus/isa/isa_device.h>
#include <machine_base/isa/isa_intr.h>
#include <bus/isa/rtc.h>
#include <sys/random.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>

#include <sys/machintr.h>
#include <machine_base/icu/icu_abi.h>
#include <machine_base/icu/elcr_var.h>
#include <machine_base/apic/lapic.h>
#include <machine_base/apic/ioapic.h>
#include <machine_base/apic/ioapic_abi.h>
#include <machine/mptable.h>
#define PHYSMAP_ENTRIES		10

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

extern void printcpuinfo(void);	/* XXX header file */
extern void identify_cpu(void);
extern void finishidentcpu(void);
extern void panicifcpuunsupported(void);

static void cpu_startup(void *);
static void pic_finish(void *);
static void cpu_finish(void *);

static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);

#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */

static void init_locks(void);

extern void pcpu_timer_always(struct intrframe *);
SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL);
SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL);
#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif

struct privatespace CPU_prvspace_bsp __aligned(4096);
struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp };

vm_paddr_t efi_systbl_phys;
int	_udatasel, _ucodesel, _ucode32sel;

int64_t tsc_offsets[MAXCPU];
cpumask_t smp_idleinvl_mask;
cpumask_t smp_idleinvl_reqs;

static int cpu_mwait_halt_global; /* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */
#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
    CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
    CTLFLAG_RD, &tlb_flush_count, 0, "");
#endif
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt,
    CTLFLAG_RD, &cpu_mwait_halt_global, 0, "");
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin, CTLFLAG_RD, &cpu_mwait_spin, 0,
    "monitor/mwait target state");
#define CPU_MWAIT_HAS_CX	\
	((cpu_feature2 & CPUID2_MON) && \
	 (cpu_mwait_feature & CPUID_MWAIT_EXT))

#define CPU_MWAIT_CX_NAMELEN	16

#define CPU_MWAIT_C1		1
#define CPU_MWAIT_C2		2
#define CPU_MWAIT_C3		3
#define CPU_MWAIT_CX_MAX	8

#define CPU_MWAIT_HINT_AUTO	-1	/* C1 and C2 */
#define CPU_MWAIT_HINT_AUTODEEP	-2	/* C3+ */
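/*
 * Added note (not in the original source): an MWAIT hint is the small
 * integer passed in EAX.  On Intel hardware the high nibble selects the
 * target C-state (0 => C1, 1 => C2, ...) and the low nibble the sub-state,
 * so MWAIT_EAX_HINT(cx, sub) is expected to compute roughly
 * (((cx) - 1) << 4) | (sub), e.g. C2 sub-state 0 -> 0x10.  The
 * authoritative macro definitions live in the machine headers.
 */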
SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features");
SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings");
struct cpu_mwait_cx {
	int			subcnt;
	char			name[4];
	struct sysctl_ctx_list	sysctl_ctx;
	struct sysctl_oid	*sysctl_tree;
};
static struct cpu_mwait_cx	cpu_mwait_cx_info[CPU_MWAIT_CX_MAX];
static char			cpu_mwait_cx_supported[256];

static int			cpu_mwait_c1_hints_cnt;
static int			cpu_mwait_hints_cnt;
static int			*cpu_mwait_hints;

static int			cpu_mwait_deep_hints_cnt;
static int			*cpu_mwait_deep_hints;
#define CPU_IDLE_REPEAT_DEFAULT	750

static u_int cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT;
static u_long cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT;
static u_int cpu_mwait_repeat_shift = 1;

#define CPU_MWAIT_C3_PREAMBLE_BM_ARB	0x1
#define CPU_MWAIT_C3_PREAMBLE_BM_STS	0x2

static int cpu_mwait_c3_preamble =
    CPU_MWAIT_C3_PREAMBLE_BM_ARB |
    CPU_MWAIT_C3_PREAMBLE_BM_STS;
SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD,
    cpu_mwait_cx_supported, 0, "MWAIT supported C states");
SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD,
    &cpu_mwait_c3_preamble, 0, "C3+ preamble mask");

static int	cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,
		    int *, int);
static int	cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS);

SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, cpu_mwait_cx_idle_sysctl, "A", "");
SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, cpu_mwait_cx_spin_sysctl, "A", "");
SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW,
    &cpu_mwait_repeat_shift, 0, "");
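/*
 * Added usage note (not in the original source): these knobs are driven
 * from userland, for example:
 *
 *	sysctl machdep.mwait.CX.supported	(list, e.g. "C1/0 C2/0 C3/0")
 *	sysctl machdep.mwait.CX.idle=AUTODEEP	(allow C3+ in the idle loop)
 *
 * The exact strings accepted are decided by cpu_mwait_cx_select_sysctl();
 * the values shown here are illustrative only.
 */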
u_long ebda_addr = 0;

int imcr_present = 0;

int naps = 0;			/* # of Applications processors */

static struct mtx dt_lock;	/* lock for GDT and LDT */
static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
	u_long pmem = ctob(physmem);
	int error = sysctl_handle_long(oidp, &pmem, 0, req);

	return (error);
}

SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
    0, 0, sysctl_hw_physmem, "LU",
    "Total system memory in bytes (number of pages * page size)");
static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
	    ctob(physmem - vmstats.v_wire_count), req);

	return (error);
}

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD,
    0, 0, sysctl_hw_usermem, "IU", "");
static int
sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
	    x86_64_btop(avail_end - avail_start), req);

	return (error);
}

SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD,
    0, 0, sysctl_hw_availpages, "I", "");
/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
vm_phystable_t phys_avail[VM_PHYSSEG_MAX + 1];
vm_phystable_t dump_avail[VM_PHYSSEG_MAX + 1];

/* must be 1 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END	(NELEM(phys_avail) - 1)
#define DUMP_AVAIL_ARRAY_END	(NELEM(dump_avail) - 1)
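/*
 * Added note (not in the original source): keeping the usable range one
 * entry short guarantees a terminating {0, 0} pair survives at the end of
 * phys_avail[]/dump_avail[], which is what consumers such as the chunk
 * loop in cpu_startup() key on (phys_avail[indx].phys_end != 0).
 */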
static vm_offset_t buffer_sva, buffer_eva;
vm_offset_t clean_sva, clean_eva;
static vm_offset_t pager_sva, pager_eva;
static struct trapframe proc0_tf;
static void
cpu_startup(void *dummy)
{
	caddr_t v;
	vm_size_t size = 0;
	vm_offset_t firstaddr;

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	kprintf("%s", version);
	startrtclock();
	printcpuinfo();
	panicifcpuunsupported();
	kprintf("real memory = %ju (%ju MB)\n",
		(intmax_t)Realmem,
		(intmax_t)Realmem / 1024 / 1024);
	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		kprintf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx].phys_end != 0; ++indx) {
			vm_paddr_t size1;

			size1 = phys_avail[indx].phys_end -
				phys_avail[indx].phys_beg;

			kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
				(intmax_t)phys_avail[indx].phys_beg,
				(intmax_t)phys_avail[indx].phys_end - 1,
				(intmax_t)size1,
				(intmax_t)(size1 / PAGE_SIZE));
		}
	}
	/*
	 * Allocate space for system data structures.
	 * The first available kernel virtual address is in "v".
	 * As pages of kernel virtual memory are allocated, "v" is incremented.
	 * As pages of memory are allocated and cleared,
	 * "firstaddr" is incremented.
	 * An index into the kernel page table corresponding to the
	 * virtual memory address maintained in "v" is kept in "mapaddr".
	 */

	/*
	 * Make two passes.  The first pass calculates how much memory is
	 * needed and allocates it.  The second pass assigns virtual
	 * addresses to the various data structures.
	 */
	firstaddr = 0;
again:
	v = (caddr_t)firstaddr;

#define	valloc(name, type, num) \
	    (name) = (type *)v; v = (caddr_t)((name)+(num))
#define	valloclim(name, type, num, lim) \
	    (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
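/*
 * Added note (not in the original source): valloc() performs no real
 * allocation, it only advances the cursor "v".  For example,
 *
 *	valloc(buf, struct buf, nbuf);
 *
 * expands to "buf = (struct buf *)v; v = (caddr_t)(buf + nbuf);", so the
 * first pass (v biased from firstaddr == 0) merely measures the space
 * needed and the second pass hands out the real addresses.
 */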
	/*
	 * The nominal buffer size (and minimum KVA allocation) is MAXBSIZE.
	 * For the first 64MB of ram nominally allocate sufficient buffers to
	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
	 * buffers to cover 1/20 of our ram over 64MB.  When auto-sizing
	 * the buffer cache we limit the eventual kva reservation to
	 * maxbcache bytes.
	 *
	 * factor represents the 1/4 x ram conversion.
	 */
	if (nbuf == 0) {
		long factor = 4 * NBUFCALCSIZE / 1024;
		long kbytes = physmem * (PAGE_SIZE / 1024);

		nbuf = 50;
		if (kbytes > 4096)
			nbuf += min((kbytes - 4096) / factor, 65536 / factor);
		if (kbytes > 65536)
			nbuf += (kbytes - 65536) * 2 / (factor * 5);
		if (maxbcache && nbuf > maxbcache / NBUFCALCSIZE)
			nbuf = maxbcache / NBUFCALCSIZE;
	}
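	/*
	 * Added worked example (not in the original source), assuming
	 * NBUFCALCSIZE is 16384: factor = 4 * 16384 / 1024 = 64, so a
	 * machine with 256MB of ram (kbytes = 262144) gets
	 *
	 *	nbuf += min((262144 - 4096) / 64, 65536 / 64)	-> +1024
	 *	nbuf += (262144 - 65536) * 2 / (64 * 5)		-> +1228
	 *
	 * i.e. roughly 1/4 of the first 64MB and 1/20 of the rest, before
	 * the kva and physmem caps below trim it.
	 */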
	/*
	 * Do not allow the buffer_map to be more than 1/2 the size of the
	 * kernel_map.
	 */
	if (nbuf > (virtual_end - virtual_start +
		    virtual2_end - virtual2_start) / (MAXBSIZE * 2)) {
		nbuf = (virtual_end - virtual_start +
			virtual2_end - virtual2_start) / (MAXBSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf);
	}

	/*
	 * Do not allow the buffer_map to use more than 50% of available
	 * physical-equivalent memory.  Since the VM pages which back
	 * individual buffers are typically wired, having too many bufs
	 * can prevent the system from paging properly.
	 */
	if (nbuf > physmem * PAGE_SIZE / (NBUFCALCSIZE * 2)) {
		nbuf = physmem * PAGE_SIZE / (NBUFCALCSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf);
	}

	/*
	 * Do not allow the sizeof(struct buf) * nbuf to exceed half of
	 * the valloc space which is just the virtual_end - virtual_start
	 * section.  We use valloc() to allocate the buf header array.
	 */
	if (nbuf > (virtual_end - virtual_start) / sizeof(struct buf) / 2) {
		nbuf = (virtual_end - virtual_start) /
		       sizeof(struct buf) / 2;
		kprintf("Warning: nbufs capped at %ld due to valloc "
			"considerations\n", nbuf);
	}
	nswbuf_mem = lmax(lmin(nbuf / 32, 512), 8);
#ifdef NSWBUF_MIN
	if (nswbuf_mem < NSWBUF_MIN)
		nswbuf_mem = NSWBUF_MIN;
#endif
	nswbuf_kva = lmax(lmin(nbuf / 4, 512), 16);
#ifdef NSWBUF_MIN
	if (nswbuf_kva < NSWBUF_MIN)
		nswbuf_kva = NSWBUF_MIN;
#endif

	valloc(swbuf_mem, struct buf, nswbuf_mem);
	valloc(swbuf_kva, struct buf, nswbuf_kva);
	valloc(buf, struct buf, nbuf);
	/*
	 * End of first pass, size has been calculated so allocate memory
	 */
	if (firstaddr == 0) {
		size = (vm_size_t)(v - firstaddr);
		firstaddr = kmem_alloc(&kernel_map, round_page(size),
				       VM_SUBSYS_BUF);
		if (firstaddr == 0)
			panic("startup: no room for tables");
		goto again;
	}

	/*
	 * End of second pass, addresses have been assigned
	 *
	 * nbuf is an int, make sure we don't overflow the field.
	 *
	 * On 64-bit systems we always reserve maximal allocations for
	 * buffer cache buffers and there are no fragmentation issues,
	 * so the KVA segment does not have to be excessively oversized.
	 */
	if ((vm_size_t)(v - firstaddr) != size)
		panic("startup: table size inconsistency");
	kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE) +
		      ((nswbuf_mem + nswbuf_kva) * MAXPHYS) + pager_map_size);
	kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE));
	buffer_map.system_map = 1;
	kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva,
		      ((vm_offset_t)(nswbuf_mem + nswbuf_kva) * MAXPHYS) +
		      pager_map_size);
	pager_map.system_map = 1;
	kprintf("avail memory = %ju (%ju MB)\n",
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages),
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages) /
		(1024 * 1024));
}
struct cpu_idle_stat {
	int	hint;
	int	reserved;
	u_long	halt;
	u_long	spin;
	u_long	repeat;
	u_long	repeat_last;
	u_long	repeat_delta;
	u_long	mwait_cx[CPU_MWAIT_CX_MAX];
} __cachealign;

#define CPU_IDLE_STAT_HALT	-1
#define CPU_IDLE_STAT_SPIN	-2

static struct cpu_idle_stat	cpu_idle_stats[MAXCPU];
static int
sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)
{
	int idx = arg2, cpu, error;
	u_long val = 0;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].halt;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].spin;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].mwait_cx[idx];
	}

	error = sysctl_handle_quad(oidp, &val, 0, req);
	if (error || req->newptr == NULL)
		return error;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].halt = 0;
		cpu_idle_stats[0].halt = val;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].spin = 0;
		cpu_idle_stats[0].spin = val;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].mwait_cx[idx] = 0;
		cpu_idle_stats[0].mwait_cx[idx] = val;
	}
	return 0;
}
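/*
 * Added usage note (not in the original source): one handler serves every
 * counter, selected by arg2.  A read returns the sum over all cpus; a
 * write zeroes each per-cpu slot and deposits the written value in cpu 0's
 * slot, e.g.:
 *
 *	sysctl machdep.cpu_idle_hltcnt		(read total halts)
 *	sysctl machdep.cpu_idle_hltcnt=0	(reset the counter)
 */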
static void
cpu_mwait_attach(void)
{
	struct sbuf sb;
	int hint_idx, i;

	if (!CPU_MWAIT_HAS_CX)
		return;

	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    (CPUID_TO_FAMILY(cpu_id) > 0xf ||
	     (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
	      CPUID_TO_MODEL(cpu_id) >= 0xf))) {
		int bm_sts = 1;

		/*
		 * Pentium dual-core, Core 2 and beyond do not need any
		 * additional activities to enter deep C-state, i.e. C3(+).
		 */
		cpu_mwait_cx_no_bmarb();

		TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts);
		if (!bm_sts)
			cpu_mwait_cx_no_bmsts();
	}

	sbuf_new(&sb, cpu_mwait_cx_supported,
	    sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN);

	for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) {
		struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i];
		int sub;

		ksnprintf(cx->name, sizeof(cx->name), "C%d", i);

		sysctl_ctx_init(&cx->sysctl_ctx);
		cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO,
		    cx->name, CTLFLAG_RW, NULL, "Cx control/info");
		if (cx->sysctl_tree == NULL)
			continue;

		cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i);
		SYSCTL_ADD_INT(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "subcnt", CTLFLAG_RD, &cx->subcnt, 0,
		    "sub-state count");
		SYSCTL_ADD_PROC(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0,
		    i, sysctl_cpu_idle_cnt, "Q", "# of times entered");

		for (sub = 0; sub < cx->subcnt; ++sub)
			sbuf_printf(&sb, "C%d/%d ", i, sub);
	}
	sbuf_trim(&sb);
	sbuf_finish(&sb);
	cpu_mwait_c1_hints_cnt = cpu_mwait_cx_info[CPU_MWAIT_C1].subcnt;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i)
		cpu_mwait_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_hints = kmalloc(sizeof(int) * cpu_mwait_hints_cnt,
	    M_DEVBUF, M_WAITOK);

	hint_idx = 0;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) {
		int j, subcnt;

		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_hints_cnt,
			    ("invalid mwait hint index %d", hint_idx));
			cpu_mwait_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
			++hint_idx;
		}
	}
	KASSERT(hint_idx == cpu_mwait_hints_cnt,
	    ("mwait hint count %d != index %d",
	     cpu_mwait_hints_cnt, hint_idx));

	if (bootverbose) {
		kprintf("MWAIT hints (%d C1 hints):\n",
		    cpu_mwait_c1_hints_cnt);
		for (i = 0; i < cpu_mwait_hints_cnt; ++i) {
			int hint = cpu_mwait_hints[i];

			kprintf("  C%d/%d hint 0x%04x\n",
			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
			    hint);
		}
	}
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i)
		cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt,
	    M_DEVBUF, M_WAITOK);

	hint_idx = 0;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) {
		int j, subcnt;

		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_deep_hints_cnt,
			    ("invalid mwait deep hint index %d", hint_idx));
			cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
			++hint_idx;
		}
	}
	KASSERT(hint_idx == cpu_mwait_deep_hints_cnt,
	    ("mwait deep hint count %d != index %d",
	     cpu_mwait_deep_hints_cnt, hint_idx));

	if (bootverbose) {
		kprintf("MWAIT deep hints:\n");
		for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) {
			int hint = cpu_mwait_deep_hints[i];

			kprintf("  C%d/%d hint 0x%04x\n",
			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
			    hint);
		}
	}
	cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt;

	for (i = 0; i < ncpus; ++i) {
		char name[16];

		ksnprintf(name, sizeof(name), "idle%d", i);
		SYSCTL_ADD_PROC(NULL,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO,
		    name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i],
		    0, cpu_mwait_cx_pcpu_idle_sysctl, "A", "");
	}
}
static void
cpu_finish(void *dummy __unused)
{
	cpu_setregs();
	cpu_mwait_attach();
}

static void
pic_finish(void *dummy __unused)
{
	/* Log ELCR information */
	elcr_dump();

	/* Log MPTABLE information */
	mptable_pci_int_dump();

	/* Finalize PIC */
	MachIntrABI.finalize();
}
/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	struct sigacts *psp = p->p_sigacts;
	struct sigframe sf, *sfp;
	int oonstack;
	char *sp;

	regs = lp->lwp_md.md_regs;
	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;
	/* Save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = lp->lwp_sigstk;
	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
	KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));

	/* Make the size of the saved context visible to userland */
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);
	/* Allocate and validate space for the signal handler context. */
	if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)(lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
			      sizeof(struct sigframe));
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		/* We take red zone into account */
		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
	}

	/*
	 * XXX AVX needs 64-byte alignment but sigframe has other fields and
	 * the embedded ucontext is not at the front, so aligning this won't
	 * help us.  Fortunately we bcopy in/out of the sigframe, so the
	 * kernel is ok.
	 *
	 * The problem though is if userland winds up trying to use the
	 * context directly.
	 */
	sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF);
	/* Translate the signal if appropriate */
	if (p->p_sysent->sv_sigtbl) {
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
	}
	/*
	 * Build the argument list for the signal handler.
	 *
	 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
	 */
	regs->tf_rdi = sig;				/* argument 1 */
	regs->tf_rdx = (register_t)&sfp->sf_uc;		/* argument 3 */

	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/*
		 * Signal handler installed with SA_SIGINFO.
		 *
		 * action(signo, siginfo, ucontext)
		 */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* fill siginfo structure */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_addr;
	} else {
		/*
		 * Old FreeBSD-style arguments.
		 *
		 * handler (signo, code, [uc], addr)
		 */
		regs->tf_rsi = (register_t)code;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_handler = catcher;
	}
	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
#if JG
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 =
		    &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}
#endif
	/*
	 * Save the FPU state and reinit the FP unit
	 */
	npxpush(&sf.sf_uc.uc_mcontext);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(lp, SIGILL);
	}

	regs->tf_rsp = (register_t)sfp;
	regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);

	/*
	 * i386 abi specifies that the direction flag must be cleared
	 * on function entry
	 */
	regs->tf_rflags &= ~(PSL_T | PSL_D);

	/*
	 * 64 bit mode has a code and stack selector but
	 * no data or extra selector.  %fs and %gs are not
	 * used.
	 */
	regs->tf_cs = _ucodesel;
	regs->tf_ss = _udatasel;
}
/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.  Remove any items that would otherwise create a privilege
 * issue.
 *
 * XXX at the moment we allow userland to set the resume flag.  Is this a
 * bad idea?
 */
int
cpu_sanitize_frame(struct trapframe *frame)
{
	frame->tf_cs = _ucodesel;
	frame->tf_ss = _udatasel;
	/* XXX VM (8086) mode not supported? */
	frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
	frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;

	return(0);
}

/*
 * Sanitize the tls so loading the descriptor does not blow up
 * on us.  For x86_64 we don't have to do anything.
 */
int
cpu_sanitize_tls(struct savetls *tls)
{
	return(0);
}
/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
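/*
 * Added note (not in the original source): EFL_SECURE() passes only when
 * every bit that differs between the user-supplied rflags and the saved
 * rflags lies inside PSL_USERCHANGE.  A handler that flips, say, the
 * carry flag is accepted, while one that tries to raise IOPL makes
 * (ef ^ oef) intersect ~PSL_USERCHANGE and sys_sigreturn() below rejects
 * the frame with EINVAL.
 */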
int
sys_sigreturn(struct sigreturn_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;
	ucontext_t uc;
	ucontext_t *ucp;
	register_t rflags;
	int cs;
	int error;

	/*
	 * We have to copy the information into kernel space so userland
	 * can't modify it while we are sniffing it.
	 */
	regs = lp->lwp_md.md_regs;
	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error)
		return (error);
	ucp = &uc;
	rflags = ucp->uc_mcontext.mc_rflags;

	/* VM (8086) mode not supported */
	rflags &= ~PSL_VM_UNSUPP;
#if JG
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(lp, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = tf->tf_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
		tf->tf_gs = _udatasel;
	} else
#endif
	{
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
		/*
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
			kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
			return(EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			kprintf("sigreturn: cs = 0x%x\n", cs);
			trapsignal(lp, SIGBUS, T_PROTFLT);
			return(EINVAL);
		}
		bcopy(&ucp->uc_mcontext.mc_rdi, regs,
		      sizeof(struct trapframe));
	}
	/*
	 * Restore the FPU state from the frame
	 */
	npxpop(&ucp->uc_mcontext);

	if (ucp->uc_mcontext.mc_onstack & 1)
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	else
		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

	lp->lwp_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(lp->lwp_sigmask);
	return(EJUSTRETURN);
}
/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Shutdown the CPU as much as possible
 */
void
cpu_halt(void)
{
	for (;;)
		__asm__ __volatile("hlt");
}
/*
 * cpu_idle() represents the idle LWKT.  You cannot return from this function
 * (unless you want to blow things up!).  Instead we look for runnable threads
 * and loop or halt as appropriate.  Giant is not held on entry to the thread.
 *
 * The main loop is entered with a critical section held, we must release
 * the critical section before doing anything else.  lwkt_switch() will
 * check for pending interrupts due to entering and exiting its own
 * critical section.
 *
 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
 *	 However, there are cases where the idlethread will be entered with
 *	 the possibility that no IPI will occur and in such cases
 *	 lwkt_switch() sets TDF_IDLE_NOHLT.
 *
 * NOTE: cpu_idle_repeat determines how many entries into the idle thread
 *	 must occur before it starts using ACPI halt.
 *
 * NOTE: Value overridden in hammer_time().
 */
static int cpu_idle_hlt = 2;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW,
    &cpu_idle_repeat, 0, "Idle entries before acpi hlt");

SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts");
SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins");
static void
cpu_idle_default_hook(void)
{
	/*
	 * We must guarantee that hlt is exactly the instruction
	 * following the sti.
	 */
	__asm __volatile("sti; hlt");
}

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;
static __inline int
cpu_mwait_cx_hint(struct cpu_idle_stat *stat)
{
	int hint, cx_idx;
	u_int idx;

	hint = stat->hint;
	if (hint >= 0)
		goto done;

	idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >>
	    cpu_mwait_repeat_shift;
	if (idx >= cpu_mwait_c1_hints_cnt) {
		/* Step up faster, once we walked through all C1 states */
		stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1);
	}
	if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		if (idx >= cpu_mwait_deep_hints_cnt)
			idx = cpu_mwait_deep_hints_cnt - 1;
		hint = cpu_mwait_deep_hints[idx];
	} else {
		if (idx >= cpu_mwait_hints_cnt)
			idx = cpu_mwait_hints_cnt - 1;
		hint = cpu_mwait_hints[idx];
	}
done:
	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX)
		stat->mwait_cx[cx_idx]++;
	return hint;
}
void
cpu_idle(void)
{
	globaldata_t gd = mycpu;
	struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid];
	struct thread *td __debugvar = gd->gd_curthread;
	int reqflags;
	int quick;

	stat->repeat = stat->repeat_last = cpu_idle_repeat_max;

	crit_exit();
	KKASSERT(td->td_critcount == 0);

	for (;;) {
		/*
		 * See if there are any LWKTs ready to go.
		 */
		lwkt_switch();

		/*
		 * When halting inside a cli we must check for reqflags
		 * races, particularly [re]schedule requests.  Running
		 * splz() does the job.
		 *
		 * cpu_idle_hlt:
		 *	0	Never halt, just spin
		 *
		 *	1	Always use HLT (or MONITOR/MWAIT if avail).
		 *
		 *		Better default for modern (Haswell+) Intel
		 *		cpus.
		 *
		 *	2	Use HLT/MONITOR/MWAIT up to a point and then
		 *		use the ACPI halt (default).  This is a hybrid
		 *		approach.  See machdep.cpu_idle_repeat.
		 *
		 *		Better default for modern AMD cpus and older
		 *		Intel cpus.
		 *
		 *	3	Always use the ACPI halt.  This typically
		 *		eats the least amount of power but the cpu
		 *		will be slow waking up.  Slows down e.g.
		 *		compiles and other pipe/event oriented stuff.
		 *
		 * NOTE: Interrupts are enabled and we are not in a critical
		 *	 section.
		 *
		 * NOTE: Preemptions do not reset gd_idle_repeat.  Also we
		 *	 don't bother capping gd_idle_repeat, it is ok if
		 *	 it overflows.
		 *
		 * Implement optimized invltlb operations when halted
		 * in idle.  By setting the bit in smp_idleinvl_mask
		 * we inform other cpus that they can set _reqs to
		 * request an invltlb.  Currently the code to do that
		 * sets the bits in _reqs anyway, but then checks _mask
		 * to determine if they can assume the invltlb will execute.
		 *
		 * A critical section is required to ensure that interrupts
		 * do not fully run until after we've had a chance to execute
		 * the request.
		 */
		if (gd->gd_idle_repeat == 0) {
			stat->repeat = (stat->repeat + stat->repeat_last) >> 1;
			if (stat->repeat > cpu_idle_repeat_max)
				stat->repeat = cpu_idle_repeat_max;
			stat->repeat_last = 0;
			stat->repeat_delta = 0;
		}
		++stat->repeat_last;

		++gd->gd_idle_repeat;
		reqflags = gd->gd_reqflags;
		quick = (cpu_idle_hlt == 1) ||
			(cpu_idle_hlt < 3 &&
			 gd->gd_idle_repeat < cpu_idle_repeat);

		if (quick && (cpu_mi_feature & CPU_MI_MONITOR) &&
		    (reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
			crit_enter_gd(gd);
			ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid);
			cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
					  cpu_mwait_cx_hint(stat), 0);
			stat->halt++;
			ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
					       gd->gd_cpuid);
			if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
						      gd->gd_cpuid)) {
				cpu_invltlb();
				cpu_mfence();
			}
			crit_exit_gd(gd);
		} else if (cpu_idle_hlt) {
			__asm __volatile("cli");
			splz();
			crit_enter_gd(gd);
			ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid);
			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
				if (cpu_idle_hlt == 1)
					cpu_idle_default_hook();
				else
					cpu_idle_hook();
			}
			__asm __volatile("sti");
			stat->halt++;
			ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
					       gd->gd_cpuid);
			if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
						      gd->gd_cpuid)) {
				cpu_invltlb();
				cpu_mfence();
			}
			crit_exit_gd(gd);
		} else {
			splz();
			__asm __volatile("sti");
			stat->spin++;
		}
	}
}
/*
 * Called in a loop indirectly via Xcpustop
 */
void
cpu_smp_stopped(void)
{
	globaldata_t gd = mycpu;
	volatile __uint64_t *ptr;
	__uint64_t ovalue;

	ptr = CPUMASK_ADDR(started_cpus, gd->gd_cpuid);
	ovalue = *ptr;
	if ((ovalue & CPUMASK_SIMPLE(gd->gd_cpuid & 63)) == 0) {
		if (cpu_mi_feature & CPU_MI_MONITOR) {
			cpu_mmw_pause_long(__DEVOLATILE(void *, ptr), ovalue,
					   cpu_mwait_hints[CPU_MWAIT_C1], 0);
		} else {
			cpu_halt();	/* depend on lapic timer */
		}
	}
}
/*
 * This routine is called if a spinlock has been held through the
 * exponential backoff period and is seriously contested.  On a real cpu
 * we let it spin.
 */
void
cpu_spinlock_contested(void)
{
	cpu_pause();
}
/*
 * Clear registers on exec
 */
void
exec_setregs(u_long entry, u_long stack, u_long ps_strings)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct pcb *pcb = td->td_pcb;
	struct trapframe *regs = lp->lwp_md.md_regs;

	/* was i386_user_cleanup() in NetBSD */
	user_ldt_free(pcb);

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = entry;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_rbx = ps_strings;
	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0; /* JG set bit 10? */
		if (pcb == td->td_pcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}
	/*
	 * Initialize the math emulator (if any) for the current process.
	 * Actually, just clear the bit that says that the emulator has
	 * been initialized.  Initialization is delayed until the process
	 * traps to the emulator (if it is done at all) mainly because
	 * emulators don't provide an entry point for initialization.
	 */
	pcb->pcb_flags &= ~FP_SOFTFP;

	/*
	 * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
	 *	 gd_npxthread.  Otherwise a preemptive interrupt thread
	 *	 may panic in npxdna().
	 */
	crit_enter();
	load_cr0(rcr0() | CR0_MP);

	/*
	 * NOTE: The MSR values must be correct so we can return to
	 *	 userland.  gd_user_fs/gs must be correct so the switch
	 *	 code knows what the current MSR values are.
	 */
	pcb->pcb_fsbase = 0;	/* Values loaded from PCB on switch */
	pcb->pcb_gsbase = 0;
	mdcpu->gd_user_fs = 0;	/* Cache of current MSR values */
	mdcpu->gd_user_gs = 0;
	wrmsr(MSR_FSBASE, 0);	/* Set MSR values for return to userland */
	wrmsr(MSR_KGSBASE, 0);

	/* Initialize the npx (if any) for the current process. */
	npxinit();
	crit_exit();

	pcb->pcb_ds = _udatasel;
	pcb->pcb_es = _udatasel;
	pcb->pcb_fs = _udatasel;
	pcb->pcb_gs = _udatasel;
}
void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	cr0 |= CR0_NE;			/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
	cr0 |= CR0_WP | CR0_AM;
	load_cr0(cr0);
}
static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
	    req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
    &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
    CTLFLAG_RW, &disable_rtc_set, 0, "");

SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
    CTLFLAG_RD, &bootinfo, bootinfo, "");

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
    CTLFLAG_RW, &wall_cmos_clock, 0, "");

extern u_long bootdev;		/* not a cdev_t - encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
    CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)");
static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
/*
 * Initialize x86_64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
struct user_segment_descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
struct gate_descriptor idt_arr[MAXCPU][NIDT];

union descriptor ldt[NLDT];		/* local descriptor table */

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt;
struct region_descriptor r_idt_arr[MAXCPU];

/* JG proc0paddr is a virtual address */
void *proc0paddr;
char proc0paddr_buff[LWKT_THREAD_STACK];
/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
	/* GNULL_SEL	0 Null Descriptor */
	{	0x0,			/* segment base address  */
		0x0,			/* length */
		0,			/* segment type */
		0,			/* segment descriptor priority level */
		0,			/* segment descriptor present */
		0,			/* long */
		0,			/* default 32 vs 16 bit size */
		0			/* limit granularity (byte/page units)*/ },
	/* GCODE_SEL	1 Code Descriptor for kernel */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMERA,		/* segment type */
		SEL_KPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		1,			/* long */
		0,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GDATA_SEL	2 Data Descriptor for kernel */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMRWA,		/* segment type */
		SEL_KPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		1,			/* long */
		0,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GUCODE32_SEL	3 32 bit Code Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMERA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long */
		1,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMRWA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long */
		1,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GUCODE_SEL	5 64 bit Code Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMERA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		1,			/* long */
		0,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GPROC0_SEL	6 Proc 0 Tss Descriptor */
	{
		0x0,			/* segment base address */
		sizeof(struct x86_64tss)-1,/* length - all address space */
		SDT_SYSTSS,		/* segment type */
		SEL_KPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long */
		0,			/* unused - default 32 vs 16 bit size */
		0			/* limit granularity (byte/page units)*/ },
	/* Actually, the TSS is a system descriptor which is double size */
	{	0x0,			/* segment base address  */
		0x0,			/* length */
		0,			/* segment type */
		0,			/* segment descriptor priority level */
		0,			/* segment descriptor present */
		0,			/* long */
		0,			/* default 32 vs 16 bit size */
		0			/* limit granularity (byte/page units)*/ },
	/* GUGS32_SEL	8 32 bit GS Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMRWA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long */
		1,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
};
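/*
 * Added note (not in the original source): entry N of gdt_segs[] becomes
 * GDT selector N * 8 (plus the RPL in the low two bits), so the kernel
 * %cs built with GSEL(GCODE_SEL, SEL_KPL) references the 64-bit kernel
 * code descriptor above.  The TSS occupies two consecutive slots because
 * system descriptors are 16 bytes wide in long mode, hence the dummy
 * entry following GPROC0_SEL.
 */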
void
setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	int cpu;

	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		struct gate_descriptor *ip = &idt_arr[cpu][idx];

		ip->gd_looffset = (uintptr_t)func;
		ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
		ip->gd_ist = ist;
		ip->gd_xx = 0;
		ip->gd_type = typ;
		ip->gd_dpl = dpl;
		ip->gd_p = 1;
		ip->gd_hioffset = ((uintptr_t)func)>>16 ;
	}
}
, inthand_t
*func
, int typ
, int dpl
, int ist
, int cpu
)
1568 struct gate_descriptor
*ip
;
1570 KASSERT(cpu
>= 0 && cpu
< ncpus
, ("invalid cpu %d", cpu
));
1572 ip
= &idt_arr
[cpu
][idx
];
1573 ip
->gd_looffset
= (uintptr_t)func
;
1574 ip
->gd_selector
= GSEL(GCODE_SEL
, SEL_KPL
);
1580 ip
->gd_hioffset
= ((uintptr_t)func
)>>16 ;
1583 #define IDTVEC(name) __CONCAT(X,name)
1586 IDTVEC(div
), IDTVEC(dbg
), IDTVEC(nmi
), IDTVEC(bpt
), IDTVEC(ofl
),
1587 IDTVEC(bnd
), IDTVEC(ill
), IDTVEC(dna
), IDTVEC(fpusegm
),
1588 IDTVEC(tss
), IDTVEC(missing
), IDTVEC(stk
), IDTVEC(prot
),
1589 IDTVEC(page
), IDTVEC(mchk
), IDTVEC(rsvd
), IDTVEC(fpu
), IDTVEC(align
),
1590 IDTVEC(xmm
), IDTVEC(dblfault
),
1591 IDTVEC(fast_syscall
), IDTVEC(fast_syscall32
);
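/*
 * Added note (not in the original source): IDTVEC(name) simply prepends
 * an "X", so IDTVEC(div) refers to the assembly entry point Xdiv.  Boot
 * code elsewhere in this file installs these handlers into every cpu's
 * IDT using setidt_global() above.
 */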
void
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}
void
ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
{
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}
void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}
/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * Memory is aligned to PHYSMAP_ALIGN which must be a multiple
 * of PAGE_SIZE.  This also greatly reduces the memory test time
 * which would otherwise be excessive on machines with > 8G of ram.
 *
 * XXX first should be vm_paddr_t.
 */
#define PHYSMAP_ALIGN		(vm_paddr_t)(128 * 1024)
#define PHYSMAP_ALIGN_MASK	(vm_paddr_t)(PHYSMAP_ALIGN - 1)
#define PHYSMAP_SIZE		VM_PHYSSEG_MAX

static vm_paddr_t physmap[PHYSMAP_SIZE];
static struct bios_smap *smapbase, *smap, *smapend;
static struct efi_map_header *efihdrbase;
static u_int32_t smapsize;

#define PHYSMAP_HANDWAVE	(vm_paddr_t)(2 * 1024 * 1024)
#define PHYSMAP_HANDWAVE_MASK	(PHYSMAP_HANDWAVE - 1)
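/*
 * Added worked example (not in the original source): with PHYSMAP_ALIGN
 * at 128KB, a segment starting at 0x1234567 rounds up to
 * (0x1234567 + 0x1ffff) & ~0x1ffff = 0x1240000, and the memory test
 * below may advance in 2MB (PHYSMAP_HANDWAVE) steps between the fully
 * tested first and last blocks of each segment.
 */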
static void
add_smap_entries(int *physmap_idx)
{
	int i;

	smapsize = *((u_int32_t *)smapbase - 1);
	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (smap->length == 0)
			continue;

		for (i = 0; i <= *physmap_idx; i += 2) {
			if (smap->base < physmap[i + 1]) {
				if (boothowto & RB_VERBOSE) {
					kprintf("Overlapping or non-monotonic "
						"memory region, ignoring "
						"second region\n");
				}
				break;
			}
		}
		if (i <= *physmap_idx)
			continue;

		Realmem += smap->length;

		if (smap->base == physmap[*physmap_idx + 1]) {
			physmap[*physmap_idx + 1] += smap->length;
			continue;
		}

		*physmap_idx += 2;
		if (*physmap_idx == PHYSMAP_SIZE) {
			kprintf("Too many segments in the physical "
				"address map, giving up\n");
			break;
		}
		physmap[*physmap_idx] = smap->base;
		physmap[*physmap_idx + 1] = smap->base + smap->length;
	}
}
static void
add_efi_map_entries(int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode"
	};
	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdrbase + efisz);

	if (efihdrbase->descriptor_size == 0)
		return;
	ndesc = efihdrbase->memory_size / efihdrbase->descriptor_size;
& RB_VERBOSE
)
1753 kprintf("%23s %12s %12s %8s %4s\n",
1754 "Type", "Physical", "Virtual", "#Pages", "Attr");
1756 for (i
= 0, p
= map
; i
< ndesc
; i
++,
1757 p
= efi_next_descriptor(p
, efihdrbase
->descriptor_size
)) {
1758 if (boothowto
& RB_VERBOSE
) {
1759 if (p
->md_type
<= EFI_MD_TYPE_PALCODE
)
1760 type
= types
[p
->md_type
];
1763 kprintf("%23s %012lx %12p %08lx ", type
, p
->md_phys
,
1764 p
->md_virt
, p
->md_pages
);
1765 if (p
->md_attr
& EFI_MD_ATTR_UC
)
1767 if (p
->md_attr
& EFI_MD_ATTR_WC
)
1769 if (p
->md_attr
& EFI_MD_ATTR_WT
)
1771 if (p
->md_attr
& EFI_MD_ATTR_WB
)
1773 if (p
->md_attr
& EFI_MD_ATTR_UCE
)
1775 if (p
->md_attr
& EFI_MD_ATTR_WP
)
1777 if (p
->md_attr
& EFI_MD_ATTR_RP
)
1779 if (p
->md_attr
& EFI_MD_ATTR_XP
)
1781 if (p
->md_attr
& EFI_MD_ATTR_RT
)
1786 switch (p
->md_type
) {
1787 case EFI_MD_TYPE_CODE
:
1788 case EFI_MD_TYPE_DATA
:
1789 case EFI_MD_TYPE_BS_CODE
:
1790 case EFI_MD_TYPE_BS_DATA
:
1791 case EFI_MD_TYPE_FREE
:
1793 * We're allowed to use any entry with these types.
1800 Realmem
+= p
->md_pages
* PAGE_SIZE
;
1802 if (p
->md_phys
== physmap
[*physmap_idx
+ 1]) {
1803 physmap
[*physmap_idx
+ 1] += p
->md_pages
* PAGE_SIZE
;
1808 if (*physmap_idx
== PHYSMAP_SIZE
) {
1809 kprintf("Too many segments in the physical "
1810 "address map, giving up\n");
1813 physmap
[*physmap_idx
] = p
->md_phys
;
1814 physmap
[*physmap_idx
+ 1] = p
->md_phys
+ p
->md_pages
* PAGE_SIZE
;
struct fb_info efi_fb_info;
static int have_efi_framebuffer = 0;
static void
efi_fb_init_vaddr(int direct_map)
{
	uint64_t sz;
	vm_offset_t addr, v;

	v = efi_fb_info.vaddr;
	sz = efi_fb_info.stride * efi_fb_info.height;

	if (direct_map) {
		addr = PHYS_TO_DMAP(efi_fb_info.paddr);
		if (addr >= DMAP_MIN_ADDRESS && addr + sz < DMAP_MAX_ADDRESS)
			efi_fb_info.vaddr = addr;
	} else {
		efi_fb_info.vaddr = (vm_offset_t)pmap_mapdev_attr(
		    efi_fb_info.paddr, sz, PAT_WRITE_COMBINING);
	}

	if (v == 0 && efi_fb_info.vaddr != 0)
		memset((void *)efi_fb_info.vaddr, 0x77, sz);
}
static int
probe_efi_fb(int early)
{
	struct efi_fb *efifb;
	caddr_t kmdp;

	if (have_efi_framebuffer) {
		if (!early &&
		    (efi_fb_info.vaddr == 0 ||
		     efi_fb_info.vaddr == PHYS_TO_DMAP(efi_fb_info.paddr)))
			efi_fb_init_vaddr(0);
		return 0;
	}

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efifb = (struct efi_fb *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_FB);
	if (efifb == NULL)
		return ENODEV;

	have_efi_framebuffer = 1;

	efi_fb_info.is_vga_boot_display = 1;
	efi_fb_info.width = efifb->fb_width;
	efi_fb_info.height = efifb->fb_height;
	efi_fb_info.stride = efifb->fb_stride * 4;
	efi_fb_info.depth = 32;
	efi_fb_info.paddr = efifb->fb_addr;
	if (early) {
		efi_fb_info.vaddr = 0;
	} else {
		efi_fb_init_vaddr(0);
	}
	efi_fb_info.fbops.fb_set_par = NULL;
	efi_fb_info.fbops.fb_blank = NULL;
	efi_fb_info.fbops.fb_debug_enter = NULL;
	efi_fb_info.device = NULL;

	return 0;
}

static void
efifb_startup(void *arg)
{
	probe_efi_fb(0);
}

SYSINIT(efi_fb_info, SI_BOOT1_POST, SI_ORDER_FIRST, efifb_startup, NULL);
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int off, physmap_idx, pa_indx, da_indx;
	int i, j;
	vm_paddr_t pa;
	vm_paddr_t msgbuf_size;
	u_long physmem_tunable;
	u_int base_memory;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;
	/*
	 * get memory map from INT 15:E820, kindly supplied by the loader.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */
	efihdrbase = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL && efihdrbase == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdrbase == NULL)
		add_smap_entries(&physmap_idx);
	else
		add_efi_map_entries(&physmap_idx);
	base_memory = physmap[1] / 1024;
	/* make hole for AP bootstrap code */
	physmap[1] = mp_bootaddress(base_memory);

	/* Save EBDA address, if any */
	ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e));
	ebda_addr <<= 4;
	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	/*
	 * Blowing out the DMAP will blow up the system.
	 */
	if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) {
		kprintf("Limiting Maxmem due to DMAP size\n");
		Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS);
	}

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE)) {
		kprintf("Physical memory use set to %ldK\n", Maxmem * 4);
	}
	/*
	 * Call pmap initialization to make new kernel address space
	 *
	 * Mask off page 0.
	 */
	pmap_bootstrap(&first);
	physmap[0] = PAGE_SIZE;

	/*
	 * Align the physmap to PHYSMAP_ALIGN and cut out anything
	 * above ptoa(Maxmem).
	 */
	for (i = j = 0; i <= physmap_idx; i += 2) {
		if (physmap[i+1] > ptoa(Maxmem))
			physmap[i+1] = ptoa(Maxmem);
		physmap[i] = (physmap[i] + PHYSMAP_ALIGN_MASK) &
			     ~PHYSMAP_ALIGN_MASK;
		physmap[i+1] = physmap[i+1] & ~PHYSMAP_ALIGN_MASK;

		physmap[j] = physmap[i];
		physmap[j+1] = physmap[i+1];

		if (physmap[i] < physmap[i+1])
			j += 2;
	}
	physmap_idx = j - 2;

	/*
	 * Align anything else used in the validation loop.
	 */
	first = (first + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;
	/*
	 * Size up each available chunk of physical memory.
	 */
	pa_indx = 0;
	da_indx = 0;
	phys_avail[pa_indx].phys_beg = physmap[0];
	phys_avail[pa_indx].phys_end = physmap[0];
	dump_avail[da_indx].phys_beg = 0;
	dump_avail[da_indx].phys_end = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    kgetenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;
	/*
	 * Validate the physical memory.  The physical memory segments
	 * have already been aligned to PHYSMAP_ALIGN which is a multiple
	 * of PAGE_SIZE.
	 */
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;
		vm_paddr_t incr = PHYSMAP_ALIGN;

		end = physmap[i + 1];

		for (pa = physmap[i]; pa < end; pa += incr) {
			int page_bad, full;
			volatile uint64_t *ptr = (uint64_t *)CADDR1;
			uint64_t tmp;

			incr = PHYSMAP_ALIGN;
			full = FALSE;

			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= 0x200000 && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size) {
				goto do_dump_avail;
			}

			page_bad = FALSE;

			/*
			 * Always test the first and last block supplied in
			 * the map entry, but it just takes too long to run
			 * the test these days and we already have to skip
			 * pages.  Handwave it on PHYSMAP_HANDWAVE boundaries.
			 */
			if (pa != physmap[i]) {
				vm_paddr_t bytes = end - pa;
				if ((pa & PHYSMAP_HANDWAVE_MASK) == 0 &&
				    bytes >= PHYSMAP_HANDWAVE + PHYSMAP_ALIGN) {
					incr = PHYSMAP_HANDWAVE;
					goto handwaved;
				}
			}

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa |
			    kernel_pmap.pmap_bits[PG_V_IDX] |
			    kernel_pmap.pmap_bits[PG_RW_IDX] |
			    kernel_pmap.pmap_bits[PG_N_IDX];
			cpu_invlpg(__DEVOLATILE(void *, ptr));
			cpu_mfence();

			tmp = *ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*ptr = 0xaaaaaaaaaaaaaaaaLLU;
			cpu_mfence();
			if (*ptr != 0xaaaaaaaaaaaaaaaaLLU)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*ptr = 0x5555555555555555LLU;
			cpu_mfence();
			if (*ptr != 0x5555555555555555LLU)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*ptr = 0xffffffffffffffffLLU;
			cpu_mfence();
			if (*ptr != 0xffffffffffffffffLLU)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*ptr = 0x0;
			cpu_mfence();
			if (*ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*ptr = tmp;
handwaved:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;

			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx].phys_end == pa) {
				phys_avail[pa_indx].phys_end += incr;
			} else {
				++pa_indx;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					kprintf(
		"Too many holes in the physical address space, giving up\n");
					--pa_indx;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx].phys_beg = pa;
				phys_avail[pa_indx].phys_end = pa + incr;
			}
			physmem += incr / PAGE_SIZE;
do_dump_avail:
			if (dump_avail[da_indx].phys_end == pa) {
				dump_avail[da_indx].phys_end += incr;
			} else {
				++da_indx;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					--da_indx;
					goto do_next;
				}
				dump_avail[da_indx].phys_beg = pa;
				dump_avail[da_indx].phys_end = pa + incr;
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	cpu_invltlb();
	cpu_mfence();
        /*
         * The last chunk must contain at least one page plus the message
         * buffer to avoid complicating other code (message buffer address
         * calculation, etc.).
         */
        msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;

        while (phys_avail[pa_indx].phys_beg + PHYSMAP_ALIGN + msgbuf_size >=
               phys_avail[pa_indx].phys_end) {
                physmem -= atop(phys_avail[pa_indx].phys_end -
                                phys_avail[pa_indx].phys_beg);
                phys_avail[pa_indx].phys_beg = 0;
                phys_avail[pa_indx].phys_end = 0;
                --pa_indx;
        }

        Maxmem = atop(phys_avail[pa_indx].phys_end);
        /* Trim off space for the message buffer. */
        phys_avail[pa_indx].phys_end -= msgbuf_size;

        avail_end = phys_avail[pa_indx].phys_end;
        /* Map the message buffer. */
        for (off = 0; off < msgbuf_size; off += PAGE_SIZE) {
                pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
        }

        /* Try to get EFI framebuffer working as early as possible */
        if (have_efi_framebuffer)
                efi_fb_init_vaddr(1);
}
struct machintr_abi MachIntrABI;
/*
 * IDT VECTORS:
 *      7       Device Not Available (x87)
 *      9       Coprocessor Segment overrun (unsupported, reserved)
 *      11      Segment not present
 *      13      General Protection
 *      16      x87 FP Exception pending
 *      17      Alignment Check
 *      19      SIMD floating point
 *      32-255  INTn/external sources
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
        caddr_t kmdp;
        int gsel_tss, x, cpu;
        int metadata_missing, off;
        struct mdglobaldata *gd;
        u_int64_t msr;
        /*
         * Prevent lowering of the ipl if we call tsleep() early.
         */
        gd = &CPU_prvspace[0]->mdglobaldata;
        bzero(gd, sizeof(*gd));
        /*
         * Note: on both UP and SMP curthread must be set non-NULL
         * early in the boot sequence because the system assumes
         * that 'curthread' is never NULL.
         */
        gd->mi.gd_curthread = &thread0;
        thread0.td_gd = &gd->mi;

        atdevbase = ISA_HOLE_START + PTOV_OFFSET;
        metadata_missing = 0;
        if (bootinfo.bi_modulep) {
                preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
                preload_bootstrap_relocate(KERNBASE);
        } else {
                metadata_missing = 1;
        }
        if (bootinfo.bi_envp)
                kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;

        preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
        preload_bootstrap_relocate(PTOV_OFFSET);
        kmdp = preload_search_by_type("elf kernel");
        if (kmdp == NULL)
                kmdp = preload_search_by_type("elf64 kernel");
        boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
        kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
        ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
        ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
        efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
        if (boothowto & RB_VERBOSE)
                bootverbose++;

        /*
         * Default MachIntrABI to ICU
         */
        MachIntrABI = MachIntrABI_ICU;

        /*
         * start with one cpu.  Note: with one cpu, ncpus2_shift, ncpus2_mask,
         * and ncpus_fit_mask remain 0.
         */
        ncpus = 1;

        /* Init basic tunables, hz etc */
        init_param1();
        /*
         * make gdt memory segments
         */
        gdt_segs[GPROC0_SEL].ssd_base =
            (uintptr_t) &CPU_prvspace[0]->mdglobaldata.gd_common_tss;

        gd->mi.gd_prvspace = CPU_prvspace[0];
        for (x = 0; x < NGDT; x++) {
                if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
                        ssdtosd(&gdt_segs[x], &gdt[x]);
        }
        ssdtosyssd(&gdt_segs[GPROC0_SEL],
            (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

        r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
        r_gdt.rd_base = (long) gdt;
        lgdt(&r_gdt);
        wrmsr(MSR_FSBASE, 0);           /* User value */
        wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
        wrmsr(MSR_KGSBASE, 0);          /* User value while in the kernel */

        mi_gdinit(&gd->mi, 0);
        proc0paddr = proc0paddr_buff;
        mi_proc0init(&gd->mi, proc0paddr);
        safepri = TDPRI_MAX;

        /* spinlocks and the BGL */
        init_locks();
        for (x = 0; x < NIDT; x++)
                setidt_global(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1);
        setidt_global(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
        setidt_global(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
        setidt_global(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
        for (cpu = 0; cpu < MAXCPU; ++cpu) {
                r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
                r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
        }

        lidt(&r_idt_arr[0]);
        /*
         * Initialize the console before we print anything out.
         */
        cninit();

        if (metadata_missing)
                kprintf("WARNING: loader(8) metadata is missing!\n");
        /*
         * Initialize IRQ mapping
         *
         * NOTE: SHOULD be after elcr_probe()
         */
        MachIntrABI_ICU.initmap();
        MachIntrABI_IOAPIC.initmap();
        if (boothowto & RB_KDB)
                Debugger("Boot flags requested debugger");
        finishidentcpu();       /* Final stage of CPU initialization */
        setidt(6, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL,
            GSEL(GCODE_SEL, SEL_KPL));
        setidt(13, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL,
            GSEL(GCODE_SEL, SEL_KPL));

        identify_cpu();         /* Final stage of CPU initialization */
        initializecpu(0);       /* Initialize CPU registers */
        /*
         * On modern Intel cpus, Haswell or later, cpu_idle_hlt=1 is better
         * because the cpu does significant power management in MWAIT
         * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
         *
         * On modern AMD cpus cpu_idle_hlt=3 is better, because the cpu does
         * significant power management in HLT or ACPI (but cpu_idle_hlt=1
         * would try to use MWAIT).
         *
         * On older AMD or Intel cpus, cpu_idle_hlt=2 is better because ACPI
         * is needed to reduce power consumption, but wakeup times are often
         * longer.
         */
        if (cpu_vendor_id == CPU_VENDOR_INTEL &&
            CPUID_TO_MODEL(cpu_id) >= 0x3C) {   /* Haswell or later */
                cpu_idle_hlt = 1;
        }
        if (cpu_vendor_id == CPU_VENDOR_AMD &&
            CPUID_TO_FAMILY(cpu_id) >= 0x14) {  /* Bobcat or later */
                cpu_idle_hlt = 3;
        }
        TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
        TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
        TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
        TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);
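        /*
         * For illustration (a sketch, not text from this file): these
         * tunables are normally supplied by the boot loader, e.g. in
         * /boot/loader.conf:
         *
         *      machdep.cpu_idle_hlt="1"
         *      hw.ioapic_enable="1"
         */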
        /*
         * Some of the virtual machines do not work w/ I/O APIC
         * enabled.  If the user does not explicitly enable or
         * disable the I/O APIC (ioapic_enable < 0), then we
         * disable the I/O APIC on all virtual machines.
         *
         * NOTE: This must be done after identify_cpu(), which sets
         * 'cpu_feature2'.
         */
        if (ioapic_enable < 0) {
                if (cpu_feature2 & CPUID2_VMM)
                        ioapic_enable = 0;
                else
                        ioapic_enable = 1;
        }
        /* make an initial tss so cpu can get interrupt stack on syscall! */
        gd->gd_common_tss.tss_rsp0 =
            (register_t)(thread0.td_kstack +
                         KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb));
        /* Ensure the stack is aligned to 16 bytes */
        gd->gd_common_tss.tss_rsp0 &= ~(register_t)0xF;
        /* double fault stack */
        gd->gd_common_tss.tss_ist1 =
            (long)&gd->mi.gd_prvspace->idlestack[
                sizeof(gd->mi.gd_prvspace->idlestack)];

        /* Set the IO permission bitmap (empty due to tss seg limit) */
        gd->gd_common_tss.tss_iobase = sizeof(struct x86_64tss);
        gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
        gd->gd_tss_gdt = &gdt[GPROC0_SEL];
        gd->gd_common_tssd = *gd->gd_tss_gdt;
        ltr(gsel_tss);
        /* Set up the fast syscall stuff */
        msr = rdmsr(MSR_EFER) | EFER_SCE;
        wrmsr(MSR_EFER, msr);
        wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
        wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
        msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
              ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
        wrmsr(MSR_STAR, msr);
        wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL);
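        /*
         * Illustrative note (generic x86_64 behaviour, not specific to
         * this file): MSR_STAR packs two selector bases.  SYSCALL loads
         * CS/SS from bits 47:32 (the kernel GSEL(GCODE_SEL, SEL_KPL)
         * above) and SYSRET from bits 63:48 (the user
         * GSEL(GUCODE32_SEL, SEL_UPL) above); the sibling SS and 64-bit
         * CS selectors sit at fixed offsets from those bases.
         */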
        getmemsize(kmdp, physfree);
        init_param2(physmem);

        /* now running on new page tables, configured, and u/iom is accessible */
        /* Map the message buffer. */
        for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
                pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);

        msgbufinit(msgbufp, MSGBUF_SIZE);
        /* transfer to user mode */
        _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
        _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
        _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
        /* setup proc 0's pcb */
        thread0.td_pcb->pcb_flags = 0;
        thread0.td_pcb->pcb_cr3 = KPML4phys;
        thread0.td_pcb->pcb_ext = NULL;
        lwp0.lwp_md.md_regs = &proc0_tf;        /* XXX needed? */

        /* Location of kernel stack for locore */
        return ((u_int64_t)thread0.td_pcb);
}
/*
 * Initialize machine-dependent portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
 */
void
cpu_gdinit(struct mdglobaldata *gd, int cpu)
{
        gd->mi.gd_curthread = &gd->mi.gd_idlethread;

        lwkt_init_thread(&gd->mi.gd_idlethread,
                         gd->mi.gd_prvspace->idlestack,
                         sizeof(gd->mi.gd_prvspace->idlestack),
                         0, &gd->mi);
        lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
        gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
        gd->mi.gd_idlethread.td_sp -= sizeof(void *);
        *(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
}
/*
 * We only have to check for DMAP bounds, the globaldata space is
 * actually part of the kernel_map so we don't have to waste time
 * checking CPU_prvspace[*].
 */
int
is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
{
        if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
            eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
                return (TRUE);
        }
        if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
                return (TRUE);
        return (FALSE);
}
struct globaldata *
globaldata_find(int cpu)
{
        KKASSERT(cpu >= 0 && cpu < ncpus);
        return (&CPU_prvspace[cpu]->mdglobaldata.mi);
}
/*
 * This path should be safe from the SYSRET issue because only stopped threads
 * can have their %rip adjusted this way (and all heavy weight thread switches
 * clear QUICKREF and thus do not use SYSRET).  However, the code path is
 * convoluted so add a safety by forcing %rip to be canonical.
 */
void
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
        if (addr & 0x0000800000000000LLU)
                lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
        else
                lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
}
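/*
 * A minimal sketch of the canonical-address rule applied above
 * (illustrative helper, not part of the original file): bits 63:47
 * must all be copies of bit 47.
 */
static __inline int
x86_64_addr_is_canonical(unsigned long addr)
{
        long sext = (long)addr >> 47;   /* arithmetic shift */

        return (sext == 0 || sext == -1);
}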
int
ptrace_single_step(struct lwp *lp)
{
        lp->lwp_md.md_regs->tf_rflags |= PSL_T;
        return (0);
}
int
fill_regs(struct lwp *lp, struct reg *regs)
{
        struct trapframe *tp;

        if ((tp = lp->lwp_md.md_regs) == NULL)
                return EINVAL;
        bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
        return (0);
}
int
set_regs(struct lwp *lp, struct reg *regs)
{
        struct trapframe *tp;

        tp = lp->lwp_md.md_regs;
        if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
            !CS_SECURE(regs->r_cs))
                return (EINVAL);
        bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
        return (0);
}
static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
        struct env87 *penv_87 = &sv_87->sv_env;
        struct envxmm *penv_xmm = &sv_xmm->sv_env;
        int i;

        /* FPU control/status */
        penv_87->en_cw = penv_xmm->en_cw;
        penv_87->en_sw = penv_xmm->en_sw;
        penv_87->en_tw = penv_xmm->en_tw;
        penv_87->en_fip = penv_xmm->en_fip;
        penv_87->en_fcs = penv_xmm->en_fcs;
        penv_87->en_opcode = penv_xmm->en_opcode;
        penv_87->en_foo = penv_xmm->en_foo;
        penv_87->en_fos = penv_xmm->en_fos;

        /* FPU registers */
        for (i = 0; i < 8; ++i)
                sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}
static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
{
        struct env87 *penv_87 = &sv_87->sv_env;
        struct envxmm *penv_xmm = &sv_xmm->sv_env;
        int i;

        /* FPU control/status */
        penv_xmm->en_cw = penv_87->en_cw;
        penv_xmm->en_sw = penv_87->en_sw;
        penv_xmm->en_tw = penv_87->en_tw;
        penv_xmm->en_fip = penv_87->en_fip;
        penv_xmm->en_fcs = penv_87->en_fcs;
        penv_xmm->en_opcode = penv_87->en_opcode;
        penv_xmm->en_foo = penv_87->en_foo;
        penv_xmm->en_fos = penv_87->en_fos;

        /* FPU registers */
        for (i = 0; i < 8; ++i)
                sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}
int
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
        if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
                return EINVAL;
        if (cpu_fxsr) {
                fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
                    (struct save87 *)fpregs);
                return (0);
        }
        bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
        return (0);
}
int
set_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
        if (cpu_fxsr) {
                set_fpregs_xmm((struct save87 *)fpregs,
                    &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
                return (0);
        }
        bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
        return (0);
}
int
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
        struct pcb *pcb;

        if (lp == NULL) {
                dbregs->dr[0] = rdr0();
                dbregs->dr[1] = rdr1();
                dbregs->dr[2] = rdr2();
                dbregs->dr[3] = rdr3();
                dbregs->dr[4] = rdr4();
                dbregs->dr[5] = rdr5();
                dbregs->dr[6] = rdr6();
                dbregs->dr[7] = rdr7();
                return (0);
        }
        if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL)
                return EINVAL;
        dbregs->dr[0] = pcb->pcb_dr0;
        dbregs->dr[1] = pcb->pcb_dr1;
        dbregs->dr[2] = pcb->pcb_dr2;
        dbregs->dr[3] = pcb->pcb_dr3;
        dbregs->dr[6] = pcb->pcb_dr6;
        dbregs->dr[7] = pcb->pcb_dr7;
        return (0);
}
int
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
        if (lp == NULL) {
                load_dr0(dbregs->dr[0]);
                load_dr1(dbregs->dr[1]);
                load_dr2(dbregs->dr[2]);
                load_dr3(dbregs->dr[3]);
                load_dr4(dbregs->dr[4]);
                load_dr5(dbregs->dr[5]);
                load_dr6(dbregs->dr[6]);
                load_dr7(dbregs->dr[7]);
        } else {
                struct pcb *pcb;
                struct ucred *ucred;
                uint64_t mask1, mask2;
                int i;

                /*
                 * Don't let an illegal value for dr7 get set.  Specifically,
                 * check for undefined settings.  Setting these bit patterns
                 * results in undefined behaviour and can lead to an
                 * unexpected reboot.
                 */
                /* JG this loop looks unreadable */
                /*
                 * Check 4 2-bit fields for invalid patterns.
                 * These fields are R/Wi, for i = 0..3
                 */
                /* Is 10 in LENi allowed when running in compatibility mode? */
                /*
                 * Pattern 10 in R/Wi might be used to indicate
                 * breakpoint on I/O.  Further analysis should be
                 * carried out to decide if it is safe and useful to
                 * provide access to that capability.
                 */
                for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
                     i++, mask1 <<= 4, mask2 <<= 4)
                        if ((dbregs->dr[7] & mask1) == mask2)
                                return (EINVAL);
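                /*
                 * Worked example of the masks (illustrative): the loop
                 * visits dr7 bits 17:16, 21:20, 25:24 and 29:28, i.e.
                 * mask1 = 0x30000, 0x300000, 0x3000000, 0x30000000,
                 * rejecting the undefined R/Wi pattern 10 (mask2 is
                 * mask1 with only the high bit of the field set).
                 */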
                pcb = lp->lwp_thread->td_pcb;
                ucred = lp->lwp_proc->p_ucred;

                /*
                 * Don't let a process set a breakpoint that is not within the
                 * process's address space.  If a process could do this, it
                 * could halt the system by setting a breakpoint in the kernel
                 * (if ddb was enabled).  Thus, we need to check to make sure
                 * that no breakpoints are being enabled for addresses outside
                 * the process's address space, unless, perhaps, we were
                 * called by uid 0.
                 *
                 * XXX - what about when the watched area of the user's
                 * address space is written into from within the kernel
                 * ... wouldn't that still cause a breakpoint to be generated
                 * from within kernel mode?
                 */
                if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
                        if (dbregs->dr[7] & 0x3) {
                                /* dr0 is enabled */
                                if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
                                        return (EINVAL);
                        }
                        if (dbregs->dr[7] & (0x3<<2)) {
                                /* dr1 is enabled */
                                if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
                                        return (EINVAL);
                        }
                        if (dbregs->dr[7] & (0x3<<4)) {
                                /* dr2 is enabled */
                                if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
                                        return (EINVAL);
                        }
                        if (dbregs->dr[7] & (0x3<<6)) {
                                /* dr3 is enabled */
                                if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
                                        return (EINVAL);
                        }
                }
                pcb->pcb_dr0 = dbregs->dr[0];
                pcb->pcb_dr1 = dbregs->dr[1];
                pcb->pcb_dr2 = dbregs->dr[2];
                pcb->pcb_dr3 = dbregs->dr[3];
                pcb->pcb_dr6 = dbregs->dr[6];
                pcb->pcb_dr7 = dbregs->dr[7];

                pcb->pcb_flags |= PCB_DBREGS;
        }

        return (0);
}
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
        u_int64_t dr7, dr6;     /* debug registers dr6 and dr7 */
        u_int64_t bp;           /* breakpoint bits extracted from dr6 */
        int nbp;                /* number of breakpoints that triggered */
        caddr_t addr[4];        /* breakpoint addresses */
        int i;

        dr7 = rdr7();
        if ((dr7 & 0xff) == 0) {
                /*
                 * all GE and LE bits in the dr7 register are zero,
                 * thus the trap couldn't have been caused by the
                 * hardware debug registers
                 */
                return 0;
        }

        nbp = 0;
        dr6 = rdr6();
        bp = dr6 & 0xf;

        if (bp == 0) {
                /*
                 * None of the breakpoint bits are set, meaning this
                 * trap was not caused by any of the debug registers
                 */
                return 0;
        }

        /*
         * at least one of the breakpoints was hit, check to see
         * which ones and if any of them are user space addresses
         */
        if (bp & 0x01)
                addr[nbp++] = (caddr_t)rdr0();
        if (bp & 0x02)
                addr[nbp++] = (caddr_t)rdr1();
        if (bp & 0x04)
                addr[nbp++] = (caddr_t)rdr2();
        if (bp & 0x08)
                addr[nbp++] = (caddr_t)rdr3();

        for (i = 0; i < nbp; i++) {
                if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
                        /*
                         * addr[i] is in user space
                         */
                        return nbp;
                }
        }

        /*
         * None of the breakpoints are in user space.
         */
        return 0;
}
void
Debugger(const char *msg)
{
        kprintf("Debugger(\"%s\") called.\n", msg);
}
/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called inside DDB.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */

/* silence compiler warnings */
u_char inb(u_int);
void outb(u_int, u_char);

u_char
inb(u_int port)
{
        u_char data;
        /*
         * We use %%dx and not %1 here because i/o is done at %dx and not at
         * %edx, while gcc generates inferior code (movw instead of movl)
         * if we tell it to load (u_short) port.
         */
        __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
        return (data);
}

void
outb(u_int port, u_char data)
{
        u_char al;
        /*
         * Use an unnecessary assignment to help gcc's register allocator.
         * This makes a large difference for gcc-1.40 and a tiny difference
         * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
         * best results.  gcc-2.6.0 can't handle this.
         */
        al = data;
        __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
}
/*
 * initialize all the SMP locks
 */

/* critical region when masking or unmasking interrupts */
struct spinlock_deprecated imen_spinlock;

/* lock region used by kernel profiling */
struct spinlock_deprecated mcount_spinlock;

/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct spinlock_deprecated com_spinlock;

/* lock regions around the clock hardware */
struct spinlock_deprecated clock_spinlock;
static void
init_locks(void)
{
        /*
         * Get the initial mplock with a count of 1 for the BSP.
         * This uses a LOGICAL cpu ID, ie BSP == 0.
         */
        cpu_get_initial_mplock();

        spin_init_deprecated(&mcount_spinlock);
        spin_init_deprecated(&imen_spinlock);
        spin_init_deprecated(&com_spinlock);
        spin_init_deprecated(&clock_spinlock);

        /* our token pool needs to work early */
        lwkt_token_pool_init();
}
boolean_t
cpu_mwait_hint_valid(uint32_t hint)
{
        int cx_idx, sub;

        cx_idx = MWAIT_EAX_TO_CX(hint);
        if (cx_idx >= CPU_MWAIT_CX_MAX)
                return FALSE;

        sub = MWAIT_EAX_TO_CX_SUB(hint);
        if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
                return FALSE;

        return TRUE;
}
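/*
 * Sketch (illustrative, not part of the original file): a hint built
 * with the MWAIT_EAX_HINT() macro used later in this file should
 * round-trip through the extractors above, whatever the underlying
 * EAX bit layout is.
 */
static __inline void
cpu_mwait_hint_roundtrip_check(void)
{
        uint32_t hint = MWAIT_EAX_HINT(CPU_MWAIT_C2, 0);

        KKASSERT(MWAIT_EAX_TO_CX(hint) == CPU_MWAIT_C2);
        KKASSERT(MWAIT_EAX_TO_CX_SUB(hint) == 0);
}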
void
cpu_mwait_cx_no_bmsts(void)
{
        atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
}

void
cpu_mwait_cx_no_bmarb(void)
{
        atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
}
static int
cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
{
        int old_cx_idx, sub = 0;

        if (hint >= 0) {
                old_cx_idx = MWAIT_EAX_TO_CX(hint);
                sub = MWAIT_EAX_TO_CX_SUB(hint);
        } else if (hint == CPU_MWAIT_HINT_AUTO) {
                old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
        } else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
                old_cx_idx = allow_auto ? CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
        } else {
                old_cx_idx = CPU_MWAIT_CX_MAX;
        }

        if (!CPU_MWAIT_HAS_CX)
                strlcpy(name, "NONE", namelen);
        else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
                strlcpy(name, "AUTO", namelen);
        else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
                strlcpy(name, "AUTODEEP", namelen);
        else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
            sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
                strlcpy(name, "INVALID", namelen);
        else
                ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);

        return old_cx_idx;
}
static int
cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
{
        int cx_idx, sub, hint;
        char *ptr, *start;

        if (allow_auto && strcmp(name, "AUTO") == 0) {
                hint = CPU_MWAIT_HINT_AUTO;
                cx_idx = CPU_MWAIT_C2;
                goto done;
        }
        if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
                hint = CPU_MWAIT_HINT_AUTODEEP;
                cx_idx = CPU_MWAIT_C3;
                goto done;
        }

        if (strlen(name) < 4 || toupper(name[0]) != 'C')
                return -1;
        start = name + 1;

        cx_idx = strtol(start, &ptr, 10);
        if (ptr == start || *ptr != '/')
                return -1;
        if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
                return -1;

        start = ptr + 1;
        sub = strtol(start, &ptr, 10);
        if (ptr == start || *ptr != '\0')
                return -1;
        if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
                return -1;

        hint = MWAIT_EAX_HINT(cx_idx, sub);
done:
        *hint0 = hint;
        return cx_idx;
}
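/*
 * Usage note (illustrative): cpu_mwait_cx_name2hint("C1/0", &hint, TRUE)
 * yields cx_idx 1 with sub-state 0, while "AUTO" and "AUTODEEP" map to
 * the C2/C3 auto hints when allow_auto is TRUE.
 */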
static int
cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
{
        int error;

        if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)
                return EOPNOTSUPP;

        if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
                error = cputimer_intr_powersave_addreq();
                if (error)
                        return error;
        } else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
                cputimer_intr_powersave_remreq();
        }
        return 0;
}
static int
cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
    boolean_t allow_auto)
{
        int error, cx_idx, old_cx_idx, hint;
        char name[CPU_MWAIT_CX_NAMELEN];

        hint = *hint0;
        old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
            allow_auto);

        error = sysctl_handle_string(oidp, name, sizeof(name), req);
        if (error != 0 || req->newptr == NULL)
                return error;

        if (!CPU_MWAIT_HAS_CX)
                return EOPNOTSUPP;

        cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);
        if (cx_idx < 0)
                return EINVAL;

        error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
        if (error)
                return error;

        *hint0 = hint;
        return 0;
}
static int
cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name)
{
        int error, cx_idx, old_cx_idx, hint;
        char name[CPU_MWAIT_CX_NAMELEN];

        KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension"));

        hint = stat->hint;
        old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

        strlcpy(name, cx_name, sizeof(name));
        cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
        if (cx_idx < 0)
                return EINVAL;

        error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
        if (error)
                return error;

        stat->hint = hint;
        return 0;
}
static int
cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
        int hint = cpu_mwait_halt_global;
        int error, cx_idx, cpu;
        char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];

        cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

        error = sysctl_handle_string(oidp, name, sizeof(name), req);
        if (error != 0 || req->newptr == NULL)
                return error;

        if (!CPU_MWAIT_HAS_CX)
                return EOPNOTSUPP;

        /* Save name for later per-cpu CX configuration */
        strlcpy(cx_name, name, sizeof(cx_name));

        cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
        if (cx_idx < 0)
                return EINVAL;

        /* Change per-cpu CX configuration */
        for (cpu = 0; cpu < ncpus; ++cpu) {
                error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
                if (error)
                        return error;
        }

        cpu_mwait_halt_global = hint;
        return 0;
}
static int
cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
        struct cpu_idle_stat *stat = arg1;
        int error;

        error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
            &stat->hint, TRUE);
        return error;
}

static int
cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
{
        int error;

        error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
            &cpu_mwait_spin, FALSE);
        return error;
}
/*
 * This manual debugging code is called unconditionally from Xtimer
 * (the per-cpu timer interrupt) whether the current thread is in a
 * critical section or not, and can be useful in tracking down lockups.
 *
 * NOTE: MANUAL DEBUG CODE
 */
static int saveticks[SMP_MAXCPU];
static int savecounts[SMP_MAXCPU];
void
pcpu_timer_always(struct intrframe *frame)
{
        globaldata_t gd = mycpu;
        int cpu = gd->gd_cpuid;
        char buf[64];
        short *gptr;
        int i;

        gptr = (short *)0xFFFFFFFF800b8000 + 80 * cpu;
        *gptr = ((*gptr + 1) & 0x00FF) | 0x0700;
        ++gptr;
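        /*
         * Illustrative aside (VGA text-mode convention, not specific to
         * this file): each cell at 0xb8000 is a 16-bit
         * (attribute << 8 | character) pair, so 0x0700 is light grey on
         * black; the low byte above acts as a per-cpu activity spinner.
         */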
        ksnprintf(buf, sizeof(buf), " %p %16s %d %16s ",
            (void *)frame->if_rip, gd->gd_curthread->td_comm, ticks, "");
        for (i = 0; buf[i]; ++i)
                gptr[i] = 0x0700 | (unsigned char)buf[i];

        if (saveticks[gd->gd_cpuid] != ticks) {
                saveticks[gd->gd_cpuid] = ticks;
                savecounts[gd->gd_cpuid] = 0;
        }
        ++savecounts[gd->gd_cpuid];
        if (savecounts[gd->gd_cpuid] > 2000 && panicstr == NULL) {
                panic("cpu %d panicking on ticks failure",
                    gd->gd_cpuid);
        }
        for (i = 0; i < ncpus; ++i) {
                int delta;

                if (saveticks[i] && panicstr == NULL) {
                        delta = saveticks[i] - ticks;
                        if (delta < -10 || delta > 10) {
                                panic("cpu %d panicking on cpu %d watchdog",