2 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
3 * Copyright (c) 1992 Terrence R. Lambert.
4 * Copyright (c) 2003 Peter Wemm.
5 * Copyright (c) 2008 The DragonFly Project.
8 * This code is derived from software contributed to Berkeley by
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the University of
22 * California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
40 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
43 //#include "use_npx.h"
47 #include "opt_directio.h"
49 #include "opt_msgbuf.h"
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/sysproto.h>
55 #include <sys/signalvar.h>
56 #include <sys/kernel.h>
57 #include <sys/linker.h>
58 #include <sys/malloc.h>
62 #include <sys/reboot.h>
64 #include <sys/msgbuf.h>
65 #include <sys/sysent.h>
66 #include <sys/sysctl.h>
67 #include <sys/vmmeter.h>
69 #include <sys/usched.h>
72 #include <sys/ctype.h>
73 #include <sys/serialize.h>
74 #include <sys/systimer.h>
77 #include <vm/vm_param.h>
79 #include <vm/vm_kern.h>
80 #include <vm/vm_object.h>
81 #include <vm/vm_page.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_pager.h>
84 #include <vm/vm_extern.h>
86 #include <sys/thread2.h>
87 #include <sys/mplock2.h>
88 #include <sys/mutex2.h>
98 #include <machine/cpu.h>
99 #include <machine/clock.h>
100 #include <machine/specialreg.h>
102 #include <machine/bootinfo.h>
104 #include <machine/md_var.h>
105 #include <machine/metadata.h>
106 #include <machine/pc/bios.h>
107 #include <machine/pcb_ext.h> /* pcb.h included via sys/user.h */
108 #include <machine/globaldata.h> /* CPU_prvspace */
109 #include <machine/smp.h>
110 #include <machine/cputypes.h>
111 #include <machine/intr_machdep.h>
112 #include <machine/framebuffer.h>
115 #include <bus/isa/isa_device.h>
117 #include <machine_base/isa/isa_intr.h>
118 #include <bus/isa/rtc.h>
119 #include <sys/random.h>
120 #include <sys/ptrace.h>
121 #include <machine/sigframe.h>
123 #include <sys/machintr.h>
124 #include <machine_base/icu/icu_abi.h>
125 #include <machine_base/icu/elcr_var.h>
126 #include <machine_base/apic/lapic.h>
127 #include <machine_base/apic/ioapic.h>
128 #include <machine_base/apic/ioapic_abi.h>
129 #include <machine/mptable.h>
131 #define PHYSMAP_ENTRIES 10
133 extern u_int64_t
hammer_time(u_int64_t
, u_int64_t
);
135 extern void printcpuinfo(void); /* XXX header file */
136 extern void identify_cpu(void);
138 extern void finishidentcpu(void);
140 extern void panicifcpuunsupported(void);
142 static void cpu_startup(void *);
143 static void pic_finish(void *);
144 static void cpu_finish(void *);
146 static void set_fpregs_xmm(struct save87
*, struct savexmm
*);
147 static void fill_fpregs_xmm(struct savexmm
*, struct save87
*);
149 extern void ffs_rawread_setup(void);
150 #endif /* DIRECTIO */
151 static void init_locks(void);
153 extern void pcpu_timer_always(struct intrframe
*);
155 SYSINIT(cpu
, SI_BOOT2_START_CPU
, SI_ORDER_FIRST
, cpu_startup
, NULL
);
156 SYSINIT(pic_finish
, SI_BOOT2_FINISH_PIC
, SI_ORDER_FIRST
, pic_finish
, NULL
);
157 SYSINIT(cpu_finish
, SI_BOOT2_FINISH_CPU
, SI_ORDER_FIRST
, cpu_finish
, NULL
);
160 extern vm_offset_t ksym_start
, ksym_end
;
163 struct privatespace CPU_prvspace_bsp
__aligned(4096);
164 struct privatespace
*CPU_prvspace
[MAXCPU
] = { &CPU_prvspace_bsp
};
166 int _udatasel
, _ucodesel
, _ucode32sel
;
168 int64_t tsc_offsets
[MAXCPU
];
169 cpumask_t smp_idleinvl_mask
;
170 cpumask_t smp_idleinvl_reqs
;
172 static int cpu_mwait_halt_global
; /* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */
174 #if defined(SWTCH_OPTIM_STATS)
175 extern int swtch_optim_stats
;
176 SYSCTL_INT(_debug
, OID_AUTO
, swtch_optim_stats
,
177 CTLFLAG_RD
, &swtch_optim_stats
, 0, "");
178 SYSCTL_INT(_debug
, OID_AUTO
, tlb_flush_count
,
179 CTLFLAG_RD
, &tlb_flush_count
, 0, "");
181 SYSCTL_INT(_hw
, OID_AUTO
, cpu_mwait_halt
,
182 CTLFLAG_RD
, &cpu_mwait_halt_global
, 0, "");
183 SYSCTL_INT(_hw
, OID_AUTO
, cpu_mwait_spin
, CTLFLAG_RD
, &cpu_mwait_spin
, 0,
184 "monitor/mwait target state");
186 #define CPU_MWAIT_HAS_CX \
187 ((cpu_feature2 & CPUID2_MON) && \
188 (cpu_mwait_feature & CPUID_MWAIT_EXT))
190 #define CPU_MWAIT_CX_NAMELEN 16
192 #define CPU_MWAIT_C1 1
193 #define CPU_MWAIT_C2 2
194 #define CPU_MWAIT_C3 3
195 #define CPU_MWAIT_CX_MAX 8
197 #define CPU_MWAIT_HINT_AUTO -1 /* C1 and C2 */
198 #define CPU_MWAIT_HINT_AUTODEEP -2 /* C3+ */
200 SYSCTL_NODE(_machdep
, OID_AUTO
, mwait
, CTLFLAG_RW
, 0, "MWAIT features");
201 SYSCTL_NODE(_machdep_mwait
, OID_AUTO
, CX
, CTLFLAG_RW
, 0, "MWAIT Cx settings");
203 struct cpu_mwait_cx
{
206 struct sysctl_ctx_list sysctl_ctx
;
207 struct sysctl_oid
*sysctl_tree
;
209 static struct cpu_mwait_cx cpu_mwait_cx_info
[CPU_MWAIT_CX_MAX
];
210 static char cpu_mwait_cx_supported
[256];
212 static int cpu_mwait_c1_hints_cnt
;
213 static int cpu_mwait_hints_cnt
;
214 static int *cpu_mwait_hints
;
216 static int cpu_mwait_deep_hints_cnt
;
217 static int *cpu_mwait_deep_hints
;
219 #define CPU_IDLE_REPEAT_DEFAULT 750
221 static u_int cpu_idle_repeat
= CPU_IDLE_REPEAT_DEFAULT
;
222 static u_long cpu_idle_repeat_max
= CPU_IDLE_REPEAT_DEFAULT
;
223 static u_int cpu_mwait_repeat_shift
= 1;
225 #define CPU_MWAIT_C3_PREAMBLE_BM_ARB 0x1
226 #define CPU_MWAIT_C3_PREAMBLE_BM_STS 0x2
228 static int cpu_mwait_c3_preamble
=
229 CPU_MWAIT_C3_PREAMBLE_BM_ARB
|
230 CPU_MWAIT_C3_PREAMBLE_BM_STS
;
232 SYSCTL_STRING(_machdep_mwait_CX
, OID_AUTO
, supported
, CTLFLAG_RD
,
233 cpu_mwait_cx_supported
, 0, "MWAIT supported C states");
234 SYSCTL_INT(_machdep_mwait_CX
, OID_AUTO
, c3_preamble
, CTLFLAG_RD
,
235 &cpu_mwait_c3_preamble
, 0, "C3+ preamble mask");
237 static int cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS
,
239 static int cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS
);
240 static int cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS
);
241 static int cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS
);
243 SYSCTL_PROC(_machdep_mwait_CX
, OID_AUTO
, idle
, CTLTYPE_STRING
|CTLFLAG_RW
,
244 NULL
, 0, cpu_mwait_cx_idle_sysctl
, "A", "");
245 SYSCTL_PROC(_machdep_mwait_CX
, OID_AUTO
, spin
, CTLTYPE_STRING
|CTLFLAG_RW
,
246 NULL
, 0, cpu_mwait_cx_spin_sysctl
, "A", "");
247 SYSCTL_UINT(_machdep_mwait_CX
, OID_AUTO
, repeat_shift
, CTLFLAG_RW
,
248 &cpu_mwait_repeat_shift
, 0, "");
252 u_long ebda_addr
= 0;
254 int imcr_present
= 0;
256 int naps
= 0; /* # of Applications processors */
259 struct mtx dt_lock
; /* lock for GDT and LDT */
262 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS
)
264 u_long pmem
= ctob(physmem
);
266 int error
= sysctl_handle_long(oidp
, &pmem
, 0, req
);
270 SYSCTL_PROC(_hw
, HW_PHYSMEM
, physmem
, CTLTYPE_ULONG
|CTLFLAG_RD
,
271 0, 0, sysctl_hw_physmem
, "LU", "Total system memory in bytes (number of pages * page size)");
274 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS
)
276 int error
= sysctl_handle_int(oidp
, 0,
277 ctob(physmem
- vmstats
.v_wire_count
), req
);
281 SYSCTL_PROC(_hw
, HW_USERMEM
, usermem
, CTLTYPE_INT
|CTLFLAG_RD
,
282 0, 0, sysctl_hw_usermem
, "IU", "");
285 sysctl_hw_availpages(SYSCTL_HANDLER_ARGS
)
287 int error
= sysctl_handle_int(oidp
, 0,
288 x86_64_btop(avail_end
- avail_start
), req
);
292 SYSCTL_PROC(_hw
, OID_AUTO
, availpages
, CTLTYPE_INT
|CTLFLAG_RD
,
293 0, 0, sysctl_hw_availpages
, "I", "");
299 * The number of PHYSMAP entries must be one less than the number of
300 * PHYSSEG entries because the PHYSMAP entry that spans the largest
301 * physical address that is accessible by ISA DMA is split into two
304 #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
306 vm_paddr_t phys_avail
[PHYSMAP_SIZE
+ 2];
307 vm_paddr_t dump_avail
[PHYSMAP_SIZE
+ 2];
309 /* must be 2 less so 0 0 can signal end of chunks */
310 #define PHYS_AVAIL_ARRAY_END (NELEM(phys_avail) - 2)
311 #define DUMP_AVAIL_ARRAY_END (NELEM(dump_avail) - 2)
313 static vm_offset_t buffer_sva
, buffer_eva
;
314 vm_offset_t clean_sva
, clean_eva
;
315 static vm_offset_t pager_sva
, pager_eva
;
316 static struct trapframe proc0_tf
;
319 cpu_startup(void *dummy
)
323 vm_offset_t firstaddr
;
326 * Good {morning,afternoon,evening,night}.
328 kprintf("%s", version
);
331 panicifcpuunsupported();
332 kprintf("real memory = %ju (%ju MB)\n",
334 (intmax_t)Realmem
/ 1024 / 1024);
336 * Display any holes after the first chunk of extended memory.
341 kprintf("Physical memory chunk(s):\n");
342 for (indx
= 0; phys_avail
[indx
+ 1] != 0; indx
+= 2) {
343 vm_paddr_t size1
= phys_avail
[indx
+ 1] - phys_avail
[indx
];
345 kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
346 (intmax_t)phys_avail
[indx
],
347 (intmax_t)phys_avail
[indx
+ 1] - 1,
349 (intmax_t)(size1
/ PAGE_SIZE
));
354 * Allocate space for system data structures.
355 * The first available kernel virtual address is in "v".
356 * As pages of kernel virtual memory are allocated, "v" is incremented.
357 * As pages of memory are allocated and cleared,
358 * "firstaddr" is incremented.
359 * An index into the kernel page table corresponding to the
360 * virtual memory address maintained in "v" is kept in "mapaddr".
364 * Make two passes. The first pass calculates how much memory is
365 * needed and allocates it. The second pass assigns virtual
366 * addresses to the various data structures.
370 v
= (caddr_t
)firstaddr
;
372 #define valloc(name, type, num) \
373 (name) = (type *)v; v = (caddr_t)((name)+(num))
374 #define valloclim(name, type, num, lim) \
375 (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
378 * The nominal buffer size (and minimum KVA allocation) is MAXBSIZE.
379 * For the first 64MB of ram nominally allocate sufficient buffers to
380 * cover 1/4 of our ram. Beyond the first 64MB allocate additional
381 * buffers to cover 1/20 of our ram over 64MB. When auto-sizing
382 * the buffer cache we limit the eventual kva reservation to
385 * factor represents the 1/4 x ram conversion.
388 long factor
= 4 * NBUFCALCSIZE
/ 1024;
389 long kbytes
= physmem
* (PAGE_SIZE
/ 1024);
393 nbuf
+= min((kbytes
- 4096) / factor
, 65536 / factor
);
395 nbuf
+= (kbytes
- 65536) * 2 / (factor
* 5);
396 if (maxbcache
&& nbuf
> maxbcache
/ NBUFCALCSIZE
)
397 nbuf
= maxbcache
/ NBUFCALCSIZE
;
401 * Do not allow the buffer_map to be more then 1/2 the size of the
404 if (nbuf
> (virtual_end
- virtual_start
+
405 virtual2_end
- virtual2_start
) / (MAXBSIZE
* 2)) {
406 nbuf
= (virtual_end
- virtual_start
+
407 virtual2_end
- virtual2_start
) / (MAXBSIZE
* 2);
408 kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf
);
412 * Do not allow the buffer_map to use more than 50% of available
413 * physical-equivalent memory. Since the VM pages which back
414 * individual buffers are typically wired, having too many bufs
415 * can prevent the system from paging properly.
417 if (nbuf
> physmem
* PAGE_SIZE
/ (NBUFCALCSIZE
* 2)) {
418 nbuf
= physmem
* PAGE_SIZE
/ (NBUFCALCSIZE
* 2);
419 kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf
);
423 * Do not allow the sizeof(struct buf) * nbuf to exceed half of
424 * the valloc space which is just the virtual_end - virtual_start
425 * section. We use valloc() to allocate the buf header array.
427 if (nbuf
> (virtual_end
- virtual_start
) / sizeof(struct buf
) / 2) {
428 nbuf
= (virtual_end
- virtual_start
) /
429 sizeof(struct buf
) / 2;
430 kprintf("Warning: nbufs capped at %ld due to valloc "
431 "considerations\n", nbuf
);
434 nswbuf_mem
= lmax(lmin(nbuf
/ 32, 512), 8);
436 if (nswbuf_mem
< NSWBUF_MIN
)
437 nswbuf_mem
= NSWBUF_MIN
;
439 nswbuf_kva
= lmax(lmin(nbuf
/ 4, 512), 16);
441 if (nswbuf_kva
< NSWBUF_MIN
)
442 nswbuf_kva
= NSWBUF_MIN
;
448 valloc(swbuf_mem
, struct buf
, nswbuf_mem
);
449 valloc(swbuf_kva
, struct buf
, nswbuf_kva
);
450 valloc(buf
, struct buf
, nbuf
);
453 * End of first pass, size has been calculated so allocate memory
455 if (firstaddr
== 0) {
456 size
= (vm_size_t
)(v
- firstaddr
);
457 firstaddr
= kmem_alloc(&kernel_map
, round_page(size
));
459 panic("startup: no room for tables");
464 * End of second pass, addresses have been assigned
466 * nbuf is an int, make sure we don't overflow the field.
468 * On 64-bit systems we always reserve maximal allocations for
469 * buffer cache buffers and there are no fragmentation issues,
470 * so the KVA segment does not have to be excessively oversized.
472 if ((vm_size_t
)(v
- firstaddr
) != size
)
473 panic("startup: table size inconsistency");
475 kmem_suballoc(&kernel_map
, &clean_map
, &clean_sva
, &clean_eva
,
476 ((vm_offset_t
)(nbuf
+ 16) * MAXBSIZE
) +
477 ((nswbuf_mem
+ nswbuf_kva
) * MAXPHYS
) + pager_map_size
);
478 kmem_suballoc(&clean_map
, &buffer_map
, &buffer_sva
, &buffer_eva
,
479 ((vm_offset_t
)(nbuf
+ 16) * MAXBSIZE
));
480 buffer_map
.system_map
= 1;
481 kmem_suballoc(&clean_map
, &pager_map
, &pager_sva
, &pager_eva
,
482 ((vm_offset_t
)(nswbuf_mem
+ nswbuf_kva
) * MAXPHYS
) +
484 pager_map
.system_map
= 1;
485 kprintf("avail memory = %ju (%ju MB)\n",
486 (uintmax_t)ptoa(vmstats
.v_free_count
+ vmstats
.v_dma_pages
),
487 (uintmax_t)ptoa(vmstats
.v_free_count
+ vmstats
.v_dma_pages
) /
491 struct cpu_idle_stat
{
499 u_long mwait_cx
[CPU_MWAIT_CX_MAX
];
502 #define CPU_IDLE_STAT_HALT -1
503 #define CPU_IDLE_STAT_SPIN -2
505 static struct cpu_idle_stat cpu_idle_stats
[MAXCPU
];
508 sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS
)
510 int idx
= arg2
, cpu
, error
;
513 if (idx
== CPU_IDLE_STAT_HALT
) {
514 for (cpu
= 0; cpu
< ncpus
; ++cpu
)
515 val
+= cpu_idle_stats
[cpu
].halt
;
516 } else if (idx
== CPU_IDLE_STAT_SPIN
) {
517 for (cpu
= 0; cpu
< ncpus
; ++cpu
)
518 val
+= cpu_idle_stats
[cpu
].spin
;
520 KASSERT(idx
>= 0 && idx
< CPU_MWAIT_CX_MAX
,
521 ("invalid index %d", idx
));
522 for (cpu
= 0; cpu
< ncpus
; ++cpu
)
523 val
+= cpu_idle_stats
[cpu
].mwait_cx
[idx
];
526 error
= sysctl_handle_quad(oidp
, &val
, 0, req
);
527 if (error
|| req
->newptr
== NULL
)
530 if (idx
== CPU_IDLE_STAT_HALT
) {
531 for (cpu
= 0; cpu
< ncpus
; ++cpu
)
532 cpu_idle_stats
[cpu
].halt
= 0;
533 cpu_idle_stats
[0].halt
= val
;
534 } else if (idx
== CPU_IDLE_STAT_SPIN
) {
535 for (cpu
= 0; cpu
< ncpus
; ++cpu
)
536 cpu_idle_stats
[cpu
].spin
= 0;
537 cpu_idle_stats
[0].spin
= val
;
539 KASSERT(idx
>= 0 && idx
< CPU_MWAIT_CX_MAX
,
540 ("invalid index %d", idx
));
541 for (cpu
= 0; cpu
< ncpus
; ++cpu
)
542 cpu_idle_stats
[cpu
].mwait_cx
[idx
] = 0;
543 cpu_idle_stats
[0].mwait_cx
[idx
] = val
;
549 cpu_mwait_attach(void)
554 if (!CPU_MWAIT_HAS_CX
)
557 if (cpu_vendor_id
== CPU_VENDOR_INTEL
&&
558 (CPUID_TO_FAMILY(cpu_id
) > 0xf ||
559 (CPUID_TO_FAMILY(cpu_id
) == 0x6 &&
560 CPUID_TO_MODEL(cpu_id
) >= 0xf))) {
564 * Pentium dual-core, Core 2 and beyond do not need any
565 * additional activities to enter deep C-state, i.e. C3(+).
567 cpu_mwait_cx_no_bmarb();
569 TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts
);
571 cpu_mwait_cx_no_bmsts();
574 sbuf_new(&sb
, cpu_mwait_cx_supported
,
575 sizeof(cpu_mwait_cx_supported
), SBUF_FIXEDLEN
);
577 for (i
= 0; i
< CPU_MWAIT_CX_MAX
; ++i
) {
578 struct cpu_mwait_cx
*cx
= &cpu_mwait_cx_info
[i
];
581 ksnprintf(cx
->name
, sizeof(cx
->name
), "C%d", i
);
583 sysctl_ctx_init(&cx
->sysctl_ctx
);
584 cx
->sysctl_tree
= SYSCTL_ADD_NODE(&cx
->sysctl_ctx
,
585 SYSCTL_STATIC_CHILDREN(_machdep_mwait
), OID_AUTO
,
586 cx
->name
, CTLFLAG_RW
, NULL
, "Cx control/info");
587 if (cx
->sysctl_tree
== NULL
)
590 cx
->subcnt
= CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu
, i
);
591 SYSCTL_ADD_INT(&cx
->sysctl_ctx
,
592 SYSCTL_CHILDREN(cx
->sysctl_tree
), OID_AUTO
,
593 "subcnt", CTLFLAG_RD
, &cx
->subcnt
, 0,
595 SYSCTL_ADD_PROC(&cx
->sysctl_ctx
,
596 SYSCTL_CHILDREN(cx
->sysctl_tree
), OID_AUTO
,
597 "entered", (CTLTYPE_QUAD
| CTLFLAG_RW
), 0,
598 i
, sysctl_cpu_idle_cnt
, "Q", "# of times entered");
600 for (sub
= 0; sub
< cx
->subcnt
; ++sub
)
601 sbuf_printf(&sb
, "C%d/%d ", i
, sub
);
609 cpu_mwait_c1_hints_cnt
= cpu_mwait_cx_info
[CPU_MWAIT_C1
].subcnt
;
610 for (i
= CPU_MWAIT_C1
; i
< CPU_MWAIT_C3
; ++i
)
611 cpu_mwait_hints_cnt
+= cpu_mwait_cx_info
[i
].subcnt
;
612 cpu_mwait_hints
= kmalloc(sizeof(int) * cpu_mwait_hints_cnt
,
616 for (i
= CPU_MWAIT_C1
; i
< CPU_MWAIT_C3
; ++i
) {
619 subcnt
= cpu_mwait_cx_info
[i
].subcnt
;
620 for (j
= 0; j
< subcnt
; ++j
) {
621 KASSERT(hint_idx
< cpu_mwait_hints_cnt
,
622 ("invalid mwait hint index %d", hint_idx
));
623 cpu_mwait_hints
[hint_idx
] = MWAIT_EAX_HINT(i
, j
);
627 KASSERT(hint_idx
== cpu_mwait_hints_cnt
,
628 ("mwait hint count %d != index %d",
629 cpu_mwait_hints_cnt
, hint_idx
));
632 kprintf("MWAIT hints (%d C1 hints):\n", cpu_mwait_c1_hints_cnt
);
633 for (i
= 0; i
< cpu_mwait_hints_cnt
; ++i
) {
634 int hint
= cpu_mwait_hints
[i
];
636 kprintf(" C%d/%d hint 0x%04x\n",
637 MWAIT_EAX_TO_CX(hint
), MWAIT_EAX_TO_CX_SUB(hint
),
645 for (i
= CPU_MWAIT_C1
; i
< CPU_MWAIT_CX_MAX
; ++i
)
646 cpu_mwait_deep_hints_cnt
+= cpu_mwait_cx_info
[i
].subcnt
;
647 cpu_mwait_deep_hints
= kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt
,
651 for (i
= CPU_MWAIT_C1
; i
< CPU_MWAIT_CX_MAX
; ++i
) {
654 subcnt
= cpu_mwait_cx_info
[i
].subcnt
;
655 for (j
= 0; j
< subcnt
; ++j
) {
656 KASSERT(hint_idx
< cpu_mwait_deep_hints_cnt
,
657 ("invalid mwait deep hint index %d", hint_idx
));
658 cpu_mwait_deep_hints
[hint_idx
] = MWAIT_EAX_HINT(i
, j
);
662 KASSERT(hint_idx
== cpu_mwait_deep_hints_cnt
,
663 ("mwait deep hint count %d != index %d",
664 cpu_mwait_deep_hints_cnt
, hint_idx
));
667 kprintf("MWAIT deep hints:\n");
668 for (i
= 0; i
< cpu_mwait_deep_hints_cnt
; ++i
) {
669 int hint
= cpu_mwait_deep_hints
[i
];
671 kprintf(" C%d/%d hint 0x%04x\n",
672 MWAIT_EAX_TO_CX(hint
), MWAIT_EAX_TO_CX_SUB(hint
),
676 cpu_idle_repeat_max
= 256 * cpu_mwait_deep_hints_cnt
;
678 for (i
= 0; i
< ncpus
; ++i
) {
681 ksnprintf(name
, sizeof(name
), "idle%d", i
);
682 SYSCTL_ADD_PROC(NULL
,
683 SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX
), OID_AUTO
,
684 name
, (CTLTYPE_STRING
| CTLFLAG_RW
), &cpu_idle_stats
[i
],
685 0, cpu_mwait_cx_pcpu_idle_sysctl
, "A", "");
690 cpu_finish(void *dummy __unused
)
697 pic_finish(void *dummy __unused
)
699 /* Log ELCR information */
702 /* Log MPTABLE information */
703 mptable_pci_int_dump();
706 MachIntrABI
.finalize();
710 * Send an interrupt to process.
712 * Stack is set up to allow sigcode stored
713 * at top to call routine, followed by kcall
714 * to sigreturn routine below. After sigreturn
715 * resets the signal mask, the stack, and the
716 * frame pointer, it returns to the user
720 sendsig(sig_t catcher
, int sig
, sigset_t
*mask
, u_long code
)
722 struct lwp
*lp
= curthread
->td_lwp
;
723 struct proc
*p
= lp
->lwp_proc
;
724 struct trapframe
*regs
;
725 struct sigacts
*psp
= p
->p_sigacts
;
726 struct sigframe sf
, *sfp
;
730 regs
= lp
->lwp_md
.md_regs
;
731 oonstack
= (lp
->lwp_sigstk
.ss_flags
& SS_ONSTACK
) ? 1 : 0;
733 /* Save user context */
734 bzero(&sf
, sizeof(struct sigframe
));
735 sf
.sf_uc
.uc_sigmask
= *mask
;
736 sf
.sf_uc
.uc_stack
= lp
->lwp_sigstk
;
737 sf
.sf_uc
.uc_mcontext
.mc_onstack
= oonstack
;
738 KKASSERT(__offsetof(struct trapframe
, tf_rdi
) == 0);
739 bcopy(regs
, &sf
.sf_uc
.uc_mcontext
.mc_rdi
, sizeof(struct trapframe
));
741 /* Make the size of the saved context visible to userland */
742 sf
.sf_uc
.uc_mcontext
.mc_len
= sizeof(sf
.sf_uc
.uc_mcontext
);
744 /* Allocate and validate space for the signal handler context. */
745 if ((lp
->lwp_flags
& LWP_ALTSTACK
) != 0 && !oonstack
&&
746 SIGISMEMBER(psp
->ps_sigonstack
, sig
)) {
747 sp
= (char *)(lp
->lwp_sigstk
.ss_sp
+ lp
->lwp_sigstk
.ss_size
-
748 sizeof(struct sigframe
));
749 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
751 /* We take red zone into account */
752 sp
= (char *)regs
->tf_rsp
- sizeof(struct sigframe
) - 128;
756 * XXX AVX needs 64-byte alignment but sigframe has other fields and
757 * the embedded ucontext is not at the front, so aligning this won't
758 * help us. Fortunately we bcopy in/out of the sigframe, so the
761 * The problem though is if userland winds up trying to use the
764 sfp
= (struct sigframe
*)((intptr_t)sp
& ~(intptr_t)0xF);
766 /* Translate the signal is appropriate */
767 if (p
->p_sysent
->sv_sigtbl
) {
768 if (sig
<= p
->p_sysent
->sv_sigsize
)
769 sig
= p
->p_sysent
->sv_sigtbl
[_SIG_IDX(sig
)];
773 * Build the argument list for the signal handler.
775 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
777 regs
->tf_rdi
= sig
; /* argument 1 */
778 regs
->tf_rdx
= (register_t
)&sfp
->sf_uc
; /* argument 3 */
780 if (SIGISMEMBER(psp
->ps_siginfo
, sig
)) {
782 * Signal handler installed with SA_SIGINFO.
784 * action(signo, siginfo, ucontext)
786 regs
->tf_rsi
= (register_t
)&sfp
->sf_si
; /* argument 2 */
787 regs
->tf_rcx
= (register_t
)regs
->tf_addr
; /* argument 4 */
788 sf
.sf_ahu
.sf_action
= (__siginfohandler_t
*)catcher
;
790 /* fill siginfo structure */
791 sf
.sf_si
.si_signo
= sig
;
792 sf
.sf_si
.si_code
= code
;
793 sf
.sf_si
.si_addr
= (void *)regs
->tf_addr
;
796 * Old FreeBSD-style arguments.
798 * handler (signo, code, [uc], addr)
800 regs
->tf_rsi
= (register_t
)code
; /* argument 2 */
801 regs
->tf_rcx
= (register_t
)regs
->tf_addr
; /* argument 4 */
802 sf
.sf_ahu
.sf_handler
= catcher
;
806 * If we're a vm86 process, we want to save the segment registers.
807 * We also change eflags to be our emulated eflags, not the actual
811 if (regs
->tf_eflags
& PSL_VM
) {
812 struct trapframe_vm86
*tf
= (struct trapframe_vm86
*)regs
;
813 struct vm86_kernel
*vm86
= &lp
->lwp_thread
->td_pcb
->pcb_ext
->ext_vm86
;
815 sf
.sf_uc
.uc_mcontext
.mc_gs
= tf
->tf_vm86_gs
;
816 sf
.sf_uc
.uc_mcontext
.mc_fs
= tf
->tf_vm86_fs
;
817 sf
.sf_uc
.uc_mcontext
.mc_es
= tf
->tf_vm86_es
;
818 sf
.sf_uc
.uc_mcontext
.mc_ds
= tf
->tf_vm86_ds
;
820 if (vm86
->vm86_has_vme
== 0)
821 sf
.sf_uc
.uc_mcontext
.mc_eflags
=
822 (tf
->tf_eflags
& ~(PSL_VIF
| PSL_VIP
)) |
823 (vm86
->vm86_eflags
& (PSL_VIF
| PSL_VIP
));
826 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
827 * syscalls made by the signal handler. This just avoids
828 * wasting time for our lazy fixup of such faults. PSL_NT
829 * does nothing in vm86 mode, but vm86 programs can set it
830 * almost legitimately in probes for old cpu types.
832 tf
->tf_eflags
&= ~(PSL_VM
| PSL_NT
| PSL_VIF
| PSL_VIP
);
837 * Save the FPU state and reinit the FP unit
839 npxpush(&sf
.sf_uc
.uc_mcontext
);
842 * Copy the sigframe out to the user's stack.
844 if (copyout(&sf
, sfp
, sizeof(struct sigframe
)) != 0) {
846 * Something is wrong with the stack pointer.
847 * ...Kill the process.
852 regs
->tf_rsp
= (register_t
)sfp
;
853 regs
->tf_rip
= PS_STRINGS
- *(p
->p_sysent
->sv_szsigcode
);
856 * i386 abi specifies that the direction flag must be cleared
859 regs
->tf_rflags
&= ~(PSL_T
|PSL_D
);
862 * 64 bit mode has a code and stack selector but
863 * no data or extra selector. %fs and %gs are not
866 regs
->tf_cs
= _ucodesel
;
867 regs
->tf_ss
= _udatasel
;
872 * Sanitize the trapframe for a virtual kernel passing control to a custom
873 * VM context. Remove any items that would otherwise create a privilage
876 * XXX at the moment we allow userland to set the resume flag. Is this a
880 cpu_sanitize_frame(struct trapframe
*frame
)
882 frame
->tf_cs
= _ucodesel
;
883 frame
->tf_ss
= _udatasel
;
884 /* XXX VM (8086) mode not supported? */
885 frame
->tf_rflags
&= (PSL_RF
| PSL_USERCHANGE
| PSL_VM_UNSUPP
);
886 frame
->tf_rflags
|= PSL_RESERVED_DEFAULT
| PSL_I
;
892 * Sanitize the tls so loading the descriptor does not blow up
893 * on us. For x86_64 we don't have to do anything.
896 cpu_sanitize_tls(struct savetls
*tls
)
902 * sigreturn(ucontext_t *sigcntxp)
904 * System call to cleanup state after a signal
905 * has been taken. Reset signal mask and
906 * stack state from context left by sendsig (above).
907 * Return to previous pc and psl as specified by
908 * context left by sendsig. Check carefully to
909 * make sure that the user has not modified the
910 * state to gain improper privileges.
914 #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
915 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
918 sys_sigreturn(struct sigreturn_args
*uap
)
920 struct lwp
*lp
= curthread
->td_lwp
;
921 struct trapframe
*regs
;
929 * We have to copy the information into kernel space so userland
930 * can't modify it while we are sniffing it.
932 regs
= lp
->lwp_md
.md_regs
;
933 error
= copyin(uap
->sigcntxp
, &uc
, sizeof(uc
));
937 rflags
= ucp
->uc_mcontext
.mc_rflags
;
939 /* VM (8086) mode not supported */
940 rflags
&= ~PSL_VM_UNSUPP
;
943 if (eflags
& PSL_VM
) {
944 struct trapframe_vm86
*tf
= (struct trapframe_vm86
*)regs
;
945 struct vm86_kernel
*vm86
;
948 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
949 * set up the vm86 area, and we can't enter vm86 mode.
951 if (lp
->lwp_thread
->td_pcb
->pcb_ext
== 0)
953 vm86
= &lp
->lwp_thread
->td_pcb
->pcb_ext
->ext_vm86
;
954 if (vm86
->vm86_inited
== 0)
957 /* go back to user mode if both flags are set */
958 if ((eflags
& PSL_VIP
) && (eflags
& PSL_VIF
))
959 trapsignal(lp
, SIGBUS
, 0);
961 if (vm86
->vm86_has_vme
) {
962 eflags
= (tf
->tf_eflags
& ~VME_USERCHANGE
) |
963 (eflags
& VME_USERCHANGE
) | PSL_VM
;
965 vm86
->vm86_eflags
= eflags
; /* save VIF, VIP */
966 eflags
= (tf
->tf_eflags
& ~VM_USERCHANGE
) |
967 (eflags
& VM_USERCHANGE
) | PSL_VM
;
969 bcopy(&ucp
->uc_mcontext
.mc_gs
, tf
, sizeof(struct trapframe
));
970 tf
->tf_eflags
= eflags
;
971 tf
->tf_vm86_ds
= tf
->tf_ds
;
972 tf
->tf_vm86_es
= tf
->tf_es
;
973 tf
->tf_vm86_fs
= tf
->tf_fs
;
974 tf
->tf_vm86_gs
= tf
->tf_gs
;
975 tf
->tf_ds
= _udatasel
;
976 tf
->tf_es
= _udatasel
;
977 tf
->tf_fs
= _udatasel
;
978 tf
->tf_gs
= _udatasel
;
983 * Don't allow users to change privileged or reserved flags.
986 * XXX do allow users to change the privileged flag PSL_RF.
987 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers
988 * should sometimes set it there too. tf_eflags is kept in
989 * the signal context during signal handling and there is no
990 * other place to remember it, so the PSL_RF bit may be
991 * corrupted by the signal handler without us knowing.
992 * Corruption of the PSL_RF bit at worst causes one more or
993 * one less debugger trap, so allowing it is fairly harmless.
995 if (!EFL_SECURE(rflags
& ~PSL_RF
, regs
->tf_rflags
& ~PSL_RF
)) {
996 kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags
);
1001 * Don't allow users to load a valid privileged %cs. Let the
1002 * hardware check for invalid selectors, excess privilege in
1003 * other selectors, invalid %eip's and invalid %esp's.
1005 cs
= ucp
->uc_mcontext
.mc_cs
;
1006 if (!CS_SECURE(cs
)) {
1007 kprintf("sigreturn: cs = 0x%x\n", cs
);
1008 trapsignal(lp
, SIGBUS
, T_PROTFLT
);
1011 bcopy(&ucp
->uc_mcontext
.mc_rdi
, regs
, sizeof(struct trapframe
));
1015 * Restore the FPU state from the frame
1018 npxpop(&ucp
->uc_mcontext
);
1020 if (ucp
->uc_mcontext
.mc_onstack
& 1)
1021 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
1023 lp
->lwp_sigstk
.ss_flags
&= ~SS_ONSTACK
;
1025 lp
->lwp_sigmask
= ucp
->uc_sigmask
;
1026 SIG_CANTMASK(lp
->lwp_sigmask
);
1029 return(EJUSTRETURN
);
1033 * Machine dependent boot() routine
1035 * I haven't seen anything to put here yet
1036 * Possibly some stuff might be grafted back here from boot()
1044 * Shutdown the CPU as much as possible
1050 __asm__
__volatile("hlt");
1054 * cpu_idle() represents the idle LWKT. You cannot return from this function
1055 * (unless you want to blow things up!). Instead we look for runnable threads
1056 * and loop or halt as appropriate. Giant is not held on entry to the thread.
1058 * The main loop is entered with a critical section held, we must release
1059 * the critical section before doing anything else. lwkt_switch() will
1060 * check for pending interrupts due to entering and exiting its own
1063 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
1064 * However, there are cases where the idlethread will be entered with
1065 * the possibility that no IPI will occur and in such cases
1066 * lwkt_switch() sets TDF_IDLE_NOHLT.
1068 * NOTE: cpu_idle_repeat determines how many entries into the idle thread
1069 * must occur before it starts using ACPI halt.
1071 * NOTE: Value overridden in hammer_time().
1073 static int cpu_idle_hlt
= 2;
1074 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_hlt
, CTLFLAG_RW
,
1075 &cpu_idle_hlt
, 0, "Idle loop HLT enable");
1076 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_repeat
, CTLFLAG_RW
,
1077 &cpu_idle_repeat
, 0, "Idle entries before acpi hlt");
1079 SYSCTL_PROC(_machdep
, OID_AUTO
, cpu_idle_hltcnt
, (CTLTYPE_QUAD
| CTLFLAG_RW
),
1080 0, CPU_IDLE_STAT_HALT
, sysctl_cpu_idle_cnt
, "Q", "Idle loop entry halts");
1081 SYSCTL_PROC(_machdep
, OID_AUTO
, cpu_idle_spincnt
, (CTLTYPE_QUAD
| CTLFLAG_RW
),
1082 0, CPU_IDLE_STAT_SPIN
, sysctl_cpu_idle_cnt
, "Q", "Idle loop entry spins");
1085 cpu_idle_default_hook(void)
1088 * We must guarentee that hlt is exactly the instruction
1089 * following the sti.
1091 __asm
__volatile("sti; hlt");
1094 /* Other subsystems (e.g., ACPI) can hook this later. */
1095 void (*cpu_idle_hook
)(void) = cpu_idle_default_hook
;
1098 cpu_mwait_cx_hint(struct cpu_idle_stat
*stat
)
1107 idx
= (stat
->repeat
+ stat
->repeat_last
+ stat
->repeat_delta
) >>
1108 cpu_mwait_repeat_shift
;
1109 if (idx
>= cpu_mwait_c1_hints_cnt
) {
1110 /* Step up faster, once we walked through all C1 states */
1111 stat
->repeat_delta
+= 1 << (cpu_mwait_repeat_shift
+ 1);
1113 if (hint
== CPU_MWAIT_HINT_AUTODEEP
) {
1114 if (idx
>= cpu_mwait_deep_hints_cnt
)
1115 idx
= cpu_mwait_deep_hints_cnt
- 1;
1116 hint
= cpu_mwait_deep_hints
[idx
];
1118 if (idx
>= cpu_mwait_hints_cnt
)
1119 idx
= cpu_mwait_hints_cnt
- 1;
1120 hint
= cpu_mwait_hints
[idx
];
1123 cx_idx
= MWAIT_EAX_TO_CX(hint
);
1124 if (cx_idx
>= 0 && cx_idx
< CPU_MWAIT_CX_MAX
)
1125 stat
->mwait_cx
[cx_idx
]++;
1132 globaldata_t gd
= mycpu
;
1133 struct cpu_idle_stat
*stat
= &cpu_idle_stats
[gd
->gd_cpuid
];
1134 struct thread
*td __debugvar
= gd
->gd_curthread
;
1138 stat
->repeat
= stat
->repeat_last
= cpu_idle_repeat_max
;
1141 KKASSERT(td
->td_critcount
== 0);
1145 * See if there are any LWKTs ready to go.
1150 * When halting inside a cli we must check for reqflags
1151 * races, particularly [re]schedule requests. Running
1152 * splz() does the job.
1155 * 0 Never halt, just spin
1157 * 1 Always use HLT (or MONITOR/MWAIT if avail).
1159 * Better default for modern (Haswell+) Intel
1162 * 2 Use HLT/MONITOR/MWAIT up to a point and then
1163 * use the ACPI halt (default). This is a hybrid
1164 * approach. See machdep.cpu_idle_repeat.
1166 * Better default for modern AMD cpus and older
1169 * 3 Always use the ACPI halt. This typically
1170 * eats the least amount of power but the cpu
1171 * will be slow waking up. Slows down e.g.
1172 * compiles and other pipe/event oriented stuff.
1176 * NOTE: Interrupts are enabled and we are not in a critical
1179 * NOTE: Preemptions do not reset gd_idle_repeat. Also we
1180 * don't bother capping gd_idle_repeat, it is ok if
1183 * Implement optimized invltlb operations when halted
1184 * in idle. By setting the bit in smp_idleinvl_mask
1185 * we inform other cpus that they can set _reqs to
1186 * request an invltlb. Current the code to do that
1187 * sets the bits in _reqs anyway, but then check _mask
1188 * to determine if they can assume the invltlb will execute.
1190 * A critical section is required to ensure that interrupts
1191 * do not fully run until after we've had a chance to execute
1194 if (gd
->gd_idle_repeat
== 0) {
1195 stat
->repeat
= (stat
->repeat
+ stat
->repeat_last
) >> 1;
1196 if (stat
->repeat
> cpu_idle_repeat_max
)
1197 stat
->repeat
= cpu_idle_repeat_max
;
1198 stat
->repeat_last
= 0;
1199 stat
->repeat_delta
= 0;
1201 ++stat
->repeat_last
;
1203 ++gd
->gd_idle_repeat
;
1204 reqflags
= gd
->gd_reqflags
;
1205 quick
= (cpu_idle_hlt
== 1) ||
1206 (cpu_idle_hlt
< 3 &&
1207 gd
->gd_idle_repeat
< cpu_idle_repeat
);
1209 if (quick
&& (cpu_mi_feature
& CPU_MI_MONITOR
) &&
1210 (reqflags
& RQF_IDLECHECK_WK_MASK
) == 0) {
1213 ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask
, gd
->gd_cpuid
);
1214 cpu_mmw_pause_int(&gd
->gd_reqflags
, reqflags
,
1215 cpu_mwait_cx_hint(stat
), 0);
1217 ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask
, gd
->gd_cpuid
);
1218 if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs
,
1224 } else if (cpu_idle_hlt
) {
1225 __asm
__volatile("cli");
1228 ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask
, gd
->gd_cpuid
);
1229 if ((gd
->gd_reqflags
& RQF_IDLECHECK_WK_MASK
) == 0) {
1231 cpu_idle_default_hook();
1235 __asm
__volatile("sti");
1237 ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask
, gd
->gd_cpuid
);
1238 if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs
,
1246 __asm
__volatile("sti");
1253 * This routine is called if a spinlock has been held through the
1254 * exponential backoff period and is seriously contested. On a real cpu
1258 cpu_spinlock_contested(void)
1264 * Clear registers on exec
1267 exec_setregs(u_long entry
, u_long stack
, u_long ps_strings
)
1269 struct thread
*td
= curthread
;
1270 struct lwp
*lp
= td
->td_lwp
;
1271 struct pcb
*pcb
= td
->td_pcb
;
1272 struct trapframe
*regs
= lp
->lwp_md
.md_regs
;
1274 /* was i386_user_cleanup() in NetBSD */
1278 bzero((char *)regs
, sizeof(struct trapframe
));
1279 regs
->tf_rip
= entry
;
1280 regs
->tf_rsp
= ((stack
- 8) & ~0xFul
) + 8; /* align the stack */
1281 regs
->tf_rdi
= stack
; /* argv */
1282 regs
->tf_rflags
= PSL_USER
| (regs
->tf_rflags
& PSL_T
);
1283 regs
->tf_ss
= _udatasel
;
1284 regs
->tf_cs
= _ucodesel
;
1285 regs
->tf_rbx
= ps_strings
;
1288 * Reset the hardware debug registers if they were in use.
1289 * They won't have any meaning for the newly exec'd process.
1291 if (pcb
->pcb_flags
& PCB_DBREGS
) {
1297 pcb
->pcb_dr7
= 0; /* JG set bit 10? */
1298 if (pcb
== td
->td_pcb
) {
1300 * Clear the debug registers on the running
1301 * CPU, otherwise they will end up affecting
1302 * the next process we switch to.
1306 pcb
->pcb_flags
&= ~PCB_DBREGS
;
1310 * Initialize the math emulator (if any) for the current process.
1311 * Actually, just clear the bit that says that the emulator has
1312 * been initialized. Initialization is delayed until the process
1313 * traps to the emulator (if it is done at all) mainly because
1314 * emulators don't provide an entry point for initialization.
1316 pcb
->pcb_flags
&= ~FP_SOFTFP
;
1319 * NOTE: do not set CR0_TS here. npxinit() must do it after clearing
1320 * gd_npxthread. Otherwise a preemptive interrupt thread
1321 * may panic in npxdna().
1324 load_cr0(rcr0() | CR0_MP
);
1327 * NOTE: The MSR values must be correct so we can return to
1328 * userland. gd_user_fs/gs must be correct so the switch
1329 * code knows what the current MSR values are.
1331 pcb
->pcb_fsbase
= 0; /* Values loaded from PCB on switch */
1332 pcb
->pcb_gsbase
= 0;
1333 mdcpu
->gd_user_fs
= 0; /* Cache of current MSR values */
1334 mdcpu
->gd_user_gs
= 0;
1335 wrmsr(MSR_FSBASE
, 0); /* Set MSR values for return to userland */
1336 wrmsr(MSR_KGSBASE
, 0);
1338 /* Initialize the npx (if any) for the current process. */
1342 pcb
->pcb_ds
= _udatasel
;
1343 pcb
->pcb_es
= _udatasel
;
1344 pcb
->pcb_fs
= _udatasel
;
1345 pcb
->pcb_gs
= _udatasel
;
1354 cr0
|= CR0_NE
; /* Done by npxinit() */
1355 cr0
|= CR0_MP
| CR0_TS
; /* Done at every execve() too. */
1356 cr0
|= CR0_WP
| CR0_AM
;
1362 sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS
)
1365 error
= sysctl_handle_int(oidp
, oidp
->oid_arg1
, oidp
->oid_arg2
,
1367 if (!error
&& req
->newptr
)
1372 SYSCTL_PROC(_machdep
, CPU_ADJKERNTZ
, adjkerntz
, CTLTYPE_INT
|CTLFLAG_RW
,
1373 &adjkerntz
, 0, sysctl_machdep_adjkerntz
, "I", "");
1375 SYSCTL_INT(_machdep
, CPU_DISRTCSET
, disable_rtc_set
,
1376 CTLFLAG_RW
, &disable_rtc_set
, 0, "");
1379 SYSCTL_STRUCT(_machdep
, CPU_BOOTINFO
, bootinfo
,
1380 CTLFLAG_RD
, &bootinfo
, bootinfo
, "");
1383 SYSCTL_INT(_machdep
, CPU_WALLCLOCK
, wall_cmos_clock
,
1384 CTLFLAG_RW
, &wall_cmos_clock
, 0, "");
1386 extern u_long bootdev
; /* not a cdev_t - encoding is different */
1387 SYSCTL_ULONG(_machdep
, OID_AUTO
, guessed_bootdev
,
1388 CTLFLAG_RD
, &bootdev
, 0, "Boot device (not in cdev_t format)");
1391 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS
)
1393 struct efi_map_header
*efihdr
;
1397 kmdp
= preload_search_by_type("elf kernel");
1399 kmdp
= preload_search_by_type("elf64 kernel");
1400 efihdr
= (struct efi_map_header
*)preload_search_info(kmdp
,
1401 MODINFO_METADATA
| MODINFOMD_EFI_MAP
);
1404 efisize
= *((uint32_t *)efihdr
- 1);
1405 return (SYSCTL_OUT(req
, efihdr
, efisize
));
1407 SYSCTL_PROC(_machdep
, OID_AUTO
, efi_map
, CTLTYPE_OPAQUE
|CTLFLAG_RD
, NULL
, 0,
1408 efi_map_sysctl_handler
, "S,efi_map_header", "Raw EFI Memory Map");
1411 * Initialize 386 and configure to run kernel
1415 * Initialize segments & interrupt table
1419 struct user_segment_descriptor gdt
[NGDT
* MAXCPU
]; /* global descriptor table */
1420 struct gate_descriptor idt_arr
[MAXCPU
][NIDT
];
1422 union descriptor ldt
[NLDT
]; /* local descriptor table */
1425 /* table descriptors - used to load tables by cpu */
1426 struct region_descriptor r_gdt
;
1427 struct region_descriptor r_idt_arr
[MAXCPU
];
1429 /* JG proc0paddr is a virtual address */
1432 char proc0paddr_buff
[LWKT_THREAD_STACK
];
1435 /* software prototypes -- in more palatable form */
1436 struct soft_segment_descriptor gdt_segs
[] = {
1437 /* GNULL_SEL 0 Null Descriptor */
1438 { 0x0, /* segment base address */
1440 0, /* segment type */
1441 0, /* segment descriptor priority level */
1442 0, /* segment descriptor present */
1444 0, /* default 32 vs 16 bit size */
1445 0 /* limit granularity (byte/page units)*/ },
1446 /* GCODE_SEL 1 Code Descriptor for kernel */
1447 { 0x0, /* segment base address */
1448 0xfffff, /* length - all address space */
1449 SDT_MEMERA
, /* segment type */
1450 SEL_KPL
, /* segment descriptor priority level */
1451 1, /* segment descriptor present */
1453 0, /* default 32 vs 16 bit size */
1454 1 /* limit granularity (byte/page units)*/ },
1455 /* GDATA_SEL 2 Data Descriptor for kernel */
1456 { 0x0, /* segment base address */
1457 0xfffff, /* length - all address space */
1458 SDT_MEMRWA
, /* segment type */
1459 SEL_KPL
, /* segment descriptor priority level */
1460 1, /* segment descriptor present */
1462 0, /* default 32 vs 16 bit size */
1463 1 /* limit granularity (byte/page units)*/ },
1464 /* GUCODE32_SEL 3 32 bit Code Descriptor for user */
1465 { 0x0, /* segment base address */
1466 0xfffff, /* length - all address space */
1467 SDT_MEMERA
, /* segment type */
1468 SEL_UPL
, /* segment descriptor priority level */
1469 1, /* segment descriptor present */
1471 1, /* default 32 vs 16 bit size */
1472 1 /* limit granularity (byte/page units)*/ },
1473 /* GUDATA_SEL 4 32/64 bit Data Descriptor for user */
1474 { 0x0, /* segment base address */
1475 0xfffff, /* length - all address space */
1476 SDT_MEMRWA
, /* segment type */
1477 SEL_UPL
, /* segment descriptor priority level */
1478 1, /* segment descriptor present */
1480 1, /* default 32 vs 16 bit size */
1481 1 /* limit granularity (byte/page units)*/ },
1482 /* GUCODE_SEL 5 64 bit Code Descriptor for user */
1483 { 0x0, /* segment base address */
1484 0xfffff, /* length - all address space */
1485 SDT_MEMERA
, /* segment type */
1486 SEL_UPL
, /* segment descriptor priority level */
1487 1, /* segment descriptor present */
1489 0, /* default 32 vs 16 bit size */
1490 1 /* limit granularity (byte/page units)*/ },
1491 /* GPROC0_SEL 6 Proc 0 Tss Descriptor */
1493 0x0, /* segment base address */
1494 sizeof(struct x86_64tss
)-1,/* length - all address space */
1495 SDT_SYSTSS
, /* segment type */
1496 SEL_KPL
, /* segment descriptor priority level */
1497 1, /* segment descriptor present */
1499 0, /* unused - default 32 vs 16 bit size */
1500 0 /* limit granularity (byte/page units)*/ },
1501 /* Actually, the TSS is a system descriptor which is double size */
1502 { 0x0, /* segment base address */
1504 0, /* segment type */
1505 0, /* segment descriptor priority level */
1506 0, /* segment descriptor present */
1508 0, /* default 32 vs 16 bit size */
1509 0 /* limit granularity (byte/page units)*/ },
1510 /* GUGS32_SEL 8 32 bit GS Descriptor for user */
1511 { 0x0, /* segment base address */
1512 0xfffff, /* length - all address space */
1513 SDT_MEMRWA
, /* segment type */
1514 SEL_UPL
, /* segment descriptor priority level */
1515 1, /* segment descriptor present */
1517 1, /* default 32 vs 16 bit size */
1518 1 /* limit granularity (byte/page units)*/ },
1522 setidt_global(int idx
, inthand_t
*func
, int typ
, int dpl
, int ist
)
1526 for (cpu
= 0; cpu
< MAXCPU
; ++cpu
) {
1527 struct gate_descriptor
*ip
= &idt_arr
[cpu
][idx
];
1529 ip
->gd_looffset
= (uintptr_t)func
;
1530 ip
->gd_selector
= GSEL(GCODE_SEL
, SEL_KPL
);
1536 ip
->gd_hioffset
= ((uintptr_t)func
)>>16 ;
1541 setidt(int idx
, inthand_t
*func
, int typ
, int dpl
, int ist
, int cpu
)
1543 struct gate_descriptor
*ip
;
1545 KASSERT(cpu
>= 0 && cpu
< ncpus
, ("invalid cpu %d", cpu
));
1547 ip
= &idt_arr
[cpu
][idx
];
1548 ip
->gd_looffset
= (uintptr_t)func
;
1549 ip
->gd_selector
= GSEL(GCODE_SEL
, SEL_KPL
);
1555 ip
->gd_hioffset
= ((uintptr_t)func
)>>16 ;
1558 #define IDTVEC(name) __CONCAT(X,name)
1561 IDTVEC(div
), IDTVEC(dbg
), IDTVEC(nmi
), IDTVEC(bpt
), IDTVEC(ofl
),
1562 IDTVEC(bnd
), IDTVEC(ill
), IDTVEC(dna
), IDTVEC(fpusegm
),
1563 IDTVEC(tss
), IDTVEC(missing
), IDTVEC(stk
), IDTVEC(prot
),
1564 IDTVEC(page
), IDTVEC(mchk
), IDTVEC(rsvd
), IDTVEC(fpu
), IDTVEC(align
),
1565 IDTVEC(xmm
), IDTVEC(dblfault
),
1566 IDTVEC(fast_syscall
), IDTVEC(fast_syscall32
);
1569 sdtossd(struct user_segment_descriptor
*sd
, struct soft_segment_descriptor
*ssd
)
1571 ssd
->ssd_base
= (sd
->sd_hibase
<< 24) | sd
->sd_lobase
;
1572 ssd
->ssd_limit
= (sd
->sd_hilimit
<< 16) | sd
->sd_lolimit
;
1573 ssd
->ssd_type
= sd
->sd_type
;
1574 ssd
->ssd_dpl
= sd
->sd_dpl
;
1575 ssd
->ssd_p
= sd
->sd_p
;
1576 ssd
->ssd_def32
= sd
->sd_def32
;
1577 ssd
->ssd_gran
= sd
->sd_gran
;
1581 ssdtosd(struct soft_segment_descriptor
*ssd
, struct user_segment_descriptor
*sd
)
1584 sd
->sd_lobase
= (ssd
->ssd_base
) & 0xffffff;
1585 sd
->sd_hibase
= (ssd
->ssd_base
>> 24) & 0xff;
1586 sd
->sd_lolimit
= (ssd
->ssd_limit
) & 0xffff;
1587 sd
->sd_hilimit
= (ssd
->ssd_limit
>> 16) & 0xf;
1588 sd
->sd_type
= ssd
->ssd_type
;
1589 sd
->sd_dpl
= ssd
->ssd_dpl
;
1590 sd
->sd_p
= ssd
->ssd_p
;
1591 sd
->sd_long
= ssd
->ssd_long
;
1592 sd
->sd_def32
= ssd
->ssd_def32
;
1593 sd
->sd_gran
= ssd
->ssd_gran
;
1597 ssdtosyssd(struct soft_segment_descriptor
*ssd
,
1598 struct system_segment_descriptor
*sd
)
1601 sd
->sd_lobase
= (ssd
->ssd_base
) & 0xffffff;
1602 sd
->sd_hibase
= (ssd
->ssd_base
>> 24) & 0xfffffffffful
;
1603 sd
->sd_lolimit
= (ssd
->ssd_limit
) & 0xffff;
1604 sd
->sd_hilimit
= (ssd
->ssd_limit
>> 16) & 0xf;
1605 sd
->sd_type
= ssd
->ssd_type
;
1606 sd
->sd_dpl
= ssd
->ssd_dpl
;
1607 sd
->sd_p
= ssd
->ssd_p
;
1608 sd
->sd_gran
= ssd
->ssd_gran
;
1612 * Populate the (physmap) array with base/bound pairs describing the
1613 * available physical memory in the system, then test this memory and
1614 * build the phys_avail array describing the actually-available memory.
1616 * If we cannot accurately determine the physical memory map, then use
1617 * value from the 0xE801 call, and failing that, the RTC.
1619 * Total memory size may be set by the kernel environment variable
1620 * hw.physmem or the compile-time define MAXMEM.
1622 * Memory is aligned to PHYSMAP_ALIGN which must be a multiple
1623 * of PAGE_SIZE. This also greatly reduces the memory test time
1624 * which would otherwise be excessive on machines with > 8G of ram.
1626 * XXX first should be vm_paddr_t.
1629 #define PHYSMAP_ALIGN (vm_paddr_t)(128 * 1024)
1630 #define PHYSMAP_ALIGN_MASK (vm_paddr_t)(PHYSMAP_ALIGN - 1)
1631 vm_paddr_t physmap
[PHYSMAP_SIZE
];
1632 struct bios_smap
*smapbase
, *smap
, *smapend
;
1633 struct efi_map_header
*efihdrbase
;
1635 #define PHYSMAP_HANDWAVE (vm_paddr_t)(2 * 1024 * 1024)
1636 #define PHYSMAP_HANDWAVE_MASK (PHYSMAP_HANDWAVE - 1)
1639 add_smap_entries(int *physmap_idx
)
1643 smapsize
= *((u_int32_t
*)smapbase
- 1);
1644 smapend
= (struct bios_smap
*)((uintptr_t)smapbase
+ smapsize
);
1646 for (smap
= smapbase
; smap
< smapend
; smap
++) {
1647 if (boothowto
& RB_VERBOSE
)
1648 kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
1649 smap
->type
, smap
->base
, smap
->length
);
1651 if (smap
->type
!= SMAP_TYPE_MEMORY
)
1654 if (smap
->length
== 0)
1657 for (i
= 0; i
<= *physmap_idx
; i
+= 2) {
1658 if (smap
->base
< physmap
[i
+ 1]) {
1659 if (boothowto
& RB_VERBOSE
) {
1660 kprintf("Overlapping or non-monotonic "
1661 "memory region, ignoring "
1667 if (i
<= *physmap_idx
)
1670 Realmem
+= smap
->length
;
1672 if (smap
->base
== physmap
[*physmap_idx
+ 1]) {
1673 physmap
[*physmap_idx
+ 1] += smap
->length
;
1678 if (*physmap_idx
== PHYSMAP_SIZE
) {
1679 kprintf("Too many segments in the physical "
1680 "address map, giving up\n");
1683 physmap
[*physmap_idx
] = smap
->base
;
1684 physmap
[*physmap_idx
+ 1] = smap
->base
+ smap
->length
;
1689 add_efi_map_entries(int *physmap_idx
)
1691 struct efi_md
*map
, *p
;
1696 static const char *types
[] = {
1702 "RuntimeServicesCode",
1703 "RuntimeServicesData",
1704 "ConventionalMemory",
1706 "ACPIReclaimMemory",
1709 "MemoryMappedIOPortSpace",
1714 * Memory map data provided by UEFI via the GetMemoryMap
1715 * Boot Services API.
1717 efisz
= (sizeof(struct efi_map_header
) + 0xf) & ~0xf;
1718 map
= (struct efi_md
*)((uint8_t *)efihdrbase
+ efisz
);
1720 if (efihdrbase
->descriptor_size
== 0)
1722 ndesc
= efihdrbase
->memory_size
/ efihdrbase
->descriptor_size
;
1724 if (boothowto
& RB_VERBOSE
)
1725 kprintf("%23s %12s %12s %8s %4s\n",
1726 "Type", "Physical", "Virtual", "#Pages", "Attr");
1728 for (i
= 0, p
= map
; i
< ndesc
; i
++,
1729 p
= efi_next_descriptor(p
, efihdrbase
->descriptor_size
)) {
1730 if (boothowto
& RB_VERBOSE
) {
1731 if (p
->md_type
<= EFI_MD_TYPE_PALCODE
)
1732 type
= types
[p
->md_type
];
1735 kprintf("%23s %012lx %12p %08lx ", type
, p
->md_phys
,
1736 p
->md_virt
, p
->md_pages
);
1737 if (p
->md_attr
& EFI_MD_ATTR_UC
)
1739 if (p
->md_attr
& EFI_MD_ATTR_WC
)
1741 if (p
->md_attr
& EFI_MD_ATTR_WT
)
1743 if (p
->md_attr
& EFI_MD_ATTR_WB
)
1745 if (p
->md_attr
& EFI_MD_ATTR_UCE
)
1747 if (p
->md_attr
& EFI_MD_ATTR_WP
)
1749 if (p
->md_attr
& EFI_MD_ATTR_RP
)
1751 if (p
->md_attr
& EFI_MD_ATTR_XP
)
1753 if (p
->md_attr
& EFI_MD_ATTR_RT
)
1758 switch (p
->md_type
) {
1759 case EFI_MD_TYPE_CODE
:
1760 case EFI_MD_TYPE_DATA
:
1761 case EFI_MD_TYPE_BS_CODE
:
1762 case EFI_MD_TYPE_BS_DATA
:
1763 case EFI_MD_TYPE_FREE
:
1765 * We're allowed to use any entry with these types.
1772 Realmem
+= p
->md_pages
* PAGE_SIZE
;
1774 if (p
->md_phys
== physmap
[*physmap_idx
+ 1]) {
1775 physmap
[*physmap_idx
+ 1] += p
->md_pages
* PAGE_SIZE
;
1780 if (*physmap_idx
== PHYSMAP_SIZE
) {
1781 kprintf("Too many segments in the physical "
1782 "address map, giving up\n");
1785 physmap
[*physmap_idx
] = p
->md_phys
;
1786 physmap
[*physmap_idx
+ 1] = p
->md_phys
+ p
->md_pages
* PAGE_SIZE
;
1790 struct fb_info efi_fb_info
;
1791 static int have_efi_framebuffer
= 0;
1794 efi_fb_init_vaddr(int direct_map
)
1797 vm_offset_t addr
, v
;
1799 v
= efi_fb_info
.vaddr
;
1800 sz
= efi_fb_info
.stride
* efi_fb_info
.height
;
1803 addr
= PHYS_TO_DMAP(efi_fb_info
.paddr
);
1804 if (addr
>= DMAP_MIN_ADDRESS
&& addr
+ sz
< DMAP_MAX_ADDRESS
)
1805 efi_fb_info
.vaddr
= addr
;
1807 efi_fb_info
.vaddr
= (vm_offset_t
)pmap_mapdev_attr(
1808 efi_fb_info
.paddr
, sz
, PAT_WRITE_COMBINING
);
1811 if (v
== 0 && efi_fb_info
.vaddr
!= 0)
1812 memset((void *)efi_fb_info
.vaddr
, 0x77, sz
);
1816 probe_efi_fb(int early
)
1818 struct efi_fb
*efifb
;
1821 if (have_efi_framebuffer
) {
1823 (efi_fb_info
.vaddr
== 0 ||
1824 efi_fb_info
.vaddr
== PHYS_TO_DMAP(efi_fb_info
.paddr
)))
1825 efi_fb_init_vaddr(0);
1829 kmdp
= preload_search_by_type("elf kernel");
1831 kmdp
= preload_search_by_type("elf64 kernel");
1832 efifb
= (struct efi_fb
*)preload_search_info(kmdp
,
1833 MODINFO_METADATA
| MODINFOMD_EFI_FB
);
1837 have_efi_framebuffer
= 1;
1839 efi_fb_info
.is_vga_boot_display
= 1;
1840 efi_fb_info
.width
= efifb
->fb_width
;
1841 efi_fb_info
.height
= efifb
->fb_height
;
1842 efi_fb_info
.stride
= efifb
->fb_stride
* 4;
1843 efi_fb_info
.depth
= 32;
1844 efi_fb_info
.paddr
= efifb
->fb_addr
;
1846 efi_fb_info
.vaddr
= 0;
1848 efi_fb_init_vaddr(0);
1850 efi_fb_info
.restore
= NULL
;
1851 efi_fb_info
.device
= NULL
;
1857 efifb_startup(void *arg
)
1862 SYSINIT(efi_fb_info
, SI_BOOT1_POST
, SI_ORDER_FIRST
, efifb_startup
, NULL
);
1865 getmemsize(caddr_t kmdp
, u_int64_t first
)
1867 int off
, physmap_idx
, pa_indx
, da_indx
;
1870 vm_paddr_t msgbuf_size
;
1871 u_long physmem_tunable
;
1873 quad_t dcons_addr
, dcons_size
;
1875 bzero(physmap
, sizeof(physmap
));
1879 * get memory map from INT 15:E820, kindly supplied by the loader.
1881 * subr_module.c says:
1882 * "Consumer may safely assume that size value precedes data."
1883 * ie: an int32_t immediately precedes smap.
1885 efihdrbase
= (struct efi_map_header
*)preload_search_info(kmdp
,
1886 MODINFO_METADATA
| MODINFOMD_EFI_MAP
);
1887 smapbase
= (struct bios_smap
*)preload_search_info(kmdp
,
1888 MODINFO_METADATA
| MODINFOMD_SMAP
);
1889 if (smapbase
== NULL
&& efihdrbase
== NULL
)
1890 panic("No BIOS smap or EFI map info from loader!");
1892 if (efihdrbase
== NULL
)
1893 add_smap_entries(&physmap_idx
);
1895 add_efi_map_entries(&physmap_idx
);
1897 base_memory
= physmap
[1] / 1024;
1898 /* make hole for AP bootstrap code */
1899 physmap
[1] = mp_bootaddress(base_memory
);
1901 /* Save EBDA address, if any */
1902 ebda_addr
= (u_long
)(*(u_short
*)(KERNBASE
+ 0x40e));
1906 * Maxmem isn't the "maximum memory", it's one larger than the
1907 * highest page of the physical address space. It should be
1908 * called something like "Maxphyspage". We may adjust this
1909 * based on ``hw.physmem'' and the results of the memory test.
1911 Maxmem
= atop(physmap
[physmap_idx
+ 1]);
1914 Maxmem
= MAXMEM
/ 4;
1917 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable
))
1918 Maxmem
= atop(physmem_tunable
);
1921 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
1924 if (Maxmem
> atop(physmap
[physmap_idx
+ 1]))
1925 Maxmem
= atop(physmap
[physmap_idx
+ 1]);
1928 * Blowing out the DMAP will blow up the system.
1930 if (Maxmem
> atop(DMAP_MAX_ADDRESS
- DMAP_MIN_ADDRESS
)) {
1931 kprintf("Limiting Maxmem due to DMAP size\n");
1932 Maxmem
= atop(DMAP_MAX_ADDRESS
- DMAP_MIN_ADDRESS
);
1935 if (atop(physmap
[physmap_idx
+ 1]) != Maxmem
&&
1936 (boothowto
& RB_VERBOSE
)) {
1937 kprintf("Physical memory use set to %ldK\n", Maxmem
* 4);
1941 * Call pmap initialization to make new kernel address space
1945 pmap_bootstrap(&first
);
1946 physmap
[0] = PAGE_SIZE
;
1949 * Align the physmap to PHYSMAP_ALIGN and cut out anything
1952 for (i
= j
= 0; i
<= physmap_idx
; i
+= 2) {
1953 if (physmap
[i
+1] > ptoa(Maxmem
))
1954 physmap
[i
+1] = ptoa(Maxmem
);
1955 physmap
[i
] = (physmap
[i
] + PHYSMAP_ALIGN_MASK
) &
1956 ~PHYSMAP_ALIGN_MASK
;
1957 physmap
[i
+1] = physmap
[i
+1] & ~PHYSMAP_ALIGN_MASK
;
1959 physmap
[j
] = physmap
[i
];
1960 physmap
[j
+1] = physmap
[i
+1];
1962 if (physmap
[i
] < physmap
[i
+1])
1965 physmap_idx
= j
- 2;
1968 * Align anything else used in the validation loop.
1970 first
= (first
+ PHYSMAP_ALIGN_MASK
) & ~PHYSMAP_ALIGN_MASK
;
1973 * Size up each available chunk of physical memory.
1977 phys_avail
[pa_indx
++] = physmap
[0];
1978 phys_avail
[pa_indx
] = physmap
[0];
1979 dump_avail
[da_indx
] = physmap
[0];
1983 * Get dcons buffer address
1985 if (kgetenv_quad("dcons.addr", &dcons_addr
) == 0 ||
1986 kgetenv_quad("dcons.size", &dcons_size
) == 0)
1990 * Validate the physical memory. The physical memory segments
1991 * have already been aligned to PHYSMAP_ALIGN which is a multiple
1994 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1996 vm_paddr_t incr
= PHYSMAP_ALIGN
;
1998 end
= physmap
[i
+ 1];
2000 for (pa
= physmap
[i
]; pa
< end
; pa
+= incr
) {
2002 volatile uint64_t *ptr
= (uint64_t *)CADDR1
;
2005 incr
= PHYSMAP_ALIGN
;
2009 * block out kernel memory as not available.
2011 if (pa
>= 0x200000 && pa
< first
)
2015 * block out dcons buffer
2018 && pa
>= trunc_page(dcons_addr
)
2019 && pa
< dcons_addr
+ dcons_size
) {
2026 * Always test the first and last block supplied in
2027 * the map entry, but it just takes too long to run
2028 * the test these days and we already have to skip
2029 * pages. Handwave it on PHYSMAP_HANDWAVE boundaries.
2031 if (pa
!= physmap
[i
]) {
2032 vm_paddr_t bytes
= end
- pa
;
2033 if ((pa
& PHYSMAP_HANDWAVE_MASK
) == 0 &&
2034 bytes
>= PHYSMAP_HANDWAVE
+ PHYSMAP_ALIGN
) {
2035 incr
= PHYSMAP_HANDWAVE
;
2041 * map page into kernel: valid, read/write,non-cacheable
2044 kernel_pmap
.pmap_bits
[PG_V_IDX
] |
2045 kernel_pmap
.pmap_bits
[PG_RW_IDX
] |
2046 kernel_pmap
.pmap_bits
[PG_N_IDX
];
2047 cpu_invlpg(__DEVOLATILE(void *, ptr
));
2052 * Test for alternating 1's and 0's
2054 *ptr
= 0xaaaaaaaaaaaaaaaaLLU
;
2056 if (*ptr
!= 0xaaaaaaaaaaaaaaaaLLU
)
2059 * Test for alternating 0's and 1's
2061 *ptr
= 0x5555555555555555LLU
;
2063 if (*ptr
!= 0x5555555555555555LLU
)
2068 *ptr
= 0xffffffffffffffffLLU
;
2070 if (*ptr
!= 0xffffffffffffffffLLU
)
2080 * Restore original value.
2086 * Adjust array of valid/good pages.
2088 if (page_bad
== TRUE
)
2092 * If this good page is a continuation of the
2093 * previous set of good pages, then just increase
2094 * the end pointer. Otherwise start a new chunk.
2095 * Note that "end" points one higher than end,
2096 * making the range >= start and < end.
2097 * If we're also doing a speculative memory
2098 * test and we at or past the end, bump up Maxmem
2099 * so that we keep going. The first bad page
2100 * will terminate the loop.
2102 if (phys_avail
[pa_indx
] == pa
) {
2103 phys_avail
[pa_indx
] += incr
;
2106 if (pa_indx
== PHYS_AVAIL_ARRAY_END
) {
2108 "Too many holes in the physical address space, giving up\n");
2113 phys_avail
[pa_indx
++] = pa
;
2114 phys_avail
[pa_indx
] = pa
+ incr
;
2116 physmem
+= incr
/ PAGE_SIZE
;
2118 if (dump_avail
[da_indx
] == pa
) {
2119 dump_avail
[da_indx
] += incr
;
2122 if (da_indx
== DUMP_AVAIL_ARRAY_END
) {
2126 dump_avail
[da_indx
++] = pa
;
2127 dump_avail
[da_indx
] = pa
+ incr
;
2139 * The last chunk must contain at least one page plus the message
2140 * buffer to avoid complicating other code (message buffer address
2141 * calculation, etc.).
2143 msgbuf_size
= (MSGBUF_SIZE
+ PHYSMAP_ALIGN_MASK
) & ~PHYSMAP_ALIGN_MASK
;
2145 while (phys_avail
[pa_indx
- 1] + PHYSMAP_ALIGN
+
2146 msgbuf_size
>= phys_avail
[pa_indx
]) {
2147 physmem
-= atop(phys_avail
[pa_indx
] - phys_avail
[pa_indx
- 1]);
2148 phys_avail
[pa_indx
--] = 0;
2149 phys_avail
[pa_indx
--] = 0;
2152 Maxmem
= atop(phys_avail
[pa_indx
]);
2154 /* Trim off space for the message buffer. */
2155 phys_avail
[pa_indx
] -= msgbuf_size
;
2157 avail_end
= phys_avail
[pa_indx
];
2159 /* Map the message buffer. */
2160 for (off
= 0; off
< msgbuf_size
; off
+= PAGE_SIZE
) {
2161 pmap_kenter((vm_offset_t
)msgbufp
+ off
,
2162 phys_avail
[pa_indx
] + off
);
	/* Try to get EFI framebuffer working as early as possible */
	if (have_efi_framebuffer)
		efi_fb_init_vaddr(1);
struct machintr_abi MachIntrABI;
/*
 *	7	Device Not Available (x87)
 *	9	Coprocessor Segment overrun (unsupported, reserved)
 *	11	Segment not present
 *	13	General Protection
 *	16	x87 FP Exception pending
 *	17	Alignment Check
 *	19	SIMD floating point
 *	32-255	INTn/external sources
 */
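/*
 * hammer_time() is the machine-dependent early boot entry, called from
 * locore with the loader module pointer and the first free physical
 * address.  It wires up the per-cpu data, GDT/IDT/TSS, fast-syscall MSRs
 * and the physical memory map, and returns the kernel stack location
 * that locore loads before the kernel proper starts running.
 */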
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x, cpu;
	int metadata_missing, off;
	struct mdglobaldata *gd;
	u_int64_t msr;
	/*
	 * Prevent lowering of the ipl if we call tsleep() early.
	 */
	gd = &CPU_prvspace[0]->mdglobaldata;
	bzero(gd, sizeof(*gd));
	/*
	 * Note: on both UP and SMP curthread must be set non-NULL
	 * early in the boot sequence because the system assumes
	 * that 'curthread' is never NULL.
	 */
	gd->mi.gd_curthread = &thread0;
	thread0.td_gd = &gd->mi;

	atdevbase = ISA_HOLE_START + PTOV_OFFSET;
	metadata_missing = 0;
	if (bootinfo.bi_modulep) {
		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}
	if (bootinfo.bi_envp)
		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
	preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
	preload_bootstrap_relocate(PTOV_OFFSET);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	if (boothowto & RB_VERBOSE)
		bootverbose++;

	/*
	 * Default MachIntrABI to ICU
	 */
	MachIntrABI = MachIntrABI_ICU;
	/*
	 * start with one cpu.  Note: with one cpu, ncpus2_shift, ncpus2_mask,
	 * and ncpus_fit_mask remain 0.
	 */
	ncpus = 1;

	/* Init basic tunables, hz etc */
	init_param1();
	/*
	 * make gdt memory segments
	 */
	gdt_segs[GPROC0_SEL].ssd_base =
		(uintptr_t) &CPU_prvspace[0]->mdglobaldata.gd_common_tss;

	gd->mi.gd_prvspace = CPU_prvspace[0];
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
		   (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	mi_gdinit(&gd->mi, 0);
	cpu_gdinit(gd, 0);
	proc0paddr = proc0paddr_buff;
	mi_proc0init(&gd->mi, proc0paddr);
	safepri = TDPRI_MAX;
	/* spinlocks and the BGL */
	init_locks();

	for (x = 0; x < NIDT; x++)
		setidt_global(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
	setidt_global(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
		r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
	}

	lidt(&r_idt_arr[0]);
	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

	if (metadata_missing)
		kprintf("WARNING: loader(8) metadata is missing!\n");
	/*
	 * Initialize IRQ mapping
	 *
	 * SHOULD be after elcr_probe()
	 */
	MachIntrABI_ICU.initmap();
	MachIntrABI_IOAPIC.initmap();
	if (boothowto & RB_KDB)
		Debugger("Boot flags requested debugger");
	finishidentcpu();	/* Final stage of CPU initialization */
	setidt(6, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(13, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	identify_cpu();		/* Final stage of CPU initialization */
	initializecpu(0);	/* Initialize CPU registers */
	/*
	 * On modern intel cpus, haswell or later, cpu_idle_hlt=1 is better
	 * because the cpu does significant power management in MWAIT
	 * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
	 *
	 * On modern amd cpus cpu_idle_hlt=3 is better, because the cpu does
	 * significant power management in HLT or ACPI (but cpu_idle_hlt=1
	 * would try to use MWAIT).
	 *
	 * On older amd or intel cpus, cpu_idle_hlt=2 is better because ACPI
	 * is needed to reduce power consumption, but wakeup times are often
	 * longer.
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    CPUID_TO_MODEL(cpu_id) >= 0x3C) {	/* Haswell or later */
		cpu_idle_hlt = 1;
	}
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    CPUID_TO_FAMILY(cpu_id) >= 0x14) {	/* Bobcat or later */
		cpu_idle_hlt = 3;
	}
	TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
	TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
	TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
	TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);
	/*
	 * Some of the virtual machines do not work w/ I/O APIC
	 * enabled.  If the user does not explicitly enable or
	 * disable the I/O APIC (ioapic_enable < 0), then we
	 * disable I/O APIC on all virtual machines.
	 *
	 * NOTE:
	 * This must be done after identify_cpu(), which sets
	 * 'cpu_feature2'.
	 */
	if (ioapic_enable < 0) {
		if (cpu_feature2 & CPUID2_VMM)
			ioapic_enable = 0;
		else
			ioapic_enable = 1;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	gd->gd_common_tss.tss_rsp0 =
		(register_t)(thread0.td_kstack +
			     KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb));
	/* Ensure the stack is aligned to 16 bytes */
	gd->gd_common_tss.tss_rsp0 &= ~(register_t)0xF;
	/* double fault stack */
	gd->gd_common_tss.tss_ist1 =
		(long)&gd->mi.gd_prvspace->idlestack[
			sizeof(gd->mi.gd_prvspace->idlestack)];
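	/*
	 * The double fault vector was installed above with an ist index of
	 * 1, so tss_ist1 (pointed at the top of cpu0's idle stack) gives
	 * that handler a stack known to be good even when the faulting
	 * thread's own kernel stack is what went bad.
	 */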
	/* Set the IO permission bitmap (empty due to tss seg limit) */
	gd->gd_common_tss.tss_iobase = sizeof(struct x86_64tss);
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gd->gd_tss_gdt = &gdt[GPROC0_SEL];
	gd->gd_common_tssd = *gd->gd_tss_gdt;
	ltr(gsel_tss);
	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
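	/*
	 * MSR_STAR packs the kernel selector base used by SYSCALL into
	 * bits 47:32 and the user selector base used by SYSRET into bits
	 * 63:48, which is what the two shifted GSEL() values below build.
	 * MSR_SF_MASK lists the rflags bits the cpu clears on SYSCALL
	 * entry (traps, interrupts, direction flag, IOPL).
	 */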
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL);
	getmemsize(kmdp, physfree);
	init_param2(physmem);
	/* now running on new page tables, configured, and u/iom is accessible */

	/* Map the message buffer. */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);

	msgbufinit(msgbufp, MSGBUF_SIZE);
	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_pcb->pcb_cr3 = KPML4phys;
	thread0.td_pcb->pcb_ext = NULL;
	lwp0.lwp_md.md_regs = &proc0_tf;	/* XXX needed? */
	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}
/*
 * Initialize machine-dependent portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
 */
void
cpu_gdinit(struct mdglobaldata *gd, int cpu)
{
	gd->mi.gd_curthread = &gd->mi.gd_idlethread;

	lwkt_init_thread(&gd->mi.gd_idlethread,
			 gd->mi.gd_prvspace->idlestack,
			 sizeof(gd->mi.gd_prvspace->idlestack),
			 0, &gd->mi);
	lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
	gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
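	/*
	 * Hand-roll a minimal call frame on the idle thread's stack: the
	 * pointer stored below is what the first switch into the thread
	 * "returns" to, so the idle thread starts life in cpu_idle_restore.
	 */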
	gd->mi.gd_idlethread.td_sp -= sizeof(void *);
	*(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
}
/*
 * We only have to check for DMAP bounds, the globaldata space is
 * actually part of the kernel_map so we don't have to waste time
 * checking CPU_prvspace[*].
 */
int
is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
{
#if 0
	if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
	    eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
		return (TRUE);
	}
#endif
	if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
		return (TRUE);
	return (FALSE);
}
struct globaldata *
globaldata_find(int cpu)
{
	KKASSERT(cpu >= 0 && cpu < ncpus);
	return(&CPU_prvspace[cpu]->mdglobaldata.mi);
}
/*
 * This path should be safe from the SYSRET issue because only stopped threads
 * can have their %rip adjusted this way (and all heavy weight thread switches
 * clear QUICKREF and thus do not use SYSRET).  However, the code path is
 * convoluted so add a safety by forcing %rip to be canonical.
 */
void
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
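	/*
	 * A canonical x86_64 address has bits 63:48 equal to bit 47; the
	 * sign-extension / masking below enforces that, since a SYSRET to
	 * a non-canonical %rip faults while still on the kernel stack on
	 * some cpus.
	 */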
	if (addr & 0x0000800000000000LLU)
		lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
	else
		lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
}
void
ptrace_single_step(struct lwp *lp)
{
	lp->lwp_md.md_regs->tf_rflags |= PSL_T;
}
int
fill_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	if ((tp = lp->lwp_md.md_regs) == NULL)
		return EINVAL;
	bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
	return (0);
}
int
set_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
	return (0);
}
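/*
 * The two helpers below translate between the FXSAVE-style savexmm
 * layout kept in the pcb and the legacy FSAVE-style save87 layout, so
 * callers such as fill_fpregs()/set_fpregs() further down can hand the
 * old 87 format to consumers of struct fpreg.
 */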
static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}
static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}
int
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
	if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
		return EINVAL;
	if (cpu_fxsr) {
		fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
				(struct save87 *)fpregs);
		return (0);
	}
	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
	return (0);
}
int
set_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
	if (cpu_fxsr) {
		set_fpregs_xmm((struct save87 *)fpregs,
			       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
		return (0);
	}
	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
	return (0);
}
int
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (lp == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[4] = rdr4();
		dbregs->dr[5] = rdr5();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
		return (0);
	}
	if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL)
		return EINVAL;
	dbregs->dr[0] = pcb->pcb_dr0;
	dbregs->dr[1] = pcb->pcb_dr1;
	dbregs->dr[2] = pcb->pcb_dr2;
	dbregs->dr[3] = pcb->pcb_dr3;
	dbregs->dr[4] = 0;
	dbregs->dr[5] = 0;
	dbregs->dr[6] = pcb->pcb_dr6;
	dbregs->dr[7] = pcb->pcb_dr7;
	return (0);
}
int
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	if (lp == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr4(dbregs->dr[4]);
		load_dr5(dbregs->dr[5]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		struct pcb *pcb;
		struct ucred *ucred;
		int i;
		uint64_t mask1, mask2;

		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected TRCTRAP.
		 */
		/* JG this loop looks unreadable */
		/* Check 4 2-bit fields for invalid patterns.
		 * These fields are R/Wi, for i = 0..3
		 */
		/* Is 10 in LENi allowed when running in compatibility mode? */
		/* Pattern 10 in R/Wi might be used to indicate
		 * breakpoint on I/O.  Further analysis should be
		 * carried to decide if it is safe and useful to
		 * provide access to that capability
		 */
		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
		     i++, mask1 <<= 4, mask2 <<= 4)
			if ((dbregs->dr[7] & mask1) == mask2)
				return (EINVAL);
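		/*
		 * In dr7 each breakpoint i owns a 4-bit field starting at
		 * bit 16 + 4*i: two R/Wi bits selecting the break condition
		 * and two LENi bits selecting the size.  The loop above
		 * rejects the reserved R/Wi pattern 10 for all four slots.
		 */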
		pcb = lp->lwp_thread->td_pcb;
		ucred = lp->lwp_proc->p_ucred;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * process's address space, unless, perhaps, we were called by
		 * uid 0.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */
		if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
			if (dbregs->dr[7] & 0x3) {
				/* dr0 is enabled */
				if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr[7] & (0x3<<2)) {
				/* dr1 is enabled */
				if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr[7] & (0x3<<4)) {
				/* dr2 is enabled */
				if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr[7] & (0x3<<6)) {
				/* dr3 is enabled */
				if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		pcb->pcb_flags |= PCB_DBREGS;
	}

	return (0);
}
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int64_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0xff) == 0) {
		/*
		 * all GE and LE bits in the dr7 register are zero,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers
		 */
		return 0;
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0xf;		/* low 4 bits flag which breakpoints fired */

	if (bp == 0) {
		/*
		 * None of the breakpoint bits are set meaning this
		 * trap was not caused by any of the debug registers
		 */
		return 0;
	}

	/*
	 * at least one of the breakpoints was hit, check to see
	 * which ones and if any of them are user space addresses
	 */
	if (bp & 0x01)
		addr[nbp++] = (caddr_t)rdr0();
	if (bp & 0x02)
		addr[nbp++] = (caddr_t)rdr1();
	if (bp & 0x04)
		addr[nbp++] = (caddr_t)rdr2();
	if (bp & 0x08)
		addr[nbp++] = (caddr_t)rdr3();

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}
void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}
/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called inside DDB.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */
/* silence compiler warnings */
u_char inb(u_int);
void outb(u_int, u_char);

u_char
inb(u_int port)
{
	u_char data;
	/*
	 * We use %%dx and not %1 here because i/o is done at %dx and not at
	 * %edx, while gcc generates inferior code (movw instead of movl)
	 * if we tell it to load (u_short) port.
	 */
	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
	return (data);
}

void
outb(u_int port, u_char data)
{
	u_char al;
	/*
	 * Use an unnecessary assignment to help gcc's register allocator.
	 * This makes a large difference for gcc-1.40 and a tiny difference
	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
	 * best results.  gcc-2.6.0 can't handle this.
	 */
	al = data;
	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
}
/*
 * initialize all the SMP locks
 */

/* critical region when masking or unmasking interrupts */
struct spinlock_deprecated imen_spinlock;

/* lock region used by kernel profiling */
struct spinlock_deprecated mcount_spinlock;

/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct spinlock_deprecated com_spinlock;

/* lock regions around the clock hardware */
struct spinlock_deprecated clock_spinlock;

static void
init_locks(void)
{
	/*
	 * Get the initial mplock with a count of 1 for the BSP.
	 * This uses a LOGICAL cpu ID, ie BSP == 0.
	 */
	cpu_get_initial_mplock();

	spin_init_deprecated(&mcount_spinlock);
	spin_init_deprecated(&imen_spinlock);
	spin_init_deprecated(&com_spinlock);
	spin_init_deprecated(&clock_spinlock);

	/* our token pool needs to work early */
	lwkt_token_pool_init();
}
static int
cpu_mwait_hint_valid(uint32_t hint)
{
	int cx_idx, sub;

	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= CPU_MWAIT_CX_MAX)
		return FALSE;

	sub = MWAIT_EAX_TO_CX_SUB(hint);
	if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return FALSE;

	return TRUE;
}
void
cpu_mwait_cx_no_bmsts(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
}

void
cpu_mwait_cx_no_bmarb(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
}
static int
cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
{
	int old_cx_idx, sub = 0;

	if (hint >= 0) {
		old_cx_idx = MWAIT_EAX_TO_CX(hint);
		sub = MWAIT_EAX_TO_CX_SUB(hint);
	} else if (hint == CPU_MWAIT_HINT_AUTO) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
	} else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
	} else {
		old_cx_idx = CPU_MWAIT_CX_MAX;
	}

	if (!CPU_MWAIT_HAS_CX)
		strlcpy(name, "NONE", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
		strlcpy(name, "AUTO", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
		strlcpy(name, "AUTODEEP", namelen);
	else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
	    sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
		strlcpy(name, "INVALID", namelen);
	else
		ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);

	return old_cx_idx;
}
static int
cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
{
	int cx_idx, sub, hint;
	char *ptr, *start;

	if (allow_auto && strcmp(name, "AUTO") == 0) {
		hint = CPU_MWAIT_HINT_AUTO;
		cx_idx = CPU_MWAIT_C2;
		goto done;
	}
	if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
		hint = CPU_MWAIT_HINT_AUTODEEP;
		cx_idx = CPU_MWAIT_C3;
		goto done;
	}

	if (strlen(name) < 4 || toupper(name[0]) != 'C')
		return -1;
	start = &name[1];

	cx_idx = strtol(start, &ptr, 10);
	if (ptr == start || *ptr != '/')
		return -1;
	if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
		return -1;

	start = ptr + 1;

	sub = strtol(start, &ptr, 10);
	if (*ptr != '\0')
		return -1;
	if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return -1;

	hint = MWAIT_EAX_HINT(cx_idx, sub);
done:
	*hint0 = hint;
	return cx_idx;
}
static int
cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
{
	int error;

	if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)
		return EOPNOTSUPP;
	if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
		error = cputimer_intr_powersave_addreq();
		if (error)
			return error;
	} else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
		cputimer_intr_powersave_remreq();
	}
	return 0;
}
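/*
 * Entering C3 or deeper can stop the local APIC timer on many cpus, so
 * cpu_mwait_cx_transit() asks the cputimer code for a powersave-capable
 * interrupt source before allowing the transition, and drops that
 * request again when the target falls back below C3.
 */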
static int
cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
    boolean_t allow_auto)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	hint = *hint0;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
	    allow_auto);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error)
		return error;

	*hint0 = hint;
	return 0;
}
static int
cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension"));

	hint = stat->hint;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	strlcpy(name, cx_name, sizeof(name));
	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error)
		return error;

	stat->hint = hint;
	return 0;
}
static int
cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	int hint = cpu_mwait_halt_global;
	int error, cx_idx, cpu;
	char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];

	cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	/* Save name for later per-cpu CX configuration */
	strlcpy(cx_name, name, sizeof(cx_name));

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	/* Change per-cpu CX configuration */
	for (cpu = 0; cpu < ncpus; ++cpu) {
		error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
		if (error)
			return error;
	}

	cpu_mwait_halt_global = hint;
	return 0;
}
static int
cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct cpu_idle_stat *stat = arg1;
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &stat->hint, TRUE);
	return error;
}

static int
cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &cpu_mwait_spin, FALSE);
	return error;
}
/*
 * This manual debugging code is called unconditionally from Xtimer
 * (the per-cpu timer interrupt), whether the current thread is in a
 * critical section or not, and can be useful in tracking down lockups.
 *
 * NOTE: MANUAL DEBUG CODE
 */
static int saveticks[SMP_MAXCPU];
static int savecounts[SMP_MAXCPU];

void
pcpu_timer_always(struct intrframe *frame)
{
	globaldata_t gd = mycpu;
	int cpu = gd->gd_cpuid;
	char buf[64];
	short *gptr;
	int i;
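	/*
	 * 0xFFFFFFFF800b8000 is the legacy VGA text buffer as seen through
	 * the kernel mapping (KERNBASE + 0xb8000).  Each cpu gets its own
	 * 80-cell text row; the first cell is incremented as a visible
	 * heartbeat and the rest is blitted with a short status string.
	 */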
	gptr = (short *)0xFFFFFFFF800b8000 + 80 * cpu;
	*gptr = ((*gptr + 1) & 0x00FF) | 0x0700;

	ksnprintf(buf, sizeof(buf), " %p %16s %d %16s ",
	    (void *)frame->if_rip, gd->gd_curthread->td_comm, ticks, "");
	for (i = 0; buf[i]; ++i) {
		gptr[i] = 0x0700 | (unsigned char)buf[i];
	}
	if (saveticks[gd->gd_cpuid] != ticks) {
		saveticks[gd->gd_cpuid] = ticks;
		savecounts[gd->gd_cpuid] = 0;
	}
	++savecounts[gd->gd_cpuid];
	if (savecounts[gd->gd_cpuid] > 2000 && panicstr == NULL) {
		panic("cpu %d panicking on ticks failure",
		      gd->gd_cpuid);
	}
	for (i = 0; i < ncpus; ++i) {
		int delta;

		if (saveticks[i] && panicstr == NULL) {
			delta = saveticks[i] - ticks;
			if (delta < -10 || delta > 10) {
				panic("cpu %d panicking on cpu %d watchdog",
				      gd->gd_cpuid, i);
			}
		}
	}
}