/*-
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 2008 The DragonFly Project.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
 */
//#include "use_npx.h"
#include "opt_directio.h"
#include "opt_msgbuf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/reboot.h>
#include <sys/msgbuf.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/usched.h>
#include <sys/ctype.h>
#include <sys/serialize.h>
#include <sys/systimer.h>

#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>
#include <sys/mutex2.h>

#include <machine/cpu.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#include <machine/bootinfo.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb_ext.h>		/* pcb.h included via sys/user.h */
#include <machine/globaldata.h>		/* CPU_prvspace */
#include <machine/smp.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <machine/framebuffer.h>

#include <bus/isa/isa_device.h>
#include <machine_base/isa/isa_intr.h>
#include <bus/isa/rtc.h>
#include <sys/random.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>

#include <sys/machintr.h>
#include <machine_base/icu/icu_abi.h>
#include <machine_base/icu/elcr_var.h>
#include <machine_base/apic/lapic.h>
#include <machine_base/apic/ioapic.h>
#include <machine_base/apic/ioapic_abi.h>
#include <machine/mptable.h>
#define PHYSMAP_ENTRIES		10

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

extern void printcpuinfo(void);	/* XXX header file */
extern void identify_cpu(void);
extern void finishidentcpu(void);
extern void panicifcpuunsupported(void);

static void cpu_startup(void *);
static void pic_finish(void *);
static void cpu_finish(void *);

static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */
static void init_locks(void);

extern void pcpu_timer_always(struct intrframe *);

SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL);
SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL);

extern vm_offset_t ksym_start, ksym_end;

struct privatespace CPU_prvspace_bsp __aligned(4096);
struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp };

int	_udatasel, _ucodesel, _ucode32sel;

int64_t tsc_offsets[MAXCPU];
cpumask_t smp_idleinvl_mask;
cpumask_t smp_idleinvl_reqs;

static int cpu_mwait_halt_global; /* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */
#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
	CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
	CTLFLAG_RD, &tlb_flush_count, 0, "");
#endif
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt,
	CTLFLAG_RD, &cpu_mwait_halt_global, 0, "");
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin, CTLFLAG_RD, &cpu_mwait_spin, 0,
	"monitor/mwait target state");
#define CPU_MWAIT_HAS_CX	\
	((cpu_feature2 & CPUID2_MON) && \
	 (cpu_mwait_feature & CPUID_MWAIT_EXT))

#define CPU_MWAIT_CX_NAMELEN	16

#define CPU_MWAIT_C1		1
#define CPU_MWAIT_C2		2
#define CPU_MWAIT_C3		3
#define CPU_MWAIT_CX_MAX	8

#define CPU_MWAIT_HINT_AUTO	-1	/* C1 and C2 */
#define CPU_MWAIT_HINT_AUTODEEP	-2	/* C3+ */
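
/*
 * Illustrative note (not from the original source): an MWAIT hint as used
 * below is the value placed in EAX, where bits 7:4 select the target
 * C-state (encoded as Cx - 1) and bits 3:0 select the sub-state.  Under
 * that assumption MWAIT_EAX_HINT(2, 1) would encode C2 sub-state 1 as
 * 0x11, and MWAIT_EAX_TO_CX()/MWAIT_EAX_TO_CX_SUB() invert the encoding.
 */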
SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features");
SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings");

struct cpu_mwait_cx {
	int			subcnt;
	char			name[CPU_MWAIT_CX_NAMELEN];

	struct sysctl_ctx_list	sysctl_ctx;
	struct sysctl_oid	*sysctl_tree;
};
static struct cpu_mwait_cx	cpu_mwait_cx_info[CPU_MWAIT_CX_MAX];
static char			cpu_mwait_cx_supported[256];

static int			cpu_mwait_c1_hints_cnt;
static int			cpu_mwait_hints_cnt;
static int			*cpu_mwait_hints;

static int			cpu_mwait_deep_hints_cnt;
static int			*cpu_mwait_deep_hints;
#define CPU_IDLE_REPEAT_DEFAULT	750

static u_int cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT;
static u_long cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT;
static u_int cpu_mwait_repeat_shift = 1;

#define CPU_MWAIT_C3_PREAMBLE_BM_ARB	0x1
#define CPU_MWAIT_C3_PREAMBLE_BM_STS	0x2

static int cpu_mwait_c3_preamble =
	CPU_MWAIT_C3_PREAMBLE_BM_ARB |
	CPU_MWAIT_C3_PREAMBLE_BM_STS;

SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD,
	cpu_mwait_cx_supported, 0, "MWAIT supported C states");
SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD,
	&cpu_mwait_c3_preamble, 0, "C3+ preamble mask");
static int	cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,
static int	cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS);

SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW,
	NULL, 0, cpu_mwait_cx_idle_sysctl, "A", "");
SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW,
	NULL, 0, cpu_mwait_cx_spin_sysctl, "A", "");
SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW,
	&cpu_mwait_repeat_shift, 0, "");
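
/*
 * Usage sketch (illustrative, not from the original source): the MWAIT
 * C-state used by the idle loop and by spin-waits can be inspected and
 * tuned at runtime through the sysctl nodes declared above, e.g.:
 *
 *	sysctl machdep.mwait.CX.supported
 *	sysctl machdep.mwait.CX.idle=AUTODEEP
 *
 * The exact strings accepted by the "idle" and "spin" handlers are
 * implemented by cpu_mwait_cx_select_sysctl() and friends.
 */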
u_long ebda_addr = 0;

int imcr_present = 0;

int naps = 0; /* # of Applications processors */

struct mtx dt_lock;		/* lock for GDT and LDT */

sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
	u_long pmem = ctob(physmem);

	int error = sysctl_handle_long(oidp, &pmem, 0, req);

SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
	0, 0, sysctl_hw_physmem, "LU",
	"Total system memory in bytes (number of pages * page size)");

sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
	int error = sysctl_handle_int(oidp, 0,
		ctob(physmem - vmstats.v_wire_count), req);

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_hw_usermem, "IU", "");

sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
	int error = sysctl_handle_int(oidp, 0,
		x86_64_btop(avail_end - avail_start), req);

SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_hw_availpages, "I", "");
/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END	(NELEM(phys_avail) - 2)
#define DUMP_AVAIL_ARRAY_END	(NELEM(dump_avail) - 2)

static vm_offset_t buffer_sva, buffer_eva;
vm_offset_t clean_sva, clean_eva;
static vm_offset_t pager_sva, pager_eva;
static struct trapframe proc0_tf;
cpu_startup(void *dummy)
	vm_offset_t firstaddr;

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	kprintf("%s", version);
	panicifcpuunsupported();
	kprintf("real memory  = %ju (%ju MB)\n",
		(intmax_t)Realmem,
		(intmax_t)Realmem / 1024 / 1024);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	kprintf("Physical memory chunk(s):\n");
	for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
		vm_paddr_t size1 =
			phys_avail[indx + 1] - phys_avail[indx];

		kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
			(intmax_t)phys_avail[indx],
			(intmax_t)phys_avail[indx + 1] - 1,
			(intmax_t)size1,
			(intmax_t)(size1 / PAGE_SIZE));
	}

	/*
	 * Allocate space for system data structures.
	 * The first available kernel virtual address is in "v".
	 * As pages of kernel virtual memory are allocated, "v" is incremented.
	 * As pages of memory are allocated and cleared,
	 * "firstaddr" is incremented.
	 * An index into the kernel page table corresponding to the
	 * virtual memory address maintained in "v" is kept in "mapaddr".
	 */

	/*
	 * Make two passes.  The first pass calculates how much memory is
	 * needed and allocates it.  The second pass assigns virtual
	 * addresses to the various data structures.
	 */
	v = (caddr_t)firstaddr;

#define	valloc(name, type, num) \
	    (name) = (type *)v; v = (caddr_t)((name)+(num))
#define	valloclim(name, type, num, lim) \
	    (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))

	/*
	 * The nominal buffer size (and minimum KVA allocation) is MAXBSIZE.
	 * For the first 64MB of ram nominally allocate sufficient buffers to
	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
	 * buffers to cover 1/20 of our ram over 64MB.  When auto-sizing
	 * the buffer cache we limit the eventual kva reservation to
	 * maxbcache bytes.
	 *
	 * factor represents the 1/4 x ram conversion.
	 */
	long factor = 4 * NBUFCALCSIZE / 1024;
	long kbytes = physmem * (PAGE_SIZE / 1024);

	nbuf += min((kbytes - 4096) / factor, 65536 / factor);
	nbuf += (kbytes - 65536) * 2 / (factor * 5);
	if (maxbcache && nbuf > maxbcache / NBUFCALCSIZE)
		nbuf = maxbcache / NBUFCALCSIZE;
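
	/*
	 * Worked example (illustrative, not from the original source):
	 * with 1GB of ram, kbytes = 1048576, so the first min() term adds
	 * 65536 / factor buffers for the first 64MB and the second term
	 * adds (1048576 - 65536) * 2 / (factor * 5) buffers for the
	 * remaining ~960MB, before the maxbcache, kvm and physmem clamps
	 * below are applied.
	 */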
	/*
	 * Do not allow the buffer_map to be more than 1/2 the size of the
	 * kernel_map.
	 */
	if (nbuf > (virtual_end - virtual_start +
		    virtual2_end - virtual2_start) / (MAXBSIZE * 2)) {
		nbuf = (virtual_end - virtual_start +
			virtual2_end - virtual2_start) / (MAXBSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf);
	}

	/*
	 * Do not allow the buffer_map to use more than 50% of available
	 * physical-equivalent memory.  Since the VM pages which back
	 * individual buffers are typically wired, having too many bufs
	 * can prevent the system from paging properly.
	 */
	if (nbuf > physmem * PAGE_SIZE / (NBUFCALCSIZE * 2)) {
		nbuf = physmem * PAGE_SIZE / (NBUFCALCSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf);
	}

	/*
	 * Do not allow the sizeof(struct buf) * nbuf to exceed half of
	 * the valloc space which is just the virtual_end - virtual_start
	 * section.  We use valloc() to allocate the buf header array.
	 */
	if (nbuf > (virtual_end - virtual_start) / sizeof(struct buf) / 2) {
		nbuf = (virtual_end - virtual_start) /
		       sizeof(struct buf) / 2;
		kprintf("Warning: nbufs capped at %ld due to valloc "
			"considerations\n", nbuf);
	}

	nswbuf_mem = lmax(lmin(nbuf / 32, 512), 8);
	if (nswbuf_mem < NSWBUF_MIN)
		nswbuf_mem = NSWBUF_MIN;
	nswbuf_kva = lmax(lmin(nbuf / 4, 512), 16);
	if (nswbuf_kva < NSWBUF_MIN)
		nswbuf_kva = NSWBUF_MIN;

	valloc(swbuf_mem, struct buf, nswbuf_mem);
	valloc(swbuf_kva, struct buf, nswbuf_kva);
	valloc(buf, struct buf, nbuf);
	/*
	 * End of first pass, size has been calculated so allocate memory
	 */
	if (firstaddr == 0) {
		size = (vm_size_t)(v - firstaddr);
		firstaddr = kmem_alloc(&kernel_map, round_page(size));
		if (firstaddr == 0)
			panic("startup: no room for tables");
	}

	/*
	 * End of second pass, addresses have been assigned
	 *
	 * nbuf is an int, make sure we don't overflow the field.
	 *
	 * On 64-bit systems we always reserve maximal allocations for
	 * buffer cache buffers and there are no fragmentation issues,
	 * so the KVA segment does not have to be excessively oversized.
	 */
	if ((vm_size_t)(v - firstaddr) != size)
		panic("startup: table size inconsistency");

	kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE) +
		      ((nswbuf_mem + nswbuf_kva) * MAXPHYS) + pager_map_size);
	kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE));
	buffer_map.system_map = 1;
	kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva,
		      ((vm_offset_t)(nswbuf_mem + nswbuf_kva) * MAXPHYS) +
		      pager_map_size);
	pager_map.system_map = 1;
	kprintf("avail memory = %ju (%ju MB)\n",
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages),
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages) /
		1024 / 1024);
struct cpu_idle_stat {
	u_long	halt;
	u_long	spin;
	u_long	repeat;
	u_long	repeat_last;
	u_long	repeat_delta;
	u_long	mwait_cx[CPU_MWAIT_CX_MAX];
};

#define CPU_IDLE_STAT_HALT	-1
#define CPU_IDLE_STAT_SPIN	-2
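
/*
 * Note (illustrative, not from the original source): sysctl_cpu_idle_cnt()
 * below is registered with arg2 set either to one of the negative
 * CPU_IDLE_STAT_* selectors above (aggregate halt or spin counts) or to a
 * non-negative Cx index, in which case it sums mwait_cx[idx] across cpus.
 * Writing a value resets the per-cpu counters and stores the written value
 * in cpu 0's slot.
 */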
static struct cpu_idle_stat	cpu_idle_stats[MAXCPU];

sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)
	int idx = arg2, cpu, error;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].halt;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].spin;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].mwait_cx[idx];
	}

	error = sysctl_handle_quad(oidp, &val, 0, req);
	if (error || req->newptr == NULL)
		return error;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].halt = 0;
		cpu_idle_stats[0].halt = val;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].spin = 0;
		cpu_idle_stats[0].spin = val;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].mwait_cx[idx] = 0;
		cpu_idle_stats[0].mwait_cx[idx] = val;
	}
cpu_mwait_attach(void)
	if (!CPU_MWAIT_HAS_CX)
		return;

	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    (CPUID_TO_FAMILY(cpu_id) > 0xf ||
	     (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
	      CPUID_TO_MODEL(cpu_id) >= 0xf))) {
		/*
		 * Pentium dual-core, Core 2 and beyond do not need any
		 * additional activities to enter deep C-state, i.e. C3(+).
		 */
		cpu_mwait_cx_no_bmarb();

		TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts);
			cpu_mwait_cx_no_bmsts();
	}

	sbuf_new(&sb, cpu_mwait_cx_supported,
	    sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN);

	for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) {
		struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i];

		ksnprintf(cx->name, sizeof(cx->name), "C%d", i);

		sysctl_ctx_init(&cx->sysctl_ctx);
		cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO,
		    cx->name, CTLFLAG_RW, NULL, "Cx control/info");
		if (cx->sysctl_tree == NULL)
			continue;

		cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i);
		SYSCTL_ADD_INT(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "subcnt", CTLFLAG_RD, &cx->subcnt, 0,
		SYSCTL_ADD_PROC(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0,
		    i, sysctl_cpu_idle_cnt, "Q", "# of times entered");

		for (sub = 0; sub < cx->subcnt; ++sub)
			sbuf_printf(&sb, "C%d/%d ", i, sub);
	}
	cpu_mwait_c1_hints_cnt = cpu_mwait_cx_info[CPU_MWAIT_C1].subcnt;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i)
		cpu_mwait_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_hints = kmalloc(sizeof(int) * cpu_mwait_hints_cnt,

	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) {
		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_hints_cnt,
			    ("invalid mwait hint index %d", hint_idx));
			cpu_mwait_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
		}
	}
	KASSERT(hint_idx == cpu_mwait_hints_cnt,
	    ("mwait hint count %d != index %d",
	     cpu_mwait_hints_cnt, hint_idx));

	kprintf("MWAIT hints (%d C1 hints):\n", cpu_mwait_c1_hints_cnt);
	for (i = 0; i < cpu_mwait_hints_cnt; ++i) {
		int hint = cpu_mwait_hints[i];

		kprintf("  C%d/%d hint 0x%04x\n",
		    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
		    hint);
	}

	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i)
		cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt,

	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) {
		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_deep_hints_cnt,
			    ("invalid mwait deep hint index %d", hint_idx));
			cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
		}
	}
	KASSERT(hint_idx == cpu_mwait_deep_hints_cnt,
	    ("mwait deep hint count %d != index %d",
	     cpu_mwait_deep_hints_cnt, hint_idx));

	kprintf("MWAIT deep hints:\n");
	for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) {
		int hint = cpu_mwait_deep_hints[i];

		kprintf("  C%d/%d hint 0x%04x\n",
		    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
		    hint);
	}
	cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt;

	for (i = 0; i < ncpus; ++i) {
		ksnprintf(name, sizeof(name), "idle%d", i);
		SYSCTL_ADD_PROC(NULL,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO,
		    name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i],
		    0, cpu_mwait_cx_pcpu_idle_sysctl, "A", "");
	}
cpu_finish(void *dummy __unused)

pic_finish(void *dummy __unused)
	/* Log ELCR information */

	/* Log MPTABLE information */
	mptable_pci_int_dump();

	MachIntrABI.finalize();
/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	struct sigacts *psp = p->p_sigacts;
	struct sigframe sf, *sfp;

	regs = lp->lwp_md.md_regs;
	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;

	/* Save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = lp->lwp_sigstk;
	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
	KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));

	/* Make the size of the saved context visible to userland */
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);

	/* Allocate and validate space for the signal handler context. */
	if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)(lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
			      sizeof(struct sigframe));
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		/* We take red zone into account */
		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
	}

	/*
	 * XXX AVX needs 64-byte alignment but sigframe has other fields and
	 * the embedded ucontext is not at the front, so aligning this won't
	 * help us.  Fortunately we bcopy in/out of the sigframe, so the
	 *
	 * The problem though is if userland winds up trying to use the
	 */
	sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF);
	/* Translate the signal if appropriate */
	if (p->p_sysent->sv_sigtbl) {
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
	}

	/*
	 * Build the argument list for the signal handler.
	 *
	 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
	 */
	regs->tf_rdi = sig;				/* argument 1 */
	regs->tf_rdx = (register_t)&sfp->sf_uc;		/* argument 3 */

	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/*
		 * Signal handler installed with SA_SIGINFO.
		 *
		 * action(signo, siginfo, ucontext)
		 */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* fill siginfo structure */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_addr;
	} else {
		/*
		 * Old FreeBSD-style arguments.
		 *
		 * handler (signo, code, [uc], addr)
		 */
		regs->tf_rsi = (register_t)code;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_handler = catcher;
	}
	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 =
			&lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}

	/*
	 * Save the FPU state and reinit the FP unit
	 */
	npxpush(&sf.sf_uc.uc_mcontext);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
	}

	regs->tf_rsp = (register_t)sfp;
	regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);

	/*
	 * i386 abi specifies that the direction flag must be cleared
	 * on function entry
	 */
	regs->tf_rflags &= ~(PSL_T|PSL_D);

	/*
	 * 64 bit mode has a code and stack selector but
	 * no data or extra selector.  %fs and %gs are not
	 * usable.
	 */
	regs->tf_cs = _ucodesel;
	regs->tf_ss = _udatasel;
/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.  Remove any items that would otherwise create a privilege
 * issue.
 *
 * XXX at the moment we allow userland to set the resume flag.  Is this a
 * bad idea?
 */
cpu_sanitize_frame(struct trapframe *frame)
	frame->tf_cs = _ucodesel;
	frame->tf_ss = _udatasel;
	/* XXX VM (8086) mode not supported? */
	frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
	frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;

/*
 * Sanitize the tls so loading the descriptor does not blow up
 * on us.  For x86_64 we don't have to do anything.
 */
cpu_sanitize_tls(struct savetls *tls)
/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
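
/*
 * Illustrative note (not from the original source): EFL_SECURE() xors the
 * user-supplied rflags against the current trapframe value and requires
 * that only bits in PSL_USERCHANGE differ, so e.g. an attempt to set IOPL
 * or other privileged bits makes the macro evaluate to 0 and sigreturn
 * rejects the context.  CS_SECURE() simply requires the requested %cs to
 * have user (ring 3) privilege.
 */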
sys_sigreturn(struct sigreturn_args *uap)
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;

	/*
	 * We have to copy the information into kernel space so userland
	 * can't modify it while we are sniffing it.
	 */
	regs = lp->lwp_md.md_regs;
	error = copyin(uap->sigcntxp, &uc, sizeof(uc));

	rflags = ucp->uc_mcontext.mc_rflags;

	/* VM (8086) mode not supported */
	rflags &= ~PSL_VM_UNSUPP;

	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(lp, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = tf->tf_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
		tf->tf_gs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 *
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
			kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			kprintf("sigreturn: cs = 0x%x\n", cs);
			trapsignal(lp, SIGBUS, T_PROTFLT);
		}
		bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(struct trapframe));
	}

	/*
	 * Restore the FPU state from the frame
	 */
	npxpop(&ucp->uc_mcontext);

	if (ucp->uc_mcontext.mc_onstack & 1)
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	else
		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

	lp->lwp_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(lp->lwp_sigmask);
	return(EJUSTRETURN);
/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */

/*
 * Shutdown the CPU as much as possible
 */
		__asm__ __volatile("hlt");

/*
 * cpu_idle() represents the idle LWKT.  You cannot return from this function
 * (unless you want to blow things up!).  Instead we look for runnable threads
 * and loop or halt as appropriate.  Giant is not held on entry to the thread.
 *
 * The main loop is entered with a critical section held, we must release
 * the critical section before doing anything else.  lwkt_switch() will
 * check for pending interrupts due to entering and exiting its own
 * critical section.
 *
 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
 *	 However, there are cases where the idlethread will be entered with
 *	 the possibility that no IPI will occur and in such cases
 *	 lwkt_switch() sets TDF_IDLE_NOHLT.
 *
 * NOTE: cpu_idle_repeat determines how many entries into the idle thread
 *	 must occur before it starts using ACPI halt.
 *
 * NOTE: Value overridden in hammer_time().
 */
static int	cpu_idle_hlt = 2;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW,
    &cpu_idle_repeat, 0, "Idle entries before acpi hlt");

SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts");
SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins");
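
/*
 * Usage sketch (illustrative, not from the original source): the idle
 * strategy selected in cpu_idle() below can be changed at runtime, e.g.
 *
 *	sysctl machdep.cpu_idle_hlt=1	# always HLT/MONITOR/MWAIT
 *	sysctl machdep.cpu_idle_repeat=750
 *
 * and the halt/spin counters exported above can be read back to see which
 * path the idle loop is actually taking.
 */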
cpu_idle_default_hook(void)
	/*
	 * We must guarantee that hlt is exactly the instruction
	 * following the sti.
	 */
	__asm __volatile("sti; hlt");

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;
cpu_mwait_cx_hint(struct cpu_idle_stat *stat)
	idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >>
	    cpu_mwait_repeat_shift;
	if (idx >= cpu_mwait_c1_hints_cnt) {
		/* Step up faster, once we walked through all C1 states */
		stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1);
	}
	if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		if (idx >= cpu_mwait_deep_hints_cnt)
			idx = cpu_mwait_deep_hints_cnt - 1;
		hint = cpu_mwait_deep_hints[idx];
	} else {
		if (idx >= cpu_mwait_hints_cnt)
			idx = cpu_mwait_hints_cnt - 1;
		hint = cpu_mwait_hints[idx];
	}

	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX)
		stat->mwait_cx[cx_idx]++;
	globaldata_t gd = mycpu;
	struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid];
	struct thread *td __debugvar = gd->gd_curthread;

	stat->repeat = stat->repeat_last = cpu_idle_repeat_max;

	KKASSERT(td->td_critcount == 0);

		/*
		 * See if there are any LWKTs ready to go.
		 */

		/*
		 * When halting inside a cli we must check for reqflags
		 * races, particularly [re]schedule requests.  Running
		 * splz() does the job.
		 *
		 * 0	Never halt, just spin
		 *
		 * 1	Always use HLT (or MONITOR/MWAIT if avail).
		 *
		 *	Better default for modern (Haswell+) Intel
		 *	cpus.
		 *
		 * 2	Use HLT/MONITOR/MWAIT up to a point and then
		 *	use the ACPI halt (default).  This is a hybrid
		 *	approach.  See machdep.cpu_idle_repeat.
		 *
		 *	Better default for modern AMD cpus and older
		 *	Intel cpus.
		 *
		 * 3	Always use the ACPI halt.  This typically
		 *	eats the least amount of power but the cpu
		 *	will be slow waking up.  Slows down e.g.
		 *	compiles and other pipe/event oriented stuff.
		 *
		 * NOTE: Interrupts are enabled and we are not in a critical
		 *	 section.
		 *
		 * NOTE: Preemptions do not reset gd_idle_repeat.   Also we
		 *	 don't bother capping gd_idle_repeat, it is ok if
		 *
		 * Implement optimized invltlb operations when halted
		 * in idle.  By setting the bit in smp_idleinvl_mask
		 * we inform other cpus that they can set _reqs to
		 * request an invltlb.  Currently the code to do that
		 * sets the bits in _reqs anyway, but then check _mask
		 * to determine if they can assume the invltlb will execute.
		 *
		 * A critical section is required to ensure that interrupts
		 * do not fully run until after we've had a chance to execute
		 * the request.
		 */
		if (gd->gd_idle_repeat == 0) {
			stat->repeat = (stat->repeat + stat->repeat_last) >> 1;
			if (stat->repeat > cpu_idle_repeat_max)
				stat->repeat = cpu_idle_repeat_max;
			stat->repeat_last = 0;
			stat->repeat_delta = 0;
		} else {
			++stat->repeat_last;
		}
		++gd->gd_idle_repeat;
		reqflags = gd->gd_reqflags;
		quick = (cpu_idle_hlt == 1) ||
			(cpu_idle_hlt < 3 &&
			 gd->gd_idle_repeat < cpu_idle_repeat);

		if (quick && (cpu_mi_feature & CPU_MI_MONITOR) &&
		    (reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
			ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid);
			cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
					  cpu_mwait_cx_hint(stat), 0);
			ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask, gd->gd_cpuid);
			if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
		} else if (cpu_idle_hlt) {
			__asm __volatile("cli");
			ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid);
			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
				cpu_idle_default_hook();
			}
			__asm __volatile("sti");
			ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask, gd->gd_cpuid);
			if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
		} else {
			__asm __volatile("sti");
		}
/*
 * This routine is called if a spinlock has been held through the
 * exponential backoff period and is seriously contested.  On a real cpu
 */
cpu_spinlock_contested(void)

/*
 * Clear registers on exec
 */
exec_setregs(u_long entry, u_long stack, u_long ps_strings)
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct pcb *pcb = td->td_pcb;
	struct trapframe *regs = lp->lwp_md.md_regs;

	/* was i386_user_cleanup() in NetBSD */

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = entry;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
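	/*
	 * Illustrative note (not from the original source): the expression
	 * above first reserves 8 bytes, rounds the result down to a 16-byte
	 * boundary, then adds the 8 back, e.g. a stack value of 0x7fff1234
	 * becomes 0x7fff1228.  That leaves %rsp congruent to 8 mod 16, which
	 * matches what the x86_64 ABI expects at a function entry point
	 * (16-byte alignment immediately after the return address is pushed).
	 */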
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_rbx = ps_strings;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr7 = 0;	/* JG set bit 10? */
		if (pcb == td->td_pcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}

	/*
	 * Initialize the math emulator (if any) for the current process.
	 * Actually, just clear the bit that says that the emulator has
	 * been initialized.  Initialization is delayed until the process
	 * traps to the emulator (if it is done at all) mainly because
	 * emulators don't provide an entry point for initialization.
	 */
	pcb->pcb_flags &= ~FP_SOFTFP;

	/*
	 * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
	 *	 gd_npxthread.  Otherwise a preemptive interrupt thread
	 *	 may panic in npxdna().
	 */
	load_cr0(rcr0() | CR0_MP);

	/*
	 * NOTE: The MSR values must be correct so we can return to
	 *	 userland.  gd_user_fs/gs must be correct so the switch
	 *	 code knows what the current MSR values are.
	 */
	pcb->pcb_fsbase = 0;	/* Values loaded from PCB on switch */
	pcb->pcb_gsbase = 0;
	mdcpu->gd_user_fs = 0;	/* Cache of current MSR values */
	mdcpu->gd_user_gs = 0;
	wrmsr(MSR_FSBASE, 0);	/* Set MSR values for return to userland */
	wrmsr(MSR_KGSBASE, 0);

	/* Initialize the npx (if any) for the current process. */

	pcb->pcb_ds = _udatasel;
	pcb->pcb_es = _udatasel;
	pcb->pcb_fs = _udatasel;
	pcb->pcb_gs = _udatasel;

	cr0 |= CR0_NE;			/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
	cr0 |= CR0_WP | CR0_AM;
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
	    req);
	if (!error && req->newptr)

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
	CTLFLAG_RW, &disable_rtc_set, 0, "");

SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
	CTLFLAG_RD, &bootinfo, bootinfo, "");

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
	CTLFLAG_RW, &wall_cmos_clock, 0, "");

extern u_long bootdev;		/* not a cdev_t - encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
	CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)");
/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

struct user_segment_descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
struct gate_descriptor idt_arr[MAXCPU][NIDT];
union descriptor ldt[NLDT];		/* local descriptor table */

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt;
struct region_descriptor r_idt_arr[MAXCPU];

/* JG proc0paddr is a virtual address */
char proc0paddr_buff[LWKT_THREAD_STACK];

/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	0x0,			/* segment base address  */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GCODE_SEL	1 Code Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_KPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GDATA_SEL	2 Data Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_KPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUCODE32_SEL	3 32 bit Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUCODE_SEL	5 64 bit Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GPROC0_SEL	6 Proc 0 Tss Descriptor */
{	0x0,			/* segment base address */
	sizeof(struct x86_64tss)-1,/* length - all address space */
	SDT_SYSTSS,		/* segment type */
	SEL_KPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* Actually, the TSS is a system descriptor which is double size */
{	0x0,			/* segment base address  */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GUGS32_SEL	8 32 bit GS Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
};
setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist)
	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		struct gate_descriptor *ip = &idt_arr[cpu][idx];

		ip->gd_looffset = (uintptr_t)func;
		ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
		ip->gd_hioffset = ((uintptr_t)func)>>16 ;
	}

setidt(int idx, inthand_t *func, int typ, int dpl, int ist, int cpu)
	struct gate_descriptor *ip;

	KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu %d", cpu));

	ip = &idt_arr[cpu][idx];
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_hioffset = ((uintptr_t)func)>>16 ;

#define	IDTVEC(name)	__CONCAT(X,name)
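
/*
 * Illustrative note (not from the original source): IDTVEC(name) simply
 * token-pastes an "X" prefix, so IDTVEC(div) expands to Xdiv, the assembly
 * entry point for the divide-error trap declared below.
 */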
extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;

ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;

ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * Memory is aligned to PHYSMAP_ALIGN which must be a multiple
 * of PAGE_SIZE.  This also greatly reduces the memory test time
 * which would otherwise be excessive on machines with > 8G of ram.
 *
 * XXX first should be vm_paddr_t.
 */
#define PHYSMAP_ALIGN		(vm_paddr_t)(128 * 1024)
#define PHYSMAP_ALIGN_MASK	(vm_paddr_t)(PHYSMAP_ALIGN - 1)
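
/*
 * Illustrative note (not from the original source): rounding with these
 * masks works the usual way for a power-of-two alignment, e.g. with
 * PHYSMAP_ALIGN = 128KB (0x20000):
 *
 *	round up:   (pa + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK
 *	round down: pa & ~PHYSMAP_ALIGN_MASK
 *
 * so a segment base of 0x1234567 rounds up to 0x1240000 and a segment
 * end of 0x1234567 rounds down to 0x1220000, as done in getmemsize().
 */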
vm_paddr_t physmap[PHYSMAP_SIZE];
struct bios_smap *smapbase, *smap, *smapend;
struct efi_map_header *efihdrbase;

#define PHYSMAP_HANDWAVE	(vm_paddr_t)(2 * 1024 * 1024)
#define PHYSMAP_HANDWAVE_MASK	(PHYSMAP_HANDWAVE - 1)
add_smap_entries(int *physmap_idx)
	smapsize = *((u_int32_t *)smapbase - 1);
	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (smap->length == 0)
			continue;

		for (i = 0; i <= *physmap_idx; i += 2) {
			if (smap->base < physmap[i + 1]) {
				if (boothowto & RB_VERBOSE) {
					kprintf("Overlapping or non-monotonic "
						"memory region, ignoring "
				}
			}
		}
		if (i <= *physmap_idx)
			continue;

		Realmem += smap->length;

		if (smap->base == physmap[*physmap_idx + 1]) {
			physmap[*physmap_idx + 1] += smap->length;
		}

		if (*physmap_idx == PHYSMAP_SIZE) {
			kprintf("Too many segments in the physical "
				"address map, giving up\n");
		}
		physmap[*physmap_idx] = smap->base;
		physmap[*physmap_idx + 1] = smap->base + smap->length;
	}

#define efi_next_descriptor(ptr, size) \
	((struct efi_md *)(((uint8_t *) ptr) + size))
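
/*
 * Illustrative note (not from the original source): UEFI does not promise
 * that struct efi_md matches the firmware's descriptor layout exactly, so
 * the walk below advances by the descriptor_size reported in the map
 * header rather than by sizeof(struct efi_md); efi_next_descriptor() just
 * adds that byte count to the current pointer.
 */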
add_efi_map_entries(int *physmap_idx)
	struct efi_md *map, *p;
	static const char *types[] = {
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"ACPIReclaimMemory",
		"MemoryMappedIOPortSpace",
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdrbase + efisz);

	if (efihdrbase->descriptor_size == 0)
		return;
	ndesc = efihdrbase->memory_size / efihdrbase->descriptor_size;

	if (boothowto & RB_VERBOSE)
		kprintf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdrbase->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type <= EFI_MD_TYPE_PALCODE)
				type = types[p->md_type];
			kprintf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
			if (p->md_attr & EFI_MD_ATTR_WC)
			if (p->md_attr & EFI_MD_ATTR_WT)
			if (p->md_attr & EFI_MD_ATTR_WB)
			if (p->md_attr & EFI_MD_ATTR_UCE)
			if (p->md_attr & EFI_MD_ATTR_WP)
			if (p->md_attr & EFI_MD_ATTR_RP)
			if (p->md_attr & EFI_MD_ATTR_XP)
			if (p->md_attr & EFI_MD_ATTR_RT)
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
		}

		Realmem += p->md_pages * PAGE_SIZE;

		if (p->md_phys == physmap[*physmap_idx + 1]) {
			physmap[*physmap_idx + 1] += p->md_pages * PAGE_SIZE;
		}

		if (*physmap_idx == PHYSMAP_SIZE) {
			kprintf("Too many segments in the physical "
				"address map, giving up\n");
		}
		physmap[*physmap_idx] = p->md_phys;
		physmap[*physmap_idx + 1] = p->md_phys + p->md_pages * PAGE_SIZE;
	}
struct fb_info efi_fb_info;
static int have_efi_framebuffer = 0;

efi_fb_init_vaddr(int direct_map)
	vm_offset_t addr, v;

	v = efi_fb_info.vaddr;
	sz = efi_fb_info.stride * efi_fb_info.height;

	addr = PHYS_TO_DMAP(efi_fb_info.paddr);
	if (addr >= DMAP_MIN_ADDRESS && addr + sz < DMAP_MAX_ADDRESS)
		efi_fb_info.vaddr = addr;
	efi_fb_info.vaddr = (vm_offset_t)pmap_mapdev_attr(
	    efi_fb_info.paddr, sz, PAT_WRITE_COMBINING);

	if (v == 0 && efi_fb_info.vaddr != 0)
		memset((void *)efi_fb_info.vaddr, 0x77, sz);

probe_efi_fb(int early)
	struct efi_fb *efifb;

	if (have_efi_framebuffer) {
		    (efi_fb_info.vaddr == 0 ||
		     efi_fb_info.vaddr == PHYS_TO_DMAP(efi_fb_info.paddr)))
			efi_fb_init_vaddr(0);
	}

	kmdp = preload_search_by_type("elf kernel");
		kmdp = preload_search_by_type("elf64 kernel");
	efifb = (struct efi_fb *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_FB);

	have_efi_framebuffer = 1;

	efi_fb_info.is_vga_boot_display = 1;
	efi_fb_info.width = efifb->fb_width;
	efi_fb_info.height = efifb->fb_height;
	efi_fb_info.stride = efifb->fb_stride * 4;
	efi_fb_info.depth = 32;
	efi_fb_info.paddr = efifb->fb_addr;
		efi_fb_info.vaddr = 0;
		efi_fb_init_vaddr(0);
	efi_fb_info.restore = NULL;
	efi_fb_info.device = NULL;

efifb_startup(void *arg)

SYSINIT(efi_fb_info, SI_BOOT1_POST, SI_ORDER_FIRST, efifb_startup, NULL);
getmemsize(caddr_t kmdp, u_int64_t first)
	int off, physmap_idx, pa_indx, da_indx;
	vm_paddr_t msgbuf_size;
	u_long physmem_tunable;
	quad_t dcons_addr, dcons_size;

	bzero(physmap, sizeof(physmap));

	/*
	 * get memory map from INT 15:E820, kindly supplied by the loader.
	 *
	 * subr_module.c says:
	 *	"Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */
	efihdrbase = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL && efihdrbase == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdrbase == NULL)
		add_smap_entries(&physmap_idx);
	else
		add_efi_map_entries(&physmap_idx);

	base_memory = physmap[1] / 1024;
	/* make hole for AP bootstrap code */
	physmap[1] = mp_bootaddress(base_memory);

	/* Save EBDA address, if any */
	ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e));

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

		Maxmem = MAXMEM / 4;

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	/*
	 * Blowing out the DMAP will blow up the system.
	 */
	if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) {
		kprintf("Limiting Maxmem due to DMAP size\n");
		Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS);
	}

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE)) {
		kprintf("Physical memory use set to %ldK\n", Maxmem * 4);
	}

	/*
	 * Call pmap initialization to make new kernel address space
	 */
	pmap_bootstrap(&first);
	physmap[0] = PAGE_SIZE;

	/*
	 * Align the physmap to PHYSMAP_ALIGN and cut out anything
	 * above Maxmem.
	 */
	for (i = j = 0; i <= physmap_idx; i += 2) {
		if (physmap[i+1] > ptoa(Maxmem))
			physmap[i+1] = ptoa(Maxmem);
		physmap[i] = (physmap[i] + PHYSMAP_ALIGN_MASK) &
			     ~PHYSMAP_ALIGN_MASK;
		physmap[i+1] = physmap[i+1] & ~PHYSMAP_ALIGN_MASK;

		physmap[j] = physmap[i];
		physmap[j+1] = physmap[i+1];

		if (physmap[i] < physmap[i+1])
			j += 2;
	}
	physmap_idx = j - 2;

	/*
	 * Align anything else used in the validation loop.
	 */
	first = (first + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;

	/*
	 * Size up each available chunk of physical memory.
	 */
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];

	/*
	 * Get dcons buffer address
	 */
	if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    kgetenv_quad("dcons.size", &dcons_size) == 0)

	/*
	 * Validate the physical memory.  The physical memory segments
	 * have already been aligned to PHYSMAP_ALIGN which is a multiple
	 * of PAGE_SIZE.
	 */
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t incr = PHYSMAP_ALIGN;

		end = physmap[i + 1];

		for (pa = physmap[i]; pa < end; pa += incr) {
			volatile uint64_t *ptr = (uint64_t *)CADDR1;

			incr = PHYSMAP_ALIGN;

			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= 0x200000 && pa < first)

			/*
			 * block out dcons buffer
			 */
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size) {
			}

			/*
			 * Always test the first and last block supplied in
			 * the map entry, but it just takes too long to run
			 * the test these days and we already have to skip
			 * pages.  Handwave it on PHYSMAP_HANDWAVE boundaries.
			 */
			if (pa != physmap[i]) {
				vm_paddr_t bytes = end - pa;
				if ((pa & PHYSMAP_HANDWAVE_MASK) == 0 &&
				    bytes >= PHYSMAP_HANDWAVE + PHYSMAP_ALIGN) {
					incr = PHYSMAP_HANDWAVE;
				}
			}

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			    kernel_pmap.pmap_bits[PG_V_IDX] |
			    kernel_pmap.pmap_bits[PG_RW_IDX] |
			    kernel_pmap.pmap_bits[PG_N_IDX];
			cpu_invlpg(__DEVOLATILE(void *, ptr));

			/*
			 * Test for alternating 1's and 0's
			 */
			*ptr = 0xaaaaaaaaaaaaaaaaLLU;
			if (*ptr != 0xaaaaaaaaaaaaaaaaLLU)
			/*
			 * Test for alternating 0's and 1's
			 */
			*ptr = 0x5555555555555555LLU;
			if (*ptr != 0x5555555555555555LLU)
			*ptr = 0xffffffffffffffffLLU;
			if (*ptr != 0xffffffffffffffffLLU)

			/*
			 * Restore original value.
			 */

			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;

			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += incr;
			} else {
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					kprintf(
		"Too many holes in the physical address space, giving up\n");
				}
				phys_avail[pa_indx++] = pa;
				phys_avail[pa_indx] = pa + incr;
			}
			physmem += incr / PAGE_SIZE;

			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += incr;
			} else {
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
				}
				dump_avail[da_indx++] = pa;
				dump_avail[da_indx] = pa + incr;
			}
		}
	}

	/*
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;

	while (phys_avail[pa_indx - 1] + PHYSMAP_ALIGN +
	       msgbuf_size >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= msgbuf_size;

	avail_end = phys_avail[pa_indx];

	/* Map the message buffer. */
	for (off = 0; off < msgbuf_size; off += PAGE_SIZE) {
		pmap_kenter((vm_offset_t)msgbufp + off,
			    phys_avail[pa_indx] + off);
	}
	/* Try to get EFI framebuffer working as early as possible */
	if (have_efi_framebuffer)
		efi_fb_init_vaddr(1);
2152 struct machintr_abi MachIntrABI
;
2163 * 7 Device Not Available (x87)
2165 * 9 Coprocessor Segment overrun (unsupported, reserved)
2167 * 11 Segment not present
2169 * 13 General Protection
2172 * 16 x87 FP Exception pending
2173 * 17 Alignment Check
2175 * 19 SIMD floating point
2177 * 32-255 INTn/external sources
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x, cpu;
	int metadata_missing, off;
	struct mdglobaldata *gd;
	u_int64_t msr;

	/*
	 * Prevent lowering of the ipl if we call tsleep() early.
	 */
	gd = &CPU_prvspace[0]->mdglobaldata;
	bzero(gd, sizeof(*gd));

	/*
	 * Note: on both UP and SMP curthread must be set non-NULL
	 * early in the boot sequence because the system assumes
	 * that 'curthread' is never NULL.
	 */
	gd->mi.gd_curthread = &thread0;
	thread0.td_gd = &gd->mi;

	atdevbase = ISA_HOLE_START + PTOV_OFFSET;

	metadata_missing = 0;
	if (bootinfo.bi_modulep) {
		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}
	if (bootinfo.bi_envp)
		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;

	preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
	preload_bootstrap_relocate(PTOV_OFFSET);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);

	if (boothowto & RB_VERBOSE)
		bootverbose++;

	/*
	 * Default MachIntrABI to ICU
	 */
	MachIntrABI = MachIntrABI_ICU;
	/*
	 * start with one cpu.  Note: with one cpu, ncpus2_shift, ncpus2_mask,
	 * and ncpus_fit_mask remain 0.
	 */
	ncpus = 1;

	/* Init basic tunables, hz etc */
	init_param1();

	/*
	 * make gdt memory segments
	 */
	gdt_segs[GPROC0_SEL].ssd_base =
		(uintptr_t) &CPU_prvspace[0]->mdglobaldata.gd_common_tss;

	gd->mi.gd_prvspace = CPU_prvspace[0];

	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
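	/*
	 * Note: in long mode the TSS descriptor (GPROC0_SEL) is a 16-byte
	 * system segment descriptor, so it occupies two consecutive GDT
	 * slots; that is why the loop above skips both GPROC0_SEL and
	 * GPROC0_SEL + 1 and the entry is built with ssdtosyssd() instead
	 * of ssdtosd().
	 */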
	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	mi_gdinit(&gd->mi, 0);
	cpu_gdinit(gd, 0);
	proc0paddr = proc0paddr_buff;
	mi_proc0init(&gd->mi, proc0paddr);
	safepri = TDPRI_MAX;

	/* spinlocks and the BGL */
	init_locks();
	for (x = 0; x < NIDT; x++)
		setidt_global(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
	setidt_global(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);

	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
		r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
	}

	lidt(&r_idt_arr[0]);
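	/*
	 * Note: the final argument to setidt_global() above is the IST
	 * index.  NMI and double-fault use IST slot 1, so they are entered
	 * on the dedicated stack programmed into tss_ist1 below and do not
	 * depend on the interrupted thread having a sane %rsp.
	 */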
	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

	if (metadata_missing)
		kprintf("WARNING: loader(8) metadata is missing!\n");

	/*
	 * Initialize IRQ mapping
	 *
	 * SHOULD be after elcr_probe()
	 */
	MachIntrABI_ICU.initmap();
	MachIntrABI_IOAPIC.initmap();

	if (boothowto & RB_KDB)
		Debugger("Boot flags requested debugger");

	finishidentcpu();	/* Final stage of CPU initialization */
	setidt(6, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL,
	       GSEL(GCODE_SEL, SEL_KPL));
	setidt(13, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL,
	       GSEL(GCODE_SEL, SEL_KPL));
	identify_cpu();		/* Final stage of CPU initialization */
	initializecpu(0);	/* Initialize CPU registers */
	/*
	 * On modern Intel cpus, Haswell or later, cpu_idle_hlt=1 is better
	 * because the cpu does significant power management in MWAIT
	 * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
	 *
	 * On modern AMD cpus cpu_idle_hlt=3 is better, because the cpu does
	 * significant power management in HLT or ACPI (but cpu_idle_hlt=1
	 * would try to use MWAIT).
	 *
	 * On older AMD or Intel cpus, cpu_idle_hlt=2 is better because ACPI
	 * is needed to reduce power consumption, but wakeup times are often
	 * longer.
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    CPUID_TO_MODEL(cpu_id) >= 0x3C) {	/* Haswell or later */
		cpu_idle_hlt = 1;
	}
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    CPUID_TO_FAMILY(cpu_id) >= 0x14) {	/* Bobcat or later */
		cpu_idle_hlt = 3;
	}

	TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
	TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
	TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
	TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);

	/*
	 * Some of the virtual machines do not work w/ I/O APIC
	 * enabled.  If the user does not explicitly enable or
	 * disable the I/O APIC (ioapic_enable < 0), then we
	 * disable I/O APIC on all virtual machines.
	 *
	 * This must be done after identify_cpu(), which sets
	 * 'cpu_feature2'.
	 */
	if (ioapic_enable < 0) {
		if (cpu_feature2 & CPUID2_VMM)
			ioapic_enable = 0;
		else
			ioapic_enable = 1;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	gd->gd_common_tss.tss_rsp0 =
		(register_t)(thread0.td_kstack +
			     KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb));
	/* Ensure the stack is aligned to 16 bytes */
	gd->gd_common_tss.tss_rsp0 &= ~(register_t)0xF;

	/* double fault stack */
	gd->gd_common_tss.tss_ist1 =
		(long)&gd->mi.gd_prvspace->idlestack[
			sizeof(gd->mi.gd_prvspace->idlestack)];

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	gd->gd_common_tss.tss_iobase = sizeof(struct x86_64tss);

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gd->gd_tss_gdt = &gdt[GPROC0_SEL];
	gd->gd_common_tssd = *gd->gd_tss_gdt;
	ltr(gsel_tss);

	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL);
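	/*
	 * Note on the MSRs above: MSR_LSTAR holds the 64-bit SYSCALL entry
	 * point and MSR_CSTAR the 32-bit compat entry point.  MSR_STAR packs
	 * the kernel code/stack selector base into bits 47:32 and the user
	 * selector base used by SYSRET into bits 63:48, which is what the
	 * two shifts construct.  MSR_SF_MASK lists the RFLAGS bits cleared
	 * on entry; clearing PSL_I in particular means the handler starts
	 * with interrupts disabled.
	 */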
	getmemsize(kmdp, physfree);
	init_param2(physmem);
	/* now running on new page tables, configured, and u/iom is accessible */

	/* Map the message buffer. */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);

	msgbufinit(msgbufp, MSGBUF_SIZE);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_pcb->pcb_cr3 = KPML4phys;
	thread0.td_pcb->pcb_ext = NULL;
	lwp0.lwp_md.md_regs = &proc0_tf; /* XXX needed? */

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}
/*
 * Initialize machine-dependent portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
 */
void
cpu_gdinit(struct mdglobaldata *gd, int cpu)
{
	if (cpu)
		gd->mi.gd_curthread = &gd->mi.gd_idlethread;

	lwkt_init_thread(&gd->mi.gd_idlethread,
			 gd->mi.gd_prvspace->idlestack,
			 sizeof(gd->mi.gd_prvspace->idlestack),
			 0, &gd->mi);
	lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
	gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
	gd->mi.gd_idlethread.td_sp -= sizeof(void *);
	*(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
}
/*
 * We only have to check for DMAP bounds, the globaldata space is
 * actually part of the kernel_map so we don't have to waste time
 * checking CPU_prvspace[*].
 */
int
is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
{
	if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
	    eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
		return (TRUE);
	}
	if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
		return (TRUE);
	return (FALSE);
}

struct globaldata *
globaldata_find(int cpu)
{
	KKASSERT(cpu >= 0 && cpu < ncpus);
	return (&CPU_prvspace[cpu]->mdglobaldata.mi);
}
/*
 * This path should be safe from the SYSRET issue because only stopped threads
 * can have their %rip adjusted this way (and all heavy weight thread switches
 * clear QUICKREF and thus do not use SYSRET).  However, the code path is
 * convoluted so add a safety by forcing %rip to be canonical.
 */
int
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
	if (addr & 0x0000800000000000LLU)
		lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
	else
		lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
	return (0);
}
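/*
 * For illustration only: a minimal, self-contained sketch (the helper name
 * below is hypothetical and is not used elsewhere in this file) of the
 * canonical-address rule applied above, i.e. bit 47 is sign-extended through
 * bits 63:48.
 */
#if 0	/* illustration only, not compiled */
static inline unsigned long
canonicalize_va(unsigned long addr)
{
	if (addr & 0x0000800000000000LLU)		/* bit 47 set */
		return (addr | 0xFFFF000000000000LLU);	/* extend with 1's */
	return (addr & 0x0000FFFFFFFFFFFFLLU);		/* clear the top bits */
}
#endif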
int
ptrace_single_step(struct lwp *lp)
{
	lp->lwp_md.md_regs->tf_rflags |= PSL_T;
	return (0);
}
int
fill_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	if ((tp = lp->lwp_md.md_regs) == NULL)
		return EINVAL;
	bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
	return (0);
}

int
set_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
	return (0);
}
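/*
 * Note on the checks in set_regs(): EFL_SECURE() rejects user-supplied
 * %rflags values that would change privileged bits (e.g. IOPL), and
 * CS_SECURE() rejects a %cs that is not a user code selector.  Without
 * these checks a ptrace-style register write could construct a trapframe
 * that returns to user mode with elevated privilege.
 */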
static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	bzero(sv_87, sizeof(*sv_87));

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}

static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}

int
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
	if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
		return EINVAL;
#ifndef CPU_DISABLE_SSE
	fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
			(struct save87 *)fpregs);
	return (0);
#else
	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
	return (0);
#endif
}

int
set_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
#ifndef CPU_DISABLE_SSE
	set_fpregs_xmm((struct save87 *)fpregs,
		       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
	return (0);
#else
	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
	return (0);
#endif
}
int
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (lp == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[4] = rdr4();
		dbregs->dr[5] = rdr5();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
		return (0);
	}
	if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL)
		return EINVAL;
	dbregs->dr[0] = pcb->pcb_dr0;
	dbregs->dr[1] = pcb->pcb_dr1;
	dbregs->dr[2] = pcb->pcb_dr2;
	dbregs->dr[3] = pcb->pcb_dr3;
	dbregs->dr[6] = pcb->pcb_dr6;
	dbregs->dr[7] = pcb->pcb_dr7;
	return (0);
}
int
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	if (lp == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr4(dbregs->dr[4]);
		load_dr5(dbregs->dr[5]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		struct pcb *pcb;
		struct ucred *ucred;
		int i;
		uint64_t mask1, mask2;

		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected crash or reboot.
		 */
		/* JG this loop looks unreadable */
		/*
		 * Check 4 2-bit fields for invalid patterns.
		 * These fields are R/Wi, for i = 0..3
		 */
		/* Is 10 in LENi allowed when running in compatibility mode? */
		/*
		 * Pattern 10 in R/Wi might be used to indicate a breakpoint
		 * on I/O.  Further analysis should be carried out to decide
		 * if it is safe and useful to provide access to that
		 * capability.
		 */
		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
		     i++, mask1 <<= 4, mask2 <<= 4)
			if ((dbregs->dr[7] & mask1) == mask2)
				return (EINVAL);

		pcb = lp->lwp_thread->td_pcb;
		ucred = lp->lwp_proc->p_ucred;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * the process's address space, unless, perhaps, we were
		 * called by uid 0.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */
		if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
			if (dbregs->dr[7] & 0x3) {
				/* dr0 is enabled */
				if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<2)) {
				/* dr1 is enabled */
				if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<4)) {
				/* dr2 is enabled */
				if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<6)) {
				/* dr3 is enabled */
				if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		pcb->pcb_flags |= PCB_DBREGS;
	}

	return (0);
}
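/*
 * For illustration only (hypothetical helper, not part of the kernel API):
 * the validation loop above walks the four 2-bit R/Wn fields of DR7, which
 * start at bit 16 and are spaced 4 bits apart, rejecting the undefined/IO
 * pattern 10b.  A standalone sketch of the same check:
 */
#if 0	/* illustration only, not compiled */
static int
dr7_rw_fields_valid(uint64_t dr7)
{
	int i;

	for (i = 0; i < 4; ++i) {
		unsigned int field = (dr7 >> (16 + 4 * i)) & 0x3;

		if (field == 0x2)	/* 10b: I/O breakpoint, not allowed */
			return 0;
	}
	return 1;
}
#endif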
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int64_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0xff) == 0) {
		/*
		 * none of the Gn/Ln enable bits in the dr7 register are set,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers
		 */
		return 0;
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0xf;

	if (bp == 0) {
		/*
		 * None of the breakpoint bits are set meaning this
		 * trap was not caused by any of the debug registers
		 */
		return 0;
	}

	/*
	 * at least one of the breakpoints was hit, check to see
	 * which ones and if any of them are user space addresses
	 */
	if (bp & 0x01)
		addr[nbp++] = (caddr_t)rdr0();
	if (bp & 0x02)
		addr[nbp++] = (caddr_t)rdr1();
	if (bp & 0x04)
		addr[nbp++] = (caddr_t)rdr2();
	if (bp & 0x08)
		addr[nbp++] = (caddr_t)rdr3();

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}
void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}

/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called inside DDB.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */

/* silence compiler warnings */
u_char inb(u_int);
void outb(u_int, u_char);

u_char
inb(u_int port)
{
	u_char data;

	/*
	 * We use %%dx and not %1 here because i/o is done at %dx and not at
	 * %edx, while gcc generates inferior code (movw instead of movl)
	 * if we tell it to load (u_short) port.
	 */
	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
	return (data);
}

void
outb(u_int port, u_char data)
{
	u_char al;

	/*
	 * Use an unnecessary assignment to help gcc's register allocator.
	 * This makes a large difference for gcc-1.40 and a tiny difference
	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
	 * best results.  gcc-2.6.0 can't handle this.
	 */
	al = data;
	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
}
/*
 * initialize all the SMP locks
 */

/* critical region when masking or unmasking interrupts */
struct spinlock_deprecated imen_spinlock;

/* lock region used by kernel profiling */
struct spinlock_deprecated mcount_spinlock;

/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct spinlock_deprecated com_spinlock;

/* lock regions around the clock hardware */
struct spinlock_deprecated clock_spinlock;

static void
init_locks(void)
{
	/*
	 * Get the initial mplock with a count of 1 for the BSP.
	 * This uses a LOGICAL cpu ID, ie BSP == 0.
	 */
	cpu_get_initial_mplock();

	spin_init_deprecated(&mcount_spinlock);
	spin_init_deprecated(&imen_spinlock);
	spin_init_deprecated(&com_spinlock);
	spin_init_deprecated(&clock_spinlock);

	/* our token pool needs to work early */
	lwkt_token_pool_init();
}
int
cpu_mwait_hint_valid(uint32_t hint)
{
	int cx_idx, sub;

	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= CPU_MWAIT_CX_MAX)
		return FALSE;

	sub = MWAIT_EAX_TO_CX_SUB(hint);
	if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return FALSE;

	return TRUE;
}

void
cpu_mwait_cx_no_bmsts(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
}

void
cpu_mwait_cx_no_bmarb(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
}

static int
cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
{
	int old_cx_idx, sub = 0;

	if (hint >= 0) {
		old_cx_idx = MWAIT_EAX_TO_CX(hint);
		sub = MWAIT_EAX_TO_CX_SUB(hint);
	} else if (hint == CPU_MWAIT_HINT_AUTO) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
	} else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
	} else {
		old_cx_idx = CPU_MWAIT_CX_MAX;
	}

	if (!CPU_MWAIT_HAS_CX)
		strlcpy(name, "NONE", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
		strlcpy(name, "AUTO", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
		strlcpy(name, "AUTODEEP", namelen);
	else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
	    sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
		strlcpy(name, "INVALID", namelen);
	else
		ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);

	return old_cx_idx;
}

static int
cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
{
	int cx_idx, sub, hint;
	char *ptr, *start;

	if (allow_auto && strcmp(name, "AUTO") == 0) {
		hint = CPU_MWAIT_HINT_AUTO;
		cx_idx = CPU_MWAIT_C2;
		goto done;
	}
	if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
		hint = CPU_MWAIT_HINT_AUTODEEP;
		cx_idx = CPU_MWAIT_C3;
		goto done;
	}

	if (strlen(name) < 4 || toupper(name[0]) != 'C')
		return -1;
	start = &name[1];
	ptr = NULL;

	cx_idx = strtol(start, &ptr, 10);
	if (ptr == start || *ptr != '/')
		return -1;
	if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
		return -1;

	start = ptr + 1;
	ptr = NULL;

	sub = strtol(start, &ptr, 10);
	if (*ptr != '\0')
		return -1;
	if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return -1;

	hint = MWAIT_EAX_HINT(cx_idx, sub);
done:
	*hint0 = hint;
	return cx_idx;
}
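/*
 * Naming note: cpu_mwait_cx_hint2name() renders a hint as "C<major>/<sub>"
 * (e.g. "C1/0"), or as "AUTO"/"AUTODEEP" when automatic selection is
 * permitted, and cpu_mwait_cx_name2hint() above is its inverse, parsing the
 * same strings back into an MWAIT hint for the sysctl handlers that follow.
 */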
static int
cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
{
	int error;

	if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)
		return EOPNOTSUPP;
	if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
		error = cputimer_intr_powersave_addreq();
		if (error)
			return error;
	} else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
		cputimer_intr_powersave_remreq();
	}
	return 0;
}

static int
cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
    boolean_t allow_auto)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	hint = *hint0;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
	    allow_auto);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error)
		return error;

	*hint0 = hint;
	return 0;
}

static int
cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension"));

	hint = stat->hint;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	strlcpy(name, cx_name, sizeof(name));
	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error)
		return error;

	stat->hint = hint;
	return 0;
}

static int
cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	int hint = cpu_mwait_halt_global;
	int error, cx_idx, cpu;
	char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];

	cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	/* Save name for later per-cpu CX configuration */
	strlcpy(cx_name, name, sizeof(cx_name));

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	/* Change per-cpu CX configuration */
	for (cpu = 0; cpu < ncpus; ++cpu) {
		error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
		if (error)
			return error;
	}

	cpu_mwait_halt_global = hint;
	return 0;
}

static int
cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct cpu_idle_stat *stat = arg1;
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &stat->hint, TRUE);
	return error;
}

static int
cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &cpu_mwait_spin, FALSE);
	return error;
}
/*
 * This manual debugging code is called unconditionally from Xtimer
 * (the per-cpu timer interrupt), whether or not the current thread is in
 * a critical section, and can be useful in tracking down lockups.
 *
 * NOTE: MANUAL DEBUG CODE
 */
static int saveticks[SMP_MAXCPU];
static int savecounts[SMP_MAXCPU];

void
pcpu_timer_always(struct intrframe *frame)
{
	globaldata_t gd = mycpu;
	int cpu = gd->gd_cpuid;
	char buf[64];
	short *gptr;
	int i;

	gptr = (short *)0xFFFFFFFF800b8000 + 80 * cpu;
	*gptr = ((*gptr + 1) & 0x00FF) | 0x0700;
	++gptr;

	ksnprintf(buf, sizeof(buf), " %p %16s %d %16s ",
		  (void *)frame->if_rip, gd->gd_curthread->td_comm, ticks,
		  gd->gd_infomsg);
	for (i = 0; buf[i]; ++i) {
		gptr[i] = 0x0700 | (unsigned char)buf[i];
	}
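	/*
	 * The magic address above is the kernel mapping of the legacy VGA
	 * text buffer at physical 0xb8000 (KERNBASE + 0xb8000).  Each cpu
	 * owns one 80-character row: the first cell is incremented as a
	 * heartbeat spinner and the rest of the row shows %rip, the current
	 * thread and the tick count, so a wedged cpu can be spotted on the
	 * console even when nothing else is responsive.
	 */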
	if (saveticks[gd->gd_cpuid] != ticks) {
		saveticks[gd->gd_cpuid] = ticks;
		savecounts[gd->gd_cpuid] = 0;
	}
	++savecounts[gd->gd_cpuid];
	if (savecounts[gd->gd_cpuid] > 2000 && panicstr == NULL) {
		panic("cpu %d panicking on ticks failure",
		      gd->gd_cpuid);
	}
	for (i = 0; i < ncpus; ++i) {
		int delta;

		if (saveticks[i] && panicstr == NULL) {
			delta = saveticks[i] - ticks;
			if (delta < -10 || delta > 10) {
				panic("cpu %d panicking on cpu %d watchdog",