/*
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 2008 The DragonFly Project.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
 */
//#include "use_npx.h"
#include "opt_directio.h"
#include "opt_msgbuf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/reboot.h>
#include <sys/msgbuf.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/usched.h>
#include <sys/ctype.h>
#include <sys/serialize.h>
#include <sys/systimer.h>

#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>
#include <sys/mutex2.h>

#include <machine/cpu.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#include <machine/bootinfo.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb_ext.h>		/* pcb.h included via sys/user.h */
#include <machine/globaldata.h>		/* CPU_prvspace */
#include <machine/smp.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <machine/framebuffer.h>

#include <bus/isa/isa_device.h>
#include <machine_base/isa/isa_intr.h>
#include <bus/isa/rtc.h>
#include <sys/random.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>

#include <sys/machintr.h>
#include <machine_base/icu/icu_abi.h>
#include <machine_base/icu/elcr_var.h>
#include <machine_base/apic/lapic.h>
#include <machine_base/apic/ioapic.h>
#include <machine_base/apic/ioapic_abi.h>
#include <machine/mptable.h>
#define PHYSMAP_ENTRIES		10

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

extern void printcpuinfo(void);	/* XXX header file */
extern void identify_cpu(void);
extern void finishidentcpu(void);
extern void panicifcpuunsupported(void);

static void cpu_startup(void *);
static void pic_finish(void *);
static void cpu_finish(void *);
static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */
static void init_locks(void);

extern void pcpu_timer_always(struct intrframe *);
SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL);
SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL);
extern vm_offset_t ksym_start, ksym_end;

struct privatespace CPU_prvspace_bsp __aligned(4096);
struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp };

vm_paddr_t efi_systbl_phys;
int	_udatasel, _ucodesel, _ucode32sel;

int64_t tsc_offsets[MAXCPU];
cpumask_t smp_idleinvl_mask;
cpumask_t smp_idleinvl_reqs;

static int cpu_mwait_halt_global; /* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */
#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
    CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
    CTLFLAG_RD, &tlb_flush_count, 0, "");
#endif
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt,
    CTLFLAG_RD, &cpu_mwait_halt_global, 0, "");
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin, CTLFLAG_RD, &cpu_mwait_spin, 0,
    "monitor/mwait target state");
#define CPU_MWAIT_HAS_CX	\
	((cpu_feature2 & CPUID2_MON) && \
	 (cpu_mwait_feature & CPUID_MWAIT_EXT))

#define CPU_MWAIT_CX_NAMELEN	16

#define CPU_MWAIT_C1		1
#define CPU_MWAIT_C2		2
#define CPU_MWAIT_C3		3
#define CPU_MWAIT_CX_MAX	8

#define CPU_MWAIT_HINT_AUTO	-1	/* C1 and C2 */
#define CPU_MWAIT_HINT_AUTODEEP	-2	/* C3+ */
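
/*
 * Encoding sketch (informational, per the Intel SDM): an MWAIT hint in
 * EAX packs the target C-state minus one in bits 7:4 and the sub-state
 * in bits 3:0, so C3 sub-state 1 would be ((3 - 1) << 4) | 1 = 0x21.
 * The MWAIT_EAX_HINT()/MWAIT_EAX_TO_CX() macros used below follow this
 * layout.
 */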
SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features");
SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings");
struct cpu_mwait_cx {
	int			subcnt;
	char			name[4];
	struct sysctl_ctx_list	sysctl_ctx;
	struct sysctl_oid	*sysctl_tree;
};
static struct cpu_mwait_cx	cpu_mwait_cx_info[CPU_MWAIT_CX_MAX];
static char			cpu_mwait_cx_supported[256];

static int			cpu_mwait_c1_hints_cnt;
static int			cpu_mwait_hints_cnt;
static int			*cpu_mwait_hints;

static int			cpu_mwait_deep_hints_cnt;
static int			*cpu_mwait_deep_hints;

#define CPU_IDLE_REPEAT_DEFAULT	750

static u_int cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT;
static u_long cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT;
static u_int cpu_mwait_repeat_shift = 1;
#define CPU_MWAIT_C3_PREAMBLE_BM_ARB	0x1
#define CPU_MWAIT_C3_PREAMBLE_BM_STS	0x2

static int cpu_mwait_c3_preamble =
    CPU_MWAIT_C3_PREAMBLE_BM_ARB |
    CPU_MWAIT_C3_PREAMBLE_BM_STS;
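
/*
 * Background note (hedged, ACPI terminology): the two preamble bits
 * mirror the bus-master handling classically required before entering
 * C3 -- disabling bus-master arbitration (BM_ARB) and checking the
 * bus-master status bit (BM_STS).  cpu_mwait_attach() below clears
 * these requirements on CPUs that do not need them.
 */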
SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD,
    cpu_mwait_cx_supported, 0, "MWAIT supported C states");
SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD,
    &cpu_mwait_c3_preamble, 0, "C3+ preamble mask");
static int	cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,
		    int *, boolean_t);
static int	cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, cpu_mwait_cx_idle_sysctl, "A", "");
SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, cpu_mwait_cx_spin_sysctl, "A", "");
SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW,
    &cpu_mwait_repeat_shift, 0, "");
u_long ebda_addr = 0;

int imcr_present = 0;

int naps = 0;			/* # of Application processors */

struct mtx dt_lock;		/* lock for GDT and LDT */
static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
	u_long pmem = ctob(physmem);
	int error = sysctl_handle_long(oidp, &pmem, 0, req);

	return (error);
}

SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
    0, 0, sysctl_hw_physmem, "LU",
    "Total system memory in bytes (number of pages * page size)");
static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
	    ctob(physmem - vmstats.v_wire_count), req);

	return (error);
}

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD,
    0, 0, sysctl_hw_usermem, "IU", "");
static int
sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
	    x86_64_btop(avail_end - avail_start), req);

	return (error);
}

SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD,
    0, 0, sysctl_hw_availpages, "I", "");
/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END	(NELEM(phys_avail) - 2)
#define DUMP_AVAIL_ARRAY_END	(NELEM(dump_avail) - 2)
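
/*
 * Layout sketch (illustrative addresses only): phys_avail[] holds
 * base/end pairs terminated by a 0,0 pair, e.g.
 *
 *	phys_avail[0] = 0x0000000000001000; phys_avail[1] = 0x000000000009f000;
 *	phys_avail[2] = 0x0000000000200000; phys_avail[3] = 0x00000001f0000000;
 *	phys_avail[4] = 0;                  phys_avail[5] = 0;
 */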
static vm_offset_t buffer_sva, buffer_eva;
vm_offset_t clean_sva, clean_eva;
static vm_offset_t pager_sva, pager_eva;
static struct trapframe proc0_tf;
static void
cpu_startup(void *dummy)
{
	caddr_t v;
	vm_size_t size = 0;
	vm_offset_t firstaddr;

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	kprintf("%s", version);
	printcpuinfo();
	panicifcpuunsupported();
	kprintf("real memory  = %ju (%ju MB)\n",
		(intmax_t)Realmem,
		(intmax_t)Realmem / 1024 / 1024);
	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		kprintf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size1 =
			    phys_avail[indx + 1] - phys_avail[indx];

			kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
				(intmax_t)phys_avail[indx],
				(intmax_t)phys_avail[indx + 1] - 1,
				(intmax_t)size1,
				(intmax_t)(size1 / PAGE_SIZE));
		}
	}
	/*
	 * Allocate space for system data structures.
	 * The first available kernel virtual address is in "v".
	 * As pages of kernel virtual memory are allocated, "v" is incremented.
	 * As pages of memory are allocated and cleared,
	 * "firstaddr" is incremented.
	 * An index into the kernel page table corresponding to the
	 * virtual memory address maintained in "v" is kept in "mapaddr".
	 */

	/*
	 * Make two passes.  The first pass calculates how much memory is
	 * needed and allocates it.  The second pass assigns virtual
	 * addresses to the various data structures.
	 */
	firstaddr = 0;
again:
	v = (caddr_t)firstaddr;

#define	valloc(name, type, num) \
	    (name) = (type *)v; v = (caddr_t)((name)+(num))
#define	valloclim(name, type, num, lim) \
	    (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
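
/*
 * Usage sketch for the valloc() macro above; swbuf_kva and nswbuf_kva
 * are real uses from further below:
 *
 *	valloc(swbuf_kva, struct buf, nswbuf_kva);
 *
 * expands to
 *
 *	(swbuf_kva) = (struct buf *)v; v = (caddr_t)((swbuf_kva)+(nswbuf_kva));
 *
 * i.e. it carves a sized array out of the running cursor "v" without
 * touching physical memory on the sizing pass.
 */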
	/*
	 * The nominal buffer size (and minimum KVA allocation) is MAXBSIZE.
	 * For the first 64MB of ram nominally allocate sufficient buffers to
	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
	 * buffers to cover 1/20 of our ram over 64MB.  When auto-sizing
	 * the buffer cache we limit the eventual kva reservation to
	 * maxbcache bytes.
	 *
	 * factor represents the 1/4 x ram conversion.
	 */
	if (nbuf == 0) {
		long factor = 4 * NBUFCALCSIZE / 1024;
		long kbytes = physmem * (PAGE_SIZE / 1024);

		nbuf = 50;
		if (kbytes > 4096)
			nbuf += min((kbytes - 4096) / factor, 65536 / factor);
		if (kbytes > 65536)
			nbuf += (kbytes - 65536) * 2 / (factor * 5);
		if (maxbcache && nbuf > maxbcache / NBUFCALCSIZE)
			nbuf = maxbcache / NBUFCALCSIZE;
	}

	/*
	 * Do not allow the buffer_map to be more than 1/2 the size of the
	 * kernel_map.
	 */
	if (nbuf > (virtual_end - virtual_start +
		    virtual2_end - virtual2_start) / (MAXBSIZE * 2)) {
		nbuf = (virtual_end - virtual_start +
			virtual2_end - virtual2_start) / (MAXBSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf);
	}

	/*
	 * Do not allow the buffer_map to use more than 50% of available
	 * physical-equivalent memory.  Since the VM pages which back
	 * individual buffers are typically wired, having too many bufs
	 * can prevent the system from paging properly.
	 */
	if (nbuf > physmem * PAGE_SIZE / (NBUFCALCSIZE * 2)) {
		nbuf = physmem * PAGE_SIZE / (NBUFCALCSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf);
	}

	/*
	 * Do not allow the sizeof(struct buf) * nbuf to exceed half of
	 * the valloc space which is just the virtual_end - virtual_start
	 * section.  We use valloc() to allocate the buf header array.
	 */
	if (nbuf > (virtual_end - virtual_start) / sizeof(struct buf) / 2) {
		nbuf = (virtual_end - virtual_start) /
		       sizeof(struct buf) / 2;
		kprintf("Warning: nbufs capped at %ld due to valloc "
			"considerations\n", nbuf);
	}
	nswbuf_mem = lmax(lmin(nbuf / 32, 512), 8);
#ifdef NSWBUF_MIN
	if (nswbuf_mem < NSWBUF_MIN)
		nswbuf_mem = NSWBUF_MIN;
#endif
	nswbuf_kva = lmax(lmin(nbuf / 4, 512), 16);
#ifdef NSWBUF_MIN
	if (nswbuf_kva < NSWBUF_MIN)
		nswbuf_kva = NSWBUF_MIN;
#endif

	valloc(swbuf_mem, struct buf, nswbuf_mem);
	valloc(swbuf_kva, struct buf, nswbuf_kva);
	valloc(buf, struct buf, nbuf);
	/*
	 * End of first pass, size has been calculated so allocate memory
	 */
	if (firstaddr == 0) {
		size = (vm_size_t)(v - firstaddr);
		firstaddr = kmem_alloc(&kernel_map, round_page(size));
		if (firstaddr == 0)
			panic("startup: no room for tables");
		goto again;
	}

	/*
	 * End of second pass, addresses have been assigned
	 *
	 * nbuf is an int, make sure we don't overflow the field.
	 *
	 * On 64-bit systems we always reserve maximal allocations for
	 * buffer cache buffers and there are no fragmentation issues,
	 * so the KVA segment does not have to be excessively oversized.
	 */
	if ((vm_size_t)(v - firstaddr) != size)
		panic("startup: table size inconsistency");
	kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE) +
		      ((nswbuf_mem + nswbuf_kva) * MAXPHYS) + pager_map_size);
	kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE));
	buffer_map.system_map = 1;
	kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva,
		      ((vm_offset_t)(nswbuf_mem + nswbuf_kva) * MAXPHYS) +
		      pager_map_size);
	pager_map.system_map = 1;
	kprintf("avail memory = %ju (%ju MB)\n",
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages),
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages) /
		(1024 * 1024));
}
struct cpu_idle_stat {
	int	hint;
	int	reserved;
	u_long	halt;
	u_long	spin;
	u_long	repeat;
	u_long	repeat_last;
	u_long	repeat_delta;
	u_long	mwait_cx[CPU_MWAIT_CX_MAX];
} __cachealign;

#define CPU_IDLE_STAT_HALT	-1
#define CPU_IDLE_STAT_SPIN	-2

static struct cpu_idle_stat	cpu_idle_stats[MAXCPU];
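
/*
 * Sketch (informational): sysctl_cpu_idle_cnt() below receives one of
 * the negative markers above, or a Cx index >= 0, via arg2.  A negative
 * marker sums the per-cpu halt/spin counters, while a Cx index sums the
 * corresponding per-cpu mwait_cx[] slot.
 */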
static int
sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)
{
	int idx = arg2, cpu, error;
	u_long val = 0;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].halt;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].spin;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].mwait_cx[idx];
	}

	error = sysctl_handle_quad(oidp, &val, 0, req);
	if (error || req->newptr == NULL)
		return error;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].halt = 0;
		cpu_idle_stats[0].halt = val;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].spin = 0;
		cpu_idle_stats[0].spin = val;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].mwait_cx[idx] = 0;
		cpu_idle_stats[0].mwait_cx[idx] = val;
	}
	return 0;
}
static void
cpu_mwait_attach(void)
{
	struct sbuf sb;
	int hint_idx, i;

	if (!CPU_MWAIT_HAS_CX)
		return;

	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    (CPUID_TO_FAMILY(cpu_id) > 0xf ||
	     (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
	      CPUID_TO_MODEL(cpu_id) >= 0xf))) {
		int bm_sts = 1;

		/*
		 * Pentium dual-core, Core 2 and beyond do not need any
		 * additional activities to enter deep C-state, i.e. C3(+).
		 */
		cpu_mwait_cx_no_bmarb();

		TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts);
		if (!bm_sts)
			cpu_mwait_cx_no_bmsts();
	}

	sbuf_new(&sb, cpu_mwait_cx_supported,
	    sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN);

	for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) {
		struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i];
		int sub;

		ksnprintf(cx->name, sizeof(cx->name), "C%d", i);

		sysctl_ctx_init(&cx->sysctl_ctx);
		cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO,
		    cx->name, CTLFLAG_RW, NULL, "Cx control/info");
		if (cx->sysctl_tree == NULL)
			continue;

		cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i);
		SYSCTL_ADD_INT(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "subcnt", CTLFLAG_RD, &cx->subcnt, 0,
		    "sub-state count");
		SYSCTL_ADD_PROC(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0,
		    i, sysctl_cpu_idle_cnt, "Q", "# of times entered");

		for (sub = 0; sub < cx->subcnt; ++sub)
			sbuf_printf(&sb, "C%d/%d ", i, sub);
	}
	sbuf_trim(&sb);
	sbuf_finish(&sb);

	cpu_mwait_c1_hints_cnt = cpu_mwait_cx_info[CPU_MWAIT_C1].subcnt;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i)
		cpu_mwait_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_hints = kmalloc(sizeof(int) * cpu_mwait_hints_cnt,
	    M_DEVBUF, M_WAITOK);

	hint_idx = 0;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) {
		int j, subcnt;

		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_hints_cnt,
			    ("invalid mwait hint index %d", hint_idx));
			cpu_mwait_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
			++hint_idx;
		}
	}
	KASSERT(hint_idx == cpu_mwait_hints_cnt,
	    ("mwait hint count %d != index %d",
	     cpu_mwait_hints_cnt, hint_idx));

	if (bootverbose) {
		kprintf("MWAIT hints (%d C1 hints):\n",
		    cpu_mwait_c1_hints_cnt);
		for (i = 0; i < cpu_mwait_hints_cnt; ++i) {
			int hint = cpu_mwait_hints[i];

			kprintf("  C%d/%d hint 0x%04x\n",
			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
			    hint);
		}
	}

	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i)
		cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt,
	    M_DEVBUF, M_WAITOK);

	hint_idx = 0;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) {
		int j, subcnt;

		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_deep_hints_cnt,
			    ("invalid mwait deep hint index %d", hint_idx));
			cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
			++hint_idx;
		}
	}
	KASSERT(hint_idx == cpu_mwait_deep_hints_cnt,
	    ("mwait deep hint count %d != index %d",
	     cpu_mwait_deep_hints_cnt, hint_idx));

	if (bootverbose) {
		kprintf("MWAIT deep hints:\n");
		for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) {
			int hint = cpu_mwait_deep_hints[i];

			kprintf("  C%d/%d hint 0x%04x\n",
			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
			    hint);
		}
	}
	cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt;

	for (i = 0; i < ncpus; ++i) {
		char name[16];

		ksnprintf(name, sizeof(name), "idle%d", i);
		SYSCTL_ADD_PROC(NULL,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO,
		    name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i],
		    0, cpu_mwait_cx_pcpu_idle_sysctl, "A", "");
	}
}
static void
cpu_finish(void *dummy __unused)
{
	cpu_setregs();
	cpu_mwait_attach();
}

static void
pic_finish(void *dummy __unused)
{
	/* Log ELCR information */
	elcr_dump();

	/* Log MPTABLE information */
	mptable_pci_int_dump();

	/* Finalize PIC */
	MachIntrABI.finalize();
}
/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
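
/*
 * Stack layout sketch (informational) after sendsig() runs, higher
 * addresses on top:
 *
 *	[ old user stack / 128-byte red zone ]
 *	struct sigframe {                      <- sfp, 16-byte truncated
 *		sf_si  (siginfo, if SA_SIGINFO)
 *		sf_uc  (ucontext: saved trapframe + FPU state)
 *	}                                      <- new tf_rsp
 *
 * The handler arguments travel in %rdi/%rsi/%rdx/%rcx per the x86_64
 * calling convention.
 */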
static void
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	struct sigacts *psp = p->p_sigacts;
	struct sigframe sf, *sfp;
	int oonstack;
	char *sp;

	regs = lp->lwp_md.md_regs;
	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;

	/* Save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = lp->lwp_sigstk;
	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
	KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));

	/* Make the size of the saved context visible to userland */
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);

	/* Allocate and validate space for the signal handler context. */
	if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)(lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
			      sizeof(struct sigframe));
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		/* We take red zone into account */
		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
	}

	/*
	 * XXX AVX needs 64-byte alignment but sigframe has other fields and
	 * the embedded ucontext is not at the front, so aligning this won't
	 * help us.  Fortunately we bcopy in/out of the sigframe, so the
	 * CPU does not see the alignment issue.
	 *
	 * The problem though is if userland winds up trying to use the
	 * context directly.
	 */
	sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF);

	/* Translate the signal if appropriate */
	if (p->p_sysent->sv_sigtbl) {
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
	}

	/*
	 * Build the argument list for the signal handler.
	 *
	 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
	 */
	regs->tf_rdi = sig;				/* argument 1 */
	regs->tf_rdx = (register_t)&sfp->sf_uc;		/* argument 3 */

	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/*
		 * Signal handler installed with SA_SIGINFO.
		 *
		 * action(signo, siginfo, ucontext)
		 */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* fill siginfo structure */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_addr;
	} else {
		/*
		 * Old FreeBSD-style arguments.
		 *
		 * handler (signo, code, [uc], addr)
		 */
		regs->tf_rsi = (register_t)code;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_handler = catcher;
	}

#if JG
	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 =
		    &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}
#endif

	/*
	 * Save the FPU state and reinit the FP unit
	 */
	npxpush(&sf.sf_uc.uc_mcontext);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(lp, SIGILL);
	}

	regs->tf_rsp = (register_t)sfp;
	regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);

	/*
	 * i386 abi specifies that the direction flag must be cleared
	 * on function entry
	 */
	regs->tf_rflags &= ~(PSL_T | PSL_D);

	/*
	 * 64 bit mode has a code and stack selector but
	 * no data or extra selector.  %fs and %gs are not
	 * stored in selectors.
	 */
	regs->tf_cs = _ucodesel;
	regs->tf_ss = _udatasel;
}
/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.  Remove any items that would otherwise create a privilege
 * issue.
 *
 * XXX at the moment we allow userland to set the resume flag.  Is this a
 * bad idea?
 */
int
cpu_sanitize_frame(struct trapframe *frame)
{
	frame->tf_cs = _ucodesel;
	frame->tf_ss = _udatasel;
	/* XXX VM (8086) mode not supported? */
	frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
	frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;

	return (0);
}

/*
 * Sanitize the tls so loading the descriptor does not blow up
 * on us.  For x86_64 we don't have to do anything.
 */
int
cpu_sanitize_tls(struct savetls *tls)
{
	return (0);
}
/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
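
/*
 * Sketch (informational): EFL_SECURE() accepts a new rflags value only
 * when every bit outside PSL_USERCHANGE matches the old value.  For
 * example, a forged context raising IOPL (outside PSL_USERCHANGE) XORs
 * to a non-zero residue and is rejected, while toggling the carry flag
 * (user-changeable) passes.
 */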
int
sys_sigreturn(struct sigreturn_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;
	ucontext_t uc;
	ucontext_t *ucp;
	register_t rflags;
	int cs;
	int error;

	/*
	 * We have to copy the information into kernel space so userland
	 * can't modify it while we are sniffing it.
	 */
	regs = lp->lwp_md.md_regs;
	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error)
		return (error);
	ucp = &uc;
	rflags = ucp->uc_mcontext.mc_rflags;

	/* VM (8086) mode not supported */
	rflags &= ~PSL_VM_UNSUPP;

#if JG
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(lp, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = tf->tf_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
		tf->tf_gs = _udatasel;
	} else
#endif
	{
		/*
		 * Don't allow users to change privileged or reserved flags.
		 *
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(rflags & ~PSL_RF,
		    regs->tf_rflags & ~PSL_RF)) {
			kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			kprintf("sigreturn: cs = 0x%x\n", cs);
			trapsignal(lp, SIGBUS, T_PROTFLT);
			return (EINVAL);
		}
		bcopy(&ucp->uc_mcontext.mc_rdi, regs,
		    sizeof(struct trapframe));
	}

	/*
	 * Restore the FPU state from the frame
	 */
	crit_enter();
	npxpop(&ucp->uc_mcontext);

	if (ucp->uc_mcontext.mc_onstack & 1)
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	else
		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

	lp->lwp_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(lp->lwp_sigmask);
	crit_exit();
	return (EJUSTRETURN);
}
/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Shutdown the CPU as much as possible
 */
void
cpu_halt(void)
{
	for (;;)
		__asm__ __volatile("hlt");
}
/*
 * cpu_idle() represents the idle LWKT.  You cannot return from this function
 * (unless you want to blow things up!).  Instead we look for runnable threads
 * and loop or halt as appropriate.  Giant is not held on entry to the thread.
 *
 * The main loop is entered with a critical section held, we must release
 * the critical section before doing anything else.  lwkt_switch() will
 * check for pending interrupts due to entering and exiting its own
 * critical section.
 *
 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
 *	 However, there are cases where the idlethread will be entered with
 *	 the possibility that no IPI will occur and in such cases
 *	 lwkt_switch() sets TDF_IDLE_NOHLT.
 *
 * NOTE: cpu_idle_repeat determines how many entries into the idle thread
 *	 must occur before it starts using ACPI halt.
 *
 * NOTE: Value overridden in hammer_time().
 */
static int	cpu_idle_hlt = 2;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW,
    &cpu_idle_repeat, 0, "Idle entries before acpi hlt");
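
/*
 * Usage sketch (illustrative shell commands, not compiled):
 *
 *	sysctl machdep.cpu_idle_hlt=1	# always HLT/MONITOR/MWAIT
 *	sysctl machdep.cpu_idle_repeat	# inspect the hybrid threshold
 *
 * See the mode table in cpu_idle() below for the meaning of values 0-3.
 */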
SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts");
SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins");
static void
cpu_idle_default_hook(void)
{
	/*
	 * We must guarantee that hlt is exactly the instruction
	 * following the sti.
	 */
	__asm __volatile("sti; hlt");
}

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;
static int
cpu_mwait_cx_hint(struct cpu_idle_stat *stat)
{
	int hint, cx_idx;
	u_int idx;

	hint = stat->hint;
	if (hint >= 0)
		goto done;

	idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >>
	    cpu_mwait_repeat_shift;
	if (idx >= cpu_mwait_c1_hints_cnt) {
		/* Step up faster, once we walked through all C1 states */
		stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1);
	}
	if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		if (idx >= cpu_mwait_deep_hints_cnt)
			idx = cpu_mwait_deep_hints_cnt - 1;
		hint = cpu_mwait_deep_hints[idx];
	} else {
		if (idx >= cpu_mwait_hints_cnt)
			idx = cpu_mwait_hints_cnt - 1;
		hint = cpu_mwait_hints[idx];
	}
done:
	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX)
		stat->mwait_cx[cx_idx]++;
	return hint;
}
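
/*
 * Worked example (informational): with cpu_mwait_repeat_shift == 1 and
 * repeat + repeat_last + repeat_delta == 6, idx = 6 >> 1 = 3 selects the
 * 4th hint in the table, so repeated idling walks the hint table toward
 * progressively deeper C-states.
 */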
void
cpu_idle(void)
{
	globaldata_t gd = mycpu;
	struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid];
	struct thread *td __debugvar = gd->gd_curthread;
	int reqflags;
	int quick;

	stat->repeat = stat->repeat_last = cpu_idle_repeat_max;

	crit_exit();
	KKASSERT(td->td_critcount == 0);

	for (;;) {
		/*
		 * See if there are any LWKTs ready to go.
		 */
		lwkt_switch();

		/*
		 * When halting inside a cli we must check for reqflags
		 * races, particularly [re]schedule requests.  Running
		 * splz() does the job.
		 *
		 * cpu_idle_hlt:
		 *	0	Never halt, just spin
		 *
		 *	1	Always use HLT (or MONITOR/MWAIT if avail).
		 *
		 *		Better default for modern (Haswell+) Intel
		 *		cpus.
		 *
		 *	2	Use HLT/MONITOR/MWAIT up to a point and then
		 *		use the ACPI halt (default).  This is a hybrid
		 *		approach.  See machdep.cpu_idle_repeat.
		 *
		 *		Better default for modern AMD cpus and older
		 *		Intel cpus.
		 *
		 *	3	Always use the ACPI halt.  This typically
		 *		eats the least amount of power but the cpu
		 *		will be slow waking up.  Slows down e.g.
		 *		compiles and other pipe/event oriented stuff.
		 *
		 * NOTE: Interrupts are enabled and we are not in a critical
		 *	 section.
		 *
		 * NOTE: Preemptions do not reset gd_idle_repeat.  Also we
		 *	 don't bother capping gd_idle_repeat, it is ok if
		 *	 it overflows.
		 *
		 * Implement optimized invltlb operations when halted
		 * in idle.  By setting the bit in smp_idleinvl_mask
		 * we inform other cpus that they can set _reqs to
		 * request an invltlb.  Currently the code to do that
		 * sets the bits in _reqs anyway, but then checks _mask
		 * to determine if they can assume the invltlb will execute.
		 *
		 * A critical section is required to ensure that interrupts
		 * do not fully run until after we've had a chance to execute
		 * the request.
		 */
		if (gd->gd_idle_repeat == 0) {
			stat->repeat = (stat->repeat + stat->repeat_last) >> 1;
			if (stat->repeat > cpu_idle_repeat_max)
				stat->repeat = cpu_idle_repeat_max;
			stat->repeat_last = 0;
			stat->repeat_delta = 0;
		}
		++stat->repeat_last;

		++gd->gd_idle_repeat;
		reqflags = gd->gd_reqflags;
		quick = (cpu_idle_hlt == 1) ||
			(cpu_idle_hlt < 3 &&
			 gd->gd_idle_repeat < cpu_idle_repeat);

		if (quick && (cpu_mi_feature & CPU_MI_MONITOR) &&
		    (reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
			crit_enter_gd(gd);
			ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid);
			cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
			    cpu_mwait_cx_hint(stat), 0);
			stat->halt++;
			ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
			    gd->gd_cpuid);
			if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
						      gd->gd_cpuid)) {
				cpu_invltlb();
			}
			crit_exit_gd(gd);
		} else if (cpu_idle_hlt) {
			__asm __volatile("cli");
			splz();
			crit_enter_gd(gd);
			ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid);
			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
				if (cpu_idle_hlt == 1)
					cpu_idle_default_hook();
				else
					cpu_idle_hook();
			}
			__asm __volatile("sti");
			stat->halt++;
			ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
			    gd->gd_cpuid);
			if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
						      gd->gd_cpuid)) {
				cpu_invltlb();
			}
			crit_exit_gd(gd);
		} else {
			splz();
			__asm __volatile("sti");
			stat->spin++;
		}
	}
}
/*
 * Called in a loop indirectly via Xcpustop
 */
void
cpu_smp_stopped(void)
{
	globaldata_t gd = mycpu;
	volatile __uint64_t *ptr;
	__uint64_t ovalue;

	ptr = CPUMASK_ADDR(started_cpus, gd->gd_cpuid);
	ovalue = *ptr;
	if ((ovalue & CPUMASK_SIMPLE(gd->gd_cpuid & 63)) == 0) {
		if (cpu_mi_feature & CPU_MI_MONITOR) {
			cpu_mmw_pause_long(__DEVOLATILE(void *, ptr), ovalue,
					   cpu_mwait_hints[CPU_MWAIT_C1], 0);
		} else {
			cpu_halt();	/* depend on lapic timer */
		}
	}
}
/*
 * This routine is called if a spinlock has been held through the
 * exponential backoff period and is seriously contested.  On a real cpu
 * we let it spin.
 */
void
cpu_spinlock_contested(void)
{
	cpu_pause();
}
/*
 * Clear registers on exec
 */
static void
exec_setregs(u_long entry, u_long stack, u_long ps_strings)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct pcb *pcb = td->td_pcb;
	struct trapframe *regs = lp->lwp_md.md_regs;

	/* was i386_user_cleanup() in NetBSD */
	user_ldt_free(pcb);

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = entry;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_rbx = ps_strings;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0; /* JG set bit 10? */
		if (pcb == td->td_pcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}

	/*
	 * Initialize the math emulator (if any) for the current process.
	 * Actually, just clear the bit that says that the emulator has
	 * been initialized.  Initialization is delayed until the process
	 * traps to the emulator (if it is done at all) mainly because
	 * emulators don't provide an entry point for initialization.
	 */
	pcb->pcb_flags &= ~FP_SOFTFP;

	/*
	 * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
	 *	 gd_npxthread.  Otherwise a preemptive interrupt thread
	 *	 may panic in npxdna().
	 */
	crit_enter();
	load_cr0(rcr0() | CR0_MP);

	/*
	 * NOTE: The MSR values must be correct so we can return to
	 *	 userland.  gd_user_fs/gs must be correct so the switch
	 *	 code knows what the current MSR values are.
	 */
	pcb->pcb_fsbase = 0;	/* Values loaded from PCB on switch */
	pcb->pcb_gsbase = 0;
	mdcpu->gd_user_fs = 0;	/* Cache of current MSR values */
	mdcpu->gd_user_gs = 0;
	wrmsr(MSR_FSBASE, 0);	/* Set MSR values for return to userland */
	wrmsr(MSR_KGSBASE, 0);

	/* Initialize the npx (if any) for the current process. */
	npxinit();
	crit_exit();

	pcb->pcb_ds = _udatasel;
	pcb->pcb_es = _udatasel;
	pcb->pcb_fs = _udatasel;
	pcb->pcb_gs = _udatasel;
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	cr0 |= CR0_NE;			/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
	cr0 |= CR0_WP | CR0_AM;
	load_cr0(cr0);
}
static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
	    req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
    &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
    CTLFLAG_RW, &disable_rtc_set, 0, "");

SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
    CTLFLAG_RD, &bootinfo, bootinfo, "");

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
    CTLFLAG_RW, &wall_cmos_clock, 0, "");

extern u_long bootdev;		/* not a cdev_t - encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
    CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)");
static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

struct user_segment_descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
struct gate_descriptor idt_arr[MAXCPU][NIDT];
#if JG
union descriptor ldt[NLDT];		/* local descriptor table */
#endif

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt;
struct region_descriptor r_idt_arr[MAXCPU];

/* JG proc0paddr is a virtual address */
void *proc0paddr;
char proc0paddr_buff[LWKT_THREAD_STACK];
/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
	/* GNULL_SEL	0 Null Descriptor */
	{	0x0,			/* segment base address  */
		0x0,			/* length */
		0,			/* segment type */
		0,			/* segment descriptor priority level */
		0,			/* segment descriptor present */
		0,			/* long mode */
		0,			/* default 32 vs 16 bit size */
		0			/* limit granularity (byte/page units)*/ },
	/* GCODE_SEL	1 Code Descriptor for kernel */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMERA,		/* segment type */
		SEL_KPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		1,			/* long mode */
		0,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GDATA_SEL	2 Data Descriptor for kernel */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMRWA,		/* segment type */
		SEL_KPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		1,			/* long mode */
		0,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GUCODE32_SEL	3 32 bit Code Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMERA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long mode */
		1,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMRWA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long mode */
		1,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GUCODE_SEL	5 64 bit Code Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMERA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		1,			/* long mode */
		0,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GPROC0_SEL	6 Proc 0 Tss Descriptor */
	{
		0x0,			/* segment base address */
		sizeof(struct x86_64tss)-1,/* length - all address space */
		SDT_SYSTSS,		/* segment type */
		SEL_KPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long mode */
		0,			/* unused - default 32 vs 16 bit size */
		0			/* limit granularity (byte/page units)*/ },
	/* Actually, the TSS is a system descriptor which is double size */
	{	0x0,			/* segment base address  */
		0x0,			/* length */
		0,			/* segment type */
		0,			/* segment descriptor priority level */
		0,			/* segment descriptor present */
		0,			/* long mode */
		0,			/* default 32 vs 16 bit size */
		0			/* limit granularity (byte/page units)*/ },
	/* GUGS32_SEL	8 32 bit GS Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMRWA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long mode */
		1,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
};
void
setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	int cpu;

	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		struct gate_descriptor *ip = &idt_arr[cpu][idx];

		ip->gd_looffset = (uintptr_t)func;
		ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
		ip->gd_ist = ist;
		ip->gd_xx = 0;
		ip->gd_type = typ;
		ip->gd_dpl = dpl;
		ip->gd_p = 1;
		ip->gd_hioffset = ((uintptr_t)func)>>16 ;
	}
}

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist, int cpu)
{
	struct gate_descriptor *ip;

	KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu %d", cpu));

	ip = &idt_arr[cpu][idx];
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
}
#define	IDTVEC(name)	__CONCAT(X,name)

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
void
sdtossd(struct user_segment_descriptor *sd,
    struct soft_segment_descriptor *ssd)
{
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}
void
ssdtosd(struct soft_segment_descriptor *ssd,
    struct user_segment_descriptor *sd)
{
	sd->sd_lobase  = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase  = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type    = ssd->ssd_type;
	sd->sd_dpl     = ssd->ssd_dpl;
	sd->sd_p       = ssd->ssd_p;
	sd->sd_long    = ssd->ssd_long;
	sd->sd_def32   = ssd->ssd_def32;
	sd->sd_gran    = ssd->ssd_gran;
}
void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{
	sd->sd_lobase  = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase  = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type    = ssd->ssd_type;
	sd->sd_dpl     = ssd->ssd_dpl;
	sd->sd_p       = ssd->ssd_p;
	sd->sd_gran    = ssd->ssd_gran;
}
/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * Memory is aligned to PHYSMAP_ALIGN which must be a multiple
 * of PAGE_SIZE.  This also greatly reduces the memory test time
 * which would otherwise be excessive on machines with > 8G of ram.
 *
 * XXX first should be vm_paddr_t.
 */
#define PHYSMAP_ALIGN		(vm_paddr_t)(128 * 1024)
#define PHYSMAP_ALIGN_MASK	(vm_paddr_t)(PHYSMAP_ALIGN - 1)
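
/*
 * Alignment arithmetic sketch (informational): bases round up and ends
 * round down to PHYSMAP_ALIGN via
 *
 *	base = (base + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;
 *	end &= ~PHYSMAP_ALIGN_MASK;
 *
 * e.g. with PHYSMAP_ALIGN = 128KB, base 0x1f000 becomes 0x20000 and end
 * 0x9fc00 becomes 0x80000.
 */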
vm_paddr_t physmap[PHYSMAP_SIZE];
struct bios_smap *smapbase, *smap, *smapend;
struct efi_map_header *efihdrbase;
u_int32_t smapsize;

#define PHYSMAP_HANDWAVE	(vm_paddr_t)(2 * 1024 * 1024)
#define PHYSMAP_HANDWAVE_MASK	(PHYSMAP_HANDWAVE - 1)
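
/*
 * Handwave sketch (informational): the memory test in getmemsize() only
 * tests the first and last PHYSMAP_ALIGN blocks of each map entry
 * exhaustively; interior pages are skipped in PHYSMAP_HANDWAVE (2MB)
 * strides to keep boot cheap on large-memory machines.
 */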
static void
add_smap_entries(int *physmap_idx)
{
	int i;

	smapsize = *((u_int32_t *)smapbase - 1);
	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (smap->length == 0)
			continue;

		for (i = 0; i <= *physmap_idx; i += 2) {
			if (smap->base < physmap[i + 1]) {
				if (boothowto & RB_VERBOSE) {
					kprintf("Overlapping or non-monotonic "
					    "memory region, ignoring "
					    "second region\n");
				}
				break;
			}
		}
		if (i <= *physmap_idx)
			continue;

		Realmem += smap->length;

		if (smap->base == physmap[*physmap_idx + 1]) {
			physmap[*physmap_idx + 1] += smap->length;
			continue;
		}

		*physmap_idx += 2;
		if (*physmap_idx == PHYSMAP_SIZE) {
			kprintf("Too many segments in the physical "
			    "address map, giving up\n");
			break;
		}
		physmap[*physmap_idx] = smap->base;
		physmap[*physmap_idx + 1] = smap->base + smap->length;
	}
}
static void
add_efi_map_entries(int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdrbase + efisz);

	if (efihdrbase->descriptor_size == 0)
		return;
	ndesc = efihdrbase->memory_size / efihdrbase->descriptor_size;

	if (boothowto & RB_VERBOSE)
		kprintf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdrbase->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type <= EFI_MD_TYPE_PALCODE)
				type = types[p->md_type];
			else
				type = "<INVALID>";
			kprintf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				kprintf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				kprintf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				kprintf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				kprintf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				kprintf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				kprintf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				kprintf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				kprintf("XP ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				kprintf("RUNTIME");
			kprintf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		Realmem += p->md_pages * PAGE_SIZE;

		if (p->md_phys == physmap[*physmap_idx + 1]) {
			physmap[*physmap_idx + 1] += p->md_pages * PAGE_SIZE;
			continue;
		}

		*physmap_idx += 2;
		if (*physmap_idx == PHYSMAP_SIZE) {
			kprintf("Too many segments in the physical "
			    "address map, giving up\n");
			break;
		}
		physmap[*physmap_idx] = p->md_phys;
		physmap[*physmap_idx + 1] = p->md_phys +
		    p->md_pages * PAGE_SIZE;
	}
}
struct fb_info efi_fb_info;
static int have_efi_framebuffer = 0;

static void
efi_fb_init_vaddr(int direct_map)
{
	uint64_t sz;
	vm_offset_t addr, v;

	v = efi_fb_info.vaddr;
	sz = efi_fb_info.stride * efi_fb_info.height;

	if (direct_map) {
		addr = PHYS_TO_DMAP(efi_fb_info.paddr);
		if (addr >= DMAP_MIN_ADDRESS && addr + sz < DMAP_MAX_ADDRESS)
			efi_fb_info.vaddr = addr;
	} else {
		efi_fb_info.vaddr = (vm_offset_t)pmap_mapdev_attr(
		    efi_fb_info.paddr, sz, PAT_WRITE_COMBINING);
	}

	if (v == 0 && efi_fb_info.vaddr != 0)
		memset((void *)efi_fb_info.vaddr, 0x77, sz);
}

int
probe_efi_fb(int early)
{
	struct efi_fb	*efifb;
	caddr_t		kmdp;

	if (have_efi_framebuffer) {
		if (!early &&
		    (efi_fb_info.vaddr == 0 ||
		     efi_fb_info.vaddr == PHYS_TO_DMAP(efi_fb_info.paddr)))
			efi_fb_init_vaddr(0);
		return 0;
	}

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efifb = (struct efi_fb *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_FB);
	if (efifb == NULL)
		return ENODEV;

	have_efi_framebuffer = 1;

	efi_fb_info.is_vga_boot_display = 1;
	efi_fb_info.width = efifb->fb_width;
	efi_fb_info.height = efifb->fb_height;
	efi_fb_info.stride = efifb->fb_stride * 4;
	efi_fb_info.depth = 32;
	efi_fb_info.paddr = efifb->fb_addr;
	if (early) {
		efi_fb_info.vaddr = 0;
	} else {
		efi_fb_init_vaddr(0);
	}
	efi_fb_info.fbops.fb_set_par = NULL;
	efi_fb_info.fbops.fb_blank = NULL;
	efi_fb_info.fbops.fb_debug_enter = NULL;
	efi_fb_info.device = NULL;

	return 0;
}

static void
efifb_startup(void *arg)
{
	probe_efi_fb(0);
}

SYSINIT(efi_fb_info, SI_BOOT1_POST, SI_ORDER_FIRST, efifb_startup, NULL);
1887 getmemsize(caddr_t kmdp
, u_int64_t first
)
1889 int off
, physmap_idx
, pa_indx
, da_indx
;
1892 vm_paddr_t msgbuf_size
;
1893 u_long physmem_tunable
;
1895 quad_t dcons_addr
, dcons_size
;
1897 bzero(physmap
, sizeof(physmap
));
1901 * get memory map from INT 15:E820, kindly supplied by the loader.
1903 * subr_module.c says:
1904 * "Consumer may safely assume that size value precedes data."
1905 * ie: an int32_t immediately precedes smap.
1907 efihdrbase
= (struct efi_map_header
*)preload_search_info(kmdp
,
1908 MODINFO_METADATA
| MODINFOMD_EFI_MAP
);
1909 smapbase
= (struct bios_smap
*)preload_search_info(kmdp
,
1910 MODINFO_METADATA
| MODINFOMD_SMAP
);
1911 if (smapbase
== NULL
&& efihdrbase
== NULL
)
1912 panic("No BIOS smap or EFI map info from loader!");
1914 if (efihdrbase
== NULL
)
1915 add_smap_entries(&physmap_idx
);
1917 add_efi_map_entries(&physmap_idx
);
1919 base_memory
= physmap
[1] / 1024;
1920 /* make hole for AP bootstrap code */
1921 physmap
[1] = mp_bootaddress(base_memory
);
1923 /* Save EBDA address, if any */
1924 ebda_addr
= (u_long
)(*(u_short
*)(KERNBASE
+ 0x40e));
1928 * Maxmem isn't the "maximum memory", it's one larger than the
1929 * highest page of the physical address space. It should be
1930 * called something like "Maxphyspage". We may adjust this
1931 * based on ``hw.physmem'' and the results of the memory test.
1933 Maxmem
= atop(physmap
[physmap_idx
+ 1]);
1936 Maxmem
= MAXMEM
/ 4;
1939 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable
))
1940 Maxmem
= atop(physmem_tunable
);
1943 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
1946 if (Maxmem
> atop(physmap
[physmap_idx
+ 1]))
1947 Maxmem
= atop(physmap
[physmap_idx
+ 1]);
1950 * Blowing out the DMAP will blow up the system.
1952 if (Maxmem
> atop(DMAP_MAX_ADDRESS
- DMAP_MIN_ADDRESS
)) {
1953 kprintf("Limiting Maxmem due to DMAP size\n");
1954 Maxmem
= atop(DMAP_MAX_ADDRESS
- DMAP_MIN_ADDRESS
);
1957 if (atop(physmap
[physmap_idx
+ 1]) != Maxmem
&&
1958 (boothowto
& RB_VERBOSE
)) {
1959 kprintf("Physical memory use set to %ldK\n", Maxmem
* 4);
1963 * Call pmap initialization to make new kernel address space
1967 pmap_bootstrap(&first
);
1968 physmap
[0] = PAGE_SIZE
;
1971 * Align the physmap to PHYSMAP_ALIGN and cut out anything
1974 for (i
= j
= 0; i
<= physmap_idx
; i
+= 2) {
1975 if (physmap
[i
+1] > ptoa(Maxmem
))
1976 physmap
[i
+1] = ptoa(Maxmem
);
1977 physmap
[i
] = (physmap
[i
] + PHYSMAP_ALIGN_MASK
) &
1978 ~PHYSMAP_ALIGN_MASK
;
1979 physmap
[i
+1] = physmap
[i
+1] & ~PHYSMAP_ALIGN_MASK
;
1981 physmap
[j
] = physmap
[i
];
1982 physmap
[j
+1] = physmap
[i
+1];
1984 if (physmap
[i
] < physmap
[i
+1])
1987 physmap_idx
= j
- 2;
1990 * Align anything else used in the validation loop.
1992 first
= (first
+ PHYSMAP_ALIGN_MASK
) & ~PHYSMAP_ALIGN_MASK
;
1995 * Size up each available chunk of physical memory.
1999 phys_avail
[pa_indx
++] = physmap
[0];
2000 phys_avail
[pa_indx
] = physmap
[0];
2001 dump_avail
[da_indx
] = physmap
[0];
2005 * Get dcons buffer address
2007 if (kgetenv_quad("dcons.addr", &dcons_addr
) == 0 ||
2008 kgetenv_quad("dcons.size", &dcons_size
) == 0)
2012 * Validate the physical memory. The physical memory segments
2013 * have already been aligned to PHYSMAP_ALIGN which is a multiple
2016 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
2018 vm_paddr_t incr
= PHYSMAP_ALIGN
;
2020 end
= physmap
[i
+ 1];
2022 for (pa
= physmap
[i
]; pa
< end
; pa
+= incr
) {
2024 volatile uint64_t *ptr
= (uint64_t *)CADDR1
;
2027 incr
= PHYSMAP_ALIGN
;
2031 * block out kernel memory as not available.
2033 if (pa
>= 0x200000 && pa
< first
)
2037 * block out dcons buffer
2040 && pa
>= trunc_page(dcons_addr
)
2041 && pa
< dcons_addr
+ dcons_size
) {
2048 * Always test the first and last block supplied in
2049 * the map entry, but it just takes too long to run
2050 * the test these days and we already have to skip
2051 * pages. Handwave it on PHYSMAP_HANDWAVE boundaries.
2053 if (pa
!= physmap
[i
]) {
2054 vm_paddr_t bytes
= end
- pa
;
2055 if ((pa
& PHYSMAP_HANDWAVE_MASK
) == 0 &&
2056 bytes
>= PHYSMAP_HANDWAVE
+ PHYSMAP_ALIGN
) {
2057 incr
= PHYSMAP_HANDWAVE
;
2063 * map page into kernel: valid, read/write,non-cacheable
2066 kernel_pmap
.pmap_bits
[PG_V_IDX
] |
2067 kernel_pmap
.pmap_bits
[PG_RW_IDX
] |
2068 kernel_pmap
.pmap_bits
[PG_N_IDX
];
2069 cpu_invlpg(__DEVOLATILE(void *, ptr
));
2074 * Test for alternating 1's and 0's
2076 *ptr
= 0xaaaaaaaaaaaaaaaaLLU
;
2078 if (*ptr
!= 0xaaaaaaaaaaaaaaaaLLU
)
2081 * Test for alternating 0's and 1's
2083 *ptr
= 0x5555555555555555LLU
;
2085 if (*ptr
!= 0x5555555555555555LLU
)
2090 *ptr
= 0xffffffffffffffffLLU
;
2092 if (*ptr
!= 0xffffffffffffffffLLU
)
2102 * Restore original value.
2108 * Adjust array of valid/good pages.
2110 if (page_bad
== TRUE
)
2114 * If this good page is a continuation of the
2115 * previous set of good pages, then just increase
2116 * the end pointer. Otherwise start a new chunk.
2117 * Note that "end" points one higher than end,
2118 * making the range >= start and < end.
2119 * If we're also doing a speculative memory
2120 * test and we at or past the end, bump up Maxmem
2121 * so that we keep going. The first bad page
2122 * will terminate the loop.
2124 if (phys_avail
[pa_indx
] == pa
) {
2125 phys_avail
[pa_indx
] += incr
;
2128 if (pa_indx
== PHYS_AVAIL_ARRAY_END
) {
2130 "Too many holes in the physical address space, giving up\n");
2135 phys_avail
[pa_indx
++] = pa
;
2136 phys_avail
[pa_indx
] = pa
+ incr
;
2138 physmem
+= incr
/ PAGE_SIZE
;
2140 if (dump_avail
[da_indx
] == pa
) {
2141 dump_avail
[da_indx
] += incr
;
2144 if (da_indx
== DUMP_AVAIL_ARRAY_END
) {
2148 dump_avail
[da_indx
++] = pa
;
2149 dump_avail
[da_indx
] = pa
+ incr
;
	/*
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;

	while (phys_avail[pa_indx - 1] + PHYSMAP_ALIGN +
	       msgbuf_size >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}
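	/*
	 * (Note: phys_avail[] holds begin/end pairs, phys_avail[2n] being
	 * the base of a range and phys_avail[2n+1] its end, so popping two
	 * entries above discards one whole range that is too small to hold
	 * a page plus the message buffer, leaving pa_indx at the end entry
	 * of the preceding, larger range.)
	 */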
	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= msgbuf_size;

	avail_end = phys_avail[pa_indx];

	/* Map the message buffer. */
	for (off = 0; off < msgbuf_size; off += PAGE_SIZE) {
		pmap_kenter((vm_offset_t)msgbufp + off,
			    phys_avail[pa_indx] + off);
	}
	/* Try to get EFI framebuffer working as early as possible */
	if (have_efi_framebuffer)
		efi_fb_init_vaddr(1);
}
struct machintr_abi MachIntrABI;

/*
 * IDT VECTORS:
 *	0	Divide by zero
 *	1	Debug
 *	2	NMI
 *	3	BPT
 *	4	OFL
 *	5	Bound range
 *	6	Invalid opcode
 *	7	Device Not Available (x87)
 *	8	Double fault
 *	9	Coprocessor Segment overrun (unsupported, reserved)
 *	10	Invalid TSS
 *	11	Segment not present
 *	12	Stack
 *	13	General Protection
 *	14	Page fault
 *	15	Reserved
 *	16	x87 FP Exception pending
 *	17	Alignment Check
 *	18	Machine Check
 *	19	SIMD floating point
 *	20-31	reserved
 *	32-255	INTn/external sources
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x, cpu;
#if 0 /* JG */
	int metadata_missing, off;
#endif
	struct mdglobaldata *gd;
	u_int64_t msr;

	/*
	 * Prevent lowering of the ipl if we call tsleep() early.
	 */
	gd = &CPU_prvspace[0]->mdglobaldata;
	bzero(gd, sizeof(*gd));
	/*
	 * Note: on both UP and SMP curthread must be set non-NULL
	 * early in the boot sequence because the system assumes
	 * that 'curthread' is never NULL.
	 */

	gd->mi.gd_curthread = &thread0;
	thread0.td_gd = &gd->mi;

	atdevbase = ISA_HOLE_START + PTOV_OFFSET;

#if 0 /* JG */
	metadata_missing = 0;
	if (bootinfo.bi_modulep) {
		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}
	if (bootinfo.bi_envp)
		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
	preload_bootstrap_relocate(PTOV_OFFSET);
	kmdp = preload_search_by_type("elf kernel");
	if (!kmdp)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	if (boothowto & RB_VERBOSE)
		bootverbose++;

	/*
	 * Default MachIntrABI to ICU
	 */
	MachIntrABI = MachIntrABI_ICU;
	/*
	 * start with one cpu.  Note: with one cpu, ncpus2_shift, ncpus2_mask,
	 * and ncpus_fit_mask remain 0.
	 */
	ncpus = 1;
	ncpus2 = 1;
	ncpus_fit = 1;
	/* Init basic tunables, hz etc */
	init_param1();

	/*
	 * make gdt memory segments
	 */
	gdt_segs[GPROC0_SEL].ssd_base =
		(uintptr_t) &CPU_prvspace[0]->mdglobaldata.gd_common_tss;

	gd->mi.gd_prvspace = CPU_prvspace[0];
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
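	/*
	 * (Background, standard x86_64 convention: while in the kernel,
	 * %gs is based at the per-cpu globaldata via MSR_GSBASE and the
	 * user's gs base is parked in MSR_KGSBASE; the swapgs instruction
	 * exchanges the two on kernel entry/exit, which is why the "user
	 * value" slots above are simply zeroed at boot.)
	 */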
	mi_gdinit(&gd->mi, 0);
	cpu_gdinit(gd, 0);
	proc0paddr = proc0paddr_buff;
	mi_proc0init(&gd->mi, proc0paddr);
	safepri = TDPRI_MAX;

	/* spinlocks and the BGL */
	init_locks();

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt_global(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
	setidt_global(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
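	/*
	 * (Note on the setidt_global() arguments as used above: vector,
	 * handler, gate type, requested privilege level, and IST index.
	 * The breakpoint gate uses SEL_UPL so a userland INT3 may reach
	 * the kernel handler, and NMI/double-fault use IST slot 1 so
	 * they always run on a known-good stack.)
	 */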
	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
		r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
	}

	lidt(&r_idt_arr[0]);
	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

#if 0 /* JG */
	if (metadata_missing)
		kprintf("WARNING: loader(8) metadata is missing!\n");
#endif

#if NISA > 0
	elcr_probe();
	isa_defaultirq();
#endif
	rand_initialize();

	/*
	 * Initialize IRQ mapping
	 *
	 * NOTE:
	 * SHOULD be after elcr_probe()
	 */
	MachIntrABI_ICU.initmap();
	MachIntrABI_IOAPIC.initmap();

#ifdef DDB
	kdb_init();
	if (boothowto & RB_KDB)
		Debugger("Boot flags requested debugger");
#endif

#if 0 /* JG */
	finishidentcpu();	/* Final stage of CPU initialization */
	setidt(6, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(13, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
#endif
	identify_cpu();		/* Final stage of CPU initialization */
	initializecpu(0);	/* Initialize CPU registers */
	/*
	 * On modern Intel cpus (Haswell or later) cpu_idle_hlt=1 is better
	 * because the cpu does significant power management in MWAIT
	 * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
	 *
	 * On modern AMD cpus cpu_idle_hlt=3 is better, because the cpu does
	 * significant power management in HLT or ACPI (but cpu_idle_hlt=1
	 * would try to use MWAIT).
	 *
	 * On older AMD or Intel cpus, cpu_idle_hlt=2 is better because ACPI
	 * is needed to reduce power consumption, but wakeup times are often
	 * too long.
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    CPUID_TO_MODEL(cpu_id) >= 0x3C) {	/* Haswell or later */
		cpu_idle_hlt = 1;
	}
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    CPUID_TO_FAMILY(cpu_id) >= 0x14) {	/* Bobcat or later */
		cpu_idle_hlt = 3;
	}

	TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
	TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
	TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
	TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);
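	/*
	 * (Usage note: TUNABLE_INT_FETCH() reads the boot-time kernel
	 * environment, so an override such as machdep.cpu_idle_hlt="2"
	 * would normally be placed in /boot/loader.conf; the hw.*
	 * tunables above follow the same pattern.)
	 */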
	/*
	 * Some of the virtual machines do not work w/ I/O APIC
	 * enabled.  If the user does not explicitly enable or
	 * disable the I/O APIC (ioapic_enable < 0), then we
	 * disable I/O APIC on all virtual machines.
	 *
	 * NOTE:
	 * This must be done after identify_cpu(), which sets
	 * 'cpu_feature2'.
	 */
	if (ioapic_enable < 0) {
		if (cpu_feature2 & CPUID2_VMM)
			ioapic_enable = 0;
		else
			ioapic_enable = 1;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	gd->gd_common_tss.tss_rsp0 =
		(register_t)(thread0.td_kstack +
			     KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb));
	/* Ensure the stack is aligned to 16 bytes */
	gd->gd_common_tss.tss_rsp0 &= ~(register_t)0xF;

	/* double fault stack */
	gd->gd_common_tss.tss_ist1 =
		(long)&gd->mi.gd_prvspace->idlestack[
			sizeof(gd->mi.gd_prvspace->idlestack)];

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	gd->gd_common_tss.tss_iobase = sizeof(struct x86_64tss);

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gd->gd_tss_gdt = &gdt[GPROC0_SEL];
	gd->gd_common_tssd = *gd->gd_tss_gdt;
	ltr(gsel_tss);
	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL);
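	/*
	 * (How the STAR layout above works: bits 47:32 hold the kernel CS
	 * selector loaded by SYSCALL and bits 63:48 the base selector
	 * SYSRET uses to derive the user CS/SS.  MSR_SF_MASK lists the
	 * rflags bits the cpu clears on SYSCALL entry, so interrupts,
	 * traps, IOPL and the direction flag are in a known state before
	 * kernel code runs.)
	 */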
	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

	/* Map the message buffer. */
#if 0 /* JG */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
#endif

	msgbufinit(msgbufp, MSGBUF_SIZE);
	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_udatasel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_pcb->pcb_cr3 = KPML4phys;
	thread0.td_pcb->pcb_ext = NULL;
	lwp0.lwp_md.md_regs = &proc0_tf;	/* XXX needed? */

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}
/*
 * Initialize machine-dependent portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
 */
void
cpu_gdinit(struct mdglobaldata *gd, int cpu)
{
	if (cpu)
		gd->mi.gd_curthread = &gd->mi.gd_idlethread;

	lwkt_init_thread(&gd->mi.gd_idlethread,
			 gd->mi.gd_prvspace->idlestack,
			 sizeof(gd->mi.gd_prvspace->idlestack),
			 0, &gd->mi);
	lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
	gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
	gd->mi.gd_idlethread.td_sp -= sizeof(void *);
	*(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
}
/*
 * We only have to check for DMAP bounds, the globaldata space is
 * actually part of the kernel_map so we don't have to waste time
 * checking CPU_prvspace[*].
 */
int
is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
{
#if 0
	if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
	    eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
		return (TRUE);
	}
#endif
	if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
		return (TRUE);
	return (FALSE);
}

struct globaldata *
globaldata_find(int cpu)
{
	KKASSERT(cpu >= 0 && cpu < ncpus);
	return(&CPU_prvspace[cpu]->mdglobaldata.mi);
}
/*
 * This path should be safe from the SYSRET issue because only stopped threads
 * can have their %rip adjusted this way (and all heavy weight thread switches
 * clear QUICKREF and thus do not use SYSRET).  However, the code path is
 * convoluted so add a safety by forcing %rip to be canonical.
 */
int
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
	if (addr & 0x0000800000000000LLU)
		lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
	else
		lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
	return (0);
}
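/*
 * (Worked example of the canonicalization above: x86_64 requires bits
 * 63:48 of a virtual address to be copies of bit 47.  An addr of
 * 0x0000800000000000 has bit 47 set, so it is sign-extended to
 * 0xFFFF800000000000; an addr with bit 47 clear is masked down into
 * the low canonical half instead.)
 */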
int
ptrace_single_step(struct lwp *lp)
{
	lp->lwp_md.md_regs->tf_rflags |= PSL_T;
	return (0);
}

int
fill_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	if ((tp = lp->lwp_md.md_regs) == NULL)
		return EINVAL;
	bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
	return (0);
}

int
set_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
	return (0);
}
static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}

static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}
int
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
	if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
		return EINVAL;
#ifndef CPU_DISABLE_SSE
	fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
			(struct save87 *)fpregs);
	return (0);
#else
	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
	return (0);
#endif
}

int
set_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
#ifndef CPU_DISABLE_SSE
	set_fpregs_xmm((struct save87 *)fpregs,
		       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
	return (0);
#else
	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
	return (0);
#endif
}
int
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (lp == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[4] = rdr4();
		dbregs->dr[5] = rdr5();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		if (lp->lwp_thread == NULL ||
		    (pcb = lp->lwp_thread->td_pcb) == NULL)
			return EINVAL;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		dbregs->dr[4] = 0;
		dbregs->dr[5] = 0;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
	return (0);
}
int
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	if (lp == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr4(dbregs->dr[4]);
		load_dr5(dbregs->dr[5]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		struct pcb *pcb;
		struct ucred *ucred;
		int i;
		uint64_t mask1, mask2;

		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected reboot.
		 */
		/* JG this loop looks unreadable */
		/*
		 * Check 4 2-bit fields for invalid patterns.
		 * These fields are R/Wi, for i = 0..3
		 */
		/* Is 10 in LENi allowed when running in compatibility mode? */
		/*
		 * Pattern 10 in R/Wi might be used to indicate
		 * breakpoint on I/O.  Further analysis should be
		 * carried out to decide if it is safe and useful to
		 * provide access to that capability.
		 */
		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
		     i++, mask1 <<= 4, mask2 <<= 4)
			if ((dbregs->dr[7] & mask1) == mask2)
				return (EINVAL);
		pcb = lp->lwp_thread->td_pcb;
		ucred = lp->lwp_proc->p_ucred;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * the process's address space, unless, perhaps, we were
		 * called by uid 0.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */
		if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
			if (dbregs->dr[7] & 0x3) {
				/* dr0 is enabled */
				if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<2)) {
				/* dr1 is enabled */
				if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<4)) {
				/* dr2 is enabled */
				if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<6)) {
				/* dr3 is enabled */
				if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		pcb->pcb_flags |= PCB_DBREGS;
	}

	return (0);
}
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int64_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0xff) == 0) {
		/*
		 * all GE and LE bits in the dr7 register are zero,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers
		 */
		return 0;
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0xf;

	if (!bp) {
		/*
		 * None of the breakpoint bits are set meaning this
		 * trap was not caused by any of the debug registers
		 */
		return 0;
	}

	/*
	 * at least one of the breakpoints was hit, check to see
	 * which ones and if any of them are user space addresses
	 */
	if (bp & 0x01)
		addr[nbp++] = (caddr_t)rdr0();
	if (bp & 0x02)
		addr[nbp++] = (caddr_t)rdr1();
	if (bp & 0x04)
		addr[nbp++] = (caddr_t)rdr2();
	if (bp & 0x08)
		addr[nbp++] = (caddr_t)rdr3();

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}
#ifndef DDB
void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}
#endif /* no DDB */
#ifdef DDB

/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called inside DDB.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */

#undef inb
#undef outb

/* silence compiler warnings */
u_char inb(u_int);
void outb(u_int, u_char);

u_char
inb(u_int port)
{
	u_char	data;
	/*
	 * We use %%dx and not %1 here because i/o is done at %dx and not at
	 * %edx, while gcc generates inferior code (movw instead of movl)
	 * if we tell it to load (u_short) port.
	 */
	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
	return (data);
}

void
outb(u_int port, u_char data)
{
	u_char	al;
	/*
	 * Use an unnecessary assignment to help gcc's register allocator.
	 * This makes a large difference for gcc-1.40 and a tiny difference
	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
	 * best results.  gcc-2.6.0 can't handle this.
	 */
	al = data;
	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
}

#endif /* DDB */
/*
 * initialize all the SMP locks
 */

/* critical region when masking or unmasking interrupts */
struct spinlock_deprecated imen_spinlock;

/* lock region used by kernel profiling */
struct spinlock_deprecated mcount_spinlock;

/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct spinlock_deprecated com_spinlock;

/* lock regions around the clock hardware */
struct spinlock_deprecated clock_spinlock;

static void
init_locks(void)
{
	/*
	 * Get the initial mplock with a count of 1 for the BSP.
	 * This uses a LOGICAL cpu ID, ie BSP == 0.
	 */
	cpu_get_initial_mplock();
	/* DEPRECATED */
	spin_init_deprecated(&mcount_spinlock);
	spin_init_deprecated(&imen_spinlock);
	spin_init_deprecated(&com_spinlock);
	spin_init_deprecated(&clock_spinlock);

	/* our token pool needs to work early */
	lwkt_token_pool_init();
}
boolean_t
cpu_mwait_hint_valid(uint32_t hint)
{
	int cx_idx, sub;

	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= CPU_MWAIT_CX_MAX)
		return FALSE;

	sub = MWAIT_EAX_TO_CX_SUB(hint);
	if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return FALSE;

	return TRUE;
}

void
cpu_mwait_cx_no_bmsts(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
}

void
cpu_mwait_cx_no_bmarb(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
}
static int
cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
{
	int old_cx_idx, sub = 0;

	if (hint >= 0) {
		old_cx_idx = MWAIT_EAX_TO_CX(hint);
		sub = MWAIT_EAX_TO_CX_SUB(hint);
	} else if (hint == CPU_MWAIT_HINT_AUTO) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
	} else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
	} else {
		old_cx_idx = CPU_MWAIT_CX_MAX;
	}

	if (!CPU_MWAIT_HAS_CX)
		strlcpy(name, "NONE", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
		strlcpy(name, "AUTO", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
		strlcpy(name, "AUTODEEP", namelen);
	else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
	    sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
		strlcpy(name, "INVALID", namelen);
	else
		ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);

	return old_cx_idx;
}
static int
cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
{
	int cx_idx, sub, hint;
	char *ptr, *start;

	if (allow_auto && strcmp(name, "AUTO") == 0) {
		hint = CPU_MWAIT_HINT_AUTO;
		cx_idx = CPU_MWAIT_C2;
		goto done;
	}
	if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
		hint = CPU_MWAIT_HINT_AUTODEEP;
		cx_idx = CPU_MWAIT_C3;
		goto done;
	}

	if (strlen(name) < 4 || toupper(name[0]) != 'C')
		return -1;
	start = &name[1];
	ptr = NULL;

	cx_idx = strtol(start, &ptr, 10);
	if (ptr == start || *ptr != '/')
		return -1;
	if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
		return -1;

	start = ptr + 1;
	ptr = NULL;

	sub = strtol(start, &ptr, 10);
	if (*ptr != '\0')
		return -1;
	if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return -1;

	hint = MWAIT_EAX_HINT(cx_idx, sub);
done:
	*hint0 = hint;
	return cx_idx;
}
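/*
 * (Example of the accepted syntax: "C1/0" parses to cx_idx 1 with
 * sub-state 0 and is encoded via MWAIT_EAX_HINT(); "AUTO" and
 * "AUTODEEP" are only accepted when allow_auto is TRUE, matching the
 * names cpu_mwait_cx_hint2name() above can produce.)
 */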
static int
cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
{
	if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)
		return EOPNOTSUPP;

	if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
		int error;

		error = cputimer_intr_powersave_addreq();
		if (error)
			return error;
	} else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
		cputimer_intr_powersave_remreq();
	}
	return 0;
}
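/*
 * (Why the powersave request above: entering C3 or deeper typically
 * stops the local APIC timer on many cpus, so the interrupt cputimer
 * presumably has to be switched to a source that keeps running in deep
 * sleep before any cpu idles that deeply, and can be released again
 * once every target state is back below C3.)
 */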
static int
cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
    boolean_t allow_auto)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	hint = *hint0;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
	    allow_auto);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error)
		return error;

	*hint0 = hint;
	return 0;
}
static int
cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension"));

	hint = stat->hint;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	strlcpy(name, cx_name, sizeof(name));
	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error)
		return error;

	stat->hint = hint;
	return 0;
}
static int
cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	int hint = cpu_mwait_halt_global;
	int error, cx_idx, cpu;
	char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];

	cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	/* Save name for later per-cpu CX configuration */
	strlcpy(cx_name, name, sizeof(cx_name));

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	/* Change per-cpu CX configuration */
	for (cpu = 0; cpu < ncpus; ++cpu) {
		error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
		if (error)
			return error;
	}

	cpu_mwait_halt_global = hint;
	return 0;
}
static int
cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct cpu_idle_stat *stat = arg1;
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &stat->hint, TRUE);
	return error;
}

static int
cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &cpu_mwait_spin, FALSE);
	return error;
}
/*
 * This manual debugging code is called unconditionally from Xtimer
 * (the per-cpu timer interrupt), whether or not the current thread
 * is in a critical section, and can be useful in tracking down lockups.
 *
 * NOTE: MANUAL DEBUG CODE
 */
#if 0
static int saveticks[SMP_MAXCPU];
static int savecounts[SMP_MAXCPU];
#endif

void
pcpu_timer_always(struct intrframe *frame)
{
#if 0
	globaldata_t gd = mycpu;
	int cpu = gd->gd_cpuid;
	char buf[64];
	short *gptr;
	int i;

	if (cpu <= 20) {
		gptr = (short *)0xFFFFFFFF800b8000 + 80 * cpu;
		*gptr = ((*gptr + 1) & 0x00FF) | 0x0700;
		++gptr;

		ksnprintf(buf, sizeof(buf), " %p %16s %d %16s ",
		    (void *)frame->if_rip, gd->gd_curthread->td_comm, ticks,
		    gd->gd_infomsg);
		for (i = 0; buf[i]; ++i) {
			gptr[i] = 0x0700 | (unsigned char)buf[i];
		}
	}

	if (saveticks[gd->gd_cpuid] != ticks) {
		saveticks[gd->gd_cpuid] = ticks;
		savecounts[gd->gd_cpuid] = 0;
	}
	++savecounts[gd->gd_cpuid];
	if (savecounts[gd->gd_cpuid] > 2000 && panicstr == NULL) {
		panic("cpu %d panicking on ticks failure",
		      gd->gd_cpuid);
	}
	for (i = 0; i < ncpus; ++i) {
		int delta;

		if (saveticks[i] && panicstr == NULL) {
			delta = saveticks[i] - ticks;
			if (delta < -10 || delta > 10) {
				panic("cpu %d panicking on cpu %d watchdog",