/*
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 2008 The DragonFly Project.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
 */

//#include "use_npx.h"
#include "opt_directio.h"
#include "opt_msgbuf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/reboot.h>
#include <sys/msgbuf.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/usched.h>
#include <sys/ctype.h>
#include <sys/serialize.h>
#include <sys/systimer.h>

#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>
#include <sys/mutex2.h>

#include <machine/cpu.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#include <machine/bootinfo.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb_ext.h>		/* pcb.h included via sys/user.h */
#include <machine/globaldata.h>		/* CPU_prvspace */
#include <machine/smp.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <machine/framebuffer.h>

#include <bus/isa/isa_device.h>
#include <machine_base/isa/isa_intr.h>
#include <bus/isa/rtc.h>
#include <sys/random.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>

#include <sys/machintr.h>
#include <machine_base/icu/icu_abi.h>
#include <machine_base/icu/elcr_var.h>
#include <machine_base/apic/lapic.h>
#include <machine_base/apic/ioapic.h>
#include <machine_base/apic/ioapic_abi.h>
#include <machine/mptable.h>

#define PHYSMAP_ENTRIES		10

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

extern void printcpuinfo(void);	/* XXX header file */
extern void identify_cpu(void);
extern void finishidentcpu(void);
extern void panicifcpuunsupported(void);

static void cpu_startup(void *);
static void pic_finish(void *);
static void cpu_finish(void *);

static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */
static void init_locks(void);

extern void pcpu_timer_always(struct intrframe *);

SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL);
SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL);

extern vm_offset_t ksym_start, ksym_end;

struct privatespace CPU_prvspace_bsp __aligned(4096);
struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp };

vm_paddr_t efi_systbl_phys;
int	_udatasel, _ucodesel, _ucode32sel;

int64_t tsc_offsets[MAXCPU];
cpumask_t smp_idleinvl_mask;
cpumask_t smp_idleinvl_reqs;

static int cpu_mwait_halt_global; /* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */

#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
    CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
    CTLFLAG_RD, &tlb_flush_count, 0, "");
#endif
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt,
    CTLFLAG_RD, &cpu_mwait_halt_global, 0, "");
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin, CTLFLAG_RD, &cpu_mwait_spin, 0,
    "monitor/mwait target state");

#define CPU_MWAIT_HAS_CX	\
	((cpu_feature2 & CPUID2_MON) && \
	 (cpu_mwait_feature & CPUID_MWAIT_EXT))

#define CPU_MWAIT_CX_NAMELEN	16

#define CPU_MWAIT_C1		1
#define CPU_MWAIT_C2		2
#define CPU_MWAIT_C3		3
#define CPU_MWAIT_CX_MAX	8

#define CPU_MWAIT_HINT_AUTO	-1	/* C1 and C2 */
#define CPU_MWAIT_HINT_AUTODEEP	-2	/* C3+ */
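
/*
 * Illustrative note (not part of the original source): assuming the usual
 * Intel encoding, an MWAIT hint in EAX carries the target C-state in bits
 * 7:4 (stored as Cx-1) and the sub-state in bits 3:0, so for example
 * MWAIT_EAX_HINT(CPU_MWAIT_C2, 1) would yield 0x11, i.e. C2 sub-state 1,
 * and MWAIT_EAX_TO_CX(0x11) recovers 2.
 */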

SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features");
SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings");

struct cpu_mwait_cx {
	int	subcnt;
	char	name[4];
	struct sysctl_ctx_list sysctl_ctx;
	struct sysctl_oid *sysctl_tree;
};
static struct cpu_mwait_cx cpu_mwait_cx_info[CPU_MWAIT_CX_MAX];
static char cpu_mwait_cx_supported[256];

static int cpu_mwait_c1_hints_cnt;
static int cpu_mwait_hints_cnt;
static int *cpu_mwait_hints;

static int cpu_mwait_deep_hints_cnt;
static int *cpu_mwait_deep_hints;

#define CPU_IDLE_REPEAT_DEFAULT	750

static u_int cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT;
static u_long cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT;
static u_int cpu_mwait_repeat_shift = 1;

#define CPU_MWAIT_C3_PREAMBLE_BM_ARB	0x1
#define CPU_MWAIT_C3_PREAMBLE_BM_STS	0x2

static int cpu_mwait_c3_preamble =
    CPU_MWAIT_C3_PREAMBLE_BM_ARB |
    CPU_MWAIT_C3_PREAMBLE_BM_STS;

SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD,
    cpu_mwait_cx_supported, 0, "MWAIT supported C states");
SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD,
    &cpu_mwait_c3_preamble, 0, "C3+ preamble mask");

static int cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,
    int *, boolean_t);
static int cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS);

SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, cpu_mwait_cx_idle_sysctl, "A", "");
SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, cpu_mwait_cx_spin_sysctl, "A", "");
SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW,
    &cpu_mwait_repeat_shift, 0, "");

u_long ebda_addr = 0;

int imcr_present = 0;

int naps = 0;			/* # of Applications processors */

struct mtx dt_lock;		/* lock for GDT and LDT */

static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
	u_long pmem = ctob(physmem);
	int error = sysctl_handle_long(oidp, &pmem, 0, req);

	return (error);
}

SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
    0, 0, sysctl_hw_physmem, "LU",
    "Total system memory in bytes (number of pages * page size)");

static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
	    ctob(physmem - vmstats.v_wire_count), req);

	return (error);
}

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD,
    0, 0, sysctl_hw_usermem, "IU", "");

static int
sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
	    x86_64_btop(avail_end - avail_start), req);

	return (error);
}

SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD,
    0, 0, sysctl_hw_availpages, "I", "");

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define PHYSMAP_SIZE		(2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END	(NELEM(phys_avail) - 2)
#define DUMP_AVAIL_ARRAY_END	(NELEM(dump_avail) - 2)
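
/*
 * Illustrative note (not part of the original source): phys_avail[] and
 * dump_avail[] are consumed as {base, end} pairs terminated by a 0,0 pair,
 * e.g. (addresses hypothetical)
 *
 *	phys_avail[0] = 0x0000000000001000	first chunk base
 *	phys_avail[1] = 0x000000000009f000	first chunk end
 *	phys_avail[2] = 0x0000000000100000	second chunk base
 *	...
 *
 * which is why the *_ARRAY_END limits above reserve two trailing slots.
 */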

static vm_offset_t buffer_sva, buffer_eva;
vm_offset_t clean_sva, clean_eva;
static vm_offset_t pager_sva, pager_eva;
static struct trapframe proc0_tf;

static void
cpu_startup(void *dummy)
{
	caddr_t v;
	vm_size_t size = 0;
	vm_offset_t firstaddr;

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	kprintf("%s", version);
	panicifcpuunsupported();
	kprintf("real memory  = %ju (%ju MB)\n",
		(intmax_t)Realmem,
		(intmax_t)Realmem / 1024 / 1024);
	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		kprintf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size1 =
			    phys_avail[indx + 1] - phys_avail[indx];

			kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
				(intmax_t)phys_avail[indx],
				(intmax_t)phys_avail[indx + 1] - 1,
				(intmax_t)size1,
				(intmax_t)(size1 / PAGE_SIZE));
		}
	}

	/*
	 * Allocate space for system data structures.
	 * The first available kernel virtual address is in "v".
	 * As pages of kernel virtual memory are allocated, "v" is incremented.
	 * As pages of memory are allocated and cleared,
	 * "firstaddr" is incremented.
	 * An index into the kernel page table corresponding to the
	 * virtual memory address maintained in "v" is kept in "mapaddr".
	 */

	/*
	 * Make two passes.  The first pass calculates how much memory is
	 * needed and allocates it.  The second pass assigns virtual
	 * addresses to the various data structures.
	 */
	firstaddr = 0;
again:
	v = (caddr_t)firstaddr;

#define	valloc(name, type, num) \
	    (name) = (type *)v; v = (caddr_t)((name)+(num))
#define	valloclim(name, type, num, lim) \
	    (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
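
/*
 * Usage sketch (not part of the original source): on the sizing pass v
 * starts at 0, so the valloc() calls below only accumulate a byte count;
 * on the second pass v walks through the KVA just allocated, handing out
 * addresses, e.g.
 *
 *	valloc(buf, struct buf, nbuf);
 *	    // buf = (struct buf *)v; v += nbuf * sizeof(struct buf)
 */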

	/*
	 * The nominal buffer size (and minimum KVA allocation) is MAXBSIZE.
	 * For the first 64MB of ram nominally allocate sufficient buffers to
	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
	 * buffers to cover 1/20 of our ram over 64MB.  When auto-sizing
	 * the buffer cache we limit the eventual kva reservation to
	 * maxbcache bytes.
	 *
	 * factor represents the 1/4 x ram conversion.
	 */
	if (nbuf == 0) {
		long factor = 4 * NBUFCALCSIZE / 1024;
		long kbytes = physmem * (PAGE_SIZE / 1024);

		if (kbytes > 4096)
			nbuf += min((kbytes - 4096) / factor, 65536 / factor);
		if (kbytes > 65536)
			nbuf += (kbytes - 65536) * 2 / (factor * 5);
		if (maxbcache && nbuf > maxbcache / NBUFCALCSIZE)
			nbuf = maxbcache / NBUFCALCSIZE;
	}

	/*
	 * Do not allow the buffer_map to be more than 1/2 the size of the
	 * kernel_map.
	 */
	if (nbuf > (virtual_end - virtual_start +
		    virtual2_end - virtual2_start) / (MAXBSIZE * 2)) {
		nbuf = (virtual_end - virtual_start +
			virtual2_end - virtual2_start) / (MAXBSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf);
	}

	/*
	 * Do not allow the buffer_map to use more than 50% of available
	 * physical-equivalent memory.  Since the VM pages which back
	 * individual buffers are typically wired, having too many bufs
	 * can prevent the system from paging properly.
	 */
	if (nbuf > physmem * PAGE_SIZE / (NBUFCALCSIZE * 2)) {
		nbuf = physmem * PAGE_SIZE / (NBUFCALCSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf);
	}

	/*
	 * Do not allow the sizeof(struct buf) * nbuf to exceed half of
	 * the valloc space which is just the virtual_end - virtual_start
	 * section.  We use valloc() to allocate the buf header array.
	 */
	if (nbuf > (virtual_end - virtual_start) / sizeof(struct buf) / 2) {
		nbuf = (virtual_end - virtual_start) /
		       sizeof(struct buf) / 2;
		kprintf("Warning: nbufs capped at %ld due to valloc "
			"considerations\n", nbuf);
	}

	nswbuf_mem = lmax(lmin(nbuf / 32, 512), 8);
	if (nswbuf_mem < NSWBUF_MIN)
		nswbuf_mem = NSWBUF_MIN;
	nswbuf_kva = lmax(lmin(nbuf / 4, 512), 16);
	if (nswbuf_kva < NSWBUF_MIN)
		nswbuf_kva = NSWBUF_MIN;

	valloc(swbuf_mem, struct buf, nswbuf_mem);
	valloc(swbuf_kva, struct buf, nswbuf_kva);
	valloc(buf, struct buf, nbuf);

	/*
	 * End of first pass, size has been calculated so allocate memory
	 */
	if (firstaddr == 0) {
		size = (vm_size_t)(v - firstaddr);
		firstaddr = kmem_alloc(&kernel_map, round_page(size),
				       VM_SUBSYS_BUF);
		if (firstaddr == 0)
			panic("startup: no room for tables");
		goto again;
	}

	/*
	 * End of second pass, addresses have been assigned
	 *
	 * nbuf is an int, make sure we don't overflow the field.
	 *
	 * On 64-bit systems we always reserve maximal allocations for
	 * buffer cache buffers and there are no fragmentation issues,
	 * so the KVA segment does not have to be excessively oversized.
	 */
	if ((vm_size_t)(v - firstaddr) != size)
		panic("startup: table size inconsistency");

	kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE) +
		      ((nswbuf_mem + nswbuf_kva) * MAXPHYS) + pager_map_size);
	kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE));
	buffer_map.system_map = 1;
	kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva,
		      ((vm_offset_t)(nswbuf_mem + nswbuf_kva) * MAXPHYS) +
		      pager_map_size);
	pager_map.system_map = 1;
	kprintf("avail memory = %ju (%ju MB)\n",
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages),
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages) /
		(1024 * 1024));
}

struct cpu_idle_stat {
	int	hint;
	u_long	halt;
	u_long	spin;
	u_long	repeat;
	u_long	repeat_last;
	u_long	repeat_delta;
	u_long	mwait_cx[CPU_MWAIT_CX_MAX];
};

#define CPU_IDLE_STAT_HALT	-1
#define CPU_IDLE_STAT_SPIN	-2

static struct cpu_idle_stat cpu_idle_stats[MAXCPU];

static int
sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)
{
	int idx = arg2, cpu, error;
	u_quad_t val = 0;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].halt;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].spin;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].mwait_cx[idx];
	}

	error = sysctl_handle_quad(oidp, &val, 0, req);
	if (error || req->newptr == NULL)
		return (error);

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].halt = 0;
		cpu_idle_stats[0].halt = val;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].spin = 0;
		cpu_idle_stats[0].spin = val;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].mwait_cx[idx] = 0;
		cpu_idle_stats[0].mwait_cx[idx] = val;
	}
	return (0);
}

static void
cpu_mwait_attach(void)
{
	struct sbuf sb;
	int hint_idx, i;

	if (!CPU_MWAIT_HAS_CX)
		return;

	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    (CPUID_TO_FAMILY(cpu_id) > 0xf ||
	     (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
	      CPUID_TO_MODEL(cpu_id) >= 0xf))) {
		int bm_sts = 1;

		/*
		 * Pentium dual-core, Core 2 and beyond do not need any
		 * additional activities to enter deep C-state, i.e. C3(+).
		 */
		cpu_mwait_cx_no_bmarb();

		TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts);
		if (bm_sts == 0)
			cpu_mwait_cx_no_bmsts();
	}

	sbuf_new(&sb, cpu_mwait_cx_supported,
	    sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN);

	for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) {
		struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i];
		int sub;

		ksnprintf(cx->name, sizeof(cx->name), "C%d", i);

		sysctl_ctx_init(&cx->sysctl_ctx);
		cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO,
		    cx->name, CTLFLAG_RW, NULL, "Cx control/info");
		if (cx->sysctl_tree == NULL)
			continue;

		cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i);
		SYSCTL_ADD_INT(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "subcnt", CTLFLAG_RD, &cx->subcnt, 0,
		    "sub-state count");
		SYSCTL_ADD_PROC(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0,
		    i, sysctl_cpu_idle_cnt, "Q", "# of times entered");

		for (sub = 0; sub < cx->subcnt; ++sub)
			sbuf_printf(&sb, "C%d/%d ", i, sub);
	}

	cpu_mwait_c1_hints_cnt = cpu_mwait_cx_info[CPU_MWAIT_C1].subcnt;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i)
		cpu_mwait_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_hints = kmalloc(sizeof(int) * cpu_mwait_hints_cnt,
	    M_DEVBUF, M_WAITOK);

	hint_idx = 0;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) {
		int j, subcnt;

		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_hints_cnt,
			    ("invalid mwait hint index %d", hint_idx));
			cpu_mwait_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
			++hint_idx;
		}
	}
	KASSERT(hint_idx == cpu_mwait_hints_cnt,
	    ("mwait hint count %d != index %d",
	     cpu_mwait_hints_cnt, hint_idx));

	kprintf("MWAIT hints (%d C1 hints):\n", cpu_mwait_c1_hints_cnt);
	for (i = 0; i < cpu_mwait_hints_cnt; ++i) {
		int hint = cpu_mwait_hints[i];

		kprintf("  C%d/%d hint 0x%04x\n",
		    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
		    hint);
	}

	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i)
		cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt,
	    M_DEVBUF, M_WAITOK);

	hint_idx = 0;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) {
		int j, subcnt;

		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_deep_hints_cnt,
			    ("invalid mwait deep hint index %d", hint_idx));
			cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
			++hint_idx;
		}
	}
	KASSERT(hint_idx == cpu_mwait_deep_hints_cnt,
	    ("mwait deep hint count %d != index %d",
	     cpu_mwait_deep_hints_cnt, hint_idx));

	kprintf("MWAIT deep hints:\n");
	for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) {
		int hint = cpu_mwait_deep_hints[i];

		kprintf("  C%d/%d hint 0x%04x\n",
		    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
		    hint);
	}
	cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt;

	for (i = 0; i < ncpus; ++i) {
		char name[16];

		ksnprintf(name, sizeof(name), "idle%d", i);
		SYSCTL_ADD_PROC(NULL,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO,
		    name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i],
		    0, cpu_mwait_cx_pcpu_idle_sysctl, "A", "");
	}
}

static void
cpu_finish(void *dummy __unused)
{
	cpu_setregs();
}

static void
pic_finish(void *dummy __unused)
{
	/* Log ELCR information */
	elcr_dump();

	/* Log MPTABLE information */
	mptable_pci_int_dump();

	/* Finalize PIC */
	MachIntrABI.finalize();
}

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	struct sigacts *psp = p->p_sigacts;
	struct sigframe sf, *sfp;
	int oonstack;
	char *sp;

	regs = lp->lwp_md.md_regs;
	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;

	/* Save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = lp->lwp_sigstk;
	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
	KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));

	/* Make the size of the saved context visible to userland */
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);

	/* Allocate and validate space for the signal handler context. */
	if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)(lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
			      sizeof(struct sigframe));
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		/* We take red zone into account */
		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
	}

	/*
	 * XXX AVX needs 64-byte alignment but sigframe has other fields and
	 * the embedded ucontext is not at the front, so aligning this won't
	 * help us.  Fortunately we bcopy in/out of the sigframe, so the
	 * kernel is ok.
	 *
	 * The problem though is if userland winds up trying to use the
	 * context directly.
	 */
	sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF);

	/* Translate the signal if appropriate */
	if (p->p_sysent->sv_sigtbl) {
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
	}

	/*
	 * Build the argument list for the signal handler.
	 *
	 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
	 */
	regs->tf_rdi = sig;				/* argument 1 */
	regs->tf_rdx = (register_t)&sfp->sf_uc;		/* argument 3 */

	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/*
		 * Signal handler installed with SA_SIGINFO.
		 *
		 * action(signo, siginfo, ucontext)
		 */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* fill siginfo structure */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_addr;
	} else {
		/*
		 * Old FreeBSD-style arguments.
		 *
		 * handler (signo, code, [uc], addr)
		 */
		regs->tf_rsi = (register_t)code;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_handler = catcher;
	}

#if 0 /* JG - i386 vm86 support, not reachable on x86_64 */
	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 =
		    &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}
#endif

	/*
	 * Save the FPU state and reinit the FP unit
	 */
	npxpush(&sf.sf_uc.uc_mcontext);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(lp, SIGILL);
	}

	regs->tf_rsp = (register_t)sfp;
	regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);

	/*
	 * i386 abi specifies that the direction flag must be cleared
	 * on function entry
	 */
	regs->tf_rflags &= ~(PSL_T | PSL_D);

	/*
	 * 64 bit mode has a code and stack selector but
	 * no data or extra selector.  %fs and %gs are not
	 * used.
	 */
	regs->tf_cs = _ucodesel;
	regs->tf_ss = _udatasel;
}

/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.  Remove any items that would otherwise create a privilege
 * issue.
 *
 * XXX at the moment we allow userland to set the resume flag.  Is this a
 * bad idea?
 */
int
cpu_sanitize_frame(struct trapframe *frame)
{
	frame->tf_cs = _ucodesel;
	frame->tf_ss = _udatasel;
	/* XXX VM (8086) mode not supported? */
	frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
	frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;

	return(0);
}

/*
 * Sanitize the tls so loading the descriptor does not blow up
 * on us.  For x86_64 we don't have to do anything.
 */
int
cpu_sanitize_tls(struct savetls *tls)
{
	return(0);
}

/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
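
/*
 * Illustrative note (not part of the original source): EFL_SECURE()
 * accepts a new rflags value only when every bit outside PSL_USERCHANGE
 * matches the current value -- toggling an arithmetic flag passes, while
 * attempting to raise IOPL fails -- and CS_SECURE() simply requires that
 * the requested %cs privilege level be user (SEL_UPL).
 */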

int
sys_sigreturn(struct sigreturn_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;
	ucontext_t uc;
	ucontext_t *ucp;
	register_t rflags;
	int cs;
	int error;

	/*
	 * We have to copy the information into kernel space so userland
	 * can't modify it while we are sniffing it.
	 */
	regs = lp->lwp_md.md_regs;
	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error)
		return (error);
	ucp = &uc;
	rflags = ucp->uc_mcontext.mc_rflags;

	/* VM (8086) mode not supported */
	rflags &= ~PSL_VM_UNSUPP;

#if 0 /* JG - i386 vm86 support, not reachable on x86_64 */
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(lp, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = tf->tf_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
		tf->tf_gs = _udatasel;
	}
#endif

	/*
	 * Don't allow users to change privileged or reserved flags.
	 *
	 * XXX do allow users to change the privileged flag PSL_RF.
	 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
	 * should sometimes set it there too.  tf_eflags is kept in
	 * the signal context during signal handling and there is no
	 * other place to remember it, so the PSL_RF bit may be
	 * corrupted by the signal handler without us knowing.
	 * Corruption of the PSL_RF bit at worst causes one more or
	 * one less debugger trap, so allowing it is fairly harmless.
	 */
	if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
		kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		kprintf("sigreturn: cs = 0x%x\n", cs);
		trapsignal(lp, SIGBUS, T_PROTFLT);
		return (EINVAL);
	}
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(struct trapframe));

	/*
	 * Restore the FPU state from the frame
	 */
	npxpop(&ucp->uc_mcontext);

	if (ucp->uc_mcontext.mc_onstack & 1)
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	else
		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

	lp->lwp_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(lp->lwp_sigmask);

	return(EJUSTRETURN);
}

/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Shutdown the CPU as much as possible
 */
void
cpu_halt(void)
{
	for (;;)
		__asm__ __volatile("hlt");
}

/*
 * cpu_idle() represents the idle LWKT.  You cannot return from this function
 * (unless you want to blow things up!).  Instead we look for runnable threads
 * and loop or halt as appropriate.  Giant is not held on entry to the thread.
 *
 * The main loop is entered with a critical section held, we must release
 * the critical section before doing anything else.  lwkt_switch() will
 * check for pending interrupts due to entering and exiting its own
 * critical section.
 *
 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
 *	 However, there are cases where the idlethread will be entered with
 *	 the possibility that no IPI will occur and in such cases
 *	 lwkt_switch() sets TDF_IDLE_NOHLT.
 *
 * NOTE: cpu_idle_repeat determines how many entries into the idle thread
 *	 must occur before it starts using ACPI halt.
 *
 * NOTE: Value overridden in hammer_time().
 */
static int cpu_idle_hlt = 2;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW,
    &cpu_idle_repeat, 0, "Idle entries before acpi hlt");

SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts");
SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins");

static void
cpu_idle_default_hook(void)
{
	/*
	 * We must guarantee that hlt is exactly the instruction
	 * following the sti.
	 */
	__asm __volatile("sti; hlt");
}

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;

static int
cpu_mwait_cx_hint(struct cpu_idle_stat *stat)
{
	int hint, cx_idx;
	u_int idx;

	hint = stat->hint;
	if (hint >= 0)
		goto done;

	idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >>
	    cpu_mwait_repeat_shift;
	if (idx >= cpu_mwait_c1_hints_cnt) {
		/* Step up faster, once we walked through all C1 states */
		stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1);
	}
	if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		if (idx >= cpu_mwait_deep_hints_cnt)
			idx = cpu_mwait_deep_hints_cnt - 1;
		hint = cpu_mwait_deep_hints[idx];
	} else {
		if (idx >= cpu_mwait_hints_cnt)
			idx = cpu_mwait_hints_cnt - 1;
		hint = cpu_mwait_hints[idx];
	}
done:
	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX)
		stat->mwait_cx[cx_idx]++;
	return hint;
}
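
/*
 * Illustrative note (not part of the original source): idx above grows
 * with the number of consecutive idle passes, so a cpu that keeps
 * re-entering the idle loop walks from shallow C1 sub-states toward
 * deeper hints, and repeat_delta accelerates the walk once the C1
 * sub-states are exhausted.
 */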

void
cpu_idle(void)
{
	globaldata_t gd = mycpu;
	struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid];
	struct thread *td __debugvar = gd->gd_curthread;
	int reqflags;
	int quick;

	stat->repeat = stat->repeat_last = cpu_idle_repeat_max;

	crit_exit();
	KKASSERT(td->td_critcount == 0);

	for (;;) {
		/*
		 * See if there are any LWKTs ready to go.
		 */
		lwkt_switch();

		/*
		 * When halting inside a cli we must check for reqflags
		 * races, particularly [re]schedule requests.  Running
		 * splz() does the job.
		 *
		 * cpu_idle_hlt:
		 *	0	Never halt, just spin
		 *
		 *	1	Always use HLT (or MONITOR/MWAIT if avail).
		 *
		 *		Better default for modern (Haswell+) Intel
		 *		cpus.
		 *
		 *	2	Use HLT/MONITOR/MWAIT up to a point and then
		 *		use the ACPI halt (default).  This is a hybrid
		 *		approach.  See machdep.cpu_idle_repeat.
		 *
		 *		Better default for modern AMD cpus and older
		 *		Intel cpus.
		 *
		 *	3	Always use the ACPI halt.  This typically
		 *		eats the least amount of power but the cpu
		 *		will be slow waking up.  Slows down e.g.
		 *		compiles and other pipe/event oriented stuff.
		 *
		 * NOTE: Interrupts are enabled and we are not in a critical
		 *	 section.
		 *
		 * NOTE: Preemptions do not reset gd_idle_repeat.   Also we
		 *	 don't bother capping gd_idle_repeat, it is ok if
		 *	 it overflows.
		 *
		 * Implement optimized invltlb operations when halted
		 * in idle.  By setting the bit in smp_idleinvl_mask
		 * we inform other cpus that they can set _reqs to
		 * request an invltlb.  Currently the code to do that
		 * sets the bits in _reqs anyway, but then checks _mask
		 * to determine if they can assume the invltlb will execute.
		 *
		 * A critical section is required to ensure that interrupts
		 * do not fully run until after we've had a chance to execute
		 * the request.
		 */
		if (gd->gd_idle_repeat == 0) {
			stat->repeat = (stat->repeat + stat->repeat_last) >> 1;
			if (stat->repeat > cpu_idle_repeat_max)
				stat->repeat = cpu_idle_repeat_max;
			stat->repeat_last = 0;
			stat->repeat_delta = 0;
		}
		++stat->repeat_last;

		++gd->gd_idle_repeat;
		reqflags = gd->gd_reqflags;
		quick = (cpu_idle_hlt == 1) ||
			(cpu_idle_hlt < 3 &&
			 gd->gd_idle_repeat < cpu_idle_repeat);

		if (quick && (cpu_mi_feature & CPU_MI_MONITOR) &&
		    (reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
			crit_enter_gd(gd);
			ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid);
			cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
					  cpu_mwait_cx_hint(stat), 0);
			++stat->halt;
			ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
					       gd->gd_cpuid);
			if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
						      gd->gd_cpuid))
				cpu_invltlb();
			crit_exit_gd(gd);
		} else if (cpu_idle_hlt) {
			__asm __volatile("cli");
			splz();
			crit_enter_gd(gd);
			ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid);
			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0)
				cpu_idle_default_hook();
			__asm __volatile("sti");
			++stat->halt;
			ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
					       gd->gd_cpuid);
			if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
						      gd->gd_cpuid))
				cpu_invltlb();
			crit_exit_gd(gd);
		} else {
			splz();
			__asm __volatile("sti");
			++stat->spin;
		}
	}
}
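
/*
 * Illustrative note (not part of the original source): the MONITOR/MWAIT
 * fast path above does not need to disable interrupts because, assuming
 * cpu_mmw_pause_int() arms the monitor on &gd->gd_reqflags, any store to
 * reqflags (e.g. from a scheduler IPI) between the reqflags test and the
 * MWAIT still terminates the wait, so wakeups cannot be lost.
 */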

/*
 * Called in a loop indirectly via Xcpustop
 */
void
cpu_smp_stopped(void)
{
	globaldata_t gd = mycpu;
	volatile __uint64_t *ptr;
	__uint64_t ovalue;

	ptr = CPUMASK_ADDR(started_cpus, gd->gd_cpuid);
	ovalue = *ptr;
	if ((ovalue & CPUMASK_SIMPLE(gd->gd_cpuid & 63)) == 0) {
		if (cpu_mi_feature & CPU_MI_MONITOR) {
			cpu_mmw_pause_long(__DEVOLATILE(void *, ptr), ovalue,
					   cpu_mwait_hints[CPU_MWAIT_C1], 0);
		} else {
			cpu_halt();	/* depend on lapic timer */
		}
	}
}

/*
 * This routine is called if a spinlock has been held through the
 * exponential backoff period and is seriously contested.  On a real cpu
 * we let it spin.
 */
void
cpu_spinlock_contested(void)
{
	cpu_pause();
}

/*
 * Clear registers on exec
 */
void
exec_setregs(u_long entry, u_long stack, u_long ps_strings)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct pcb *pcb = td->td_pcb;
	struct trapframe *regs = lp->lwp_md.md_regs;

	/* was i386_user_cleanup() in NetBSD */
	user_ldt_free(pcb);

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = entry;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_rbx = ps_strings;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0; /* JG set bit 10? */
		if (pcb == td->td_pcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}

	/*
	 * Initialize the math emulator (if any) for the current process.
	 * Actually, just clear the bit that says that the emulator has
	 * been initialized.  Initialization is delayed until the process
	 * traps to the emulator (if it is done at all) mainly because
	 * emulators don't provide an entry point for initialization.
	 */
	pcb->pcb_flags &= ~FP_SOFTFP;

	/*
	 * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
	 *	 gd_npxthread.  Otherwise a preemptive interrupt thread
	 *	 may panic in npxdna().
	 */
	crit_enter();
	load_cr0(rcr0() | CR0_MP);

	/*
	 * NOTE: The MSR values must be correct so we can return to
	 *	 userland.  gd_user_fs/gs must be correct so the switch
	 *	 code knows what the current MSR values are.
	 */
	pcb->pcb_fsbase = 0;	/* Values loaded from PCB on switch */
	pcb->pcb_gsbase = 0;
	mdcpu->gd_user_fs = 0;	/* Cache of current MSR values */
	mdcpu->gd_user_gs = 0;
	wrmsr(MSR_FSBASE, 0);	/* Set MSR values for return to userland */
	wrmsr(MSR_KGSBASE, 0);

	/* Initialize the npx (if any) for the current process. */
	npxinit();
	crit_exit();

	pcb->pcb_ds = _udatasel;
	pcb->pcb_es = _udatasel;
	pcb->pcb_fs = _udatasel;
	pcb->pcb_gs = _udatasel;
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	cr0 |= CR0_NE;			/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
	cr0 |= CR0_WP | CR0_AM;
	load_cr0(cr0);
}

static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
	    req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
    &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
    CTLFLAG_RW, &disable_rtc_set, 0, "");

SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
    CTLFLAG_RD, &bootinfo, bootinfo, "");

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
    CTLFLAG_RW, &wall_cmos_clock, 0, "");

extern u_long bootdev;		/* not a cdev_t - encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
    CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)");

static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");

/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

struct user_segment_descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */
struct gate_descriptor idt_arr[MAXCPU][NIDT];
union descriptor ldt[NLDT];		/* local descriptor table */

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt;
struct region_descriptor r_idt_arr[MAXCPU];

/* JG proc0paddr is a virtual address */
void *proc0paddr;
char proc0paddr_buff[LWKT_THREAD_STACK];

/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
	/* GNULL_SEL	0 Null Descriptor */
	{	0x0,			/* segment base address  */
		0x0,			/* length */
		0,			/* segment type */
		0,			/* segment descriptor priority level */
		0,			/* segment descriptor present */
		0,			/* long */
		0,			/* default 32 vs 16 bit size */
		0			/* limit granularity (byte/page units)*/ },
	/* GCODE_SEL	1 Code Descriptor for kernel */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMERA,		/* segment type */
		SEL_KPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		1,			/* long */
		0,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GDATA_SEL	2 Data Descriptor for kernel */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMRWA,		/* segment type */
		SEL_KPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		1,			/* long */
		0,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GUCODE32_SEL	3 32 bit Code Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMERA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long */
		1,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMRWA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long */
		1,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GUCODE_SEL	5 64 bit Code Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMERA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		1,			/* long */
		0,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GPROC0_SEL	6 Proc 0 Tss Descriptor */
	{	0x0,			/* segment base address */
		sizeof(struct x86_64tss)-1,/* length - all address space */
		SDT_SYSTSS,		/* segment type */
		SEL_KPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long */
		0,			/* unused - default 32 vs 16 bit size */
		0			/* limit granularity (byte/page units)*/ },
	/* Actually, the TSS is a system descriptor which is double size */
	{	0x0,			/* segment base address  */
		0x0,			/* length */
		0,			/* segment type */
		0,			/* segment descriptor priority level */
		0,			/* segment descriptor present */
		0,			/* long */
		0,			/* default 32 vs 16 bit size */
		0			/* limit granularity (byte/page units)*/ },
	/* GUGS32_SEL	8 32 bit GS Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMRWA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long */
		1,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
};

void
setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	int cpu;

	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		struct gate_descriptor *ip = &idt_arr[cpu][idx];

		ip->gd_looffset = (uintptr_t)func;
		ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
		ip->gd_ist = ist;
		ip->gd_xx = 0;
		ip->gd_type = typ;
		ip->gd_dpl = dpl;
		ip->gd_p = 1;
		ip->gd_hioffset = ((uintptr_t)func)>>16 ;
	}
}
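
/*
 * Illustrative note (not part of the original source): the ist argument
 * selects an Interrupt Stack Table slot in the TSS; a non-zero value
 * forces the CPU onto a known-good stack for critical vectors (the
 * double-fault handler being the classic user) regardless of the state
 * of the interrupted context.
 */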

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist, int cpu)
{
	struct gate_descriptor *ip;

	KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu %d", cpu));

	ip = &idt_arr[cpu][idx];
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
}

#define	IDTVEC(name)	__CONCAT(X,name)

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
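
/*
 * Illustrative note (not part of the original source): IDTVEC(name)
 * expands to the assembler entry symbol, e.g. IDTVEC(div) is Xdiv, so the
 * externs above bind the C code to the interrupt stubs defined in the
 * platform assembly.
 */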

void
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
{
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}
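
/*
 * Illustrative note (not part of the original source): unlike the 8-byte
 * user descriptor filled in by ssdtosd(), a long-mode system segment
 * descriptor (TSS/LDT) is 16 bytes, which is why sd_hibase above holds
 * 40 additional base bits and why the TSS consumes two consecutive GDT
 * slots (see the gdt_segs[] comment).
 */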

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * Memory is aligned to PHYSMAP_ALIGN which must be a multiple
 * of PAGE_SIZE.  This also greatly reduces the memory test time
 * which would otherwise be excessive on machines with > 8G of ram.
 *
 * XXX first should be vm_paddr_t.
 */
#define PHYSMAP_ALIGN		(vm_paddr_t)(128 * 1024)
#define PHYSMAP_ALIGN_MASK	(vm_paddr_t)(PHYSMAP_ALIGN - 1)

vm_paddr_t physmap[PHYSMAP_SIZE];
struct bios_smap *smapbase, *smap, *smapend;
struct efi_map_header *efihdrbase;
u_int32_t smapsize;

#define PHYSMAP_HANDWAVE	(vm_paddr_t)(2 * 1024 * 1024)
#define PHYSMAP_HANDWAVE_MASK	(PHYSMAP_HANDWAVE - 1)

static void
add_smap_entries(int *physmap_idx)
{
	int i;

	smapsize = *((u_int32_t *)smapbase - 1);
	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (smap->length == 0)
			continue;

		for (i = 0; i <= *physmap_idx; i += 2) {
			if (smap->base < physmap[i + 1]) {
				if (boothowto & RB_VERBOSE) {
					kprintf("Overlapping or non-monotonic "
					    "memory region, ignoring "
					    "second region\n");
				}
				break;
			}
		}
		if (i <= *physmap_idx)
			continue;

		Realmem += smap->length;

		if (smap->base == physmap[*physmap_idx + 1]) {
			physmap[*physmap_idx + 1] += smap->length;
			continue;
		}

		*physmap_idx += 2;
		if (*physmap_idx == PHYSMAP_SIZE) {
			kprintf("Too many segments in the physical "
			    "address map, giving up\n");
			break;
		}
		physmap[*physmap_idx] = smap->base;
		physmap[*physmap_idx + 1] = smap->base + smap->length;
	}
}
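
/*
 * Illustrative note (not part of the original source): because the loop
 * above extends an existing physmap pair when an entry's base equals the
 * previous end, two adjacent E820 ranges such as [1MB,2MB) and [2MB,3MB)
 * coalesce into a single {base,end} pair instead of consuming two slots.
 */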

static void
add_efi_map_entries(int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdrbase + efisz);

	if (efihdrbase->descriptor_size == 0)
		return;
	ndesc = efihdrbase->memory_size / efihdrbase->descriptor_size;

	if (boothowto & RB_VERBOSE)
		kprintf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdrbase->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type <= EFI_MD_TYPE_PALCODE)
				type = types[p->md_type];
			else
				type = "<INVALID>";
			kprintf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				kprintf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				kprintf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				kprintf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				kprintf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				kprintf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				kprintf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				kprintf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				kprintf("XP ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				kprintf("RUNTIME");
			kprintf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		Realmem += p->md_pages * PAGE_SIZE;

		if (p->md_phys == physmap[*physmap_idx + 1]) {
			physmap[*physmap_idx + 1] += p->md_pages * PAGE_SIZE;
			continue;
		}

		*physmap_idx += 2;
		if (*physmap_idx == PHYSMAP_SIZE) {
			kprintf("Too many segments in the physical "
			    "address map, giving up\n");
			break;
		}
		physmap[*physmap_idx] = p->md_phys;
		physmap[*physmap_idx + 1] = p->md_phys + p->md_pages * PAGE_SIZE;
	}
}

struct fb_info efi_fb_info;
static int have_efi_framebuffer = 0;

static void
efi_fb_init_vaddr(int direct_map)
{
	uint64_t sz;
	vm_offset_t addr, v;

	v = efi_fb_info.vaddr;
	sz = efi_fb_info.stride * efi_fb_info.height;

	if (direct_map) {
		addr = PHYS_TO_DMAP(efi_fb_info.paddr);
		if (addr >= DMAP_MIN_ADDRESS && addr + sz < DMAP_MAX_ADDRESS)
			efi_fb_info.vaddr = addr;
	} else {
		efi_fb_info.vaddr = (vm_offset_t)pmap_mapdev_attr(
		    efi_fb_info.paddr, sz, PAT_WRITE_COMBINING);
	}
}

int
probe_efi_fb(int early)
{
	struct efi_fb *efifb;
	caddr_t kmdp;

	if (have_efi_framebuffer) {
		if (!early &&
		    (efi_fb_info.vaddr == 0 ||
		     efi_fb_info.vaddr == PHYS_TO_DMAP(efi_fb_info.paddr)))
			efi_fb_init_vaddr(0);
		return 0;
	}

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efifb = (struct efi_fb *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_FB);
	if (efifb == NULL)
		return 1;

	have_efi_framebuffer = 1;

	efi_fb_info.is_vga_boot_display = 1;
	efi_fb_info.width = efifb->fb_width;
	efi_fb_info.height = efifb->fb_height;
	efi_fb_info.stride = efifb->fb_stride * 4;
	efi_fb_info.depth = 32;
	efi_fb_info.paddr = efifb->fb_addr;
	if (early) {
		efi_fb_info.vaddr = 0;
	} else {
		efi_fb_init_vaddr(0);
	}
	efi_fb_info.fbops.fb_set_par = NULL;
	efi_fb_info.fbops.fb_blank = NULL;
	efi_fb_info.fbops.fb_debug_enter = NULL;
	efi_fb_info.device = NULL;

	return 0;
}

static void
efifb_startup(void *arg)
{
	probe_efi_fb(0);
}

SYSINIT(efi_fb_info, SI_BOOT1_POST, SI_ORDER_FIRST, efifb_startup, NULL);

static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int off, physmap_idx, pa_indx, da_indx;
	int i, j;
	vm_paddr_t pa;
	vm_paddr_t msgbuf_size;
	u_long physmem_tunable;
	u_long base_memory;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	/*
	 * get memory map from INT 15:E820, kindly supplied by the loader.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */
	efihdrbase = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL && efihdrbase == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdrbase == NULL)
		add_smap_entries(&physmap_idx);
	else
		add_efi_map_entries(&physmap_idx);

	base_memory = physmap[1] / 1024;
	/* make hole for AP bootstrap code */
	physmap[1] = mp_bootaddress(base_memory);

	/* Save EBDA address, if any */
	ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e));
	ebda_addr <<= 4;
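
	/*
	 * Illustrative note (not part of the original source): word 0x40e
	 * in the BIOS data area holds the EBDA as a real-mode segment, so
	 * the value is shifted left 4 bits to form the linear address used
	 * above.
	 */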

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	/*
	 * Blowing out the DMAP will blow up the system.
	 */
	if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) {
		kprintf("Limiting Maxmem due to DMAP size\n");
		Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS);
	}

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE)) {
		kprintf("Physical memory use set to %ldK\n", Maxmem * 4);
	}

	/*
	 * Call pmap initialization to make new kernel address space
	 *
	 * Mask off page 0.
	 */
	pmap_bootstrap(&first);
	physmap[0] = PAGE_SIZE;

	/*
	 * Align the physmap to PHYSMAP_ALIGN and cut out anything
	 * above Maxmem.
	 */
	for (i = j = 0; i <= physmap_idx; i += 2) {
		if (physmap[i+1] > ptoa(Maxmem))
			physmap[i+1] = ptoa(Maxmem);
		physmap[i] = (physmap[i] + PHYSMAP_ALIGN_MASK) &
			     ~PHYSMAP_ALIGN_MASK;
		physmap[i+1] = physmap[i+1] & ~PHYSMAP_ALIGN_MASK;

		physmap[j] = physmap[i];
		physmap[j+1] = physmap[i+1];

		if (physmap[i] < physmap[i+1])
			j += 2;
	}
	physmap_idx = j - 2;

	/*
	 * Align anything else used in the validation loop.
	 */
	first = (first + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;

	/*
	 * Size up each available chunk of physical memory.
	 */
	pa_indx = 0;
	da_indx = 0;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    kgetenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * Validate the physical memory.  The physical memory segments
	 * have already been aligned to PHYSMAP_ALIGN which is a multiple
	 * of PAGE_SIZE.
	 */
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;
		vm_paddr_t incr = PHYSMAP_ALIGN;

		end = physmap[i + 1];

		for (pa = physmap[i]; pa < end; pa += incr) {
			int page_bad, full;
			volatile uint64_t *ptr = (uint64_t *)CADDR1;
			uint64_t tmp;

			incr = PHYSMAP_ALIGN;
			full = FALSE;

			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= 0x200000 && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size) {
				goto do_dump_avail;
			}

			page_bad = FALSE;

			/*
			 * Always test the first and last block supplied in
			 * the map entry, but it just takes too long to run
			 * the test these days and we already have to skip
			 * pages.  Handwave it on PHYSMAP_HANDWAVE boundaries.
			 */
			if (pa != physmap[i]) {
				vm_paddr_t bytes = end - pa;
				if ((pa & PHYSMAP_HANDWAVE_MASK) == 0 &&
				    bytes >= PHYSMAP_HANDWAVE + PHYSMAP_ALIGN) {
					incr = PHYSMAP_HANDWAVE;
					goto handwaved;
				}
			}

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa |
			    kernel_pmap.pmap_bits[PG_V_IDX] |
			    kernel_pmap.pmap_bits[PG_RW_IDX] |
			    kernel_pmap.pmap_bits[PG_N_IDX];
			cpu_invlpg(__DEVOLATILE(void *, ptr));
			cpu_mfence();

			tmp = *ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*ptr = 0xaaaaaaaaaaaaaaaaLLU;
			cpu_mfence();
			if (*ptr != 0xaaaaaaaaaaaaaaaaLLU)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*ptr = 0x5555555555555555LLU;
			cpu_mfence();
			if (*ptr != 0x5555555555555555LLU)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*ptr = 0xffffffffffffffffLLU;
			cpu_mfence();
			if (*ptr != 0xffffffffffffffffLLU)
				page_bad = TRUE;

			/*
			 * Restore original value.
			 */
			*ptr = tmp;
handwaved:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;

			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += incr;
			} else {
				++pa_indx;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					kprintf(
		"Too many holes in the physical address space, giving up\n");
					--pa_indx;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;
				phys_avail[pa_indx] = pa + incr;
			}
			physmem += incr / PAGE_SIZE;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += incr;
			} else {
				++da_indx;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					--da_indx;
					goto do_next;
				}
				dump_avail[da_indx++] = pa;
				dump_avail[da_indx] = pa + incr;
			}
do_next:
			if (full)
				break;
		}
	}
	/*
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;

	while (phys_avail[pa_indx - 1] + PHYSMAP_ALIGN +
	       msgbuf_size >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= msgbuf_size;

	avail_end = phys_avail[pa_indx];

	/* Map the message buffer. */
	for (off = 0; off < msgbuf_size; off += PAGE_SIZE) {
		pmap_kenter((vm_offset_t)msgbufp + off,
			    phys_avail[pa_indx] + off);
	}
	/* Try to get EFI framebuffer working as early as possible */
	if (have_efi_framebuffer)
		efi_fb_init_vaddr(1);
}
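
/*
 * A minimal user-space analogue of the page test in getmemsize() above,
 * for illustration only (this sketch is not part of the original file and
 * assumes nothing beyond <stdint.h> types).  Returns non-zero if the word
 * fails any pattern.
 */
#if 0
static int
test_word(volatile uint64_t *ptr)
{
	static const uint64_t patterns[] = {
		0xaaaaaaaaaaaaaaaaULL,	/* alternating 1's and 0's */
		0x5555555555555555ULL,	/* alternating 0's and 1's */
		0xffffffffffffffffULL,	/* all 1's */
		0x0000000000000000ULL	/* all 0's */
	};
	uint64_t saved = *ptr;
	int i, bad = 0;

	for (i = 0; i < 4; ++i) {
		*ptr = patterns[i];
		if (*ptr != patterns[i])
			bad = 1;
	}
	*ptr = saved;			/* restore original contents */
	return (bad);
}
#endif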
struct machintr_abi MachIntrABI;

/*
 * IDT VECTORS:
 *	0	Divide by zero
 *	1	Debug
 *	2	NMI
 *	3	BreakPoint
 *	4	OverFlow
 *	5	Bound-Range
 *	6	Invalid OpCode
 *	7	Device Not Available (x87)
 *	8	Double-Fault
 *	9	Coprocessor Segment overrun (unsupported, reserved)
 *	10	Invalid-TSS
 *	11	Segment not present
 *	12	Stack
 *	13	General Protection
 *	14	Page Fault
 *	15	Reserved (unsupported, reserved)
 *	16	x87 FP Exception pending
 *	17	Alignment Check
 *	18	Machine Check
 *	19	SIMD floating point
 *	20-31	reserved
 *	32-255	INTn/external sources
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x, cpu;
	int metadata_missing, off;
	struct mdglobaldata *gd;
	u_int64_t msr;

	/*
	 * Prevent lowering of the ipl if we call tsleep() early.
	 */
	gd = &CPU_prvspace[0]->mdglobaldata;
	bzero(gd, sizeof(*gd));
	/*
	 * Note: on both UP and SMP curthread must be set non-NULL
	 * early in the boot sequence because the system assumes
	 * that 'curthread' is never NULL.
	 */
	gd->mi.gd_curthread = &thread0;
	thread0.td_gd = &gd->mi;

	atdevbase = ISA_HOLE_START + PTOV_OFFSET;
	metadata_missing = 0;
	if (bootinfo.bi_modulep) {
		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}
	if (bootinfo.bi_envp)
		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
	preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
	preload_bootstrap_relocate(PTOV_OFFSET);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	if (boothowto & RB_VERBOSE)
		bootverbose++;

	/*
	 * Default MachIntrABI to ICU
	 */
	MachIntrABI = MachIntrABI_ICU;
	/*
	 * start with one cpu.  Note: with one cpu, ncpus2_shift, ncpus2_mask,
	 * and ncpus_fit_mask remain 0.
	 */
	ncpus = 1;
	ncpus2 = 1;
	ncpus_fit = 1;

	/* Init basic tunables, hz etc */
	init_param1();
	/*
	 * make gdt memory segments
	 */
	gdt_segs[GPROC0_SEL].ssd_base =
		(uintptr_t) &CPU_prvspace[0]->mdglobaldata.gd_common_tss;

	gd->mi.gd_prvspace = CPU_prvspace[0];
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
	mi_gdinit(&gd->mi, 0);
	cpu_gdinit(gd, 0);
	proc0paddr = proc0paddr_buff;
	mi_proc0init(&gd->mi, proc0paddr);
	safepri = TDPRI_MAX;
	/* spinlocks and the BGL */
	init_locks();

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt_global(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
	setidt_global(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
		r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
	}

	lidt(&r_idt_arr[0]);
	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

	if (metadata_missing)
		kprintf("WARNING: loader(8) metadata is missing!\n");
	/*
	 * Initialize IRQ mapping
	 *
	 * NOTE:
	 * SHOULD be after elcr_probe()
	 */
	MachIntrABI_ICU.initmap();
	MachIntrABI_IOAPIC.initmap();
#ifdef DDB
	kdb_init();
	if (boothowto & RB_KDB)
		Debugger("Boot flags requested debugger");
#endif
#if 0 /* JG */
	finishidentcpu();	/* Final stage of CPU initialization */
	setidt(6, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(13, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
#endif
	identify_cpu();		/* Final stage of CPU initialization */
	initializecpu(0);	/* Initialize CPU registers */
	/*
	 * On modern intel cpus, haswell or later, cpu_idle_hlt=1 is better
	 * because the cpu does significant power management in MWAIT
	 * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
	 *
	 * On modern amd cpus cpu_idle_hlt=3 is better, because the cpu does
	 * significant power management in HLT or ACPI (but cpu_idle_hlt=1
	 * would try to use MWAIT).
	 *
	 * On older amd or intel cpus, cpu_idle_hlt=2 is better because ACPI
	 * is needed to reduce power consumption, but wakeup times are often
	 * too long.
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    CPUID_TO_MODEL(cpu_id) >= 0x3C) {	/* Haswell or later */
		cpu_idle_hlt = 1;
	}
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    CPUID_TO_FAMILY(cpu_id) >= 0x14) {	/* Bobcat or later */
		cpu_idle_hlt = 3;
	}
2413 TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable
); /* for compat */
2414 TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable
);
2415 TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable
);
2416 TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt
);
	/*
	 * Some of the virtual machines do not work w/ I/O APIC
	 * enabled.  If the user does not explicitly enable or
	 * disable the I/O APIC (ioapic_enable < 0), then we
	 * disable I/O APIC on all virtual machines.
	 *
	 * NOTE:
	 * This must be done after identify_cpu(), which sets
	 * 'cpu_feature2'.
	 */
	if (ioapic_enable < 0) {
		if (cpu_feature2 & CPUID2_VMM)
			ioapic_enable = 0;
		else
			ioapic_enable = 1;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	gd->gd_common_tss.tss_rsp0 =
		(register_t)(thread0.td_kstack +
			     KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb));
	/* Ensure the stack is aligned to 16 bytes */
	gd->gd_common_tss.tss_rsp0 &= ~(register_t)0xF;

	/* double fault stack */
	gd->gd_common_tss.tss_ist1 =
		(long)&gd->mi.gd_prvspace->idlestack[
			sizeof(gd->mi.gd_prvspace->idlestack)];

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	gd->gd_common_tss.tss_iobase = sizeof(struct x86_64tss);
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gd->gd_tss_gdt = &gdt[GPROC0_SEL];
	gd->gd_common_tssd = *gd->gd_tss_gdt;
	ltr(gsel_tss);
	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL);
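
	/*
	 * Illustrative note (from the AMD64 architecture manuals, not
	 * from the original file): MSR_STAR bits 47:32 supply the
	 * selector base SYSCALL loads for kernel CS/SS, and bits 63:48
	 * the base SYSRET uses to rebuild user CS/SS; MSR_SF_MASK lists
	 * the RFLAGS bits SYSCALL clears on entry (here: NT, T, I, C, D
	 * and IOPL).
	 */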
	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */
	/* Map the message buffer. */
#if 0 /* JG */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
#endif

	msgbufinit(msgbufp, MSGBUF_SIZE);
	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_pcb->pcb_cr3 = KPML4phys;
	thread0.td_pcb->pcb_ext = NULL;
	lwp0.lwp_md.md_regs = &proc0_tf;	/* XXX needed? */

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}
/*
 * Initialize machine-dependent portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
 */
void
cpu_gdinit(struct mdglobaldata *gd, int cpu)
{
	if (cpu)
		gd->mi.gd_curthread = &gd->mi.gd_idlethread;

	lwkt_init_thread(&gd->mi.gd_idlethread,
			 gd->mi.gd_prvspace->idlestack,
			 sizeof(gd->mi.gd_prvspace->idlestack),
			 0, &gd->mi);
	lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
	gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
	gd->mi.gd_idlethread.td_sp -= sizeof(void *);
	*(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
}
/*
 * We only have to check for DMAP bounds, the globaldata space is
 * actually part of the kernel_map so we don't have to waste time
 * checking CPU_prvspace[*].
 */
int
is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
{
	if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
	    eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
		return (TRUE);
	}
	if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
		return (TRUE);
	return (FALSE);
}

struct globaldata *
globaldata_find(int cpu)
{
	KKASSERT(cpu >= 0 && cpu < ncpus);
	return(&CPU_prvspace[cpu]->mdglobaldata.mi);
}
/*
 * This path should be safe from the SYSRET issue because only stopped threads
 * can have their %rip adjusted this way (and all heavy weight thread switches
 * clear QUICKRET and thus do not use SYSRET).  However, the code path is
 * convoluted so add a safety by forcing %rip to be canonical.
 */
int
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
	if (addr & 0x0000800000000000LLU)
		lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
	else
		lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
	return (0);
}
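
/*
 * A minimal sketch of the same canonicalization, for illustration only
 * (not part of the original file; assumes a 64-bit long): a canonical
 * x86_64 address replicates bit 47 through bits 63:48, which an
 * arithmetic shift performs in one step.
 */
#if 0
static inline unsigned long
canonicalize(unsigned long addr)
{
	/* shift bit 47 up to bit 63, then sign-extend back down */
	return ((unsigned long)(((long)addr << 16) >> 16));
}
#endif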
int
ptrace_single_step(struct lwp *lp)
{
	lp->lwp_md.md_regs->tf_rflags |= PSL_T;
	return (0);
}
int
fill_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	if ((tp = lp->lwp_md.md_regs) == NULL)
		return EINVAL;
	bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
	return (0);
}
int
set_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
	clear_quickret();
	return (0);
}
static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}
static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}
int
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
	if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
		return EINVAL;
#ifndef CPU_DISABLE_SSE
	fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
			(struct save87 *)fpregs);
	return (0);
#else
	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
	return (0);
#endif
}
int
set_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
#ifndef CPU_DISABLE_SSE
	set_fpregs_xmm((struct save87 *)fpregs,
		       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
	return (0);
#else
	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
	return (0);
#endif
}
int
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (lp == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[4] = rdr4();
		dbregs->dr[5] = rdr5();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
		return (0);
	}
	if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL)
		return EINVAL;
	dbregs->dr[0] = pcb->pcb_dr0;
	dbregs->dr[1] = pcb->pcb_dr1;
	dbregs->dr[2] = pcb->pcb_dr2;
	dbregs->dr[3] = pcb->pcb_dr3;
	dbregs->dr[4] = 0;
	dbregs->dr[5] = 0;
	dbregs->dr[6] = pcb->pcb_dr6;
	dbregs->dr[7] = pcb->pcb_dr7;
	return (0);
}
int
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	if (lp == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr4(dbregs->dr[4]);
		load_dr5(dbregs->dr[5]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		struct pcb *pcb;
		struct ucred *ucred;
		int i;
		uint64_t mask1, mask2;

		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected reset or reboot.
		 */
		/* JG this loop looks unreadable */
		/*
		 * Check 4 2-bit fields for invalid patterns.
		 * These fields are R/Wi, for i = 0..3
		 */
		/* Is 10 in LENi allowed when running in compatibility mode? */
		/*
		 * Pattern 10 in R/Wi might be used to indicate
		 * breakpoint on I/O.  Further analysis should be
		 * carried out to decide if it is safe and useful to
		 * provide access to that capability.
		 */
		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
		     i++, mask1 <<= 4, mask2 <<= 4) {
			if ((dbregs->dr[7] & mask1) == mask2)
				return (EINVAL);
		}
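
		/*
		 * Illustrative note (not from the original source):
		 * unrolled, the loop above rejects the undefined 10b
		 * pattern in each R/W field of dr7:
		 *	i=0: (dr7 & (0x3<<16)) == (0x2<<16)	R/W0, bits 16-17
		 *	i=1: (dr7 & (0x3<<20)) == (0x2<<20)	R/W1, bits 20-21
		 *	i=2: (dr7 & (0x3<<24)) == (0x2<<24)	R/W2, bits 24-25
		 *	i=3: (dr7 & (0x3<<28)) == (0x2<<28)	R/W3, bits 28-29
		 */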
		pcb = lp->lwp_thread->td_pcb;
		ucred = lp->lwp_proc->p_ucred;
		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * the process's address space, unless, perhaps, we were
		 * called by root.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */
		if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
			if (dbregs->dr[7] & 0x3) {
				/* dr0 is enabled */
				if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr[7] & (0x3<<2)) {
				/* dr1 is enabled */
				if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr[7] & (0x3<<4)) {
				/* dr2 is enabled */
				if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr[7] & (0x3<<6)) {
				/* dr3 is enabled */
				if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
		}
		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		pcb->pcb_flags |= PCB_DBREGS;
	}

	return (0);
}
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int64_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0xff) == 0) {
		/*
		 * all GE and LE bits in the dr7 register are zero,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers
		 */
		return 0;
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0xf;

	if (bp == 0) {
		/*
		 * None of the breakpoint bits are set meaning this
		 * trap was not caused by any of the debug registers
		 */
		return 0;
	}

	/*
	 * at least one of the breakpoints was hit, check to see
	 * which ones and if any of them are user space addresses
	 */

	if (bp & 0x01) {
		addr[nbp++] = (caddr_t)rdr0();
	}
	if (bp & 0x02) {
		addr[nbp++] = (caddr_t)rdr1();
	}
	if (bp & 0x04) {
		addr[nbp++] = (caddr_t)rdr2();
	}
	if (bp & 0x08) {
		addr[nbp++] = (caddr_t)rdr3();
	}

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}
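
/*
 * Illustrative note (x86 architecture background, not from the original
 * file): the low four bits of dr6 are the B0-B3 status flags, one per
 * debug register, which is why "dr6 & 0xf" above isolates exactly the
 * breakpoints that fired.
 */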
void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}
#ifdef DDB

/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called inside DDB.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */

#undef inb
#undef outb

/* silence compiler warnings */
u_char inb(u_int);
void outb(u_int, u_char);

u_char
inb(u_int port)
{
	u_char	data;
	/*
	 * We use %%dx and not %1 here because i/o is done at %dx and not at
	 * %edx, while gcc generates inferior code (movw instead of movl)
	 * if we tell it to load (u_short) port.
	 */
	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
	return (data);
}

void
outb(u_int port, u_char data)
{
	u_char	al;
	/*
	 * Use an unnecessary assignment to help gcc's register allocator.
	 * This makes a large difference for gcc-1.40 and a tiny difference
	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
	 * best results.  gcc-2.6.0 can't handle this.
	 */
	al = data;
	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
}

#endif /* DDB */
/*
 * initialize all the SMP locks
 */

/* critical region when masking or unmasking interrupts */
struct spinlock_deprecated imen_spinlock;

/* lock region used by kernel profiling */
struct spinlock_deprecated mcount_spinlock;

/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct spinlock_deprecated com_spinlock;

/* lock regions around the clock hardware */
struct spinlock_deprecated clock_spinlock;

static void
init_locks(void)
{
	/*
	 * Get the initial mplock with a count of 1 for the BSP.
	 * This uses a LOGICAL cpu ID, ie BSP == 0.
	 */
	cpu_get_initial_mplock();

	/* DEPRECATED */
	spin_init_deprecated(&mcount_spinlock);
	spin_init_deprecated(&imen_spinlock);
	spin_init_deprecated(&com_spinlock);
	spin_init_deprecated(&clock_spinlock);

	/* our token pool needs to work early */
	lwkt_token_pool_init();
}
boolean_t
cpu_mwait_hint_valid(uint32_t hint)
{
	int cx_idx, sub;

	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= CPU_MWAIT_CX_MAX)
		return FALSE;

	sub = MWAIT_EAX_TO_CX_SUB(hint);
	if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return FALSE;

	return TRUE;
}
void
cpu_mwait_cx_no_bmsts(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
}

void
cpu_mwait_cx_no_bmarb(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
}
static int
cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
{
	int old_cx_idx, sub = 0;

	if (hint >= 0) {
		old_cx_idx = MWAIT_EAX_TO_CX(hint);
		sub = MWAIT_EAX_TO_CX_SUB(hint);
	} else if (hint == CPU_MWAIT_HINT_AUTO) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
	} else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
	} else {
		old_cx_idx = CPU_MWAIT_CX_MAX;
	}

	if (!CPU_MWAIT_HAS_CX)
		strlcpy(name, "NONE", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
		strlcpy(name, "AUTO", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
		strlcpy(name, "AUTODEEP", namelen);
	else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
	    sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
		strlcpy(name, "INVALID", namelen);
	else
		ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);

	return old_cx_idx;
}
static int
cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
{
	int cx_idx, sub, hint;
	char *ptr, *start;

	if (allow_auto && strcmp(name, "AUTO") == 0) {
		hint = CPU_MWAIT_HINT_AUTO;
		cx_idx = CPU_MWAIT_C2;
		goto done;
	}
	if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
		hint = CPU_MWAIT_HINT_AUTODEEP;
		cx_idx = CPU_MWAIT_C3;
		goto done;
	}

	if (strlen(name) < 4 || toupper(name[0]) != 'C')
		return -1;
	start = &name[1];
	ptr = NULL;

	cx_idx = strtol(start, &ptr, 10);
	if (ptr == start || *ptr != '/')
		return -1;
	if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
		return -1;

	start = ptr + 1;
	ptr = NULL;

	sub = strtol(start, &ptr, 10);
	if (ptr == start || *ptr != '\0')
		return -1;
	if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return -1;

	hint = MWAIT_EAX_HINT(cx_idx, sub);
done:
	*hint0 = hint;
	return cx_idx;
}
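
/*
 * Illustrative usage of the two helpers above (sketch only, not part of
 * the original file): the sysctl strings round-trip, e.g. "C1/0" parses
 * to the MWAIT hint for C1 sub-state 0 and formats back to "C1/0".
 * MWAIT_EAX_HINT packs the (C-state, sub-state) pair into the EAX hint
 * layout consumed by the MWAIT instruction (upper nibble selects the
 * C-state, lower nibble the sub-state).
 */
#if 0
	char name[CPU_MWAIT_CX_NAMELEN];
	int hint = -1;

	strlcpy(name, "C1/0", sizeof(name));
	if (cpu_mwait_cx_name2hint(name, &hint, FALSE) >= 0)
		kprintf("hint %#x valid=%d\n", hint, cpu_mwait_hint_valid(hint));
#endif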
static int
cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
{
	int error;

	if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)
		return EOPNOTSUPP;

	if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
		error = cputimer_intr_powersave_addreq();
		if (error)
			return error;
	} else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
		cputimer_intr_powersave_remreq();
	}
	return 0;
}
static int
cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
    boolean_t allow_auto)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	hint = *hint0;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
	    allow_auto);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error == 0)
		*hint0 = hint;
	return error;
}
static int
cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension"));

	hint = stat->hint;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	strlcpy(name, cx_name, sizeof(name));
	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error == 0)
		stat->hint = hint;
	return error;
}
static int
cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	int hint = cpu_mwait_halt_global;
	int error, cx_idx, cpu;
	char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];

	cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	/* Save name for later per-cpu CX configuration */
	strlcpy(cx_name, name, sizeof(cx_name));

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	/* Change per-cpu CX configuration */
	for (cpu = 0; cpu < ncpus; ++cpu) {
		error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
		if (error)
			return error;
	}

	cpu_mwait_halt_global = hint;
	return 0;
}
static int
cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct cpu_idle_stat *stat = arg1;
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &stat->hint, TRUE);
	return error;
}

static int
cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &cpu_mwait_spin, FALSE);
	return error;
}
/*
 * This manual debugging code is called unconditionally from Xtimer
 * (the per-cpu timer interrupt), whether the current thread is in a
 * critical section or not, and can be useful in tracking down lockups.
 *
 * NOTE: MANUAL DEBUG CODE
 */
static int saveticks[SMP_MAXCPU];
static int savecounts[SMP_MAXCPU];

void
pcpu_timer_always(struct intrframe *frame)
{
	globaldata_t gd = mycpu;
	int cpu = gd->gd_cpuid;
	char buf[64];
	short *gptr;
	int i;

	gptr = (short *)0xFFFFFFFF800b8000 + 80 * cpu;
	*gptr = ((*gptr + 1) & 0x00FF) | 0x0700;
	++gptr;

	ksnprintf(buf, sizeof(buf), " %p %16s %d %16s ",
	    (void *)frame->if_rip, gd->gd_curthread->td_comm, ticks,
	    gd->gd_infomsg);		/* XXX: assumed fourth argument */
	for (i = 0; buf[i]; ++i) {
		gptr[i] = 0x0700 | (unsigned char)buf[i];
	}

	if (saveticks[gd->gd_cpuid] != ticks) {
		saveticks[gd->gd_cpuid] = ticks;
		savecounts[gd->gd_cpuid] = 0;
	}
	++savecounts[gd->gd_cpuid];
	if (savecounts[gd->gd_cpuid] > 2000 && panicstr == NULL) {
		panic("cpu %d panicking on ticks failure",
		      gd->gd_cpuid);
	}
	for (i = 0; i < ncpus; ++i) {
		int delta;

		if (saveticks[i] && panicstr == NULL) {
			delta = saveticks[i] - ticks;
			if (delta < -10 || delta > 10) {
				panic("cpu %d panicking on cpu %d watchdog",