/*
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 2008 The DragonFly Project.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
 */
//#include "use_npx.h"

#include "opt_compat.h"
#include "opt_directio.h"
#include "opt_msgbuf.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/reboot.h>
#include <sys/msgbuf.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/usched.h>
#include <sys/ctype.h>
#include <sys/serialize.h>
#include <sys/systimer.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <sys/thread2.h>
#include <sys/mplock2.h>
#include <sys/mutex2.h>
#include <machine/cpu.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#include <machine/bootinfo.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb_ext.h>		/* pcb.h included via sys/user.h */
#include <machine/globaldata.h>		/* CPU_prvspace */
#include <machine/smp.h>
#include <machine/perfmon.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <machine/framebuffer.h>
#include <bus/isa/isa_device.h>
#include <machine_base/isa/isa_intr.h>
#include <bus/isa/rtc.h>
#include <sys/random.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>

#include <sys/machintr.h>
#include <machine_base/icu/icu_abi.h>
#include <machine_base/icu/elcr_var.h>
#include <machine_base/apic/lapic.h>
#include <machine_base/apic/ioapic.h>
#include <machine_base/apic/ioapic_abi.h>
#include <machine/mptable.h>
#define PHYSMAP_ENTRIES		10

extern u_int64_t hammer_time(u_int64_t, u_int64_t);
extern void printcpuinfo(void);	/* XXX header file */
extern void identify_cpu(void);
extern void finishidentcpu(void);
extern void panicifcpuunsupported(void);

static void cpu_startup(void *);
static void pic_finish(void *);
static void cpu_finish(void *);
static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */
static void init_locks(void);

extern void pcpu_timer_always(struct intrframe *);
SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL);
SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL);
extern vm_offset_t ksym_start, ksym_end;
struct privatespace CPU_prvspace_bsp __aligned(4096);
struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp };
int	_udatasel, _ucodesel, _ucode32sel;

int64_t tsc_offsets[MAXCPU];
cpumask_t smp_idleinvl_mask;
cpumask_t smp_idleinvl_reqs;
static int cpu_mwait_halt_global; /* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */
#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
    CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
    CTLFLAG_RD, &tlb_flush_count, 0, "");
#endif
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt,
    CTLFLAG_RD, &cpu_mwait_halt_global, 0, "");
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin, CTLFLAG_RD, &cpu_mwait_spin, 0,
    "monitor/mwait target state");
#define CPU_MWAIT_HAS_CX	\
	((cpu_feature2 & CPUID2_MON) && \
	 (cpu_mwait_feature & CPUID_MWAIT_EXT))

#define CPU_MWAIT_CX_NAMELEN	16

#define CPU_MWAIT_C1		1
#define CPU_MWAIT_C2		2
#define CPU_MWAIT_C3		3
#define CPU_MWAIT_CX_MAX	8

#define CPU_MWAIT_HINT_AUTO	-1	/* C1 and C2 */
#define CPU_MWAIT_HINT_AUTODEEP	-2	/* C3+ */
SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features");
SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings");
struct cpu_mwait_cx {
	int			subcnt;
	char			name[4];

	struct sysctl_ctx_list	sysctl_ctx;
	struct sysctl_oid	*sysctl_tree;
};
static struct cpu_mwait_cx	cpu_mwait_cx_info[CPU_MWAIT_CX_MAX];
static char			cpu_mwait_cx_supported[256];

static int			cpu_mwait_c1_hints_cnt;
static int			cpu_mwait_hints_cnt;
static int			*cpu_mwait_hints;

static int			cpu_mwait_deep_hints_cnt;
static int			*cpu_mwait_deep_hints;
#define CPU_IDLE_REPEAT_DEFAULT	750

static u_int cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT;
static u_long cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT;
static u_int cpu_mwait_repeat_shift = 1;
#define CPU_MWAIT_C3_PREAMBLE_BM_ARB	0x1
#define CPU_MWAIT_C3_PREAMBLE_BM_STS	0x2

static int cpu_mwait_c3_preamble =
    CPU_MWAIT_C3_PREAMBLE_BM_ARB |
    CPU_MWAIT_C3_PREAMBLE_BM_STS;
SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD,
    cpu_mwait_cx_supported, 0, "MWAIT supported C states");
SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD,
    &cpu_mwait_c3_preamble, 0, "C3+ preamble mask");
static int	cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,
		    int *hint0, boolean_t allow_auto);
static int	cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, cpu_mwait_cx_idle_sysctl, "A", "");
SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, cpu_mwait_cx_spin_sysctl, "A", "");
SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW,
    &cpu_mwait_repeat_shift, 0, "");
u_long ebda_addr = 0;

int imcr_present = 0;

int naps = 0;		/* # of Application processors */

struct mtx dt_lock;	/* lock for GDT and LDT */
static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
	u_long pmem = ctob(physmem);
	int error = sysctl_handle_long(oidp, &pmem, 0, req);

	return (error);
}

SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
    0, 0, sysctl_hw_physmem, "LU",
    "Total system memory in bytes (number of pages * page size)");
static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
	    ctob(physmem - vmstats.v_wire_count), req);

	return (error);
}

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD,
    0, 0, sysctl_hw_usermem, "IU", "");
static int
sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
	    x86_64_btop(avail_end - avail_start), req);

	return (error);
}

SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD,
    0, 0, sysctl_hw_availpages, "I", "");
/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END	(NELEM(phys_avail) - 2)
#define DUMP_AVAIL_ARRAY_END	(NELEM(dump_avail) - 2)
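
/*
 * Illustrative sketch (not compiled in): how the base/end pairs stored
 * in phys_avail[] are consumed.  The trailing 0,0 pair written during
 * getmemsize() terminates the scan; cpu_startup() below walks the array
 * the same way when it prints the physical memory chunks.
 */
#if 0
static vm_paddr_t
example_phys_avail_bytes(void)
{
	vm_paddr_t total = 0;
	int i;

	for (i = 0; phys_avail[i + 1] != 0; i += 2)
		total += phys_avail[i + 1] - phys_avail[i];
	return (total);
}
#endif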
static vm_offset_t buffer_sva, buffer_eva;
vm_offset_t clean_sva, clean_eva;
static vm_offset_t pager_sva, pager_eva;
static struct trapframe proc0_tf;

static void
cpu_startup(void *dummy)
{
	caddr_t v;
	vm_size_t size = 0;
	vm_offset_t firstaddr;

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	kprintf("%s", version);
	panicifcpuunsupported();

	kprintf("real memory = %ju (%ju MB)\n",
	    (intmax_t)Realmem,
	    (intmax_t)Realmem / 1024 / 1024);
	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		kprintf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size1 =
			    phys_avail[indx + 1] - phys_avail[indx];

			kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
			    (intmax_t)phys_avail[indx],
			    (intmax_t)phys_avail[indx + 1] - 1,
			    (intmax_t)size1,
			    (intmax_t)(size1 / PAGE_SIZE));
		}
	}
	/*
	 * Allocate space for system data structures.
	 * The first available kernel virtual address is in "v".
	 * As pages of kernel virtual memory are allocated, "v" is incremented.
	 * As pages of memory are allocated and cleared,
	 * "firstaddr" is incremented.
	 * An index into the kernel page table corresponding to the
	 * virtual memory address maintained in "v" is kept in "mapaddr".
	 */

	/*
	 * Make two passes.  The first pass calculates how much memory is
	 * needed and allocates it.  The second pass assigns virtual
	 * addresses to the various data structures.
	 */
	firstaddr = 0;
again:
	v = (caddr_t)firstaddr;

#define	valloc(name, type, num) \
	    (name) = (type *)v; v = (caddr_t)((name)+(num))
#define	valloclim(name, type, num, lim) \
	    (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
	/*
	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
	 * For the first 64MB of ram nominally allocate sufficient buffers to
	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
	 * buffers to cover 1/20 of our ram over 64MB.  When auto-sizing
	 * the buffer cache we limit the eventual kva reservation to
	 * maxbcache bytes.
	 *
	 * factor represents the 1/4 x ram conversion.
	 */
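	/*
	 * Worked example (illustrative only, assuming a BKVASIZE of 16KB,
	 * giving factor = 4 * 16384 / 1024 = 64): with 1GB of ram,
	 * kbytes = 1048576, so the first 64MB contributes
	 * min((1048576 - 4096) / 64, 65536 / 64) = 1024 buffers and the
	 * ram beyond 64MB contributes (1048576 - 65536) * 2 / (64 * 5)
	 * = 6144 buffers on top of the base value.
	 */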
	if (nbuf == 0) {
		long factor = 4 * BKVASIZE / 1024;
		long kbytes = physmem * (PAGE_SIZE / 1024);

		nbuf = 50;
		if (kbytes > 4096)
			nbuf += min((kbytes - 4096) / factor, 65536 / factor);
		if (kbytes > 65536)
			nbuf += (kbytes - 65536) * 2 / (factor * 5);
		if (maxbcache && nbuf > maxbcache / BKVASIZE)
			nbuf = maxbcache / BKVASIZE;
	}
	/*
	 * Do not allow the buffer_map to be more than 1/2 the size of the
	 * kernel_map.
	 */
	if (nbuf > (virtual_end - virtual_start +
		    virtual2_end - virtual2_start) / (BKVASIZE * 2)) {
		nbuf = (virtual_end - virtual_start +
			virtual2_end - virtual2_start) / (BKVASIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf);
	}
	/*
	 * Do not allow the buffer_map to use more than 50% of available
	 * physical-equivalent memory.  Since the VM pages which back
	 * individual buffers are typically wired, having too many bufs
	 * can prevent the system from paging properly.
	 */
	if (nbuf > physmem * PAGE_SIZE / (BKVASIZE * 2)) {
		nbuf = physmem * PAGE_SIZE / (BKVASIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf);
	}
	/*
	 * Do not allow the sizeof(struct buf) * nbuf to exceed half of
	 * the valloc space which is just the virtual_end - virtual_start
	 * section.  We use valloc() to allocate the buf header array.
	 */
	if (nbuf > (virtual_end - virtual_start) / sizeof(struct buf) / 2) {
		nbuf = (virtual_end - virtual_start) /
		       sizeof(struct buf) / 2;
		kprintf("Warning: nbufs capped at %ld due to valloc "
			"considerations\n", nbuf);
	}
	nswbuf_mem = lmax(lmin(nbuf / 32, 256), 8);
	if (nswbuf_mem < NSWBUF_MIN)
		nswbuf_mem = NSWBUF_MIN;

	nswbuf_kva = lmax(lmin(nbuf / 4, 256), 16);
	if (nswbuf_kva < NSWBUF_MIN)
		nswbuf_kva = NSWBUF_MIN;
	valloc(swbuf_mem, struct buf, nswbuf_mem);
	valloc(swbuf_kva, struct buf, nswbuf_kva);
	valloc(buf, struct buf, nbuf);
	/*
	 * End of first pass, size has been calculated so allocate memory
	 */
	if (firstaddr == 0) {
		size = (vm_size_t)(v - firstaddr);
		firstaddr = kmem_alloc(&kernel_map, round_page(size));
		if (firstaddr == 0)
			panic("startup: no room for tables");
		goto again;
	}
	/*
	 * End of second pass, addresses have been assigned
	 *
	 * nbuf is an int, make sure we don't overflow the field.
	 *
	 * On 64-bit systems we always reserve maximal allocations for
	 * buffer cache buffers and there are no fragmentation issues,
	 * so the KVA segment does not have to be excessively oversized.
	 */
	if ((vm_size_t)(v - firstaddr) != size)
		panic("startup: table size inconsistency");
	kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva,
		      ((vm_offset_t)(nbuf + 16) * BKVASIZE) +
		      ((nswbuf_mem + nswbuf_kva) * MAXPHYS) + pager_map_size);
	kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva,
		      ((vm_offset_t)(nbuf + 16) * BKVASIZE));
	buffer_map.system_map = 1;
	kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva,
		      ((vm_offset_t)(nswbuf_mem + nswbuf_kva) * MAXPHYS) +
		      pager_map_size);
	pager_map.system_map = 1;
	kprintf("avail memory = %ju (%ju MB)\n",
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages),
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages) /
		(1024 * 1024));
}
struct cpu_idle_stat {
	int	hint;
	int	reserved;
	u_long	halt;
	u_long	spin;
	u_long	repeat;
	u_long	repeat_last;
	u_long	repeat_delta;
	u_long	mwait_cx[CPU_MWAIT_CX_MAX];
} __cachealign;

#define CPU_IDLE_STAT_HALT	-1
#define CPU_IDLE_STAT_SPIN	-2

static struct cpu_idle_stat	cpu_idle_stats[MAXCPU];
static int
sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)
{
	int idx = arg2, cpu, error;
	u_long val = 0;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].halt;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].spin;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].mwait_cx[idx];
	}

	error = sysctl_handle_quad(oidp, &val, 0, req);
	if (error || req->newptr == NULL)
		return error;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].halt = 0;
		cpu_idle_stats[0].halt = val;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].spin = 0;
		cpu_idle_stats[0].spin = val;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].mwait_cx[idx] = 0;
		cpu_idle_stats[0].mwait_cx[idx] = val;
	}
	return 0;
}
static void
cpu_mwait_attach(void)
{
	struct sbuf sb;
	int hint_idx, i;

	if (!CPU_MWAIT_HAS_CX)
		return;

	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    (CPUID_TO_FAMILY(cpu_id) > 0xf ||
	     (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
	      CPUID_TO_MODEL(cpu_id) >= 0xf))) {
		int bm_sts = 1;

		/*
		 * Pentium dual-core, Core 2 and beyond do not need any
		 * additional activities to enter deep C-state, i.e. C3(+).
		 */
		cpu_mwait_cx_no_bmarb();

		TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts);
		if (!bm_sts)
			cpu_mwait_cx_no_bmsts();
	}

	sbuf_new(&sb, cpu_mwait_cx_supported,
	    sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN);
	for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) {
		struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i];
		int sub;

		ksnprintf(cx->name, sizeof(cx->name), "C%d", i);

		sysctl_ctx_init(&cx->sysctl_ctx);
		cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO,
		    cx->name, CTLFLAG_RW, NULL, "Cx control/info");
		if (cx->sysctl_tree == NULL)
			continue;

		cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i);
		SYSCTL_ADD_INT(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "subcnt", CTLFLAG_RD, &cx->subcnt, 0,
		    "sub-state count");
		SYSCTL_ADD_PROC(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0,
		    i, sysctl_cpu_idle_cnt, "Q", "# of times entered");

		for (sub = 0; sub < cx->subcnt; ++sub)
			sbuf_printf(&sb, "C%d/%d ", i, sub);
	}
	sbuf_trim(&sb);
	sbuf_finish(&sb);

	/* Non-deep C-state hints */
	cpu_mwait_c1_hints_cnt = cpu_mwait_cx_info[CPU_MWAIT_C1].subcnt;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i)
		cpu_mwait_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_hints = kmalloc(sizeof(int) * cpu_mwait_hints_cnt,
	    M_DEVBUF, M_WAITOK);

	hint_idx = 0;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) {
		int j, subcnt;

		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_hints_cnt,
			    ("invalid mwait hint index %d", hint_idx));
			cpu_mwait_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
			++hint_idx;
		}
	}
	KASSERT(hint_idx == cpu_mwait_hints_cnt,
	    ("mwait hint count %d != index %d",
	     cpu_mwait_hints_cnt, hint_idx));

	if (bootverbose) {
		kprintf("MWAIT hints (%d C1 hints):\n",
		    cpu_mwait_c1_hints_cnt);
		for (i = 0; i < cpu_mwait_hints_cnt; ++i) {
			int hint = cpu_mwait_hints[i];

			kprintf("  C%d/%d hint 0x%04x\n",
			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
			    hint);
		}
	}
	/* Deep C-state hints */
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i)
		cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt,
	    M_DEVBUF, M_WAITOK);

	hint_idx = 0;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) {
		int j, subcnt;

		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_deep_hints_cnt,
			    ("invalid mwait deep hint index %d", hint_idx));
			cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
			++hint_idx;
		}
	}
	KASSERT(hint_idx == cpu_mwait_deep_hints_cnt,
	    ("mwait deep hint count %d != index %d",
	     cpu_mwait_deep_hints_cnt, hint_idx));

	if (bootverbose) {
		kprintf("MWAIT deep hints:\n");
		for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) {
			int hint = cpu_mwait_deep_hints[i];

			kprintf("  C%d/%d hint 0x%04x\n",
			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
			    hint);
		}
	}
	cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt;
	for (i = 0; i < ncpus; ++i) {
		char name[16];

		ksnprintf(name, sizeof(name), "idle%d", i);
		SYSCTL_ADD_PROC(NULL,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO,
		    name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i],
		    0, cpu_mwait_cx_pcpu_idle_sysctl, "A", "");
	}
}
static void
cpu_finish(void *dummy __unused)
{
	cpu_setregs();
	cpu_mwait_attach();
}

static void
pic_finish(void *dummy __unused)
{
	/* Log ELCR information */
	elcr_dump();

	/* Log MPTABLE information */
	mptable_pci_int_dump();

	/* Finalize PIC */
	MachIntrABI.finalize();
}
/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	struct sigacts *psp = p->p_sigacts;
	struct sigframe sf, *sfp;
	int oonstack;
	char *sp;

	regs = lp->lwp_md.md_regs;
	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;

	/* Save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = lp->lwp_sigstk;
	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
	KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));

	/* Make the size of the saved context visible to userland */
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);

	/* Allocate and validate space for the signal handler context. */
	if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)(lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
			      sizeof(struct sigframe));
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		/* We take red zone into account */
		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
	}
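	/*
	 * (The 128 bytes skipped above correspond to the x86_64 ELF ABI
	 * red zone below %rsp, which signal delivery must not clobber.)
	 */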
	/*
	 * XXX AVX needs 64-byte alignment but sigframe has other fields and
	 * the embedded ucontext is not at the front, so aligning this won't
	 * help us.  Fortunately we bcopy in/out of the sigframe, so the
	 * kernel itself does not access it unaligned.
	 *
	 * The problem though is if userland winds up trying to use the
	 * context directly.
	 */
	sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF);

	/* Translate the signal if appropriate */
	if (p->p_sysent->sv_sigtbl) {
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
	}
	/*
	 * Build the argument list for the signal handler.
	 *
	 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
	 */
	regs->tf_rdi = sig;				/* argument 1 */
	regs->tf_rdx = (register_t)&sfp->sf_uc;		/* argument 3 */

	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/*
		 * Signal handler installed with SA_SIGINFO.
		 *
		 * action(signo, siginfo, ucontext)
		 */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* fill siginfo structure */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_addr;
	} else {
		/*
		 * Old FreeBSD-style arguments.
		 *
		 * handler (signo, code, [uc], addr)
		 */
		regs->tf_rsi = (register_t)code;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_handler = catcher;
	}
#if JG
	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 =
		    &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}
#endif
	/*
	 * Save the FPU state and reinit the FP unit
	 */
	npxpush(&sf.sf_uc.uc_mcontext);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(lp, SIGILL);
	}

	regs->tf_rsp = (register_t)sfp;
	regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);

	/*
	 * i386 abi specifies that the direction flag must be cleared
	 * on function entry
	 */
	regs->tf_rflags &= ~(PSL_T | PSL_D);

	/*
	 * 64 bit mode has a code and stack selector but
	 * no data or extra selector.  %fs and %gs are not
	 * stored in selectors.
	 */
	regs->tf_cs = _ucodesel;
	regs->tf_ss = _udatasel;
}
/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.  Remove any items that would otherwise create a privilege
 * issue.
 *
 * XXX at the moment we allow userland to set the resume flag.  Is this a
 * bad idea?
 */
int
cpu_sanitize_frame(struct trapframe *frame)
{
	frame->tf_cs = _ucodesel;
	frame->tf_ss = _udatasel;
	/* XXX VM (8086) mode not supported? */
	frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
	frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;

	return (0);
}
/*
 * Sanitize the tls so loading the descriptor does not blow up
 * on us.  For x86_64 we don't have to do anything.
 */
int
cpu_sanitize_tls(struct savetls *tls)
{
	return (0);
}
/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
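
/*
 * Example (illustrative): flipping a flag inside PSL_USERCHANGE (such as
 * the carry flag) passes EFL_SECURE(), while changing the IOPL bits does
 * not; CS_SECURE() accepts only selectors with user privilege (SEL_UPL),
 * so a kernel-privilege %cs is rejected.
 */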
int
sys_sigreturn(struct sigreturn_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;
	ucontext_t uc;
	ucontext_t *ucp;
	register_t rflags;
	int cs;
	int error;

	/*
	 * We have to copy the information into kernel space so userland
	 * can't modify it while we are sniffing it.
	 */
	regs = lp->lwp_md.md_regs;
	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error)
		return (error);
	ucp = &uc;
	rflags = ucp->uc_mcontext.mc_rflags;

	/* VM (8086) mode not supported */
	rflags &= ~PSL_VM_UNSUPP;
#if JG
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(lp, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = tf->tf_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
		tf->tf_gs = _udatasel;
	} else
#endif
	{
		/*
		 * Don't allow users to change privileged or reserved flags.
		 *
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
			kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			kprintf("sigreturn: cs = 0x%x\n", cs);
			trapsignal(lp, SIGBUS, T_PROTFLT);
			return (EINVAL);
		}
		bcopy(&ucp->uc_mcontext.mc_rdi, regs,
		    sizeof(struct trapframe));
	}

	/*
	 * Restore the FPU state from the frame
	 */
	npxpop(&ucp->uc_mcontext);

	if (ucp->uc_mcontext.mc_onstack & 1)
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	else
		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

	lp->lwp_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(lp->lwp_sigmask);
	return (EJUSTRETURN);
}
/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Shutdown the CPU as much as possible
 */
void
cpu_halt(void)
{
	for (;;)
		__asm__ __volatile("hlt");
}
/*
 * cpu_idle() represents the idle LWKT.  You cannot return from this function
 * (unless you want to blow things up!).  Instead we look for runnable threads
 * and loop or halt as appropriate.  Giant is not held on entry to the thread.
 *
 * The main loop is entered with a critical section held, we must release
 * the critical section before doing anything else.  lwkt_switch() will
 * check for pending interrupts due to entering and exiting its own
 * critical section.
 *
 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
 *	 However, there are cases where the idlethread will be entered with
 *	 the possibility that no IPI will occur and in such cases
 *	 lwkt_switch() sets TDF_IDLE_NOHLT.
 *
 * NOTE: cpu_idle_repeat determines how many entries into the idle thread
 *	 must occur before it starts using ACPI halt.
 *
 * NOTE: Value overridden in hammer_time().
 */
static int	cpu_idle_hlt = 2;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW,
    &cpu_idle_repeat, 0, "Idle entries before acpi hlt");

SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts");
SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins");
static void
cpu_idle_default_hook(void)
{
	/*
	 * We must guarantee that hlt is exactly the instruction
	 * following the sti.
	 */
	__asm __volatile("sti; hlt");
}

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;
static __inline int
cpu_mwait_cx_hint(struct cpu_idle_stat *stat)
{
	int hint, cx_idx;
	u_int idx;

	hint = stat->hint;
	if (hint >= 0)
		goto done;

	idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >>
	    cpu_mwait_repeat_shift;
	if (idx >= cpu_mwait_c1_hints_cnt) {
		/* Step up faster, once we walked through all C1 states */
		stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1);
	}
	if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		if (idx >= cpu_mwait_deep_hints_cnt)
			idx = cpu_mwait_deep_hints_cnt - 1;
		hint = cpu_mwait_deep_hints[idx];
	} else {
		if (idx >= cpu_mwait_hints_cnt)
			idx = cpu_mwait_hints_cnt - 1;
		hint = cpu_mwait_hints[idx];
	}
done:
	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX)
		stat->mwait_cx[cx_idx]++;
	return hint;
}
void
cpu_idle(void)
{
	globaldata_t gd = mycpu;
	struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid];
	struct thread *td __debugvar = gd->gd_curthread;
	int reqflags;
	int quick;

	stat->repeat = stat->repeat_last = cpu_idle_repeat_max;

	crit_exit();
	KKASSERT(td->td_critcount == 0);

	for (;;) {
		/*
		 * See if there are any LWKTs ready to go.
		 */
		lwkt_switch();
		/*
		 * When halting inside a cli we must check for reqflags
		 * races, particularly [re]schedule requests.  Running
		 * splz() does the job.
		 *
		 * cpu_idle_hlt:
		 *	0	Never halt, just spin
		 *
		 *	1	Always use HLT (or MONITOR/MWAIT if avail).
		 *
		 *		Better default for modern (Haswell+) Intel
		 *		cpus.
		 *
		 *	2	Use HLT/MONITOR/MWAIT up to a point and then
		 *		use the ACPI halt (default).  This is a hybrid
		 *		approach.  See machdep.cpu_idle_repeat.
		 *
		 *		Better default for modern AMD cpus and older
		 *		Intel cpus.
		 *
		 *	3	Always use the ACPI halt.  This typically
		 *		eats the least amount of power but the cpu
		 *		will be slow waking up.  Slows down e.g.
		 *		compiles and other pipe/event oriented stuff.
		 *
		 * NOTE: Interrupts are enabled and we are not in a critical
		 *	 section.
		 *
		 * NOTE: Preemptions do not reset gd_idle_repeat.  Also we
		 *	 don't bother capping gd_idle_repeat, it is ok if
		 *	 it overflows.
		 *
		 * Implement optimized invltlb operations when halted
		 * in idle.  By setting the bit in smp_idleinvl_mask
		 * we inform other cpus that they can set _reqs to
		 * request an invltlb.  Currently the code that does so
		 * sets the bits in _reqs anyway, but then checks _mask
		 * to determine if it can assume the invltlb will execute.
		 *
		 * A critical section is required to ensure that interrupts
		 * do not fully run until after we've had a chance to execute
		 * the request.
		 */
->gd_idle_repeat
== 0) {
1202 stat
->repeat
= (stat
->repeat
+ stat
->repeat_last
) >> 1;
1203 if (stat
->repeat
> cpu_idle_repeat_max
)
1204 stat
->repeat
= cpu_idle_repeat_max
;
1205 stat
->repeat_last
= 0;
1206 stat
->repeat_delta
= 0;
1208 ++stat
->repeat_last
;
1210 ++gd
->gd_idle_repeat
;
1211 reqflags
= gd
->gd_reqflags
;
1212 quick
= (cpu_idle_hlt
== 1) ||
1213 (cpu_idle_hlt
< 3 &&
1214 gd
->gd_idle_repeat
< cpu_idle_repeat
);
1216 if (quick
&& (cpu_mi_feature
& CPU_MI_MONITOR
) &&
1217 (reqflags
& RQF_IDLECHECK_WK_MASK
) == 0) {
1220 ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask
, gd
->gd_cpuid
);
1221 cpu_mmw_pause_int(&gd
->gd_reqflags
, reqflags
,
1222 cpu_mwait_cx_hint(stat
), 0);
1224 ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask
, gd
->gd_cpuid
);
1225 if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs
,
1230 } else if (cpu_idle_hlt
) {
1231 __asm
__volatile("cli");
1234 ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask
, gd
->gd_cpuid
);
1235 if ((gd
->gd_reqflags
& RQF_IDLECHECK_WK_MASK
) == 0) {
1237 cpu_idle_default_hook();
1241 __asm
__volatile("sti");
1243 ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask
, gd
->gd_cpuid
);
1244 if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs
,
1251 __asm
__volatile("sti");
/*
 * This routine is called if a spinlock has been held through the
 * exponential backoff period and is seriously contested.  On a real cpu
 * we let it spin.
 */
void
cpu_spinlock_contested(void)
{
	cpu_pause();
}
/*
 * Clear registers on exec
 */
void
exec_setregs(u_long entry, u_long stack, u_long ps_strings)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct pcb *pcb = td->td_pcb;
	struct trapframe *regs = lp->lwp_md.md_regs;

	/* was i386_user_cleanup() in NetBSD */
	user_ldt_free(pcb);

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = entry;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_rbx = ps_strings;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0; /* JG set bit 10? */
		if (pcb == td->td_pcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}

	/*
	 * Initialize the math emulator (if any) for the current process.
	 * Actually, just clear the bit that says that the emulator has
	 * been initialized.  Initialization is delayed until the process
	 * traps to the emulator (if it is done at all) mainly because
	 * emulators don't provide an entry point for initialization.
	 */
	pcb->pcb_flags &= ~FP_SOFTFP;

	/*
	 * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
	 *	 gd_npxthread.  Otherwise a preemptive interrupt thread
	 *	 may panic in npxdna().
	 */
	crit_enter();
	load_cr0(rcr0() | CR0_MP);

	/*
	 * NOTE: The MSR values must be correct so we can return to
	 *	 userland.  gd_user_fs/gs must be correct so the switch
	 *	 code knows what the current MSR values are.
	 */
	pcb->pcb_fsbase = 0;	/* Values loaded from PCB on switch */
	pcb->pcb_gsbase = 0;
	mdcpu->gd_user_fs = 0;	/* Cache of current MSR values */
	mdcpu->gd_user_gs = 0;
	wrmsr(MSR_FSBASE, 0);	/* Set MSR values for return to userland */
	wrmsr(MSR_KGSBASE, 0);

	/* Initialize the npx (if any) for the current process. */
	npxinit();
	crit_exit();

	pcb->pcb_ds = _udatasel;
	pcb->pcb_es = _udatasel;
	pcb->pcb_fs = _udatasel;
	pcb->pcb_gs = _udatasel;
}
static void
cpu_setregs(void)
{
	u_long cr0;

	cr0 = rcr0();
	cr0 |= CR0_NE;		/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;	/* Done at every execve() too. */
	cr0 |= CR0_WP | CR0_AM;
	load_cr0(cr0);
	load_gs(_udatasel);
}
static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
	    req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
    &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");
SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
    CTLFLAG_RW, &disable_rtc_set, 0, "");

SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
    CTLFLAG_RD, &bootinfo, bootinfo, "");

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
    CTLFLAG_RW, &wall_cmos_clock, 0, "");

extern u_long bootdev;		/* not a cdev_t - encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
    CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)");
/*
 * Initialize x86_64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
struct user_segment_descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
struct gate_descriptor idt_arr[MAXCPU][NIDT];

union descriptor ldt[NLDT];		/* local descriptor table */

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt;
struct region_descriptor r_idt_arr[MAXCPU];

/* JG proc0paddr is a virtual address */
void *proc0paddr;
char proc0paddr_buff[LWKT_THREAD_STACK];
/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
	/* GNULL_SEL	0 Null Descriptor */
	{	0x0,			/* segment base address  */
		0x0,			/* length */
		0,			/* segment type */
		0,			/* segment descriptor priority level */
		0,			/* segment descriptor present */
		0,			/* long mode */
		0,			/* default 32 vs 16 bit size */
		0			/* limit granularity (byte/page units)*/ },
	/* GCODE_SEL	1 Code Descriptor for kernel */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMERA,		/* segment type */
		SEL_KPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		1,			/* long mode */
		0,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GDATA_SEL	2 Data Descriptor for kernel */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMRWA,		/* segment type */
		SEL_KPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		1,			/* long mode */
		0,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GUCODE32_SEL	3 32 bit Code Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMERA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long mode */
		1,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMRWA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long mode */
		1,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GUCODE_SEL	5 64 bit Code Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMERA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		1,			/* long mode */
		0,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GPROC0_SEL	6 Proc 0 Tss Descriptor */
	{
		0x0,			/* segment base address */
		sizeof(struct x86_64tss)-1,/* length - all address space */
		SDT_SYSTSS,		/* segment type */
		SEL_KPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long mode */
		0,			/* unused - default 32 vs 16 bit size */
		0			/* limit granularity (byte/page units)*/ },
	/* Actually, the TSS is a system descriptor which is double size */
	{	0x0,			/* segment base address  */
		0x0,			/* length */
		0,			/* segment type */
		0,			/* segment descriptor priority level */
		0,			/* segment descriptor present */
		0,			/* long mode */
		0,			/* default 32 vs 16 bit size */
		0			/* limit granularity (byte/page units)*/ },
	/* GUGS32_SEL	8 32 bit GS Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMRWA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long mode */
		1,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
};
void
setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	int cpu;

	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		struct gate_descriptor *ip = &idt_arr[cpu][idx];

		ip->gd_looffset = (uintptr_t)func;
		ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
		ip->gd_ist = ist;
		ip->gd_xx = 0;
		ip->gd_type = typ;
		ip->gd_dpl = dpl;
		ip->gd_p = 1;
		ip->gd_hioffset = ((uintptr_t)func)>>16 ;
	}
}
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist, int cpu)
{
	struct gate_descriptor *ip;

	KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu %d", cpu));

	ip = &idt_arr[cpu][idx];
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
}
#define	IDTVEC(name)	__CONCAT(X,name)

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
void
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}
void
ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
{
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}
void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}
/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * Memory is aligned to PHYSMAP_ALIGN which must be a multiple
 * of PAGE_SIZE.  This also greatly reduces the memory test time
 * which would otherwise be excessive on machines with > 8G of ram.
 *
 * XXX first should be vm_paddr_t.
 */
#define PHYSMAP_ALIGN		(vm_paddr_t)(128 * 1024)
#define PHYSMAP_ALIGN_MASK	(vm_paddr_t)(PHYSMAP_ALIGN - 1)
vm_paddr_t physmap[PHYSMAP_SIZE];
struct bios_smap *smapbase, *smap, *smapend;
struct efi_map_header *efihdrbase;
u_int32_t smapsize;
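
/*
 * Illustrative sketch (not compiled in): the PHYSMAP_ALIGN rounding used
 * below rounds a chunk base up and a chunk end down to the 128KB
 * alignment, e.g. a base of 0x1234 rounds up to 0x20000.
 */
#if 0
static vm_paddr_t
example_physmap_round_up(vm_paddr_t pa)
{
	return ((pa + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK);
}
#endif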
static void
add_smap_entries(int *physmap_idx)
{
	int i;

	smapsize = *((u_int32_t *)smapbase - 1);
	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (smap->length == 0)
			continue;

		for (i = 0; i <= *physmap_idx; i += 2) {
			if (smap->base < physmap[i + 1]) {
				if (boothowto & RB_VERBOSE) {
					kprintf("Overlapping or non-monotonic "
					    "memory region, ignoring "
					    "second region\n");
				}
				break;
			}
		}
		if (i <= *physmap_idx)
			continue;

		Realmem += smap->length;

		if (smap->base == physmap[*physmap_idx + 1]) {
			physmap[*physmap_idx + 1] += smap->length;
			continue;
		}

		*physmap_idx += 2;
		if (*physmap_idx == PHYSMAP_SIZE) {
			kprintf("Too many segments in the physical "
			    "address map, giving up\n");
			break;
		}
		physmap[*physmap_idx] = smap->base;
		physmap[*physmap_idx + 1] = smap->base + smap->length;
	}
}
#define efi_next_descriptor(ptr, size) \
	((struct efi_md *)(((uint8_t *) ptr) + size))

static void
add_efi_map_entries(int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdrbase + efisz);

	if (efihdrbase->descriptor_size == 0)
		return;
	ndesc = efihdrbase->memory_size / efihdrbase->descriptor_size;

	if (boothowto & RB_VERBOSE)
		kprintf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");
	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdrbase->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type <= EFI_MD_TYPE_PALCODE)
				type = types[p->md_type];
			else
				type = "<INVALID>";
			kprintf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				kprintf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				kprintf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				kprintf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				kprintf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				kprintf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				kprintf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				kprintf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				kprintf("XP ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				kprintf("RUNTIME");
			kprintf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		Realmem += p->md_pages * PAGE_SIZE;

		if (p->md_phys == physmap[*physmap_idx + 1]) {
			physmap[*physmap_idx + 1] += p->md_pages * PAGE_SIZE;
			continue;
		}

		*physmap_idx += 2;
		if (*physmap_idx == PHYSMAP_SIZE) {
			kprintf("Too many segments in the physical "
			    "address map, giving up\n");
			break;
		}
		physmap[*physmap_idx] = p->md_phys;
		physmap[*physmap_idx + 1] = p->md_phys +
		    p->md_pages * PAGE_SIZE;
	}
}
struct fb_info efi_fb_info;
static int have_efi_framebuffer = 0;

static void
efi_fb_init_vaddr(int direct_map)
{
	uint64_t sz;
	vm_offset_t addr, v;

	v = efi_fb_info.vaddr;
	sz = efi_fb_info.stride * efi_fb_info.height;

	if (direct_map) {
		addr = PHYS_TO_DMAP(efi_fb_info.paddr);
		if (addr >= DMAP_MIN_ADDRESS && addr + sz < DMAP_MAX_ADDRESS)
			efi_fb_info.vaddr = addr;
	} else {
		efi_fb_info.vaddr = (vm_offset_t)pmap_mapdev_attr(
		    efi_fb_info.paddr, sz, PAT_WRITE_COMBINING);
	}

	if (v == 0 && efi_fb_info.vaddr != 0)
		memset((void *)efi_fb_info.vaddr, 0x77, sz);
}
int
probe_efi_fb(int early)
{
	struct efi_fb	*efifb;
	caddr_t		kmdp;

	if (have_efi_framebuffer) {
		if (!early &&
		    (efi_fb_info.vaddr == 0 ||
		     efi_fb_info.vaddr == PHYS_TO_DMAP(efi_fb_info.paddr)))
			efi_fb_init_vaddr(0);
		return 0;
	}

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efifb = (struct efi_fb *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_FB);
	if (efifb == NULL)
		return ENODEV;

	have_efi_framebuffer = 1;

	efi_fb_info.is_vga_boot_display = 1;
	efi_fb_info.width = efifb->fb_width;
	efi_fb_info.height = efifb->fb_height;
	efi_fb_info.stride = efifb->fb_stride * 4;
	efi_fb_info.depth = 32;
	efi_fb_info.paddr = efifb->fb_addr;
	if (early) {
		efi_fb_info.vaddr = 0;
	} else {
		efi_fb_init_vaddr(0);
	}
	efi_fb_info.restore = NULL;
	efi_fb_info.device = NULL;

	return 0;
}
static void
efifb_startup(void *arg)
{
	probe_efi_fb(0);
}

SYSINIT(efi_fb_info, SI_BOOT1_POST, SI_ORDER_FIRST, efifb_startup, NULL);
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int off, physmap_idx, pa_indx, da_indx;
	int i, j;
	vm_paddr_t pa;
	vm_paddr_t msgbuf_size;
	u_long physmem_tunable;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	u_int base_memory;

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	/*
	 * get memory map from INT 15:E820, kindly supplied by the loader.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */
	efihdrbase = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL && efihdrbase == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdrbase == NULL)
		add_smap_entries(&physmap_idx);
	else
		add_efi_map_entries(&physmap_idx);
	base_memory = physmap[1] / 1024;
	/* make hole for AP bootstrap code */
	physmap[1] = mp_bootaddress(base_memory);

	/* Save EBDA address, if any */
	ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e));
	ebda_addr <<= 4;
	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	/*
	 * Blowing out the DMAP will blow up the system.
	 */
	if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) {
		kprintf("Limiting Maxmem due to DMAP size\n");
		Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS);
	}

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE)) {
		kprintf("Physical memory use set to %ldK\n", Maxmem * 4);
	}
1931 pmap_bootstrap(&first
);
1932 physmap
[0] = PAGE_SIZE
;
1935 * Align the physmap to PHYSMAP_ALIGN and cut out anything
1938 for (i
= j
= 0; i
<= physmap_idx
; i
+= 2) {
1939 if (physmap
[i
+1] > ptoa(Maxmem
))
1940 physmap
[i
+1] = ptoa(Maxmem
);
1941 physmap
[i
] = (physmap
[i
] + PHYSMAP_ALIGN_MASK
) &
1942 ~PHYSMAP_ALIGN_MASK
;
1943 physmap
[i
+1] = physmap
[i
+1] & ~PHYSMAP_ALIGN_MASK
;
1945 physmap
[j
] = physmap
[i
];
1946 physmap
[j
+1] = physmap
[i
+1];
1948 if (physmap
[i
] < physmap
[i
+1])
1951 physmap_idx
= j
- 2;
1954 * Align anything else used in the validation loop.
1956 first
= (first
+ PHYSMAP_ALIGN_MASK
) & ~PHYSMAP_ALIGN_MASK
;
	/*
	 * Size up each available chunk of physical memory.
	 */
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    kgetenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;
= 0; i
<= physmap_idx
; i
+= 2) {
1983 end
= physmap
[i
+ 1];
1985 for (pa
= physmap
[i
]; pa
< end
; pa
+= PHYSMAP_ALIGN
) {
1986 int tmp
, page_bad
, full
;
1987 int *ptr
= (int *)CADDR1
;
1991 * block out kernel memory as not available.
1993 if (pa
>= 0x200000 && pa
< first
)
1997 * block out dcons buffer
2000 && pa
>= trunc_page(dcons_addr
)
2001 && pa
< dcons_addr
+ dcons_size
) {
2008 * map page into kernel: valid, read/write,non-cacheable
2011 kernel_pmap
.pmap_bits
[PG_V_IDX
] |
2012 kernel_pmap
.pmap_bits
[PG_RW_IDX
] |
2013 kernel_pmap
.pmap_bits
[PG_N_IDX
];
2018 * Test for alternating 1's and 0's
2020 *(volatile int *)ptr
= 0xaaaaaaaa;
2022 if (*(volatile int *)ptr
!= 0xaaaaaaaa)
2025 * Test for alternating 0's and 1's
2027 *(volatile int *)ptr
= 0x55555555;
2029 if (*(volatile int *)ptr
!= 0x55555555)
2034 *(volatile int *)ptr
= 0xffffffff;
2036 if (*(volatile int *)ptr
!= 0xffffffff)
2041 *(volatile int *)ptr
= 0x0;
2043 if (*(volatile int *)ptr
!= 0x0)
2046 * Restore original value.
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;

			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we are at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PHYSMAP_ALIGN;
			} else {
				++pa_indx;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					kprintf(
		"Too many holes in the physical address space, giving up\n");
					--pa_indx;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;
				phys_avail[pa_indx] = pa + PHYSMAP_ALIGN;
			}
			physmem += PHYSMAP_ALIGN / PAGE_SIZE;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PHYSMAP_ALIGN;
			} else {
				++da_indx;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					--da_indx;
					goto do_next;
				}
				dump_avail[da_indx++] = pa;
				dump_avail[da_indx] = pa + PHYSMAP_ALIGN;
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	cpu_invltlb();
	cpu_mfence();
2103 * buffer to avoid complicating other code (message buffer address
2104 * calculation, etc.).
2106 msgbuf_size
= (MSGBUF_SIZE
+ PHYSMAP_ALIGN_MASK
) & ~PHYSMAP_ALIGN_MASK
;
2108 while (phys_avail
[pa_indx
- 1] + PHYSMAP_ALIGN
+
2109 msgbuf_size
>= phys_avail
[pa_indx
]) {
2110 physmem
-= atop(phys_avail
[pa_indx
] - phys_avail
[pa_indx
- 1]);
2111 phys_avail
[pa_indx
--] = 0;
2112 phys_avail
[pa_indx
--] = 0;
2115 Maxmem
= atop(phys_avail
[pa_indx
]);
2117 /* Trim off space for the message buffer. */
2118 phys_avail
[pa_indx
] -= msgbuf_size
;
2120 avail_end
= phys_avail
[pa_indx
];
2122 /* Map the message buffer. */
2123 for (off
= 0; off
< msgbuf_size
; off
+= PAGE_SIZE
) {
2124 pmap_kenter((vm_offset_t
)msgbufp
+ off
,
2125 phys_avail
[pa_indx
] + off
);
2127 /* Try to get EFI framebuffer working as early as possible */
2128 if (have_efi_framebuffer
)
2129 efi_fb_init_vaddr(1);
struct machintr_abi MachIntrABI;

/*
 * IDT VECTORS:
 *	0	Divide by zero
 *	1	Debug
 *	2	NMI
 *	3	BreakPoint
 *	4	OverFlow
 *	5	Bound-Range
 *	6	Invalid OpCode
 *	7	Device Not Available (x87)
 *	8	Double-Fault
 *	9	Coprocessor Segment overrun (unsupported, reserved)
 *	10	Invalid-TSS
 *	11	Segment not present
 *	12	Stack
 *	13	General Protection
 *	14	Page Fault
 *	15	Reserved
 *	16	x87 FP Exception pending
 *	17	Alignment Check
 *	18	Machine Check
 *	19	SIMD floating point
 *	20-31	reserved
 *	32-255	INTn/external sources
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x, cpu;
	int metadata_missing, off;
	struct mdglobaldata *gd;
	u_int64_t msr;

	/*
	 * Prevent lowering of the ipl if we call tsleep() early.
	 */
	gd = &CPU_prvspace[0]->mdglobaldata;
	bzero(gd, sizeof(*gd));

	/*
	 * Note: on both UP and SMP curthread must be set non-NULL
	 * early in the boot sequence because the system assumes
	 * that 'curthread' is never NULL.
	 */
	gd->mi.gd_curthread = &thread0;
	thread0.td_gd = &gd->mi;

	atdevbase = ISA_HOLE_START + PTOV_OFFSET;
	metadata_missing = 0;
#if JG
	if (bootinfo.bi_modulep) {
		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}
	if (bootinfo.bi_envp)
		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
	preload_bootstrap_relocate(PTOV_OFFSET);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
#endif
	if (boothowto & RB_VERBOSE)
		bootverbose++;

	/*
	 * Default MachIntrABI to ICU
	 */
	MachIntrABI = MachIntrABI_ICU;
	/*
	 * start with one cpu.  Note: with one cpu, ncpus2_shift,
	 * ncpus2_mask, and ncpus_fit_mask remain 0.
	 */
	ncpus = 1;
	ncpus2 = 1;
	ncpus_fit = 1;
	/* Init basic tunables, hz etc */
	init_param1();

	/*
	 * make gdt memory segments
	 */
	gdt_segs[GPROC0_SEL].ssd_base =
		(uintptr_t) &CPU_prvspace[0]->mdglobaldata.gd_common_tss;

	gd->mi.gd_prvspace = CPU_prvspace[0];
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	mi_gdinit(&gd->mi, 0);
	cpu_gdinit(gd, 0);
	proc0paddr = proc0paddr_buff;
	mi_proc0init(&gd->mi, proc0paddr);
	safepri = TDPRI_MAX;
	/* spinlocks and the BGL */
	init_locks();

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt_global(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
	setidt_global(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
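	/*
	 * The final argument to setidt_global() is the IST index.  NMI
	 * and double-fault pass 1, selecting the dedicated tss_ist1
	 * stack configured below, so those events always get a known-good
	 * stack even if %rsp was bogus at the time of the exception.
	 * Breakpoints (IDT_BP) use SEL_UPL so userland int3 instructions
	 * are allowed through the gate.
	 */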
	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
		r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
	}

	lidt(&r_idt_arr[0]);
	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

	if (metadata_missing)
		kprintf("WARNING: loader(8) metadata is missing!\n");
	/*
	 * Initialize IRQ mapping
	 *
	 * NOTE:
	 * SHOULD be after elcr_probe()
	 */
	MachIntrABI_ICU.initmap();
	MachIntrABI_IOAPIC.initmap();
	if (boothowto & RB_KDB)
		Debugger("Boot flags requested debugger");
#if JG
	finishidentcpu();	/* Final stage of CPU initialization */
	setidt(6, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(13, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
#endif
	identify_cpu();		/* Final stage of CPU initialization */
	initializecpu(0);	/* Initialize CPU registers */
	/*
	 * On modern Intel cpus, Haswell or later, cpu_idle_hlt=1 is better
	 * because the cpu does significant power management in MWAIT
	 * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
	 *
	 * On modern AMD cpus cpu_idle_hlt=3 is better, because the cpu does
	 * significant power management in HLT or ACPI (but cpu_idle_hlt=1
	 * would try to use MWAIT).
	 *
	 * On older AMD or Intel cpus, cpu_idle_hlt=2 is better because ACPI
	 * is needed to reduce power consumption, but wakeup times are often
	 * longer.
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    CPUID_TO_MODEL(cpu_id) >= 0x3C) {	/* Haswell or later */
		cpu_idle_hlt = 1;
	}
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    CPUID_TO_FAMILY(cpu_id) >= 0x14) {	/* Bobcat or later */
		cpu_idle_hlt = 3;
	}
	TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
	TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
	TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
	TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);
	/*
	 * Some of the virtual machines do not work w/ I/O APIC
	 * enabled.  If the user does not explicitly enable or
	 * disable the I/O APIC (ioapic_enable < 0), then we
	 * disable I/O APIC on all virtual machines.
	 *
	 * NOTE:
	 * This must be done after identify_cpu(), which sets
	 * 'cpu_feature2'.
	 */
	if (ioapic_enable < 0) {
		if (cpu_feature2 & CPUID2_VMM)
			ioapic_enable = 0;
		else
			ioapic_enable = 1;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	gd->gd_common_tss.tss_rsp0 =
		(register_t)(thread0.td_kstack +
			     KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb));
	/* Ensure the stack is aligned to 16 bytes */
	gd->gd_common_tss.tss_rsp0 &= ~(register_t)0xF;

	/* double fault stack */
	gd->gd_common_tss.tss_ist1 =
		(long)&gd->mi.gd_prvspace->idlestack[
			sizeof(gd->mi.gd_prvspace->idlestack)];

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	gd->gd_common_tss.tss_iobase = sizeof(struct x86_64tss);
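	/*
	 * Pointing tss_iobase at sizeof(struct x86_64tss) places the I/O
	 * permission bitmap past the TSS segment limit, which is equivalent
	 * to having no bitmap at all: every userland IN/OUT instruction
	 * faults unless IOPL allows it.
	 */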
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gd->gd_tss_gdt = &gdt[GPROC0_SEL];
	gd->gd_common_tssd = *gd->gd_tss_gdt;
	ltr(gsel_tss);
	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL);
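	/*
	 * MSR_STAR follows the AMD64 SYSCALL/SYSRET convention: bits 47:32
	 * give the kernel CS (SS is loaded from that value plus 8), and
	 * bits 63:48 give the 32-bit user CS used by SYSRET.  The 64-bit
	 * user CS is implicitly that selector plus 16, which is why
	 * GUDATA_SEL and GUCODE_SEL must immediately follow GUCODE32_SEL
	 * in the GDT.  MSR_SF_MASK lists the rflags bits cleared on
	 * SYSCALL entry; clearing PSL_I keeps interrupts off until the
	 * kernel has switched to a proper stack.
	 */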
	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

	/* Map the message buffer. */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);

	msgbufinit(msgbufp, MSGBUF_SIZE);
	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_pcb->pcb_cr3 = KPML4phys;
	thread0.td_pcb->pcb_ext = NULL;
	lwp0.lwp_md.md_regs = &proc0_tf;	/* XXX needed? */

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}
/*
 * Initialize machine-dependent portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
 */
void
cpu_gdinit(struct mdglobaldata *gd, int cpu)
{
	if (cpu)
		gd->mi.gd_curthread = &gd->mi.gd_idlethread;

	lwkt_init_thread(&gd->mi.gd_idlethread,
			 gd->mi.gd_prvspace->idlestack,
			 sizeof(gd->mi.gd_prvspace->idlestack),
			 0, &gd->mi);
	lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
	gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
	gd->mi.gd_idlethread.td_sp -= sizeof(void *);
	*(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
}
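/*
 * The td_sp manipulation above seeds the idle thread's stack with a
 * single "return address": the first time cpu_lwkt_switch() switches to
 * the idle thread it pops cpu_idle_restore, which then falls through
 * into the idle loop proper.
 */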
/*
 * We only have to check for DMAP bounds, the globaldata space is
 * actually part of the kernel_map so we don't have to waste time
 * checking CPU_prvspace[*].
 */
int
is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
{
#if 0
	if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
	    eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
		return (TRUE);
	}
#endif
	if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
		return (TRUE);
	return (FALSE);
}
struct globaldata *
globaldata_find(int cpu)
{
	KKASSERT(cpu >= 0 && cpu < ncpus);
	return (&CPU_prvspace[cpu]->mdglobaldata.mi);
}
/*
 * This path should be safe from the SYSRET issue because only stopped
 * threads can have their %rip adjusted this way (and all heavy weight
 * thread switches clear QUICKREF and thus do not use SYSRET).  However,
 * the code path is convoluted so add a safety by forcing %rip to be
 * canonical.
 */
int
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
	if (addr & 0x0000800000000000LLU)
		lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
	else
		lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
	return (0);
}
int
ptrace_single_step(struct lwp *lp)
{
	lp->lwp_md.md_regs->tf_rflags |= PSL_T;
	return (0);
}
int
fill_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	if ((tp = lp->lwp_md.md_regs) == NULL)
		return (EINVAL);
	bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
	return (0);
}
int
set_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
	return (0);
}
static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}
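/*
 * The field-by-field copy is needed because struct env87 and struct
 * envxmm lay out the control/status area differently, and because
 * fxsave keeps each 80-bit ST register in a padded 16-byte slot
 * (sv_fp[i].fp_acc) while the legacy save87 area packs them
 * contiguously (sv_ac[i]).
 */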
static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}
int
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
	if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
		return (EINVAL);
	if (cpu_fxsr) {
		fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
				(struct save87 *)fpregs);
		return (0);
	}
	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
	return (0);
}
int
set_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
	if (cpu_fxsr) {
		set_fpregs_xmm((struct save87 *)fpregs,
			       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
		return (0);
	}
	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
	return (0);
}
int
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (lp == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[4] = rdr4();
		dbregs->dr[5] = rdr5();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
		return (0);
	}
	if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL)
		return (EINVAL);
	dbregs->dr[0] = pcb->pcb_dr0;
	dbregs->dr[1] = pcb->pcb_dr1;
	dbregs->dr[2] = pcb->pcb_dr2;
	dbregs->dr[3] = pcb->pcb_dr3;
	dbregs->dr[4] = 0;
	dbregs->dr[5] = 0;
	dbregs->dr[6] = pcb->pcb_dr6;
	dbregs->dr[7] = pcb->pcb_dr7;
	return (0);
}
int
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	if (lp == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr4(dbregs->dr[4]);
		load_dr5(dbregs->dr[5]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		struct pcb *pcb;
		struct ucred *ucred;
		int i;
		uint64_t mask1, mask2;
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected or unanticipated trap.
		 */
		/* JG this loop looks unreadable */
		/*
		 * Check 4 2-bit fields for invalid patterns.
		 * These fields are R/Wi, for i = 0..3
		 */
		/* Is 10 in LENi allowed when running in compatibility mode? */
		/*
		 * Pattern 10 in R/Wi might be used to indicate a
		 * breakpoint on I/O.  Further analysis should be
		 * carried out to decide if it is safe and useful to
		 * provide access to that capability.
		 */
		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
		     i++, mask1 <<= 4, mask2 <<= 4)
			if ((dbregs->dr[7] & mask1) == mask2)
				return (EINVAL);
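		/*
		 * dr7 keeps a 4-bit control group per breakpoint in bits
		 * 16-31: a 2-bit R/Wi field followed by a 2-bit LENi
		 * field.  mask1 (0x3) selects an R/Wi field, mask2 (0x2)
		 * matches the undefined 10b pattern, and the <<= 4 stride
		 * steps over the adjoining LENi bits to the next
		 * breakpoint's field.
		 */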
		pcb = lp->lwp_thread->td_pcb;
		ucred = lp->lwp_proc->p_ucred;
		/*
		 * Don't let a process set a breakpoint that is not within
		 * the process's address space.  If a process could do this,
		 * it could halt the system by setting a breakpoint in the
		 * kernel (if ddb was enabled).  Thus, we need to check to
		 * make sure that no breakpoints are being enabled for
		 * addresses outside the process's address space, unless,
		 * perhaps, we were called by kern/ptrace.c.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */
		if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
			if (dbregs->dr[7] & 0x3) {
				/* dr0 is enabled */
				if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<2)) {
				/* dr1 is enabled */
				if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<4)) {
				/* dr2 is enabled */
				if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<6)) {
				/* dr3 is enabled */
				if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
		}
		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		pcb->pcb_flags |= PCB_DBREGS;
	}

	return (0);
}
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int64_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0xff) == 0) {
		/*
		 * all GE and LE bits in the dr7 register are zero,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers
		 */
		return 0;
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0x0000000f;

	if (bp == 0) {
		/*
		 * None of the breakpoint bits are set, meaning this
		 * trap was not caused by any of the debug registers
		 */
		return 0;
	}

	/*
	 * at least one of the breakpoints was hit, check to see
	 * which ones and if any of them are user space addresses
	 */

	if (bp & 0x01)
		addr[nbp++] = (caddr_t)rdr0();
	if (bp & 0x02)
		addr[nbp++] = (caddr_t)rdr1();
	if (bp & 0x04)
		addr[nbp++] = (caddr_t)rdr2();
	if (bp & 0x08)
		addr[nbp++] = (caddr_t)rdr3();

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}
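/*
 * The low byte of dr7 holds the L0/G0..L3/G3 enable bits, which is why
 * a zero low byte above rules the debug registers out entirely.  dr6's
 * low four bits (B0..B3) record which of dr0-dr3 actually triggered,
 * and each triggering address is compared against VM_MAX_USER_ADDRESS
 * to decide whether the trap belongs to userland.
 */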
void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}
#ifdef DDB

/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called inside DDB.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */

/* silence compiler warnings */
u_char inb(u_int);
void outb(u_int, u_char);

u_char
inb(u_int port)
{
	u_char	data;
	/*
	 * We use %%dx and not %1 here because i/o is done at %dx and not at
	 * %edx, while gcc generates inferior code (movw instead of movl)
	 * if we tell it to load (u_short) port.
	 */
	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
	return (data);
}

void
outb(u_int port, u_char data)
{
	u_char	al;
	/*
	 * Use an unnecessary assignment to help gcc's register allocator.
	 * This makes a large difference for gcc-1.40 and a tiny difference
	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
	 * best results.  gcc-2.6.0 can't handle this.
	 */
	al = data;
	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
}

#endif /* DDB */
/*
 * initialize all the SMP locks
 */

/* critical region when masking or unmasking interrupts */
struct spinlock_deprecated imen_spinlock;

/* lock region used by kernel profiling */
struct spinlock_deprecated mcount_spinlock;

/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct spinlock_deprecated com_spinlock;

/* lock regions around the clock hardware */
struct spinlock_deprecated clock_spinlock;

static void
init_locks(void)
{
	/*
	 * Get the initial mplock with a count of 1 for the BSP.
	 * This uses a LOGICAL cpu ID, ie BSP == 0.
	 */
	cpu_get_initial_mplock();
	/* DEPRECATED */
	spin_init_deprecated(&mcount_spinlock);
	spin_init_deprecated(&imen_spinlock);
	spin_init_deprecated(&com_spinlock);
	spin_init_deprecated(&clock_spinlock);

	/* our token pool needs to work early */
	lwkt_token_pool_init();
}
boolean_t
cpu_mwait_hint_valid(uint32_t hint)
{
	int cx_idx, sub;

	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= CPU_MWAIT_CX_MAX)
		return FALSE;

	sub = MWAIT_EAX_TO_CX_SUB(hint);
	if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return FALSE;

	return TRUE;
}
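/*
 * An MWAIT hint packs the target C-state and its sub-state into one
 * word (Intel documents the C-state in EAX bits 7:4 and the sub-state
 * in bits 3:0); MWAIT_EAX_TO_CX()/MWAIT_EAX_TO_CX_SUB() unpack those
 * fields, and the table lookup validates the sub-state against what
 * CPUID reported for that C-state.
 */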
void
cpu_mwait_cx_no_bmsts(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
}

void
cpu_mwait_cx_no_bmarb(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
}
static int
cpu_mwait_cx_hint2name(int hint, char *name, int namelen,
    boolean_t allow_auto)
{
	int old_cx_idx, sub = 0;

	if (hint >= 0) {
		old_cx_idx = MWAIT_EAX_TO_CX(hint);
		sub = MWAIT_EAX_TO_CX_SUB(hint);
	} else if (hint == CPU_MWAIT_HINT_AUTO) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
	} else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
	} else {
		old_cx_idx = CPU_MWAIT_CX_MAX;
	}

	if (!CPU_MWAIT_HAS_CX)
		strlcpy(name, "NONE", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
		strlcpy(name, "AUTO", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
		strlcpy(name, "AUTODEEP", namelen);
	else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
	    sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
		strlcpy(name, "INVALID", namelen);
	else
		ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);

	return old_cx_idx;
}
static int
cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
{
	int cx_idx, sub, hint;
	char *ptr, *start;

	if (allow_auto && strcmp(name, "AUTO") == 0) {
		hint = CPU_MWAIT_HINT_AUTO;
		cx_idx = CPU_MWAIT_C2;
		goto done;
	}
	if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
		hint = CPU_MWAIT_HINT_AUTODEEP;
		cx_idx = CPU_MWAIT_C3;
		goto done;
	}

	if (strlen(name) < 4 || toupper(name[0]) != 'C')
		return -1;
	start = &name[1];
	ptr = NULL;

	cx_idx = strtol(start, &ptr, 10);
	if (ptr == start || *ptr != '/')
		return -1;
	if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
		return -1;

	start = ptr + 1;
	ptr = NULL;

	sub = strtol(start, &ptr, 10);
	if (ptr == start || *ptr != '\0')
		return -1;
	if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return -1;

	hint = MWAIT_EAX_HINT(cx_idx, sub);
done:
	*hint0 = hint;
	return cx_idx;
}
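/*
 * Accepted spellings are "AUTO", "AUTODEEP", or an explicit
 * "C<state>/<substate>" pair, e.g. "C1/0" or "C3/1"; the two strtol()
 * calls parse the numeric fields on either side of the '/' and both
 * values are validated against cpu_mwait_cx_info[].
 */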
static int
cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
{
	int error;

	if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)
		return EOPNOTSUPP;

	if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
		error = cputimer_intr_powersave_addreq();
		if (error)
			return error;
	} else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
		cputimer_intr_powersave_remreq();
	}
	return 0;
}
static int
cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
    boolean_t allow_auto)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	hint = *hint0;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
	    allow_auto);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error)
		return error;

	*hint0 = hint;
	return 0;
}
static int
cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension"));

	hint = stat->hint;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	strlcpy(name, cx_name, sizeof(name));
	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error)
		return error;

	stat->hint = hint;
	return 0;
}
static int
cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	int hint = cpu_mwait_halt_global;
	int error, cx_idx, cpu;
	char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];

	cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	/* Save name for later per-cpu CX configuration */
	strlcpy(cx_name, name, sizeof(cx_name));

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	/* Change per-cpu CX configuration */
	for (cpu = 0; cpu < ncpus; ++cpu) {
		error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
		if (error)
			return error;
	}

	cpu_mwait_halt_global = hint;
	return 0;
}
static int
cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct cpu_idle_stat *stat = arg1;
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &stat->hint, TRUE);
	return error;
}

static int
cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &cpu_mwait_spin, FALSE);
	return error;
}
/*
 * This manual debugging code is called unconditionally from Xtimer
 * (the per-cpu timer interrupt), whether the current thread is in a
 * critical section or not, and can be useful in tracking down lockups.
 *
 * NOTE: MANUAL DEBUG CODE
 */
static int saveticks[SMP_MAXCPU];
static int savecounts[SMP_MAXCPU];

void
pcpu_timer_always(struct intrframe *frame)
{
	globaldata_t gd = mycpu;
	int cpu = gd->gd_cpuid;
	char buf[64];
	short *gptr;
	int i;

	/* Tick a per-cpu heartbeat character directly in VGA text memory */
	gptr = (short *)0xFFFFFFFF800b8000 + 80 * cpu;
	*gptr = ((*gptr + 1) & 0x00FF) | 0x0700;
	++gptr;

	ksnprintf(buf, sizeof(buf), " %p %16s %d %16s ",
	    (void *)frame->if_rip, gd->gd_curthread->td_comm, ticks,
	    gd->gd_infomsg);
	for (i = 0; buf[i]; ++i) {
		gptr[i] = 0x0700 | (unsigned char)buf[i];
	}

	/* Watchdog: panic if ticks stops advancing on this or another cpu */
	if (saveticks[gd->gd_cpuid] != ticks) {
		saveticks[gd->gd_cpuid] = ticks;
		savecounts[gd->gd_cpuid] = 0;
	}
	++savecounts[gd->gd_cpuid];
	if (savecounts[gd->gd_cpuid] > 2000 && panicstr == NULL) {
		panic("cpu %d panicking on ticks failure",
		      gd->gd_cpuid);
	}
	for (i = 0; i < ncpus; ++i) {
		int delta;

		if (saveticks[i] && panicstr == NULL) {
			delta = saveticks[i] - ticks;
			if (delta < -10 || delta > 10) {
				panic("cpu %d panicking on cpu %d watchdog",
				      cpu, i);
			}
		}
	}
}