/*
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 2008 The DragonFly Project.
 *
 * This code is derived from software contributed to Berkeley by
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
 */
//#include "use_npx.h"

#include "opt_compat.h"
#include "opt_directio.h"
#include "opt_msgbuf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/reboot.h>
#include <sys/msgbuf.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/usched.h>
#include <sys/ctype.h>
#include <sys/serialize.h>
#include <sys/systimer.h>

#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>
#include <sys/mutex2.h>

#include <machine/cpu.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#include <machine/bootinfo.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb_ext.h>		/* pcb.h included via sys/user.h */
#include <machine/globaldata.h>		/* CPU_prvspace */
#include <machine/smp.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <machine/framebuffer.h>

#include <bus/isa/isa_device.h>
#include <machine_base/isa/isa_intr.h>
#include <bus/isa/rtc.h>
#include <sys/random.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>

#include <sys/machintr.h>
#include <machine_base/icu/icu_abi.h>
#include <machine_base/icu/elcr_var.h>
#include <machine_base/apic/lapic.h>
#include <machine_base/apic/ioapic.h>
#include <machine_base/apic/ioapic_abi.h>
#include <machine/mptable.h>

#define PHYSMAP_ENTRIES		10

extern u_int64_t hammer_time(u_int64_t, u_int64_t);
extern void printcpuinfo(void);	/* XXX header file */
extern void identify_cpu(void);
extern void finishidentcpu(void);
extern void panicifcpuunsupported(void);

static void cpu_startup(void *);
static void pic_finish(void *);
static void cpu_finish(void *);

static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */
static void init_locks(void);

extern void pcpu_timer_always(struct intrframe *);

SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL);
SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL);
extern vm_offset_t ksym_start, ksym_end;

struct privatespace CPU_prvspace_bsp __aligned(4096);
struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp };

int	_udatasel, _ucodesel, _ucode32sel;

int64_t tsc_offsets[MAXCPU];
cpumask_t smp_idleinvl_mask;
cpumask_t smp_idleinvl_reqs;

static int cpu_mwait_halt_global;	/* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */
#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
    CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
    CTLFLAG_RD, &tlb_flush_count, 0, "");
#endif
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt,
    CTLFLAG_RD, &cpu_mwait_halt_global, 0, "");
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin, CTLFLAG_RD, &cpu_mwait_spin, 0,
    "monitor/mwait target state");
#define CPU_MWAIT_HAS_CX	\
	((cpu_feature2 & CPUID2_MON) && \
	 (cpu_mwait_feature & CPUID_MWAIT_EXT))

#define CPU_MWAIT_CX_NAMELEN	16

#define CPU_MWAIT_C1		1
#define CPU_MWAIT_C2		2
#define CPU_MWAIT_C3		3
#define CPU_MWAIT_CX_MAX	8

#define CPU_MWAIT_HINT_AUTO	-1	/* C1 and C2 */
#define CPU_MWAIT_HINT_AUTODEEP	-2	/* C3+ */

SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features");
SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings");
struct cpu_mwait_cx {
	int			subcnt;
	char			name[CPU_MWAIT_CX_NAMELEN];
	struct sysctl_ctx_list	sysctl_ctx;
	struct sysctl_oid	*sysctl_tree;
};
static struct cpu_mwait_cx	cpu_mwait_cx_info[CPU_MWAIT_CX_MAX];
static char			cpu_mwait_cx_supported[256];

static int			cpu_mwait_c1_hints_cnt;
static int			cpu_mwait_hints_cnt;
static int			*cpu_mwait_hints;

static int			cpu_mwait_deep_hints_cnt;
static int			*cpu_mwait_deep_hints;
#define CPU_IDLE_REPEAT_DEFAULT	750

static u_int cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT;
static u_long cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT;
static u_int cpu_mwait_repeat_shift = 1;

#define CPU_MWAIT_C3_PREAMBLE_BM_ARB	0x1
#define CPU_MWAIT_C3_PREAMBLE_BM_STS	0x2

static int cpu_mwait_c3_preamble =
    CPU_MWAIT_C3_PREAMBLE_BM_ARB |
    CPU_MWAIT_C3_PREAMBLE_BM_STS;

SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD,
    cpu_mwait_cx_supported, 0, "MWAIT supported C states");
SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD,
    &cpu_mwait_c3_preamble, 0, "C3+ preamble mask");
static int	cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,
static int	cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS);

SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, cpu_mwait_cx_idle_sysctl, "A", "");
SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, cpu_mwait_cx_spin_sysctl, "A", "");
SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW,
    &cpu_mwait_repeat_shift, 0, "");
u_long ebda_addr = 0;

int imcr_present = 0;

int naps = 0;			/* # of Applications processors */

struct mtx dt_lock;		/* lock for GDT and LDT */
static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
	u_long pmem = ctob(physmem);

	int error = sysctl_handle_long(oidp, &pmem, 0, req);

	return (error);
}

SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
    0, 0, sysctl_hw_physmem, "LU",
    "Total system memory in bytes (number of pages * page size)");

static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
	    ctob(physmem - vmstats.v_wire_count), req);

	return (error);
}

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD,
    0, 0, sysctl_hw_usermem, "IU", "");

static int
sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
	    x86_64_btop(avail_end - avail_start), req);

	return (error);
}

SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD,
    0, 0, sysctl_hw_availpages, "I", "");
/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END	(NELEM(phys_avail) - 2)
#define DUMP_AVAIL_ARRAY_END	(NELEM(dump_avail) - 2)

static vm_offset_t buffer_sva, buffer_eva;
vm_offset_t clean_sva, clean_eva;
static vm_offset_t pager_sva, pager_eva;
static struct trapframe proc0_tf;
static void
cpu_startup(void *dummy)
{
	vm_offset_t firstaddr;

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	kprintf("%s", version);
	panicifcpuunsupported();
	kprintf("real memory  = %ju (%ju MB)\n",
		(intmax_t)Realmem,
		(intmax_t)Realmem / 1024 / 1024);
	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	kprintf("Physical memory chunk(s):\n");
	for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
		vm_paddr_t size1 = phys_avail[indx + 1] - phys_avail[indx];

		kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
			(intmax_t)phys_avail[indx],
			(intmax_t)phys_avail[indx + 1] - 1,
			(intmax_t)size1,
			(intmax_t)(size1 / PAGE_SIZE));
	}
	/*
	 * Allocate space for system data structures.
	 * The first available kernel virtual address is in "v".
	 * As pages of kernel virtual memory are allocated, "v" is incremented.
	 * As pages of memory are allocated and cleared,
	 * "firstaddr" is incremented.
	 * An index into the kernel page table corresponding to the
	 * virtual memory address maintained in "v" is kept in "mapaddr".
	 */

	/*
	 * Make two passes.  The first pass calculates how much memory is
	 * needed and allocates it.  The second pass assigns virtual
	 * addresses to the various data structures.
	 */
	v = (caddr_t)firstaddr;

#define	valloc(name, type, num) \
	    (name) = (type *)v; v = (caddr_t)((name)+(num))
#define	valloclim(name, type, num, lim) \
	    (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
	/*
	 * The nominal buffer size (and minimum KVA allocation) is MAXBSIZE.
	 * For the first 64MB of ram nominally allocate sufficient buffers to
	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
	 * buffers to cover 1/20 of our ram over 64MB.  When auto-sizing
	 * the buffer cache we limit the eventual kva reservation to
	 * maxbcache bytes.
	 *
	 * factor represents the 1/4 x ram conversion.
	 */
	long factor = 4 * NBUFCALCSIZE / 1024;
	long kbytes = physmem * (PAGE_SIZE / 1024);

	nbuf += min((kbytes - 4096) / factor, 65536 / factor);
	nbuf += (kbytes - 65536) * 2 / (factor * 5);
	if (maxbcache && nbuf > maxbcache / NBUFCALCSIZE)
		nbuf = maxbcache / NBUFCALCSIZE;
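	/*
	 * Worked example of the auto-sizing above (illustrative only,
	 * assuming NBUFCALCSIZE is 16KB, which is defined elsewhere):
	 * factor = 4 * 16384 / 1024 = 64.  With 8GB of ram
	 * (kbytes = 8388608) the first term contributes
	 * min((8388608 - 4096) / 64, 65536 / 64) = 1024 buffers for the
	 * first 64MB, and the second term contributes
	 * (8388608 - 65536) * 2 / (64 * 5) ~= 52019 buffers for the
	 * remainder, before the kva/physmem/valloc caps below apply.
	 */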
	/*
	 * Do not allow the buffer_map to be more than 1/2 the size of the
	 * kernel_map.
	 */
	if (nbuf > (virtual_end - virtual_start +
		    virtual2_end - virtual2_start) / (MAXBSIZE * 2)) {
		nbuf = (virtual_end - virtual_start +
			virtual2_end - virtual2_start) / (MAXBSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf);
	}

	/*
	 * Do not allow the buffer_map to use more than 50% of available
	 * physical-equivalent memory.  Since the VM pages which back
	 * individual buffers are typically wired, having too many bufs
	 * can prevent the system from paging properly.
	 */
	if (nbuf > physmem * PAGE_SIZE / (NBUFCALCSIZE * 2)) {
		nbuf = physmem * PAGE_SIZE / (NBUFCALCSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf);
	}

	/*
	 * Do not allow the sizeof(struct buf) * nbuf to exceed half of
	 * the valloc space which is just the virtual_end - virtual_start
	 * section.  We use valloc() to allocate the buf header array.
	 */
	if (nbuf > (virtual_end - virtual_start) / sizeof(struct buf) / 2) {
		nbuf = (virtual_end - virtual_start) /
		       sizeof(struct buf) / 2;
		kprintf("Warning: nbufs capped at %ld due to valloc "
			"considerations\n", nbuf);
	}
	nswbuf_mem = lmax(lmin(nbuf / 32, 512), 8);
	if (nswbuf_mem < NSWBUF_MIN)
		nswbuf_mem = NSWBUF_MIN;
	nswbuf_kva = lmax(lmin(nbuf / 4, 512), 16);
	if (nswbuf_kva < NSWBUF_MIN)
		nswbuf_kva = NSWBUF_MIN;

	valloc(swbuf_mem, struct buf, nswbuf_mem);
	valloc(swbuf_kva, struct buf, nswbuf_kva);
	valloc(buf, struct buf, nbuf);
	/*
	 * End of first pass, size has been calculated so allocate memory
	 */
	if (firstaddr == 0) {
		size = (vm_size_t)(v - firstaddr);
		firstaddr = kmem_alloc(&kernel_map, round_page(size));
		if (firstaddr == 0)
			panic("startup: no room for tables");
	}

	/*
	 * End of second pass, addresses have been assigned
	 *
	 * nbuf is an int, make sure we don't overflow the field.
	 *
	 * On 64-bit systems we always reserve maximal allocations for
	 * buffer cache buffers and there are no fragmentation issues,
	 * so the KVA segment does not have to be excessively oversized.
	 */
	if ((vm_size_t)(v - firstaddr) != size)
		panic("startup: table size inconsistency");
	kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE) +
		      ((nswbuf_mem + nswbuf_kva) * MAXPHYS) + pager_map_size);
	kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE));
	buffer_map.system_map = 1;
	kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva,
		      ((vm_offset_t)(nswbuf_mem + nswbuf_kva) * MAXPHYS) +
		      pager_map_size);
	pager_map.system_map = 1;
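	/*
	 * Submap layout at this point: kernel_map contains clean_map, and
	 * clean_map is carved into buffer_map ((nbuf + 16) buffers of
	 * MAXBSIZE each) and pager_map ((nswbuf_mem + nswbuf_kva) pbufs of
	 * MAXPHYS each plus pager_map_size), matching the sizes passed to
	 * kmem_suballoc() above.
	 */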
486 kprintf("avail memory = %ju (%ju MB)\n",
487 (uintmax_t)ptoa(vmstats
.v_free_count
+ vmstats
.v_dma_pages
),
488 (uintmax_t)ptoa(vmstats
.v_free_count
+ vmstats
.v_dma_pages
) /
struct cpu_idle_stat {
	u_long	halt;
	u_long	spin;
	u_long	repeat;
	u_long	repeat_last;
	u_long	repeat_delta;
	u_long	mwait_cx[CPU_MWAIT_CX_MAX];
};

#define CPU_IDLE_STAT_HALT	-1
#define CPU_IDLE_STAT_SPIN	-2

static struct cpu_idle_stat	cpu_idle_stats[MAXCPU];
static int
sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)
{
	int idx = arg2, cpu, error;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].halt;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].spin;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].mwait_cx[idx];
	}

	error = sysctl_handle_quad(oidp, &val, 0, req);
	if (error || req->newptr == NULL)
		return error;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].halt = 0;
		cpu_idle_stats[0].halt = val;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].spin = 0;
		cpu_idle_stats[0].spin = val;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].mwait_cx[idx] = 0;
		cpu_idle_stats[0].mwait_cx[idx] = val;
	}
	return 0;
}
static void
cpu_mwait_attach(void)
{
	if (!CPU_MWAIT_HAS_CX)
		return;

	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    (CPUID_TO_FAMILY(cpu_id) > 0xf ||
	     (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
	      CPUID_TO_MODEL(cpu_id) >= 0xf))) {
		/*
		 * Pentium dual-core, Core 2 and beyond do not need any
		 * additional activities to enter deep C-state, i.e. C3(+).
		 */
		cpu_mwait_cx_no_bmarb();

		TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts);
		cpu_mwait_cx_no_bmsts();
	}

	sbuf_new(&sb, cpu_mwait_cx_supported,
	    sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN);

	for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) {
		struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i];

		ksnprintf(cx->name, sizeof(cx->name), "C%d", i);

		sysctl_ctx_init(&cx->sysctl_ctx);
		cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO,
		    cx->name, CTLFLAG_RW, NULL, "Cx control/info");
		if (cx->sysctl_tree == NULL)
			continue;

		cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i);
		SYSCTL_ADD_INT(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "subcnt", CTLFLAG_RD, &cx->subcnt, 0,
		SYSCTL_ADD_PROC(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0,
		    i, sysctl_cpu_idle_cnt, "Q", "# of times entered");

		for (sub = 0; sub < cx->subcnt; ++sub)
			sbuf_printf(&sb, "C%d/%d ", i, sub);
	}
= cpu_mwait_cx_info
[CPU_MWAIT_C1
].subcnt
;
611 for (i
= CPU_MWAIT_C1
; i
< CPU_MWAIT_C3
; ++i
)
612 cpu_mwait_hints_cnt
+= cpu_mwait_cx_info
[i
].subcnt
;
613 cpu_mwait_hints
= kmalloc(sizeof(int) * cpu_mwait_hints_cnt
,
617 for (i
= CPU_MWAIT_C1
; i
< CPU_MWAIT_C3
; ++i
) {
620 subcnt
= cpu_mwait_cx_info
[i
].subcnt
;
621 for (j
= 0; j
< subcnt
; ++j
) {
622 KASSERT(hint_idx
< cpu_mwait_hints_cnt
,
623 ("invalid mwait hint index %d", hint_idx
));
624 cpu_mwait_hints
[hint_idx
] = MWAIT_EAX_HINT(i
, j
);
628 KASSERT(hint_idx
== cpu_mwait_hints_cnt
,
629 ("mwait hint count %d != index %d",
630 cpu_mwait_hints_cnt
, hint_idx
));
633 kprintf("MWAIT hints (%d C1 hints):\n", cpu_mwait_c1_hints_cnt
);
634 for (i
= 0; i
< cpu_mwait_hints_cnt
; ++i
) {
635 int hint
= cpu_mwait_hints
[i
];
637 kprintf(" C%d/%d hint 0x%04x\n",
638 MWAIT_EAX_TO_CX(hint
), MWAIT_EAX_TO_CX_SUB(hint
),
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i)
		cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt,

	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) {
		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_deep_hints_cnt,
			    ("invalid mwait deep hint index %d", hint_idx));
			cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
		}
	}
	KASSERT(hint_idx == cpu_mwait_deep_hints_cnt,
	    ("mwait deep hint count %d != index %d",
	     cpu_mwait_deep_hints_cnt, hint_idx));

	kprintf("MWAIT deep hints:\n");
	for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) {
		int hint = cpu_mwait_deep_hints[i];

		kprintf("  C%d/%d hint 0x%04x\n",
		    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),

	cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt;
	for (i = 0; i < ncpus; ++i) {
		ksnprintf(name, sizeof(name), "idle%d", i);
		SYSCTL_ADD_PROC(NULL,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO,
		    name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i],
		    0, cpu_mwait_cx_pcpu_idle_sysctl, "A", "");
	}
}
static void
cpu_finish(void *dummy __unused)
{

static void
pic_finish(void *dummy __unused)
{
	/* Log ELCR information */

	/* Log MPTABLE information */
	mptable_pci_int_dump();

	MachIntrABI.finalize();
}
/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	struct sigacts *psp = p->p_sigacts;
	struct sigframe sf, *sfp;

	regs = lp->lwp_md.md_regs;
	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;

	/* Save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = lp->lwp_sigstk;
	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
	KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));

	/* Make the size of the saved context visible to userland */
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);

	/* Allocate and validate space for the signal handler context. */
	if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)(lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
			      sizeof(struct sigframe));
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		/* We take red zone into account */
		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
	}

	/*
	 * XXX AVX needs 64-byte alignment but sigframe has other fields and
	 * the embedded ucontext is not at the front, so aligning this won't
	 * help us.  Fortunately we bcopy in/out of the sigframe, so the
	 *
	 * The problem though is if userland winds up trying to use the
	 */
	sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF);
	/* Translate the signal if appropriate */
	if (p->p_sysent->sv_sigtbl) {
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
	}

	/*
	 * Build the argument list for the signal handler.
	 *
	 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
	 */
	regs->tf_rdi = sig;				/* argument 1 */
	regs->tf_rdx = (register_t)&sfp->sf_uc;		/* argument 3 */

	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/*
		 * Signal handler installed with SA_SIGINFO.
		 *
		 * action(signo, siginfo, ucontext)
		 */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* fill siginfo structure */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_addr;
	} else {
		/*
		 * Old FreeBSD-style arguments.
		 *
		 * handler (signo, code, [uc], addr)
		 */
		regs->tf_rsi = (register_t)code;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_handler = catcher;
	}
	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 =
		    &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}
	/*
	 * Save the FPU state and reinit the FP unit
	 */
	npxpush(&sf.sf_uc.uc_mcontext);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
	}

	regs->tf_rsp = (register_t)sfp;
	regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);

	/*
	 * i386 abi specifies that the direction flag must be cleared
	 * on function entry.
	 */
	regs->tf_rflags &= ~(PSL_T | PSL_D);

	/*
	 * 64 bit mode has a code and stack selector but
	 * no data or extra selector.  %fs and %gs are not
	 * used.
	 */
	regs->tf_cs = _ucodesel;
	regs->tf_ss = _udatasel;
}
/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.  Remove any items that would otherwise create a privilege
 * issue.
 *
 * XXX at the moment we allow userland to set the resume flag.  Is this a
 * bad idea?
 */
cpu_sanitize_frame(struct trapframe *frame)
{
	frame->tf_cs = _ucodesel;
	frame->tf_ss = _udatasel;
	/* XXX VM (8086) mode not supported? */
	frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
	frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;
/*
 * Sanitize the tls so loading the descriptor does not blow up
 * on us.  For x86_64 we don't have to do anything.
 */
cpu_sanitize_tls(struct savetls *tls)
{
	return (0);
}
/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
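/*
 * EFL_SECURE() accepts a new rflags value only if every bit that differs
 * from the old value lies within PSL_USERCHANGE (the user-modifiable
 * flags), and CS_SECURE() accepts a %cs selector only if its requested
 * privilege level is user (SEL_UPL), so a signal handler cannot hand back
 * a kernel code selector or privileged flag bits via sigreturn().
 */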
int
sys_sigreturn(struct sigreturn_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;

	/*
	 * We have to copy the information into kernel space so userland
	 * can't modify it while we are sniffing it.
	 */
	regs = lp->lwp_md.md_regs;
	error = copyin(uap->sigcntxp, &uc, sizeof(uc));

	rflags = ucp->uc_mcontext.mc_rflags;

	/* VM (8086) mode not supported */
	rflags &= ~PSL_VM_UNSUPP;
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(lp, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = tf->tf_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
		tf->tf_gs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 *
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
			kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			kprintf("sigreturn: cs = 0x%x\n", cs);
			trapsignal(lp, SIGBUS, T_PROTFLT);
			return (EINVAL);
		}
		bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(struct trapframe));
	}
1019 npxpop(&ucp
->uc_mcontext
);
1021 if (ucp
->uc_mcontext
.mc_onstack
& 1)
1022 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
1024 lp
->lwp_sigstk
.ss_flags
&= ~SS_ONSTACK
;
1026 lp
->lwp_sigmask
= ucp
->uc_sigmask
;
1027 SIG_CANTMASK(lp
->lwp_sigmask
);
1030 return(EJUSTRETURN
);
/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */

/*
 * Shutdown the CPU as much as possible
 */
		__asm__ __volatile("hlt");
/*
 * cpu_idle() represents the idle LWKT.  You cannot return from this function
 * (unless you want to blow things up!).  Instead we look for runnable threads
 * and loop or halt as appropriate.  Giant is not held on entry to the thread.
 *
 * The main loop is entered with a critical section held, we must release
 * the critical section before doing anything else.  lwkt_switch() will
 * check for pending interrupts due to entering and exiting its own
 * critical section.
 *
 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
 *	 However, there are cases where the idlethread will be entered with
 *	 the possibility that no IPI will occur and in such cases
 *	 lwkt_switch() sets TDF_IDLE_NOHLT.
 *
 * NOTE: cpu_idle_repeat determines how many entries into the idle thread
 *	 must occur before it starts using ACPI halt.
 *
 * NOTE: Value overridden in hammer_time().
 */
static int	cpu_idle_hlt = 2;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW,
    &cpu_idle_repeat, 0, "Idle entries before acpi hlt");

SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts");
SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins");
static void
cpu_idle_default_hook(void)
{
	/*
	 * We must guarantee that hlt is exactly the instruction
	 * following the sti.
	 */
	__asm __volatile("sti; hlt");
}

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;
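/*
 * The "sti; hlt" pairing above works because STI keeps interrupts
 * inhibited for one more instruction; an interrupt therefore cannot be
 * delivered between enabling interrupts and halting, which would
 * otherwise leave the cpu halted with its wakeup already consumed.
 */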
static int
cpu_mwait_cx_hint(struct cpu_idle_stat *stat)
{
	idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >>
	    cpu_mwait_repeat_shift;
	if (idx >= cpu_mwait_c1_hints_cnt) {
		/* Step up faster, once we walked through all C1 states */
		stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1);
	}
	if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		if (idx >= cpu_mwait_deep_hints_cnt)
			idx = cpu_mwait_deep_hints_cnt - 1;
		hint = cpu_mwait_deep_hints[idx];
	} else {
		if (idx >= cpu_mwait_hints_cnt)
			idx = cpu_mwait_hints_cnt - 1;
		hint = cpu_mwait_hints[idx];
	}

	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX)
		stat->mwait_cx[cx_idx]++;
	return hint;
}
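/*
 * In other words, the index computed above is a damped measure of how
 * often the idle thread has been re-entered recently (repeat, repeat_last
 * and repeat_delta, scaled down by cpu_mwait_repeat_shift): the longer a
 * cpu stays idle, the deeper the entry chosen from cpu_mwait_hints[] or,
 * for CPU_MWAIT_HINT_AUTODEEP, from cpu_mwait_deep_hints[].
 */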
void
cpu_idle(void)
{
	globaldata_t gd = mycpu;
	struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid];
	struct thread *td __debugvar = gd->gd_curthread;

	stat->repeat = stat->repeat_last = cpu_idle_repeat_max;

	KKASSERT(td->td_critcount == 0);
		/*
		 * See if there are any LWKTs ready to go.
		 */

		/*
		 * When halting inside a cli we must check for reqflags
		 * races, particularly [re]schedule requests.  Running
		 * splz() does the job.
		 *
		 * cpu_idle_hlt:
		 *	0	Never halt, just spin
		 *
		 *	1	Always use HLT (or MONITOR/MWAIT if avail).
		 *
		 *		Better default for modern (Haswell+) Intel
		 *		cpus.
		 *
		 *	2	Use HLT/MONITOR/MWAIT up to a point and then
		 *		use the ACPI halt (default).  This is a hybrid
		 *		approach.  See machdep.cpu_idle_repeat.
		 *
		 *		Better default for modern AMD cpus and older
		 *		Intel cpus.
		 *
		 *	3	Always use the ACPI halt.  This typically
		 *		eats the least amount of power but the cpu
		 *		will be slow waking up.  Slows down e.g.
		 *		compiles and other pipe/event oriented stuff.
		 *
		 * NOTE: Interrupts are enabled and we are not in a critical
		 *	 section.
		 *
		 * NOTE: Preemptions do not reset gd_idle_repeat.  Also we
		 *	 don't bother capping gd_idle_repeat, it is ok if
		 *	 it overflows.
		 *
		 * Implement optimized invltlb operations when halted
		 * in idle.  By setting the bit in smp_idleinvl_mask
		 * we inform other cpus that they can set _reqs to
		 * request an invltlb.  Currently the code to do that
		 * sets the bits in _reqs anyway, but then checks _mask
		 * to determine if they can assume the invltlb will execute.
		 *
		 * A critical section is required to ensure that interrupts
		 * do not fully run until after we've had a chance to execute
		 * the request.
		 */
		if (gd->gd_idle_repeat == 0) {
			stat->repeat = (stat->repeat + stat->repeat_last) >> 1;
			if (stat->repeat > cpu_idle_repeat_max)
				stat->repeat = cpu_idle_repeat_max;
			stat->repeat_last = 0;
			stat->repeat_delta = 0;
		} else {
			++stat->repeat_last;
		}
		++gd->gd_idle_repeat;
		reqflags = gd->gd_reqflags;
		quick = (cpu_idle_hlt == 1) ||
			(cpu_idle_hlt < 3 &&
			 gd->gd_idle_repeat < cpu_idle_repeat);

		if (quick && (cpu_mi_feature & CPU_MI_MONITOR) &&
		    (reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
			ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid);
			cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
					  cpu_mwait_cx_hint(stat), 0);
			ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask, gd->gd_cpuid);
			if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
		} else if (cpu_idle_hlt) {
			__asm __volatile("cli");
			ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid);
			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
				cpu_idle_default_hook();
			}
			__asm __volatile("sti");
			ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask, gd->gd_cpuid);
			if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
		} else {
			__asm __volatile("sti");
		}
/*
 * This routine is called if a spinlock has been held through the
 * exponential backoff period and is seriously contested.  On a real cpu
 * we let it spin.
 */
void
cpu_spinlock_contested(void)
{
/*
 * Clear registers on exec
 */
exec_setregs(u_long entry, u_long stack, u_long ps_strings)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct pcb *pcb = td->td_pcb;
	struct trapframe *regs = lp->lwp_md.md_regs;

	/* was i386_user_cleanup() in NetBSD */

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = entry;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_rbx = ps_strings;
	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr7 = 0;	/* JG set bit 10? */
		if (pcb == td->td_pcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}
	/*
	 * Initialize the math emulator (if any) for the current process.
	 * Actually, just clear the bit that says that the emulator has
	 * been initialized.  Initialization is delayed until the process
	 * traps to the emulator (if it is done at all) mainly because
	 * emulators don't provide an entry point for initialization.
	 */
	pcb->pcb_flags &= ~FP_SOFTFP;

	/*
	 * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
	 *	 gd_npxthread.  Otherwise a preemptive interrupt thread
	 *	 may panic in npxdna().
	 */
	load_cr0(rcr0() | CR0_MP);

	/*
	 * NOTE: The MSR values must be correct so we can return to
	 *	 userland.  gd_user_fs/gs must be correct so the switch
	 *	 code knows what the current MSR values are.
	 */
	pcb->pcb_fsbase = 0;	/* Values loaded from PCB on switch */
	pcb->pcb_gsbase = 0;
	mdcpu->gd_user_fs = 0;	/* Cache of current MSR values */
	mdcpu->gd_user_gs = 0;
	wrmsr(MSR_FSBASE, 0);	/* Set MSR values for return to userland */
	wrmsr(MSR_KGSBASE, 0);

	/* Initialize the npx (if any) for the current process. */

	pcb->pcb_ds = _udatasel;
	pcb->pcb_es = _udatasel;
	pcb->pcb_fs = _udatasel;
	pcb->pcb_gs = _udatasel;
}
	cr0 |= CR0_NE;			/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
	cr0 |= CR0_WP | CR0_AM;
static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
	    req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
    &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
    CTLFLAG_RW, &disable_rtc_set, 0, "");

SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
    CTLFLAG_RD, &bootinfo, bootinfo, "");

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
    CTLFLAG_RW, &wall_cmos_clock, 0, "");

extern u_long bootdev;		/* not a cdev_t - encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
    CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)");
/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

struct user_segment_descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
struct gate_descriptor idt_arr[MAXCPU][NIDT];

union descriptor ldt[NLDT];		/* local descriptor table */

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt;
struct region_descriptor r_idt_arr[MAXCPU];

/* JG proc0paddr is a virtual address */
char proc0paddr_buff[LWKT_THREAD_STACK];
/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	0x0,			/* segment base address  */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GCODE_SEL	1 Code Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_KPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GDATA_SEL	2 Data Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_KPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUCODE32_SEL	3 32 bit Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUCODE_SEL	5 64 bit Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GPROC0_SEL	6 Proc 0 Tss Descriptor */
{	0x0,			/* segment base address */
	sizeof(struct x86_64tss)-1,/* length - all address space */
	SDT_SYSTSS,		/* segment type */
	SEL_KPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* Actually, the TSS is a system descriptor which is double size */
{	0x0,			/* segment base address  */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GUGS32_SEL	8 32 bit GS Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
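/*
 * Note that GPROC0_SEL is a system (TSS) descriptor and in long mode a
 * system descriptor is 16 bytes wide, so it occupies two consecutive GDT
 * slots; the dummy entry above fills the second slot, and hammer_time()
 * below skips GPROC0_SEL+1 when converting the soft descriptors.
 */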
void
setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	int cpu;

	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		struct gate_descriptor *ip = &idt_arr[cpu][idx];

		ip->gd_looffset = (uintptr_t)func;
		ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
		ip->gd_hioffset = ((uintptr_t)func)>>16 ;
	}
}

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist, int cpu)
{
	struct gate_descriptor *ip;

	KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu %d", cpu));

	ip = &idt_arr[cpu][idx];
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
}
#define	IDTVEC(name)	__CONCAT(X,name)

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
void
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}
void
ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
{
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}
void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}
/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * Memory is aligned to PHYSMAP_ALIGN which must be a multiple
 * of PAGE_SIZE.  This also greatly reduces the memory test time
 * which would otherwise be excessive on machines with > 8G of ram.
 *
 * XXX first should be vm_paddr_t.
 */
#define PHYSMAP_ALIGN		(vm_paddr_t)(128 * 1024)
#define PHYSMAP_ALIGN_MASK	(vm_paddr_t)(PHYSMAP_ALIGN - 1)
vm_paddr_t physmap[PHYSMAP_SIZE];
struct bios_smap *smapbase, *smap, *smapend;
struct efi_map_header *efihdrbase;

#define PHYSMAP_HANDWAVE	(vm_paddr_t)(2 * 1024 * 1024)
#define PHYSMAP_HANDWAVE_MASK	(PHYSMAP_HANDWAVE - 1)
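/*
 * Alignment arithmetic used below: PHYSMAP_ALIGN is 128KB, so
 * PHYSMAP_ALIGN_MASK is 0x1ffff.  Rounding a base address up is
 * (base + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK, and rounding an
 * end address down is simply end & ~PHYSMAP_ALIGN_MASK, as done in
 * getmemsize().
 */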
static void
add_smap_entries(int *physmap_idx)
{
	smapsize = *((u_int32_t *)smapbase - 1);
	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (smap->length == 0)
			continue;

		for (i = 0; i <= *physmap_idx; i += 2) {
			if (smap->base < physmap[i + 1]) {
				if (boothowto & RB_VERBOSE) {
					kprintf("Overlapping or non-monotonic "
						"memory region, ignoring "
				}
			}
		}
		if (i <= *physmap_idx)
			continue;

		Realmem += smap->length;

		if (smap->base == physmap[*physmap_idx + 1]) {
			physmap[*physmap_idx + 1] += smap->length;
			continue;
		}

		if (*physmap_idx == PHYSMAP_SIZE) {
			kprintf("Too many segments in the physical "
				"address map, giving up\n");
			break;
		}
		physmap[*physmap_idx] = smap->base;
		physmap[*physmap_idx + 1] = smap->base + smap->length;
	}
}
#define efi_next_descriptor(ptr, size) \
	((struct efi_md *)(((uint8_t *) ptr) + size))

static void
add_efi_map_entries(int *physmap_idx)
{
	struct efi_md *map, *p;

	static const char *types[] = {
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"ACPIReclaimMemory",
		"MemoryMappedIOPortSpace",
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdrbase + efisz);

	if (efihdrbase->descriptor_size == 0)
		return;
	ndesc = efihdrbase->memory_size / efihdrbase->descriptor_size;

	if (boothowto & RB_VERBOSE)
		kprintf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdrbase->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type <= EFI_MD_TYPE_PALCODE)
				type = types[p->md_type];

			kprintf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
			if (p->md_attr & EFI_MD_ATTR_WC)
			if (p->md_attr & EFI_MD_ATTR_WT)
			if (p->md_attr & EFI_MD_ATTR_WB)
			if (p->md_attr & EFI_MD_ATTR_UCE)
			if (p->md_attr & EFI_MD_ATTR_WP)
			if (p->md_attr & EFI_MD_ATTR_RP)
			if (p->md_attr & EFI_MD_ATTR_XP)
			if (p->md_attr & EFI_MD_ATTR_RT)
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
		}

		Realmem += p->md_pages * PAGE_SIZE;

		if (p->md_phys == physmap[*physmap_idx + 1]) {
			physmap[*physmap_idx + 1] += p->md_pages * PAGE_SIZE;
			continue;
		}

		if (*physmap_idx == PHYSMAP_SIZE) {
			kprintf("Too many segments in the physical "
				"address map, giving up\n");
			break;
		}
		physmap[*physmap_idx] = p->md_phys;
		physmap[*physmap_idx + 1] = p->md_phys + p->md_pages * PAGE_SIZE;
	}
}
;
1775 static int have_efi_framebuffer
= 0;
1778 efi_fb_init_vaddr(int direct_map
)
1781 vm_offset_t addr
, v
;
1783 v
= efi_fb_info
.vaddr
;
1784 sz
= efi_fb_info
.stride
* efi_fb_info
.height
;
1787 addr
= PHYS_TO_DMAP(efi_fb_info
.paddr
);
1788 if (addr
>= DMAP_MIN_ADDRESS
&& addr
+ sz
< DMAP_MAX_ADDRESS
)
1789 efi_fb_info
.vaddr
= addr
;
1791 efi_fb_info
.vaddr
= (vm_offset_t
)pmap_mapdev_attr(
1792 efi_fb_info
.paddr
, sz
, PAT_WRITE_COMBINING
);
1795 if (v
== 0 && efi_fb_info
.vaddr
!= 0)
1796 memset((void *)efi_fb_info
.vaddr
, 0x77, sz
);
static int
probe_efi_fb(int early)
{
	struct efi_fb *efifb;

	if (have_efi_framebuffer) {
		if (!early &&
		    (efi_fb_info.vaddr == 0 ||
		     efi_fb_info.vaddr == PHYS_TO_DMAP(efi_fb_info.paddr)))
			efi_fb_init_vaddr(0);
		return 0;
	}

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efifb = (struct efi_fb *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_FB);

	have_efi_framebuffer = 1;

	efi_fb_info.is_vga_boot_display = 1;
	efi_fb_info.width = efifb->fb_width;
	efi_fb_info.height = efifb->fb_height;
	efi_fb_info.stride = efifb->fb_stride * 4;
	efi_fb_info.depth = 32;
	efi_fb_info.paddr = efifb->fb_addr;
	if (early) {
		efi_fb_info.vaddr = 0;
	} else {
		efi_fb_init_vaddr(0);
	}
	efi_fb_info.restore = NULL;
	efi_fb_info.device = NULL;
static void
efifb_startup(void *arg)
{
	probe_efi_fb(0);
}

SYSINIT(efi_fb_info, SI_BOOT1_POST, SI_ORDER_FIRST, efifb_startup, NULL);
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int off, physmap_idx, pa_indx, da_indx;
	vm_paddr_t msgbuf_size;
	u_long physmem_tunable;
	quad_t dcons_addr, dcons_size;

	bzero(physmap, sizeof(physmap));

	/*
	 * get memory map from INT 15:E820, kindly supplied by the loader.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */
	efihdrbase = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL && efihdrbase == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdrbase == NULL)
		add_smap_entries(&physmap_idx);
	else
		add_efi_map_entries(&physmap_idx);

	base_memory = physmap[1] / 1024;
	/* make hole for AP bootstrap code */
	physmap[1] = mp_bootaddress(base_memory);

	/* Save EBDA address, if any */
	ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e));
1890 * Maxmem isn't the "maximum memory", it's one larger than the
1891 * highest page of the physical address space. It should be
1892 * called something like "Maxphyspage". We may adjust this
1893 * based on ``hw.physmem'' and the results of the memory test.
1895 Maxmem
= atop(physmap
[physmap_idx
+ 1]);
1898 Maxmem
= MAXMEM
/ 4;
1901 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable
))
1902 Maxmem
= atop(physmem_tunable
);
1905 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
1908 if (Maxmem
> atop(physmap
[physmap_idx
+ 1]))
1909 Maxmem
= atop(physmap
[physmap_idx
+ 1]);
1912 * Blowing out the DMAP will blow up the system.
1914 if (Maxmem
> atop(DMAP_MAX_ADDRESS
- DMAP_MIN_ADDRESS
)) {
1915 kprintf("Limiting Maxmem due to DMAP size\n");
1916 Maxmem
= atop(DMAP_MAX_ADDRESS
- DMAP_MIN_ADDRESS
);
1919 if (atop(physmap
[physmap_idx
+ 1]) != Maxmem
&&
1920 (boothowto
& RB_VERBOSE
)) {
1921 kprintf("Physical memory use set to %ldK\n", Maxmem
* 4);
1925 * Call pmap initialization to make new kernel address space
1929 pmap_bootstrap(&first
);
1930 physmap
[0] = PAGE_SIZE
;
1933 * Align the physmap to PHYSMAP_ALIGN and cut out anything
1936 for (i
= j
= 0; i
<= physmap_idx
; i
+= 2) {
1937 if (physmap
[i
+1] > ptoa(Maxmem
))
1938 physmap
[i
+1] = ptoa(Maxmem
);
1939 physmap
[i
] = (physmap
[i
] + PHYSMAP_ALIGN_MASK
) &
1940 ~PHYSMAP_ALIGN_MASK
;
1941 physmap
[i
+1] = physmap
[i
+1] & ~PHYSMAP_ALIGN_MASK
;
1943 physmap
[j
] = physmap
[i
];
1944 physmap
[j
+1] = physmap
[i
+1];
1946 if (physmap
[i
] < physmap
[i
+1])
1949 physmap_idx
= j
- 2;
1952 * Align anything else used in the validation loop.
1954 first
= (first
+ PHYSMAP_ALIGN_MASK
) & ~PHYSMAP_ALIGN_MASK
;
	/*
	 * Size up each available chunk of physical memory.
	 */
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];

	/*
	 * Get dcons buffer address
	 */
	if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    kgetenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;
	/*
	 * Validate the physical memory.  The physical memory segments
	 * have already been aligned to PHYSMAP_ALIGN which is a multiple
	 * of PAGE_SIZE.
	 */
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t incr = PHYSMAP_ALIGN;

		end = physmap[i + 1];

		for (pa = physmap[i]; pa < end; pa += incr) {
			volatile uint64_t *ptr = (uint64_t *)CADDR1;

			incr = PHYSMAP_ALIGN;

			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= 0x200000 && pa < first)

			/*
			 * block out dcons buffer
			 */
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size) {

			/*
			 * Always test the first and last block supplied in
			 * the map entry, but it just takes too long to run
			 * the test these days and we already have to skip
			 * pages.  Handwave it on PHYSMAP_HANDWAVE boundaries.
			 */
			if (pa != physmap[i]) {
				vm_paddr_t bytes = end - pa;
				if ((pa & PHYSMAP_HANDWAVE_MASK) == 0 &&
				    bytes >= PHYSMAP_HANDWAVE + PHYSMAP_ALIGN) {
					incr = PHYSMAP_HANDWAVE;
				}
			}

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			    kernel_pmap.pmap_bits[PG_V_IDX] |
			    kernel_pmap.pmap_bits[PG_RW_IDX] |
			    kernel_pmap.pmap_bits[PG_N_IDX];
			cpu_invlpg(__DEVOLATILE(void *, ptr));

			/*
			 * Test for alternating 1's and 0's
			 */
			*ptr = 0xaaaaaaaaaaaaaaaaLLU;
			if (*ptr != 0xaaaaaaaaaaaaaaaaLLU)
			/*
			 * Test for alternating 0's and 1's
			 */
			*ptr = 0x5555555555555555LLU;
			if (*ptr != 0x5555555555555555LLU)
			/*
			 * Test for all 1's
			 */
			*ptr = 0xffffffffffffffffLLU;
			if (*ptr != 0xffffffffffffffffLLU)
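			/*
			 * The three writes above are a simple pattern test:
			 * 0xaaaa... and 0x5555... are complementary
			 * alternating-bit patterns and 0xffff... checks that
			 * every bit can be set.  Any readback mismatch marks
			 * the page bad so it is excluded from phys_avail[].
			 */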
			/*
			 * Restore original value.
			 */

			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)

			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we are at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += incr;
			} else {
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					kprintf(
		"Too many holes in the physical address space, giving up\n");
				}
				phys_avail[pa_indx++] = pa;
				phys_avail[pa_indx] = pa + incr;
			}
			physmem += incr / PAGE_SIZE;

			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += incr;
			} else {
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
				}
				dump_avail[da_indx++] = pa;
				dump_avail[da_indx] = pa + incr;
			}
		}
	}
	/*
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;

	while (phys_avail[pa_indx - 1] + PHYSMAP_ALIGN +
	       msgbuf_size >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= msgbuf_size;

	avail_end = phys_avail[pa_indx];

	/* Map the message buffer. */
	for (off = 0; off < msgbuf_size; off += PAGE_SIZE) {
		pmap_kenter((vm_offset_t)msgbufp + off,
			    phys_avail[pa_indx] + off);
	}
	/* Try to get EFI framebuffer working as early as possible */
	if (have_efi_framebuffer)
		efi_fb_init_vaddr(1);
}
struct machintr_abi MachIntrABI;

/*
 *	7	Device Not Available (x87)
 *	9	Coprocessor Segment overrun (unsupported, reserved)
 *	11	Segment not present
 *	13	General Protection
 *	16	x87 FP Exception pending
 *	17	Alignment Check
 *	19	SIMD floating point
 *	32-255	INTn/external sources
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	int gsel_tss, x, cpu;
	int metadata_missing, off;
	struct mdglobaldata *gd;

	/*
	 * Prevent lowering of the ipl if we call tsleep() early.
	 */
	gd = &CPU_prvspace[0]->mdglobaldata;
	bzero(gd, sizeof(*gd));

	/*
	 * Note: on both UP and SMP curthread must be set non-NULL
	 * early in the boot sequence because the system assumes
	 * that 'curthread' is never NULL.
	 */
	gd->mi.gd_curthread = &thread0;
	thread0.td_gd = &gd->mi;

	atdevbase = ISA_HOLE_START + PTOV_OFFSET;

	metadata_missing = 0;
	if (bootinfo.bi_modulep) {
		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}
	if (bootinfo.bi_envp)
		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;

	preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
	preload_bootstrap_relocate(PTOV_OFFSET);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);

	if (boothowto & RB_VERBOSE)

	/*
	 * Default MachIntrABI to ICU
	 */
	MachIntrABI = MachIntrABI_ICU;
	/*
	 * start with one cpu.  Note: with one cpu, ncpus2_shift, ncpus2_mask,
	 * and ncpus_fit_mask remain 0.
	 */
	ncpus = 1;
	ncpus2 = 1;
	ncpus_fit = 1;
	/* Init basic tunables, hz etc */
	init_param1();

	/*
	 * make gdt memory segments
	 */
	gdt_segs[GPROC0_SEL].ssd_base =
		(uintptr_t) &CPU_prvspace[0]->mdglobaldata.gd_common_tss;

	gd->mi.gd_prvspace = CPU_prvspace[0];

	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
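
	/*
	 * With GSBASE pointing at this cpu's globaldata, %gs-relative
	 * accesses such as mycpu/curthread resolve through the per-cpu
	 * private space from here on; KGSBASE holds the user %gs base
	 * that swapgs exchanges with GSBASE on kernel entry and exit.
	 */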
	mi_gdinit(&gd->mi, 0);
	cpu_gdinit(gd, 0);
	proc0paddr = proc0paddr_buff;
	mi_proc0init(&gd->mi, proc0paddr);
	safepri = TDPRI_MAX;

	/* spinlocks and the BGL */
	init_locks();

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt_global(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
	setidt_global(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
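
	/*
	 * Every vector defaults to the reserved trap stub; the known
	 * exceptions are then pointed at their handlers.  IDT_BP uses
	 * SEL_UPL so userland INT3 works, and IDT_NMI/IDT_DF pass ist 1
	 * so those traps run on the dedicated IST stack.
	 */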
	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
		r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
	}

	lidt(&r_idt_arr[0]);

	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

	if (metadata_missing)
		kprintf("WARNING: loader(8) metadata is missing!\n");

	/*
	 * Initialize IRQ mapping
	 *
	 * NOTE:
	 * SHOULD be after elcr_probe()
	 */
	MachIntrABI_ICU.initmap();
	MachIntrABI_IOAPIC.initmap();

	if (boothowto & RB_KDB)
		Debugger("Boot flags requested debugger");

	finishidentcpu();	/* Final stage of CPU initialization */
	setidt(6, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(13, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	identify_cpu();		/* Final stage of CPU initialization */
	initializecpu(0);	/* Initialize CPU registers */

	/*
	 * On modern intel cpus, haswell or later, cpu_idle_hlt=1 is better
	 * because the cpu does significant power management in MWAIT
	 * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
	 *
	 * On modern amd cpus cpu_idle_hlt=3 is better, because the cpu does
	 * significant power management in HLT or ACPI (but cpu_idle_hlt=1
	 * would try to use MWAIT).
	 *
	 * On older amd or intel cpus, cpu_idle_hlt=2 is better because ACPI
	 * is needed to reduce power consumption, but wakeup times are often
	 * longer.
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    CPUID_TO_MODEL(cpu_id) >= 0x3C) {	/* Haswell or later */
		cpu_idle_hlt = 1;
	}
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    CPUID_TO_FAMILY(cpu_id) >= 0x14) {	/* Bobcat or later */
		cpu_idle_hlt = 3;
	}

	TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
	TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
	TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
	TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);

	/*
	 * Some of the virtual machines do not work w/ I/O APIC
	 * enabled.  If the user does not explicitly enable or
	 * disable the I/O APIC (ioapic_enable < 0), then we
	 * disable I/O APIC on all virtual machines.
	 *
	 * NOTE:
	 * This must be done after identify_cpu(), which sets
	 * 'cpu_feature2'.
	 */
	if (ioapic_enable < 0) {
		if (cpu_feature2 & CPUID2_VMM)
			ioapic_enable = 0;
		else
			ioapic_enable = 1;
	}

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	gd->gd_common_tss.tss_rsp0 =
		(register_t)(thread0.td_kstack +
			     KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb));
	/* Ensure the stack is aligned to 16 bytes */
	gd->gd_common_tss.tss_rsp0 &= ~(register_t)0xF;

	/* double fault stack */
	gd->gd_common_tss.tss_ist1 =
		(long)&gd->mi.gd_prvspace->idlestack[
			sizeof(gd->mi.gd_prvspace->idlestack)];

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	gd->gd_common_tss.tss_iobase = sizeof(struct x86_64tss);

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gd->gd_tss_gdt = &gdt[GPROC0_SEL];
	gd->gd_common_tssd = *gd->gd_tss_gdt;
	ltr(gsel_tss);

	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL);
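
	/*
	 * SYSCALL/SYSRET wiring: LSTAR is the 64-bit entry point and CSTAR
	 * the 32-bit compat one; STAR[47:32] supplies the kernel CS/SS
	 * selector base used on entry and STAR[63:48] the user selector
	 * base used by SYSRET; SF_MASK lists the rflags bits the cpu
	 * clears on entry (notably PSL_I, so the kernel runs with
	 * interrupts disabled until it is on a safe stack).
	 */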
	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

	/* Map the message buffer. */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);

	msgbufinit(msgbufp, MSGBUF_SIZE);

	/* transfer to user mode */
	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_pcb->pcb_cr3 = KPML4phys;
	thread0.td_pcb->pcb_ext = NULL;
	lwp0.lwp_md.md_regs = &proc0_tf;	/* XXX needed? */

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}
/*
 * Initialize machine-dependent portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
 */
void
cpu_gdinit(struct mdglobaldata *gd, int cpu)
{
	if (cpu)
		gd->mi.gd_curthread = &gd->mi.gd_idlethread;

	lwkt_init_thread(&gd->mi.gd_idlethread,
			 gd->mi.gd_prvspace->idlestack,
			 sizeof(gd->mi.gd_prvspace->idlestack),
			 0, &gd->mi);
	lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
	gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
	gd->mi.gd_idlethread.td_sp -= sizeof(void *);
	*(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
}
/*
 * We only have to check for DMAP bounds, the globaldata space is
 * actually part of the kernel_map so we don't have to waste time
 * checking CPU_prvspace[*].
 */
int
is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
{
	if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
	    eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
		return (TRUE);
	}
	if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
		return (TRUE);
	return (FALSE);
}

struct globaldata *
globaldata_find(int cpu)
{
	KKASSERT(cpu >= 0 && cpu < ncpus);
	return(&CPU_prvspace[cpu]->mdglobaldata.mi);
}
/*
 * This path should be safe from the SYSRET issue because only stopped threads
 * can have their %rip adjusted this way (and all heavy weight thread switches
 * clear QUICKREF and thus do not use SYSRET).  However, the code path is
 * convoluted so add a safety by forcing %rip to be canonical.
 */
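/*
 * On x86-64 a canonical address has bits 63:48 equal to bit 47, so the
 * fixup below simply sign-extends bit 47 of the requested %rip before
 * storing it into the trapframe.
 */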
void
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
	if (addr & 0x0000800000000000LLU)
		lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
	else
		lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
}

int
ptrace_single_step(struct lwp *lp)
{
	lp->lwp_md.md_regs->tf_rflags |= PSL_T;
	return (0);
}
int
fill_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	if ((tp = lp->lwp_md.md_regs) == NULL)
		return EINVAL;
	bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
	return (0);
}

int
set_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
	return (0);
}
static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}

static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}
int
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
	if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
		return EINVAL;
	if (cpu_fxsr) {
		fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
				(struct save87 *)fpregs);
		return (0);
	}
	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
	return (0);
}

int
set_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
	if (cpu_fxsr) {
		set_fpregs_xmm((struct save87 *)fpregs,
			       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
		return (0);
	}
	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
	return (0);
}
int
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (lp == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[4] = rdr4();
		dbregs->dr[5] = rdr5();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		if (lp->lwp_thread == NULL ||
		    (pcb = lp->lwp_thread->td_pcb) == NULL)
			return EINVAL;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		dbregs->dr[4] = 0;
		dbregs->dr[5] = 0;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
	return (0);
}
int
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	if (lp == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr4(dbregs->dr[4]);
		load_dr5(dbregs->dr[5]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		struct pcb *pcb;
		struct ucred *ucred;
		int i;
		uint64_t mask1, mask2;

		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * result in undefined behaviour and can lead to an unexpected
		 * reset or reboot.
		 */
		/* JG this loop looks unreadable */
		/*
		 * Check 4 2-bit fields for invalid patterns.
		 * These fields are R/Wi, for i = 0..3
		 */
		/* Is 10 in LENi allowed when running in compatibility mode? */
		/*
		 * Pattern 10 in R/Wi might be used to indicate
		 * breakpoint on I/O.  Further analysis should be
		 * carried out to decide if it is safe and useful to
		 * provide access to that capability.
		 */
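		/*
		 * Layout reminder: %dr7 bits 0-7 are the per-breakpoint
		 * L/G enable bits, and for each breakpoint i the 4-bit
		 * group at bit 16+4*i holds R/Wi (low 2 bits) and LENi
		 * (high 2 bits).  The loop below walks the four R/Wi
		 * fields and rejects the undefined pattern 10 (0x2).
		 */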
		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
		     i++, mask1 <<= 4, mask2 <<= 4)
			if ((dbregs->dr[7] & mask1) == mask2)
				return (EINVAL);

		pcb = lp->lwp_thread->td_pcb;
		ucred = lp->lwp_proc->p_ucred;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * process's address space, unless, perhaps, we were called by
		 * uid 0.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */
		if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
			if (dbregs->dr[7] & 0x3) {
				/* dr0 is enabled */
				if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<2)) {
				/* dr1 is enabled */
				if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<4)) {
				/* dr2 is enabled */
				if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<6)) {
				/* dr3 is enabled */
				if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		pcb->pcb_flags |= PCB_DBREGS;
	}
	return (0);
}
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
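/*
 * %dr6 reports which breakpoints fired: its low four bits (B0-B3)
 * correspond to %dr0-%dr3.  The handler below collects the addresses of
 * the triggered breakpoints and reports how many fall below
 * VM_MAX_USER_ADDRESS.
 */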
int
user_dbreg_trap(void)
{
	u_int64_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0xff) == 0) {
		/*
		 * all GE and LE bits in the dr7 register are zero,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers
		 */
		return 0;
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0xf;

	if (!bp) {
		/*
		 * None of the breakpoint bits are set meaning this
		 * trap was not caused by any of the debug registers
		 */
		return 0;
	}

	/*
	 * at least one of the breakpoints were hit, check to see
	 * which ones and if any of them are user space addresses
	 */
	if (bp & 0x01) {
		addr[nbp++] = (caddr_t)rdr0();
	}
	if (bp & 0x02) {
		addr[nbp++] = (caddr_t)rdr1();
	}
	if (bp & 0x04) {
		addr[nbp++] = (caddr_t)rdr2();
	}
	if (bp & 0x08) {
		addr[nbp++] = (caddr_t)rdr3();
	}

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}
void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}
/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called inside DDB.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */

/* silence compiler warnings */
u_char inb(u_int);
void outb(u_int, u_char);

u_char
inb(u_int port)
{
	u_char	data;
	/*
	 * We use %%dx and not %1 here because i/o is done at %dx and not at
	 * %edx, while gcc generates inferior code (movw instead of movl)
	 * if we tell it to load (u_short) port.
	 */
	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
	return (data);
}

void
outb(u_int port, u_char data)
{
	u_char	al;
	/*
	 * Use an unnecessary assignment to help gcc's register allocator.
	 * This makes a large difference for gcc-1.40 and a tiny difference
	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
	 * best results.  gcc-2.6.0 can't handle this.
	 */
	al = data;
	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
}
/*
 * initialize all the SMP locks
 */

/* critical region when masking or unmasking interrupts */
struct spinlock_deprecated imen_spinlock;

/* lock region used by kernel profiling */
struct spinlock_deprecated mcount_spinlock;

/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct spinlock_deprecated com_spinlock;

/* lock regions around the clock hardware */
struct spinlock_deprecated clock_spinlock;

static void
init_locks(void)
{
	/*
	 * Get the initial mplock with a count of 1 for the BSP.
	 * This uses a LOGICAL cpu ID, ie BSP == 0.
	 */
	cpu_get_initial_mplock();

	spin_init_deprecated(&mcount_spinlock);
	spin_init_deprecated(&imen_spinlock);
	spin_init_deprecated(&com_spinlock);
	spin_init_deprecated(&clock_spinlock);

	/* our token pool needs to work early */
	lwkt_token_pool_init();
}
static int
cpu_mwait_hint_valid(uint32_t hint)
{
	int cx_idx, sub;

	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= CPU_MWAIT_CX_MAX)
		return FALSE;

	sub = MWAIT_EAX_TO_CX_SUB(hint);
	if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return FALSE;

	return TRUE;
}
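
/*
 * MWAIT hints as used here pack a C-state index and a sub-state into the
 * EAX hint value handed to MWAIT (architecturally, EAX[7:4] selects the
 * target C-state and EAX[3:0] the sub C-state); MWAIT_EAX_TO_CX() and
 * MWAIT_EAX_TO_CX_SUB() undo that packing so a hint can be validated
 * against the sub-state counts recorded in cpu_mwait_cx_info[].
 */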
static void
cpu_mwait_cx_no_bmsts(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
}

static void
cpu_mwait_cx_no_bmarb(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
}
static int
cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
{
	int old_cx_idx, sub = 0;

	if (hint >= 0) {
		old_cx_idx = MWAIT_EAX_TO_CX(hint);
		sub = MWAIT_EAX_TO_CX_SUB(hint);
	} else if (hint == CPU_MWAIT_HINT_AUTO) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
	} else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
	} else {
		old_cx_idx = CPU_MWAIT_CX_MAX;
	}

	if (!CPU_MWAIT_HAS_CX)
		strlcpy(name, "NONE", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
		strlcpy(name, "AUTO", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
		strlcpy(name, "AUTODEEP", namelen);
	else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
	    sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
		strlcpy(name, "INVALID", namelen);
	else
		ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);

	return old_cx_idx;
}
static int
cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
{
	int cx_idx, sub, hint;
	char *ptr, *start;

	if (allow_auto && strcmp(name, "AUTO") == 0) {
		hint = CPU_MWAIT_HINT_AUTO;
		cx_idx = CPU_MWAIT_C2;
		goto done;
	}
	if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
		hint = CPU_MWAIT_HINT_AUTODEEP;
		cx_idx = CPU_MWAIT_C3;
		goto done;
	}

	if (strlen(name) < 4 || toupper(name[0]) != 'C')
		return -1;
	start = name + 1;
	ptr = NULL;

	cx_idx = strtol(start, &ptr, 10);
	if (ptr == start || *ptr != '/')
		return -1;
	if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
		return -1;

	start = ptr + 1;
	ptr = NULL;

	sub = strtol(start, &ptr, 10);
	if (*ptr != '\0')
		return -1;
	if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return -1;

	hint = MWAIT_EAX_HINT(cx_idx, sub);
done:
	*hint0 = hint;
	return cx_idx;
}
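
/*
 * Example: "C1/0" parses to cx_idx 1, sub-state 0 and is converted back
 * to an MWAIT hint with MWAIT_EAX_HINT(); "AUTO" and "AUTODEEP" bypass
 * the parser and are treated as C2 and C3 respectively for transition
 * checks.
 */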
static int
cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
{
	if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)
		return EOPNOTSUPP;

	if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
		int error;

		error = cputimer_intr_powersave_addreq();
		if (error)
			return error;
	} else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
		cputimer_intr_powersave_remreq();
	}
	return 0;
}
static int
cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
    boolean_t allow_auto)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	hint = *hint0;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
	    allow_auto);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error)
		return error;

	*hint0 = hint;
	return 0;
}
static int
cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension"));

	hint = stat->hint;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	strlcpy(name, cx_name, sizeof(name));
	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error)
		return error;

	stat->hint = hint;
	return 0;
}
static int
cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	int hint = cpu_mwait_halt_global;
	int error, cx_idx, cpu;
	char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];

	cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	/* Save name for later per-cpu CX configuration */
	strlcpy(cx_name, name, sizeof(cx_name));

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	/* Change per-cpu CX configuration */
	for (cpu = 0; cpu < ncpus; ++cpu) {
		error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
		if (error)
			return error;
	}

	cpu_mwait_halt_global = hint;
	return 0;
}
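
/*
 * The sysctl handlers above and below all speak the same string format:
 * "Cx/y" for an explicit C-state/sub-state, "AUTO" or "AUTODEEP" where
 * adaptive selection is allowed, and "NONE" when MWAIT CX extensions are
 * unavailable.  The global handler applies the new setting to every cpu;
 * the per-cpu handler below adjusts a single cpu_idle_stat.
 */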
static int
cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct cpu_idle_stat *stat = arg1;
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &stat->hint, TRUE);
	return error;
}

static int
cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &cpu_mwait_spin, FALSE);
	return error;
}
/*
 * This manual debugging code is called unconditionally from Xtimer
 * (the per-cpu timer interrupt, whether the current thread is in a
 * critical section or not) and can be useful in tracking down lockups.
 *
 * NOTE: MANUAL DEBUG CODE
 */
static int saveticks[SMP_MAXCPU];
static int savecounts[SMP_MAXCPU];
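
/*
 * The routine below paints a per-cpu heartbeat directly into the VGA text
 * buffer (0xb8000, reached through the kernel map): one 80-column row per
 * cpu gets a spinning character followed by the interrupted %rip, the
 * current thread name and the tick count.  It also keeps a per-cpu tick
 * watchdog that panics if ticks stop advancing on this cpu or drift too
 * far on another.
 */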
void
pcpu_timer_always(struct intrframe *frame)
{
	globaldata_t gd = mycpu;
	int cpu = gd->gd_cpuid;
	char buf[64];
	short *gptr;
	int i;

	gptr = (short *)0xFFFFFFFF800b8000 + 80 * cpu;
	*gptr = ((*gptr + 1) & 0x00FF) | 0x0700;
	++gptr;

	ksnprintf(buf, sizeof(buf), " %p %16s %d %16s ",
	    (void *)frame->if_rip, gd->gd_curthread->td_comm, ticks,
	    gd->gd_infomsg);
	for (i = 0; buf[i]; ++i) {
		gptr[i] = 0x0700 | (unsigned char)buf[i];
	}

	if (saveticks[gd->gd_cpuid] != ticks) {
		saveticks[gd->gd_cpuid] = ticks;
		savecounts[gd->gd_cpuid] = 0;
	}
	++savecounts[gd->gd_cpuid];
	if (savecounts[gd->gd_cpuid] > 2000 && panicstr == NULL) {
		panic("cpud %d panicing on ticks failure",
		      gd->gd_cpuid);
	}
	for (i = 0; i < ncpus; ++i) {
		int delta;

		if (saveticks[i] && panicstr == NULL) {
			delta = saveticks[i] - ticks;
			if (delta < -10 || delta > 10) {
				panic("cpu %d panicing on cpu %d watchdog",