2 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
3 * Copyright (c) 1992 Terrence R. Lambert.
4 * Copyright (c) 2003 Peter Wemm.
5 * Copyright (c) 2008 The DragonFly Project.
8 * This code is derived from software contributed to Berkeley by
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the University of
22 * California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
40 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
43 #include "use_ether.h"
44 //#include "use_npx.h"
46 #include "opt_atalk.h"
47 #include "opt_compat.h"
50 #include "opt_directio.h"
53 #include "opt_msgbuf.h"
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/sysproto.h>
59 #include <sys/signalvar.h>
60 #include <sys/kernel.h>
61 #include <sys/linker.h>
62 #include <sys/malloc.h>
66 #include <sys/reboot.h>
68 #include <sys/msgbuf.h>
69 #include <sys/sysent.h>
70 #include <sys/sysctl.h>
71 #include <sys/vmmeter.h>
73 #include <sys/upcall.h>
74 #include <sys/usched.h>
78 #include <vm/vm_param.h>
80 #include <vm/vm_kern.h>
81 #include <vm/vm_object.h>
82 #include <vm/vm_page.h>
83 #include <vm/vm_map.h>
84 #include <vm/vm_pager.h>
85 #include <vm/vm_extern.h>
87 #include <sys/thread2.h>
88 #include <sys/mplock2.h>
96 #include <machine/cpu.h>
97 #include <machine/clock.h>
98 #include <machine/specialreg.h>
100 #include <machine/bootinfo.h>
102 #include <machine/md_var.h>
103 #include <machine/metadata.h>
104 #include <machine/pc/bios.h>
105 #include <machine/pcb_ext.h> /* pcb.h included via sys/user.h */
106 #include <machine/globaldata.h> /* CPU_prvspace */
107 #include <machine/smp.h>
109 #include <machine/perfmon.h>
111 #include <machine/cputypes.h>
114 #include <bus/isa/isa_device.h>
116 #include <machine_base/isa/intr_machdep.h>
117 #include <bus/isa/rtc.h>
118 #include <sys/random.h>
119 #include <sys/ptrace.h>
120 #include <machine/sigframe.h>
122 #define PHYSMAP_ENTRIES 10
124 extern void init386(int first
);
125 extern void dblfault_handler(void);
126 extern u_int64_t
hammer_time(u_int64_t
, u_int64_t
);
128 extern void printcpuinfo(void); /* XXX header file */
129 extern void identify_cpu(void);
131 extern void finishidentcpu(void);
133 extern void panicifcpuunsupported(void);
135 static void cpu_startup(void *);
136 #ifndef CPU_DISABLE_SSE
137 static void set_fpregs_xmm(struct save87
*, struct savexmm
*);
138 static void fill_fpregs_xmm(struct savexmm
*, struct save87
*);
139 #endif /* CPU_DISABLE_SSE */
141 extern void ffs_rawread_setup(void);
142 #endif /* DIRECTIO */
143 static void init_locks(void);
145 SYSINIT(cpu
, SI_BOOT2_SMP
, SI_ORDER_FIRST
, cpu_startup
, NULL
)
148 extern vm_offset_t ksym_start
, ksym_end
;
155 struct privatespace CPU_prvspace
[MAXCPU
];
157 int _udatasel
, _ucodesel
, _ucode32sel
;
160 int64_t tsc_offsets
[MAXCPU
];
162 int64_t tsc_offsets
[1];
165 #if defined(SWTCH_OPTIM_STATS)
166 extern int swtch_optim_stats
;
167 SYSCTL_INT(_debug
, OID_AUTO
, swtch_optim_stats
,
168 CTLFLAG_RD
, &swtch_optim_stats
, 0, "");
169 SYSCTL_INT(_debug
, OID_AUTO
, tlb_flush_count
,
170 CTLFLAG_RD
, &tlb_flush_count
, 0, "");
175 u_long ebda_addr
= 0;
/*
 * sysctl handler for hw.physmem: reports total physical memory in bytes
 * (ctob(physmem)) as a read-only integer.
 * NOTE(review): this extraction is missing interior lines (braces,
 * return statement) — comments only, code untouched.
 */
178 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS
)
180 int error
= sysctl_handle_int(oidp
, 0, ctob(physmem
), req
);
/* Register the handler under hw.physmem, read-only. */
184 SYSCTL_PROC(_hw
, HW_PHYSMEM
, physmem
, CTLTYPE_INT
|CTLFLAG_RD
,
185 0, 0, sysctl_hw_physmem
, "IU", "");
/*
 * sysctl handler for hw.usermem: reports memory potentially available to
 * userland — total physical memory minus wired pages
 * (ctob(physmem - vmstats.v_wire_count)).
 * NOTE(review): interior lines missing from this extraction; comments only.
 */
188 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS
)
190 int error
= sysctl_handle_int(oidp
, 0,
191 ctob(physmem
- vmstats
.v_wire_count
), req
);
/* Register the handler under hw.usermem, read-only. */
195 SYSCTL_PROC(_hw
, HW_USERMEM
, usermem
, CTLTYPE_INT
|CTLFLAG_RD
,
196 0, 0, sysctl_hw_usermem
, "IU", "");
/*
 * sysctl handler for hw.availpages: reports the number of pages between
 * avail_start and avail_end (x86_64_btop converts bytes to pages).
 * NOTE(review): interior lines missing from this extraction; comments only.
 */
199 sysctl_hw_availpages(SYSCTL_HANDLER_ARGS
)
201 int error
= sysctl_handle_int(oidp
, 0,
202 x86_64_btop(avail_end
- avail_start
), req
);
/* Register the handler under hw.availpages (dynamic OID), read-only. */
206 SYSCTL_PROC(_hw
, OID_AUTO
, availpages
, CTLTYPE_INT
|CTLFLAG_RD
,
207 0, 0, sysctl_hw_availpages
, "I", "");
213 * The number of PHYSMAP entries must be one less than the number of
214 * PHYSSEG entries because the PHYSMAP entry that spans the largest
215 * physical address that is accessible by ISA DMA is split into two
218 #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
220 vm_paddr_t phys_avail
[PHYSMAP_SIZE
+ 2];
221 vm_paddr_t dump_avail
[PHYSMAP_SIZE
+ 2];
223 /* must be 2 less so 0 0 can signal end of chunks */
224 #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
225 #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
227 static vm_offset_t buffer_sva
, buffer_eva
;
228 vm_offset_t clean_sva
, clean_eva
;
229 static vm_offset_t pager_sva
, pager_eva
;
230 static struct trapframe proc0_tf
;
/*
 * cpu_startup - machine-dependent boot-time initialization (SYSINIT hook).
 * Prints the kernel version and memory statistics, sizes and allocates the
 * buffer cache in two valloc() passes, carves the clean/buffer/pager submaps
 * out of kernel_map, initializes the pager buffers, and starts the APs.
 * NOTE(review): this extraction is missing many interior source lines
 * (original numbering gaps) — comments only, code left byte-identical.
 */
233 cpu_startup(void *dummy
)
237 vm_offset_t firstaddr
;
/* Announce the kernel version when booting verbose (and normally too). */
239 if (boothowto
& RB_VERBOSE
)
243 * Good {morning,afternoon,evening,night}.
245 kprintf("%s", version
);
248 panicifcpuunsupported();
252 kprintf("real memory = %ju (%ju MB)\n",
254 (intmax_t)Realmem
/ 1024 / 1024);
256 * Display any holes after the first chunk of extended memory.
/* Walk phys_avail[] in (start, end) pairs; a zero end terminates. */
261 kprintf("Physical memory chunk(s):\n");
262 for (indx
= 0; phys_avail
[indx
+ 1] != 0; indx
+= 2) {
263 vm_paddr_t size1
= phys_avail
[indx
+ 1] - phys_avail
[indx
];
265 kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
266 (intmax_t)phys_avail
[indx
],
267 (intmax_t)phys_avail
[indx
+ 1] - 1,
269 (intmax_t)(size1
/ PAGE_SIZE
));
274 * Allocate space for system data structures.
275 * The first available kernel virtual address is in "v".
276 * As pages of kernel virtual memory are allocated, "v" is incremented.
277 * As pages of memory are allocated and cleared,
278 * "firstaddr" is incremented.
279 * An index into the kernel page table corresponding to the
280 * virtual memory address maintained in "v" is kept in "mapaddr".
284 * Make two passes. The first pass calculates how much memory is
285 * needed and allocates it. The second pass assigns virtual
286 * addresses to the various data structures.
290 v
= (caddr_t
)firstaddr
;
/* valloc/valloclim: carve a typed array out of the linear "v" cursor. */
292 #define valloc(name, type, num) \
293 (name) = (type *)v; v = (caddr_t)((name)+(num))
294 #define valloclim(name, type, num, lim) \
295 (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
298 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
299 * For the first 64MB of ram nominally allocate sufficient buffers to
300 * cover 1/4 of our ram. Beyond the first 64MB allocate additional
301 * buffers to cover 1/20 of our ram over 64MB. When auto-sizing
302 * the buffer cache we limit the eventual kva reservation to
305 * factor represents the 1/4 x ram conversion.
308 int factor
= 4 * BKVASIZE
/ 1024;
309 int kbytes
= physmem
* (PAGE_SIZE
/ 1024);
313 nbuf
+= min((kbytes
- 4096) / factor
, 65536 / factor
);
315 nbuf
+= (kbytes
- 65536) * 2 / (factor
* 5);
/* Honor a maxbcache tunable as a hard cap on the buffer cache size. */
316 if (maxbcache
&& nbuf
> maxbcache
/ BKVASIZE
)
317 nbuf
= maxbcache
/ BKVASIZE
;
321 * Do not allow the buffer_map to be more then 1/2 the size of the
324 if (nbuf
> (virtual_end
- virtual_start
) / (BKVASIZE
* 2)) {
325 nbuf
= (virtual_end
- virtual_start
) / (BKVASIZE
* 2);
326 kprintf("Warning: nbufs capped at %d\n", nbuf
);
/* Swap buffers: between 16 and 256, nominally nbuf/4. */
329 nswbuf
= max(min(nbuf
/4, 256), 16);
331 if (nswbuf
< NSWBUF_MIN
)
338 valloc(swbuf
, struct buf
, nswbuf
);
339 valloc(buf
, struct buf
, nbuf
);
342 * End of first pass, size has been calculated so allocate memory
/* First pass (firstaddr == 0): compute "size" and kmem_alloc the arena. */
344 if (firstaddr
== 0) {
345 size
= (vm_size_t
)(v
- firstaddr
);
346 firstaddr
= kmem_alloc(&kernel_map
, round_page(size
));
348 panic("startup: no room for tables");
353 * End of second pass, addresses have been assigned
/* Both passes must consume exactly the same amount of space. */
355 if ((vm_size_t
)(v
- firstaddr
) != size
)
356 panic("startup: table size inconsistency");
/* clean_map covers buffer cache + pageout I/O; sub-maps carved below. */
358 kmem_suballoc(&kernel_map
, &clean_map
, &clean_sva
, &clean_eva
,
359 (nbuf
*BKVASIZE
) + (nswbuf
*MAXPHYS
) + pager_map_size
);
360 kmem_suballoc(&clean_map
, &buffer_map
, &buffer_sva
, &buffer_eva
,
362 buffer_map
.system_map
= 1;
363 kmem_suballoc(&clean_map
, &pager_map
, &pager_sva
, &pager_eva
,
364 (nswbuf
*MAXPHYS
) + pager_map_size
);
365 pager_map
.system_map
= 1;
367 #if defined(USERCONFIG)
369 cninit(); /* the preferred console may have changed */
372 kprintf("avail memory = %ju (%ju MB)\n",
373 (uintmax_t)ptoa(vmstats
.v_free_count
),
374 (uintmax_t)ptoa(vmstats
.v_free_count
) / 1024 / 1024);
377 * Set up buffers, so they can be used to read disk labels.
380 vm_pager_bufferinit();
384 * OK, enough kmem_alloc/malloc state should be up, lets get on with it!
386 mp_start(); /* fire up the APs and APICs */
393 * Send an interrupt to process.
395 * Stack is set up to allow sigcode stored
396 * at top to call routine, followed by kcall
397 * to sigreturn routine below. After sigreturn
398 * resets the signal mask, the stack, and the
399 * frame pointer, it returns to the user
/*
 * sendsig - deliver a signal to the current lwp.
 * Builds a struct sigframe (saved ucontext + optional siginfo) on either
 * the alternate signal stack or the normal user stack (below the 128-byte
 * red zone), copies it out, and rewrites the trapframe so that userland
 * resumes in the signal trampoline/handler with the SysV AMD64 argument
 * registers (%rdi, %rsi, %rdx, %rcx) loaded.
 * NOTE(review): this extraction is missing interior source lines (original
 * numbering gaps, e.g. closing braces and some statements) — comments only.
 */
403 sendsig(sig_t catcher
, int sig
, sigset_t
*mask
, u_long code
)
405 struct lwp
*lp
= curthread
->td_lwp
;
406 struct proc
*p
= lp
->lwp_proc
;
407 struct trapframe
*regs
;
408 struct sigacts
*psp
= p
->p_sigacts
;
409 struct sigframe sf
, *sfp
;
413 regs
= lp
->lwp_md
.md_regs
;
/* Remember whether we are already running on the alternate stack. */
414 oonstack
= (lp
->lwp_sigstk
.ss_flags
& SS_ONSTACK
) ? 1 : 0;
416 /* Save user context */
417 bzero(&sf
, sizeof(struct sigframe
));
418 sf
.sf_uc
.uc_sigmask
= *mask
;
419 sf
.sf_uc
.uc_stack
= lp
->lwp_sigstk
;
420 sf
.sf_uc
.uc_mcontext
.mc_onstack
= oonstack
;
/* mc_rdi must be the first trapframe field for this bcopy to be valid. */
421 KKASSERT(__offsetof(struct trapframe
, tf_rdi
) == 0);
422 bcopy(regs
, &sf
.sf_uc
.uc_mcontext
.mc_rdi
, sizeof(struct trapframe
));
424 /* Make the size of the saved context visible to userland */
425 sf
.sf_uc
.uc_mcontext
.mc_len
= sizeof(sf
.sf_uc
.uc_mcontext
);
427 /* Save mailbox pending state for syscall interlock semantics */
428 if (p
->p_flag
& P_MAILBOX
)
429 sf
.sf_uc
.uc_mcontext
.mc_xflags
|= PGEX_MAILBOX
;
431 /* Allocate and validate space for the signal handler context. */
/* Use the alternate stack only if enabled, not already on it, and the
 * signal was configured with SA_ONSTACK. */
432 if ((lp
->lwp_flag
& LWP_ALTSTACK
) != 0 && !oonstack
&&
433 SIGISMEMBER(psp
->ps_sigonstack
, sig
)) {
434 sp
= (char *)(lp
->lwp_sigstk
.ss_sp
+ lp
->lwp_sigstk
.ss_size
-
435 sizeof(struct sigframe
));
436 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
438 /* We take red zone into account */
439 sp
= (char *)regs
->tf_rsp
- sizeof(struct sigframe
) - 128;
442 /* Align to 16 bytes */
443 sfp
= (struct sigframe
*)((intptr_t)sp
& ~0xFUL
);
445 /* Translate the signal is appropriate */
/* Emulation ABIs (sv_sigtbl) may renumber signals for the target OS. */
446 if (p
->p_sysent
->sv_sigtbl
) {
447 if (sig
<= p
->p_sysent
->sv_sigsize
)
448 sig
= p
->p_sysent
->sv_sigtbl
[_SIG_IDX(sig
)];
452 * Build the argument list for the signal handler.
454 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
456 regs
->tf_rdi
= sig
; /* argument 1 */
457 regs
->tf_rdx
= (register_t
)&sfp
->sf_uc
; /* argument 3 */
459 if (SIGISMEMBER(psp
->ps_siginfo
, sig
)) {
461 * Signal handler installed with SA_SIGINFO.
463 * action(signo, siginfo, ucontext)
465 regs
->tf_rsi
= (register_t
)&sfp
->sf_si
; /* argument 2 */
466 regs
->tf_rcx
= (register_t
)regs
->tf_addr
; /* argument 4 */
467 sf
.sf_ahu
.sf_action
= (__siginfohandler_t
*)catcher
;
469 /* fill siginfo structure */
470 sf
.sf_si
.si_signo
= sig
;
471 sf
.sf_si
.si_code
= code
;
472 sf
.sf_si
.si_addr
= (void *)regs
->tf_addr
;
475 * Old FreeBSD-style arguments.
477 * handler (signo, code, [uc], addr)
479 regs
->tf_rsi
= (register_t
)code
; /* argument 2 */
480 regs
->tf_rcx
= (register_t
)regs
->tf_addr
; /* argument 4 */
481 sf
.sf_ahu
.sf_handler
= catcher
;
485 * If we're a vm86 process, we want to save the segment registers.
486 * We also change eflags to be our emulated eflags, not the actual
/* NOTE(review): this vm86 branch looks like i386 legacy carried into the
 * x86_64 file (tf_eflags vs tf_rflags elsewhere) — confirm upstream. */
490 if (regs
->tf_eflags
& PSL_VM
) {
491 struct trapframe_vm86
*tf
= (struct trapframe_vm86
*)regs
;
492 struct vm86_kernel
*vm86
= &lp
->lwp_thread
->td_pcb
->pcb_ext
->ext_vm86
;
494 sf
.sf_uc
.uc_mcontext
.mc_gs
= tf
->tf_vm86_gs
;
495 sf
.sf_uc
.uc_mcontext
.mc_fs
= tf
->tf_vm86_fs
;
496 sf
.sf_uc
.uc_mcontext
.mc_es
= tf
->tf_vm86_es
;
497 sf
.sf_uc
.uc_mcontext
.mc_ds
= tf
->tf_vm86_ds
;
499 if (vm86
->vm86_has_vme
== 0)
500 sf
.sf_uc
.uc_mcontext
.mc_eflags
=
501 (tf
->tf_eflags
& ~(PSL_VIF
| PSL_VIP
)) |
502 (vm86
->vm86_eflags
& (PSL_VIF
| PSL_VIP
));
505 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
506 * syscalls made by the signal handler. This just avoids
507 * wasting time for our lazy fixup of such faults. PSL_NT
508 * does nothing in vm86 mode, but vm86 programs can set it
509 * almost legitimately in probes for old cpu types.
511 tf
->tf_eflags
&= ~(PSL_VM
| PSL_NT
| PSL_VIF
| PSL_VIP
);
516 * Save the FPU state and reinit the FP unit
518 npxpush(&sf
.sf_uc
.uc_mcontext
);
521 * Copy the sigframe out to the user's stack.
523 if (copyout(&sf
, sfp
, sizeof(struct sigframe
)) != 0) {
525 * Something is wrong with the stack pointer.
526 * ...Kill the process.
/* Point the resume state at the sigframe and the signal trampoline
 * (sigcode lives just below PS_STRINGS). */
531 regs
->tf_rsp
= (register_t
)sfp
;
532 regs
->tf_rip
= PS_STRINGS
- *(p
->p_sysent
->sv_szsigcode
);
535 * i386 abi specifies that the direction flag must be cleared
538 regs
->tf_rflags
&= ~(PSL_T
|PSL_D
);
541 * 64 bit mode has a code and stack selector but
542 * no data or extra selector. %fs and %gs are not
545 regs
->tf_cs
= _ucodesel
;
546 regs
->tf_ss
= _udatasel
;
550 * Sanitize the trapframe for a virtual kernel passing control to a custom
551 * VM context. Remove any items that would otherwise create a privilage
554 * XXX at the moment we allow userland to set the resume flag. Is this a
/*
 * cpu_sanitize_frame - scrub a trapframe supplied by a virtual kernel
 * before handing control to the custom VM context: force user code/stack
 * selectors, strip all rflags bits except RF/user-changeable/VM, and force
 * the reserved-default bit and interrupts enabled (PSL_I) back on so the
 * frame cannot grant extra privilege.
 * NOTE(review): extraction is missing interior lines (braces); comments only.
 */
558 cpu_sanitize_frame(struct trapframe
*frame
)
560 frame
->tf_cs
= _ucodesel
;
561 frame
->tf_ss
= _udatasel
;
562 /* XXX VM (8086) mode not supported? */
563 frame
->tf_rflags
&= (PSL_RF
| PSL_USERCHANGE
| PSL_VM_UNSUPP
);
564 frame
->tf_rflags
|= PSL_RESERVED_DEFAULT
| PSL_I
;
570 * Sanitize the tls so loading the descriptor does not blow up
571 * on us. For x86_64 we don't have to do anything.
/*
 * cpu_sanitize_tls - sanitize a saved TLS descriptor set before loading.
 * Per the comment above in the file, x86_64 needs no work here; the body
 * (not visible in this extraction) is presumably trivial — TODO confirm.
 */
574 cpu_sanitize_tls(struct savetls
*tls
)
580 * sigreturn(ucontext_t *sigcntxp)
582 * System call to cleanup state after a signal
583 * has been taken. Reset signal mask and
584 * stack state from context left by sendsig (above).
585 * Return to previous pc and psl as specified by
586 * context left by sendsig. Check carefully to
587 * make sure that the user has not modified the
588 * state to gain improper privileges.
592 #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
593 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
/*
 * sys_sigreturn - system call that restores lwp state from the ucontext
 * written by sendsig(). Copies the ucontext into kernel space first so
 * userland cannot race our validation, then checks that the restored
 * rflags (EFL_SECURE) and %cs (CS_SECURE) cannot elevate privilege before
 * installing the trapframe, FPU state, sigaltstack flag, and signal mask.
 * NOTE(review): this extraction is missing interior source lines (local
 * declarations for uc/ucp/rflags/cs, returns, braces) — comments only.
 */
596 sys_sigreturn(struct sigreturn_args
*uap
)
598 struct lwp
*lp
= curthread
->td_lwp
;
599 struct proc
*p
= lp
->lwp_proc
;
600 struct trapframe
*regs
;
608 * We have to copy the information into kernel space so userland
609 * can't modify it while we are sniffing it.
611 regs
= lp
->lwp_md
.md_regs
;
612 error
= copyin(uap
->sigcntxp
, &uc
, sizeof(uc
));
616 rflags
= ucp
->uc_mcontext
.mc_rflags
;
618 /* VM (8086) mode not supported */
619 rflags
&= ~PSL_VM_UNSUPP
;
/* NOTE(review): the PSL_VM/vm86 branch below appears to be dead i386
 * legacy in the x86_64 file (eflags vs rflags) — confirm upstream. */
622 if (eflags
& PSL_VM
) {
623 struct trapframe_vm86
*tf
= (struct trapframe_vm86
*)regs
;
624 struct vm86_kernel
*vm86
;
627 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
628 * set up the vm86 area, and we can't enter vm86 mode.
630 if (lp
->lwp_thread
->td_pcb
->pcb_ext
== 0)
632 vm86
= &lp
->lwp_thread
->td_pcb
->pcb_ext
->ext_vm86
;
633 if (vm86
->vm86_inited
== 0)
636 /* go back to user mode if both flags are set */
637 if ((eflags
& PSL_VIP
) && (eflags
& PSL_VIF
))
638 trapsignal(lp
, SIGBUS
, 0);
640 if (vm86
->vm86_has_vme
) {
641 eflags
= (tf
->tf_eflags
& ~VME_USERCHANGE
) |
642 (eflags
& VME_USERCHANGE
) | PSL_VM
;
644 vm86
->vm86_eflags
= eflags
; /* save VIF, VIP */
645 eflags
= (tf
->tf_eflags
& ~VM_USERCHANGE
) |
646 (eflags
& VM_USERCHANGE
) | PSL_VM
;
648 bcopy(&ucp
->uc_mcontext
.mc_gs
, tf
, sizeof(struct trapframe
));
649 tf
->tf_eflags
= eflags
;
650 tf
->tf_vm86_ds
= tf
->tf_ds
;
651 tf
->tf_vm86_es
= tf
->tf_es
;
652 tf
->tf_vm86_fs
= tf
->tf_fs
;
653 tf
->tf_vm86_gs
= tf
->tf_gs
;
654 tf
->tf_ds
= _udatasel
;
655 tf
->tf_es
= _udatasel
;
656 tf
->tf_fs
= _udatasel
;
657 tf
->tf_gs
= _udatasel
;
662 * Don't allow users to change privileged or reserved flags.
665 * XXX do allow users to change the privileged flag PSL_RF.
666 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers
667 * should sometimes set it there too. tf_eflags is kept in
668 * the signal context during signal handling and there is no
669 * other place to remember it, so the PSL_RF bit may be
670 * corrupted by the signal handler without us knowing.
671 * Corruption of the PSL_RF bit at worst causes one more or
672 * one less debugger trap, so allowing it is fairly harmless.
674 if (!EFL_SECURE(rflags
& ~PSL_RF
, regs
->tf_rflags
& ~PSL_RF
)) {
675 kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags
);
680 * Don't allow users to load a valid privileged %cs. Let the
681 * hardware check for invalid selectors, excess privilege in
682 * other selectors, invalid %eip's and invalid %esp's.
684 cs
= ucp
->uc_mcontext
.mc_cs
;
685 if (!CS_SECURE(cs
)) {
686 kprintf("sigreturn: cs = 0x%x\n", cs
);
687 trapsignal(lp
, SIGBUS
, T_PROTFLT
);
/* Validation passed: install the whole register set at once. */
690 bcopy(&ucp
->uc_mcontext
.mc_rdi
, regs
, sizeof(struct trapframe
));
694 * Restore the FPU state from the frame
697 npxpop(&ucp
->uc_mcontext
);
700 * Merge saved signal mailbox pending flag to maintain interlock
701 * semantics against system calls.
703 if (ucp
->uc_mcontext
.mc_xflags
& PGEX_MAILBOX
)
704 p
->p_flag
|= P_MAILBOX
;
/* Restore the sigaltstack on/off state saved by sendsig(). */
706 if (ucp
->uc_mcontext
.mc_onstack
& 1)
707 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
709 lp
->lwp_sigstk
.ss_flags
&= ~SS_ONSTACK
;
/* Restore the signal mask, never allowing SIGKILL/SIGSTOP to be masked. */
711 lp
->lwp_sigmask
= ucp
->uc_sigmask
;
712 SIG_CANTMASK(lp
->lwp_sigmask
);
718 * Stack frame on entry to function. %rax will contain the function vector,
719 * %rcx will contain the function data. flags, rcx, and rax will have
720 * already been pushed on the stack.
/*
 * sendupcall - post a userland upcall to the current lwp.
 * Reads the user's struct upcall and critical-section count; if the user
 * is already pending or in a critical section, only the pending flag is
 * propagated. Otherwise an upc_frame (rax/rcx/rdx/rflags/rip) is pushed
 * onto the user stack and the trapframe is redirected to vu->vu_ctx with
 * the function vector in %rax and its data in %rcx.
 * NOTE(review): this extraction is missing interior source lines (braces,
 * returns, crit_count declaration) — comments only, code untouched.
 */
731 sendupcall(struct vmupcall
*vu
, int morepending
)
733 struct lwp
*lp
= curthread
->td_lwp
;
734 struct trapframe
*regs
;
735 struct upcall upcall
;
736 struct upc_frame upc_frame
;
740 * If we are a virtual kernel running an emulated user process
741 * context, switch back to the virtual kernel context before
742 * trying to post the signal.
744 if (lp
->lwp_vkernel
&& lp
->lwp_vkernel
->ve
) {
745 lp
->lwp_md
.md_regs
->tf_trapno
= 0;
746 vkernel_trap(lp
, lp
->lwp_md
.md_regs
);
750 * Get the upcall data structure
/* Both copyins must succeed; a failure means a bogus user pointer. */
752 if (copyin(lp
->lwp_upcall
, &upcall
, sizeof(upcall
)) ||
753 copyin((char *)upcall
.upc_uthread
+ upcall
.upc_critoff
, &crit_count
, sizeof(int))
756 kprintf("bad upcall address\n");
761 * If the data structure is already marked pending or has a critical
762 * section count, mark the data structure as pending and return
763 * without doing an upcall. vu_pending is left set.
765 if (upcall
.upc_pending
|| crit_count
>= vu
->vu_pending
) {
766 if (upcall
.upc_pending
< vu
->vu_pending
) {
767 upcall
.upc_pending
= vu
->vu_pending
;
768 copyout(&upcall
.upc_pending
, &lp
->lwp_upcall
->upc_pending
,
769 sizeof(upcall
.upc_pending
));
775 * We can run this upcall now, clear vu_pending.
777 * Bump our critical section count and set or clear the
778 * user pending flag depending on whether more upcalls are
779 * pending. The user will be responsible for calling
780 * upc_dispatch(-1) to process remaining upcalls.
783 upcall
.upc_pending
= morepending
;
785 copyout(&upcall
.upc_pending
, &lp
->lwp_upcall
->upc_pending
,
786 sizeof(upcall
.upc_pending
));
787 copyout(&crit_count
, (char *)upcall
.upc_uthread
+ upcall
.upc_critoff
,
791 * Construct a stack frame and issue the upcall
/* Save the registers the upcall will clobber so fetchupcall can restore. */
793 regs
= lp
->lwp_md
.md_regs
;
794 upc_frame
.rax
= regs
->tf_rax
;
795 upc_frame
.rcx
= regs
->tf_rcx
;
796 upc_frame
.rdx
= regs
->tf_rdx
;
797 upc_frame
.flags
= regs
->tf_rflags
;
798 upc_frame
.oldip
= regs
->tf_rip
;
799 if (copyout(&upc_frame
, (void *)(regs
->tf_rsp
- sizeof(upc_frame
)),
800 sizeof(upc_frame
)) != 0) {
801 kprintf("bad stack on upcall\n");
/* Redirect userland: %rax = function, %rcx = data, %rdx = upcall ptr,
 * %rip = upcall context trampoline, stack dropped below the frame. */
803 regs
->tf_rax
= (register_t
)vu
->vu_func
;
804 regs
->tf_rcx
= (register_t
)vu
->vu_data
;
805 regs
->tf_rdx
= (register_t
)lp
->lwp_upcall
;
806 regs
->tf_rip
= (register_t
)vu
->vu_ctx
;
807 regs
->tf_rsp
-= sizeof(upc_frame
);
812 * fetchupcall occurs in the context of a system call, which means that
813 * we have to return EJUSTRETURN in order to prevent eax and edx from
814 * being overwritten by the syscall return value.
816 * if vu is not NULL we return the new context in %edx, the new data in %ecx,
817 * and the function pointer in %eax.
/*
 * fetchupcall - called from the upc_dispatch path in syscall context.
 * With a non-NULL vu: chain directly into the next pending upcall
 * (load %rax/%rcx/%rdx/%rip/%rsp for it). With vu == NULL: pop the
 * upc_frame previously pushed by sendupcall() off the user stack and
 * restore the interrupted register state, merging only user-changeable
 * rflags bits (PSL_USERCHANGE).
 * NOTE(review): this extraction is missing interior source lines (braces,
 * crit_count declaration, the EJUSTRETURN return path) — comments only.
 */
820 fetchupcall(struct vmupcall
*vu
, int morepending
, void *rsp
)
822 struct upc_frame upc_frame
;
823 struct lwp
*lp
= curthread
->td_lwp
;
824 struct trapframe
*regs
;
826 struct upcall upcall
;
829 regs
= lp
->lwp_md
.md_regs
;
/* Publish whether more upcalls remain to the user's pending flag. */
831 error
= copyout(&morepending
, &lp
->lwp_upcall
->upc_pending
, sizeof(int));
835 * This jumps us to the next ready context.
838 error
= copyin(lp
->lwp_upcall
, &upcall
, sizeof(upcall
));
841 error
= copyin((char *)upcall
.upc_uthread
+ upcall
.upc_critoff
, &crit_count
, sizeof(int));
844 error
= copyout(&crit_count
, (char *)upcall
.upc_uthread
+ upcall
.upc_critoff
, sizeof(int));
845 regs
->tf_rax
= (register_t
)vu
->vu_func
;
846 regs
->tf_rcx
= (register_t
)vu
->vu_data
;
847 regs
->tf_rdx
= (register_t
)lp
->lwp_upcall
;
848 regs
->tf_rip
= (register_t
)vu
->vu_ctx
;
849 regs
->tf_rsp
= (register_t
)rsp
;
852 * This returns us to the originally interrupted code.
854 error
= copyin(rsp
, &upc_frame
, sizeof(upc_frame
));
855 regs
->tf_rax
= upc_frame
.rax
;
856 regs
->tf_rcx
= upc_frame
.rcx
;
857 regs
->tf_rdx
= upc_frame
.rdx
;
/* Only user-changeable flag bits may come back from the saved frame. */
858 regs
->tf_rflags
= (regs
->tf_rflags
& ~PSL_USERCHANGE
) |
859 (upc_frame
.flags
& PSL_USERCHANGE
);
860 regs
->tf_rip
= upc_frame
.oldip
;
861 regs
->tf_rsp
= (register_t
)((char *)rsp
+ sizeof(upc_frame
));
870 * Machine dependent boot() routine
872 * I haven't seen anything to put here yet
873 * Possibly some stuff might be grafted back here from boot()
881 * Shutdown the CPU as much as possible
887 __asm__
__volatile("hlt");
891 * cpu_idle() represents the idle LWKT. You cannot return from this function
892 * (unless you want to blow things up!). Instead we look for runnable threads
893 * and loop or halt as appropriate. Giant is not held on entry to the thread.
895 * The main loop is entered with a critical section held, we must release
896 * the critical section before doing anything else. lwkt_switch() will
897 * check for pending interrupts due to entering and exiting its own
900 * Note on cpu_idle_hlt: On an SMP system we rely on a scheduler IPI
901 * to wake a HLTed cpu up. However, there are cases where the idlethread
902 * will be entered with the possibility that no IPI will occur and in such
903 * cases lwkt_switch() sets TDF_IDLE_NOHLT.
905 static int cpu_idle_hlt
= 1;
906 static int cpu_idle_hltcnt
;
907 static int cpu_idle_spincnt
;
908 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_hlt
, CTLFLAG_RW
,
909 &cpu_idle_hlt
, 0, "Idle loop HLT enable");
910 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_hltcnt
, CTLFLAG_RW
,
911 &cpu_idle_hltcnt
, 0, "Idle loop entry halts");
912 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_spincnt
, CTLFLAG_RW
,
913 &cpu_idle_spincnt
, 0, "Idle loop entry spins");
/*
 * cpu_idle_default_hook - default idle action: atomically re-enable
 * interrupts and halt ("sti; hlt" back-to-back so an interrupt cannot
 * slip in between and leave us halted with work pending).
 * NOTE(review): extraction missing interior lines (braces); comments only.
 */
916 cpu_idle_default_hook(void)
919 * We must guarentee that hlt is exactly the instruction
922 __asm
__volatile("sti; hlt");
925 /* Other subsystems (e.g., ACPI) can hook this later. */
926 void (*cpu_idle_hook
)(void) = cpu_idle_default_hook
;
/*
 * NOTE(review): interior of cpu_idle() — the function header line is not
 * visible in this extraction, and interior lines are missing; comments only.
 * Idle loop: when halting is enabled and nothing is runnable, disable
 * interrupts, re-check runnability, then invoke the idle hook (sti;hlt);
 * otherwise spin with interrupts enabled, servicing cpu contention.
 */
931 struct thread
*td
= curthread
;
/* The idle thread must not hold a critical section here. */
934 KKASSERT(td
->td_critcount
== 0);
937 * See if there are any LWKTs ready to go.
942 * If we are going to halt call splz unconditionally after
943 * CLIing to catch any interrupt races. Note that we are
944 * at SPL0 and interrupts are enabled.
946 if (cpu_idle_hlt
&& !lwkt_runnable() &&
947 (td
->td_flags
& TDF_IDLE_NOHLT
) == 0) {
948 __asm
__volatile("cli");
/* Re-check under cli: an IPI may have made a thread runnable. */
950 if (!lwkt_runnable())
954 handle_cpu_contention_mask();
958 td
->td_flags
&= ~TDF_IDLE_NOHLT
;
961 __asm
__volatile("sti");
962 handle_cpu_contention_mask();
964 __asm
__volatile("sti");
974 * This routine is called when the only runnable threads require
975 * the MP lock, and the scheduler couldn't get it. On a real cpu
976 * we let the scheduler spin.
/*
 * handle_cpu_contention_mask - invoked from the idle loop when runnable
 * threads need the MP lock. Snapshots cpu_contention_mask and compares the
 * lowest set bit (bsfl) against our cpuid; the action taken when another
 * cpu is the designated holder is on lines missing from this extraction.
 */
979 handle_cpu_contention_mask(void)
983 mask
= cpu_contention_mask
;
985 if (mask
&& bsfl(mask
) != mycpu
->gd_cpuid
)
990 * This routine is called if a spinlock has been held through the
991 * exponential backoff period and is seriously contested. On a real cpu
995 cpu_spinlock_contested(void)
1003 * Clear registers on exec
/*
 * exec_setregs - reset the current lwp's register state for a freshly
 * exec'd image: zero the trapframe, set %rip to the entry point, align
 * and set the stack, pass argv in %rdi and ps_strings in %rbx, install
 * user selectors, clear stale debug registers and FPU/MSR state.
 * NOTE(review): this extraction is missing interior source lines (braces,
 * dr0-dr6 clears, local declarations) — comments only, code untouched.
 */
1006 exec_setregs(u_long entry
, u_long stack
, u_long ps_strings
)
1008 struct thread
*td
= curthread
;
1009 struct lwp
*lp
= td
->td_lwp
;
1010 struct pcb
*pcb
= td
->td_pcb
;
1011 struct trapframe
*regs
= lp
->lwp_md
.md_regs
;
1013 /* was i386_user_cleanup() in NetBSD */
1016 bzero((char *)regs
, sizeof(struct trapframe
));
1017 regs
->tf_rip
= entry
;
/* 16-byte align per the amd64 ABI, then re-bias by 8 for the call frame. */
1018 regs
->tf_rsp
= ((stack
- 8) & ~0xFul
) + 8; /* align the stack */
1019 regs
->tf_rdi
= stack
; /* argv */
/* Preserve only the trace flag across exec; everything else from PSL_USER. */
1020 regs
->tf_rflags
= PSL_USER
| (regs
->tf_rflags
& PSL_T
);
1021 regs
->tf_ss
= _udatasel
;
1022 regs
->tf_cs
= _ucodesel
;
1023 regs
->tf_rbx
= ps_strings
;
1026 * Reset the hardware debug registers if they were in use.
1027 * They won't have any meaning for the newly exec'd process.
1029 if (pcb
->pcb_flags
& PCB_DBREGS
) {
1035 pcb
->pcb_dr7
= 0; /* JG set bit 10? */
1036 if (pcb
== td
->td_pcb
) {
1038 * Clear the debug registers on the running
1039 * CPU, otherwise they will end up affecting
1040 * the next process we switch to.
1044 pcb
->pcb_flags
&= ~PCB_DBREGS
;
1048 * Initialize the math emulator (if any) for the current process.
1049 * Actually, just clear the bit that says that the emulator has
1050 * been initialized. Initialization is delayed until the process
1051 * traps to the emulator (if it is done at all) mainly because
1052 * emulators don't provide an entry point for initialization.
1054 pcb
->pcb_flags
&= ~FP_SOFTFP
;
1057 * NOTE: do not set CR0_TS here. npxinit() must do it after clearing
1058 * gd_npxthread. Otherwise a preemptive interrupt thread
1059 * may panic in npxdna().
1062 load_cr0(rcr0() | CR0_MP
);
1065 * NOTE: The MSR values must be correct so we can return to
1066 * userland. gd_user_fs/gs must be correct so the switch
1067 * code knows what the current MSR values are.
1069 pcb
->pcb_fsbase
= 0; /* Values loaded from PCB on switch */
1070 pcb
->pcb_gsbase
= 0;
1071 mdcpu
->gd_user_fs
= 0; /* Cache of current MSR values */
1072 mdcpu
->gd_user_gs
= 0;
1073 wrmsr(MSR_FSBASE
, 0); /* Set MSR values for return to userland */
1074 wrmsr(MSR_KGSBASE
, 0);
1076 /* Initialize the npx (if any) for the current process. */
1077 npxinit(__INITIAL_NPXCW__
);
/* Seed the saved user segment registers with the user data selector. */
1080 pcb
->pcb_ds
= _udatasel
;
1081 pcb
->pcb_es
= _udatasel
;
1082 pcb
->pcb_fs
= _udatasel
;
1083 pcb
->pcb_gs
= _udatasel
;
/*
 * NOTE(review): fragment of a separate routine whose header line is not
 * visible in this extraction (original lines 1092-1094; presumably
 * cpu_setregs() — confirm upstream). It ORs the standard kernel CR0 bits:
 * numeric-error reporting, monitor-coprocessor/task-switched, and
 * write-protect + alignment-mask.
 */
1092 cr0
|= CR0_NE
; /* Done by npxinit() */
1093 cr0
|= CR0_MP
| CR0_TS
; /* Done at every execve() too. */
1094 cr0
|= CR0_WP
| CR0_AM
;
/*
 * sysctl handler for machdep.adjkerntz: read/write the kernel/CMOS clock
 * offset via sysctl_handle_int; on a successful write (req->newptr set)
 * additional action is taken on a line missing from this extraction
 * (presumably resettodr() — TODO confirm upstream).
 */
1100 sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS
)
1103 error
= sysctl_handle_int(oidp
, oidp
->oid_arg1
, oidp
->oid_arg2
,
1105 if (!error
&& req
->newptr
)
/* Register the handler under machdep.adjkerntz, read-write. */
1110 SYSCTL_PROC(_machdep
, CPU_ADJKERNTZ
, adjkerntz
, CTLTYPE_INT
|CTLFLAG_RW
,
1111 &adjkerntz
, 0, sysctl_machdep_adjkerntz
, "I", "");
1113 SYSCTL_INT(_machdep
, CPU_DISRTCSET
, disable_rtc_set
,
1114 CTLFLAG_RW
, &disable_rtc_set
, 0, "");
1117 SYSCTL_STRUCT(_machdep
, CPU_BOOTINFO
, bootinfo
,
1118 CTLFLAG_RD
, &bootinfo
, bootinfo
, "");
1121 SYSCTL_INT(_machdep
, CPU_WALLCLOCK
, wall_cmos_clock
,
1122 CTLFLAG_RW
, &wall_cmos_clock
, 0, "");
1124 extern u_long bootdev
; /* not a cdev_t - encoding is different */
1125 SYSCTL_ULONG(_machdep
, OID_AUTO
, guessed_bootdev
,
1126 CTLFLAG_RD
, &bootdev
, 0, "Boot device (not in cdev_t format)");
1129 * Initialize 386 and configure to run kernel
1133 * Initialize segments & interrupt table
1137 struct user_segment_descriptor gdt
[NGDT
* MAXCPU
]; /* global descriptor table */
1138 static struct gate_descriptor idt0
[NIDT
];
1139 struct gate_descriptor
*idt
= &idt0
[0]; /* interrupt descriptor table */
1141 union descriptor ldt
[NLDT
]; /* local descriptor table */
1144 /* table descriptors - used to load tables by cpu */
1145 struct region_descriptor r_gdt
, r_idt
;
1147 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
1148 extern int has_f00f_bug
;
1151 static char dblfault_stack
[PAGE_SIZE
] __aligned(16);
1153 /* JG proc0paddr is a virtual address */
1156 char proc0paddr_buff
[LWKT_THREAD_STACK
];
1159 /* software prototypes -- in more palatable form */
/*
 * gdt_segs[] - software prototypes for the global descriptor table,
 * converted to hardware descriptors at init time. Order matches the
 * G*_SEL selector indexes noted per entry (null, kernel code/data,
 * 32-bit user code, user data, 64-bit user code, proc0 TSS (double-size
 * system descriptor, hence the extra slot), and 32-bit user %gs).
 * Each field is annotated inline by the original authors.
 */
1160 struct soft_segment_descriptor gdt_segs
[] = {
1161 /* GNULL_SEL 0 Null Descriptor */
1162 { 0x0, /* segment base address */
1164 0, /* segment type */
1165 0, /* segment descriptor priority level */
1166 0, /* segment descriptor present */
1168 0, /* default 32 vs 16 bit size */
1169 0 /* limit granularity (byte/page units)*/ },
1170 /* GCODE_SEL 1 Code Descriptor for kernel */
1171 { 0x0, /* segment base address */
1172 0xfffff, /* length - all address space */
1173 SDT_MEMERA
, /* segment type */
1174 SEL_KPL
, /* segment descriptor priority level */
1175 1, /* segment descriptor present */
1177 0, /* default 32 vs 16 bit size */
1178 1 /* limit granularity (byte/page units)*/ },
1179 /* GDATA_SEL 2 Data Descriptor for kernel */
1180 { 0x0, /* segment base address */
1181 0xfffff, /* length - all address space */
1182 SDT_MEMRWA
, /* segment type */
1183 SEL_KPL
, /* segment descriptor priority level */
1184 1, /* segment descriptor present */
1186 0, /* default 32 vs 16 bit size */
1187 1 /* limit granularity (byte/page units)*/ },
1188 /* GUCODE32_SEL 3 32 bit Code Descriptor for user */
1189 { 0x0, /* segment base address */
1190 0xfffff, /* length - all address space */
1191 SDT_MEMERA
, /* segment type */
1192 SEL_UPL
, /* segment descriptor priority level */
1193 1, /* segment descriptor present */
1195 1, /* default 32 vs 16 bit size */
1196 1 /* limit granularity (byte/page units)*/ },
1197 /* GUDATA_SEL 4 32/64 bit Data Descriptor for user */
1198 { 0x0, /* segment base address */
1199 0xfffff, /* length - all address space */
1200 SDT_MEMRWA
, /* segment type */
1201 SEL_UPL
, /* segment descriptor priority level */
1202 1, /* segment descriptor present */
1204 1, /* default 32 vs 16 bit size */
1205 1 /* limit granularity (byte/page units)*/ },
1206 /* GUCODE_SEL 5 64 bit Code Descriptor for user */
1207 { 0x0, /* segment base address */
1208 0xfffff, /* length - all address space */
1209 SDT_MEMERA
, /* segment type */
1210 SEL_UPL
, /* segment descriptor priority level */
1211 1, /* segment descriptor present */
1213 0, /* default 32 vs 16 bit size */
1214 1 /* limit granularity (byte/page units)*/ },
1215 /* GPROC0_SEL 6 Proc 0 Tss Descriptor */
1217 0x0, /* segment base address */
1218 sizeof(struct x86_64tss
)-1,/* length - all address space */
1219 SDT_SYSTSS
, /* segment type */
1220 SEL_KPL
, /* segment descriptor priority level */
1221 1, /* segment descriptor present */
1223 0, /* unused - default 32 vs 16 bit size */
1224 0 /* limit granularity (byte/page units)*/ },
1225 /* Actually, the TSS is a system descriptor which is double size */
1226 { 0x0, /* segment base address */
1228 0, /* segment type */
1229 0, /* segment descriptor priority level */
1230 0, /* segment descriptor present */
1232 0, /* default 32 vs 16 bit size */
1233 0 /* limit granularity (byte/page units)*/ },
1234 /* GUGS32_SEL 8 32 bit GS Descriptor for user */
1235 { 0x0, /* segment base address */
1236 0xfffff, /* length - all address space */
1237 SDT_MEMRWA
, /* segment type */
1238 SEL_UPL
, /* segment descriptor priority level */
1239 1, /* segment descriptor present */
1241 1, /* default 32 vs 16 bit size */
1242 1 /* limit granularity (byte/page units)*/ },
1246 setidt(int idx
, inthand_t
*func
, int typ
, int dpl
, int ist
)
1248 struct gate_descriptor
*ip
;
1251 ip
->gd_looffset
= (uintptr_t)func
;
1252 ip
->gd_selector
= GSEL(GCODE_SEL
, SEL_KPL
);
1258 ip
->gd_hioffset
= ((uintptr_t)func
)>>16 ;
1261 #define IDTVEC(name) __CONCAT(X,name)
1264 IDTVEC(div
), IDTVEC(dbg
), IDTVEC(nmi
), IDTVEC(bpt
), IDTVEC(ofl
),
1265 IDTVEC(bnd
), IDTVEC(ill
), IDTVEC(dna
), IDTVEC(fpusegm
),
1266 IDTVEC(tss
), IDTVEC(missing
), IDTVEC(stk
), IDTVEC(prot
),
1267 IDTVEC(page
), IDTVEC(mchk
), IDTVEC(rsvd
), IDTVEC(fpu
), IDTVEC(align
),
1268 IDTVEC(xmm
), IDTVEC(dblfault
),
1269 IDTVEC(fast_syscall
), IDTVEC(fast_syscall32
);
1271 #ifdef DEBUG_INTERRUPTS
1272 extern inthand_t
*Xrsvdary
[256];
/*
 * sdtossd: unpack a hardware user_segment_descriptor (*sd) into the
 * software soft_segment_descriptor form (*ssd).  The split base
 * (hibase:lobase) and limit (hilimit:lolimit) fields are recombined into
 * single values; the attribute bits are copied one-for-one.
 * NOTE(review): the function's return type and braces were elided by the
 * extraction; the visible statements are preserved byte-identical.
 */
1276 sdtossd(struct user_segment_descriptor
*sd
, struct soft_segment_descriptor
*ssd
)
/* base: sd_hibase supplies bits 24 and up, sd_lobase bits 0-23 */
1278 ssd
->ssd_base
= (sd
->sd_hibase
<< 24) | sd
->sd_lobase
;
/* limit: sd_hilimit supplies bits 16-19, sd_lolimit bits 0-15 */
1279 ssd
->ssd_limit
= (sd
->sd_hilimit
<< 16) | sd
->sd_lolimit
;
/* copy the descriptor attribute bits straight across */
1280 ssd
->ssd_type
= sd
->sd_type
;
1281 ssd
->ssd_dpl
= sd
->sd_dpl
;
1282 ssd
->ssd_p
= sd
->sd_p
;
1283 ssd
->ssd_def32
= sd
->sd_def32
;
1284 ssd
->ssd_gran
= sd
->sd_gran
;
/*
 * ssdtosd: pack a software soft_segment_descriptor (*ssd) back into a
 * hardware user_segment_descriptor (*sd).  Inverse of sdtossd(): the
 * unified base/limit values are split into their hardware lo/hi fields
 * (masks select the field widths: 24-bit lobase, 8-bit hibase, 16-bit
 * lolimit, 4-bit hilimit) and the attribute bits are copied across,
 * including sd_long (the 64-bit code-segment bit, which sdtossd() does
 * not carry).
 * NOTE(review): return type and braces elided by the extraction.
 */
1288 ssdtosd(struct soft_segment_descriptor
*ssd
, struct user_segment_descriptor
*sd
)
/* split the 32-bit base into lobase (bits 0-23) and hibase (24-31) */
1291 sd
->sd_lobase
= (ssd
->ssd_base
) & 0xffffff;
1292 sd
->sd_hibase
= (ssd
->ssd_base
>> 24) & 0xff;
/* split the 20-bit limit into lolimit (bits 0-15) and hilimit (16-19) */
1293 sd
->sd_lolimit
= (ssd
->ssd_limit
) & 0xffff;
1294 sd
->sd_hilimit
= (ssd
->ssd_limit
>> 16) & 0xf;
/* attribute bits copied one-for-one */
1295 sd
->sd_type
= ssd
->ssd_type
;
1296 sd
->sd_dpl
= ssd
->ssd_dpl
;
1297 sd
->sd_p
= ssd
->ssd_p
;
1298 sd
->sd_long
= ssd
->ssd_long
;
1299 sd
->sd_def32
= ssd
->ssd_def32
;
1300 sd
->sd_gran
= ssd
->ssd_gran
;
/*
 * ssdtosyssd: pack a software soft_segment_descriptor (*ssd) into a
 * hardware system_segment_descriptor (*sd).  Like ssdtosd(), but for
 * system descriptors (e.g. the TSS): note the 0xfffffffffful (40-bit)
 * mask on sd_hibase -- system descriptors carry a wider base than user
 * segment descriptors -- and the absence of sd_long/sd_def32.
 * NOTE(review): return type and braces elided by the extraction.
 */
1304 ssdtosyssd(struct soft_segment_descriptor
*ssd
,
1305 struct system_segment_descriptor
*sd
)
/* lobase holds base bits 0-23; hibase holds the remaining (40-bit) part */
1308 sd
->sd_lobase
= (ssd
->ssd_base
) & 0xffffff;
1309 sd
->sd_hibase
= (ssd
->ssd_base
>> 24) & 0xfffffffffful
;
/* split the 20-bit limit into lolimit (bits 0-15) and hilimit (16-19) */
1310 sd
->sd_lolimit
= (ssd
->ssd_limit
) & 0xffff;
1311 sd
->sd_hilimit
= (ssd
->ssd_limit
>> 16) & 0xf;
/* attribute bits copied one-for-one */
1312 sd
->sd_type
= ssd
->ssd_type
;
1313 sd
->sd_dpl
= ssd
->ssd_dpl
;
1314 sd
->sd_p
= ssd
->ssd_p
;
1315 sd
->sd_gran
= ssd
->ssd_gran
;
1321 * Populate the (physmap) array with base/bound pairs describing the
1322 * available physical memory in the system, then test this memory and
1323 * build the phys_avail array describing the actually-available memory.
1325 * If we cannot accurately determine the physical memory map, then use
1326 * value from the 0xE801 call, and failing that, the RTC.
1328 * Total memory size may be set by the kernel environment variable
1329 * hw.physmem or the compile-time define MAXMEM.
1331 * XXX first should be vm_paddr_t.
1334 getmemsize(caddr_t kmdp
, u_int64_t first
)
1336 int i
, off
, physmap_idx
, pa_indx
, da_indx
;
1337 vm_paddr_t pa
, physmap
[PHYSMAP_SIZE
];
1338 u_long physmem_tunable
;
1340 struct bios_smap
*smapbase
, *smap
, *smapend
;
1342 quad_t dcons_addr
, dcons_size
;
1344 bzero(physmap
, sizeof(physmap
));
1349 * get memory map from INT 15:E820, kindly supplied by the loader.
1351 * subr_module.c says:
1352 * "Consumer may safely assume that size value precedes data."
1353 * ie: an int32_t immediately precedes smap.
1355 smapbase
= (struct bios_smap
*)preload_search_info(kmdp
,
1356 MODINFO_METADATA
| MODINFOMD_SMAP
);
1357 if (smapbase
== NULL
)
1358 panic("No BIOS smap info from loader!");
1360 smapsize
= *((u_int32_t
*)smapbase
- 1);
1361 smapend
= (struct bios_smap
*)((uintptr_t)smapbase
+ smapsize
);
1363 for (smap
= smapbase
; smap
< smapend
; smap
++) {
1364 if (boothowto
& RB_VERBOSE
)
1365 kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
1366 smap
->type
, smap
->base
, smap
->length
);
1368 if (smap
->type
!= SMAP_TYPE_MEMORY
)
1371 if (smap
->length
== 0)
1374 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1375 if (smap
->base
< physmap
[i
+ 1]) {
1376 if (boothowto
& RB_VERBOSE
) {
1377 kprintf("Overlapping or non-monotonic "
1378 "memory region, ignoring "
1384 Realmem
+= smap
->length
;
1386 if (smap
->base
== physmap
[physmap_idx
+ 1]) {
1387 physmap
[physmap_idx
+ 1] += smap
->length
;
1392 if (physmap_idx
== PHYSMAP_SIZE
) {
1393 kprintf("Too many segments in the physical "
1394 "address map, giving up\n");
1397 physmap
[physmap_idx
] = smap
->base
;
1398 physmap
[physmap_idx
+ 1] = smap
->base
+ smap
->length
;
1402 * Find the 'base memory' segment for SMP
1405 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1406 if (physmap
[i
] == 0x00000000) {
1407 basemem
= physmap
[i
+ 1] / 1024;
1412 panic("BIOS smap did not include a basemem segment!");
1415 /* make hole for AP bootstrap code */
1416 physmap
[1] = mp_bootaddress(physmap
[1] / 1024);
1418 /* Save EBDA address, if any */
1419 ebda_addr
= (u_long
)(*(u_short
*)(KERNBASE
+ 0x40e));
1424 * Maxmem isn't the "maximum memory", it's one larger than the
1425 * highest page of the physical address space. It should be
1426 * called something like "Maxphyspage". We may adjust this
1427 * based on ``hw.physmem'' and the results of the memory test.
1429 Maxmem
= atop(physmap
[physmap_idx
+ 1]);
1432 Maxmem
= MAXMEM
/ 4;
1435 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable
))
1436 Maxmem
= atop(physmem_tunable
);
1439 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
1442 if (Maxmem
> atop(physmap
[physmap_idx
+ 1]))
1443 Maxmem
= atop(physmap
[physmap_idx
+ 1]);
1445 if (atop(physmap
[physmap_idx
+ 1]) != Maxmem
&&
1446 (boothowto
& RB_VERBOSE
))
1447 kprintf("Physical memory use set to %ldK\n", Maxmem
* 4);
1449 /* call pmap initialization to make new kernel address space */
1450 pmap_bootstrap(&first
);
1453 * Size up each available chunk of physical memory.
1455 physmap
[0] = PAGE_SIZE
; /* mask off page 0 */
1458 phys_avail
[pa_indx
++] = physmap
[0];
1459 phys_avail
[pa_indx
] = physmap
[0];
1460 dump_avail
[da_indx
] = physmap
[0];
1464 * Get dcons buffer address
1466 if (kgetenv_quad("dcons.addr", &dcons_addr
) == 0 ||
1467 kgetenv_quad("dcons.size", &dcons_size
) == 0)
1471 * physmap is in bytes, so when converting to page boundaries,
1472 * round up the start address and round down the end address.
1474 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1477 end
= ptoa((vm_paddr_t
)Maxmem
);
1478 if (physmap
[i
+ 1] < end
)
1479 end
= trunc_page(physmap
[i
+ 1]);
1480 for (pa
= round_page(physmap
[i
]); pa
< end
; pa
+= PAGE_SIZE
) {
1481 int tmp
, page_bad
, full
;
1482 int *ptr
= (int *)CADDR1
;
1486 * block out kernel memory as not available.
1488 if (pa
>= 0x100000 && pa
< first
)
1492 * block out dcons buffer
1495 && pa
>= trunc_page(dcons_addr
)
1496 && pa
< dcons_addr
+ dcons_size
)
1502 * map page into kernel: valid, read/write,non-cacheable
1504 *pte
= pa
| PG_V
| PG_RW
| PG_N
;
1509 * Test for alternating 1's and 0's
1511 *(volatile int *)ptr
= 0xaaaaaaaa;
1512 if (*(volatile int *)ptr
!= 0xaaaaaaaa)
1515 * Test for alternating 0's and 1's
1517 *(volatile int *)ptr
= 0x55555555;
1518 if (*(volatile int *)ptr
!= 0x55555555)
1523 *(volatile int *)ptr
= 0xffffffff;
1524 if (*(volatile int *)ptr
!= 0xffffffff)
1529 *(volatile int *)ptr
= 0x0;
1530 if (*(volatile int *)ptr
!= 0x0)
1533 * Restore original value.
1538 * Adjust array of valid/good pages.
1540 if (page_bad
== TRUE
)
1543 * If this good page is a continuation of the
1544 * previous set of good pages, then just increase
1545 * the end pointer. Otherwise start a new chunk.
1546 * Note that "end" points one past the last valid address,
1547 * making the range >= start and < end.
1548 * If we're also doing a speculative memory
1549 * test and we are at or past the end, bump up Maxmem
1550 * so that we keep going. The first bad page
1551 * will terminate the loop.
1553 if (phys_avail
[pa_indx
] == pa
) {
1554 phys_avail
[pa_indx
] += PAGE_SIZE
;
1557 if (pa_indx
== PHYS_AVAIL_ARRAY_END
) {
1559 "Too many holes in the physical address space, giving up\n");
1564 phys_avail
[pa_indx
++] = pa
; /* start */
1565 phys_avail
[pa_indx
] = pa
+ PAGE_SIZE
; /* end */
1569 if (dump_avail
[da_indx
] == pa
) {
1570 dump_avail
[da_indx
] += PAGE_SIZE
;
1573 if (da_indx
== DUMP_AVAIL_ARRAY_END
) {
1577 dump_avail
[da_indx
++] = pa
; /* start */
1578 dump_avail
[da_indx
] = pa
+ PAGE_SIZE
; /* end */
1590 * The last chunk must contain at least one page plus the message
1591 * buffer to avoid complicating other code (message buffer address
1592 * calculation, etc.).
1594 while (phys_avail
[pa_indx
- 1] + PAGE_SIZE
+
1595 round_page(MSGBUF_SIZE
) >= phys_avail
[pa_indx
]) {
1596 physmem
-= atop(phys_avail
[pa_indx
] - phys_avail
[pa_indx
- 1]);
1597 phys_avail
[pa_indx
--] = 0;
1598 phys_avail
[pa_indx
--] = 0;
1601 Maxmem
= atop(phys_avail
[pa_indx
]);
1603 /* Trim off space for the message buffer. */
1604 phys_avail
[pa_indx
] -= round_page(MSGBUF_SIZE
);
1606 avail_end
= phys_avail
[pa_indx
];
1608 /* Map the message buffer. */
1609 for (off
= 0; off
< round_page(MSGBUF_SIZE
); off
+= PAGE_SIZE
)
1610 pmap_kenter((vm_offset_t
)msgbufp
+ off
, phys_avail
[pa_indx
] +
1623 * 7 Device Not Available (x87)
1625 * 9 Coprocessor Segment overrun (unsupported, reserved)
1627 * 11 Segment not present
1629 * 13 General Protection
1632 * 16 x87 FP Exception pending
1633 * 17 Alignment Check
1635 * 19 SIMD floating point
1637 * 32-255 INTn/external sources
1640 hammer_time(u_int64_t modulep
, u_int64_t physfree
)
1645 int metadata_missing
, off
;
1647 struct mdglobaldata
*gd
;
1651 * Prevent lowering of the ipl if we call tsleep() early.
1653 gd
= &CPU_prvspace
[0].mdglobaldata
;
1654 bzero(gd
, sizeof(*gd
));
1657 * Note: on both UP and SMP curthread must be set non-NULL
1658 * early in the boot sequence because the system assumes
1659 * that 'curthread' is never NULL.
1662 gd
->mi
.gd_curthread
= &thread0
;
1663 thread0
.td_gd
= &gd
->mi
;
1665 atdevbase
= ISA_HOLE_START
+ PTOV_OFFSET
;
1668 metadata_missing
= 0;
1669 if (bootinfo
.bi_modulep
) {
1670 preload_metadata
= (caddr_t
)bootinfo
.bi_modulep
+ KERNBASE
;
1671 preload_bootstrap_relocate(KERNBASE
);
1673 metadata_missing
= 1;
1675 if (bootinfo
.bi_envp
)
1676 kern_envp
= (caddr_t
)bootinfo
.bi_envp
+ KERNBASE
;
1679 preload_metadata
= (caddr_t
)(uintptr_t)(modulep
+ PTOV_OFFSET
);
1680 preload_bootstrap_relocate(PTOV_OFFSET
);
1681 kmdp
= preload_search_by_type("elf kernel");
1683 kmdp
= preload_search_by_type("elf64 kernel");
1684 boothowto
= MD_FETCH(kmdp
, MODINFOMD_HOWTO
, int);
1685 kern_envp
= MD_FETCH(kmdp
, MODINFOMD_ENVP
, char *) + PTOV_OFFSET
;
1687 ksym_start
= MD_FETCH(kmdp
, MODINFOMD_SSYM
, uintptr_t);
1688 ksym_end
= MD_FETCH(kmdp
, MODINFOMD_ESYM
, uintptr_t);
1692 * start with one cpu. Note: with one cpu, ncpus2_shift, ncpus2_mask,
1693 * and ncpus_fit_mask remain 0.
1698 /* Init basic tunables, hz etc */
1702 * make gdt memory segments
1704 gdt_segs
[GPROC0_SEL
].ssd_base
=
1705 (uintptr_t) &CPU_prvspace
[0].mdglobaldata
.gd_common_tss
;
1707 gd
->mi
.gd_prvspace
= &CPU_prvspace
[0];
1709 for (x
= 0; x
< NGDT
; x
++) {
1710 if (x
!= GPROC0_SEL
&& x
!= (GPROC0_SEL
+ 1))
1711 ssdtosd(&gdt_segs
[x
], &gdt
[x
]);
1713 ssdtosyssd(&gdt_segs
[GPROC0_SEL
],
1714 (struct system_segment_descriptor
*)&gdt
[GPROC0_SEL
]);
1716 r_gdt
.rd_limit
= NGDT
* sizeof(gdt
[0]) - 1;
1717 r_gdt
.rd_base
= (long) gdt
;
1720 wrmsr(MSR_FSBASE
, 0); /* User value */
1721 wrmsr(MSR_GSBASE
, (u_int64_t
)&gd
->mi
);
1722 wrmsr(MSR_KGSBASE
, 0); /* User value while in the kernel */
1724 mi_gdinit(&gd
->mi
, 0);
1726 proc0paddr
= proc0paddr_buff
;
1727 mi_proc0init(&gd
->mi
, proc0paddr
);
1728 safepri
= TDPRI_MAX
;
1730 /* spinlocks and the BGL */
1734 for (x
= 0; x
< NIDT
; x
++)
1735 setidt(x
, &IDTVEC(rsvd
), SDT_SYSIGT
, SEL_KPL
, 0);
1736 setidt(IDT_DE
, &IDTVEC(div
), SDT_SYSIGT
, SEL_KPL
, 0);
1737 setidt(IDT_DB
, &IDTVEC(dbg
), SDT_SYSIGT
, SEL_KPL
, 0);
1738 setidt(IDT_NMI
, &IDTVEC(nmi
), SDT_SYSIGT
, SEL_KPL
, 1);
1739 setidt(IDT_BP
, &IDTVEC(bpt
), SDT_SYSIGT
, SEL_UPL
, 0);
1740 setidt(IDT_OF
, &IDTVEC(ofl
), SDT_SYSIGT
, SEL_KPL
, 0);
1741 setidt(IDT_BR
, &IDTVEC(bnd
), SDT_SYSIGT
, SEL_KPL
, 0);
1742 setidt(IDT_UD
, &IDTVEC(ill
), SDT_SYSIGT
, SEL_KPL
, 0);
1743 setidt(IDT_NM
, &IDTVEC(dna
), SDT_SYSIGT
, SEL_KPL
, 0);
1744 setidt(IDT_DF
, &IDTVEC(dblfault
), SDT_SYSIGT
, SEL_KPL
, 1);
1745 setidt(IDT_FPUGP
, &IDTVEC(fpusegm
), SDT_SYSIGT
, SEL_KPL
, 0);
1746 setidt(IDT_TS
, &IDTVEC(tss
), SDT_SYSIGT
, SEL_KPL
, 0);
1747 setidt(IDT_NP
, &IDTVEC(missing
), SDT_SYSIGT
, SEL_KPL
, 0);
1748 setidt(IDT_SS
, &IDTVEC(stk
), SDT_SYSIGT
, SEL_KPL
, 0);
1749 setidt(IDT_GP
, &IDTVEC(prot
), SDT_SYSIGT
, SEL_KPL
, 0);
1750 setidt(IDT_PF
, &IDTVEC(page
), SDT_SYSIGT
, SEL_KPL
, 0);
1751 setidt(IDT_MF
, &IDTVEC(fpu
), SDT_SYSIGT
, SEL_KPL
, 0);
1752 setidt(IDT_AC
, &IDTVEC(align
), SDT_SYSIGT
, SEL_KPL
, 0);
1753 setidt(IDT_MC
, &IDTVEC(mchk
), SDT_SYSIGT
, SEL_KPL
, 0);
1754 setidt(IDT_XF
, &IDTVEC(xmm
), SDT_SYSIGT
, SEL_KPL
, 0);
1756 r_idt
.rd_limit
= sizeof(idt0
) - 1;
1757 r_idt
.rd_base
= (long) idt
;
1761 * Initialize the console before we print anything out.
1766 if (metadata_missing
)
1767 kprintf("WARNING: loader(8) metadata is missing!\n");
1777 if (boothowto
& RB_KDB
)
1778 Debugger("Boot flags requested debugger");
1782 finishidentcpu(); /* Final stage of CPU initialization */
1783 setidt(6, &IDTVEC(ill
), SDT_SYS386IGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1784 setidt(13, &IDTVEC(prot
), SDT_SYS386IGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1786 identify_cpu(); /* Final stage of CPU initialization */
1787 initializecpu(); /* Initialize CPU registers */
1789 /* make an initial tss so cpu can get interrupt stack on syscall! */
1790 gd
->gd_common_tss
.tss_rsp0
=
1791 (register_t
)(thread0
.td_kstack
+
1792 KSTACK_PAGES
* PAGE_SIZE
- sizeof(struct pcb
));
1793 /* Ensure the stack is aligned to 16 bytes */
1794 gd
->gd_common_tss
.tss_rsp0
&= ~(register_t
)0xF;
1795 gd
->gd_rsp0
= gd
->gd_common_tss
.tss_rsp0
;
1797 /* doublefault stack space, runs on ist1 */
1798 gd
->gd_common_tss
.tss_ist1
= (long)&dblfault_stack
[sizeof(dblfault_stack
)];
1800 /* Set the IO permission bitmap (empty due to tss seg limit) */
1801 gd
->gd_common_tss
.tss_iobase
= sizeof(struct x86_64tss
);
1803 gsel_tss
= GSEL(GPROC0_SEL
, SEL_KPL
);
1804 gd
->gd_tss_gdt
= &gdt
[GPROC0_SEL
];
1805 gd
->gd_common_tssd
= *gd
->gd_tss_gdt
;
1808 /* Set up the fast syscall stuff */
1809 msr
= rdmsr(MSR_EFER
) | EFER_SCE
;
1810 wrmsr(MSR_EFER
, msr
);
1811 wrmsr(MSR_LSTAR
, (u_int64_t
)IDTVEC(fast_syscall
));
1812 wrmsr(MSR_CSTAR
, (u_int64_t
)IDTVEC(fast_syscall32
));
1813 msr
= ((u_int64_t
)GSEL(GCODE_SEL
, SEL_KPL
) << 32) |
1814 ((u_int64_t
)GSEL(GUCODE32_SEL
, SEL_UPL
) << 48);
1815 wrmsr(MSR_STAR
, msr
);
1816 wrmsr(MSR_SF_MASK
, PSL_NT
|PSL_T
|PSL_I
|PSL_C
|PSL_D
);
1818 getmemsize(kmdp
, physfree
);
1819 init_param2(physmem
);
1821 /* now running on new page tables, configured, and u/iom is accessible */
1823 /* Map the message buffer. */
1825 for (off
= 0; off
< round_page(MSGBUF_SIZE
); off
+= PAGE_SIZE
)
1826 pmap_kenter((vm_offset_t
)msgbufp
+ off
, avail_end
+ off
);
1829 msgbufinit(msgbufp
, MSGBUF_SIZE
);
1832 /* transfer to user mode */
1834 _ucodesel
= GSEL(GUCODE_SEL
, SEL_UPL
);
1835 _udatasel
= GSEL(GUDATA_SEL
, SEL_UPL
);
1836 _ucode32sel
= GSEL(GUCODE32_SEL
, SEL_UPL
);
1842 /* setup proc 0's pcb */
1843 thread0
.td_pcb
->pcb_flags
= 0;
1844 thread0
.td_pcb
->pcb_cr3
= KPML4phys
;
1845 thread0
.td_pcb
->pcb_ext
= 0;
1846 lwp0
.lwp_md
.md_regs
= &proc0_tf
;
1848 /* Location of kernel stack for locore */
1849 return ((u_int64_t
)thread0
.td_pcb
);
1853 * Initialize machine-dependent portions of the global data structure.
1854 * Note that the global data area and cpu0's idlestack in the private
1855 * data space were allocated in locore.
1857 * Note: the idlethread's cpl is 0
1859 * WARNING! Called from early boot, 'mycpu' may not work yet.
/*
 * cpu_gdinit: machine-dependent setup of a cpu's globaldata, primarily
 * the per-cpu idle thread.  The visible statements: point gd_curthread
 * at the idle thread, initialize the idle thread on the idlestack in the
 * cpu's private space, name it "idle_%d", install cpu_lwkt_switch as its
 * switch function, and push cpu_idle_restore onto its stack so the first
 * switch into it "returns" into the idle loop.
 * NOTE(review): return type, braces, and the tail arguments of the
 * lwkt_init_thread() call (original line 1870) were elided.
 */
1862 cpu_gdinit(struct mdglobaldata
*gd
, int cpu
)
/* curthread must point at the idle thread before any scheduling occurs */
1865 gd
->mi
.gd_curthread
= &gd
->mi
.gd_idlethread
;
/* initialize the idle thread on this cpu's private-space idle stack */
1867 lwkt_init_thread(&gd
->mi
.gd_idlethread
,
1868 gd
->mi
.gd_prvspace
->idlestack
,
1869 sizeof(gd
->mi
.gd_prvspace
->idlestack
),
1871 lwkt_set_comm(&gd
->mi
.gd_idlethread
, "idle_%d", cpu
);
1872 gd
->mi
.gd_idlethread
.td_switch
= cpu_lwkt_switch
;
/* push cpu_idle_restore as the initial "return address" on td_sp */
1873 gd
->mi
.gd_idlethread
.td_sp
-= sizeof(void *);
1874 *(void **)gd
->mi
.gd_idlethread
.td_sp
= cpu_idle_restore
;
/*
 * is_globaldata_space: test whether the range [saddr, eaddr) lies
 * entirely within the per-cpu private data area CPU_prvspace[0] ..
 * CPU_prvspace[MAXCPU].
 * NOTE(review): the return statements and braces were elided by the
 * extraction; presumably the visible branch returns true and the
 * fall-through returns false -- confirm against the full source.
 */
1878 is_globaldata_space(vm_offset_t saddr
, vm_offset_t eaddr
)
1880 if (saddr
>= (vm_offset_t
)&CPU_prvspace
[0] &&
1881 eaddr
<= (vm_offset_t
)&CPU_prvspace
[MAXCPU
]) {
/*
 * globaldata_find: return the globaldata structure for the given logical
 * cpu.  Asserts the cpu id is in range [0, ncpus), then returns the
 * address of the mi (machine-independent) portion of that cpu's private
 * space.
 * NOTE(review): return type and braces elided by the extraction.
 */
1888 globaldata_find(int cpu
)
1890 KKASSERT(cpu
>= 0 && cpu
< ncpus
);
1891 return(&CPU_prvspace
[cpu
].mdglobaldata
.mi
);
1894 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
1895 static void f00f_hack(void *unused
);
1896 SYSINIT(f00f_hack
, SI_BOOT2_BIOS
, SI_ORDER_ANY
, f00f_hack
, NULL
);
1899 f00f_hack(void *unused
)
1901 struct gate_descriptor
*new_idt
;
1907 kprintf("Intel Pentium detected, installing workaround for F00F bug\n");
1909 r_idt
.rd_limit
= sizeof(idt0
) - 1;
1911 tmp
= kmem_alloc(&kernel_map
, PAGE_SIZE
* 2);
1913 panic("kmem_alloc returned 0");
1914 if (((unsigned int)tmp
& (PAGE_SIZE
-1)) != 0)
1915 panic("kmem_alloc returned non-page-aligned memory");
1916 /* Put the first seven entries in the lower page */
1917 new_idt
= (struct gate_descriptor
*)(tmp
+ PAGE_SIZE
- (7*8));
1918 bcopy(idt
, new_idt
, sizeof(idt0
));
1919 r_idt
.rd_base
= (int)new_idt
;
1922 if (vm_map_protect(&kernel_map
, tmp
, tmp
+ PAGE_SIZE
,
1923 VM_PROT_READ
, FALSE
) != KERN_SUCCESS
)
1924 panic("vm_map_protect failed");
1927 #endif /* defined(I586_CPU) && !NO_F00F_HACK */
/*
 * ptrace_set_pc: set the traced lwp's program counter by storing addr
 * into the saved trapframe's %rip; the new value takes effect when the
 * lwp returns to user mode.
 * NOTE(review): return type and braces elided by the extraction.
 */
1930 ptrace_set_pc(struct lwp
*lp
, unsigned long addr
)
1932 lp
->lwp_md
.md_regs
->tf_rip
= addr
;
/*
 * ptrace_single_step: arm single-step for the traced lwp by setting the
 * trace flag (PSL_T) in the saved trapframe's rflags; the CPU will trap
 * after the next user-mode instruction.
 * NOTE(review): return type and braces elided by the extraction.
 */
1937 ptrace_single_step(struct lwp
*lp
)
1939 lp
->lwp_md
.md_regs
->tf_rflags
|= PSL_T
;
/*
 * fill_regs: copy the lwp's saved trapframe registers out into *regs
 * (the ptrace/core-dump register structure).  The single bcopy starting
 * at tf_rdi assumes struct trapframe (from tf_rdi on) and struct reg
 * (from r_rdi on) have identical layout for sizeof(*regs) bytes --
 * inherited from the original code; confirm against the struct
 * definitions before changing either.
 * NOTE(review): return type and braces elided by the extraction.
 */
1944 fill_regs(struct lwp
*lp
, struct reg
*regs
)
1946 struct trapframe
*tp
;
1948 tp
= lp
->lwp_md
.md_regs
;
1949 bcopy(&tp
->tf_rdi
, &regs
->r_rdi
, sizeof(*regs
));
/*
 * set_regs: copy user-supplied register values from *regs into the
 * lwp's saved trapframe.  Before copying, the new rflags and %cs are
 * vetted with EFL_SECURE()/CS_SECURE() so a debugger cannot install
 * privileged flag bits or a kernel code selector.
 * NOTE(review): the rejection branch's body (original line 1961,
 * presumably an error return) and the function braces were elided by
 * the extraction.  The bcopy makes the same trapframe/struct-reg layout
 * assumption as fill_regs().
 */
1954 set_regs(struct lwp
*lp
, struct reg
*regs
)
1956 struct trapframe
*tp
;
1958 tp
= lp
->lwp_md
.md_regs
;
/* refuse insecure rflags or code-selector values from userland */
1959 if (!EFL_SECURE(regs
->r_rflags
, tp
->tf_rflags
) ||
1960 !CS_SECURE(regs
->r_cs
))
1962 bcopy(&regs
->r_rdi
, &tp
->tf_rdi
, sizeof(*regs
));
1966 #ifndef CPU_DISABLE_SSE
/*
 * fill_fpregs_xmm: convert an FXSAVE-format FPU save area (*sv_xmm)
 * into the legacy FNSAVE format (*sv_87): copy the control/status
 * environment fields, the eight x87 accumulator registers, and the
 * saved exception status word.
 * NOTE(review): return type, braces, and some interleaving lines were
 * elided by the extraction.
 */
1968 fill_fpregs_xmm(struct savexmm
*sv_xmm
, struct save87
*sv_87
)
1970 struct env87
*penv_87
= &sv_87
->sv_env
;
1971 struct envxmm
*penv_xmm
= &sv_xmm
->sv_env
;
1974 /* FPU control/status */
1975 penv_87
->en_cw
= penv_xmm
->en_cw
;
1976 penv_87
->en_sw
= penv_xmm
->en_sw
;
1977 penv_87
->en_tw
= penv_xmm
->en_tw
;
1978 penv_87
->en_fip
= penv_xmm
->en_fip
;
1979 penv_87
->en_fcs
= penv_xmm
->en_fcs
;
1980 penv_87
->en_opcode
= penv_xmm
->en_opcode
;
1981 penv_87
->en_foo
= penv_xmm
->en_foo
;
1982 penv_87
->en_fos
= penv_xmm
->en_fos
;
/* copy the eight x87 register accumulators */
1985 for (i
= 0; i
< 8; ++i
)
1986 sv_87
->sv_ac
[i
] = sv_xmm
->sv_fp
[i
].fp_acc
;
1988 sv_87
->sv_ex_sw
= sv_xmm
->sv_ex_sw
;
/*
 * set_fpregs_xmm: inverse of fill_fpregs_xmm() -- convert a legacy
 * FNSAVE-format FPU save area (*sv_87) into the FXSAVE format
 * (*sv_xmm): copy the control/status environment fields, the eight x87
 * accumulator registers, and the saved exception status word.
 * NOTE(review): return type, braces, and some interleaving lines were
 * elided by the extraction.
 */
1992 set_fpregs_xmm(struct save87
*sv_87
, struct savexmm
*sv_xmm
)
1994 struct env87
*penv_87
= &sv_87
->sv_env
;
1995 struct envxmm
*penv_xmm
= &sv_xmm
->sv_env
;
1998 /* FPU control/status */
1999 penv_xmm
->en_cw
= penv_87
->en_cw
;
2000 penv_xmm
->en_sw
= penv_87
->en_sw
;
2001 penv_xmm
->en_tw
= penv_87
->en_tw
;
2002 penv_xmm
->en_fip
= penv_87
->en_fip
;
2003 penv_xmm
->en_fcs
= penv_87
->en_fcs
;
2004 penv_xmm
->en_opcode
= penv_87
->en_opcode
;
2005 penv_xmm
->en_foo
= penv_87
->en_foo
;
2006 penv_xmm
->en_fos
= penv_87
->en_fos
;
/* copy the eight x87 register accumulators */
2009 for (i
= 0; i
< 8; ++i
)
2010 sv_xmm
->sv_fp
[i
].fp_acc
= sv_87
->sv_ac
[i
];
2012 sv_xmm
->sv_ex_sw
= sv_87
->sv_ex_sw
;
2014 #endif /* CPU_DISABLE_SSE */
/*
 * fill_fpregs: copy the lwp's saved FPU state from its pcb into *fpregs.
 * When SSE support is compiled in, the FXSAVE-format sv_xmm state is
 * converted via fill_fpregs_xmm(); otherwise the legacy sv_87 state is
 * bcopy'd directly.
 * NOTE(review): the conditional selecting between the two paths
 * (original lines 2020/2023-2024, presumably a cpu_fxsr-style runtime
 * test with an early return) was elided by the extraction, as were the
 * return type and braces.
 */
2017 fill_fpregs(struct lwp
*lp
, struct fpreg
*fpregs
)
2019 #ifndef CPU_DISABLE_SSE
2021 fill_fpregs_xmm(&lp
->lwp_thread
->td_pcb
->pcb_save
.sv_xmm
,
2022 (struct save87
*)fpregs
);
2025 #endif /* CPU_DISABLE_SSE */
2026 bcopy(&lp
->lwp_thread
->td_pcb
->pcb_save
.sv_87
, fpregs
, sizeof *fpregs
);
/*
 * set_fpregs: install user-supplied FPU state *fpregs into the lwp's
 * pcb.  Mirror image of fill_fpregs(): with SSE compiled in the state is
 * converted into the FXSAVE-format sv_xmm via set_fpregs_xmm();
 * otherwise it is bcopy'd into the legacy sv_87 area.
 * NOTE(review): the conditional selecting between the two paths
 * (original lines 2034/2037-2038) was elided by the extraction, as were
 * the return type and braces.
 */
2031 set_fpregs(struct lwp
*lp
, struct fpreg
*fpregs
)
2033 #ifndef CPU_DISABLE_SSE
2035 set_fpregs_xmm((struct save87
*)fpregs
,
2036 &lp
->lwp_thread
->td_pcb
->pcb_save
.sv_xmm
);
2039 #endif /* CPU_DISABLE_SSE */
2040 bcopy(fpregs
, &lp
->lwp_thread
->td_pcb
->pcb_save
.sv_87
, sizeof *fpregs
);
/*
 * fill_dbregs: copy debug-register state into *dbregs.  Two sources are
 * visible: reading the live hardware registers via rdr0()..rdr7(), and
 * reading the values saved in the lwp's pcb (which populates only
 * dr0-dr3, dr6 and dr7 -- dr4/dr5 have no pcb copy in the visible code).
 * NOTE(review): the guard choosing between the two paths (original
 * lines around 2046-2047/2056-2058, presumably "lp == NULL" selecting
 * the live-hardware path) was elided by the extraction, as were the
 * return type, braces, and return statements.
 */
2045 fill_dbregs(struct lwp
*lp
, struct dbreg
*dbregs
)
/* path 1: read the live hardware debug registers */
2048 dbregs
->dr
[0] = rdr0();
2049 dbregs
->dr
[1] = rdr1();
2050 dbregs
->dr
[2] = rdr2();
2051 dbregs
->dr
[3] = rdr3();
2052 dbregs
->dr
[4] = rdr4();
2053 dbregs
->dr
[5] = rdr5();
2054 dbregs
->dr
[6] = rdr6();
2055 dbregs
->dr
[7] = rdr7();
/* path 2: read the values saved in the lwp's pcb */
2059 pcb
= lp
->lwp_thread
->td_pcb
;
2060 dbregs
->dr
[0] = pcb
->pcb_dr0
;
2061 dbregs
->dr
[1] = pcb
->pcb_dr1
;
2062 dbregs
->dr
[2] = pcb
->pcb_dr2
;
2063 dbregs
->dr
[3] = pcb
->pcb_dr3
;
2066 dbregs
->dr
[6] = pcb
->pcb_dr6
;
2067 dbregs
->dr
[7] = pcb
->pcb_dr7
;
2073 set_dbregs(struct lwp
*lp
, struct dbreg
*dbregs
)
2076 load_dr0(dbregs
->dr
[0]);
2077 load_dr1(dbregs
->dr
[1]);
2078 load_dr2(dbregs
->dr
[2]);
2079 load_dr3(dbregs
->dr
[3]);
2080 load_dr4(dbregs
->dr
[4]);
2081 load_dr5(dbregs
->dr
[5]);
2082 load_dr6(dbregs
->dr
[6]);
2083 load_dr7(dbregs
->dr
[7]);
2086 struct ucred
*ucred
;
2088 uint64_t mask1
, mask2
;
2091 * Don't let an illegal value for dr7 get set. Specifically,
2092 * check for undefined settings. Setting these bit patterns
2093 * result in undefined behaviour and can lead to an unexpected
2096 /* JG this loop looks unreadable */
2097 /* Check 4 2-bit fields for invalid patterns.
2098 * These fields are R/Wi, for i = 0..3
2100 /* Is 10 in LENi allowed when running in compatibility mode? */
2101 /* Pattern 10 in R/Wi might be used to indicate
2102 * breakpoint on I/O. Further analysis should be
2103 * carried to decide if it is safe and useful to
2104 * provide access to that capability
2106 for (i
= 0, mask1
= 0x3<<16, mask2
= 0x2<<16; i
< 4;
2107 i
++, mask1
<<= 4, mask2
<<= 4)
2108 if ((dbregs
->dr
[7] & mask1
) == mask2
)
2111 pcb
= lp
->lwp_thread
->td_pcb
;
2112 ucred
= lp
->lwp_proc
->p_ucred
;
2115 * Don't let a process set a breakpoint that is not within the
2116 * process's address space. If a process could do this, it
2117 * could halt the system by setting a breakpoint in the kernel
2118 * (if ddb was enabled). Thus, we need to check to make sure
2119 * that no breakpoints are being enabled for addresses outside
2120 * process's address space, unless, perhaps, we were called by
2123 * XXX - what about when the watched area of the user's
2124 * address space is written into from within the kernel
2125 * ... wouldn't that still cause a breakpoint to be generated
2126 * from within kernel mode?
2129 if (priv_check_cred(ucred
, PRIV_ROOT
, 0) != 0) {
2130 if (dbregs
->dr
[7] & 0x3) {
2131 /* dr0 is enabled */
2132 if (dbregs
->dr
[0] >= VM_MAX_USER_ADDRESS
)
2136 if (dbregs
->dr
[7] & (0x3<<2)) {
2137 /* dr1 is enabled */
2138 if (dbregs
->dr
[1] >= VM_MAX_USER_ADDRESS
)
2142 if (dbregs
->dr
[7] & (0x3<<4)) {
2143 /* dr2 is enabled */
2144 if (dbregs
->dr
[2] >= VM_MAX_USER_ADDRESS
)
2148 if (dbregs
->dr
[7] & (0x3<<6)) {
2149 /* dr3 is enabled */
2150 if (dbregs
->dr
[3] >= VM_MAX_USER_ADDRESS
)
2155 pcb
->pcb_dr0
= dbregs
->dr
[0];
2156 pcb
->pcb_dr1
= dbregs
->dr
[1];
2157 pcb
->pcb_dr2
= dbregs
->dr
[2];
2158 pcb
->pcb_dr3
= dbregs
->dr
[3];
2159 pcb
->pcb_dr6
= dbregs
->dr
[6];
2160 pcb
->pcb_dr7
= dbregs
->dr
[7];
2162 pcb
->pcb_flags
|= PCB_DBREGS
;
2169 * Return > 0 if a hardware breakpoint has been hit, and the
2170 * breakpoint was in user space. Return 0, otherwise.
2173 user_dbreg_trap(void)
2175 u_int64_t dr7
, dr6
; /* debug registers dr6 and dr7 */
2176 u_int64_t bp
; /* breakpoint bits extracted from dr6 */
2177 int nbp
; /* number of breakpoints that triggered */
2178 caddr_t addr
[4]; /* breakpoint addresses */
2182 if ((dr7
& 0xff) == 0) {
2184 * all GE and LE bits in the dr7 register are zero,
2185 * thus the trap couldn't have been caused by the
2186 * hardware debug registers
2197 * None of the breakpoint bits are set meaning this
2198 * trap was not caused by any of the debug registers
2204 * at least one of the breakpoints were hit, check to see
2205 * which ones and if any of them are user space addresses
2209 addr
[nbp
++] = (caddr_t
)rdr0();
2212 addr
[nbp
++] = (caddr_t
)rdr1();
2215 addr
[nbp
++] = (caddr_t
)rdr2();
2218 addr
[nbp
++] = (caddr_t
)rdr3();
2221 for (i
=0; i
<nbp
; i
++) {
2223 (caddr_t
)VM_MAX_USER_ADDRESS
) {
2225 * addr[i] is in user space
2232 * None of the breakpoints are in user space.
/*
 * Debugger: stub debugger entry point -- announces the call with the
 * supplied reason string.  Visible behavior is only the kprintf(); any
 * further action (original lines after 2242) was elided by the
 * extraction, along with the return type and braces.
 */
2240 Debugger(const char *msg
)
2242 kprintf("Debugger(\"%s\") called.\n", msg
);
2249 * Provide inb() and outb() as functions. They are normally only
2250 * available as macros calling inlined functions, thus cannot be
2251 * called inside DDB.
2253 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
2259 /* silence compiler warnings */
2261 void outb(u_int
, u_char
);
2268 * We use %%dx and not %1 here because i/o is done at %dx and not at
2269 * %edx, while gcc generates inferior code (movw instead of movl)
2270 * if we tell it to load (u_short) port.
2272 __asm
__volatile("inb %%dx,%0" : "=a" (data
) : "d" (port
));
/*
 * outb: de-inlined port-output function (see the file comment above):
 * write the byte 'data' to I/O port 'port' via the outb instruction.
 * NOTE(review): the declaration of the local 'al' and its assignment
 * from 'data' (original lines 2285-2286) were elided by the extraction,
 * along with the return type and braces.
 */
2277 outb(u_int port
, u_char data
)
2281 * Use an unnecessary assignment to help gcc's register allocator.
2282 * This makes a large difference for gcc-1.40 and a tiny difference
2283 * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for
2284 * best results. gcc-2.6.0 can't handle this.
2287 __asm
__volatile("outb %0,%%dx" : : "a" (al
), "d" (port
));
2294 #include "opt_cpu.h"
2298 * initialize all the SMP locks
2301 /* critical region when masking or unmasking interrupts */
2302 struct spinlock_deprecated imen_spinlock
;
2304 /* critical region for old style disable_intr/enable_intr */
2305 struct spinlock_deprecated mpintr_spinlock
;
2307 /* critical region around INTR() routines */
2308 struct spinlock_deprecated intr_spinlock
;
2310 /* lock region used by kernel profiling */
2311 struct spinlock_deprecated mcount_spinlock
;
2313 /* locks com (tty) data/hardware accesses: a FASTINTR() */
2314 struct spinlock_deprecated com_spinlock
;
2316 /* lock regions around the clock hardware */
2317 struct spinlock_deprecated clock_spinlock
;
2323 * mp_lock = 0; BSP already owns the MP lock
2326 * Get the initial mp_lock with a count of 1 for the BSP.
2327 * This uses a LOGICAL cpu ID, ie BSP == 0.
2330 cpu_get_initial_mplock();
2333 spin_lock_init(&mcount_spinlock
);
2334 spin_lock_init(&intr_spinlock
);
2335 spin_lock_init(&mpintr_spinlock
);
2336 spin_lock_init(&imen_spinlock
);
2337 spin_lock_init(&com_spinlock
);
2338 spin_lock_init(&clock_spinlock
);
2340 /* our token pool needs to work early */
2341 lwkt_token_pool_init();