/*
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (C) 1994, David Greenman
 * Copyright (c) 1982, 1987, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
 * $DragonFly: src/sys/platform/pc64/amd64/Attic/cpu_regs.c,v 1.4 2007/12/12 23:49:22 dillon Exp $
 */

#include "use_ether.h"
#include "opt_atalk.h"
#include "opt_compat.h"
#include "opt_directio.h"
#include "opt_msgbuf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/reboot.h>
#include <sys/msgbuf.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/upcall.h>
#include <sys/usched.h>

#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>

#include <machine/cpu.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#include <machine/md_var.h>
#include <machine/pcb_ext.h>		/* pcb.h included via sys/user.h */
#include <machine/globaldata.h>		/* CPU_prvspace */
#include <machine/smp.h>
#include <machine/perfmon.h>
#include <machine/cputypes.h>

#include <bus/isa/rtc.h>
/* #include <machine/vm86.h> */
#include <sys/random.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>
#include <unistd.h>		/* umtx_* functions */

extern void dblfault_handler (void);

#ifndef CPU_DISABLE_SSE
static void set_fpregs_xmm (struct save87 *, struct savexmm *);
static void fill_fpregs_xmm (struct savexmm *, struct save87 *);
#endif /* CPU_DISABLE_SSE */

#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */

#ifdef SMP
int64_t tsc_offsets[MAXCPU];
#else
int64_t tsc_offsets[1];
#endif

#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
	CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
	CTLFLAG_RD, &tlb_flush_count, 0, "");
#endif

static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0, ctob((int)Maxmem), req);

	return (error);
}

SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_hw_physmem, "IU", "");

static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
		ctob((int)Maxmem - vmstats.v_wire_count), req);

	return (error);
}

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_hw_usermem, "IU", "");

SYSCTL_ULONG(_hw, OID_AUTO, availpages, CTLFLAG_RD, &Maxmem, NULL, "");

static int
sysctl_machdep_msgbuf(SYSCTL_HANDLER_ARGS)
{
	int error;

	/*
	 * Unwind the buffer, so that it's linear (possibly starting with
	 * some initial nulls).
	 */
	error = sysctl_handle_opaque(oidp,
	    msgbufp->msg_ptr + msgbufp->msg_bufr,
	    msgbufp->msg_size - msgbufp->msg_bufr, req);
	if (error)
		return (error);
	if (msgbufp->msg_bufr > 0) {
		error = sysctl_handle_opaque(oidp, msgbufp->msg_ptr,
		    msgbufp->msg_bufr, req);
	}
	return (error);
}

SYSCTL_PROC(_machdep, OID_AUTO, msgbuf, CTLTYPE_STRING|CTLFLAG_RD,
	0, 0, sysctl_machdep_msgbuf, "A", "Contents of kernel message buffer");

static int msgbuf_clear;

static int
sysctl_machdep_msgbuf_clear(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
	    req);
	if (!error && req->newptr) {
		/* Clear the buffer and reset write pointer */
		bzero(msgbufp->msg_ptr, msgbufp->msg_size);
		msgbufp->msg_bufr = msgbufp->msg_bufx = 0;
		msgbuf_clear = 0;
	}
	return (error);
}

SYSCTL_PROC(_machdep, OID_AUTO, msgbuf_clear, CTLTYPE_INT|CTLFLAG_RW,
	&msgbuf_clear, 0, sysctl_machdep_msgbuf_clear, "I",
	"Clear kernel message buffer");

/*
 * Send an interrupt to a process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
extern int _ucodesel, _udatasel;

void
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	struct sigacts *psp = p->p_sigacts;
	struct sigframe sf, *sfp;
	int oonstack;

	regs = lp->lwp_md.md_regs;
	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;

	/* save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = lp->lwp_sigstk;
	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
	/* bcopy(regs, &sf.sf_uc.uc_mcontext.mc_gs, sizeof(struct trapframe)); */

	/* make the size of the saved context visible to userland */
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);

	/* save mailbox pending state for syscall interlock semantics */
	if (p->p_flag & P_MAILBOX)
		sf.sf_uc.uc_mcontext.mc_flags |= PGEX_MAILBOX;

	/* Allocate and validate space for the signal handler context. */
	if ((lp->lwp_flag & LWP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sfp = (struct sigframe *)(lp->lwp_sigstk.ss_sp +
		    lp->lwp_sigstk.ss_size - sizeof(struct sigframe));
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		sfp = (struct sigframe *)regs->tf_rsp - 1;
	}

	/* Translate the signal if appropriate */
	if (p->p_sysent->sv_sigtbl) {
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
	}

	/* Build the argument list for the signal handler. */
	sf.sf_signum = sig;
	sf.sf_ucontext = (register_t)&sfp->sf_uc;
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		sf.sf_siginfo = (register_t)&sfp->sf_si;
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* fill siginfo structure */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_err;
	} else {
		/* Old FreeBSD-style arguments. */
		sf.sf_siginfo = code;
		sf.sf_addr = regs->tf_err;
		sf.sf_ahu.sf_handler = catcher;
	}

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_rflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 =
		    &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_rflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_rflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}

	/*
	 * Save the FPU state and reinit the FP unit
	 */
	npxpush(&sf.sf_uc.uc_mcontext);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(lp, SIGILL);
	}

	regs->tf_rsp = (register_t)sfp;
	regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
	regs->tf_rflags &= ~PSL_T;
	regs->tf_cs = _ucodesel;
	/* regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel; */
	if (regs->tf_trapno == T_PROTFLT) {
		/* regs->tf_fs = _udatasel;
		regs->tf_gs = _udatasel; */
	}
	regs->tf_ss = _udatasel;
}
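
/*
 * Illustrative sketch (not from the original source) of what sendsig()
 * leaves behind.  The sigframe is placed either at the top of the
 * alternate signal stack or just below the interrupted frame's %rsp:
 *
 *	sfp ->	sf_signum/sf_siginfo/sf_ucontext	handler arguments
 *		sf_si					siginfo (SA_SIGINFO case)
 *		sf_uc					saved ucontext + mcontext
 *
 * %rip is then aimed at the signal trampoline, which sits just below
 * PS_STRINGS, calls the handler, and finally enters sys_sigreturn().
 */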

/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.
 *
 * Allow userland to set or maintain PSL_RF, the resume flag.  This flag
 * basically controls whether the return PC should skip the first instruction
 * (as in an explicit system call) or re-execute it (as in an exception).
 */
int
cpu_sanitize_frame(struct trapframe *frame)
{
	frame->tf_cs = _ucodesel;
#if 0
	frame->tf_ds = _udatasel;
	frame->tf_es = _udatasel;
	frame->tf_fs = _udatasel;
	frame->tf_gs = _udatasel;
#endif
	frame->tf_ss = _udatasel;
	frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE);
	frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;
	return (0);
}
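
/*
 * Illustrative example (not from the original source): a frame handed
 * in with, say, the IOPL bits set comes out with only PSL_RF and the
 * PSL_USERCHANGE subset preserved, and with the reserved-default bits
 * and PSL_I forced on, so a guest context cannot smuggle privileged
 * rflags bits past the virtual kernel.
 */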

int
cpu_sanitize_tls(struct savetls *tls)
{
	struct segment_descriptor *desc;
	int i;

	for (i = 0; i < NGTLS; ++i) {
		desc = &tls->tls[i];
		if (desc->sd_dpl == 0 && desc->sd_type == 0)
			continue;
		if (desc->sd_def32 == 0)
			return (EINVAL);
		if (desc->sd_type != SDT_MEMRWA)
			return (EINVAL);
		if (desc->sd_dpl != SEL_UPL)
			return (EINVAL);
		if (desc->sd_xx != 0 || desc->sd_p != 1)
			return (EINVAL);
	}
	return (0);
}

/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to clean up state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
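
/*
 * Worked example (illustrative, not from the original source): a
 * context whose rflags differs from the live frame only in bits within
 * PSL_USERCHANGE (e.g. PSL_C or PSL_T) passes EFL_SECURE(), while one
 * that flips a bit outside it, such as IOPL, fails.  CS_SECURE() simply
 * insists the requested %cs privilege level is user (SEL_UPL), never
 * ring 0.
 */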

int
sys_sigreturn(struct sigreturn_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	ucontext_t ucp;
	int cs;
	int rflags;
	int error;

	error = copyin(uap->sigcntxp, &ucp, sizeof(ucp));
	if (error)
		return (error);

	regs = lp->lwp_md.md_regs;
	rflags = ucp.uc_mcontext.mc_rflags;

#if 0
	/* vm86 is not applicable in long mode */
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(lp->lwp_proc, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp.uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = tf->tf_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
		tf->tf_gs = _udatasel;
	}
#endif

	/*
	 * Don't allow users to change privileged or reserved flags.
	 *
	 * XXX do allow users to change the privileged flag PSL_RF.
	 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
	 * should sometimes set it there too.  tf_eflags is kept in
	 * the signal context during signal handling and there is no
	 * other place to remember it, so the PSL_RF bit may be
	 * corrupted by the signal handler without us knowing.
	 * Corruption of the PSL_RF bit at worst causes one more or
	 * one less debugger trap, so allowing it is fairly harmless.
	 */
	if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
		kprintf("sigreturn: eflags = 0x%x\n", rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
	cs = ucp.uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		kprintf("sigreturn: cs = 0x%x\n", cs);
		trapsignal(lp, SIGBUS, T_PROTFLT);
		return (EINVAL);
	}
	/* bcopy(&ucp.uc_mcontext.mc_gs, regs, sizeof(struct trapframe)); */

	/*
	 * Restore the FPU state from the frame
	 */
	npxpop(&ucp.uc_mcontext);

	/*
	 * Merge saved signal mailbox pending flag to maintain interlock
	 * semantics against system calls.
	 */
	if (ucp.uc_mcontext.mc_flags & PGEX_MAILBOX)
		p->p_flag |= P_MAILBOX;

	if (ucp.uc_mcontext.mc_onstack & 1)
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	else
		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

	lp->lwp_sigmask = ucp.uc_sigmask;
	SIG_CANTMASK(lp->lwp_sigmask);
	return (EJUSTRETURN);
}

/*
 * Stack frame on entry to function.  %eax will contain the function vector,
 * %ecx will contain the function data.  flags, ecx, and eax will have
 * already been pushed on the stack.
 */
struct upc_frame {
	register_t	eax;
	register_t	ecx;
	register_t	edx;
	register_t	flags;
	register_t	oldip;
};
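
/*
 * Illustrative picture (not from the original source) of the user stack
 * after sendupcall() below pushes a frame (the stack grows down):
 *
 *	old tf_rsp ->	[ interrupted code's stack ]
 *	new tf_rsp ->	struct upc_frame { eax ecx edx flags oldip }
 *
 * Control then enters vu->vu_ctx with the function vector in %eax, its
 * data in %ecx, and the per-thread upcall pointer in %edx.
 */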

void
sendupcall(struct vmupcall *vu, int morepending)
{
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;
	struct upcall upcall;
	struct upc_frame upc_frame;
	int	crit_count = 0;

	/*
	 * If we are a virtual kernel running an emulated user process
	 * context, switch back to the virtual kernel context before
	 * trying to post the signal.
	 */
	if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
		lp->lwp_md.md_regs->tf_trapno = 0;
		vkernel_trap(lp, lp->lwp_md.md_regs);
	}

	/*
	 * Get the upcall data structure
	 */
	if (copyin(lp->lwp_upcall, &upcall, sizeof(upcall)) ||
	    copyin((char *)upcall.upc_uthread + upcall.upc_critoff,
		&crit_count, sizeof(int))
	) {
		vu->vu_pending = 0;
		kprintf("bad upcall address\n");
		return;
	}

	/*
	 * If the data structure is already marked pending or has a critical
	 * section count, mark the data structure as pending and return
	 * without doing an upcall.  vu_pending is left set.
	 */
	if (upcall.upc_pending || crit_count >= vu->vu_pending) {
		if (upcall.upc_pending < vu->vu_pending) {
			upcall.upc_pending = vu->vu_pending;
			copyout(&upcall.upc_pending,
			    &lp->lwp_upcall->upc_pending,
			    sizeof(upcall.upc_pending));
		}
		return;
	}

	/*
	 * We can run this upcall now, clear vu_pending.
	 *
	 * Bump our critical section count and set or clear the
	 * user pending flag depending on whether more upcalls are
	 * pending.  The user will be responsible for calling
	 * upc_dispatch(-1) to process remaining upcalls.
	 */
	vu->vu_pending = 0;
	upcall.upc_pending = morepending;
	crit_count += TDPRI_CRIT;
	copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending,
	    sizeof(upcall.upc_pending));
	copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff,
	    sizeof(int));

	/*
	 * Construct a stack frame and issue the upcall
	 */
	regs = lp->lwp_md.md_regs;
	upc_frame.eax = regs->tf_rax;
	upc_frame.ecx = regs->tf_rcx;
	upc_frame.edx = regs->tf_rdx;
	upc_frame.flags = regs->tf_rflags;
	upc_frame.oldip = regs->tf_rip;
	if (copyout(&upc_frame, (void *)(regs->tf_rsp - sizeof(upc_frame)),
		    sizeof(upc_frame)) != 0) {
		kprintf("bad stack on upcall\n");
	} else {
		regs->tf_rax = (register_t)vu->vu_func;
		regs->tf_rcx = (register_t)vu->vu_data;
		regs->tf_rdx = (register_t)lp->lwp_upcall;
		regs->tf_rip = (register_t)vu->vu_ctx;
		regs->tf_rsp -= sizeof(upc_frame);
	}
}

/*
 * fetchupcall occurs in the context of a system call, which means that
 * we have to return EJUSTRETURN in order to prevent eax and edx from
 * being overwritten by the syscall return value.
 *
 * if vu is not NULL we return the new context in %edx, the new data in %ecx,
 * and the function pointer in %eax.
 */
int
fetchupcall (struct vmupcall *vu, int morepending, void *rsp)
{
	struct upc_frame upc_frame;
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;
	int error;
	struct upcall upcall;
	int crit_count;

	regs = lp->lwp_md.md_regs;

	error = copyout(&morepending, &lp->lwp_upcall->upc_pending, sizeof(int));
	if (error == 0) {
	    if (vu) {
		/*
		 * This jumps us to the next ready context.
		 */
		vu->vu_pending = 0;
		error = copyin(lp->lwp_upcall, &upcall, sizeof(upcall));
		crit_count = 0;
		if (error == 0)
			error = copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int));
		crit_count += TDPRI_CRIT;
		if (error == 0)
			error = copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, sizeof(int));
		regs->tf_rax = (register_t)vu->vu_func;
		regs->tf_rcx = (register_t)vu->vu_data;
		regs->tf_rdx = (register_t)lp->lwp_upcall;
		regs->tf_rip = (register_t)vu->vu_ctx;
		regs->tf_rsp = (register_t)rsp;
	    } else {
		/*
		 * This returns us to the originally interrupted code.
		 */
		error = copyin(rsp, &upc_frame, sizeof(upc_frame));
		regs->tf_rax = upc_frame.eax;
		regs->tf_rcx = upc_frame.ecx;
		regs->tf_rdx = upc_frame.edx;
		regs->tf_rflags = (regs->tf_rflags & ~PSL_USERCHANGE) |
				(upc_frame.flags & PSL_USERCHANGE);
		regs->tf_rip = upc_frame.oldip;
		regs->tf_rsp = (register_t)((char *)rsp + sizeof(upc_frame));
	    }
	}
	if (error == 0)
		error = EJUSTRETURN;
	return (error);
}

/*
 * cpu_idle() represents the idle LWKT.  You cannot return from this function
 * (unless you want to blow things up!).  Instead we look for runnable threads
 * and loop or halt as appropriate.  Giant is not held on entry to the thread.
 *
 * The main loop is entered with a critical section held, we must release
 * the critical section before doing anything else.  lwkt_switch() will
 * check for pending interrupts due to entering and exiting its own
 * critical section.
 *
 * Note on cpu_idle_hlt:  On an SMP system we rely on a scheduler IPI
 * to wake a HLTed cpu up.  However, there are cases where the idlethread
 * will be entered with the possibility that no IPI will occur and in such
 * cases lwkt_switch() sets TDF_IDLE_NOHLT.
 */
static int	cpu_idle_hlt = 1;
static int	cpu_idle_hltcnt;
static int	cpu_idle_spincnt;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hltcnt, CTLFLAG_RW,
    &cpu_idle_hltcnt, 0, "Idle loop entry halts");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_spincnt, CTLFLAG_RW,
    &cpu_idle_spincnt, 0, "Idle loop entry spins");

void
cpu_idle(void)
{
	struct thread *td = curthread;
	struct mdglobaldata *gd = mdcpu;

	crit_exit();
	KKASSERT(td->td_pri < TDPRI_CRIT);
	for (;;) {
		/*
		 * See if there are any LWKTs ready to go.
		 */
		lwkt_switch();

		/*
		 * The idle loop halts only if no threads are schedulable
		 * and no signals have occurred.
		 */
		if (cpu_idle_hlt && !lwkt_runnable() &&
		    (td->td_flags & TDF_IDLE_NOHLT) == 0) {
			splz();
			if (!lwkt_runnable()) {
#ifdef DEBUGIDLE
				struct timeval tv1, tv2;

				gettimeofday(&tv1, NULL);
#endif
				/* umtx_sleep(&gd->mi.gd_runqmask, 0, 1000000); */
#ifdef DEBUGIDLE
				gettimeofday(&tv2, NULL);
				if (tv2.tv_usec - tv1.tv_usec +
				    (tv2.tv_sec - tv1.tv_sec) * 1000000
				    > 500000) {
					kprintf("cpu %d idlelock %08x %08x\n",
						gd->mi.gd_cpuid,
						gd->mi.gd_runqmask,
						gd->gd_fpending);
				}
#endif
			}
#ifdef SMP
			else {
				__asm __volatile("pause");
			}
#endif
			++cpu_idle_hltcnt;
		} else {
			td->td_flags &= ~TDF_IDLE_NOHLT;
			splz();
#ifdef SMP
			/*__asm __volatile("sti; pause");*/
			__asm __volatile("pause");
#else
			/*__asm __volatile("sti");*/
#endif
			++cpu_idle_spincnt;
		}
	}
}

/*
 * Called by the LWKT switch core with a critical section held if the only
 * schedulable thread needs the MP lock and we couldn't get it.  On
 * a real cpu we just spin in the scheduler.  In the virtual kernel
 * we sleep for a bit.
 */
void
cpu_mplock_contested(void)
{
	cpu_pause();
}

/*
 * Called by the spinlock code with or without a critical section held
 * when a spinlock is found to be seriously contested.
 */
void
cpu_spinlock_contested(void)
{
	cpu_pause();
}
780 exec_setregs(u_long entry
, u_long stack
, u_long ps_strings
)
782 struct thread
*td
= curthread
;
783 struct lwp
*lp
= td
->td_lwp
;
784 struct trapframe
*regs
= lp
->lwp_md
.md_regs
;
785 struct pcb
*pcb
= lp
->lwp_thread
->td_pcb
;
787 /* was i386_user_cleanup() in NetBSD */
790 bzero((char *)regs
, sizeof(struct trapframe
));
791 regs
->tf_rip
= entry
;
792 regs
->tf_rsp
= stack
;
793 regs
->tf_rflags
= PSL_USER
| (regs
->tf_rflags
& PSL_T
);
801 /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */
802 regs
->tf_rbx
= ps_strings
;
805 * Reset the hardware debug registers if they were in use.
806 * They won't have any meaning for the newly exec'd process.
808 if (pcb
->pcb_flags
& PCB_DBREGS
) {
815 if (pcb
== td
->td_pcb
) {
817 * Clear the debug registers on the running
818 * CPU, otherwise they will end up affecting
819 * the next process we switch to.
823 pcb
->pcb_flags
&= ~PCB_DBREGS
;
827 * Initialize the math emulator (if any) for the current process.
828 * Actually, just clear the bit that says that the emulator has
829 * been initialized. Initialization is delayed until the process
830 * traps to the emulator (if it is done at all) mainly because
831 * emulators don't provide an entry point for initialization.
833 /* pcb->pcb_flags &= ~FP_SOFTFP; */
836 * note: do not set CR0_TS here. npxinit() must do it after clearing
837 * gd_npxthread. Otherwise a preemptive interrupt thread may panic
842 load_cr0(rcr0() | CR0_MP
);
846 /* Initialize the npx (if any) for the current process. */
847 npxinit(__INITIAL_NPXCW__
);
852 * note: linux emulator needs edx to be 0x0 on entry, which is
853 * handled in execve simply by setting the 64 bit syscall

static void
cpu_setregs(void)
{
	unsigned int cr0;

	cr0 = rcr0();
	cr0 |= CR0_NE;			/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
	cr0 |= CR0_WP | CR0_AM;
	load_cr0(cr0);
}

static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
		req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

extern u_long bootdev;		/* not a cdev_t - encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
	CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)");

/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

extern struct user *proc0paddr;

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(syscall);
extern inthand_t
	IDTVEC(int0x80_syscall);

#ifdef DEBUG_INTERRUPTS
extern inthand_t *Xrsvdary[256];
#endif

int
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
	lp->lwp_md.md_regs->tf_rip = addr;
	return (0);
}

int
ptrace_single_step(struct lwp *lp)
{
	lp->lwp_md.md_regs->tf_rflags |= PSL_T;
	return (0);
}

int
fill_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	/* regs->r_gs = tp->tf_gs;
	regs->r_fs = tp->tf_fs;
	regs->r_es = tp->tf_es;
	regs->r_ds = tp->tf_ds; */
	regs->r_rdi = tp->tf_rdi;
	regs->r_rsi = tp->tf_rsi;
	regs->r_rbp = tp->tf_rbp;
	regs->r_rbx = tp->tf_rbx;
	regs->r_rdx = tp->tf_rdx;
	regs->r_rcx = tp->tf_rcx;
	regs->r_rax = tp->tf_rax;
	regs->r_rip = tp->tf_rip;
	regs->r_cs = tp->tf_cs;
	regs->r_rflags = tp->tf_rflags;
	regs->r_rsp = tp->tf_rsp;
	regs->r_ss = tp->tf_ss;
	return (0);
}

int
set_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	/* tp->tf_gs = regs->r_gs;
	tp->tf_fs = regs->r_fs;
	tp->tf_es = regs->r_es;
	tp->tf_ds = regs->r_ds; */
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = regs->r_rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	return (0);
}

#ifndef CPU_DISABLE_SSE
static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;

	sv_87->sv_ex_sw = sv_xmm->sv_ex_sw;
}

static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];

	sv_xmm->sv_ex_sw = sv_87->sv_ex_sw;
}
#endif /* CPU_DISABLE_SSE */

int
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
#ifndef CPU_DISABLE_SSE
	if (cpu_fxsr) {
		fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
				(struct save87 *)fpregs);
		return (0);
	}
#endif /* CPU_DISABLE_SSE */
	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
	return (0);
}

int
set_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
#ifndef CPU_DISABLE_SSE
	if (cpu_fxsr) {
		set_fpregs_xmm((struct save87 *)fpregs,
			       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
		return (0);
	}
#endif /* CPU_DISABLE_SSE */
	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
	return (0);
}

int
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	if (lp == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[4] = rdr4();
		dbregs->dr[5] = rdr5();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		struct pcb *pcb;

		pcb = lp->lwp_thread->td_pcb;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		dbregs->dr[4] = 0;
		dbregs->dr[5] = 0;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
	return (0);
}

int
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	if (lp == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr4(dbregs->dr[4]);
		load_dr5(dbregs->dr[5]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		struct pcb *pcb;
		struct ucred *ucred;
		int i;
		uint32_t mask1, mask2;

		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected TRCTRAP.
		 */
		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 8;
		     i++, mask1 <<= 2, mask2 <<= 2) {
			if ((dbregs->dr[7] & mask1) == mask2)
				return (EINVAL);
		}

		pcb = lp->lwp_thread->td_pcb;
		ucred = lp->lwp_proc->p_ucred;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * the process's address space, unless, perhaps, we were
		 * called by uid 0.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */
		if (suser_cred(ucred, 0) != 0) {
			if (dbregs->dr[7] & 0x3) {
				/* dr0 is enabled */
				if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr[7] & (0x3<<2)) {
				/* dr1 is enabled */
				if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr[7] & (0x3<<4)) {
				/* dr2 is enabled */
				if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr[7] & (0x3<<6)) {
				/* dr3 is enabled */
				if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		pcb->pcb_flags |= PCB_DBREGS;
	}

	return (0);
}
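
/*
 * Illustrative decoding (not from the original source) of the dr7
 * checks above: bits 0-7 are the per-breakpoint local/global enable
 * pairs (dr0 enabled when dr7 & 0x3, dr1 when dr7 & (0x3<<2), ...),
 * and bits 16-31 hold eight 2-bit R/W and LEN fields.  The mask1/mask2
 * loop rejects the undefined 10b pattern in any of those fields, so
 * e.g. dr7 = 0x00020001 (dr0 locally enabled, R/W0 = 10b) fails with
 * EINVAL.
 */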

/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int32_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int32_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0x000000ff) == 0) {
		/*
		 * all GE and LE bits in the dr7 register are zero,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers
		 */
		return 0;
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0x0000000f;

	if (!bp) {
		/*
		 * None of the breakpoint bits are set, meaning this
		 * trap was not caused by any of the debug registers
		 */
		return 0;
	}

	/*
	 * at least one of the breakpoints was hit, check to see
	 * which ones and if any of them are user space addresses
	 */

	if (bp & 0x01) {
		addr[nbp++] = (caddr_t)rdr0();
	}
	if (bp & 0x02) {
		addr[nbp++] = (caddr_t)rdr1();
	}
	if (bp & 0x04) {
		addr[nbp++] = (caddr_t)rdr2();
	}
	if (bp & 0x08) {
		addr[nbp++] = (caddr_t)rdr3();
	}

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}

void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}