/*
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (C) 1994, David Greenman
 * Copyright (c) 1982, 1987, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
 * $DragonFly: src/sys/platform/vkernel/i386/cpu_regs.c,v 1.29 2008/06/06 13:19:25 swildner Exp $
 */
43 #include "use_ether.h"
46 #include "opt_atalk.h"
47 #include "opt_compat.h"
49 #include "opt_directio.h"
52 #include "opt_msgbuf.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/reboot.h>
#include <sys/msgbuf.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/upcall.h>
#include <sys/usched.h>

#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>

#include <machine/cpu.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#include <machine/md_var.h>
#include <machine/pcb_ext.h>		/* pcb.h included via sys/user.h */
#include <machine/globaldata.h>		/* CPU_prvspace */
#include <machine/smp.h>
#include <machine/perfmon.h>
#include <machine/cputypes.h>

#include <bus/isa/rtc.h>
#include <machine/vm86.h>
#include <sys/random.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>
#include <unistd.h>		/* umtx_* functions */
extern void dblfault_handler (void);

#ifndef CPU_DISABLE_SSE
static void set_fpregs_xmm (struct save87 *, struct savexmm *);
static void fill_fpregs_xmm (struct savexmm *, struct save87 *);
#endif /* CPU_DISABLE_SSE */

#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */

#ifdef SMP
int64_t tsc_offsets[MAXCPU];
#else
int64_t tsc_offsets[1];
#endif
#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
	CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
	CTLFLAG_RD, &tlb_flush_count, 0, "");
#endif
static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0, ctob((int)Maxmem), req);

	return (error);
}

SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_hw_physmem, "IU", "");

static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
		ctob((int)Maxmem - vmstats.v_wire_count), req);

	return (error);
}

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_hw_usermem, "IU", "");

SYSCTL_ULONG(_hw, OID_AUTO, availpages, CTLFLAG_RD, &Maxmem, 0, "");
static int
sysctl_machdep_msgbuf(SYSCTL_HANDLER_ARGS)
{
	int error;

	/*
	 * Unwind the buffer, so that it's linear (possibly starting with
	 * some initial nulls).
	 */
	error = sysctl_handle_opaque(oidp, msgbufp->msg_ptr + msgbufp->msg_bufr,
	    msgbufp->msg_size - msgbufp->msg_bufr, req);
	if (error)
		return (error);
	if (msgbufp->msg_bufr > 0) {
		error = sysctl_handle_opaque(oidp, msgbufp->msg_ptr,
		    msgbufp->msg_bufr, req);
	}
	return (error);
}

SYSCTL_PROC(_machdep, OID_AUTO, msgbuf, CTLTYPE_STRING|CTLFLAG_RD,
	0, 0, sysctl_machdep_msgbuf, "A", "Contents of kernel message buffer");
static int msgbuf_clear;

static int
sysctl_machdep_msgbuf_clear(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
	    req);
	if (!error && req->newptr) {
		/* Clear the buffer and reset write pointer */
		bzero(msgbufp->msg_ptr, msgbufp->msg_size);
		msgbufp->msg_bufr = msgbufp->msg_bufx = 0;
		msgbuf_clear = 0;
	}
	return (error);
}

SYSCTL_PROC(_machdep, OID_AUTO, msgbuf_clear, CTLTYPE_INT|CTLFLAG_RW,
	&msgbuf_clear, 0, sysctl_machdep_msgbuf_clear, "I",
	"Clear kernel message buffer");
/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user-specified
 * pc and psl.
 */
void
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	struct sigacts *psp = p->p_sigacts;
	struct sigframe sf, *sfp;
	int oonstack;

	regs = lp->lwp_md.md_regs;
	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;

	/* save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = lp->lwp_sigstk;
	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_gs, sizeof(struct trapframe));

	/* make the size of the saved context visible to userland */
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);

	/* save mailbox pending state for syscall interlock semantics */
	if (p->p_flag & P_MAILBOX)
		sf.sf_uc.uc_mcontext.mc_xflags |= PGEX_MAILBOX;
	/* Allocate and validate space for the signal handler context. */
	if ((lp->lwp_flag & LWP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sfp = (struct sigframe *)(lp->lwp_sigstk.ss_sp +
		    lp->lwp_sigstk.ss_size - sizeof(struct sigframe));
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		sfp = (struct sigframe *)regs->tf_esp - 1;
	}

	/* Translate the signal if appropriate */
	if (p->p_sysent->sv_sigtbl) {
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
	}

	/* Build the argument list for the signal handler. */
	sf.sf_ucontext = (register_t)&sfp->sf_uc;
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		sf.sf_siginfo = (register_t)&sfp->sf_si;
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* fill siginfo structure */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_err;
	} else {
		/* Old FreeBSD-style arguments. */
		sf.sf_siginfo = code;
		sf.sf_addr = regs->tf_err;
		sf.sf_ahu.sf_handler = catcher;
	}
	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}

	/*
	 * Save the FPU state and reinit the FP unit
	 */
	npxpush(&sf.sf_uc.uc_mcontext);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(lp, SIGILL);
	}

	regs->tf_esp = (int)sfp;
	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);

	/*
	 * i386 abi specifies that the direction flag must be cleared
	 * on function entry
	 */
	regs->tf_eflags &= ~(PSL_T|PSL_D);

	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	if (regs->tf_trapno == T_PROTFLT) {
		regs->tf_fs = _udatasel;
		regs->tf_gs = _udatasel;
	}
	regs->tf_ss = _udatasel;
}
/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.
 *
 * Allow userland to set or maintain PSL_RF, the resume flag.  This flag
 * basically controls whether the return PC should skip the first instruction
 * (as in an explicit system call) or re-execute it (as in an exception).
 */
int
cpu_sanitize_frame(struct trapframe *frame)
{
	frame->tf_cs = _ucodesel;
	frame->tf_ds = _udatasel;
	frame->tf_es = _udatasel;
	frame->tf_fs = _udatasel;
	frame->tf_gs = _udatasel;
	frame->tf_ss = _udatasel;
	frame->tf_eflags &= (PSL_RF | PSL_USERCHANGE);
	frame->tf_eflags |= PSL_RESERVED_DEFAULT | PSL_I;
	return (0);
}
int
cpu_sanitize_tls(struct savetls *tls)
{
	struct segment_descriptor *desc;
	int i;

	for (i = 0; i < NGTLS; ++i) {
		desc = &tls->tls[i];
		if (desc->sd_dpl == 0 && desc->sd_type == 0)
			continue;
		if (desc->sd_def32 == 0)
			return (ENXIO);
		if (desc->sd_type != SDT_MEMRWA)
			return (ENXIO);
		if (desc->sd_dpl != SEL_UPL)
			return (ENXIO);
		if (desc->sd_xx != 0 || desc->sd_p != 1)
			return (ENXIO);
	}
	return (0);
}
/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
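/*
 * In other words: EFL_SECURE() only accepts a new eflags value whose bits
 * outside the PSL_USERCHANGE mask are identical to the current ones, and
 * CS_SECURE() only accepts a %cs selector whose privilege level is user
 * (SEL_UPL), so a signal handler cannot hand back a context that would
 * raise its privileges.
 */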
int
sys_sigreturn(struct sigreturn_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	ucontext_t ucp;
	int cs;
	int eflags;
	int error;

	error = copyin(uap->sigcntxp, &ucp, sizeof(ucp));
	if (error)
		return (error);

	regs = lp->lwp_md.md_regs;
	eflags = ucp.uc_mcontext.mc_eflags;

	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);
		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(lp->lwp_proc, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp.uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = tf->tf_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
		tf->tf_gs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 *
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
			kprintf("sigreturn: eflags = 0x%x\n", eflags);
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp.uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			kprintf("sigreturn: cs = 0x%x\n", cs);
			trapsignal(lp, SIGBUS, T_PROTFLT);
			return (EINVAL);
		}
		bcopy(&ucp.uc_mcontext.mc_gs, regs, sizeof(struct trapframe));
	}

	/*
	 * Restore the FPU state from the frame
	 */
	npxpop(&ucp.uc_mcontext);

	/*
	 * Merge saved signal mailbox pending flag to maintain interlock
	 * semantics against system calls.
	 */
	if (ucp.uc_mcontext.mc_xflags & PGEX_MAILBOX)
		p->p_flag |= P_MAILBOX;

	if (ucp.uc_mcontext.mc_onstack & 1)
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	else
		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

	lp->lwp_sigmask = ucp.uc_sigmask;
	SIG_CANTMASK(lp->lwp_sigmask);
	return (EJUSTRETURN);
}
/*
 * Stack frame on entry to function.  %eax will contain the function vector,
 * %ecx will contain the function data.  flags, ecx, and eax will have
 * already been pushed on the stack.
 */
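/*
 * Layout sketch reconstructed from the field accesses in sendupcall() and
 * fetchupcall() below; the authoritative definition may differ slightly.
 */
struct upc_frame {
	register_t	eax;
	register_t	ecx;
	register_t	edx;
	register_t	flags;
	register_t	oldip;
};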
void
sendupcall(struct vmupcall *vu, int morepending)
{
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;
	struct upcall upcall;
	struct upc_frame upc_frame;
	int crit_count = 0;

	/*
	 * If we are a virtual kernel running an emulated user process
	 * context, switch back to the virtual kernel context before
	 * trying to post the signal.
	 */
	if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
		lp->lwp_md.md_regs->tf_trapno = 0;
		vkernel_trap(lp, lp->lwp_md.md_regs);
	}

	/*
	 * Get the upcall data structure
	 */
	if (copyin(lp->lwp_upcall, &upcall, sizeof(upcall)) ||
	    copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int))
	) {
		vu->vu_pending = 0;
		kprintf("bad upcall address\n");
		return;
	}

	/*
	 * If the data structure is already marked pending or has a critical
	 * section count, mark the data structure as pending and return
	 * without doing an upcall.  vu_pending is left set.
	 */
	if (upcall.upc_pending || crit_count >= vu->vu_pending) {
		if (upcall.upc_pending < vu->vu_pending) {
			upcall.upc_pending = vu->vu_pending;
			copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending,
			    sizeof(upcall.upc_pending));
		}
		return;
	}

	/*
	 * We can run this upcall now, clear vu_pending.
	 *
	 * Bump our critical section count and set or clear the
	 * user pending flag depending on whether more upcalls are
	 * pending.  The user will be responsible for calling
	 * upc_dispatch(-1) to process remaining upcalls.
	 */
	vu->vu_pending = 0;
	upcall.upc_pending = morepending;
	crit_count += TDPRI_CRIT;
	copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending,
	    sizeof(upcall.upc_pending));
	copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff,
	    sizeof(int));
	/*
	 * Construct a stack frame and issue the upcall
	 */
	regs = lp->lwp_md.md_regs;
	upc_frame.eax = regs->tf_eax;
	upc_frame.ecx = regs->tf_ecx;
	upc_frame.edx = regs->tf_edx;
	upc_frame.flags = regs->tf_eflags;
	upc_frame.oldip = regs->tf_eip;
	if (copyout(&upc_frame, (void *)(regs->tf_esp - sizeof(upc_frame)),
		    sizeof(upc_frame)) != 0) {
		kprintf("bad stack on upcall\n");
	} else {
		regs->tf_eax = (register_t)vu->vu_func;
		regs->tf_ecx = (register_t)vu->vu_data;
		regs->tf_edx = (register_t)lp->lwp_upcall;
		regs->tf_eip = (register_t)vu->vu_ctx;
		regs->tf_esp -= sizeof(upc_frame);
	}
}
/*
 * fetchupcall occurs in the context of a system call, which means that
 * we have to return EJUSTRETURN in order to prevent eax and edx from
 * being overwritten by the syscall return value.
 *
 * if vu is not NULL we return the new context in %edx, the new data in %ecx,
 * and the function pointer in %eax.
 */
int
fetchupcall(struct vmupcall *vu, int morepending, void *rsp)
{
	struct upc_frame upc_frame;
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;
	int error;
	struct upcall upcall;
	int crit_count;

	regs = lp->lwp_md.md_regs;

	error = copyout(&morepending, &lp->lwp_upcall->upc_pending, sizeof(int));
	if (error == 0) {
		if (vu) {
			/*
			 * This jumps us to the next ready context.
			 */
			vu->vu_pending = 0;
			error = copyin(lp->lwp_upcall, &upcall, sizeof(upcall));
			crit_count = 0;
			if (error == 0)
				error = copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int));
			crit_count += TDPRI_CRIT;
			if (error == 0)
				error = copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, sizeof(int));
			regs->tf_eax = (register_t)vu->vu_func;
			regs->tf_ecx = (register_t)vu->vu_data;
			regs->tf_edx = (register_t)lp->lwp_upcall;
			regs->tf_eip = (register_t)vu->vu_ctx;
			regs->tf_esp = (register_t)rsp;
		} else {
			/*
			 * This returns us to the originally interrupted code.
			 */
			error = copyin(rsp, &upc_frame, sizeof(upc_frame));
			regs->tf_eax = upc_frame.eax;
			regs->tf_ecx = upc_frame.ecx;
			regs->tf_edx = upc_frame.edx;
			regs->tf_eflags = (regs->tf_eflags & ~PSL_USERCHANGE) |
					  (upc_frame.flags & PSL_USERCHANGE);
			regs->tf_eip = upc_frame.oldip;
			regs->tf_esp = (register_t)((char *)rsp + sizeof(upc_frame));
		}
	}
	if (error == 0)
		error = EJUSTRETURN;
	return (error);
}
/*
 * cpu_idle() represents the idle LWKT.  You cannot return from this function
 * (unless you want to blow things up!).  Instead we look for runnable threads
 * and loop or halt as appropriate.  Giant is not held on entry to the thread.
 *
 * The main loop is entered with a critical section held, we must release
 * the critical section before doing anything else.  lwkt_switch() will
 * check for pending interrupts due to entering and exiting its own
 * critical section.
 *
 * Note on cpu_idle_hlt:  On an SMP system we rely on a scheduler IPI
 * to wake a HLTed cpu up.  However, there are cases where the idlethread
 * will be entered with the possibility that no IPI will occur and in such
 * cases lwkt_switch() sets TDF_IDLE_NOHLT.
 */
static int	cpu_idle_hlt = 1;
static int	cpu_idle_hltcnt;
static int	cpu_idle_spincnt;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hltcnt, CTLFLAG_RW,
    &cpu_idle_hltcnt, 0, "Idle loop entry halts");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_spincnt, CTLFLAG_RW,
    &cpu_idle_spincnt, 0, "Idle loop entry spins");
void
cpu_idle(void)
{
	struct thread *td = curthread;
	struct mdglobaldata *gd = mdcpu;

	crit_exit();
	KKASSERT(td->td_pri < TDPRI_CRIT);
	for (;;) {
		/*
		 * See if there are any LWKTs ready to go.
		 */
		lwkt_switch();

		/*
		 * The idle loop halts only if no threads are schedulable
		 * and no signals have occurred.
		 */
		if (cpu_idle_hlt && !lwkt_runnable() &&
		    (td->td_flags & TDF_IDLE_NOHLT) == 0) {
			if (!lwkt_runnable()) {
				struct timeval tv1, tv2;

				gettimeofday(&tv1, NULL);
				umtx_sleep(&gd->mi.gd_runqmask, 0, 1000000);
				gettimeofday(&tv2, NULL);
				if (tv2.tv_usec - tv1.tv_usec +
				    (tv2.tv_sec - tv1.tv_sec) * 1000000
				    > 500000) {
					kprintf("cpu %d idlelock %08x %08x\n",
						gd->mi.gd_cpuid,
						gd->mi.gd_runqmask,
						gd->gd_fpending);
				}
			} else {
				__asm __volatile("pause");
			}
			++cpu_idle_hltcnt;
			td->td_flags &= ~TDF_IDLE_NOHLT;
		} else {
			/*__asm __volatile("sti; pause");*/
			__asm __volatile("pause");
			/*__asm __volatile("sti");*/
			++cpu_idle_spincnt;
		}
	}
}
/*
 * Called by the LWKT switch core with a critical section held if the only
 * schedulable thread needs the MP lock and we couldn't get it.  On
 * a real cpu we just spin in the scheduler.  In the virtual kernel
 * we sleep for a bit.
 */
void
cpu_mplock_contested(void)
{
	usleep(1000);
}

/*
 * Called by the spinlock code with or without a critical section held
 * when a spinlock is found to be seriously contested.
 */
void
cpu_spinlock_contested(void)
{
	usleep(1000);
}
/*
 * Clear registers on exec
 */
void
exec_setregs(u_long entry, u_long stack, u_long ps_strings)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct trapframe *regs = lp->lwp_md.md_regs;
	struct pcb *pcb = lp->lwp_thread->td_pcb;

	/* was i386_user_cleanup() in NetBSD */
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_eip = entry;
	regs->tf_esp = stack;
	regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);

	/* PS_STRINGS value for BSD/OS binaries.  It is 0 for non-BSD/OS. */
	regs->tf_ebx = ps_strings;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		if (pcb == td->td_pcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}

	/*
	 * Initialize the math emulator (if any) for the current process.
	 * Actually, just clear the bit that says that the emulator has
	 * been initialized.  Initialization is delayed until the process
	 * traps to the emulator (if it is done at all) mainly because
	 * emulators don't provide an entry point for initialization.
	 */
	pcb->pcb_flags &= ~FP_SOFTFP;

	/*
	 * note: do not set CR0_TS here.  npxinit() must do it after clearing
	 * gd_npxthread.  Otherwise a preemptive interrupt thread may panic.
	 */
	load_cr0(rcr0() | CR0_MP);

	/* Initialize the npx (if any) for the current process. */
	npxinit(__INITIAL_NPXCW__);

	/*
	 * note: linux emulator needs edx to be 0x0 on entry, which is
	 * handled in execve simply by setting the 64 bit syscall
	 * return value to 0.
	 */
}
	cr0 |= CR0_NE;			/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
	cr0 |= CR0_WP | CR0_AM;
static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
	    req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

extern u_long bootdev;		/* not a cdev_t - encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
	CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)");
/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

extern struct user *proc0paddr;

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(syscall),
	IDTVEC(int0x80_syscall);

#ifdef DEBUG_INTERRUPTS
extern inthand_t *Xrsvdary[256];
#endif
int
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
	lp->lwp_md.md_regs->tf_eip = addr;
	return (0);
}

int
ptrace_single_step(struct lwp *lp)
{
	lp->lwp_md.md_regs->tf_eflags |= PSL_T;
	return (0);
}
int
fill_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	regs->r_gs = tp->tf_gs;
	regs->r_fs = tp->tf_fs;
	regs->r_es = tp->tf_es;
	regs->r_ds = tp->tf_ds;
	regs->r_edi = tp->tf_edi;
	regs->r_esi = tp->tf_esi;
	regs->r_ebp = tp->tf_ebp;
	regs->r_ebx = tp->tf_ebx;
	regs->r_edx = tp->tf_edx;
	regs->r_ecx = tp->tf_ecx;
	regs->r_eax = tp->tf_eax;
	regs->r_eip = tp->tf_eip;
	regs->r_cs = tp->tf_cs;
	regs->r_eflags = tp->tf_eflags;
	regs->r_esp = tp->tf_esp;
	regs->r_ss = tp->tf_ss;
	return (0);
}
int
set_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_gs = regs->r_gs;
	tp->tf_fs = regs->r_fs;
	tp->tf_es = regs->r_es;
	tp->tf_ds = regs->r_ds;
	tp->tf_edi = regs->r_edi;
	tp->tf_esi = regs->r_esi;
	tp->tf_ebp = regs->r_ebp;
	tp->tf_ebx = regs->r_ebx;
	tp->tf_edx = regs->r_edx;
	tp->tf_ecx = regs->r_ecx;
	tp->tf_eax = regs->r_eax;
	tp->tf_eip = regs->r_eip;
	tp->tf_cs = regs->r_cs;
	tp->tf_eflags = regs->r_eflags;
	tp->tf_esp = regs->r_esp;
	tp->tf_ss = regs->r_ss;
	return (0);
}
#ifndef CPU_DISABLE_SSE
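/*
 * fill_fpregs_xmm() and set_fpregs_xmm() below translate between the two
 * FPU save-area layouts: the legacy 8087 image (struct save87) that the
 * fpreg interface expects and the FXSAVE-style image (struct savexmm) the
 * kernel keeps when SSE is available.  They copy the control/status
 * environment words and the eight register accumulators field by field.
 */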
static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;

	sv_87->sv_ex_sw = sv_xmm->sv_ex_sw;
}

static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];

	sv_xmm->sv_ex_sw = sv_87->sv_ex_sw;
}
#endif /* CPU_DISABLE_SSE */
int
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
#ifndef CPU_DISABLE_SSE
	if (cpu_fxsr) {
		fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
				(struct save87 *)fpregs);
		return (0);
	}
#endif /* CPU_DISABLE_SSE */
	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
	return (0);
}

int
set_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
#ifndef CPU_DISABLE_SSE
	if (cpu_fxsr) {
		set_fpregs_xmm((struct save87 *)fpregs,
			       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
		return (0);
	}
#endif /* CPU_DISABLE_SSE */
	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
	return (0);
}
int
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	return (ENOSYS);
}

int
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	return (ENOSYS);
}
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int32_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int32_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0x000000ff) == 0) {
		/*
		 * all GE and LE bits in the dr7 register are zero,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers
		 */
		return 0;
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0x0000000f;

	if (!bp) {
		/*
		 * None of the breakpoint bits are set meaning this
		 * trap was not caused by any of the debug registers
		 */
		return 0;
	}

	/*
	 * at least one of the breakpoints were hit, check to see
	 * which ones and if any of them are user space addresses
	 */
	if (bp & 0x01)
		addr[nbp++] = (caddr_t)rdr0();
	if (bp & 0x02)
		addr[nbp++] = (caddr_t)rdr1();
	if (bp & 0x04)
		addr[nbp++] = (caddr_t)rdr2();
	if (bp & 0x08)
		addr[nbp++] = (caddr_t)rdr3();

	for (i = 0; i < nbp; i++) {
		if (addr[i] <
		    (caddr_t)VM_MAX_USER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}
	cpu_feature = regs[3];
void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}