/*
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (C) 1994, David Greenman
 * Copyright (c) 1982, 1987, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
 */
43 #include "opt_compat.h"
45 #include "opt_directio.h"
48 #include "opt_msgbuf.h"
51 #include <sys/param.h>
52 #include <sys/systm.h>
53 #include <sys/sysproto.h>
54 #include <sys/signalvar.h>
55 #include <sys/kernel.h>
56 #include <sys/linker.h>
57 #include <sys/malloc.h>
60 #include <sys/reboot.h>
62 #include <sys/msgbuf.h>
63 #include <sys/sysent.h>
64 #include <sys/sysctl.h>
65 #include <sys/vmmeter.h>
67 #include <sys/upcall.h>
68 #include <sys/usched.h>
72 #include <vm/vm_param.h>
74 #include <vm/vm_kern.h>
75 #include <vm/vm_object.h>
76 #include <vm/vm_page.h>
77 #include <vm/vm_map.h>
78 #include <vm/vm_pager.h>
79 #include <vm/vm_extern.h>
81 #include <sys/thread2.h>
82 #include <sys/mplock2.h>
90 #include <machine/cpu.h>
91 #include <machine/clock.h>
92 #include <machine/specialreg.h>
93 #include <machine/md_var.h>
94 #include <machine/pcb_ext.h> /* pcb.h included via sys/user.h */
95 #include <machine/globaldata.h> /* CPU_prvspace */
96 #include <machine/smp.h>
98 #include <machine/perfmon.h>
100 #include <machine/cputypes.h>
102 #include <bus/isa/rtc.h>
103 #include <machine/vm86.h>
104 #include <sys/random.h>
105 #include <sys/ptrace.h>
106 #include <machine/sigframe.h>
107 #include <unistd.h> /* umtx_* functions */
108 #include <pthread.h> /* pthread_yield */
extern void dblfault_handler(void);

#ifndef CPU_DISABLE_SSE
static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#endif /* CPU_DISABLE_SSE */

#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */

#ifdef SMP
int64_t tsc_offsets[MAXCPU];
#else
int64_t tsc_offsets[1];
#endif
#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
	CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
	CTLFLAG_RD, &tlb_flush_count, 0, "");
#endif
static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
	u_long pmem = ctob(physmem);
	int error = sysctl_handle_long(oidp, &pmem, 0, req);

	return (error);
}

SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
	0, 0, sysctl_hw_physmem, "LU",
	"Total system memory in bytes (number of pages * page size)");
static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
		ctob((int)Maxmem - vmstats.v_wire_count), req);

	return (error);
}

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_hw_usermem, "IU", "");

SYSCTL_ULONG(_hw, OID_AUTO, availpages, CTLFLAG_RD, &Maxmem, 0, "");
static int
sysctl_machdep_msgbuf(SYSCTL_HANDLER_ARGS)
{
	int error;

	/*
	 * Unwind the buffer, so that it's linear (possibly starting with
	 * some initial nulls).
	 */
	error = sysctl_handle_opaque(oidp,
	    msgbufp->msg_ptr + msgbufp->msg_bufr,
	    msgbufp->msg_size - msgbufp->msg_bufr, req);
	if (error)
		return (error);
	if (msgbufp->msg_bufr > 0) {
		error = sysctl_handle_opaque(oidp, msgbufp->msg_ptr,
		    msgbufp->msg_bufr, req);
	}
	return (error);
}

SYSCTL_PROC(_machdep, OID_AUTO, msgbuf, CTLTYPE_STRING|CTLFLAG_RD,
	0, 0, sysctl_machdep_msgbuf, "A", "Contents of kernel message buffer");
static int msgbuf_clear;

static int
sysctl_machdep_msgbuf_clear(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
	    req);
	if (!error && req->newptr) {
		/* Clear the buffer and reset write pointer */
		bzero(msgbufp->msg_ptr, msgbufp->msg_size);
		msgbufp->msg_bufr = msgbufp->msg_bufx = 0;
		msgbuf_clear = 0;
	}
	return (error);
}

SYSCTL_PROC(_machdep, OID_AUTO, msgbuf_clear, CTLTYPE_INT|CTLFLAG_RW,
	&msgbuf_clear, 0, sysctl_machdep_msgbuf_clear, "I",
	"Clear kernel message buffer");
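/*
 * Any successful write, e.g. "sysctl machdep.msgbuf_clear=1", wipes the
 * buffer; the value itself is only a trigger and is reset afterwards.
 */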
/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	struct sigacts *psp = p->p_sigacts;
	struct sigframe sf, *sfp;
	int oonstack;

	regs = lp->lwp_md.md_regs;
	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;
	/* save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = lp->lwp_sigstk;
	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_gs, sizeof(struct trapframe));

	/* make the size of the saved context visible to userland */
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);
	/* Allocate and validate space for the signal handler context. */
	if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sfp = (struct sigframe *)(lp->lwp_sigstk.ss_sp +
		    lp->lwp_sigstk.ss_size - sizeof(struct sigframe));
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		sfp = (struct sigframe *)regs->tf_esp - 1;
	}
	/* Translate the signal if appropriate */
	if (p->p_sysent->sv_sigtbl) {
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
	}
	/* Build the argument list for the signal handler. */
	sf.sf_signum = sig;
	sf.sf_ucontext = (register_t)&sfp->sf_uc;
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		sf.sf_siginfo = (register_t)&sfp->sf_si;
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* fill siginfo structure */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_err;
	} else {
		/* Old FreeBSD-style arguments. */
		sf.sf_siginfo = code;
		sf.sf_addr = regs->tf_err;
		sf.sf_ahu.sf_handler = catcher;
	}
	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 =
		    &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}
	/*
	 * Save the FPU state and reinit the FP unit
	 */
	npxpush(&sf.sf_uc.uc_mcontext);
	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(lp, SIGILL);
	}

	regs->tf_esp = (int)sfp;
	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
	/*
	 * i386 abi specifies that the direction flag must be cleared
	 * on function entry
	 */
	regs->tf_eflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	if (regs->tf_trapno == T_PROTFLT) {
		regs->tf_fs = _udatasel;
		regs->tf_gs = _udatasel;
	}
	regs->tf_ss = _udatasel;
}
/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * signal handler.
 *
 * Allow userland to set or maintain PSL_RF, the resume flag.  This flag
 * basically controls whether the return PC should skip the first instruction
 * (as in an explicit system call) or re-execute it (as in an exception).
 */
int
cpu_sanitize_frame(struct trapframe *frame)
{
	frame->tf_cs = _ucodesel;
	frame->tf_ds = _udatasel;
	frame->tf_es = _udatasel;
	frame->tf_fs = _udatasel;
	frame->tf_gs = _udatasel;
	frame->tf_ss = _udatasel;
	frame->tf_eflags &= (PSL_RF | PSL_USERCHANGE);
	frame->tf_eflags |= PSL_RESERVED_DEFAULT | PSL_I;
	return (0);
}
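/*
 * Note that a sanitized frame always comes back with user code/data
 * selectors, the reserved eflags bits at their defined values, and
 * interrupts (PSL_I) enabled, no matter what userland supplied.
 */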
int
cpu_sanitize_tls(struct savetls *tls)
{
	struct segment_descriptor *desc;
	int i;

	for (i = 0; i < NGTLS; ++i) {
		desc = &tls->tls[i];
		if (desc->sd_dpl == 0 && desc->sd_type == 0)
			continue;
		if (desc->sd_def32 == 0)
			return (EINVAL);
		if (desc->sd_type != SDT_MEMRWA)
			return (EINVAL);
		if (desc->sd_dpl != SEL_UPL)
			return (EINVAL);
		if (desc->sd_xx != 0 || desc->sd_p != 1)
			return (EINVAL);
	}
	return (0);
}
/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
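/*
 * Worked example: EFL_SECURE() XORs the proposed and current eflags and
 * masks off PSL_USERCHANGE, the set of flags userland may legitimately
 * alter (the arithmetic flags, PSL_T, PSL_D, ...).  A frame differing
 * only in, say, PSL_C therefore passes, while one that flips IOPL bits
 * does not.  CS_SECURE() simply insists the requested %cs selector
 * carries user privilege (SEL_UPL).
 */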
int
sys_sigreturn(struct sigreturn_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;
	ucontext_t ucp;
	int cs;
	int eflags;
	int error;

	error = copyin(uap->sigcntxp, &ucp, sizeof(ucp));
	if (error)
		return (error);

	regs = lp->lwp_md.md_regs;
	eflags = ucp.uc_mcontext.mc_eflags;
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(lp, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp.uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = tf->tf_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
		tf->tf_gs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 *
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
			kprintf("sigreturn: eflags = 0x%x\n", eflags);
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp.uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			kprintf("sigreturn: cs = 0x%x\n", cs);
			trapsignal(lp, SIGBUS, T_PROTFLT);
			return (EINVAL);
		}
		bcopy(&ucp.uc_mcontext.mc_gs, regs, sizeof(struct trapframe));
	}
	/*
	 * Restore the FPU state from the frame
	 */
	npxpop(&ucp.uc_mcontext);

	if (ucp.uc_mcontext.mc_onstack & 1)
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	else
		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

	lp->lwp_sigmask = ucp.uc_sigmask;
	SIG_CANTMASK(lp->lwp_sigmask);
	return (EJUSTRETURN);
}
/*
 * Stack frame on entry to function.  %eax will contain the function vector,
 * %ecx will contain the function data.  flags, ecx, and eax will have
 * already been pushed on the stack.
 */
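/*
 * The upc_frame definition is elided here; a sketch reconstructed from
 * the save/restore code in sendupcall() and fetchupcall() below (the
 * authoritative definition lives in the machine headers):
 *
 *	struct upc_frame {
 *		register_t	eax;
 *		register_t	ecx;
 *		register_t	edx;
 *		register_t	flags;
 *		register_t	oldip;
 *	};
 */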
void
sendupcall(struct vmupcall *vu, int morepending)
{
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;
	struct upcall upcall;
	struct upc_frame upc_frame;
	int crit_count;
	/*
	 * If we are a virtual kernel running an emulated user process
	 * context, switch back to the virtual kernel context before
	 * trying to post the signal.
	 */
	if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
		lp->lwp_md.md_regs->tf_trapno = 0;
		vkernel_trap(lp, lp->lwp_md.md_regs);
	}

	/*
	 * Get the upcall data structure
	 */
	if (copyin(lp->lwp_upcall, &upcall, sizeof(upcall)) ||
	    copyin((char *)upcall.upc_uthread + upcall.upc_critoff,
	    &crit_count, sizeof(int)) != 0) {
		kprintf("bad upcall address\n");
		return;
	}
	/*
	 * If the data structure is already marked pending or has a critical
	 * section count, mark the data structure as pending and return
	 * without doing an upcall.  vu_pending is left set.
	 */
	if (upcall.upc_pending || crit_count >= vu->vu_pending) {
		if (upcall.upc_pending < vu->vu_pending) {
			upcall.upc_pending = vu->vu_pending;
			copyout(&upcall.upc_pending,
			    &lp->lwp_upcall->upc_pending,
			    sizeof(upcall.upc_pending));
		}
		return;
	}
	/*
	 * We can run this upcall now, clear vu_pending.
	 *
	 * Bump our critical section count and set or clear the
	 * user pending flag depending on whether more upcalls are
	 * pending.  The user will be responsible for calling
	 * upc_dispatch(-1) to process remaining upcalls.
	 */
	vu->vu_pending = 0;
	upcall.upc_pending = morepending;
	copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending,
	    sizeof(upcall.upc_pending));
	copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff,
	    sizeof(int));
	/*
	 * Construct a stack frame and issue the upcall
	 */
	regs = lp->lwp_md.md_regs;
	upc_frame.eax = regs->tf_eax;
	upc_frame.ecx = regs->tf_ecx;
	upc_frame.edx = regs->tf_edx;
	upc_frame.flags = regs->tf_eflags;
	upc_frame.oldip = regs->tf_eip;
	if (copyout(&upc_frame, (void *)(regs->tf_esp - sizeof(upc_frame)),
	    sizeof(upc_frame)) != 0) {
		kprintf("bad stack on upcall\n");
	} else {
		regs->tf_eax = (register_t)vu->vu_func;
		regs->tf_ecx = (register_t)vu->vu_data;
		regs->tf_edx = (register_t)lp->lwp_upcall;
		regs->tf_eip = (register_t)vu->vu_ctx;
		regs->tf_esp -= sizeof(upc_frame);
	}
}
/*
 * fetchupcall occurs in the context of a system call, which means that
 * we have to return EJUSTRETURN in order to prevent eax and edx from
 * being overwritten by the syscall return value.
 *
 * if vu is not NULL we return the new context in %edx, the new data in %ecx,
 * and the function pointer in %eax.
 */
int
fetchupcall(struct vmupcall *vu, int morepending, void *rsp)
{
	struct upc_frame upc_frame;
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;
	int error;
	struct upcall upcall;
	int crit_count;

	regs = lp->lwp_md.md_regs;

	error = copyout(&morepending, &lp->lwp_upcall->upc_pending,
	    sizeof(int));
	if (error == 0 && vu != NULL) {
		/*
		 * This jumps us to the next ready context.
		 */
		error = copyin(lp->lwp_upcall, &upcall, sizeof(upcall));
		if (error == 0)
			error = copyin((char *)upcall.upc_uthread +
			    upcall.upc_critoff, &crit_count, sizeof(int));
		if (error == 0)
			error = copyout(&crit_count,
			    (char *)upcall.upc_uthread + upcall.upc_critoff,
			    sizeof(int));
		regs->tf_eax = (register_t)vu->vu_func;
		regs->tf_ecx = (register_t)vu->vu_data;
		regs->tf_edx = (register_t)lp->lwp_upcall;
		regs->tf_eip = (register_t)vu->vu_ctx;
		regs->tf_esp = (register_t)rsp;
	} else if (error == 0) {
		/*
		 * This returns us to the originally interrupted code.
		 */
		error = copyin(rsp, &upc_frame, sizeof(upc_frame));
		regs->tf_eax = upc_frame.eax;
		regs->tf_ecx = upc_frame.ecx;
		regs->tf_edx = upc_frame.edx;
		regs->tf_eflags = (regs->tf_eflags & ~PSL_USERCHANGE) |
		    (upc_frame.flags & PSL_USERCHANGE);
		regs->tf_eip = upc_frame.oldip;
		regs->tf_esp = (register_t)((char *)rsp + sizeof(upc_frame));
	}
	if (error == 0)
		error = EJUSTRETURN;
	return (error);
}
/*
 * cpu_idle() represents the idle LWKT.  You cannot return from this function
 * (unless you want to blow things up!).  Instead we look for runnable threads
 * and loop or halt as appropriate.  Giant is not held on entry to the thread.
 *
 * The main loop is entered with a critical section held, we must release
 * the critical section before doing anything else.  lwkt_switch() will
 * check for pending interrupts due to entering and exiting its own
 * critical section.
 *
 * Note on cpu_idle_hlt:  On an SMP system we rely on a scheduler IPI
 * to wake a HLTed cpu up.
 */
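/*
 * Being a virtual kernel, the "halt" below is cooperative: the idle loop
 * blocks in umtx_sleep() on gd_reqflags until the host wakes it, rather
 * than executing a hardware HLT instruction.
 */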
static int cpu_idle_hlt = 1;
static int cpu_idle_hltcnt;
static int cpu_idle_spincnt;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
	&cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hltcnt, CTLFLAG_RW,
	&cpu_idle_hltcnt, 0, "Idle loop entry halts");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_spincnt, CTLFLAG_RW,
	&cpu_idle_spincnt, 0, "Idle loop entry spins");
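/* e.g. "sysctl machdep.cpu_idle_hlt=0" forces the idle loop to spin. */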
void
cpu_idle(void)
{
	struct thread *td = curthread;
	struct mdglobaldata *gd = mdcpu;
	int reqflags;

	crit_exit();
	KKASSERT(td->td_critcount == 0);
	for (;;) {
		/*
		 * See if there are any LWKTs ready to go.
		 */
		lwkt_switch();

		/*
		 * The idle loop halts only if no threads are schedulable
		 * and no signals have occurred.
		 */
		if (cpu_idle_hlt &&
		    (td->td_gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
			splz();
			KKASSERT(MP_LOCK_HELD() == 0);
			if ((td->td_gd->gd_reqflags &
			     RQF_IDLECHECK_WK_MASK) == 0) {
				struct timeval tv1, tv2;

				gettimeofday(&tv1, NULL);
				reqflags = gd->mi.gd_reqflags &
					   ~RQF_IDLECHECK_WK_MASK;
				umtx_sleep(&gd->mi.gd_reqflags, reqflags,
					   1000000);
				gettimeofday(&tv2, NULL);
				if (tv2.tv_usec - tv1.tv_usec +
				    (tv2.tv_sec - tv1.tv_sec) * 1000000
				    > 500000) {
					kprintf("cpu %d idlelock %08x %08x\n",
						gd->mi.gd_cpuid,
						gd->mi.gd_reqflags,
						gd->gd_fpending);
				}
			}
			++cpu_idle_hltcnt;
		} else {
			splz();
			__asm __volatile("pause");
			++cpu_idle_spincnt;
		}
	}
}
/*
 * Called by the spinlock code with or without a critical section held
 * when a spinlock is found to be seriously contested.
 *
 * We need to enter a critical section to prevent signals from recursing
 * into pthreads.
 */
void
cpu_spinlock_contested(void)
{
	cpu_pause();
}
/*
 * Clear registers on exec
 */
void
exec_setregs(u_long entry, u_long stack, u_long ps_strings)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct trapframe *regs = lp->lwp_md.md_regs;
	struct pcb *pcb = lp->lwp_thread->td_pcb;

	/* was i386_user_cleanup() in NetBSD */
	user_ldt_free(pcb);

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_eip = entry;
	regs->tf_esp = stack;
	regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_cs = _ucodesel;
785 regs
->tf_ebx
= ps_strings
;
788 * Reset the hardware debug registers if they were in use.
789 * They won't have any meaning for the newly exec'd process.
791 if (pcb
->pcb_flags
& PCB_DBREGS
) {
798 if (pcb
== td
->td_pcb
) {
800 * Clear the debug registers on the running
801 * CPU, otherwise they will end up affecting
802 * the next process we switch to.
806 pcb
->pcb_flags
&= ~PCB_DBREGS
;
	/*
	 * Initialize the math emulator (if any) for the current process.
	 * Actually, just clear the bit that says that the emulator has
	 * been initialized.  Initialization is delayed until the process
	 * traps to the emulator (if it is done at all) mainly because
	 * emulators don't provide an entry point for initialization.
	 */
	pcb->pcb_flags &= ~FP_SOFTFP;

	/*
	 * note: do not set CR0_TS here.  npxinit() must do it after clearing
	 * gd_npxthread.  Otherwise a preemptive interrupt thread may panic
	 * in npxdna().
	 */
	load_cr0(rcr0() | CR0_MP);

	/* Initialize the npx (if any) for the current process. */
	npxinit(__INITIAL_NPXCW__);
	/*
	 * note: linux emulator needs edx to be 0x0 on entry, which is
	 * handled in execve simply by setting the 64 bit syscall
	 * return value to 0.
	 */
}

static void
cpu_setregs(void)
{
	unsigned int cr0;

	cr0 = rcr0();
	cr0 |= CR0_NE;			/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
	cr0 |= CR0_WP | CR0_AM;
	load_cr0(cr0);
}
static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
	    req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");
extern u_long bootdev;		/* not a cdev_t - encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
	CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)");
/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

extern struct user *proc0paddr;

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(syscall),
	IDTVEC(int0x80_syscall);

#ifdef DEBUG_INTERRUPTS
extern inthand_t *Xrsvdary[256];
#endif
int
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
	lp->lwp_md.md_regs->tf_eip = addr;
	return (0);
}
int
ptrace_single_step(struct lwp *lp)
{
	lp->lwp_md.md_regs->tf_eflags |= PSL_T;
	return (0);
}
int
fill_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	if ((tp = lp->lwp_md.md_regs) == NULL)
		return (EINVAL);
	regs->r_gs = tp->tf_gs;
	regs->r_fs = tp->tf_fs;
	regs->r_es = tp->tf_es;
	regs->r_ds = tp->tf_ds;
	regs->r_edi = tp->tf_edi;
	regs->r_esi = tp->tf_esi;
	regs->r_ebp = tp->tf_ebp;
	regs->r_ebx = tp->tf_ebx;
	regs->r_edx = tp->tf_edx;
	regs->r_ecx = tp->tf_ecx;
	regs->r_eax = tp->tf_eax;
	regs->r_eip = tp->tf_eip;
	regs->r_cs = tp->tf_cs;
	regs->r_eflags = tp->tf_eflags;
	regs->r_esp = tp->tf_esp;
	regs->r_ss = tp->tf_ss;
	return (0);
}
int
set_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_gs = regs->r_gs;
	tp->tf_fs = regs->r_fs;
	tp->tf_es = regs->r_es;
	tp->tf_ds = regs->r_ds;
	tp->tf_edi = regs->r_edi;
	tp->tf_esi = regs->r_esi;
	tp->tf_ebp = regs->r_ebp;
	tp->tf_ebx = regs->r_ebx;
	tp->tf_edx = regs->r_edx;
	tp->tf_ecx = regs->r_ecx;
	tp->tf_eax = regs->r_eax;
	tp->tf_eip = regs->r_eip;
	tp->tf_cs = regs->r_cs;
	tp->tf_eflags = regs->r_eflags;
	tp->tf_esp = regs->r_esp;
	tp->tf_ss = regs->r_ss;
	return (0);
}
#ifndef CPU_DISABLE_SSE
static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}
static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}
#endif /* CPU_DISABLE_SSE */
int
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
	if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
		return (EINVAL);
#ifndef CPU_DISABLE_SSE
	if (cpu_fxsr) {
		fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
		    (struct save87 *)fpregs);
		return (0);
	}
#endif /* CPU_DISABLE_SSE */
	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
	return (0);
}
int
set_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
#ifndef CPU_DISABLE_SSE
	if (cpu_fxsr) {
		set_fpregs_xmm((struct save87 *)fpregs,
		    &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
		return (0);
	}
#endif /* CPU_DISABLE_SSE */
	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
	return (0);
}
int
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	return (ENOSYS);
}

int
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	return (ENOSYS);
}
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int32_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int32_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0x000000ff) == 0) {
		/*
		 * all GE and LE bits in the dr7 register are zero,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers
		 */
		return 0;
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0x0000000f;
	if (!bp) {
		/*
		 * None of the breakpoint bits are set, meaning this
		 * trap was not caused by any of the debug registers
		 */
		return 0;
	}

	/*
	 * at least one of the breakpoints was hit, check to see
	 * which ones and if any of them are user space addresses
	 */
	if (bp & 0x01)
		addr[nbp++] = (caddr_t)rdr0();
	if (bp & 0x02)
		addr[nbp++] = (caddr_t)rdr1();
	if (bp & 0x04)
		addr[nbp++] = (caddr_t)rdr2();
	if (bp & 0x08)
		addr[nbp++] = (caddr_t)rdr3();

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}
	cpu_feature = regs[3];
void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}