2 * Copyright (c) 1992 Terrence R. Lambert.
3 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
6 * This code is derived from software contributed to Berkeley by
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the University of
20 * California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
38 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
43 #include "opt_atalk.h"
44 #include "opt_compat.h"
47 #include "opt_directio.h"
50 #include "opt_maxmem.h"
51 #include "opt_msgbuf.h"
52 #include "opt_perfmon.h"
54 #include "opt_userconfig.h"
57 #include <sys/param.h>
58 #include <sys/systm.h>
59 #include <sys/sysproto.h>
60 #include <sys/signalvar.h>
61 #include <sys/kernel.h>
62 #include <sys/linker.h>
63 #include <sys/malloc.h>
67 #include <sys/reboot.h>
69 #include <sys/msgbuf.h>
70 #include <sys/sysent.h>
71 #include <sys/sysctl.h>
72 #include <sys/vmmeter.h>
74 #include <sys/upcall.h>
75 #include <sys/usched.h>
79 #include <vm/vm_param.h>
81 #include <vm/vm_kern.h>
82 #include <vm/vm_object.h>
83 #include <vm/vm_page.h>
84 #include <vm/vm_map.h>
85 #include <vm/vm_pager.h>
86 #include <vm/vm_extern.h>
88 #include <sys/thread2.h>
89 #include <sys/mplock2.h>
97 #include <machine/cpu.h>
98 #include <machine/clock.h>
99 #include <machine/specialreg.h>
100 #include <machine/bootinfo.h>
101 #include <machine/md_var.h>
102 #include <machine/pcb_ext.h> /* pcb.h included via sys/user.h */
103 #include <machine/globaldata.h> /* CPU_prvspace */
104 #include <machine/smp.h>
106 #include <machine/perfmon.h>
108 #include <machine/cputypes.h>
109 #include <machine/intr_machdep.h>
112 #include <bus/isa/isa_device.h>
114 #include <machine_base/isa/isa_intr.h>
115 #include <machine_base/isa/elcr_var.h>
116 #include <bus/isa/rtc.h>
117 #include <machine/vm86.h>
118 #include <sys/random.h>
119 #include <sys/ptrace.h>
120 #include <machine/sigframe.h>
122 #include <sys/machintr.h>
123 #include <machine_base/icu/icu_abi.h>
#define PHYSMAP_ENTRIES		10

/* Entry points provided by locore / identcpu (XXX should move to headers). */
extern void init386(int first);
extern void dblfault_handler(void);

extern void printcpuinfo(void);	/* XXX header file */
extern void finishidentcpu(void);
extern void panicifcpuunsupported(void);
extern void initializecpu(void);

static void cpu_startup(void *);
#ifndef CPU_DISABLE_SSE
static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#endif /* CPU_DISABLE_SSE */
/*
 * NOTE(review): only the "#endif" of this conditional survived in the
 * corrupted copy; the matching "#ifdef DIRECTIO" is restored here so the
 * preprocessor conditional balances -- verify against upstream machdep.c.
 */
#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */
static void init_locks(void);
145 SYSINIT(cpu
, SI_BOOT2_SMP
, SI_ORDER_FIRST
, cpu_startup
, NULL
)
147 int _udatasel
, _ucodesel
;
150 int64_t tsc_offsets
[MAXCPU
];
152 int64_t tsc_offsets
[1];
#if defined(SWTCH_OPTIM_STATS)
/* Context-switch optimization counters, exported read-only via sysctl. */
extern int swtch_optim_stats;
/*
 * NOTE(review): the extern for tlb_flush_count and the closing #endif were
 * missing from the corrupted copy; both restored (the sysctl below takes
 * the variable's address, so a declaration must be in scope).
 */
extern int tlb_flush_count;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
	CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
	CTLFLAG_RD, &tlb_flush_count, 0, "");
#endif
165 u_long ebda_addr
= 0;
168 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS
)
170 u_long pmem
= ctob(physmem
);
172 int error
= sysctl_handle_long(oidp
, &pmem
, 0, req
);
176 SYSCTL_PROC(_hw
, HW_PHYSMEM
, physmem
, CTLTYPE_ULONG
|CTLFLAG_RD
,
177 0, 0, sysctl_hw_physmem
, "LU", "Total system memory in bytes (number of pages * page size)");
180 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS
)
182 int error
= sysctl_handle_int(oidp
, 0,
183 ctob(physmem
- vmstats
.v_wire_count
), req
);
187 SYSCTL_PROC(_hw
, HW_USERMEM
, usermem
, CTLTYPE_INT
|CTLFLAG_RD
,
188 0, 0, sysctl_hw_usermem
, "IU", "");
191 sysctl_hw_availpages(SYSCTL_HANDLER_ARGS
)
193 int error
= sysctl_handle_int(oidp
, 0,
194 i386_btop(avail_end
- avail_start
), req
);
198 SYSCTL_PROC(_hw
, OID_AUTO
, availpages
, CTLTYPE_INT
|CTLFLAG_RD
,
199 0, 0, sysctl_hw_availpages
, "I", "");
204 vm_paddr_t phys_avail
[PHYSMAP_ENTRIES
*2+2];
205 vm_paddr_t dump_avail
[PHYSMAP_ENTRIES
*2+2];
208 static vm_offset_t buffer_sva
, buffer_eva
;
209 vm_offset_t clean_sva
, clean_eva
;
210 static vm_offset_t pager_sva
, pager_eva
;
211 static struct trapframe proc0_tf
;
214 cpu_startup(void *dummy
)
218 vm_offset_t firstaddr
;
220 if (boothowto
& RB_VERBOSE
)
224 * Good {morning,afternoon,evening,night}.
226 kprintf("%s", version
);
229 panicifcpuunsupported();
233 kprintf("real memory = %ju (%ju MB)\n",
235 (intmax_t)Realmem
/ 1024 / 1024);
237 * Display any holes after the first chunk of extended memory.
242 kprintf("Physical memory chunk(s):\n");
243 for (indx
= 0; phys_avail
[indx
+ 1] != 0; indx
+= 2) {
244 vm_paddr_t size1
= phys_avail
[indx
+ 1] - phys_avail
[indx
];
246 kprintf("0x%08llx - 0x%08llx, %llu bytes (%llu pages)\n",
247 phys_avail
[indx
], phys_avail
[indx
+ 1] - 1, size1
,
253 * Allocate space for system data structures.
254 * The first available kernel virtual address is in "v".
255 * As pages of kernel virtual memory are allocated, "v" is incremented.
256 * As pages of memory are allocated and cleared,
257 * "firstaddr" is incremented.
258 * An index into the kernel page table corresponding to the
259 * virtual memory address maintained in "v" is kept in "mapaddr".
263 * Make two passes. The first pass calculates how much memory is
264 * needed and allocates it. The second pass assigns virtual
265 * addresses to the various data structures.
269 v
= (caddr_t
)firstaddr
;
271 #define valloc(name, type, num) \
272 (name) = (type *)v; v = (caddr_t)((name)+(num))
273 #define valloclim(name, type, num, lim) \
274 (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
277 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
278 * For the first 64MB of ram nominally allocate sufficient buffers to
279 * cover 1/4 of our ram. Beyond the first 64MB allocate additional
280 * buffers to cover 1/20 of our ram over 64MB. When auto-sizing
281 * the buffer cache we limit the eventual kva reservation to
284 * factor represents the 1/4 x ram conversion.
287 int factor
= 4 * BKVASIZE
/ 1024;
288 int kbytes
= physmem
* (PAGE_SIZE
/ 1024);
292 nbuf
+= min((kbytes
- 4096) / factor
, 65536 / factor
);
294 nbuf
+= (kbytes
- 65536) * 2 / (factor
* 5);
295 if (maxbcache
&& nbuf
> maxbcache
/ BKVASIZE
)
296 nbuf
= maxbcache
/ BKVASIZE
;
300 * Do not allow the buffer_map to be more then 1/2 the size of the
303 if (nbuf
> (virtual_end
- virtual_start
) / (BKVASIZE
* 2)) {
304 nbuf
= (virtual_end
- virtual_start
) / (BKVASIZE
* 2);
305 kprintf("Warning: nbufs capped at %d\n", nbuf
);
308 /* limit to 128 on i386 */
309 nswbuf
= max(min(nbuf
/4, 128), 16);
311 if (nswbuf
< NSWBUF_MIN
)
318 valloc(swbuf
, struct buf
, nswbuf
);
319 valloc(buf
, struct buf
, nbuf
);
322 * End of first pass, size has been calculated so allocate memory
324 if (firstaddr
== 0) {
325 size
= (vm_size_t
)(v
- firstaddr
);
326 firstaddr
= kmem_alloc(&kernel_map
, round_page(size
));
328 panic("startup: no room for tables");
333 * End of second pass, addresses have been assigned
335 if ((vm_size_t
)(v
- firstaddr
) != size
)
336 panic("startup: table size inconsistency");
338 kmem_suballoc(&kernel_map
, &clean_map
, &clean_sva
, &clean_eva
,
339 (nbuf
*BKVASIZE
) + (nswbuf
*MAXPHYS
) + pager_map_size
);
340 kmem_suballoc(&clean_map
, &buffer_map
, &buffer_sva
, &buffer_eva
,
342 buffer_map
.system_map
= 1;
343 kmem_suballoc(&clean_map
, &pager_map
, &pager_sva
, &pager_eva
,
344 (nswbuf
*MAXPHYS
) + pager_map_size
);
345 pager_map
.system_map
= 1;
347 #if defined(USERCONFIG)
349 cninit(); /* the preferred console may have changed */
352 kprintf("avail memory = %ju (%ju MB)\n",
353 (intmax_t)ptoa(vmstats
.v_free_count
),
354 (intmax_t)ptoa(vmstats
.v_free_count
) / 1024 / 1024);
357 * Set up buffers, so they can be used to read disk labels.
360 vm_pager_bufferinit();
362 /* Log ELCR information */
367 * OK, enough kmem_alloc/malloc state should be up, lets get on with it!
369 mp_start(); /* fire up the APs and APICs */
372 MachIntrABI
.finalize();
378 * Send an interrupt to process.
380 * Stack is set up to allow sigcode stored
381 * at top to call routine, followed by kcall
382 * to sigreturn routine below. After sigreturn
383 * resets the signal mask, the stack, and the
384 * frame pointer, it returns to the user
388 sendsig(sig_t catcher
, int sig
, sigset_t
*mask
, u_long code
)
390 struct lwp
*lp
= curthread
->td_lwp
;
391 struct proc
*p
= lp
->lwp_proc
;
392 struct trapframe
*regs
;
393 struct sigacts
*psp
= p
->p_sigacts
;
394 struct sigframe sf
, *sfp
;
397 regs
= lp
->lwp_md
.md_regs
;
398 oonstack
= (lp
->lwp_sigstk
.ss_flags
& SS_ONSTACK
) ? 1 : 0;
400 /* save user context */
401 bzero(&sf
, sizeof(struct sigframe
));
402 sf
.sf_uc
.uc_sigmask
= *mask
;
403 sf
.sf_uc
.uc_stack
= lp
->lwp_sigstk
;
404 sf
.sf_uc
.uc_mcontext
.mc_onstack
= oonstack
;
405 bcopy(regs
, &sf
.sf_uc
.uc_mcontext
.mc_gs
, sizeof(struct trapframe
));
407 /* make the size of the saved context visible to userland */
408 sf
.sf_uc
.uc_mcontext
.mc_len
= sizeof(sf
.sf_uc
.uc_mcontext
);
410 /* save mailbox pending state for syscall interlock semantics */
411 if (p
->p_flag
& P_MAILBOX
)
412 sf
.sf_uc
.uc_mcontext
.mc_xflags
|= PGEX_MAILBOX
;
414 /* Allocate and validate space for the signal handler context. */
415 if ((lp
->lwp_flag
& LWP_ALTSTACK
) != 0 && !oonstack
&&
416 SIGISMEMBER(psp
->ps_sigonstack
, sig
)) {
417 sfp
= (struct sigframe
*)(lp
->lwp_sigstk
.ss_sp
+
418 lp
->lwp_sigstk
.ss_size
- sizeof(struct sigframe
));
419 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
421 sfp
= (struct sigframe
*)regs
->tf_esp
- 1;
424 /* Translate the signal is appropriate */
425 if (p
->p_sysent
->sv_sigtbl
) {
426 if (sig
<= p
->p_sysent
->sv_sigsize
)
427 sig
= p
->p_sysent
->sv_sigtbl
[_SIG_IDX(sig
)];
430 /* Build the argument list for the signal handler. */
432 sf
.sf_ucontext
= (register_t
)&sfp
->sf_uc
;
433 if (SIGISMEMBER(psp
->ps_siginfo
, sig
)) {
434 /* Signal handler installed with SA_SIGINFO. */
435 sf
.sf_siginfo
= (register_t
)&sfp
->sf_si
;
436 sf
.sf_ahu
.sf_action
= (__siginfohandler_t
*)catcher
;
438 /* fill siginfo structure */
439 sf
.sf_si
.si_signo
= sig
;
440 sf
.sf_si
.si_code
= code
;
441 sf
.sf_si
.si_addr
= (void*)regs
->tf_err
;
444 /* Old FreeBSD-style arguments. */
445 sf
.sf_siginfo
= code
;
446 sf
.sf_addr
= regs
->tf_err
;
447 sf
.sf_ahu
.sf_handler
= catcher
;
451 * If we're a vm86 process, we want to save the segment registers.
452 * We also change eflags to be our emulated eflags, not the actual
455 if (regs
->tf_eflags
& PSL_VM
) {
456 struct trapframe_vm86
*tf
= (struct trapframe_vm86
*)regs
;
457 struct vm86_kernel
*vm86
= &lp
->lwp_thread
->td_pcb
->pcb_ext
->ext_vm86
;
459 sf
.sf_uc
.uc_mcontext
.mc_gs
= tf
->tf_vm86_gs
;
460 sf
.sf_uc
.uc_mcontext
.mc_fs
= tf
->tf_vm86_fs
;
461 sf
.sf_uc
.uc_mcontext
.mc_es
= tf
->tf_vm86_es
;
462 sf
.sf_uc
.uc_mcontext
.mc_ds
= tf
->tf_vm86_ds
;
464 if (vm86
->vm86_has_vme
== 0)
465 sf
.sf_uc
.uc_mcontext
.mc_eflags
=
466 (tf
->tf_eflags
& ~(PSL_VIF
| PSL_VIP
)) |
467 (vm86
->vm86_eflags
& (PSL_VIF
| PSL_VIP
));
470 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
471 * syscalls made by the signal handler. This just avoids
472 * wasting time for our lazy fixup of such faults. PSL_NT
473 * does nothing in vm86 mode, but vm86 programs can set it
474 * almost legitimately in probes for old cpu types.
476 tf
->tf_eflags
&= ~(PSL_VM
| PSL_NT
| PSL_VIF
| PSL_VIP
);
480 * Save the FPU state and reinit the FP unit
482 npxpush(&sf
.sf_uc
.uc_mcontext
);
485 * Copy the sigframe out to the user's stack.
487 if (copyout(&sf
, sfp
, sizeof(struct sigframe
)) != 0) {
489 * Something is wrong with the stack pointer.
490 * ...Kill the process.
495 regs
->tf_esp
= (int)sfp
;
496 regs
->tf_eip
= PS_STRINGS
- *(p
->p_sysent
->sv_szsigcode
);
499 * i386 abi specifies that the direction flag must be cleared
502 regs
->tf_eflags
&= ~(PSL_T
|PSL_D
);
504 regs
->tf_cs
= _ucodesel
;
505 regs
->tf_ds
= _udatasel
;
506 regs
->tf_es
= _udatasel
;
509 * Allow the signal handler to inherit %fs in addition to %gs as
510 * the userland program might be using both.
512 * However, if a T_PROTFLT occured the segment registers could be
513 * totally broken. They must be reset in order to be able to
514 * return to userland.
516 if (regs
->tf_trapno
== T_PROTFLT
) {
517 regs
->tf_fs
= _udatasel
;
518 regs
->tf_gs
= _udatasel
;
520 regs
->tf_ss
= _udatasel
;
524 * Sanitize the trapframe for a virtual kernel passing control to a custom
525 * VM context. Remove any items that would otherwise create a privilage
528 * XXX at the moment we allow userland to set the resume flag. Is this a
532 cpu_sanitize_frame(struct trapframe
*frame
)
534 frame
->tf_cs
= _ucodesel
;
535 frame
->tf_ds
= _udatasel
;
536 frame
->tf_es
= _udatasel
; /* XXX allow userland this one too? */
538 frame
->tf_fs
= _udatasel
;
539 frame
->tf_gs
= _udatasel
;
541 frame
->tf_ss
= _udatasel
;
542 frame
->tf_eflags
&= (PSL_RF
| PSL_USERCHANGE
);
543 frame
->tf_eflags
|= PSL_RESERVED_DEFAULT
| PSL_I
;
548 cpu_sanitize_tls(struct savetls
*tls
)
550 struct segment_descriptor
*desc
;
553 for (i
= 0; i
< NGTLS
; ++i
) {
555 if (desc
->sd_dpl
== 0 && desc
->sd_type
== 0)
557 if (desc
->sd_def32
== 0)
559 if (desc
->sd_type
!= SDT_MEMRWA
)
561 if (desc
->sd_dpl
!= SEL_UPL
)
563 if (desc
->sd_xx
!= 0 || desc
->sd_p
!= 1)
570 * sigreturn(ucontext_t *sigcntxp)
572 * System call to cleanup state after a signal
573 * has been taken. Reset signal mask and
574 * stack state from context left by sendsig (above).
575 * Return to previous pc and psl as specified by
576 * context left by sendsig. Check carefully to
577 * make sure that the user has not modified the
578 * state to gain improper privileges.
/*
 * Validation helpers for sys_sigreturn():
 * EFL_SECURE - true when the new eflags (ef) differs from the old (oef)
 *		only in bits userland is permitted to change.
 * CS_SECURE  - true when the code selector requests user privilege.
 */
#define EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
586 sys_sigreturn(struct sigreturn_args
*uap
)
588 struct lwp
*lp
= curthread
->td_lwp
;
589 struct proc
*p
= lp
->lwp_proc
;
590 struct trapframe
*regs
;
598 * We have to copy the information into kernel space so userland
599 * can't modify it while we are sniffing it.
601 regs
= lp
->lwp_md
.md_regs
;
602 error
= copyin(uap
->sigcntxp
, &uc
, sizeof(uc
));
606 eflags
= ucp
->uc_mcontext
.mc_eflags
;
608 if (eflags
& PSL_VM
) {
609 struct trapframe_vm86
*tf
= (struct trapframe_vm86
*)regs
;
610 struct vm86_kernel
*vm86
;
613 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
614 * set up the vm86 area, and we can't enter vm86 mode.
616 if (lp
->lwp_thread
->td_pcb
->pcb_ext
== 0)
618 vm86
= &lp
->lwp_thread
->td_pcb
->pcb_ext
->ext_vm86
;
619 if (vm86
->vm86_inited
== 0)
622 /* go back to user mode if both flags are set */
623 if ((eflags
& PSL_VIP
) && (eflags
& PSL_VIF
))
624 trapsignal(lp
, SIGBUS
, 0);
626 if (vm86
->vm86_has_vme
) {
627 eflags
= (tf
->tf_eflags
& ~VME_USERCHANGE
) |
628 (eflags
& VME_USERCHANGE
) | PSL_VM
;
630 vm86
->vm86_eflags
= eflags
; /* save VIF, VIP */
631 eflags
= (tf
->tf_eflags
& ~VM_USERCHANGE
) |
632 (eflags
& VM_USERCHANGE
) | PSL_VM
;
634 bcopy(&ucp
->uc_mcontext
.mc_gs
, tf
, sizeof(struct trapframe
));
635 tf
->tf_eflags
= eflags
;
636 tf
->tf_vm86_ds
= tf
->tf_ds
;
637 tf
->tf_vm86_es
= tf
->tf_es
;
638 tf
->tf_vm86_fs
= tf
->tf_fs
;
639 tf
->tf_vm86_gs
= tf
->tf_gs
;
640 tf
->tf_ds
= _udatasel
;
641 tf
->tf_es
= _udatasel
;
643 tf
->tf_fs
= _udatasel
;
644 tf
->tf_gs
= _udatasel
;
648 * Don't allow users to change privileged or reserved flags.
651 * XXX do allow users to change the privileged flag PSL_RF.
652 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers
653 * should sometimes set it there too. tf_eflags is kept in
654 * the signal context during signal handling and there is no
655 * other place to remember it, so the PSL_RF bit may be
656 * corrupted by the signal handler without us knowing.
657 * Corruption of the PSL_RF bit at worst causes one more or
658 * one less debugger trap, so allowing it is fairly harmless.
660 if (!EFL_SECURE(eflags
& ~PSL_RF
, regs
->tf_eflags
& ~PSL_RF
)) {
661 kprintf("sigreturn: eflags = 0x%x\n", eflags
);
666 * Don't allow users to load a valid privileged %cs. Let the
667 * hardware check for invalid selectors, excess privilege in
668 * other selectors, invalid %eip's and invalid %esp's.
670 cs
= ucp
->uc_mcontext
.mc_cs
;
671 if (!CS_SECURE(cs
)) {
672 kprintf("sigreturn: cs = 0x%x\n", cs
);
673 trapsignal(lp
, SIGBUS
, T_PROTFLT
);
676 bcopy(&ucp
->uc_mcontext
.mc_gs
, regs
, sizeof(struct trapframe
));
680 * Restore the FPU state from the frame
683 npxpop(&ucp
->uc_mcontext
);
686 * Merge saved signal mailbox pending flag to maintain interlock
687 * semantics against system calls.
689 if (ucp
->uc_mcontext
.mc_xflags
& PGEX_MAILBOX
)
690 p
->p_flag
|= P_MAILBOX
;
692 if (ucp
->uc_mcontext
.mc_onstack
& 1)
693 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
695 lp
->lwp_sigstk
.ss_flags
&= ~SS_ONSTACK
;
697 lp
->lwp_sigmask
= ucp
->uc_sigmask
;
698 SIG_CANTMASK(lp
->lwp_sigmask
);
704 * Stack frame on entry to function. %eax will contain the function vector,
705 * %ecx will contain the function data. flags, ecx, and eax will have
706 * already been pushed on the stack.
717 sendupcall(struct vmupcall
*vu
, int morepending
)
719 struct lwp
*lp
= curthread
->td_lwp
;
720 struct trapframe
*regs
;
721 struct upcall upcall
;
722 struct upc_frame upc_frame
;
726 * If we are a virtual kernel running an emulated user process
727 * context, switch back to the virtual kernel context before
728 * trying to post the signal.
730 if (lp
->lwp_vkernel
&& lp
->lwp_vkernel
->ve
) {
731 lp
->lwp_md
.md_regs
->tf_trapno
= 0;
732 vkernel_trap(lp
, lp
->lwp_md
.md_regs
);
736 * Get the upcall data structure
738 if (copyin(lp
->lwp_upcall
, &upcall
, sizeof(upcall
)) ||
739 copyin((char *)upcall
.upc_uthread
+ upcall
.upc_critoff
, &crit_count
, sizeof(int))
742 kprintf("bad upcall address\n");
747 * If the data structure is already marked pending or has a critical
748 * section count, mark the data structure as pending and return
749 * without doing an upcall. vu_pending is left set.
751 if (upcall
.upc_pending
|| crit_count
>= vu
->vu_pending
) {
752 if (upcall
.upc_pending
< vu
->vu_pending
) {
753 upcall
.upc_pending
= vu
->vu_pending
;
754 copyout(&upcall
.upc_pending
, &lp
->lwp_upcall
->upc_pending
,
755 sizeof(upcall
.upc_pending
));
761 * We can run this upcall now, clear vu_pending.
763 * Bump our critical section count and set or clear the
764 * user pending flag depending on whether more upcalls are
765 * pending. The user will be responsible for calling
766 * upc_dispatch(-1) to process remaining upcalls.
769 upcall
.upc_pending
= morepending
;
771 copyout(&upcall
.upc_pending
, &lp
->lwp_upcall
->upc_pending
,
772 sizeof(upcall
.upc_pending
));
773 copyout(&crit_count
, (char *)upcall
.upc_uthread
+ upcall
.upc_critoff
,
777 * Construct a stack frame and issue the upcall
779 regs
= lp
->lwp_md
.md_regs
;
780 upc_frame
.eax
= regs
->tf_eax
;
781 upc_frame
.ecx
= regs
->tf_ecx
;
782 upc_frame
.edx
= regs
->tf_edx
;
783 upc_frame
.flags
= regs
->tf_eflags
;
784 upc_frame
.oldip
= regs
->tf_eip
;
785 if (copyout(&upc_frame
, (void *)(regs
->tf_esp
- sizeof(upc_frame
)),
786 sizeof(upc_frame
)) != 0) {
787 kprintf("bad stack on upcall\n");
789 regs
->tf_eax
= (register_t
)vu
->vu_func
;
790 regs
->tf_ecx
= (register_t
)vu
->vu_data
;
791 regs
->tf_edx
= (register_t
)lp
->lwp_upcall
;
792 regs
->tf_eip
= (register_t
)vu
->vu_ctx
;
793 regs
->tf_esp
-= sizeof(upc_frame
);
798 * fetchupcall occurs in the context of a system call, which means that
799 * we have to return EJUSTRETURN in order to prevent eax and edx from
800 * being overwritten by the syscall return value.
802 * if vu is not NULL we return the new context in %edx, the new data in %ecx,
803 * and the function pointer in %eax.
806 fetchupcall(struct vmupcall
*vu
, int morepending
, void *rsp
)
808 struct upc_frame upc_frame
;
809 struct lwp
*lp
= curthread
->td_lwp
;
810 struct trapframe
*regs
;
812 struct upcall upcall
;
815 regs
= lp
->lwp_md
.md_regs
;
817 error
= copyout(&morepending
, &lp
->lwp_upcall
->upc_pending
, sizeof(int));
821 * This jumps us to the next ready context.
824 error
= copyin(lp
->lwp_upcall
, &upcall
, sizeof(upcall
));
827 error
= copyin((char *)upcall
.upc_uthread
+ upcall
.upc_critoff
, &crit_count
, sizeof(int));
830 error
= copyout(&crit_count
, (char *)upcall
.upc_uthread
+ upcall
.upc_critoff
, sizeof(int));
831 regs
->tf_eax
= (register_t
)vu
->vu_func
;
832 regs
->tf_ecx
= (register_t
)vu
->vu_data
;
833 regs
->tf_edx
= (register_t
)lp
->lwp_upcall
;
834 regs
->tf_eip
= (register_t
)vu
->vu_ctx
;
835 regs
->tf_esp
= (register_t
)rsp
;
838 * This returns us to the originally interrupted code.
840 error
= copyin(rsp
, &upc_frame
, sizeof(upc_frame
));
841 regs
->tf_eax
= upc_frame
.eax
;
842 regs
->tf_ecx
= upc_frame
.ecx
;
843 regs
->tf_edx
= upc_frame
.edx
;
844 regs
->tf_eflags
= (regs
->tf_eflags
& ~PSL_USERCHANGE
) |
845 (upc_frame
.flags
& PSL_USERCHANGE
);
846 regs
->tf_eip
= upc_frame
.oldip
;
847 regs
->tf_esp
= (register_t
)((char *)rsp
+ sizeof(upc_frame
));
856 * Machine dependent boot() routine
858 * I haven't seen anything to put here yet
859 * Possibly some stuff might be grafted back here from boot()
867 * Shutdown the CPU as much as possible
873 __asm__
__volatile("hlt");
877 * cpu_idle() represents the idle LWKT. You cannot return from this function
878 * (unless you want to blow things up!). Instead we look for runnable threads
879 * and loop or halt as appropriate. Giant is not held on entry to the thread.
881 * The main loop is entered with a critical section held, we must release
882 * the critical section before doing anything else. lwkt_switch() will
883 * check for pending interrupts due to entering and exiting its own
886 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
887 * However, there are cases where the idlethread will be entered with
888 * the possibility that no IPI will occur and in such cases
889 * lwkt_switch() sets RQF_WAKEUP. We usually check
890 * RQF_IDLECHECK_WK_MASK.
892 * NOTE: cpu_idle_hlt again defaults to 2 (use ACPI sleep states). Set to
893 * 1 to just use hlt and for debugging purposes.
895 static int cpu_idle_hlt
= 2;
896 static int cpu_idle_hltcnt
;
897 static int cpu_idle_spincnt
;
898 static u_int cpu_idle_repeat
= 4;
899 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_hlt
, CTLFLAG_RW
,
900 &cpu_idle_hlt
, 0, "Idle loop HLT enable");
901 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_hltcnt
, CTLFLAG_RW
,
902 &cpu_idle_hltcnt
, 0, "Idle loop entry halts");
903 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_spincnt
, CTLFLAG_RW
,
904 &cpu_idle_spincnt
, 0, "Idle loop entry spins");
905 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_repeat
, CTLFLAG_RW
,
906 &cpu_idle_repeat
, 0, "Idle entries before acpi hlt");
/*
 * Default idle hook: atomically enable interrupts and halt ("sti; hlt"
 * back-to-back so an interrupt cannot slip in between).
 * NOTE(review): the "static void" line and braces were missing from this
 * corrupted copy and have been restored -- verify against upstream.
 */
static void
cpu_idle_default_hook(void)
{
	/*
	 * We must guarantee that hlt is exactly the instruction
	 * wanted.
	 */
	__asm __volatile("sti; hlt");
}

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;
924 globaldata_t gd
= mycpu
;
925 struct thread
*td
= gd
->gd_curthread
;
930 KKASSERT(td
->td_critcount
== 0);
933 * See if there are any LWKTs ready to go.
938 * When halting inside a cli we must check for reqflags
939 * races, particularly [re]schedule requests. Running
940 * splz() does the job.
943 * 0 Never halt, just spin
945 * 1 Always use HLT (or MONITOR/MWAIT if avail).
946 * This typically eats more power than the
949 * 2 Use HLT/MONITOR/MWAIT up to a point and then
950 * use the ACPI halt (default). This is a hybrid
951 * approach. See machdep.cpu_idle_repeat.
953 * 3 Always use the ACPI halt. This typically
954 * eats the least amount of power but the cpu
955 * will be slow waking up. Slows down e.g.
956 * compiles and other pipe/event oriented stuff.
959 * NOTE: Interrupts are enabled and we are not in a critical
962 * NOTE: Preemptions do not reset gd_idle_repeat. Also we
963 * don't bother capping gd_idle_repeat, it is ok if
966 ++gd
->gd_idle_repeat
;
967 reqflags
= gd
->gd_reqflags
;
968 quick
= (cpu_idle_hlt
== 1) ||
970 gd
->gd_idle_repeat
< cpu_idle_repeat
);
972 if (quick
&& (cpu_mi_feature
& CPU_MI_MONITOR
) &&
973 (reqflags
& RQF_IDLECHECK_WK_MASK
) == 0) {
974 cpu_mmw_pause_int(&gd
->gd_reqflags
, reqflags
);
976 } else if (cpu_idle_hlt
) {
977 __asm
__volatile("cli");
979 if ((gd
->gd_reqflags
& RQF_IDLECHECK_WK_MASK
) == 0) {
981 cpu_idle_default_hook();
985 __asm
__volatile("sti");
989 __asm
__volatile("sti");
998 * This routine is called if a spinlock has been held through the
999 * exponential backoff period and is seriously contested. On a real cpu
1003 cpu_spinlock_contested(void)
1011 * Clear registers on exec
1014 exec_setregs(u_long entry
, u_long stack
, u_long ps_strings
)
1016 struct thread
*td
= curthread
;
1017 struct lwp
*lp
= td
->td_lwp
;
1018 struct pcb
*pcb
= td
->td_pcb
;
1019 struct trapframe
*regs
= lp
->lwp_md
.md_regs
;
1021 /* was i386_user_cleanup() in NetBSD */
1024 bzero((char *)regs
, sizeof(struct trapframe
));
1025 regs
->tf_eip
= entry
;
1026 regs
->tf_esp
= stack
;
1027 regs
->tf_eflags
= PSL_USER
| (regs
->tf_eflags
& PSL_T
);
1028 regs
->tf_ss
= _udatasel
;
1029 regs
->tf_ds
= _udatasel
;
1030 regs
->tf_es
= _udatasel
;
1031 regs
->tf_fs
= _udatasel
;
1032 regs
->tf_gs
= _udatasel
;
1033 regs
->tf_cs
= _ucodesel
;
1035 /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */
1036 regs
->tf_ebx
= ps_strings
;
1039 * Reset the hardware debug registers if they were in use.
1040 * They won't have any meaning for the newly exec'd process.
1042 if (pcb
->pcb_flags
& PCB_DBREGS
) {
1049 if (pcb
== td
->td_pcb
) {
1051 * Clear the debug registers on the running
1052 * CPU, otherwise they will end up affecting
1053 * the next process we switch to.
1057 pcb
->pcb_flags
&= ~PCB_DBREGS
;
1061 * Initialize the math emulator (if any) for the current process.
1062 * Actually, just clear the bit that says that the emulator has
1063 * been initialized. Initialization is delayed until the process
1064 * traps to the emulator (if it is done at all) mainly because
1065 * emulators don't provide an entry point for initialization.
1067 pcb
->pcb_flags
&= ~FP_SOFTFP
;
1070 * note: do not set CR0_TS here. npxinit() must do it after clearing
1071 * gd_npxthread. Otherwise a preemptive interrupt thread may panic
1075 load_cr0(rcr0() | CR0_MP
);
1078 /* Initialize the npx (if any) for the current process. */
1079 npxinit(__INITIAL_NPXCW__
);
1084 * note: linux emulator needs edx to be 0x0 on entry, which is
1085 * handled in execve simply by setting the 64 bit syscall
1086 * return value to 0.
1096 cr0
|= CR0_NE
; /* Done by npxinit() */
1097 cr0
|= CR0_MP
| CR0_TS
; /* Done at every execve() too. */
1098 cr0
|= CR0_WP
| CR0_AM
;
1104 sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS
)
1107 error
= sysctl_handle_int(oidp
, oidp
->oid_arg1
, oidp
->oid_arg2
,
1109 if (!error
&& req
->newptr
)
1114 SYSCTL_PROC(_machdep
, CPU_ADJKERNTZ
, adjkerntz
, CTLTYPE_INT
|CTLFLAG_RW
,
1115 &adjkerntz
, 0, sysctl_machdep_adjkerntz
, "I", "");
1117 SYSCTL_INT(_machdep
, CPU_DISRTCSET
, disable_rtc_set
,
1118 CTLFLAG_RW
, &disable_rtc_set
, 0, "");
1120 SYSCTL_STRUCT(_machdep
, CPU_BOOTINFO
, bootinfo
,
1121 CTLFLAG_RD
, &bootinfo
, bootinfo
, "");
1123 SYSCTL_INT(_machdep
, CPU_WALLCLOCK
, wall_cmos_clock
,
1124 CTLFLAG_RW
, &wall_cmos_clock
, 0, "");
1126 extern u_long bootdev
; /* not a cdev_t - encoding is different */
1127 SYSCTL_ULONG(_machdep
, OID_AUTO
, guessed_bootdev
,
1128 CTLFLAG_RD
, &bootdev
, 0, "Boot device (not in cdev_t format)");
1131 * Initialize 386 and configure to run kernel
1135 * Initialize segments & interrupt table
1139 union descriptor gdt
[NGDT
* MAXCPU
]; /* global descriptor table */
1140 static struct gate_descriptor idt0
[NIDT
];
1141 struct gate_descriptor
*idt
= &idt0
[0]; /* interrupt descriptor table */
1142 union descriptor ldt
[NLDT
]; /* local descriptor table */
1144 /* table descriptors - used to load tables by cpu */
1145 struct region_descriptor r_gdt
, r_idt
;
1147 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
1148 extern int has_f00f_bug
;
1151 static struct i386tss dblfault_tss
;
1152 static char dblfault_stack
[PAGE_SIZE
];
1154 extern struct user
*proc0paddr
;
1157 /* software prototypes -- in more palatable form */
1158 struct soft_segment_descriptor gdt_segs
[] = {
1159 /* GNULL_SEL 0 Null Descriptor */
1160 { 0x0, /* segment base address */
1162 0, /* segment type */
1163 0, /* segment descriptor priority level */
1164 0, /* segment descriptor present */
1166 0, /* default 32 vs 16 bit size */
1167 0 /* limit granularity (byte/page units)*/ },
1168 /* GCODE_SEL 1 Code Descriptor for kernel */
1169 { 0x0, /* segment base address */
1170 0xfffff, /* length - all address space */
1171 SDT_MEMERA
, /* segment type */
1172 0, /* segment descriptor priority level */
1173 1, /* segment descriptor present */
1175 1, /* default 32 vs 16 bit size */
1176 1 /* limit granularity (byte/page units)*/ },
1177 /* GDATA_SEL 2 Data Descriptor for kernel */
1178 { 0x0, /* segment base address */
1179 0xfffff, /* length - all address space */
1180 SDT_MEMRWA
, /* segment type */
1181 0, /* segment descriptor priority level */
1182 1, /* segment descriptor present */
1184 1, /* default 32 vs 16 bit size */
1185 1 /* limit granularity (byte/page units)*/ },
1186 /* GPRIV_SEL 3 SMP Per-Processor Private Data Descriptor */
1187 { 0x0, /* segment base address */
1188 0xfffff, /* length - all address space */
1189 SDT_MEMRWA
, /* segment type */
1190 0, /* segment descriptor priority level */
1191 1, /* segment descriptor present */
1193 1, /* default 32 vs 16 bit size */
1194 1 /* limit granularity (byte/page units)*/ },
1195 /* GPROC0_SEL 4 Proc 0 Tss Descriptor */
1197 0x0, /* segment base address */
1198 sizeof(struct i386tss
)-1,/* length - all address space */
1199 SDT_SYS386TSS
, /* segment type */
1200 0, /* segment descriptor priority level */
1201 1, /* segment descriptor present */
1203 0, /* unused - default 32 vs 16 bit size */
1204 0 /* limit granularity (byte/page units)*/ },
1205 /* GLDT_SEL 5 LDT Descriptor */
1206 { (int) ldt
, /* segment base address */
1207 sizeof(ldt
)-1, /* length - all address space */
1208 SDT_SYSLDT
, /* segment type */
1209 SEL_UPL
, /* segment descriptor priority level */
1210 1, /* segment descriptor present */
1212 0, /* unused - default 32 vs 16 bit size */
1213 0 /* limit granularity (byte/page units)*/ },
1214 /* GUSERLDT_SEL 6 User LDT Descriptor per process */
1215 { (int) ldt
, /* segment base address */
1216 (512 * sizeof(union descriptor
)-1), /* length */
1217 SDT_SYSLDT
, /* segment type */
1218 0, /* segment descriptor priority level */
1219 1, /* segment descriptor present */
1221 0, /* unused - default 32 vs 16 bit size */
1222 0 /* limit granularity (byte/page units)*/ },
1223 /* GTGATE_SEL 7 Null Descriptor - Placeholder */
1224 { 0x0, /* segment base address */
1225 0x0, /* length - all address space */
1226 0, /* segment type */
1227 0, /* segment descriptor priority level */
1228 0, /* segment descriptor present */
1230 0, /* default 32 vs 16 bit size */
1231 0 /* limit granularity (byte/page units)*/ },
1232 /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
1233 { 0x400, /* segment base address */
1234 0xfffff, /* length */
1235 SDT_MEMRWA
, /* segment type */
1236 0, /* segment descriptor priority level */
1237 1, /* segment descriptor present */
1239 1, /* default 32 vs 16 bit size */
1240 1 /* limit granularity (byte/page units)*/ },
1241 /* GPANIC_SEL 9 Panic Tss Descriptor */
1242 { (int) &dblfault_tss
, /* segment base address */
1243 sizeof(struct i386tss
)-1,/* length - all address space */
1244 SDT_SYS386TSS
, /* segment type */
1245 0, /* segment descriptor priority level */
1246 1, /* segment descriptor present */
1248 0, /* unused - default 32 vs 16 bit size */
1249 0 /* limit granularity (byte/page units)*/ },
1250 /* GBIOSCODE32_SEL 10 BIOS 32-bit interface (32bit Code) */
1251 { 0, /* segment base address (overwritten) */
1252 0xfffff, /* length */
1253 SDT_MEMERA
, /* segment type */
1254 0, /* segment descriptor priority level */
1255 1, /* segment descriptor present */
1257 0, /* default 32 vs 16 bit size */
1258 1 /* limit granularity (byte/page units)*/ },
1259 /* GBIOSCODE16_SEL 11 BIOS 32-bit interface (16bit Code) */
1260 { 0, /* segment base address (overwritten) */
1261 0xfffff, /* length */
1262 SDT_MEMERA
, /* segment type */
1263 0, /* segment descriptor priority level */
1264 1, /* segment descriptor present */
1266 0, /* default 32 vs 16 bit size */
1267 1 /* limit granularity (byte/page units)*/ },
1268 /* GBIOSDATA_SEL 12 BIOS 32-bit interface (Data) */
1269 { 0, /* segment base address (overwritten) */
1270 0xfffff, /* length */
1271 SDT_MEMRWA
, /* segment type */
1272 0, /* segment descriptor priority level */
1273 1, /* segment descriptor present */
1275 1, /* default 32 vs 16 bit size */
1276 1 /* limit granularity (byte/page units)*/ },
1277 /* GBIOSUTIL_SEL 13 BIOS 16-bit interface (Utility) */
1278 { 0, /* segment base address (overwritten) */
1279 0xfffff, /* length */
1280 SDT_MEMRWA
, /* segment type */
1281 0, /* segment descriptor priority level */
1282 1, /* segment descriptor present */
1284 0, /* default 32 vs 16 bit size */
1285 1 /* limit granularity (byte/page units)*/ },
1286 /* GBIOSARGS_SEL 14 BIOS 16-bit interface (Arguments) */
1287 { 0, /* segment base address (overwritten) */
1288 0xfffff, /* length */
1289 SDT_MEMRWA
, /* segment type */
1290 0, /* segment descriptor priority level */
1291 1, /* segment descriptor present */
1293 0, /* default 32 vs 16 bit size */
1294 1 /* limit granularity (byte/page units)*/ },
1295 /* GTLS_START 15 TLS */
1296 { 0x0, /* segment base address */
1298 0, /* segment type */
1299 0, /* segment descriptor priority level */
1300 0, /* segment descriptor present */
1302 0, /* default 32 vs 16 bit size */
1303 0 /* limit granularity (byte/page units)*/ },
1304 /* GTLS_START+1 16 TLS */
1305 { 0x0, /* segment base address */
1307 0, /* segment type */
1308 0, /* segment descriptor priority level */
1309 0, /* segment descriptor present */
1311 0, /* default 32 vs 16 bit size */
1312 0 /* limit granularity (byte/page units)*/ },
1313 /* GTLS_END 17 TLS */
1314 { 0x0, /* segment base address */
1316 0, /* segment type */
1317 0, /* segment descriptor priority level */
1318 0, /* segment descriptor present */
1320 0, /* default 32 vs 16 bit size */
1321 0 /* limit granularity (byte/page units)*/ },
1324 static struct soft_segment_descriptor ldt_segs
[] = {
1325 /* Null Descriptor - overwritten by call gate */
1326 { 0x0, /* segment base address */
1327 0x0, /* length - all address space */
1328 0, /* segment type */
1329 0, /* segment descriptor priority level */
1330 0, /* segment descriptor present */
1332 0, /* default 32 vs 16 bit size */
1333 0 /* limit granularity (byte/page units)*/ },
1334 /* Null Descriptor - overwritten by call gate */
1335 { 0x0, /* segment base address */
1336 0x0, /* length - all address space */
1337 0, /* segment type */
1338 0, /* segment descriptor priority level */
1339 0, /* segment descriptor present */
1341 0, /* default 32 vs 16 bit size */
1342 0 /* limit granularity (byte/page units)*/ },
1343 /* Null Descriptor - overwritten by call gate */
1344 { 0x0, /* segment base address */
1345 0x0, /* length - all address space */
1346 0, /* segment type */
1347 0, /* segment descriptor priority level */
1348 0, /* segment descriptor present */
1350 0, /* default 32 vs 16 bit size */
1351 0 /* limit granularity (byte/page units)*/ },
1352 /* Code Descriptor for user */
1353 { 0x0, /* segment base address */
1354 0xfffff, /* length - all address space */
1355 SDT_MEMERA
, /* segment type */
1356 SEL_UPL
, /* segment descriptor priority level */
1357 1, /* segment descriptor present */
1359 1, /* default 32 vs 16 bit size */
1360 1 /* limit granularity (byte/page units)*/ },
1361 /* Null Descriptor - overwritten by call gate */
1362 { 0x0, /* segment base address */
1363 0x0, /* length - all address space */
1364 0, /* segment type */
1365 0, /* segment descriptor priority level */
1366 0, /* segment descriptor present */
1368 0, /* default 32 vs 16 bit size */
1369 0 /* limit granularity (byte/page units)*/ },
1370 /* Data Descriptor for user */
1371 { 0x0, /* segment base address */
1372 0xfffff, /* length - all address space */
1373 SDT_MEMRWA
, /* segment type */
1374 SEL_UPL
, /* segment descriptor priority level */
1375 1, /* segment descriptor present */
1377 1, /* default 32 vs 16 bit size */
1378 1 /* limit granularity (byte/page units)*/ },
1382 setidt(int idx
, inthand_t
*func
, int typ
, int dpl
, int selec
)
1384 struct gate_descriptor
*ip
;
1387 ip
->gd_looffset
= (int)func
;
1388 ip
->gd_selector
= selec
;
1394 ip
->gd_hioffset
= ((int)func
)>>16 ;
1397 #define IDTVEC(name) __CONCAT(X,name)
1400 IDTVEC(div
), IDTVEC(dbg
), IDTVEC(nmi
), IDTVEC(bpt
), IDTVEC(ofl
),
1401 IDTVEC(bnd
), IDTVEC(ill
), IDTVEC(dna
), IDTVEC(fpusegm
),
1402 IDTVEC(tss
), IDTVEC(missing
), IDTVEC(stk
), IDTVEC(prot
),
1403 IDTVEC(page
), IDTVEC(mchk
), IDTVEC(fpu
), IDTVEC(align
),
1404 IDTVEC(xmm
), IDTVEC(syscall
),
1407 IDTVEC(int0x80_syscall
);
1409 #ifdef DEBUG_INTERRUPTS
1410 extern inthand_t
*Xrsvdary
[256];
1414 sdtossd(struct segment_descriptor
*sd
, struct soft_segment_descriptor
*ssd
)
1416 ssd
->ssd_base
= (sd
->sd_hibase
<< 24) | sd
->sd_lobase
;
1417 ssd
->ssd_limit
= (sd
->sd_hilimit
<< 16) | sd
->sd_lolimit
;
1418 ssd
->ssd_type
= sd
->sd_type
;
1419 ssd
->ssd_dpl
= sd
->sd_dpl
;
1420 ssd
->ssd_p
= sd
->sd_p
;
1421 ssd
->ssd_def32
= sd
->sd_def32
;
1422 ssd
->ssd_gran
= sd
->sd_gran
;
1426 * Populate the (physmap) array with base/bound pairs describing the
1427 * available physical memory in the system, then test this memory and
1428 * build the phys_avail array describing the actually-available memory.
1430 * If we cannot accurately determine the physical memory map, then use
1431 * the value from the 0xE801 call, and, failing that, the RTC.
1433 * Total memory size may be set by the kernel environment variable
1434 * hw.physmem or the compile-time define MAXMEM.
1437 getmemsize(int first
)
1439 int i
, physmap_idx
, pa_indx
, da_indx
;
1441 u_int basemem
, extmem
;
1442 struct vm86frame vmf
;
1443 struct vm86context vmc
;
1445 vm_offset_t physmap
[PHYSMAP_ENTRIES
*2];
1453 quad_t dcons_addr
, dcons_size
;
1455 bzero(&vmf
, sizeof(struct vm86frame
));
1456 bzero(physmap
, sizeof(physmap
));
1460 * Some newer BIOSes have a broken INT 12H implementation which causes
1461 * an immediate kernel panic. In this case, we need to scan the SMAP
1462 * with INT 15:E820 first, then determine the base memory size.
1465 TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12
);
1466 if (hasbrokenint12
) {
1471 * Perform "base memory" related probes & setup. If we get a crazy
1472 * value give the bios some scribble space just in case.
1474 vm86_intcall(0x12, &vmf
);
1475 basemem
= vmf
.vmf_ax
;
1476 if (basemem
> 640) {
1477 kprintf("Preposterous BIOS basemem of %uK, "
1478 "truncating to < 640K\n", basemem
);
1483 * XXX if biosbasemem is now < 640, there is a `hole'
1484 * between the end of base memory and the start of
1485 * ISA memory. The hole may be empty or it may
1486 * contain BIOS code or data. Map it read/write so
1487 * that the BIOS can write to it. (Memory from 0 to
1488 * the physical end of the kernel is mapped read-only
1489 * to begin with and then parts of it are remapped.
1490 * The parts that aren't remapped form holes that
1491 * remain read-only and are unused by the kernel.
1492 * The base memory area is below the physical end of
1493 * the kernel and right now forms a read-only hole.
1494 * The part of it from PAGE_SIZE to
1495 * (trunc_page(biosbasemem * 1024) - 1) will be
1496 * remapped and used by the kernel later.)
1498 * This code is similar to the code used in
1499 * pmap_mapdev, but since no memory needs to be
1500 * allocated we simply change the mapping.
1502 for (pa
= trunc_page(basemem
* 1024);
1503 pa
< ISA_HOLE_START
; pa
+= PAGE_SIZE
) {
1504 pte
= vtopte(pa
+ KERNBASE
);
1505 *pte
= pa
| PG_RW
| PG_V
;
1509 * if basemem != 640, map pages r/w into vm86 page table so
1510 * that the bios can scribble on it.
1513 for (i
= basemem
/ 4; i
< 160; i
++)
1514 pte
[i
] = (i
<< PAGE_SHIFT
) | PG_V
| PG_RW
| PG_U
;
1518 * map page 1 R/W into the kernel page table so we can use it
1519 * as a buffer. The kernel will unmap this page later.
1521 pte
= vtopte(KERNBASE
+ (1 << PAGE_SHIFT
));
1522 *pte
= (1 << PAGE_SHIFT
) | PG_RW
| PG_V
;
1525 * get memory map with INT 15:E820
1527 #define SMAPSIZ sizeof(*smap)
1528 #define SMAP_SIG 0x534D4150 /* 'SMAP' */
1531 smap
= (void *)vm86_addpage(&vmc
, 1, KERNBASE
+ (1 << PAGE_SHIFT
));
1532 vm86_getptr(&vmc
, (vm_offset_t
)smap
, &vmf
.vmf_es
, &vmf
.vmf_di
);
1537 vmf
.vmf_eax
= 0xE820;
1538 vmf
.vmf_edx
= SMAP_SIG
;
1539 vmf
.vmf_ecx
= SMAPSIZ
;
1540 i
= vm86_datacall(0x15, &vmf
, &vmc
);
1541 if (i
|| vmf
.vmf_eax
!= SMAP_SIG
)
1543 if (boothowto
& RB_VERBOSE
)
1544 kprintf("SMAP type=%02x base=%08x %08x len=%08x %08x\n",
1546 *(u_int32_t
*)((char *)&smap
->base
+ 4),
1547 (u_int32_t
)smap
->base
,
1548 *(u_int32_t
*)((char *)&smap
->length
+ 4),
1549 (u_int32_t
)smap
->length
);
1551 if (smap
->type
!= 0x01)
1554 if (smap
->length
== 0)
1557 Realmem
+= smap
->length
;
1559 if (smap
->base
>= 0xffffffffLLU
) {
1560 kprintf("%ju MB of memory above 4GB ignored\n",
1561 (uintmax_t)(smap
->length
/ 1024 / 1024));
1565 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1566 if (smap
->base
< physmap
[i
+ 1]) {
1567 if (boothowto
& RB_VERBOSE
) {
1568 kprintf("Overlapping or non-montonic "
1569 "memory region, ignoring "
1572 Realmem
-= smap
->length
;
1577 if (smap
->base
== physmap
[physmap_idx
+ 1]) {
1578 physmap
[physmap_idx
+ 1] += smap
->length
;
1583 if (physmap_idx
== PHYSMAP_ENTRIES
*2) {
1584 kprintf("Too many segments in the physical "
1585 "address map, giving up\n");
1588 physmap
[physmap_idx
] = smap
->base
;
1589 physmap
[physmap_idx
+ 1] = smap
->base
+ smap
->length
;
1591 ; /* fix GCC3.x warning */
1592 } while (vmf
.vmf_ebx
!= 0);
1595 * Perform "base memory" related probes & setup based on SMAP
1598 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1599 if (physmap
[i
] == 0x00000000) {
1600 basemem
= physmap
[i
+ 1] / 1024;
1609 if (basemem
> 640) {
1610 kprintf("Preposterous BIOS basemem of %uK, "
1611 "truncating to 640K\n", basemem
);
1615 for (pa
= trunc_page(basemem
* 1024);
1616 pa
< ISA_HOLE_START
; pa
+= PAGE_SIZE
) {
1617 pte
= vtopte(pa
+ KERNBASE
);
1618 *pte
= pa
| PG_RW
| PG_V
;
1622 for (i
= basemem
/ 4; i
< 160; i
++)
1623 pte
[i
] = (i
<< PAGE_SHIFT
) | PG_V
| PG_RW
| PG_U
;
1626 if (physmap
[1] != 0)
1630 * If we failed above, try memory map with INT 15:E801
1632 vmf
.vmf_ax
= 0xE801;
1633 if (vm86_intcall(0x15, &vmf
) == 0) {
1634 extmem
= vmf
.vmf_cx
+ vmf
.vmf_dx
* 64;
1638 vm86_intcall(0x15, &vmf
);
1639 extmem
= vmf
.vmf_ax
;
1642 * Prefer the RTC value for extended memory.
1644 extmem
= rtcin(RTC_EXTLO
) + (rtcin(RTC_EXTHI
) << 8);
1649 * Special hack for chipsets that still remap the 384k hole when
1650 * there's 16MB of memory - this really confuses people that
1651 * are trying to use bus mastering ISA controllers with the
1652 * "16MB limit"; they only have 16MB, but the remapping puts
1653 * them beyond the limit.
1655 * If extended memory is between 15-16MB (16-17MB phys address range),
1658 if ((extmem
> 15 * 1024) && (extmem
< 16 * 1024))
1662 physmap
[1] = basemem
* 1024;
1664 physmap
[physmap_idx
] = 0x100000;
1665 physmap
[physmap_idx
+ 1] = physmap
[physmap_idx
] + extmem
* 1024;
1669 * Now, physmap contains a map of physical memory.
1673 /* make hole for AP bootstrap code YYY */
1674 physmap
[1] = mp_bootaddress(physmap
[1]);
1676 /* Save EBDA address, if any */
1677 ebda_addr
= (u_long
)(*(u_short
*)(KERNBASE
+ 0x40e));
1682 * Maxmem isn't the "maximum memory", it's one larger than the
1683 * highest page of the physical address space. It should be
1684 * called something like "Maxphyspage". We may adjust this
1685 * based on ``hw.physmem'' and the results of the memory test.
1687 Maxmem
= atop(physmap
[physmap_idx
+ 1]);
1690 Maxmem
= MAXMEM
/ 4;
1693 if (kgetenv_quad("hw.physmem", &maxmem
))
1694 Maxmem
= atop(maxmem
);
1696 if (atop(physmap
[physmap_idx
+ 1]) != Maxmem
&&
1697 (boothowto
& RB_VERBOSE
))
1698 kprintf("Physical memory use set to %lluK\n", Maxmem
* 4);
1701 * If Maxmem has been increased beyond what the system has detected,
1702 * extend the last memory segment to the new limit.
1704 if (atop(physmap
[physmap_idx
+ 1]) < Maxmem
)
1705 physmap
[physmap_idx
+ 1] = ptoa(Maxmem
);
1707 /* call pmap initialization to make new kernel address space */
1708 pmap_bootstrap(first
, 0);
1711 * Size up each available chunk of physical memory.
1713 physmap
[0] = PAGE_SIZE
; /* mask off page 0 */
1716 phys_avail
[pa_indx
++] = physmap
[0];
1717 phys_avail
[pa_indx
] = physmap
[0];
1718 dump_avail
[da_indx
] = physmap
[0];
1723 * Get dcons buffer address
1725 if (kgetenv_quad("dcons.addr", &dcons_addr
) == 0 ||
1726 kgetenv_quad("dcons.size", &dcons_size
) == 0)
1730 * physmap is in bytes, so when converting to page boundaries,
1731 * round up the start address and round down the end address.
1733 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1737 if (physmap
[i
+ 1] < end
)
1738 end
= trunc_page(physmap
[i
+ 1]);
1739 for (pa
= round_page(physmap
[i
]); pa
< end
; pa
+= PAGE_SIZE
) {
1740 int tmp
, page_bad
, full
;
1744 int *ptr
= (int *)CADDR1
;
1749 * block out kernel memory as not available.
1751 if (pa
>= 0x100000 && pa
< first
)
1755 * block out dcons buffer
1758 && pa
>= trunc_page(dcons_addr
)
1759 && pa
< dcons_addr
+ dcons_size
)
1765 * map page into kernel: valid, read/write,non-cacheable
1767 *pte
= pa
| PG_V
| PG_RW
| PG_N
;
1772 * Test for alternating 1's and 0's
1774 *(volatile int *)ptr
= 0xaaaaaaaa;
1775 if (*(volatile int *)ptr
!= 0xaaaaaaaa) {
1779 * Test for alternating 0's and 1's
1781 *(volatile int *)ptr
= 0x55555555;
1782 if (*(volatile int *)ptr
!= 0x55555555) {
1788 *(volatile int *)ptr
= 0xffffffff;
1789 if (*(volatile int *)ptr
!= 0xffffffff) {
1795 *(volatile int *)ptr
= 0x0;
1796 if (*(volatile int *)ptr
!= 0x0) {
1800 * Restore original value.
1805 * Adjust array of valid/good pages.
1807 if (page_bad
== TRUE
) {
1811 * If this good page is a continuation of the
1812 * previous set of good pages, then just increase
1813 * the end pointer. Otherwise start a new chunk.
1814 * Note that the stored "end" points one page past the
1815 * last valid page, making the range >= start and < end.
1816 * If we're also doing a speculative memory
1817 * test and we at or past the end, bump up Maxmem
1818 * so that we keep going. The first bad page
1819 * will terminate the loop.
1821 if (phys_avail
[pa_indx
] == pa
) {
1822 phys_avail
[pa_indx
] += PAGE_SIZE
;
1825 if (pa_indx
>= PHYSMAP_ENTRIES
*2) {
1826 kprintf("Too many holes in the physical address space, giving up\n");
1831 phys_avail
[pa_indx
++] = pa
; /* start */
1832 phys_avail
[pa_indx
] = pa
+ PAGE_SIZE
; /* end */
1836 if (dump_avail
[da_indx
] == pa
) {
1837 dump_avail
[da_indx
] += PAGE_SIZE
;
1840 if (da_indx
>= PHYSMAP_ENTRIES
*2) {
1844 dump_avail
[da_indx
++] = pa
; /* start */
1845 dump_avail
[da_indx
] = pa
+ PAGE_SIZE
; /* end */
1858 * The last chunk must contain at least one page plus the message
1859 * buffer to avoid complicating other code (message buffer address
1860 * calculation, etc.).
1862 while (phys_avail
[pa_indx
- 1] + PAGE_SIZE
+
1863 round_page(MSGBUF_SIZE
) >= phys_avail
[pa_indx
]) {
1864 physmem
-= atop(phys_avail
[pa_indx
] - phys_avail
[pa_indx
- 1]);
1865 phys_avail
[pa_indx
--] = 0;
1866 phys_avail
[pa_indx
--] = 0;
1869 Maxmem
= atop(phys_avail
[pa_indx
]);
1871 /* Trim off space for the message buffer. */
1872 phys_avail
[pa_indx
] -= round_page(MSGBUF_SIZE
);
1874 avail_end
= phys_avail
[pa_indx
];
1879 int apic_io_enable
= 1; /* Enabled by default for kernels compiled w/APIC_IO */
1881 int apic_io_enable
= 0; /* Disabled by default for kernels compiled without */
1883 TUNABLE_INT("hw.apic_io_enable", &apic_io_enable
);
1884 extern struct machintr_abi MachIntrABI_APIC
;
1887 struct machintr_abi MachIntrABI
;
1898 * 7 Device Not Available (x87)
1900 * 9 Coprocessor Segment overrun (unsupported, reserved)
1902 * 11 Segment not present
1904 * 13 General Protection
1907 * 16 x87 FP Exception pending
1908 * 17 Alignment Check
1910 * 19 SIMD floating point
1912 * 32-255 INTn/external sources
1917 struct gate_descriptor
*gdp
;
1918 int gsel_tss
, metadata_missing
, off
, x
;
1919 struct mdglobaldata
*gd
;
1922 * Prevent lowering of the ipl if we call tsleep() early.
1924 gd
= &CPU_prvspace
[0].mdglobaldata
;
1925 bzero(gd
, sizeof(*gd
));
1927 gd
->mi
.gd_curthread
= &thread0
;
1928 thread0
.td_gd
= &gd
->mi
;
1930 atdevbase
= ISA_HOLE_START
+ KERNBASE
;
1932 metadata_missing
= 0;
1933 if (bootinfo
.bi_modulep
) {
1934 preload_metadata
= (caddr_t
)bootinfo
.bi_modulep
+ KERNBASE
;
1935 preload_bootstrap_relocate(KERNBASE
);
1937 metadata_missing
= 1;
1939 if (bootinfo
.bi_envp
)
1940 kern_envp
= (caddr_t
)bootinfo
.bi_envp
+ KERNBASE
;
1943 * Default MachIntrABI to ICU
1945 MachIntrABI
= MachIntrABI_ICU
;
1947 TUNABLE_INT_FETCH("hw.apic_io_enable", &apic_io_enable
);
1951 * start with one cpu. Note: with one cpu, ncpus2_shift, ncpus2_mask,
1952 * and ncpus_fit_mask remain 0.
1957 /* Init basic tunables, hz etc */
1961 * make gdt memory segments, the code segment goes up to end of the
1962 * page with etext in it, the data segment goes to the end of
1966 * XXX text protection is temporarily (?) disabled. The limit was
1967 * i386_btop(round_page(etext)) - 1.
1969 gdt_segs
[GCODE_SEL
].ssd_limit
= atop(0 - 1);
1970 gdt_segs
[GDATA_SEL
].ssd_limit
= atop(0 - 1);
1972 gdt_segs
[GPRIV_SEL
].ssd_limit
=
1973 atop(sizeof(struct privatespace
) - 1);
1974 gdt_segs
[GPRIV_SEL
].ssd_base
= (int) &CPU_prvspace
[0];
1975 gdt_segs
[GPROC0_SEL
].ssd_base
=
1976 (int) &CPU_prvspace
[0].mdglobaldata
.gd_common_tss
;
1978 gd
->mi
.gd_prvspace
= &CPU_prvspace
[0];
1981 * Note: on both UP and SMP curthread must be set non-NULL
1982 * early in the boot sequence because the system assumes
1983 * that 'curthread' is never NULL.
1986 for (x
= 0; x
< NGDT
; x
++) {
1988 /* avoid overwriting db entries with APM ones */
1989 if (x
>= GAPMCODE32_SEL
&& x
<= GAPMDATA_SEL
)
1992 ssdtosd(&gdt_segs
[x
], &gdt
[x
].sd
);
1995 r_gdt
.rd_limit
= NGDT
* sizeof(gdt
[0]) - 1;
1996 r_gdt
.rd_base
= (int) gdt
;
1999 mi_gdinit(&gd
->mi
, 0);
2001 mi_proc0init(&gd
->mi
, proc0paddr
);
2002 safepri
= TDPRI_MAX
;
2004 /* make ldt memory segments */
2006 * XXX - VM_MAX_USER_ADDRESS is an end address, not a max. And it
2007 * should be spelled ...MAX_USER...
2009 ldt_segs
[LUCODE_SEL
].ssd_limit
= atop(VM_MAX_USER_ADDRESS
- 1);
2010 ldt_segs
[LUDATA_SEL
].ssd_limit
= atop(VM_MAX_USER_ADDRESS
- 1);
2011 for (x
= 0; x
< sizeof ldt_segs
/ sizeof ldt_segs
[0]; x
++)
2012 ssdtosd(&ldt_segs
[x
], &ldt
[x
].sd
);
2014 _default_ldt
= GSEL(GLDT_SEL
, SEL_KPL
);
2016 gd
->gd_currentldt
= _default_ldt
;
2017 /* spinlocks and the BGL */
2021 * Setup the hardware exception table. Most exceptions use
2022 * SDT_SYS386TGT, known as a 'trap gate'. Trap gates leave
2023 * interrupts enabled. VM page faults use SDT_SYS386IGT, known as
2024 * an 'interrupt trap gate', which disables interrupts on entry,
2025 * in order to be able to poll the appropriate CRn register to
2026 * determine the fault address.
2028 for (x
= 0; x
< NIDT
; x
++) {
2029 #ifdef DEBUG_INTERRUPTS
2030 setidt(x
, Xrsvdary
[x
], SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2032 setidt(x
, &IDTVEC(rsvd0
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2035 setidt(0, &IDTVEC(div
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2036 setidt(1, &IDTVEC(dbg
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2037 setidt(2, &IDTVEC(nmi
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2038 setidt(3, &IDTVEC(bpt
), SDT_SYS386TGT
, SEL_UPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2039 setidt(4, &IDTVEC(ofl
), SDT_SYS386TGT
, SEL_UPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2040 setidt(5, &IDTVEC(bnd
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2041 setidt(6, &IDTVEC(ill
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2042 setidt(7, &IDTVEC(dna
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2043 setidt(8, 0, SDT_SYSTASKGT
, SEL_KPL
, GSEL(GPANIC_SEL
, SEL_KPL
));
2044 setidt(9, &IDTVEC(fpusegm
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2045 setidt(10, &IDTVEC(tss
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2046 setidt(11, &IDTVEC(missing
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2047 setidt(12, &IDTVEC(stk
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2048 setidt(13, &IDTVEC(prot
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2049 setidt(14, &IDTVEC(page
), SDT_SYS386IGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2050 setidt(15, &IDTVEC(rsvd0
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2051 setidt(16, &IDTVEC(fpu
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2052 setidt(17, &IDTVEC(align
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2053 setidt(18, &IDTVEC(mchk
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2054 setidt(19, &IDTVEC(xmm
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2055 setidt(0x80, &IDTVEC(int0x80_syscall
),
2056 SDT_SYS386TGT
, SEL_UPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2058 r_idt
.rd_limit
= sizeof(idt0
) - 1;
2059 r_idt
.rd_base
= (int) idt
;
2063 * Initialize the console before we print anything out.
2067 if (metadata_missing
)
2068 kprintf("WARNING: loader(8) metadata is missing!\n");
2078 if (boothowto
& RB_KDB
)
2079 Debugger("Boot flags requested debugger");
2082 finishidentcpu(); /* Final stage of CPU initialization */
2083 setidt(6, &IDTVEC(ill
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2084 setidt(13, &IDTVEC(prot
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2085 initializecpu(); /* Initialize CPU registers */
2088 * make an initial tss so cpu can get interrupt stack on syscall!
2089 * The 16 bytes is to save room for a VM86 context.
2091 gd
->gd_common_tss
.tss_esp0
= (int) thread0
.td_pcb
- 16;
2092 gd
->gd_common_tss
.tss_ss0
= GSEL(GDATA_SEL
, SEL_KPL
) ;
2093 gsel_tss
= GSEL(GPROC0_SEL
, SEL_KPL
);
2094 gd
->gd_tss_gdt
= &gdt
[GPROC0_SEL
].sd
;
2095 gd
->gd_common_tssd
= *gd
->gd_tss_gdt
;
2096 gd
->gd_common_tss
.tss_ioopt
= (sizeof gd
->gd_common_tss
) << 16;
2099 dblfault_tss
.tss_esp
= dblfault_tss
.tss_esp0
= dblfault_tss
.tss_esp1
=
2100 dblfault_tss
.tss_esp2
= (int) &dblfault_stack
[sizeof(dblfault_stack
)];
2101 dblfault_tss
.tss_ss
= dblfault_tss
.tss_ss0
= dblfault_tss
.tss_ss1
=
2102 dblfault_tss
.tss_ss2
= GSEL(GDATA_SEL
, SEL_KPL
);
2103 dblfault_tss
.tss_cr3
= (int)IdlePTD
;
2104 dblfault_tss
.tss_eip
= (int) dblfault_handler
;
2105 dblfault_tss
.tss_eflags
= PSL_KERNEL
;
2106 dblfault_tss
.tss_ds
= dblfault_tss
.tss_es
=
2107 dblfault_tss
.tss_gs
= GSEL(GDATA_SEL
, SEL_KPL
);
2108 dblfault_tss
.tss_fs
= GSEL(GPRIV_SEL
, SEL_KPL
);
2109 dblfault_tss
.tss_cs
= GSEL(GCODE_SEL
, SEL_KPL
);
2110 dblfault_tss
.tss_ldt
= GSEL(GLDT_SEL
, SEL_KPL
);
2114 init_param2(physmem
);
2116 /* now running on new page tables, configured, and u/iom is accessible */
2118 /* Map the message buffer. */
2119 for (off
= 0; off
< round_page(MSGBUF_SIZE
); off
+= PAGE_SIZE
)
2120 pmap_kenter((vm_offset_t
)msgbufp
+ off
, avail_end
+ off
);
2122 msgbufinit(msgbufp
, MSGBUF_SIZE
);
2124 /* make a call gate to reenter kernel with */
2125 gdp
= &ldt
[LSYS5CALLS_SEL
].gd
;
2127 x
= (int) &IDTVEC(syscall
);
2128 gdp
->gd_looffset
= x
++;
2129 gdp
->gd_selector
= GSEL(GCODE_SEL
,SEL_KPL
);
2131 gdp
->gd_type
= SDT_SYS386CGT
;
2132 gdp
->gd_dpl
= SEL_UPL
;
2134 gdp
->gd_hioffset
= ((int) &IDTVEC(syscall
)) >>16;
2136 /* XXX does this work? */
2137 ldt
[LBSDICALLS_SEL
] = ldt
[LSYS5CALLS_SEL
];
2138 ldt
[LSOL26CALLS_SEL
] = ldt
[LSYS5CALLS_SEL
];
2140 /* transfer to user mode */
2142 _ucodesel
= LSEL(LUCODE_SEL
, SEL_UPL
);
2143 _udatasel
= LSEL(LUDATA_SEL
, SEL_UPL
);
2145 /* setup proc 0's pcb */
2146 thread0
.td_pcb
->pcb_flags
= 0;
2147 thread0
.td_pcb
->pcb_cr3
= (int)IdlePTD
; /* should already be setup */
2148 thread0
.td_pcb
->pcb_ext
= 0;
2149 lwp0
.lwp_md
.md_regs
= &proc0_tf
;
2153 * Initialize machine-dependent portions of the global data structure.
2154 * Note that the global data area and cpu0's idlestack in the private
2155 * data space were allocated in locore.
2157 * Note: the idlethread's cpl is 0
2159 * WARNING! Called from early boot, 'mycpu' may not work yet.
2162 cpu_gdinit(struct mdglobaldata
*gd
, int cpu
)
2165 gd
->mi
.gd_curthread
= &gd
->mi
.gd_idlethread
;
2167 lwkt_init_thread(&gd
->mi
.gd_idlethread
,
2168 gd
->mi
.gd_prvspace
->idlestack
,
2169 sizeof(gd
->mi
.gd_prvspace
->idlestack
),
2171 lwkt_set_comm(&gd
->mi
.gd_idlethread
, "idle_%d", cpu
);
2172 gd
->mi
.gd_idlethread
.td_switch
= cpu_lwkt_switch
;
2173 gd
->mi
.gd_idlethread
.td_sp
-= sizeof(void *);
2174 *(void **)gd
->mi
.gd_idlethread
.td_sp
= cpu_idle_restore
;
2178 is_globaldata_space(vm_offset_t saddr
, vm_offset_t eaddr
)
2180 if (saddr
>= (vm_offset_t
)&CPU_prvspace
[0] &&
2181 eaddr
<= (vm_offset_t
)&CPU_prvspace
[MAXCPU
]) {
2188 globaldata_find(int cpu
)
2190 KKASSERT(cpu
>= 0 && cpu
< ncpus
);
2191 return(&CPU_prvspace
[cpu
].mdglobaldata
.mi
);
2194 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
2195 static void f00f_hack(void *unused
);
2196 SYSINIT(f00f_hack
, SI_BOOT2_BIOS
, SI_ORDER_ANY
, f00f_hack
, NULL
);
2199 f00f_hack(void *unused
)
2201 struct gate_descriptor
*new_idt
;
2207 kprintf("Intel Pentium detected, installing workaround for F00F bug\n");
2209 r_idt
.rd_limit
= sizeof(idt0
) - 1;
2211 tmp
= kmem_alloc(&kernel_map
, PAGE_SIZE
* 2);
2213 panic("kmem_alloc returned 0");
2214 if (((unsigned int)tmp
& (PAGE_SIZE
-1)) != 0)
2215 panic("kmem_alloc returned non-page-aligned memory");
2216 /* Put the first seven entries in the lower page */
2217 new_idt
= (struct gate_descriptor
*)(tmp
+ PAGE_SIZE
- (7*8));
2218 bcopy(idt
, new_idt
, sizeof(idt0
));
2219 r_idt
.rd_base
= (int)new_idt
;
2222 if (vm_map_protect(&kernel_map
, tmp
, tmp
+ PAGE_SIZE
,
2223 VM_PROT_READ
, FALSE
) != KERN_SUCCESS
)
2224 panic("vm_map_protect failed");
2227 #endif /* defined(I586_CPU) && !NO_F00F_HACK */
2230 ptrace_set_pc(struct lwp
*lp
, unsigned long addr
)
2232 lp
->lwp_md
.md_regs
->tf_eip
= addr
;
2237 ptrace_single_step(struct lwp
*lp
)
2239 lp
->lwp_md
.md_regs
->tf_eflags
|= PSL_T
;
2244 fill_regs(struct lwp
*lp
, struct reg
*regs
)
2246 struct trapframe
*tp
;
2248 tp
= lp
->lwp_md
.md_regs
;
2249 regs
->r_gs
= tp
->tf_gs
;
2250 regs
->r_fs
= tp
->tf_fs
;
2251 regs
->r_es
= tp
->tf_es
;
2252 regs
->r_ds
= tp
->tf_ds
;
2253 regs
->r_edi
= tp
->tf_edi
;
2254 regs
->r_esi
= tp
->tf_esi
;
2255 regs
->r_ebp
= tp
->tf_ebp
;
2256 regs
->r_ebx
= tp
->tf_ebx
;
2257 regs
->r_edx
= tp
->tf_edx
;
2258 regs
->r_ecx
= tp
->tf_ecx
;
2259 regs
->r_eax
= tp
->tf_eax
;
2260 regs
->r_eip
= tp
->tf_eip
;
2261 regs
->r_cs
= tp
->tf_cs
;
2262 regs
->r_eflags
= tp
->tf_eflags
;
2263 regs
->r_esp
= tp
->tf_esp
;
2264 regs
->r_ss
= tp
->tf_ss
;
2269 set_regs(struct lwp
*lp
, struct reg
*regs
)
2271 struct trapframe
*tp
;
2273 tp
= lp
->lwp_md
.md_regs
;
2274 if (!EFL_SECURE(regs
->r_eflags
, tp
->tf_eflags
) ||
2275 !CS_SECURE(regs
->r_cs
))
2277 tp
->tf_gs
= regs
->r_gs
;
2278 tp
->tf_fs
= regs
->r_fs
;
2279 tp
->tf_es
= regs
->r_es
;
2280 tp
->tf_ds
= regs
->r_ds
;
2281 tp
->tf_edi
= regs
->r_edi
;
2282 tp
->tf_esi
= regs
->r_esi
;
2283 tp
->tf_ebp
= regs
->r_ebp
;
2284 tp
->tf_ebx
= regs
->r_ebx
;
2285 tp
->tf_edx
= regs
->r_edx
;
2286 tp
->tf_ecx
= regs
->r_ecx
;
2287 tp
->tf_eax
= regs
->r_eax
;
2288 tp
->tf_eip
= regs
->r_eip
;
2289 tp
->tf_cs
= regs
->r_cs
;
2290 tp
->tf_eflags
= regs
->r_eflags
;
2291 tp
->tf_esp
= regs
->r_esp
;
2292 tp
->tf_ss
= regs
->r_ss
;
2296 #ifndef CPU_DISABLE_SSE
2298 fill_fpregs_xmm(struct savexmm
*sv_xmm
, struct save87
*sv_87
)
2300 struct env87
*penv_87
= &sv_87
->sv_env
;
2301 struct envxmm
*penv_xmm
= &sv_xmm
->sv_env
;
2304 /* FPU control/status */
2305 penv_87
->en_cw
= penv_xmm
->en_cw
;
2306 penv_87
->en_sw
= penv_xmm
->en_sw
;
2307 penv_87
->en_tw
= penv_xmm
->en_tw
;
2308 penv_87
->en_fip
= penv_xmm
->en_fip
;
2309 penv_87
->en_fcs
= penv_xmm
->en_fcs
;
2310 penv_87
->en_opcode
= penv_xmm
->en_opcode
;
2311 penv_87
->en_foo
= penv_xmm
->en_foo
;
2312 penv_87
->en_fos
= penv_xmm
->en_fos
;
2315 for (i
= 0; i
< 8; ++i
)
2316 sv_87
->sv_ac
[i
] = sv_xmm
->sv_fp
[i
].fp_acc
;
2318 sv_87
->sv_ex_sw
= sv_xmm
->sv_ex_sw
;
2322 set_fpregs_xmm(struct save87
*sv_87
, struct savexmm
*sv_xmm
)
2324 struct env87
*penv_87
= &sv_87
->sv_env
;
2325 struct envxmm
*penv_xmm
= &sv_xmm
->sv_env
;
2328 /* FPU control/status */
2329 penv_xmm
->en_cw
= penv_87
->en_cw
;
2330 penv_xmm
->en_sw
= penv_87
->en_sw
;
2331 penv_xmm
->en_tw
= penv_87
->en_tw
;
2332 penv_xmm
->en_fip
= penv_87
->en_fip
;
2333 penv_xmm
->en_fcs
= penv_87
->en_fcs
;
2334 penv_xmm
->en_opcode
= penv_87
->en_opcode
;
2335 penv_xmm
->en_foo
= penv_87
->en_foo
;
2336 penv_xmm
->en_fos
= penv_87
->en_fos
;
2339 for (i
= 0; i
< 8; ++i
)
2340 sv_xmm
->sv_fp
[i
].fp_acc
= sv_87
->sv_ac
[i
];
2342 sv_xmm
->sv_ex_sw
= sv_87
->sv_ex_sw
;
2344 #endif /* CPU_DISABLE_SSE */
2347 fill_fpregs(struct lwp
*lp
, struct fpreg
*fpregs
)
2349 #ifndef CPU_DISABLE_SSE
2351 fill_fpregs_xmm(&lp
->lwp_thread
->td_pcb
->pcb_save
.sv_xmm
,
2352 (struct save87
*)fpregs
);
2355 #endif /* CPU_DISABLE_SSE */
2356 bcopy(&lp
->lwp_thread
->td_pcb
->pcb_save
.sv_87
, fpregs
, sizeof *fpregs
);
2361 set_fpregs(struct lwp
*lp
, struct fpreg
*fpregs
)
2363 #ifndef CPU_DISABLE_SSE
2365 set_fpregs_xmm((struct save87
*)fpregs
,
2366 &lp
->lwp_thread
->td_pcb
->pcb_save
.sv_xmm
);
2369 #endif /* CPU_DISABLE_SSE */
2370 bcopy(fpregs
, &lp
->lwp_thread
->td_pcb
->pcb_save
.sv_87
, sizeof *fpregs
);
2375 fill_dbregs(struct lwp
*lp
, struct dbreg
*dbregs
)
2378 dbregs
->dr0
= rdr0();
2379 dbregs
->dr1
= rdr1();
2380 dbregs
->dr2
= rdr2();
2381 dbregs
->dr3
= rdr3();
2382 dbregs
->dr4
= rdr4();
2383 dbregs
->dr5
= rdr5();
2384 dbregs
->dr6
= rdr6();
2385 dbregs
->dr7
= rdr7();
2389 pcb
= lp
->lwp_thread
->td_pcb
;
2390 dbregs
->dr0
= pcb
->pcb_dr0
;
2391 dbregs
->dr1
= pcb
->pcb_dr1
;
2392 dbregs
->dr2
= pcb
->pcb_dr2
;
2393 dbregs
->dr3
= pcb
->pcb_dr3
;
2396 dbregs
->dr6
= pcb
->pcb_dr6
;
2397 dbregs
->dr7
= pcb
->pcb_dr7
;
2403 set_dbregs(struct lwp
*lp
, struct dbreg
*dbregs
)
2406 load_dr0(dbregs
->dr0
);
2407 load_dr1(dbregs
->dr1
);
2408 load_dr2(dbregs
->dr2
);
2409 load_dr3(dbregs
->dr3
);
2410 load_dr4(dbregs
->dr4
);
2411 load_dr5(dbregs
->dr5
);
2412 load_dr6(dbregs
->dr6
);
2413 load_dr7(dbregs
->dr7
);
2416 struct ucred
*ucred
;
2418 uint32_t mask1
, mask2
;
2421 * Don't let an illegal value for dr7 get set. Specifically,
2422 * check for undefined settings. Setting these bit patterns
2423 * result in undefined behaviour and can lead to an unexpected
2426 for (i
= 0, mask1
= 0x3<<16, mask2
= 0x2<<16; i
< 8;
2427 i
++, mask1
<<= 2, mask2
<<= 2)
2428 if ((dbregs
->dr7
& mask1
) == mask2
)
2431 pcb
= lp
->lwp_thread
->td_pcb
;
2432 ucred
= lp
->lwp_proc
->p_ucred
;
2435 * Don't let a process set a breakpoint that is not within the
2436 * process's address space. If a process could do this, it
2437 * could halt the system by setting a breakpoint in the kernel
2438 * (if ddb was enabled). Thus, we need to check to make sure
2439 * that no breakpoints are being enabled for addresses outside
2440 * process's address space, unless, perhaps, we were called by
2443 * XXX - what about when the watched area of the user's
2444 * address space is written into from within the kernel
2445 * ... wouldn't that still cause a breakpoint to be generated
2446 * from within kernel mode?
2449 if (priv_check_cred(ucred
, PRIV_ROOT
, 0) != 0) {
2450 if (dbregs
->dr7
& 0x3) {
2451 /* dr0 is enabled */
2452 if (dbregs
->dr0
>= VM_MAX_USER_ADDRESS
)
2456 if (dbregs
->dr7
& (0x3<<2)) {
2457 /* dr1 is enabled */
2458 if (dbregs
->dr1
>= VM_MAX_USER_ADDRESS
)
2462 if (dbregs
->dr7
& (0x3<<4)) {
2463 /* dr2 is enabled */
2464 if (dbregs
->dr2
>= VM_MAX_USER_ADDRESS
)
2468 if (dbregs
->dr7
& (0x3<<6)) {
2469 /* dr3 is enabled */
2470 if (dbregs
->dr3
>= VM_MAX_USER_ADDRESS
)
2475 pcb
->pcb_dr0
= dbregs
->dr0
;
2476 pcb
->pcb_dr1
= dbregs
->dr1
;
2477 pcb
->pcb_dr2
= dbregs
->dr2
;
2478 pcb
->pcb_dr3
= dbregs
->dr3
;
2479 pcb
->pcb_dr6
= dbregs
->dr6
;
2480 pcb
->pcb_dr7
= dbregs
->dr7
;
2482 pcb
->pcb_flags
|= PCB_DBREGS
;
2489 * Return > 0 if a hardware breakpoint has been hit, and the
2490 * breakpoint was in user space. Return 0, otherwise.
2493 user_dbreg_trap(void)
2495 u_int32_t dr7
, dr6
; /* debug registers dr6 and dr7 */
2496 u_int32_t bp
; /* breakpoint bits extracted from dr6 */
2497 int nbp
; /* number of breakpoints that triggered */
2498 caddr_t addr
[4]; /* breakpoint addresses */
2502 if ((dr7
& 0x000000ff) == 0) {
2504 * all GE and LE bits in the dr7 register are zero,
2505 * thus the trap couldn't have been caused by the
2506 * hardware debug registers
2513 bp
= dr6
& 0x0000000f;
2517 * None of the breakpoint bits are set meaning this
2518 * trap was not caused by any of the debug registers
2524 * at least one of the breakpoints were hit, check to see
2525 * which ones and if any of them are user space addresses
2529 addr
[nbp
++] = (caddr_t
)rdr0();
2532 addr
[nbp
++] = (caddr_t
)rdr1();
2535 addr
[nbp
++] = (caddr_t
)rdr2();
2538 addr
[nbp
++] = (caddr_t
)rdr3();
2541 for (i
=0; i
<nbp
; i
++) {
2543 (caddr_t
)VM_MAX_USER_ADDRESS
) {
2545 * addr[i] is in user space
2552 * None of the breakpoints are in user space.
/*
 * Stub Debugger() entry point: just log that it was called.
 *
 * NOTE(review): in the original tree this definition is wrapped in a
 * "#ifndef DDB" conditional (it is only compiled when the real DDB
 * debugger is absent); the conditional lines were lost in extraction
 * — confirm against the repository copy.
 */
void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}
2569 * Provide inb() and outb() as functions. They are normally only
2570 * available as macros calling inlined functions, thus cannot be
2571 * called inside DDB.
2573 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
2579 /* silence compiler warnings */
2581 void outb(u_int
, u_char
);
2588 * We use %%dx and not %1 here because i/o is done at %dx and not at
2589 * %edx, while gcc generates inferior code (movw instead of movl)
2590 * if we tell it to load (u_short) port.
2592 __asm
__volatile("inb %%dx,%0" : "=a" (data
) : "d" (port
));
2597 outb(u_int port
, u_char data
)
2601 * Use an unnecessary assignment to help gcc's register allocator.
2602 * This make a large difference for gcc-1.40 and a tiny difference
2603 * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for
2604 * best results. gcc-2.6.0 can't handle this.
2607 __asm
__volatile("outb %0,%%dx" : : "a" (al
), "d" (port
));
#include "opt_cpu.h"

/*
 * initialize all the SMP locks
 */

/* critical region when masking or unmasking interrupts */
struct spinlock_deprecated imen_spinlock;

/* critical region for old style disable_intr/enable_intr */
struct spinlock_deprecated mpintr_spinlock;

/* critical region around INTR() routines */
struct spinlock_deprecated intr_spinlock;

/* lock region used by kernel profiling */
struct spinlock_deprecated mcount_spinlock;

/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct spinlock_deprecated com_spinlock;

/* lock regions around the clock hardware */
struct spinlock_deprecated clock_spinlock;

/* lock around the MP rendezvous */
struct spinlock_deprecated smp_rv_spinlock;
2647 * Get the initial mplock with a count of 1 for the BSP.
2648 * This uses a LOGICAL cpu ID, ie BSP == 0.
2650 cpu_get_initial_mplock();
2653 spin_lock_init(&mcount_spinlock
);
2654 spin_lock_init(&intr_spinlock
);
2655 spin_lock_init(&mpintr_spinlock
);
2656 spin_lock_init(&imen_spinlock
);
2657 spin_lock_init(&smp_rv_spinlock
);
2658 spin_lock_init(&com_spinlock
);
2659 spin_lock_init(&clock_spinlock
);
2661 /* our token pool needs to work early */
2662 lwkt_token_pool_init();