2 * Copyright (c) 1992 Terrence R. Lambert.
3 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
6 * This code is derived from software contributed to Berkeley by
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the University of
20 * California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
38 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
39 * $DragonFly: src/sys/platform/pc32/i386/machdep.c,v 1.97 2006/09/13 18:45:12 swildner Exp $
43 #include "use_ether.h"
46 #include "opt_atalk.h"
47 #include "opt_compat.h"
50 #include "opt_directio.h"
53 #include "opt_maxmem.h"
54 #include "opt_msgbuf.h"
55 #include "opt_perfmon.h"
57 #include "opt_userconfig.h"
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/sysproto.h>
62 #include <sys/signalvar.h>
63 #include <sys/kernel.h>
64 #include <sys/linker.h>
65 #include <sys/malloc.h>
68 #include <sys/reboot.h>
70 #include <sys/msgbuf.h>
71 #include <sys/sysent.h>
72 #include <sys/sysctl.h>
73 #include <sys/vmmeter.h>
75 #include <sys/upcall.h>
76 #include <sys/usched.h>
79 #include <vm/vm_param.h>
81 #include <vm/vm_kern.h>
82 #include <vm/vm_object.h>
83 #include <vm/vm_page.h>
84 #include <vm/vm_map.h>
85 #include <vm/vm_pager.h>
86 #include <vm/vm_extern.h>
88 #include <sys/thread2.h>
96 #include <machine/cpu.h>
97 #include <machine/reg.h>
98 #include <machine/clock.h>
99 #include <machine/specialreg.h>
100 #include <machine/bootinfo.h>
101 #include <machine/ipl.h>
102 #include <machine/md_var.h>
103 #include <machine/pcb_ext.h> /* pcb.h included via sys/user.h */
104 #include <machine/globaldata.h> /* CPU_prvspace */
105 #include <machine/smp.h>
107 #include <machine/perfmon.h>
109 #include <machine/cputypes.h>
112 #include <bus/isa/i386/isa_device.h>
114 #include <i386/isa/intr_machdep.h>
115 #include <bus/isa/rtc.h>
116 #include <machine/vm86.h>
117 #include <sys/random.h>
118 #include <sys/ptrace.h>
119 #include <machine/sigframe.h>
121 #define PHYSMAP_ENTRIES 10
123 extern void init386 (int first
);
124 extern void dblfault_handler (void);
126 extern void printcpuinfo(void); /* XXX header file */
127 extern void finishidentcpu(void);
128 extern void panicifcpuunsupported(void);
129 extern void initializecpu(void);
131 static void cpu_startup (void *);
132 #ifndef CPU_DISABLE_SSE
133 static void set_fpregs_xmm (struct save87
*, struct savexmm
*);
134 static void fill_fpregs_xmm (struct savexmm
*, struct save87
*);
135 #endif /* CPU_DISABLE_SSE */
137 extern void ffs_rawread_setup(void);
138 #endif /* DIRECTIO */
139 static void init_locks(void);
141 SYSINIT(cpu
, SI_SUB_CPU
, SI_ORDER_FIRST
, cpu_startup
, NULL
)
143 int _udatasel
, _ucodesel
;
146 int64_t tsc_offsets
[MAXCPU
];
148 int64_t tsc_offsets
[1];
151 #if defined(SWTCH_OPTIM_STATS)
152 extern int swtch_optim_stats
;
153 SYSCTL_INT(_debug
, OID_AUTO
, swtch_optim_stats
,
154 CTLFLAG_RD
, &swtch_optim_stats
, 0, "");
155 SYSCTL_INT(_debug
, OID_AUTO
, tlb_flush_count
,
156 CTLFLAG_RD
, &tlb_flush_count
, 0, "");
163 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS
)
165 int error
= sysctl_handle_int(oidp
, 0, ctob(physmem
), req
);
169 SYSCTL_PROC(_hw
, HW_PHYSMEM
, physmem
, CTLTYPE_INT
|CTLFLAG_RD
,
170 0, 0, sysctl_hw_physmem
, "IU", "");
173 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS
)
175 int error
= sysctl_handle_int(oidp
, 0,
176 ctob(physmem
- vmstats
.v_wire_count
), req
);
180 SYSCTL_PROC(_hw
, HW_USERMEM
, usermem
, CTLTYPE_INT
|CTLFLAG_RD
,
181 0, 0, sysctl_hw_usermem
, "IU", "");
184 sysctl_hw_availpages(SYSCTL_HANDLER_ARGS
)
186 int error
= sysctl_handle_int(oidp
, 0,
187 i386_btop(avail_end
- avail_start
), req
);
191 SYSCTL_PROC(_hw
, OID_AUTO
, availpages
, CTLTYPE_INT
|CTLFLAG_RD
,
192 0, 0, sysctl_hw_availpages
, "I", "");
195 sysctl_machdep_msgbuf(SYSCTL_HANDLER_ARGS
)
199 /* Unwind the buffer, so that it's linear (possibly starting with
200 * some initial nulls).
202 error
=sysctl_handle_opaque(oidp
,msgbufp
->msg_ptr
+msgbufp
->msg_bufr
,
203 msgbufp
->msg_size
-msgbufp
->msg_bufr
,req
);
204 if(error
) return(error
);
205 if(msgbufp
->msg_bufr
>0) {
206 error
=sysctl_handle_opaque(oidp
,msgbufp
->msg_ptr
,
207 msgbufp
->msg_bufr
,req
);
212 SYSCTL_PROC(_machdep
, OID_AUTO
, msgbuf
, CTLTYPE_STRING
|CTLFLAG_RD
,
213 0, 0, sysctl_machdep_msgbuf
, "A","Contents of kernel message buffer");
215 static int msgbuf_clear
;
218 sysctl_machdep_msgbuf_clear(SYSCTL_HANDLER_ARGS
)
221 error
= sysctl_handle_int(oidp
, oidp
->oid_arg1
, oidp
->oid_arg2
,
223 if (!error
&& req
->newptr
) {
224 /* Clear the buffer and reset write pointer */
225 bzero(msgbufp
->msg_ptr
,msgbufp
->msg_size
);
226 msgbufp
->msg_bufr
=msgbufp
->msg_bufx
=0;
232 SYSCTL_PROC(_machdep
, OID_AUTO
, msgbuf_clear
, CTLTYPE_INT
|CTLFLAG_RW
,
233 &msgbuf_clear
, 0, sysctl_machdep_msgbuf_clear
, "I",
234 "Clear kernel message buffer");
237 vm_paddr_t Maxmem
= 0;
240 vm_paddr_t phys_avail
[PHYSMAP_ENTRIES
*2+2];
242 static vm_offset_t buffer_sva
, buffer_eva
;
243 vm_offset_t clean_sva
, clean_eva
;
244 static vm_offset_t pager_sva
, pager_eva
;
245 static struct trapframe proc0_tf
;
248 cpu_startup(void *dummy
)
256 if (boothowto
& RB_VERBOSE
)
260 * Good {morning,afternoon,evening,night}.
262 printf("%s", version
);
265 panicifcpuunsupported();
269 printf("real memory = %llu (%lluK bytes)\n", ptoa(Maxmem
), ptoa(Maxmem
) / 1024);
271 * Display any holes after the first chunk of extended memory.
276 printf("Physical memory chunk(s):\n");
277 for (indx
= 0; phys_avail
[indx
+ 1] != 0; indx
+= 2) {
278 vm_paddr_t size1
= phys_avail
[indx
+ 1] - phys_avail
[indx
];
280 printf("0x%08llx - 0x%08llx, %llu bytes (%llu pages)\n",
281 phys_avail
[indx
], phys_avail
[indx
+ 1] - 1, size1
,
287 * Allocate space for system data structures.
288 * The first available kernel virtual address is in "v".
289 * As pages of kernel virtual memory are allocated, "v" is incremented.
290 * As pages of memory are allocated and cleared,
291 * "firstaddr" is incremented.
292 * An index into the kernel page table corresponding to the
293 * virtual memory address maintained in "v" is kept in "mapaddr".
297 * Make two passes. The first pass calculates how much memory is
298 * needed and allocates it. The second pass assigns virtual
299 * addresses to the various data structures.
303 v
= (caddr_t
)firstaddr
;
305 #define valloc(name, type, num) \
306 (name) = (type *)v; v = (caddr_t)((name)+(num))
307 #define valloclim(name, type, num, lim) \
308 (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
311 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
312 * For the first 64MB of ram nominally allocate sufficient buffers to
313 * cover 1/4 of our ram. Beyond the first 64MB allocate additional
314 * buffers to cover 1/20 of our ram over 64MB. When auto-sizing
315 * the buffer cache we limit the eventual kva reservation to
318 * factor represents the 1/4 x ram conversion.
321 int factor
= 4 * BKVASIZE
/ 1024;
322 int kbytes
= physmem
* (PAGE_SIZE
/ 1024);
326 nbuf
+= min((kbytes
- 4096) / factor
, 65536 / factor
);
328 nbuf
+= (kbytes
- 65536) * 2 / (factor
* 5);
329 if (maxbcache
&& nbuf
> maxbcache
/ BKVASIZE
)
330 nbuf
= maxbcache
/ BKVASIZE
;
334 * Do not allow the buffer_map to be more than 1/2 the size of the
337 if (nbuf
> (kernel_map
->max_offset
- kernel_map
->min_offset
) /
339 nbuf
= (kernel_map
->max_offset
- kernel_map
->min_offset
) /
341 printf("Warning: nbufs capped at %d\n", nbuf
);
344 nswbuf
= max(min(nbuf
/4, 256), 16);
346 if (nswbuf
< NSWBUF_MIN
)
353 valloc(swbuf
, struct buf
, nswbuf
);
354 valloc(buf
, struct buf
, nbuf
);
357 * End of first pass, size has been calculated so allocate memory
359 if (firstaddr
== 0) {
360 size
= (vm_size_t
)(v
- firstaddr
);
361 firstaddr
= (int)kmem_alloc(kernel_map
, round_page(size
));
363 panic("startup: no room for tables");
368 * End of second pass, addresses have been assigned
370 if ((vm_size_t
)(v
- firstaddr
) != size
)
371 panic("startup: table size inconsistency");
373 clean_map
= kmem_suballoc(kernel_map
, &clean_sva
, &clean_eva
,
374 (nbuf
*BKVASIZE
) + (nswbuf
*MAXPHYS
) + pager_map_size
);
375 buffer_map
= kmem_suballoc(clean_map
, &buffer_sva
, &buffer_eva
,
377 buffer_map
->system_map
= 1;
378 pager_map
= kmem_suballoc(clean_map
, &pager_sva
, &pager_eva
,
379 (nswbuf
*MAXPHYS
) + pager_map_size
);
380 pager_map
->system_map
= 1;
381 exec_map
= kmem_suballoc(kernel_map
, &minaddr
, &maxaddr
,
382 (16*(ARG_MAX
+(PAGE_SIZE
*3))));
384 #if defined(USERCONFIG)
386 cninit(); /* the preferred console may have changed */
389 printf("avail memory = %u (%uK bytes)\n", ptoa(vmstats
.v_free_count
),
390 ptoa(vmstats
.v_free_count
) / 1024);
393 * Set up buffers, so they can be used to read disk labels.
396 vm_pager_bufferinit();
400 * OK, enough kmem_alloc/malloc state should be up, lets get on with it!
402 mp_start(); /* fire up the APs and APICs */
409 * Send an interrupt to process.
411 * Stack is set up to allow sigcode stored
412 * at top to call routine, followed by kcall
413 * to sigreturn routine below. After sigreturn
414 * resets the signal mask, the stack, and the
415 * frame pointer, it returns to the user
419 sendsig(sig_t catcher
, int sig
, sigset_t
*mask
, u_long code
)
421 struct lwp
*lp
= curthread
->td_lwp
;
422 struct proc
*p
= lp
->lwp_proc
;
423 struct trapframe
*regs
;
424 struct sigacts
*psp
= p
->p_sigacts
;
425 struct sigframe sf
, *sfp
;
428 regs
= lp
->lwp_md
.md_regs
;
429 oonstack
= (lp
->lwp_sigstk
.ss_flags
& SS_ONSTACK
) ? 1 : 0;
431 /* save user context */
432 bzero(&sf
, sizeof(struct sigframe
));
433 sf
.sf_uc
.uc_sigmask
= *mask
;
434 sf
.sf_uc
.uc_stack
= lp
->lwp_sigstk
;
435 sf
.sf_uc
.uc_mcontext
.mc_onstack
= oonstack
;
436 sf
.sf_uc
.uc_mcontext
.mc_gs
= rgs();
437 bcopy(regs
, &sf
.sf_uc
.uc_mcontext
.mc_fs
, sizeof(struct trapframe
));
439 /* Allocate and validate space for the signal handler context. */
441 if ((p
->p_flag
& P_ALTSTACK
) != 0 && !oonstack
&&
442 SIGISMEMBER(psp
->ps_sigonstack
, sig
)) {
443 sfp
= (struct sigframe
*)(lp
->lwp_sigstk
.ss_sp
+
444 lp
->lwp_sigstk
.ss_size
- sizeof(struct sigframe
));
445 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
448 sfp
= (struct sigframe
*)regs
->tf_esp
- 1;
450 /* Translate the signal if appropriate */
451 if (p
->p_sysent
->sv_sigtbl
) {
452 if (sig
<= p
->p_sysent
->sv_sigsize
)
453 sig
= p
->p_sysent
->sv_sigtbl
[_SIG_IDX(sig
)];
456 /* Build the argument list for the signal handler. */
458 sf
.sf_ucontext
= (register_t
)&sfp
->sf_uc
;
459 if (SIGISMEMBER(psp
->ps_siginfo
, sig
)) {
460 /* Signal handler installed with SA_SIGINFO. */
461 sf
.sf_siginfo
= (register_t
)&sfp
->sf_si
;
462 sf
.sf_ahu
.sf_action
= (__siginfohandler_t
*)catcher
;
464 /* fill siginfo structure */
465 sf
.sf_si
.si_signo
= sig
;
466 sf
.sf_si
.si_code
= code
;
467 sf
.sf_si
.si_addr
= (void*)regs
->tf_err
;
470 /* Old FreeBSD-style arguments. */
471 sf
.sf_siginfo
= code
;
472 sf
.sf_addr
= regs
->tf_err
;
473 sf
.sf_ahu
.sf_handler
= catcher
;
477 * If we're a vm86 process, we want to save the segment registers.
478 * We also change eflags to be our emulated eflags, not the actual
481 if (regs
->tf_eflags
& PSL_VM
) {
482 struct trapframe_vm86
*tf
= (struct trapframe_vm86
*)regs
;
483 struct vm86_kernel
*vm86
= &lp
->lwp_thread
->td_pcb
->pcb_ext
->ext_vm86
;
485 sf
.sf_uc
.uc_mcontext
.mc_gs
= tf
->tf_vm86_gs
;
486 sf
.sf_uc
.uc_mcontext
.mc_fs
= tf
->tf_vm86_fs
;
487 sf
.sf_uc
.uc_mcontext
.mc_es
= tf
->tf_vm86_es
;
488 sf
.sf_uc
.uc_mcontext
.mc_ds
= tf
->tf_vm86_ds
;
490 if (vm86
->vm86_has_vme
== 0)
491 sf
.sf_uc
.uc_mcontext
.mc_eflags
=
492 (tf
->tf_eflags
& ~(PSL_VIF
| PSL_VIP
)) |
493 (vm86
->vm86_eflags
& (PSL_VIF
| PSL_VIP
));
496 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
497 * syscalls made by the signal handler. This just avoids
498 * wasting time for our lazy fixup of such faults. PSL_NT
499 * does nothing in vm86 mode, but vm86 programs can set it
500 * almost legitimately in probes for old cpu types.
502 tf
->tf_eflags
&= ~(PSL_VM
| PSL_NT
| PSL_VIF
| PSL_VIP
);
506 * Copy the sigframe out to the user's stack.
508 if (copyout(&sf
, sfp
, sizeof(struct sigframe
)) != 0) {
510 * Something is wrong with the stack pointer.
511 * ...Kill the process.
516 regs
->tf_esp
= (int)sfp
;
517 regs
->tf_eip
= PS_STRINGS
- *(p
->p_sysent
->sv_szsigcode
);
518 regs
->tf_eflags
&= ~PSL_T
;
519 regs
->tf_cs
= _ucodesel
;
520 regs
->tf_ds
= _udatasel
;
521 regs
->tf_es
= _udatasel
;
522 regs
->tf_fs
= _udatasel
;
523 regs
->tf_ss
= _udatasel
;
527 * sigreturn(ucontext_t *sigcntxp)
529 * System call to cleanup state after a signal
530 * has been taken. Reset signal mask and
531 * stack state from context left by sendsig (above).
532 * Return to previous pc and psl as specified by
533 * context left by sendsig. Check carefully to
534 * make sure that the user has not modified the
535 * state to gain improper privileges.
537 #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
538 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
541 sys_sigreturn(struct sigreturn_args
*uap
)
543 struct lwp
*lp
= curthread
->td_lwp
;
544 struct trapframe
*regs
;
550 if (!useracc((caddr_t
)ucp
, sizeof(ucontext_t
), VM_PROT_READ
))
553 regs
= lp
->lwp_md
.md_regs
;
554 eflags
= ucp
->uc_mcontext
.mc_eflags
;
556 if (eflags
& PSL_VM
) {
557 struct trapframe_vm86
*tf
= (struct trapframe_vm86
*)regs
;
558 struct vm86_kernel
*vm86
;
561 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
562 * set up the vm86 area, and we can't enter vm86 mode.
564 if (lp
->lwp_thread
->td_pcb
->pcb_ext
== 0)
566 vm86
= &lp
->lwp_thread
->td_pcb
->pcb_ext
->ext_vm86
;
567 if (vm86
->vm86_inited
== 0)
570 /* go back to user mode if both flags are set */
571 if ((eflags
& PSL_VIP
) && (eflags
& PSL_VIF
))
572 trapsignal(lp
->lwp_proc
, SIGBUS
, 0);
574 if (vm86
->vm86_has_vme
) {
575 eflags
= (tf
->tf_eflags
& ~VME_USERCHANGE
) |
576 (eflags
& VME_USERCHANGE
) | PSL_VM
;
578 vm86
->vm86_eflags
= eflags
; /* save VIF, VIP */
579 eflags
= (tf
->tf_eflags
& ~VM_USERCHANGE
) | (eflags
& VM_USERCHANGE
) | PSL_VM
;
581 bcopy(&ucp
->uc_mcontext
.mc_fs
, tf
, sizeof(struct trapframe
));
582 tf
->tf_eflags
= eflags
;
583 tf
->tf_vm86_ds
= tf
->tf_ds
;
584 tf
->tf_vm86_es
= tf
->tf_es
;
585 tf
->tf_vm86_fs
= tf
->tf_fs
;
586 tf
->tf_vm86_gs
= ucp
->uc_mcontext
.mc_gs
;
587 tf
->tf_ds
= _udatasel
;
588 tf
->tf_es
= _udatasel
;
589 tf
->tf_fs
= _udatasel
;
592 * Don't allow users to change privileged or reserved flags.
595 * XXX do allow users to change the privileged flag PSL_RF.
596 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers
597 * should sometimes set it there too. tf_eflags is kept in
598 * the signal context during signal handling and there is no
599 * other place to remember it, so the PSL_RF bit may be
600 * corrupted by the signal handler without us knowing.
601 * Corruption of the PSL_RF bit at worst causes one more or
602 * one less debugger trap, so allowing it is fairly harmless.
604 if (!EFL_SECURE(eflags
& ~PSL_RF
, regs
->tf_eflags
& ~PSL_RF
)) {
605 printf("sigreturn: eflags = 0x%x\n", eflags
);
610 * Don't allow users to load a valid privileged %cs. Let the
611 * hardware check for invalid selectors, excess privilege in
612 * other selectors, invalid %eip's and invalid %esp's.
614 cs
= ucp
->uc_mcontext
.mc_cs
;
615 if (!CS_SECURE(cs
)) {
616 printf("sigreturn: cs = 0x%x\n", cs
);
617 trapsignal(lp
->lwp_proc
, SIGBUS
, T_PROTFLT
);
620 bcopy(&ucp
->uc_mcontext
.mc_fs
, regs
, sizeof(struct trapframe
));
623 if (ucp
->uc_mcontext
.mc_onstack
& 1)
624 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
626 lp
->lwp_sigstk
.ss_flags
&= ~SS_ONSTACK
;
628 lp
->lwp_sigmask
= ucp
->uc_sigmask
;
629 SIG_CANTMASK(lp
->lwp_sigmask
);
634 * Stack frame on entry to function. %eax will contain the function vector,
635 * %ecx will contain the function data. flags, ecx, and eax will have
636 * already been pushed on the stack.
647 sendupcall(struct vmupcall
*vu
, int morepending
)
649 struct lwp
*lp
= curthread
->td_lwp
;
650 struct trapframe
*regs
;
651 struct upcall upcall
;
652 struct upc_frame upc_frame
;
656 * Get the upcall data structure
658 if (copyin(lp
->lwp_upcall
, &upcall
, sizeof(upcall
)) ||
659 copyin((char *)upcall
.upc_uthread
+ upcall
.upc_critoff
, &crit_count
, sizeof(int))
662 printf("bad upcall address\n");
667 * If the data structure is already marked pending or has a critical
668 * section count, mark the data structure as pending and return
669 * without doing an upcall. vu_pending is left set.
671 if (upcall
.upc_pending
|| crit_count
>= vu
->vu_pending
) {
672 if (upcall
.upc_pending
< vu
->vu_pending
) {
673 upcall
.upc_pending
= vu
->vu_pending
;
674 copyout(&upcall
.upc_pending
, &lp
->lwp_upcall
->upc_pending
,
675 sizeof(upcall
.upc_pending
));
681 * We can run this upcall now, clear vu_pending.
683 * Bump our critical section count and set or clear the
684 * user pending flag depending on whether more upcalls are
685 * pending. The user will be responsible for calling
686 * upc_dispatch(-1) to process remaining upcalls.
689 upcall
.upc_pending
= morepending
;
690 crit_count
+= TDPRI_CRIT
;
691 copyout(&upcall
.upc_pending
, &lp
->lwp_upcall
->upc_pending
,
692 sizeof(upcall
.upc_pending
));
693 copyout(&crit_count
, (char *)upcall
.upc_uthread
+ upcall
.upc_critoff
,
697 * Construct a stack frame and issue the upcall
699 regs
= lp
->lwp_md
.md_regs
;
700 upc_frame
.eax
= regs
->tf_eax
;
701 upc_frame
.ecx
= regs
->tf_ecx
;
702 upc_frame
.edx
= regs
->tf_edx
;
703 upc_frame
.flags
= regs
->tf_eflags
;
704 upc_frame
.oldip
= regs
->tf_eip
;
705 if (copyout(&upc_frame
, (void *)(regs
->tf_esp
- sizeof(upc_frame
)),
706 sizeof(upc_frame
)) != 0) {
707 printf("bad stack on upcall\n");
709 regs
->tf_eax
= (register_t
)vu
->vu_func
;
710 regs
->tf_ecx
= (register_t
)vu
->vu_data
;
711 regs
->tf_edx
= (register_t
)lp
->lwp_upcall
;
712 regs
->tf_eip
= (register_t
)vu
->vu_ctx
;
713 regs
->tf_esp
-= sizeof(upc_frame
);
718 * fetchupcall occurs in the context of a system call, which means that
719 * we have to return EJUSTRETURN in order to prevent eax and edx from
720 * being overwritten by the syscall return value.
722 * if vu is not NULL we return the new context in %edx, the new data in %ecx,
723 * and the function pointer in %eax.
726 fetchupcall (struct vmupcall
*vu
, int morepending
, void *rsp
)
728 struct upc_frame upc_frame
;
729 struct lwp
*lp
= curthread
->td_lwp
;
730 struct trapframe
*regs
;
732 struct upcall upcall
;
735 regs
= lp
->lwp_md
.md_regs
;
737 error
= copyout(&morepending
, &lp
->lwp_upcall
->upc_pending
, sizeof(int));
741 * This jumps us to the next ready context.
744 error
= copyin(lp
->lwp_upcall
, &upcall
, sizeof(upcall
));
747 error
= copyin((char *)upcall
.upc_uthread
+ upcall
.upc_critoff
, &crit_count
, sizeof(int));
748 crit_count
+= TDPRI_CRIT
;
750 error
= copyout(&crit_count
, (char *)upcall
.upc_uthread
+ upcall
.upc_critoff
, sizeof(int));
751 regs
->tf_eax
= (register_t
)vu
->vu_func
;
752 regs
->tf_ecx
= (register_t
)vu
->vu_data
;
753 regs
->tf_edx
= (register_t
)lp
->lwp_upcall
;
754 regs
->tf_eip
= (register_t
)vu
->vu_ctx
;
755 regs
->tf_esp
= (register_t
)rsp
;
758 * This returns us to the originally interrupted code.
760 error
= copyin(rsp
, &upc_frame
, sizeof(upc_frame
));
761 regs
->tf_eax
= upc_frame
.eax
;
762 regs
->tf_ecx
= upc_frame
.ecx
;
763 regs
->tf_edx
= upc_frame
.edx
;
764 regs
->tf_eflags
= (regs
->tf_eflags
& ~PSL_USERCHANGE
) |
765 (upc_frame
.flags
& PSL_USERCHANGE
);
766 regs
->tf_eip
= upc_frame
.oldip
;
767 regs
->tf_esp
= (register_t
)((char *)rsp
+ sizeof(upc_frame
));
776 * Machine dependent boot() routine
778 * I haven't seen anything to put here yet
779 * Possibly some stuff might be grafted back here from boot()
787 * Shutdown the CPU as much as possible
797 * cpu_idle() represents the idle LWKT. You cannot return from this function
798 * (unless you want to blow things up!). Instead we look for runnable threads
799 * and loop or halt as appropriate. Giant is not held on entry to the thread.
801 * The main loop is entered with a critical section held, we must release
802 * the critical section before doing anything else. lwkt_switch() will
803 * check for pending interrupts due to entering and exiting its own
806 * Note on cpu_idle_hlt: On an SMP system we rely on a scheduler IPI
807 * to wake a HLTed cpu up. However, there are cases where the idlethread
808 * will be entered with the possibility that no IPI will occur and in such
809 * cases lwkt_switch() sets TDF_IDLE_NOHLT.
811 static int cpu_idle_hlt
= 1;
812 static int cpu_idle_hltcnt
;
813 static int cpu_idle_spincnt
;
814 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_hlt
, CTLFLAG_RW
,
815 &cpu_idle_hlt
, 0, "Idle loop HLT enable");
816 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_hltcnt
, CTLFLAG_RW
,
817 &cpu_idle_hltcnt
, 0, "Idle loop entry halts");
818 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_spincnt
, CTLFLAG_RW
,
819 &cpu_idle_spincnt
, 0, "Idle loop entry spins");
822 cpu_idle_default_hook(void)
825 * We must guarantee that hlt is exactly the instruction
828 __asm
__volatile("sti; hlt");
831 /* Other subsystems (e.g., ACPI) can hook this later. */
832 void (*cpu_idle_hook
)(void) = cpu_idle_default_hook
;
837 struct thread
*td
= curthread
;
840 KKASSERT(td
->td_pri
< TDPRI_CRIT
);
843 * See if there are any LWKTs ready to go.
848 * If we are going to halt call splz unconditionally after
849 * CLIing to catch any interrupt races. Note that we are
850 * at SPL0 and interrupts are enabled.
852 if (cpu_idle_hlt
&& !lwkt_runnable() &&
853 (td
->td_flags
& TDF_IDLE_NOHLT
) == 0) {
854 __asm
__volatile("cli");
856 if (!lwkt_runnable())
860 __asm
__volatile("pause");
864 td
->td_flags
&= ~TDF_IDLE_NOHLT
;
867 __asm
__volatile("sti; pause");
869 __asm
__volatile("sti");
877 * Clear registers on exec
880 setregs(struct proc
*p
, u_long entry
, u_long stack
, u_long ps_strings
)
882 struct trapframe
*regs
= p
->p_md
.md_regs
;
883 struct pcb
*pcb
= p
->p_thread
->td_pcb
;
885 /* Reset pc->pcb_gs and %gs before possibly invalidating it. */
886 pcb
->pcb_gs
= _udatasel
;
889 /* was i386_user_cleanup() in NetBSD */
892 bzero((char *)regs
, sizeof(struct trapframe
));
893 regs
->tf_eip
= entry
;
894 regs
->tf_esp
= stack
;
895 regs
->tf_eflags
= PSL_USER
| (regs
->tf_eflags
& PSL_T
);
896 regs
->tf_ss
= _udatasel
;
897 regs
->tf_ds
= _udatasel
;
898 regs
->tf_es
= _udatasel
;
899 regs
->tf_fs
= _udatasel
;
900 regs
->tf_cs
= _ucodesel
;
902 /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */
903 regs
->tf_ebx
= ps_strings
;
906 * Reset the hardware debug registers if they were in use.
907 * They won't have any meaning for the newly exec'd process.
909 if (pcb
->pcb_flags
& PCB_DBREGS
) {
916 if (pcb
== curthread
->td_pcb
) {
918 * Clear the debug registers on the running
919 * CPU, otherwise they will end up affecting
920 * the next process we switch to.
924 pcb
->pcb_flags
&= ~PCB_DBREGS
;
928 * Initialize the math emulator (if any) for the current process.
929 * Actually, just clear the bit that says that the emulator has
930 * been initialized. Initialization is delayed until the process
931 * traps to the emulator (if it is done at all) mainly because
932 * emulators don't provide an entry point for initialization.
934 p
->p_thread
->td_pcb
->pcb_flags
&= ~FP_SOFTFP
;
937 * note: do not set CR0_TS here. npxinit() must do it after clearing
938 * gd_npxthread. Otherwise a preemptive interrupt thread may panic
942 load_cr0(rcr0() | CR0_MP
);
945 /* Initialize the npx (if any) for the current process. */
946 npxinit(__INITIAL_NPXCW__
);
951 * note: linux emulator needs edx to be 0x0 on entry, which is
952 * handled in execve simply by setting the 64 bit syscall
963 cr0
|= CR0_NE
; /* Done by npxinit() */
964 cr0
|= CR0_MP
| CR0_TS
; /* Done at every execve() too. */
966 if (cpu_class
!= CPUCLASS_386
)
968 cr0
|= CR0_WP
| CR0_AM
;
974 sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS
)
977 error
= sysctl_handle_int(oidp
, oidp
->oid_arg1
, oidp
->oid_arg2
,
979 if (!error
&& req
->newptr
)
984 SYSCTL_PROC(_machdep
, CPU_ADJKERNTZ
, adjkerntz
, CTLTYPE_INT
|CTLFLAG_RW
,
985 &adjkerntz
, 0, sysctl_machdep_adjkerntz
, "I", "");
987 SYSCTL_INT(_machdep
, CPU_DISRTCSET
, disable_rtc_set
,
988 CTLFLAG_RW
, &disable_rtc_set
, 0, "");
990 SYSCTL_STRUCT(_machdep
, CPU_BOOTINFO
, bootinfo
,
991 CTLFLAG_RD
, &bootinfo
, bootinfo
, "");
993 SYSCTL_INT(_machdep
, CPU_WALLCLOCK
, wall_cmos_clock
,
994 CTLFLAG_RW
, &wall_cmos_clock
, 0, "");
996 extern u_long bootdev
; /* not a cdev_t - encoding is different */
997 SYSCTL_ULONG(_machdep
, OID_AUTO
, guessed_bootdev
,
998 CTLFLAG_RD
, &bootdev
, 0, "Boot device (not in cdev_t format)");
1001 * Initialize 386 and configure to run kernel
1005 * Initialize segments & interrupt table
1009 union descriptor gdt
[NGDT
* MAXCPU
]; /* global descriptor table */
1010 static struct gate_descriptor idt0
[NIDT
];
1011 struct gate_descriptor
*idt
= &idt0
[0]; /* interrupt descriptor table */
1012 union descriptor ldt
[NLDT
]; /* local descriptor table */
1014 /* table descriptors - used to load tables by cpu */
1015 struct region_descriptor r_gdt
, r_idt
;
1017 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
1018 extern int has_f00f_bug
;
1021 static struct i386tss dblfault_tss
;
1022 static char dblfault_stack
[PAGE_SIZE
];
1024 extern struct user
*proc0paddr
;
1027 /* software prototypes -- in more palatable form */
1028 struct soft_segment_descriptor gdt_segs
[] = {
1029 /* GNULL_SEL 0 Null Descriptor */
1030 { 0x0, /* segment base address */
1032 0, /* segment type */
1033 0, /* segment descriptor priority level */
1034 0, /* segment descriptor present */
1036 0, /* default 32 vs 16 bit size */
1037 0 /* limit granularity (byte/page units)*/ },
1038 /* GCODE_SEL 1 Code Descriptor for kernel */
1039 { 0x0, /* segment base address */
1040 0xfffff, /* length - all address space */
1041 SDT_MEMERA
, /* segment type */
1042 0, /* segment descriptor priority level */
1043 1, /* segment descriptor present */
1045 1, /* default 32 vs 16 bit size */
1046 1 /* limit granularity (byte/page units)*/ },
1047 /* GDATA_SEL 2 Data Descriptor for kernel */
1048 { 0x0, /* segment base address */
1049 0xfffff, /* length - all address space */
1050 SDT_MEMRWA
, /* segment type */
1051 0, /* segment descriptor priority level */
1052 1, /* segment descriptor present */
1054 1, /* default 32 vs 16 bit size */
1055 1 /* limit granularity (byte/page units)*/ },
1056 /* GPRIV_SEL 3 SMP Per-Processor Private Data Descriptor */
1057 { 0x0, /* segment base address */
1058 0xfffff, /* length - all address space */
1059 SDT_MEMRWA
, /* segment type */
1060 0, /* segment descriptor priority level */
1061 1, /* segment descriptor present */
1063 1, /* default 32 vs 16 bit size */
1064 1 /* limit granularity (byte/page units)*/ },
1065 /* GPROC0_SEL 4 Proc 0 Tss Descriptor */
1067 0x0, /* segment base address */
1068 sizeof(struct i386tss
)-1,/* length - all address space */
1069 SDT_SYS386TSS
, /* segment type */
1070 0, /* segment descriptor priority level */
1071 1, /* segment descriptor present */
1073 0, /* unused - default 32 vs 16 bit size */
1074 0 /* limit granularity (byte/page units)*/ },
1075 /* GLDT_SEL 5 LDT Descriptor */
1076 { (int) ldt
, /* segment base address */
1077 sizeof(ldt
)-1, /* length - all address space */
1078 SDT_SYSLDT
, /* segment type */
1079 SEL_UPL
, /* segment descriptor priority level */
1080 1, /* segment descriptor present */
1082 0, /* unused - default 32 vs 16 bit size */
1083 0 /* limit granularity (byte/page units)*/ },
1084 /* GUSERLDT_SEL 6 User LDT Descriptor per process */
1085 { (int) ldt
, /* segment base address */
1086 (512 * sizeof(union descriptor
)-1), /* length */
1087 SDT_SYSLDT
, /* segment type */
1088 0, /* segment descriptor priority level */
1089 1, /* segment descriptor present */
1091 0, /* unused - default 32 vs 16 bit size */
1092 0 /* limit granularity (byte/page units)*/ },
1093 /* GTGATE_SEL 7 Null Descriptor - Placeholder */
1094 { 0x0, /* segment base address */
1095 0x0, /* length - all address space */
1096 0, /* segment type */
1097 0, /* segment descriptor priority level */
1098 0, /* segment descriptor present */
1100 0, /* default 32 vs 16 bit size */
1101 0 /* limit granularity (byte/page units)*/ },
1102 /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
1103 { 0x400, /* segment base address */
1104 0xfffff, /* length */
1105 SDT_MEMRWA
, /* segment type */
1106 0, /* segment descriptor priority level */
1107 1, /* segment descriptor present */
1109 1, /* default 32 vs 16 bit size */
1110 1 /* limit granularity (byte/page units)*/ },
1111 /* GPANIC_SEL 9 Panic Tss Descriptor */
1112 { (int) &dblfault_tss
, /* segment base address */
1113 sizeof(struct i386tss
)-1,/* length - all address space */
1114 SDT_SYS386TSS
, /* segment type */
1115 0, /* segment descriptor priority level */
1116 1, /* segment descriptor present */
1118 0, /* unused - default 32 vs 16 bit size */
1119 0 /* limit granularity (byte/page units)*/ },
1120 /* GBIOSCODE32_SEL 10 BIOS 32-bit interface (32bit Code) */
1121 { 0, /* segment base address (overwritten) */
1122 0xfffff, /* length */
1123 SDT_MEMERA
, /* segment type */
1124 0, /* segment descriptor priority level */
1125 1, /* segment descriptor present */
1127 0, /* default 32 vs 16 bit size */
1128 1 /* limit granularity (byte/page units)*/ },
1129 /* GBIOSCODE16_SEL 11 BIOS 32-bit interface (16bit Code) */
1130 { 0, /* segment base address (overwritten) */
1131 0xfffff, /* length */
1132 SDT_MEMERA
, /* segment type */
1133 0, /* segment descriptor priority level */
1134 1, /* segment descriptor present */
1136 0, /* default 32 vs 16 bit size */
1137 1 /* limit granularity (byte/page units)*/ },
1138 /* GBIOSDATA_SEL 12 BIOS 32-bit interface (Data) */
1139 { 0, /* segment base address (overwritten) */
1140 0xfffff, /* length */
1141 SDT_MEMRWA
, /* segment type */
1142 0, /* segment descriptor priority level */
1143 1, /* segment descriptor present */
1145 1, /* default 32 vs 16 bit size */
1146 1 /* limit granularity (byte/page units)*/ },
1147 /* GBIOSUTIL_SEL 13 BIOS 16-bit interface (Utility) */
1148 { 0, /* segment base address (overwritten) */
1149 0xfffff, /* length */
1150 SDT_MEMRWA
, /* segment type */
1151 0, /* segment descriptor priority level */
1152 1, /* segment descriptor present */
1154 0, /* default 32 vs 16 bit size */
1155 1 /* limit granularity (byte/page units)*/ },
1156 /* GBIOSARGS_SEL 14 BIOS 16-bit interface (Arguments) */
1157 { 0, /* segment base address (overwritten) */
1158 0xfffff, /* length */
1159 SDT_MEMRWA
, /* segment type */
1160 0, /* segment descriptor priority level */
1161 1, /* segment descriptor present */
1163 0, /* default 32 vs 16 bit size */
1164 1 /* limit granularity (byte/page units)*/ },
1165 /* GTLS_START 15 TLS */
1166 { 0x0, /* segment base address */
1168 0, /* segment type */
1169 0, /* segment descriptor priority level */
1170 0, /* segment descriptor present */
1172 0, /* default 32 vs 16 bit size */
1173 0 /* limit granularity (byte/page units)*/ },
1174 /* GTLS_START+1 16 TLS */
1175 { 0x0, /* segment base address */
1177 0, /* segment type */
1178 0, /* segment descriptor priority level */
1179 0, /* segment descriptor present */
1181 0, /* default 32 vs 16 bit size */
1182 0 /* limit granularity (byte/page units)*/ },
1183 /* GTLS_END 17 TLS */
1184 { 0x0, /* segment base address */
1186 0, /* segment type */
1187 0, /* segment descriptor priority level */
1188 0, /* segment descriptor present */
1190 0, /* default 32 vs 16 bit size */
1191 0 /* limit granularity (byte/page units)*/ },
1194 static struct soft_segment_descriptor ldt_segs
[] = {
1195 /* Null Descriptor - overwritten by call gate */
1196 { 0x0, /* segment base address */
1197 0x0, /* length - all address space */
1198 0, /* segment type */
1199 0, /* segment descriptor priority level */
1200 0, /* segment descriptor present */
1202 0, /* default 32 vs 16 bit size */
1203 0 /* limit granularity (byte/page units)*/ },
1204 /* Null Descriptor - overwritten by call gate */
1205 { 0x0, /* segment base address */
1206 0x0, /* length - all address space */
1207 0, /* segment type */
1208 0, /* segment descriptor priority level */
1209 0, /* segment descriptor present */
1211 0, /* default 32 vs 16 bit size */
1212 0 /* limit granularity (byte/page units)*/ },
1213 /* Null Descriptor - overwritten by call gate */
1214 { 0x0, /* segment base address */
1215 0x0, /* length - all address space */
1216 0, /* segment type */
1217 0, /* segment descriptor priority level */
1218 0, /* segment descriptor present */
1220 0, /* default 32 vs 16 bit size */
1221 0 /* limit granularity (byte/page units)*/ },
1222 /* Code Descriptor for user */
1223 { 0x0, /* segment base address */
1224 0xfffff, /* length - all address space */
1225 SDT_MEMERA
, /* segment type */
1226 SEL_UPL
, /* segment descriptor priority level */
1227 1, /* segment descriptor present */
1229 1, /* default 32 vs 16 bit size */
1230 1 /* limit granularity (byte/page units)*/ },
1231 /* Null Descriptor - overwritten by call gate */
1232 { 0x0, /* segment base address */
1233 0x0, /* length - all address space */
1234 0, /* segment type */
1235 0, /* segment descriptor priority level */
1236 0, /* segment descriptor present */
1238 0, /* default 32 vs 16 bit size */
1239 0 /* limit granularity (byte/page units)*/ },
1240 /* Data Descriptor for user */
1241 { 0x0, /* segment base address */
1242 0xfffff, /* length - all address space */
1243 SDT_MEMRWA
, /* segment type */
1244 SEL_UPL
, /* segment descriptor priority level */
1245 1, /* segment descriptor present */
1247 1, /* default 32 vs 16 bit size */
1248 1 /* limit granularity (byte/page units)*/ },
1252 setidt(int idx
, inthand_t
*func
, int typ
, int dpl
, int selec
)
1254 struct gate_descriptor
*ip
;
1257 ip
->gd_looffset
= (int)func
;
1258 ip
->gd_selector
= selec
;
1264 ip
->gd_hioffset
= ((int)func
)>>16 ;
1267 #define IDTVEC(name) __CONCAT(X,name)
1270 IDTVEC(div
), IDTVEC(dbg
), IDTVEC(nmi
), IDTVEC(bpt
), IDTVEC(ofl
),
1271 IDTVEC(bnd
), IDTVEC(ill
), IDTVEC(dna
), IDTVEC(fpusegm
),
1272 IDTVEC(tss
), IDTVEC(missing
), IDTVEC(stk
), IDTVEC(prot
),
1273 IDTVEC(page
), IDTVEC(mchk
), IDTVEC(fpu
), IDTVEC(align
),
1274 IDTVEC(xmm
), IDTVEC(syscall
),
1277 IDTVEC(int0x80_syscall
);
1279 #ifdef DEBUG_INTERRUPTS
1280 extern inthand_t
*Xrsvdary
[256];
1284 sdtossd(struct segment_descriptor
*sd
, struct soft_segment_descriptor
*ssd
)
1286 ssd
->ssd_base
= (sd
->sd_hibase
<< 24) | sd
->sd_lobase
;
1287 ssd
->ssd_limit
= (sd
->sd_hilimit
<< 16) | sd
->sd_lolimit
;
1288 ssd
->ssd_type
= sd
->sd_type
;
1289 ssd
->ssd_dpl
= sd
->sd_dpl
;
1290 ssd
->ssd_p
= sd
->sd_p
;
1291 ssd
->ssd_def32
= sd
->sd_def32
;
1292 ssd
->ssd_gran
= sd
->sd_gran
;
1296 * Populate the (physmap) array with base/bound pairs describing the
1297 * available physical memory in the system, then test this memory and
1298 * build the phys_avail array describing the actually-available memory.
1300 * If we cannot accurately determine the physical memory map, then use
1301 * value from the 0xE801 call, and failing that, the RTC.
1303 * Total memory size may be set by the kernel environment variable
1304 * hw.physmem or the compile-time define MAXMEM.
1307 getmemsize(int first
)
1309 int i
, physmap_idx
, pa_indx
;
1311 u_int basemem
, extmem
;
1312 struct vm86frame vmf
;
1313 struct vm86context vmc
;
1315 vm_offset_t physmap
[PHYSMAP_ENTRIES
*2];
1323 quad_t dcons_addr
, dcons_size
;
1326 TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12
);
1327 bzero(&vmf
, sizeof(struct vm86frame
));
1328 bzero(physmap
, sizeof(physmap
));
1332 * Some newer BIOSes has broken INT 12H implementation which cause
1333 * kernel panic immediately. In this case, we need to scan SMAP
1334 * with INT 15:E820 first, then determine base memory size.
1336 if (hasbrokenint12
) {
1341 * Perform "base memory" related probes & setup. If we get a crazy
1342 * value give the bios some scribble space just in case.
1344 vm86_intcall(0x12, &vmf
);
1345 basemem
= vmf
.vmf_ax
;
1346 if (basemem
> 640) {
1347 printf("Preposterous BIOS basemem of %uK, "
1348 "truncating to < 640K\n", basemem
);
1353 * XXX if biosbasemem is now < 640, there is a `hole'
1354 * between the end of base memory and the start of
1355 * ISA memory. The hole may be empty or it may
1356 * contain BIOS code or data. Map it read/write so
1357 * that the BIOS can write to it. (Memory from 0 to
1358 * the physical end of the kernel is mapped read-only
1359 * to begin with and then parts of it are remapped.
1360 * The parts that aren't remapped form holes that
1361 * remain read-only and are unused by the kernel.
1362 * The base memory area is below the physical end of
1363 * the kernel and right now forms a read-only hole.
1364 * The part of it from PAGE_SIZE to
1365 * (trunc_page(biosbasemem * 1024) - 1) will be
1366 * remapped and used by the kernel later.)
1368 * This code is similar to the code used in
1369 * pmap_mapdev, but since no memory needs to be
1370 * allocated we simply change the mapping.
1372 for (pa
= trunc_page(basemem
* 1024);
1373 pa
< ISA_HOLE_START
; pa
+= PAGE_SIZE
) {
1374 pte
= vtopte(pa
+ KERNBASE
);
1375 *pte
= pa
| PG_RW
| PG_V
;
1379 * if basemem != 640, map pages r/w into vm86 page table so
1380 * that the bios can scribble on it.
1383 for (i
= basemem
/ 4; i
< 160; i
++)
1384 pte
[i
] = (i
<< PAGE_SHIFT
) | PG_V
| PG_RW
| PG_U
;
1388 * map page 1 R/W into the kernel page table so we can use it
1389 * as a buffer. The kernel will unmap this page later.
1391 pte
= vtopte(KERNBASE
+ (1 << PAGE_SHIFT
));
1392 *pte
= (1 << PAGE_SHIFT
) | PG_RW
| PG_V
;
1395 * get memory map with INT 15:E820
1397 #define SMAPSIZ sizeof(*smap)
1398 #define SMAP_SIG 0x534D4150 /* 'SMAP' */
1401 smap
= (void *)vm86_addpage(&vmc
, 1, KERNBASE
+ (1 << PAGE_SHIFT
));
1402 vm86_getptr(&vmc
, (vm_offset_t
)smap
, &vmf
.vmf_es
, &vmf
.vmf_di
);
1407 vmf
.vmf_eax
= 0xE820;
1408 vmf
.vmf_edx
= SMAP_SIG
;
1409 vmf
.vmf_ecx
= SMAPSIZ
;
1410 i
= vm86_datacall(0x15, &vmf
, &vmc
);
1411 if (i
|| vmf
.vmf_eax
!= SMAP_SIG
)
1413 if (boothowto
& RB_VERBOSE
)
1414 printf("SMAP type=%02x base=%08x %08x len=%08x %08x\n",
1416 *(u_int32_t
*)((char *)&smap
->base
+ 4),
1417 (u_int32_t
)smap
->base
,
1418 *(u_int32_t
*)((char *)&smap
->length
+ 4),
1419 (u_int32_t
)smap
->length
);
1421 if (smap
->type
!= 0x01)
1424 if (smap
->length
== 0)
1427 if (smap
->base
>= 0xffffffff) {
1428 printf("%uK of memory above 4GB ignored\n",
1429 (u_int
)(smap
->length
/ 1024));
1433 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1434 if (smap
->base
< physmap
[i
+ 1]) {
1435 if (boothowto
& RB_VERBOSE
)
1437 "Overlapping or non-montonic memory region, ignoring second region\n");
1442 if (smap
->base
== physmap
[physmap_idx
+ 1]) {
1443 physmap
[physmap_idx
+ 1] += smap
->length
;
1448 if (physmap_idx
== PHYSMAP_ENTRIES
*2) {
1450 "Too many segments in the physical address map, giving up\n");
1453 physmap
[physmap_idx
] = smap
->base
;
1454 physmap
[physmap_idx
+ 1] = smap
->base
+ smap
->length
;
1456 ; /* fix GCC3.x warning */
1457 } while (vmf
.vmf_ebx
!= 0);
1460 * Perform "base memory" related probes & setup based on SMAP
1463 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1464 if (physmap
[i
] == 0x00000000) {
1465 basemem
= physmap
[i
+ 1] / 1024;
1474 if (basemem
> 640) {
1475 printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
1480 for (pa
= trunc_page(basemem
* 1024);
1481 pa
< ISA_HOLE_START
; pa
+= PAGE_SIZE
) {
1482 pte
= vtopte(pa
+ KERNBASE
);
1483 *pte
= pa
| PG_RW
| PG_V
;
1487 for (i
= basemem
/ 4; i
< 160; i
++)
1488 pte
[i
] = (i
<< PAGE_SHIFT
) | PG_V
| PG_RW
| PG_U
;
1491 if (physmap
[1] != 0)
1495 * If we failed above, try memory map with INT 15:E801
1497 vmf
.vmf_ax
= 0xE801;
1498 if (vm86_intcall(0x15, &vmf
) == 0) {
1499 extmem
= vmf
.vmf_cx
+ vmf
.vmf_dx
* 64;
1503 vm86_intcall(0x15, &vmf
);
1504 extmem
= vmf
.vmf_ax
;
1507 * Prefer the RTC value for extended memory.
1509 extmem
= rtcin(RTC_EXTLO
) + (rtcin(RTC_EXTHI
) << 8);
1514 * Special hack for chipsets that still remap the 384k hole when
1515 * there's 16MB of memory - this really confuses people that
1516 * are trying to use bus mastering ISA controllers with the
1517 * "16MB limit"; they only have 16MB, but the remapping puts
1518 * them beyond the limit.
1520 * If extended memory is between 15-16MB (16-17MB phys address range),
1523 if ((extmem
> 15 * 1024) && (extmem
< 16 * 1024))
1527 physmap
[1] = basemem
* 1024;
1529 physmap
[physmap_idx
] = 0x100000;
1530 physmap
[physmap_idx
+ 1] = physmap
[physmap_idx
] + extmem
* 1024;
1534 * Now, physmap contains a map of physical memory.
1538 /* make hole for AP bootstrap code YYY */
1539 physmap
[1] = mp_bootaddress(physmap
[1] / 1024);
1541 /* look for the MP hardware - needed for apic addresses */
1546 * Maxmem isn't the "maximum memory", it's one larger than the
1547 * highest page of the physical address space. It should be
1548 * called something like "Maxphyspage". We may adjust this
1549 * based on ``hw.physmem'' and the results of the memory test.
1551 Maxmem
= atop(physmap
[physmap_idx
+ 1]);
1554 Maxmem
= MAXMEM
/ 4;
1558 * hw.physmem is a size in bytes; we also allow k, m, and g suffixes
1559 * for the appropriate modifiers. This overrides MAXMEM.
1561 if ((cp
= kgetenv("hw.physmem")) != NULL
) {
1562 u_int64_t AllowMem
, sanity
;
1565 sanity
= AllowMem
= strtouq(cp
, &ep
, 0);
1566 if ((ep
!= cp
) && (*ep
!= 0)) {
1579 AllowMem
= sanity
= 0;
1581 if (AllowMem
< sanity
)
1585 printf("Ignoring invalid memory size of '%s'\n", cp
);
1587 Maxmem
= atop(AllowMem
);
1590 if (atop(physmap
[physmap_idx
+ 1]) != Maxmem
&&
1591 (boothowto
& RB_VERBOSE
))
1592 printf("Physical memory use set to %lluK\n", Maxmem
* 4);
1595 * If Maxmem has been increased beyond what the system has detected,
1596 * extend the last memory segment to the new limit.
1598 if (atop(physmap
[physmap_idx
+ 1]) < Maxmem
)
1599 physmap
[physmap_idx
+ 1] = ptoa(Maxmem
);
1601 /* call pmap initialization to make new kernel address space */
1602 pmap_bootstrap(first
, 0);
1605 * Size up each available chunk of physical memory.
1607 physmap
[0] = PAGE_SIZE
; /* mask off page 0 */
1609 phys_avail
[pa_indx
++] = physmap
[0];
1610 phys_avail
[pa_indx
] = physmap
[0];
1614 * Get dcons buffer address
1616 if (kgetenv_quad("dcons.addr", &dcons_addr
) == 0 ||
1617 kgetenv_quad("dcons.size", &dcons_size
) == 0)
1621 * physmap is in bytes, so when converting to page boundaries,
1622 * round up the start address and round down the end address.
1624 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1628 if (physmap
[i
+ 1] < end
)
1629 end
= trunc_page(physmap
[i
+ 1]);
1630 for (pa
= round_page(physmap
[i
]); pa
< end
; pa
+= PAGE_SIZE
) {
1635 int *ptr
= (int *)CADDR1
;
1639 * block out kernel memory as not available.
1641 if (pa
>= 0x100000 && pa
< first
)
1645 * block out dcons buffer
1648 && pa
>= trunc_page(dcons_addr
)
1649 && pa
< dcons_addr
+ dcons_size
)
1655 * map page into kernel: valid, read/write,non-cacheable
1657 *pte
= pa
| PG_V
| PG_RW
| PG_N
;
1662 * Test for alternating 1's and 0's
1664 *(volatile int *)ptr
= 0xaaaaaaaa;
1665 if (*(volatile int *)ptr
!= 0xaaaaaaaa) {
1669 * Test for alternating 0's and 1's
1671 *(volatile int *)ptr
= 0x55555555;
1672 if (*(volatile int *)ptr
!= 0x55555555) {
1678 *(volatile int *)ptr
= 0xffffffff;
1679 if (*(volatile int *)ptr
!= 0xffffffff) {
1685 *(volatile int *)ptr
= 0x0;
1686 if (*(volatile int *)ptr
!= 0x0) {
1690 * Restore original value.
1695 * Adjust array of valid/good pages.
1697 if (page_bad
== TRUE
) {
1701 * If this good page is a continuation of the
1702 * previous set of good pages, then just increase
1703 * the end pointer. Otherwise start a new chunk.
1704 * Note that "end" points one higher than end,
1705 * making the range >= start and < end.
1706 * If we're also doing a speculative memory
1707 * test and we at or past the end, bump up Maxmem
1708 * so that we keep going. The first bad page
1709 * will terminate the loop.
1711 if (phys_avail
[pa_indx
] == pa
) {
1712 phys_avail
[pa_indx
] += PAGE_SIZE
;
1715 if (pa_indx
>= PHYSMAP_ENTRIES
*2) {
1716 printf("Too many holes in the physical address space, giving up\n");
1720 phys_avail
[pa_indx
++] = pa
; /* start */
1721 phys_avail
[pa_indx
] = pa
+ PAGE_SIZE
; /* end */
1731 * The last chunk must contain at least one page plus the message
1732 * buffer to avoid complicating other code (message buffer address
1733 * calculation, etc.).
1735 while (phys_avail
[pa_indx
- 1] + PAGE_SIZE
+
1736 round_page(MSGBUF_SIZE
) >= phys_avail
[pa_indx
]) {
1737 physmem
-= atop(phys_avail
[pa_indx
] - phys_avail
[pa_indx
- 1]);
1738 phys_avail
[pa_indx
--] = 0;
1739 phys_avail
[pa_indx
--] = 0;
1742 Maxmem
= atop(phys_avail
[pa_indx
]);
1744 /* Trim off space for the message buffer. */
1745 phys_avail
[pa_indx
] -= round_page(MSGBUF_SIZE
);
1747 avail_end
= phys_avail
[pa_indx
];
1759 * 7 Device Not Available (x87)
1761 * 9 Coprocessor Segment overrun (unsupported, reserved)
1763 * 11 Segment not present
1765 * 13 General Protection
1768 * 16 x87 FP Exception pending
1769 * 17 Alignment Check
1771 * 19 SIMD floating point
1773 * 32-255 INTn/external sources
1778 struct gate_descriptor
*gdp
;
1779 int gsel_tss
, metadata_missing
, off
, x
;
1780 struct mdglobaldata
*gd
;
1783 * Prevent lowering of the ipl if we call tsleep() early.
1785 gd
= &CPU_prvspace
[0].mdglobaldata
;
1786 bzero(gd
, sizeof(*gd
));
1788 gd
->mi
.gd_curthread
= &thread0
;
1790 atdevbase
= ISA_HOLE_START
+ KERNBASE
;
1792 metadata_missing
= 0;
1793 if (bootinfo
.bi_modulep
) {
1794 preload_metadata
= (caddr_t
)bootinfo
.bi_modulep
+ KERNBASE
;
1795 preload_bootstrap_relocate(KERNBASE
);
1797 metadata_missing
= 1;
1799 if (bootinfo
.bi_envp
)
1800 kern_envp
= (caddr_t
)bootinfo
.bi_envp
+ KERNBASE
;
1803 * start with one cpu. Note: ncpus2_shift and ncpus2_mask are left
1808 /* Init basic tunables, hz etc */
1812 * make gdt memory segments, the code segment goes up to end of the
1813 * page with etext in it, the data segment goes to the end of
1817 * XXX text protection is temporarily (?) disabled. The limit was
1818 * i386_btop(round_page(etext)) - 1.
1820 gdt_segs
[GCODE_SEL
].ssd_limit
= atop(0 - 1);
1821 gdt_segs
[GDATA_SEL
].ssd_limit
= atop(0 - 1);
1823 gdt_segs
[GPRIV_SEL
].ssd_limit
=
1824 atop(sizeof(struct privatespace
) - 1);
1825 gdt_segs
[GPRIV_SEL
].ssd_base
= (int) &CPU_prvspace
[0];
1826 gdt_segs
[GPROC0_SEL
].ssd_base
=
1827 (int) &CPU_prvspace
[0].mdglobaldata
.gd_common_tss
;
1829 gd
->mi
.gd_prvspace
= &CPU_prvspace
[0];
1832 * Note: on both UP and SMP curthread must be set non-NULL
1833 * early in the boot sequence because the system assumes
1834 * that 'curthread' is never NULL.
1837 for (x
= 0; x
< NGDT
; x
++) {
1839 /* avoid overwriting db entries with APM ones */
1840 if (x
>= GAPMCODE32_SEL
&& x
<= GAPMDATA_SEL
)
1843 ssdtosd(&gdt_segs
[x
], &gdt
[x
].sd
);
1846 r_gdt
.rd_limit
= NGDT
* sizeof(gdt
[0]) - 1;
1847 r_gdt
.rd_base
= (int) gdt
;
1850 mi_gdinit(&gd
->mi
, 0);
1852 lwkt_init_thread(&thread0
, proc0paddr
, LWKT_THREAD_STACK
, 0, &gd
->mi
);
1853 lwkt_set_comm(&thread0
, "thread0");
1854 proc0
.p_addr
= (void *)thread0
.td_kstack
;
1855 LIST_INIT(&proc0
.p_lwps
);
1856 LIST_INSERT_HEAD(&proc0
.p_lwps
, &proc0
.p_lwp
, lwp_list
);
1857 proc0
.p_lwp
.lwp_thread
= &thread0
;
1858 proc0
.p_lwp
.lwp_proc
= &proc0
;
1859 proc0
.p_usched
= usched_init();
1860 proc0
.p_lwp
.lwp_cpumask
= 0xFFFFFFFF;
1861 varsymset_init(&proc0
.p_varsymset
, NULL
);
1862 thread0
.td_flags
|= TDF_RUNNING
;
1863 thread0
.td_proc
= &proc0
;
1864 thread0
.td_lwp
= &proc0
.p_lwp
;
1865 thread0
.td_switch
= cpu_heavy_switch
; /* YYY eventually LWKT */
1866 safepri
= TDPRI_MAX
;
1868 /* make ldt memory segments */
1870 * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it
1871 * should be spelled ...MAX_USER...
1873 ldt_segs
[LUCODE_SEL
].ssd_limit
= atop(VM_MAXUSER_ADDRESS
- 1);
1874 ldt_segs
[LUDATA_SEL
].ssd_limit
= atop(VM_MAXUSER_ADDRESS
- 1);
1875 for (x
= 0; x
< sizeof ldt_segs
/ sizeof ldt_segs
[0]; x
++)
1876 ssdtosd(&ldt_segs
[x
], &ldt
[x
].sd
);
1878 _default_ldt
= GSEL(GLDT_SEL
, SEL_KPL
);
1880 gd
->gd_currentldt
= _default_ldt
;
1881 /* spinlocks and the BGL */
1885 * Setup the hardware exception table. Most exceptions use
1886 * SDT_SYS386TGT, known as a 'trap gate'. Trap gates leave
1887 * interrupts enabled. VM page faults use SDT_SYS386IGT, known as
1888 * an 'interrupt trap gate', which disables interrupts on entry,
1889 * in order to be able to poll the appropriate CRn register to
1890 * determine the fault address.
1892 for (x
= 0; x
< NIDT
; x
++) {
1893 #ifdef DEBUG_INTERRUPTS
1894 setidt(x
, Xrsvdary
[x
], SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1896 setidt(x
, &IDTVEC(rsvd0
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1899 setidt(0, &IDTVEC(div
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1900 setidt(1, &IDTVEC(dbg
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1901 setidt(2, &IDTVEC(nmi
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1902 setidt(3, &IDTVEC(bpt
), SDT_SYS386TGT
, SEL_UPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1903 setidt(4, &IDTVEC(ofl
), SDT_SYS386TGT
, SEL_UPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1904 setidt(5, &IDTVEC(bnd
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1905 setidt(6, &IDTVEC(ill
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1906 setidt(7, &IDTVEC(dna
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1907 setidt(8, 0, SDT_SYSTASKGT
, SEL_KPL
, GSEL(GPANIC_SEL
, SEL_KPL
));
1908 setidt(9, &IDTVEC(fpusegm
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1909 setidt(10, &IDTVEC(tss
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1910 setidt(11, &IDTVEC(missing
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1911 setidt(12, &IDTVEC(stk
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1912 setidt(13, &IDTVEC(prot
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1913 setidt(14, &IDTVEC(page
), SDT_SYS386IGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1914 setidt(15, &IDTVEC(rsvd0
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1915 setidt(16, &IDTVEC(fpu
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1916 setidt(17, &IDTVEC(align
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1917 setidt(18, &IDTVEC(mchk
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1918 setidt(19, &IDTVEC(xmm
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1919 setidt(0x80, &IDTVEC(int0x80_syscall
),
1920 SDT_SYS386TGT
, SEL_UPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1922 r_idt
.rd_limit
= sizeof(idt0
) - 1;
1923 r_idt
.rd_base
= (int) idt
;
1927 * Initialize the console before we print anything out.
1931 if (metadata_missing
)
1932 printf("WARNING: loader(8) metadata is missing!\n");
1941 if (boothowto
& RB_KDB
)
1942 Debugger("Boot flags requested debugger");
1945 finishidentcpu(); /* Final stage of CPU initialization */
1946 setidt(6, &IDTVEC(ill
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1947 setidt(13, &IDTVEC(prot
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1948 initializecpu(); /* Initialize CPU registers */
1951 * make an initial tss so cpu can get interrupt stack on syscall!
1952 * The 16 bytes is to save room for a VM86 context.
1954 gd
->gd_common_tss
.tss_esp0
= (int) thread0
.td_pcb
- 16;
1955 gd
->gd_common_tss
.tss_ss0
= GSEL(GDATA_SEL
, SEL_KPL
) ;
1956 gsel_tss
= GSEL(GPROC0_SEL
, SEL_KPL
);
1957 gd
->gd_tss_gdt
= &gdt
[GPROC0_SEL
].sd
;
1958 gd
->gd_common_tssd
= *gd
->gd_tss_gdt
;
1959 gd
->gd_common_tss
.tss_ioopt
= (sizeof gd
->gd_common_tss
) << 16;
1962 dblfault_tss
.tss_esp
= dblfault_tss
.tss_esp0
= dblfault_tss
.tss_esp1
=
1963 dblfault_tss
.tss_esp2
= (int) &dblfault_stack
[sizeof(dblfault_stack
)];
1964 dblfault_tss
.tss_ss
= dblfault_tss
.tss_ss0
= dblfault_tss
.tss_ss1
=
1965 dblfault_tss
.tss_ss2
= GSEL(GDATA_SEL
, SEL_KPL
);
1966 dblfault_tss
.tss_cr3
= (int)IdlePTD
;
1967 dblfault_tss
.tss_eip
= (int) dblfault_handler
;
1968 dblfault_tss
.tss_eflags
= PSL_KERNEL
;
1969 dblfault_tss
.tss_ds
= dblfault_tss
.tss_es
=
1970 dblfault_tss
.tss_gs
= GSEL(GDATA_SEL
, SEL_KPL
);
1971 dblfault_tss
.tss_fs
= GSEL(GPRIV_SEL
, SEL_KPL
);
1972 dblfault_tss
.tss_cs
= GSEL(GCODE_SEL
, SEL_KPL
);
1973 dblfault_tss
.tss_ldt
= GSEL(GLDT_SEL
, SEL_KPL
);
1977 init_param2(physmem
);
1979 /* now running on new page tables, configured,and u/iom is accessible */
1981 /* Map the message buffer. */
1982 for (off
= 0; off
< round_page(MSGBUF_SIZE
); off
+= PAGE_SIZE
)
1983 pmap_kenter((vm_offset_t
)msgbufp
+ off
, avail_end
+ off
);
1985 msgbufinit(msgbufp
, MSGBUF_SIZE
);
1987 /* make a call gate to reenter kernel with */
1988 gdp
= &ldt
[LSYS5CALLS_SEL
].gd
;
1990 x
= (int) &IDTVEC(syscall
);
1991 gdp
->gd_looffset
= x
++;
1992 gdp
->gd_selector
= GSEL(GCODE_SEL
,SEL_KPL
);
1994 gdp
->gd_type
= SDT_SYS386CGT
;
1995 gdp
->gd_dpl
= SEL_UPL
;
1997 gdp
->gd_hioffset
= ((int) &IDTVEC(syscall
)) >>16;
1999 /* XXX does this work? */
2000 ldt
[LBSDICALLS_SEL
] = ldt
[LSYS5CALLS_SEL
];
2001 ldt
[LSOL26CALLS_SEL
] = ldt
[LSYS5CALLS_SEL
];
2003 /* transfer to user mode */
2005 _ucodesel
= LSEL(LUCODE_SEL
, SEL_UPL
);
2006 _udatasel
= LSEL(LUDATA_SEL
, SEL_UPL
);
2008 /* setup proc 0's pcb */
2009 thread0
.td_pcb
->pcb_flags
= 0;
2010 thread0
.td_pcb
->pcb_cr3
= (int)IdlePTD
; /* should already be setup */
2011 thread0
.td_pcb
->pcb_ext
= 0;
2012 proc0
.p_lwp
.lwp_md
.md_regs
= &proc0_tf
;
2016 * Initialize machine-dependant portions of the global data structure.
2017 * Note that the global data area and cpu0's idlestack in the private
2018 * data space were allocated in locore.
2020 * Note: the idlethread's cpl is 0
2022 * WARNING! Called from early boot, 'mycpu' may not work yet.
2025 cpu_gdinit(struct mdglobaldata
*gd
, int cpu
)
2028 gd
->mi
.gd_curthread
= &gd
->mi
.gd_idlethread
;
2030 lwkt_init_thread(&gd
->mi
.gd_idlethread
,
2031 gd
->mi
.gd_prvspace
->idlestack
,
2032 sizeof(gd
->mi
.gd_prvspace
->idlestack
),
2033 TDF_MPSAFE
, &gd
->mi
);
2034 lwkt_set_comm(&gd
->mi
.gd_idlethread
, "idle_%d", cpu
);
2035 gd
->mi
.gd_idlethread
.td_switch
= cpu_lwkt_switch
;
2036 gd
->mi
.gd_idlethread
.td_sp
-= sizeof(void *);
2037 *(void **)gd
->mi
.gd_idlethread
.td_sp
= cpu_idle_restore
;
2041 is_globaldata_space(vm_offset_t saddr
, vm_offset_t eaddr
)
2043 if (saddr
>= (vm_offset_t
)&CPU_prvspace
[0] &&
2044 eaddr
<= (vm_offset_t
)&CPU_prvspace
[MAXCPU
]) {
2051 globaldata_find(int cpu
)
2053 KKASSERT(cpu
>= 0 && cpu
< ncpus
);
2054 return(&CPU_prvspace
[cpu
].mdglobaldata
.mi
);
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);

/*
 * Work around the Pentium "F00F" erratum: relocate the IDT so that its
 * first seven entries sit at the end of a read-only page, preventing a
 * user-mode locked instruction fetch from locking up the machine.
 *
 * NOTE(review): reconstructed from a line-mangled extraction; the
 * early-return guard, the lidt() reload, and the "idt = new_idt"
 * assignment were absent and restored from the canonical
 * implementation -- verify against the original source.
 */
static void
f00f_hack(void *unused)
{
	struct gate_descriptor *new_idt;
	vm_offset_t tmp;

	if (!has_f00f_bug)
		return;

	printf("Intel Pentium detected, installing workaround for F00F bug\n");

	r_idt.rd_limit = sizeof(idt0) - 1;

	tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2);
	if (tmp == 0)
		panic("kmem_alloc returned 0");
	if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0)
		panic("kmem_alloc returned non-page-aligned memory");
	/* Put the first seven entries in the lower page */
	new_idt = (struct gate_descriptor *)(tmp + PAGE_SIZE - (7*8));
	bcopy(idt, new_idt, sizeof(idt0));
	r_idt.rd_base = (int)new_idt;
	lidt(&r_idt);
	idt = new_idt;
	if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE,
			   VM_PROT_READ, FALSE) != KERN_SUCCESS)
		panic("vm_map_protect failed");
}
#endif /* defined(I586_CPU) && !NO_F00F_HACK */
2093 ptrace_set_pc(struct proc
*p
, unsigned long addr
)
2095 p
->p_md
.md_regs
->tf_eip
= addr
;
2100 ptrace_single_step(struct lwp
*lp
)
2102 lp
->lwp_md
.md_regs
->tf_eflags
|= PSL_T
;
2107 ptrace_read_u_check(struct proc
*p
, vm_offset_t addr
, size_t len
)
2111 if ((vm_offset_t
) (addr
+ len
) < addr
)
2113 if ((vm_offset_t
) (addr
+ len
) <= sizeof(struct user
))
2116 gap
= (char *) p
->p_md
.md_regs
- (char *) p
->p_addr
;
2118 if ((vm_offset_t
) addr
< gap
)
2120 if ((vm_offset_t
) (addr
+ len
) <=
2121 (vm_offset_t
) (gap
+ sizeof(struct trapframe
)))
2127 ptrace_write_u(struct proc
*p
, vm_offset_t off
, long data
)
2129 struct trapframe frame_copy
;
2131 struct trapframe
*tp
;
2134 * Privileged kernel state is scattered all over the user area.
2135 * Only allow write access to parts of regs and to fpregs.
2137 min
= (char *)p
->p_md
.md_regs
- (char *)p
->p_addr
;
2138 if (off
>= min
&& off
<= min
+ sizeof(struct trapframe
) - sizeof(int)) {
2139 tp
= p
->p_md
.md_regs
;
2141 *(int *)((char *)&frame_copy
+ (off
- min
)) = data
;
2142 if (!EFL_SECURE(frame_copy
.tf_eflags
, tp
->tf_eflags
) ||
2143 !CS_SECURE(frame_copy
.tf_cs
))
2145 *(int*)((char *)p
->p_addr
+ off
) = data
;
2150 * The PCB is at the end of the user area YYY
2152 min
= (char *)p
->p_thread
->td_pcb
- (char *)p
->p_addr
;
2153 min
+= offsetof(struct pcb
, pcb_save
);
2154 if (off
>= min
&& off
<= min
+ sizeof(union savefpu
) - sizeof(int)) {
2155 *(int*)((char *)p
->p_addr
+ off
) = data
;
2162 fill_regs(struct lwp
*lp
, struct reg
*regs
)
2165 struct trapframe
*tp
;
2167 tp
= lp
->lwp_md
.md_regs
;
2168 regs
->r_fs
= tp
->tf_fs
;
2169 regs
->r_es
= tp
->tf_es
;
2170 regs
->r_ds
= tp
->tf_ds
;
2171 regs
->r_edi
= tp
->tf_edi
;
2172 regs
->r_esi
= tp
->tf_esi
;
2173 regs
->r_ebp
= tp
->tf_ebp
;
2174 regs
->r_ebx
= tp
->tf_ebx
;
2175 regs
->r_edx
= tp
->tf_edx
;
2176 regs
->r_ecx
= tp
->tf_ecx
;
2177 regs
->r_eax
= tp
->tf_eax
;
2178 regs
->r_eip
= tp
->tf_eip
;
2179 regs
->r_cs
= tp
->tf_cs
;
2180 regs
->r_eflags
= tp
->tf_eflags
;
2181 regs
->r_esp
= tp
->tf_esp
;
2182 regs
->r_ss
= tp
->tf_ss
;
2183 pcb
= lp
->lwp_thread
->td_pcb
;
2184 regs
->r_gs
= pcb
->pcb_gs
;
2189 set_regs(struct lwp
*lp
, struct reg
*regs
)
2192 struct trapframe
*tp
;
2194 tp
= lp
->lwp_md
.md_regs
;
2195 if (!EFL_SECURE(regs
->r_eflags
, tp
->tf_eflags
) ||
2196 !CS_SECURE(regs
->r_cs
))
2198 tp
->tf_fs
= regs
->r_fs
;
2199 tp
->tf_es
= regs
->r_es
;
2200 tp
->tf_ds
= regs
->r_ds
;
2201 tp
->tf_edi
= regs
->r_edi
;
2202 tp
->tf_esi
= regs
->r_esi
;
2203 tp
->tf_ebp
= regs
->r_ebp
;
2204 tp
->tf_ebx
= regs
->r_ebx
;
2205 tp
->tf_edx
= regs
->r_edx
;
2206 tp
->tf_ecx
= regs
->r_ecx
;
2207 tp
->tf_eax
= regs
->r_eax
;
2208 tp
->tf_eip
= regs
->r_eip
;
2209 tp
->tf_cs
= regs
->r_cs
;
2210 tp
->tf_eflags
= regs
->r_eflags
;
2211 tp
->tf_esp
= regs
->r_esp
;
2212 tp
->tf_ss
= regs
->r_ss
;
2213 pcb
= lp
->lwp_thread
->td_pcb
;
2214 pcb
->pcb_gs
= regs
->r_gs
;
2218 #ifndef CPU_DISABLE_SSE
2220 fill_fpregs_xmm(struct savexmm
*sv_xmm
, struct save87
*sv_87
)
2222 struct env87
*penv_87
= &sv_87
->sv_env
;
2223 struct envxmm
*penv_xmm
= &sv_xmm
->sv_env
;
2226 /* FPU control/status */
2227 penv_87
->en_cw
= penv_xmm
->en_cw
;
2228 penv_87
->en_sw
= penv_xmm
->en_sw
;
2229 penv_87
->en_tw
= penv_xmm
->en_tw
;
2230 penv_87
->en_fip
= penv_xmm
->en_fip
;
2231 penv_87
->en_fcs
= penv_xmm
->en_fcs
;
2232 penv_87
->en_opcode
= penv_xmm
->en_opcode
;
2233 penv_87
->en_foo
= penv_xmm
->en_foo
;
2234 penv_87
->en_fos
= penv_xmm
->en_fos
;
2237 for (i
= 0; i
< 8; ++i
)
2238 sv_87
->sv_ac
[i
] = sv_xmm
->sv_fp
[i
].fp_acc
;
2240 sv_87
->sv_ex_sw
= sv_xmm
->sv_ex_sw
;
2244 set_fpregs_xmm(struct save87
*sv_87
, struct savexmm
*sv_xmm
)
2246 struct env87
*penv_87
= &sv_87
->sv_env
;
2247 struct envxmm
*penv_xmm
= &sv_xmm
->sv_env
;
2250 /* FPU control/status */
2251 penv_xmm
->en_cw
= penv_87
->en_cw
;
2252 penv_xmm
->en_sw
= penv_87
->en_sw
;
2253 penv_xmm
->en_tw
= penv_87
->en_tw
;
2254 penv_xmm
->en_fip
= penv_87
->en_fip
;
2255 penv_xmm
->en_fcs
= penv_87
->en_fcs
;
2256 penv_xmm
->en_opcode
= penv_87
->en_opcode
;
2257 penv_xmm
->en_foo
= penv_87
->en_foo
;
2258 penv_xmm
->en_fos
= penv_87
->en_fos
;
2261 for (i
= 0; i
< 8; ++i
)
2262 sv_xmm
->sv_fp
[i
].fp_acc
= sv_87
->sv_ac
[i
];
2264 sv_xmm
->sv_ex_sw
= sv_87
->sv_ex_sw
;
2266 #endif /* CPU_DISABLE_SSE */
2269 fill_fpregs(struct lwp
*lp
, struct fpreg
*fpregs
)
2271 #ifndef CPU_DISABLE_SSE
2273 fill_fpregs_xmm(&lp
->lwp_thread
->td_pcb
->pcb_save
.sv_xmm
,
2274 (struct save87
*)fpregs
);
2277 #endif /* CPU_DISABLE_SSE */
2278 bcopy(&lp
->lwp_thread
->td_pcb
->pcb_save
.sv_87
, fpregs
, sizeof *fpregs
);
2283 set_fpregs(struct lwp
*lp
, struct fpreg
*fpregs
)
2285 #ifndef CPU_DISABLE_SSE
2287 set_fpregs_xmm((struct save87
*)fpregs
,
2288 &lp
->lwp_thread
->td_pcb
->pcb_save
.sv_xmm
);
2291 #endif /* CPU_DISABLE_SSE */
2292 bcopy(fpregs
, &lp
->lwp_thread
->td_pcb
->pcb_save
.sv_87
, sizeof *fpregs
);
2297 fill_dbregs(struct lwp
*lp
, struct dbreg
*dbregs
)
2300 dbregs
->dr0
= rdr0();
2301 dbregs
->dr1
= rdr1();
2302 dbregs
->dr2
= rdr2();
2303 dbregs
->dr3
= rdr3();
2304 dbregs
->dr4
= rdr4();
2305 dbregs
->dr5
= rdr5();
2306 dbregs
->dr6
= rdr6();
2307 dbregs
->dr7
= rdr7();
2311 pcb
= lp
->lwp_thread
->td_pcb
;
2312 dbregs
->dr0
= pcb
->pcb_dr0
;
2313 dbregs
->dr1
= pcb
->pcb_dr1
;
2314 dbregs
->dr2
= pcb
->pcb_dr2
;
2315 dbregs
->dr3
= pcb
->pcb_dr3
;
2318 dbregs
->dr6
= pcb
->pcb_dr6
;
2319 dbregs
->dr7
= pcb
->pcb_dr7
;
/*
 * set_dbregs -- install a user-supplied debug register set; the
 * ptrace(PT_SETDBREGS)-style back end.  Validates dr7 and the
 * breakpoint addresses before committing them to the lwp's PCB.
 *
 * NOTE(review): this extraction dropped interior lines (2326-2327,
 * 2336-2337, 2341-2342, 2346-2347, 2351-2352, 2355-2356, 2363-2364,
 * 2369-2370, 2375-2377, ... are absent), so the return type, braces,
 * the "if (lp == NULL)" split, the EINVAL returns inside the checks
 * and the final return are not visible -- confirm against the full
 * file.
 */
2325 set_dbregs(struct lwp
*lp
, struct dbreg
*dbregs
)
/*
 * Live-register half: load each value straight into the hardware
 * debug registers via the load_dr0()..load_dr7() accessors.
 */
2328 load_dr0(dbregs
->dr0
);
2329 load_dr1(dbregs
->dr1
);
2330 load_dr2(dbregs
->dr2
);
2331 load_dr3(dbregs
->dr3
);
2332 load_dr4(dbregs
->dr4
);
2333 load_dr5(dbregs
->dr5
);
2334 load_dr6(dbregs
->dr6
);
2335 load_dr7(dbregs
->dr7
);
/* Per-lwp half: locals for the validation below. */
2338 struct ucred
*ucred
;
2340 uint32_t mask1
, mask2
;
2343 * Don't let an illegal value for dr7 get set. Specifically,
2344 * check for undefined settings. Setting these bit patterns
2345 * result in undefined behaviour and can lead to an unexpected
/*
 * Scan the four R/W-length fields of dr7 (bits 16+, two bits per
 * field, stepped by the shifting masks): mask1 isolates a field,
 * mask2 is the reserved 10b pattern being rejected.  NOTE(review):
 * the rejection return (presumably EINVAL, original lines 2351-2352)
 * is not visible in this extraction.
 */
2348 for (i
= 0, mask1
= 0x3<<16, mask2
= 0x2<<16; i
< 8;
2349 i
++, mask1
<<= 2, mask2
<<= 2)
2350 if ((dbregs
->dr7
& mask1
) == mask2
)
2353 pcb
= lp
->lwp_thread
->td_pcb
;
2354 ucred
= lp
->lwp_proc
->p_ucred
;
2357 * Don't let a process set a breakpoint that is not within the
2358 * process's address space. If a process could do this, it
2359 * could halt the system by setting a breakpoint in the kernel
2360 * (if ddb was enabled). Thus, we need to check to make sure
2361 * that no breakpoints are being enabled for addresses outside
2362 * process's address space, unless, perhaps, we were called by
2365 * XXX - what about when the watched area of the user's
2366 * address space is written into from within the kernel
2367 * ... wouldn't that still cause a breakpoint to be generated
2368 * from within kernel mode?
/*
 * suser_cred() != 0 means the caller is NOT superuser: each enabled
 * breakpoint (dr7 L/G bit pairs 0x3 << 2n selects drN) must point
 * below VM_MAXUSER_ADDRESS.  NOTE(review): the rejection returns
 * inside each check were dropped by the extraction.
 */
2371 if (suser_cred(ucred
, 0) != 0) {
2372 if (dbregs
->dr7
& 0x3) {
2373 /* dr0 is enabled */
2374 if (dbregs
->dr0
>= VM_MAXUSER_ADDRESS
)
2378 if (dbregs
->dr7
& (0x3<<2)) {
2379 /* dr1 is enabled */
2380 if (dbregs
->dr1
>= VM_MAXUSER_ADDRESS
)
2384 if (dbregs
->dr7
& (0x3<<4)) {
2385 /* dr2 is enabled */
2386 if (dbregs
->dr2
>= VM_MAXUSER_ADDRESS
)
2390 if (dbregs
->dr7
& (0x3<<6)) {
2391 /* dr3 is enabled */
2392 if (dbregs
->dr3
>= VM_MAXUSER_ADDRESS
)
/*
 * Validation passed: commit the new values to the PCB so they are
 * reloaded on context switch.
 */
2397 pcb
->pcb_dr0
= dbregs
->dr0
;
2398 pcb
->pcb_dr1
= dbregs
->dr1
;
2399 pcb
->pcb_dr2
= dbregs
->dr2
;
2400 pcb
->pcb_dr3
= dbregs
->dr3
;
2401 pcb
->pcb_dr6
= dbregs
->dr6
;
2402 pcb
->pcb_dr7
= dbregs
->dr7
;
/* Mark the PCB as carrying debug-register state. */
2404 pcb
->pcb_flags
|= PCB_DBREGS
;
/*
 * user_dbreg_trap -- classify a debug-register trap.
 *
 * NOTE(review): interior lines (2421-2423, 2429-2434, 2436-2438,
 * 2441-2445, 2448-2450, ...) were dropped by the extraction, so the
 * return type, braces, the "dr7 = rdr7()" / "dr6 = rdr6()" /
 * "nbp = 0" initializations, the per-bit "if (bp & 0x0N)" guards
 * around the rdrN() reads, the "if (addr[i] < ...)" comparison head
 * and the return statements are not visible -- confirm against the
 * full file.
 */
2411 * Return > 0 if a hardware breakpoint has been hit, and the
2412 * breakpoint was in user space. Return 0, otherwise.
2415 user_dbreg_trap(void)
2417 u_int32_t dr7
, dr6
; /* debug registers dr6 and dr7 */
2418 u_int32_t bp
; /* breakpoint bits extracted from dr6 */
2419 int nbp
; /* number of breakpoints that triggered */
2420 caddr_t addr
[4]; /* breakpoint addresses */
/* Low 8 bits of dr7 are the L0-L3/G0-G3 enable bits. */
2424 if ((dr7
& 0x000000ff) == 0) {
2426 * all GE and LE bits in the dr7 register are zero,
2427 * thus the trap couldn't have been caused by the
2428 * hardware debug registers
/* Low 4 bits of dr6 (B0-B3) say which breakpoints fired. */
2435 bp
= dr6
& 0x0000000f;
2439 * None of the breakpoint bits are set meaning this
2440 * trap was not caused by any of the debug registers
2446 * at least one of the breakpoints were hit, check to see
2447 * which ones and if any of them are user space addresses
/* Collect the address of each triggered breakpoint. */
2451 addr
[nbp
++] = (caddr_t
)rdr0();
2454 addr
[nbp
++] = (caddr_t
)rdr1();
2457 addr
[nbp
++] = (caddr_t
)rdr2();
2460 addr
[nbp
++] = (caddr_t
)rdr3();
/* Any triggered address below VM_MAXUSER_ADDRESS is a user-space hit. */
2463 for (i
=0; i
<nbp
; i
++) {
2465 (caddr_t
)VM_MAXUSER_ADDRESS
) {
2467 * addr[i] is in user space
2474 * None of the breakpoints are in user space.
/*
 * Debugger -- stub used when the in-kernel debugger (DDB) is not
 * compiled in: just report that a debugger entry was requested.
 * NOTE(review): the return type line and braces were dropped by the
 * extraction; presumably this sits under the !DDB branch -- confirm.
 */
2482 Debugger(const char *msg
)
2484 printf("Debugger(\"%s\") called.\n", msg
);
2488 #include <sys/disklabel.h>
/*
 * bounds_check_with_label -- validate and translate a partition-
 * relative I/O request against the disklabel.
 *
 * dev    - the partition device (dkpart() extracts the partition index)
 * bio    - the request; bio->bio_offset is partition-relative
 * lp     - the disklabel describing the partitions
 * wlabel - non-zero permits writing over the on-disk label
 *
 * NOTE(review): interior lines (2501-2502, 2509, 2515, 2518-2520,
 * 2526-2530, 2532, 2534, 2536, 2538, 2540-2542, 2546, 2550-2551,
 * 2553, 2556-2558, 2561-2564, 2567+) were dropped by the extraction,
 * so the return type, braces, the #endif lines, the goto labels for
 * the error paths and the return statements are not visible --
 * confirm against the full file.
 */
2491 * Determine the size of the transfer, and make sure it is
2492 * within the boundaries of the partition. Adjust transfer
2493 * if needed, and signal errors or early completion.
2495 * On success a new bio layer is pushed with the translated
2496 * block number, and returned.
2499 bounds_check_with_label(cdev_t dev
, struct bio
*bio
,
2500 struct disklabel
*lp
, int wlabel
)
/*
 * labelsect: sector of the label (offset of partition 0);
 * maxsz: partition size in sectors; sz: request size rounded up to
 * whole DEV_BSIZE sectors; blkno: request start sector.
 */
2503 struct buf
*bp
= bio
->bio_buf
;
2504 struct partition
*p
= lp
->d_partitions
+ dkpart(dev
);
2505 int labelsect
= lp
->d_partitions
[0].p_offset
;
2506 int maxsz
= p
->p_size
,
2507 sz
= (bp
->b_bcount
+ DEV_BSIZE
- 1) >> DEV_BSHIFT
;
2508 daddr_t blkno
= (daddr_t
)(bio
->bio_offset
>> DEV_BSHIFT
);
2510 /* overwriting disk label ? */
2511 /* XXX should also protect bootstrap in first 8K */
/*
 * A non-read that overlaps the label sector fails with EROFS unless
 * the caller explicitly enabled label writing (wlabel != 0).
 */
2512 if (blkno
+ p
->p_offset
<= LABELSECTOR
+ labelsect
&&
2513 #if LABELSECTOR != 0
2514 blkno
+ p
->p_offset
+ sz
> LABELSECTOR
+ labelsect
&&
2516 bp
->b_cmd
!= BUF_CMD_READ
&& wlabel
== 0) {
2517 bp
->b_error
= EROFS
;
2521 #if defined(DOSBBSECTOR) && defined(notyet)
2522 /* overwriting master boot record? */
2523 if (blkno
+ p
->p_offset
<= DOSBBSECTOR
&&
2524 bp
->b_cmd
!= BUF_CMD_READ
&& wlabel
== 0) {
2525 bp
->b_error
= EROFS
;
2531 * Check for out of bounds, EOF, and EOF clipping.
2533 if (bio
->bio_offset
< 0)
2535 if (blkno
+ sz
> maxsz
) {
2537 * Past EOF or B_BNOCLIP flag was set, the request is bad.
2539 if (blkno
> maxsz
|| (bp
->b_flags
& B_BNOCLIP
))
2543 * If exactly on EOF just complete the I/O with no bytes
2544 * transfered. B_INVAL must be set to throw away the
2545 * contents of the buffer. Otherwise clip b_bcount.
2547 if (blkno
== maxsz
) {
2548 bp
->b_resid
= bp
->b_bcount
;
2549 bp
->b_flags
|= B_INVAL
;
/* Clip the request so it ends exactly at the partition boundary. */
2552 bp
->b_bcount
= (maxsz
- blkno
) << DEV_BSHIFT
;
/*
 * Success: push a new bio layer whose offset is translated to be
 * absolute on the underlying device (partition offset added).
 */
2554 nbio
= push_bio(bio
);
2555 nbio
->bio_offset
= bio
->bio_offset
+ ((off_t
)p
->p_offset
<< DEV_BSHIFT
);
2559 * The caller is responsible for calling biodone() on the passed bio
2560 * when we return NULL.
/* Error epilogue: flag the buffer and consume the full byte count. */
2563 bp
->b_error
= EINVAL
;
2565 bp
->b_resid
= bp
->b_bcount
;
2566 bp
->b_flags
|= B_ERROR
| B_INVAL
;
/*
 * DDB support: out-of-line inb()/outb().
 * NOTE(review): the "u_char inb(u_int);" prototype and the inb()
 * signature/locals (original lines around 2585 and 2589-2592) were
 * dropped by the extraction; only the prototype for outb() and inb's
 * asm body remain visible.  Presumably this region is under #ifdef
 * DDB -- confirm against the full file.
 */
2574 * Provide inb() and outb() as functions. They are normally only
2575 * available as macros calling inlined functions, thus cannot be
2576 * called inside DDB.
2578 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
2584 /* silence compiler warnings */
2586 void outb(u_int
, u_char
);
2593 * We use %%dx and not %1 here because i/o is done at %dx and not at
2594 * %edx, while gcc generates inferior code (movw instead of movl)
2595 * if we tell it to load (u_short) port.
/* Read one byte from I/O port (in %dx) into data (in %al). */
2597 __asm
__volatile("inb %%dx,%0" : "=a" (data
) : "d" (port
));
/*
 * outb -- de-inlined I/O port byte write (DDB support).
 * NOTE(review): the return type line, braces and the "u_char al;"
 * declaration plus the "al = data;" assignment the comment below
 * refers to were dropped by the extraction -- confirm against the
 * full file.
 */
2602 outb(u_int port
, u_char data
)
2606 * Use an unnecessary assignment to help gcc's register allocator.
2607 * This make a large difference for gcc-1.40 and a tiny difference
2608 * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for
2609 * best results. gcc-2.6.0 can't handle this.
/* Write the byte (in %al) to I/O port (in %dx). */
2612 __asm
__volatile("outb %0,%%dx" : : "a" (al
), "d" (port
));
2619 #include "opt_cpu.h"
/*
 * Deprecated global spinlocks used by the SMP code.  Each is
 * initialized with spin_lock_init() during SMP lock setup (see the
 * spin_lock_init() calls below).
 */
2623 * initialize all the SMP locks
2626 /* critical region when masking or unmasking interrupts */
2627 struct spinlock_deprecated imen_spinlock
;
2629 /* Make FAST_INTR() routines sequential */
2630 struct spinlock_deprecated fast_intr_spinlock
;
2632 /* critical region for old style disable_intr/enable_intr */
2633 struct spinlock_deprecated mpintr_spinlock
;
2635 /* critical region around INTR() routines */
2636 struct spinlock_deprecated intr_spinlock
;
2638 /* lock region used by kernel profiling */
2639 struct spinlock_deprecated mcount_spinlock
;
2641 /* locks com (tty) data/hardware accesses: a FASTINTR() */
2642 struct spinlock_deprecated com_spinlock
;
2644 /* locks kernel printfs */
2645 struct spinlock_deprecated cons_spinlock
;
2647 /* lock regions around the clock hardware */
2648 struct spinlock_deprecated clock_spinlock
;
2650 /* lock around the MP rendezvous */
2651 struct spinlock_deprecated smp_rv_spinlock
;
2657 * mp_lock = 0; BSP already owns the MP lock
2660 * Get the initial mp_lock with a count of 1 for the BSP.
2661 * This uses a LOGICAL cpu ID, ie BSP == 0.
2664 cpu_get_initial_mplock();
2667 spin_lock_init(&mcount_spinlock
);
2668 spin_lock_init(&fast_intr_spinlock
);
2669 spin_lock_init(&intr_spinlock
);
2670 spin_lock_init(&mpintr_spinlock
);
2671 spin_lock_init(&imen_spinlock
);
2672 spin_lock_init(&smp_rv_spinlock
);
2673 spin_lock_init(&com_spinlock
);
2674 spin_lock_init(&clock_spinlock
);
2675 spin_lock_init(&cons_spinlock
);
2677 /* our token pool needs to work early */
2678 lwkt_token_pool_init();