2 * Copyright (c) 1992 Terrence R. Lambert.
3 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
6 * This code is derived from software contributed to Berkeley by
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the University of
20 * California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
38 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
39 * $DragonFly: src/sys/platform/pc32/i386/machdep.c,v 1.116 2007/01/14 21:07:12 dillon Exp $
43 #include "use_ether.h"
46 #include "opt_atalk.h"
47 #include "opt_compat.h"
50 #include "opt_directio.h"
53 #include "opt_maxmem.h"
54 #include "opt_msgbuf.h"
55 #include "opt_perfmon.h"
57 #include "opt_userconfig.h"
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/sysproto.h>
62 #include <sys/signalvar.h>
63 #include <sys/kernel.h>
64 #include <sys/linker.h>
65 #include <sys/malloc.h>
68 #include <sys/reboot.h>
70 #include <sys/msgbuf.h>
71 #include <sys/sysent.h>
72 #include <sys/sysctl.h>
73 #include <sys/vmmeter.h>
75 #include <sys/upcall.h>
76 #include <sys/usched.h>
80 #include <vm/vm_param.h>
82 #include <vm/vm_kern.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_page.h>
85 #include <vm/vm_map.h>
86 #include <vm/vm_pager.h>
87 #include <vm/vm_extern.h>
89 #include <sys/thread2.h>
97 #include <machine/cpu.h>
98 #include <machine/clock.h>
99 #include <machine/specialreg.h>
100 #include <machine/bootinfo.h>
101 #include <machine/md_var.h>
102 #include <machine/pcb_ext.h> /* pcb.h included via sys/user.h */
103 #include <machine/globaldata.h> /* CPU_prvspace */
104 #include <machine/smp.h>
106 #include <machine/perfmon.h>
108 #include <machine/cputypes.h>
111 #include <bus/isa/i386/isa_device.h>
113 #include <machine_base/isa/intr_machdep.h>
114 #include <bus/isa/rtc.h>
115 #include <machine/vm86.h>
116 #include <sys/random.h>
117 #include <sys/ptrace.h>
118 #include <machine/sigframe.h>
120 #define PHYSMAP_ENTRIES 10
122 extern void init386 (int first
);
123 extern void dblfault_handler (void);
125 extern void printcpuinfo(void); /* XXX header file */
126 extern void finishidentcpu(void);
127 extern void panicifcpuunsupported(void);
128 extern void initializecpu(void);
130 static void cpu_startup (void *);
131 #ifndef CPU_DISABLE_SSE
132 static void set_fpregs_xmm (struct save87
*, struct savexmm
*);
133 static void fill_fpregs_xmm (struct savexmm
*, struct save87
*);
134 #endif /* CPU_DISABLE_SSE */
136 extern void ffs_rawread_setup(void);
137 #endif /* DIRECTIO */
138 static void init_locks(void);
140 SYSINIT(cpu
, SI_SUB_CPU
, SI_ORDER_FIRST
, cpu_startup
, NULL
)
142 int _udatasel
, _ucodesel
;
145 int64_t tsc_offsets
[MAXCPU
];
147 int64_t tsc_offsets
[1];
150 #if defined(SWTCH_OPTIM_STATS)
151 extern int swtch_optim_stats
;
152 SYSCTL_INT(_debug
, OID_AUTO
, swtch_optim_stats
,
153 CTLFLAG_RD
, &swtch_optim_stats
, 0, "");
154 SYSCTL_INT(_debug
, OID_AUTO
, tlb_flush_count
,
155 CTLFLAG_RD
, &tlb_flush_count
, 0, "");
/*
 * sysctl_hw_physmem - hw.physmem sysctl handler.
 *
 * Exports total physical memory in bytes, ctob(physmem), via
 * sysctl_handle_int().  Registered read-only (CTLFLAG_RD) below,
 * so the new-value path of the handler is never taken.
 * NOTE(review): the function's braces and return statement are
 * elided in this extract -- confirm against the full source.
 */
161 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS
)
163 int error
= sysctl_handle_int(oidp
, 0, ctob(physmem
), req
);
167 SYSCTL_PROC(_hw
, HW_PHYSMEM
, physmem
, CTLTYPE_INT
|CTLFLAG_RD
,
168 0, 0, sysctl_hw_physmem
, "IU", "");
/*
 * sysctl_hw_usermem - hw.usermem sysctl handler.
 *
 * Exports the memory available to userland: total physical pages
 * minus wired pages (physmem - vmstats.v_wire_count), converted
 * to bytes with ctob().  Registered read-only (CTLFLAG_RD) below.
 */
171 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS
)
173 int error
= sysctl_handle_int(oidp
, 0,
174 ctob(physmem
- vmstats
.v_wire_count
), req
);
178 SYSCTL_PROC(_hw
, HW_USERMEM
, usermem
, CTLTYPE_INT
|CTLFLAG_RD
,
179 0, 0, sysctl_hw_usermem
, "IU", "");
/*
 * sysctl_hw_availpages - hw.availpages sysctl handler.
 *
 * Exports the number of pages in the managed physical memory
 * range, i386_btop(avail_end - avail_start).  Registered
 * read-only (CTLFLAG_RD) below.
 */
182 sysctl_hw_availpages(SYSCTL_HANDLER_ARGS
)
184 int error
= sysctl_handle_int(oidp
, 0,
185 i386_btop(avail_end
- avail_start
), req
);
189 SYSCTL_PROC(_hw
, OID_AUTO
, availpages
, CTLTYPE_INT
|CTLFLAG_RD
,
190 0, 0, sysctl_hw_availpages
, "I", "");
/*
 * sysctl_machdep_msgbuf - machdep.msgbuf sysctl handler.
 *
 * Exports the kernel message buffer to userland as a string.  The
 * buffer is circular, so two copies are made to present it as a
 * linear region: first the tail from the read offset (msg_bufr) to
 * the end of the buffer, then -- if the read offset is non-zero --
 * the wrapped head from the start of the buffer up to msg_bufr.
 */
193 sysctl_machdep_msgbuf(SYSCTL_HANDLER_ARGS
)
197 /* Unwind the buffer, so that it's linear (possibly starting with
198 * some initial nulls).
200 error
=sysctl_handle_opaque(oidp
,msgbufp
->msg_ptr
+msgbufp
->msg_bufr
,
201 msgbufp
->msg_size
-msgbufp
->msg_bufr
,req
);
202 if(error
) return(error
);
203 if(msgbufp
->msg_bufr
>0) {
204 error
=sysctl_handle_opaque(oidp
,msgbufp
->msg_ptr
,
205 msgbufp
->msg_bufr
,req
);
210 SYSCTL_PROC(_machdep
, OID_AUTO
, msgbuf
, CTLTYPE_STRING
|CTLFLAG_RD
,
211 0, 0, sysctl_machdep_msgbuf
, "A","Contents of kernel message buffer");
213 static int msgbuf_clear
;
/*
 * sysctl_machdep_msgbuf_clear - machdep.msgbuf_clear sysctl handler.
 *
 * Reads/writes the msgbuf_clear flag through sysctl_handle_int().
 * On a successful write (req->newptr set), the entire message
 * buffer is zeroed with bzero() and both the read (msg_bufr) and
 * write (msg_bufx) offsets are reset to 0.
 */
216 sysctl_machdep_msgbuf_clear(SYSCTL_HANDLER_ARGS
)
219 error
= sysctl_handle_int(oidp
, oidp
->oid_arg1
, oidp
->oid_arg2
,
221 if (!error
&& req
->newptr
) {
222 /* Clear the buffer and reset write pointer */
223 bzero(msgbufp
->msg_ptr
,msgbufp
->msg_size
);
224 msgbufp
->msg_bufr
=msgbufp
->msg_bufx
=0;
230 SYSCTL_PROC(_machdep
, OID_AUTO
, msgbuf_clear
, CTLTYPE_INT
|CTLFLAG_RW
,
231 &msgbuf_clear
, 0, sysctl_machdep_msgbuf_clear
, "I",
232 "Clear kernel message buffer");
234 vm_paddr_t Maxmem
= 0;
236 vm_paddr_t phys_avail
[PHYSMAP_ENTRIES
*2+2];
238 static vm_offset_t buffer_sva
, buffer_eva
;
239 vm_offset_t clean_sva
, clean_eva
;
240 static vm_offset_t pager_sva
, pager_eva
;
241 static struct trapframe proc0_tf
;
244 cpu_startup(void *dummy
)
250 vm_offset_t firstaddr
;
252 if (boothowto
& RB_VERBOSE
)
256 * Good {morning,afternoon,evening,night}.
258 kprintf("%s", version
);
261 panicifcpuunsupported();
265 kprintf("real memory = %llu (%lluK bytes)\n", ptoa(Maxmem
), ptoa(Maxmem
) / 1024);
267 * Display any holes after the first chunk of extended memory.
272 kprintf("Physical memory chunk(s):\n");
273 for (indx
= 0; phys_avail
[indx
+ 1] != 0; indx
+= 2) {
274 vm_paddr_t size1
= phys_avail
[indx
+ 1] - phys_avail
[indx
];
276 kprintf("0x%08llx - 0x%08llx, %llu bytes (%llu pages)\n",
277 phys_avail
[indx
], phys_avail
[indx
+ 1] - 1, size1
,
283 * Allocate space for system data structures.
284 * The first available kernel virtual address is in "v".
285 * As pages of kernel virtual memory are allocated, "v" is incremented.
286 * As pages of memory are allocated and cleared,
287 * "firstaddr" is incremented.
288 * An index into the kernel page table corresponding to the
289 * virtual memory address maintained in "v" is kept in "mapaddr".
293 * Make two passes. The first pass calculates how much memory is
294 * needed and allocates it. The second pass assigns virtual
295 * addresses to the various data structures.
299 v
= (caddr_t
)firstaddr
;
301 #define valloc(name, type, num) \
302 (name) = (type *)v; v = (caddr_t)((name)+(num))
303 #define valloclim(name, type, num, lim) \
304 (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
307 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
308 * For the first 64MB of ram nominally allocate sufficient buffers to
309 * cover 1/4 of our ram. Beyond the first 64MB allocate additional
310 * buffers to cover 1/20 of our ram over 64MB. When auto-sizing
311 * the buffer cache we limit the eventual kva reservation to
314 * factor represents the 1/4 x ram conversion.
317 int factor
= 4 * BKVASIZE
/ 1024;
318 int kbytes
= physmem
* (PAGE_SIZE
/ 1024);
322 nbuf
+= min((kbytes
- 4096) / factor
, 65536 / factor
);
324 nbuf
+= (kbytes
- 65536) * 2 / (factor
* 5);
325 if (maxbcache
&& nbuf
> maxbcache
/ BKVASIZE
)
326 nbuf
= maxbcache
/ BKVASIZE
;
330 * Do not allow the buffer_map to be more then 1/2 the size of the
333 if (nbuf
> (virtual_end
- virtual_start
) / (BKVASIZE
* 2)) {
334 nbuf
= (virtual_end
- virtual_start
) / (BKVASIZE
* 2);
335 kprintf("Warning: nbufs capped at %d\n", nbuf
);
338 nswbuf
= max(min(nbuf
/4, 256), 16);
340 if (nswbuf
< NSWBUF_MIN
)
347 valloc(swbuf
, struct buf
, nswbuf
);
348 valloc(buf
, struct buf
, nbuf
);
351 * End of first pass, size has been calculated so allocate memory
353 if (firstaddr
== 0) {
354 size
= (vm_size_t
)(v
- firstaddr
);
355 firstaddr
= kmem_alloc(&kernel_map
, round_page(size
));
357 panic("startup: no room for tables");
362 * End of second pass, addresses have been assigned
364 if ((vm_size_t
)(v
- firstaddr
) != size
)
365 panic("startup: table size inconsistency");
367 kmem_suballoc(&kernel_map
, &clean_map
, &clean_sva
, &clean_eva
,
368 (nbuf
*BKVASIZE
) + (nswbuf
*MAXPHYS
) + pager_map_size
);
369 kmem_suballoc(&clean_map
, &buffer_map
, &buffer_sva
, &buffer_eva
,
371 buffer_map
.system_map
= 1;
372 kmem_suballoc(&clean_map
, &pager_map
, &pager_sva
, &pager_eva
,
373 (nswbuf
*MAXPHYS
) + pager_map_size
);
374 pager_map
.system_map
= 1;
375 kmem_suballoc(&kernel_map
, &exec_map
, &minaddr
, &maxaddr
,
376 (16*(ARG_MAX
+(PAGE_SIZE
*3))));
378 #if defined(USERCONFIG)
380 cninit(); /* the preferred console may have changed */
383 kprintf("avail memory = %u (%uK bytes)\n", ptoa(vmstats
.v_free_count
),
384 ptoa(vmstats
.v_free_count
) / 1024);
387 * Set up buffers, so they can be used to read disk labels.
390 vm_pager_bufferinit();
394 * OK, enough kmem_alloc/malloc state should be up, lets get on with it!
396 mp_start(); /* fire up the APs and APICs */
403 * Send an interrupt to process.
405 * Stack is set up to allow sigcode stored
406 * at top to call routine, followed by kcall
407 * to sigreturn routine below. After sigreturn
408 * resets the signal mask, the stack, and the
409 * frame pointer, it returns to the user
413 sendsig(sig_t catcher
, int sig
, sigset_t
*mask
, u_long code
)
415 struct lwp
*lp
= curthread
->td_lwp
;
416 struct proc
*p
= lp
->lwp_proc
;
417 struct trapframe
*regs
;
418 struct sigacts
*psp
= p
->p_sigacts
;
419 struct sigframe sf
, *sfp
;
422 regs
= lp
->lwp_md
.md_regs
;
423 oonstack
= (lp
->lwp_sigstk
.ss_flags
& SS_ONSTACK
) ? 1 : 0;
425 /* save user context */
426 bzero(&sf
, sizeof(struct sigframe
));
427 sf
.sf_uc
.uc_sigmask
= *mask
;
428 sf
.sf_uc
.uc_stack
= lp
->lwp_sigstk
;
429 sf
.sf_uc
.uc_mcontext
.mc_onstack
= oonstack
;
430 bcopy(regs
, &sf
.sf_uc
.uc_mcontext
.mc_gs
, sizeof(struct trapframe
));
432 /* make the size of the saved context visible to userland */
433 sf
.sf_uc
.uc_mcontext
.mc_len
= sizeof(sf
.sf_uc
.uc_mcontext
);
435 /* save mailbox pending state for syscall interlock semantics */
436 if (p
->p_flag
& P_MAILBOX
)
437 sf
.sf_uc
.uc_mcontext
.mc_xflags
|= PGEX_MAILBOX
;
439 /* Allocate and validate space for the signal handler context. */
441 if ((p
->p_flag
& P_ALTSTACK
) != 0 && !oonstack
&&
442 SIGISMEMBER(psp
->ps_sigonstack
, sig
)) {
443 sfp
= (struct sigframe
*)(lp
->lwp_sigstk
.ss_sp
+
444 lp
->lwp_sigstk
.ss_size
- sizeof(struct sigframe
));
445 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
447 sfp
= (struct sigframe
*)regs
->tf_esp
- 1;
450 /* Translate the signal if appropriate */
451 if (p
->p_sysent
->sv_sigtbl
) {
452 if (sig
<= p
->p_sysent
->sv_sigsize
)
453 sig
= p
->p_sysent
->sv_sigtbl
[_SIG_IDX(sig
)];
456 /* Build the argument list for the signal handler. */
458 sf
.sf_ucontext
= (register_t
)&sfp
->sf_uc
;
459 if (SIGISMEMBER(psp
->ps_siginfo
, sig
)) {
460 /* Signal handler installed with SA_SIGINFO. */
461 sf
.sf_siginfo
= (register_t
)&sfp
->sf_si
;
462 sf
.sf_ahu
.sf_action
= (__siginfohandler_t
*)catcher
;
464 /* fill siginfo structure */
465 sf
.sf_si
.si_signo
= sig
;
466 sf
.sf_si
.si_code
= code
;
467 sf
.sf_si
.si_addr
= (void*)regs
->tf_err
;
470 /* Old FreeBSD-style arguments. */
471 sf
.sf_siginfo
= code
;
472 sf
.sf_addr
= regs
->tf_err
;
473 sf
.sf_ahu
.sf_handler
= catcher
;
477 * If we're a vm86 process, we want to save the segment registers.
478 * We also change eflags to be our emulated eflags, not the actual
481 if (regs
->tf_eflags
& PSL_VM
) {
482 struct trapframe_vm86
*tf
= (struct trapframe_vm86
*)regs
;
483 struct vm86_kernel
*vm86
= &lp
->lwp_thread
->td_pcb
->pcb_ext
->ext_vm86
;
485 sf
.sf_uc
.uc_mcontext
.mc_gs
= tf
->tf_vm86_gs
;
486 sf
.sf_uc
.uc_mcontext
.mc_fs
= tf
->tf_vm86_fs
;
487 sf
.sf_uc
.uc_mcontext
.mc_es
= tf
->tf_vm86_es
;
488 sf
.sf_uc
.uc_mcontext
.mc_ds
= tf
->tf_vm86_ds
;
490 if (vm86
->vm86_has_vme
== 0)
491 sf
.sf_uc
.uc_mcontext
.mc_eflags
=
492 (tf
->tf_eflags
& ~(PSL_VIF
| PSL_VIP
)) |
493 (vm86
->vm86_eflags
& (PSL_VIF
| PSL_VIP
));
496 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
497 * syscalls made by the signal handler. This just avoids
498 * wasting time for our lazy fixup of such faults. PSL_NT
499 * does nothing in vm86 mode, but vm86 programs can set it
500 * almost legitimately in probes for old cpu types.
502 tf
->tf_eflags
&= ~(PSL_VM
| PSL_NT
| PSL_VIF
| PSL_VIP
);
506 * Copy the sigframe out to the user's stack.
508 if (copyout(&sf
, sfp
, sizeof(struct sigframe
)) != 0) {
510 * Something is wrong with the stack pointer.
511 * ...Kill the process.
516 regs
->tf_esp
= (int)sfp
;
517 regs
->tf_eip
= PS_STRINGS
- *(p
->p_sysent
->sv_szsigcode
);
518 regs
->tf_eflags
&= ~PSL_T
;
519 regs
->tf_cs
= _ucodesel
;
520 regs
->tf_ds
= _udatasel
;
521 regs
->tf_es
= _udatasel
;
524 * Allow the signal handler to inherit %fs in addition to %gs as
525 * the userland program might be using both.
527 * However, if a T_PROTFLT occurred the segment registers could be
528 * totally broken. They must be reset in order to be able to
529 * return to userland.
531 if (regs
->tf_trapno
== T_PROTFLT
) {
532 regs
->tf_fs
= _udatasel
;
533 regs
->tf_gs
= _udatasel
;
535 regs
->tf_ss
= _udatasel
;
539 * Sanitize the trapframe for a virtual kernel passing control to a custom
540 * VM context. Remove any items that would otherwise create a privilege
543 * XXX at the moment we allow userland to set the resume flag. Is this a
/*
 * cpu_sanitize_frame - sanitize a trapframe supplied by userland
 * (used by the virtual kernel to pass control to a custom VM
 * context).
 *
 * Forces every segment register to the user code/data selectors
 * and masks tf_eflags down to the user-changeable bits (plus
 * PSL_RF), then ors back in the reserved-default bits and PSL_I so
 * interrupts stay enabled.  This prevents a forged frame from
 * granting kernel privilege.
 */
547 cpu_sanitize_frame(struct trapframe
*frame
)
549 frame
->tf_cs
= _ucodesel
;
550 frame
->tf_ds
= _udatasel
;
551 frame
->tf_es
= _udatasel
; /* XXX allow userland this one too? */
553 frame
->tf_fs
= _udatasel
;
554 frame
->tf_gs
= _udatasel
;
556 frame
->tf_ss
= _udatasel
;
557 frame
->tf_eflags
&= (PSL_RF
| PSL_USERCHANGE
);
558 frame
->tf_eflags
|= PSL_RESERVED_DEFAULT
| PSL_I
;
/*
 * cpu_sanitize_tls - validate user-supplied TLS segment descriptors.
 *
 * Walks the NGTLS descriptors, checking that each one is a plain
 * 32-bit user-mode read/write data segment: type SDT_MEMRWA, DPL
 * SEL_UPL, sd_def32 set, sd_p == 1 and sd_xx clear.  An all-zero
 * descriptor (sd_dpl == 0 && sd_type == 0) is tested separately,
 * presumably treated as empty/permitted.
 * NOTE(review): the branch bodies and return statements are elided
 * in this extract -- confirm accept/reject actions against the
 * full source.
 */
563 cpu_sanitize_tls(struct savetls
*tls
)
565 struct segment_descriptor
*desc
;
568 for (i
= 0; i
< NGTLS
; ++i
) {
570 if (desc
->sd_dpl
== 0 && desc
->sd_type
== 0)
572 if (desc
->sd_def32
== 0)
574 if (desc
->sd_type
!= SDT_MEMRWA
)
576 if (desc
->sd_dpl
!= SEL_UPL
)
578 if (desc
->sd_xx
!= 0 || desc
->sd_p
!= 1)
585 * sigreturn(ucontext_t *sigcntxp)
587 * System call to cleanup state after a signal
588 * has been taken. Reset signal mask and
589 * stack state from context left by sendsig (above).
590 * Return to previous pc and psl as specified by
591 * context left by sendsig. Check carefully to
592 * make sure that the user has not modified the
593 * state to gain improper privileges.
595 #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
596 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
599 sys_sigreturn(struct sigreturn_args
*uap
)
601 struct lwp
*lp
= curthread
->td_lwp
;
602 struct proc
*p
= lp
->lwp_proc
;
603 struct trapframe
*regs
;
609 if (!useracc((caddr_t
)ucp
, sizeof(ucontext_t
), VM_PROT_READ
))
612 regs
= lp
->lwp_md
.md_regs
;
613 eflags
= ucp
->uc_mcontext
.mc_eflags
;
615 if (eflags
& PSL_VM
) {
616 struct trapframe_vm86
*tf
= (struct trapframe_vm86
*)regs
;
617 struct vm86_kernel
*vm86
;
620 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
621 * set up the vm86 area, and we can't enter vm86 mode.
623 if (lp
->lwp_thread
->td_pcb
->pcb_ext
== 0)
625 vm86
= &lp
->lwp_thread
->td_pcb
->pcb_ext
->ext_vm86
;
626 if (vm86
->vm86_inited
== 0)
629 /* go back to user mode if both flags are set */
630 if ((eflags
& PSL_VIP
) && (eflags
& PSL_VIF
))
631 trapsignal(lp
->lwp_proc
, SIGBUS
, 0);
633 if (vm86
->vm86_has_vme
) {
634 eflags
= (tf
->tf_eflags
& ~VME_USERCHANGE
) |
635 (eflags
& VME_USERCHANGE
) | PSL_VM
;
637 vm86
->vm86_eflags
= eflags
; /* save VIF, VIP */
638 eflags
= (tf
->tf_eflags
& ~VM_USERCHANGE
) | (eflags
& VM_USERCHANGE
) | PSL_VM
;
640 bcopy(&ucp
->uc_mcontext
.mc_gs
, tf
, sizeof(struct trapframe
));
641 tf
->tf_eflags
= eflags
;
642 tf
->tf_vm86_ds
= tf
->tf_ds
;
643 tf
->tf_vm86_es
= tf
->tf_es
;
644 tf
->tf_vm86_fs
= tf
->tf_fs
;
645 tf
->tf_vm86_gs
= tf
->tf_gs
;
646 tf
->tf_ds
= _udatasel
;
647 tf
->tf_es
= _udatasel
;
649 tf
->tf_fs
= _udatasel
;
650 tf
->tf_gs
= _udatasel
;
654 * Don't allow users to change privileged or reserved flags.
657 * XXX do allow users to change the privileged flag PSL_RF.
658 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers
659 * should sometimes set it there too. tf_eflags is kept in
660 * the signal context during signal handling and there is no
661 * other place to remember it, so the PSL_RF bit may be
662 * corrupted by the signal handler without us knowing.
663 * Corruption of the PSL_RF bit at worst causes one more or
664 * one less debugger trap, so allowing it is fairly harmless.
666 if (!EFL_SECURE(eflags
& ~PSL_RF
, regs
->tf_eflags
& ~PSL_RF
)) {
667 kprintf("sigreturn: eflags = 0x%x\n", eflags
);
672 * Don't allow users to load a valid privileged %cs. Let the
673 * hardware check for invalid selectors, excess privilege in
674 * other selectors, invalid %eip's and invalid %esp's.
676 cs
= ucp
->uc_mcontext
.mc_cs
;
677 if (!CS_SECURE(cs
)) {
678 kprintf("sigreturn: cs = 0x%x\n", cs
);
679 trapsignal(lp
->lwp_proc
, SIGBUS
, T_PROTFLT
);
682 bcopy(&ucp
->uc_mcontext
.mc_gs
, regs
, sizeof(struct trapframe
));
686 * Merge saved signal mailbox pending flag to maintain interlock
687 * semantics against system calls.
689 if (ucp
->uc_mcontext
.mc_xflags
& PGEX_MAILBOX
)
690 p
->p_flag
|= P_MAILBOX
;
692 if (ucp
->uc_mcontext
.mc_onstack
& 1)
693 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
695 lp
->lwp_sigstk
.ss_flags
&= ~SS_ONSTACK
;
697 lp
->lwp_sigmask
= ucp
->uc_sigmask
;
698 SIG_CANTMASK(lp
->lwp_sigmask
);
703 * Stack frame on entry to function. %eax will contain the function vector,
704 * %ecx will contain the function data. flags, ecx, and eax will have
705 * already been pushed on the stack.
716 sendupcall(struct vmupcall
*vu
, int morepending
)
718 struct lwp
*lp
= curthread
->td_lwp
;
719 struct proc
*p
= lp
->lwp_proc
;
720 struct trapframe
*regs
;
721 struct upcall upcall
;
722 struct upc_frame upc_frame
;
726 * If we are a virtual kernel running an emulated user process
727 * context, switch back to the virtual kernel context before
728 * trying to post the signal.
730 if (p
->p_vkernel
&& p
->p_vkernel
->vk_current
) {
731 lp
->lwp_md
.md_regs
->tf_trapno
= 0;
732 vkernel_trap(p
, lp
->lwp_md
.md_regs
);
736 * Get the upcall data structure
738 if (copyin(lp
->lwp_upcall
, &upcall
, sizeof(upcall
)) ||
739 copyin((char *)upcall
.upc_uthread
+ upcall
.upc_critoff
, &crit_count
, sizeof(int))
742 kprintf("bad upcall address\n");
747 * If the data structure is already marked pending or has a critical
748 * section count, mark the data structure as pending and return
749 * without doing an upcall. vu_pending is left set.
751 if (upcall
.upc_pending
|| crit_count
>= vu
->vu_pending
) {
752 if (upcall
.upc_pending
< vu
->vu_pending
) {
753 upcall
.upc_pending
= vu
->vu_pending
;
754 copyout(&upcall
.upc_pending
, &lp
->lwp_upcall
->upc_pending
,
755 sizeof(upcall
.upc_pending
));
761 * We can run this upcall now, clear vu_pending.
763 * Bump our critical section count and set or clear the
764 * user pending flag depending on whether more upcalls are
765 * pending. The user will be responsible for calling
766 * upc_dispatch(-1) to process remaining upcalls.
769 upcall
.upc_pending
= morepending
;
770 crit_count
+= TDPRI_CRIT
;
771 copyout(&upcall
.upc_pending
, &lp
->lwp_upcall
->upc_pending
,
772 sizeof(upcall
.upc_pending
));
773 copyout(&crit_count
, (char *)upcall
.upc_uthread
+ upcall
.upc_critoff
,
777 * Construct a stack frame and issue the upcall
779 regs
= lp
->lwp_md
.md_regs
;
780 upc_frame
.eax
= regs
->tf_eax
;
781 upc_frame
.ecx
= regs
->tf_ecx
;
782 upc_frame
.edx
= regs
->tf_edx
;
783 upc_frame
.flags
= regs
->tf_eflags
;
784 upc_frame
.oldip
= regs
->tf_eip
;
785 if (copyout(&upc_frame
, (void *)(regs
->tf_esp
- sizeof(upc_frame
)),
786 sizeof(upc_frame
)) != 0) {
787 kprintf("bad stack on upcall\n");
789 regs
->tf_eax
= (register_t
)vu
->vu_func
;
790 regs
->tf_ecx
= (register_t
)vu
->vu_data
;
791 regs
->tf_edx
= (register_t
)lp
->lwp_upcall
;
792 regs
->tf_eip
= (register_t
)vu
->vu_ctx
;
793 regs
->tf_esp
-= sizeof(upc_frame
);
798 * fetchupcall occurs in the context of a system call, which means that
799 * we have to return EJUSTRETURN in order to prevent eax and edx from
800 * being overwritten by the syscall return value.
802 * if vu is not NULL we return the new context in %edx, the new data in %ecx,
803 * and the function pointer in %eax.
806 fetchupcall (struct vmupcall
*vu
, int morepending
, void *rsp
)
808 struct upc_frame upc_frame
;
809 struct lwp
*lp
= curthread
->td_lwp
;
810 struct trapframe
*regs
;
812 struct upcall upcall
;
815 regs
= lp
->lwp_md
.md_regs
;
817 error
= copyout(&morepending
, &lp
->lwp_upcall
->upc_pending
, sizeof(int));
821 * This jumps us to the next ready context.
824 error
= copyin(lp
->lwp_upcall
, &upcall
, sizeof(upcall
));
827 error
= copyin((char *)upcall
.upc_uthread
+ upcall
.upc_critoff
, &crit_count
, sizeof(int));
828 crit_count
+= TDPRI_CRIT
;
830 error
= copyout(&crit_count
, (char *)upcall
.upc_uthread
+ upcall
.upc_critoff
, sizeof(int));
831 regs
->tf_eax
= (register_t
)vu
->vu_func
;
832 regs
->tf_ecx
= (register_t
)vu
->vu_data
;
833 regs
->tf_edx
= (register_t
)lp
->lwp_upcall
;
834 regs
->tf_eip
= (register_t
)vu
->vu_ctx
;
835 regs
->tf_esp
= (register_t
)rsp
;
838 * This returns us to the originally interrupted code.
840 error
= copyin(rsp
, &upc_frame
, sizeof(upc_frame
));
841 regs
->tf_eax
= upc_frame
.eax
;
842 regs
->tf_ecx
= upc_frame
.ecx
;
843 regs
->tf_edx
= upc_frame
.edx
;
844 regs
->tf_eflags
= (regs
->tf_eflags
& ~PSL_USERCHANGE
) |
845 (upc_frame
.flags
& PSL_USERCHANGE
);
846 regs
->tf_eip
= upc_frame
.oldip
;
847 regs
->tf_esp
= (register_t
)((char *)rsp
+ sizeof(upc_frame
));
856 * Machine dependent boot() routine
858 * I haven't seen anything to put here yet
859 * Possibly some stuff might be grafted back here from boot()
867 * Shutdown the CPU as much as possible
873 __asm__
__volatile("hlt");
877 * cpu_idle() represents the idle LWKT. You cannot return from this function
878 * (unless you want to blow things up!). Instead we look for runnable threads
879 * and loop or halt as appropriate. Giant is not held on entry to the thread.
881 * The main loop is entered with a critical section held, we must release
882 * the critical section before doing anything else. lwkt_switch() will
883 * check for pending interrupts due to entering and exiting its own
886 * Note on cpu_idle_hlt: On an SMP system we rely on a scheduler IPI
887 * to wake a HLTed cpu up. However, there are cases where the idlethread
888 * will be entered with the possibility that no IPI will occur and in such
889 * cases lwkt_switch() sets TDF_IDLE_NOHLT.
891 static int cpu_idle_hlt
= 1;
892 static int cpu_idle_hltcnt
;
893 static int cpu_idle_spincnt
;
894 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_hlt
, CTLFLAG_RW
,
895 &cpu_idle_hlt
, 0, "Idle loop HLT enable");
896 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_hltcnt
, CTLFLAG_RW
,
897 &cpu_idle_hltcnt
, 0, "Idle loop entry halts");
898 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_spincnt
, CTLFLAG_RW
,
899 &cpu_idle_spincnt
, 0, "Idle loop entry spins");
/*
 * cpu_idle_default_hook - default idle-loop halt hook
 * (replaceable later, e.g. by ACPI, via the cpu_idle_hook pointer).
 *
 * "sti; hlt" is issued as a single asm so the hlt is exactly the
 * instruction after the sti: the one-instruction interrupt shadow
 * of sti guarantees no interrupt can be taken in between, avoiding
 * a lost-wakeup race.
 */
902 cpu_idle_default_hook(void)
905 * We must guarantee that hlt is exactly the instruction
908 __asm
__volatile("sti; hlt");
911 /* Other subsystems (e.g., ACPI) can hook this later. */
912 void (*cpu_idle_hook
)(void) = cpu_idle_default_hook
;
917 struct thread
*td
= curthread
;
920 KKASSERT(td
->td_pri
< TDPRI_CRIT
);
923 * See if there are any LWKTs ready to go.
928 * If we are going to halt call splz unconditionally after
929 * CLIing to catch any interrupt races. Note that we are
930 * at SPL0 and interrupts are enabled.
932 if (cpu_idle_hlt
&& !lwkt_runnable() &&
933 (td
->td_flags
& TDF_IDLE_NOHLT
) == 0) {
934 __asm
__volatile("cli");
936 if (!lwkt_runnable())
940 __asm
__volatile("pause");
944 td
->td_flags
&= ~TDF_IDLE_NOHLT
;
947 __asm
__volatile("sti; pause");
949 __asm
__volatile("sti");
957 * Clear registers on exec
960 setregs(struct lwp
*lp
, u_long entry
, u_long stack
, u_long ps_strings
)
962 struct trapframe
*regs
= lp
->lwp_md
.md_regs
;
963 struct pcb
*pcb
= lp
->lwp_thread
->td_pcb
;
965 /* was i386_user_cleanup() in NetBSD */
968 bzero((char *)regs
, sizeof(struct trapframe
));
969 regs
->tf_eip
= entry
;
970 regs
->tf_esp
= stack
;
971 regs
->tf_eflags
= PSL_USER
| (regs
->tf_eflags
& PSL_T
);
972 regs
->tf_ss
= _udatasel
;
973 regs
->tf_ds
= _udatasel
;
974 regs
->tf_es
= _udatasel
;
975 regs
->tf_fs
= _udatasel
;
976 regs
->tf_gs
= _udatasel
;
977 regs
->tf_cs
= _ucodesel
;
979 /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */
980 regs
->tf_ebx
= ps_strings
;
983 * Reset the hardware debug registers if they were in use.
984 * They won't have any meaning for the newly exec'd process.
986 if (pcb
->pcb_flags
& PCB_DBREGS
) {
993 if (pcb
== curthread
->td_pcb
) {
995 * Clear the debug registers on the running
996 * CPU, otherwise they will end up affecting
997 * the next process we switch to.
1001 pcb
->pcb_flags
&= ~PCB_DBREGS
;
1005 * Initialize the math emulator (if any) for the current process.
1006 * Actually, just clear the bit that says that the emulator has
1007 * been initialized. Initialization is delayed until the process
1008 * traps to the emulator (if it is done at all) mainly because
1009 * emulators don't provide an entry point for initialization.
1011 lp
->lwp_thread
->td_pcb
->pcb_flags
&= ~FP_SOFTFP
;
1014 * note: do not set CR0_TS here. npxinit() must do it after clearing
1015 * gd_npxthread. Otherwise a preemptive interrupt thread may panic
1019 load_cr0(rcr0() | CR0_MP
);
1022 /* Initialize the npx (if any) for the current process. */
1023 npxinit(__INITIAL_NPXCW__
);
1028 * note: linux emulator needs edx to be 0x0 on entry, which is
1029 * handled in execve simply by setting the 64 bit syscall
1030 * return value to 0.
1040 cr0
|= CR0_NE
; /* Done by npxinit() */
1041 cr0
|= CR0_MP
| CR0_TS
; /* Done at every execve() too. */
1043 if (cpu_class
!= CPUCLASS_386
)
1045 cr0
|= CR0_WP
| CR0_AM
;
/*
 * sysctl_machdep_adjkerntz - machdep.adjkerntz sysctl handler.
 *
 * Reads/writes the adjkerntz value through sysctl_handle_int();
 * on a successful write (req->newptr set) an additional action is
 * taken whose body is elided in this extract -- presumably
 * resynchronizing the CMOS clock; confirm against the full source.
 */
1051 sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS
)
1054 error
= sysctl_handle_int(oidp
, oidp
->oid_arg1
, oidp
->oid_arg2
,
1056 if (!error
&& req
->newptr
)
1061 SYSCTL_PROC(_machdep
, CPU_ADJKERNTZ
, adjkerntz
, CTLTYPE_INT
|CTLFLAG_RW
,
1062 &adjkerntz
, 0, sysctl_machdep_adjkerntz
, "I", "");
1064 SYSCTL_INT(_machdep
, CPU_DISRTCSET
, disable_rtc_set
,
1065 CTLFLAG_RW
, &disable_rtc_set
, 0, "");
1067 SYSCTL_STRUCT(_machdep
, CPU_BOOTINFO
, bootinfo
,
1068 CTLFLAG_RD
, &bootinfo
, bootinfo
, "");
1070 SYSCTL_INT(_machdep
, CPU_WALLCLOCK
, wall_cmos_clock
,
1071 CTLFLAG_RW
, &wall_cmos_clock
, 0, "");
1073 extern u_long bootdev
; /* not a cdev_t - encoding is different */
1074 SYSCTL_ULONG(_machdep
, OID_AUTO
, guessed_bootdev
,
1075 CTLFLAG_RD
, &bootdev
, 0, "Boot device (not in cdev_t format)");
1078 * Initialize 386 and configure to run kernel
1082 * Initialize segments & interrupt table
1086 union descriptor gdt
[NGDT
* MAXCPU
]; /* global descriptor table */
1087 static struct gate_descriptor idt0
[NIDT
];
1088 struct gate_descriptor
*idt
= &idt0
[0]; /* interrupt descriptor table */
1089 union descriptor ldt
[NLDT
]; /* local descriptor table */
1091 /* table descriptors - used to load tables by cpu */
1092 struct region_descriptor r_gdt
, r_idt
;
1094 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
1095 extern int has_f00f_bug
;
1098 static struct i386tss dblfault_tss
;
1099 static char dblfault_stack
[PAGE_SIZE
];
1101 extern struct user
*proc0paddr
;
1104 /* software prototypes -- in more palatable form */
1105 struct soft_segment_descriptor gdt_segs
[] = {
1106 /* GNULL_SEL 0 Null Descriptor */
1107 { 0x0, /* segment base address */
1109 0, /* segment type */
1110 0, /* segment descriptor priority level */
1111 0, /* segment descriptor present */
1113 0, /* default 32 vs 16 bit size */
1114 0 /* limit granularity (byte/page units)*/ },
1115 /* GCODE_SEL 1 Code Descriptor for kernel */
1116 { 0x0, /* segment base address */
1117 0xfffff, /* length - all address space */
1118 SDT_MEMERA
, /* segment type */
1119 0, /* segment descriptor priority level */
1120 1, /* segment descriptor present */
1122 1, /* default 32 vs 16 bit size */
1123 1 /* limit granularity (byte/page units)*/ },
1124 /* GDATA_SEL 2 Data Descriptor for kernel */
1125 { 0x0, /* segment base address */
1126 0xfffff, /* length - all address space */
1127 SDT_MEMRWA
, /* segment type */
1128 0, /* segment descriptor priority level */
1129 1, /* segment descriptor present */
1131 1, /* default 32 vs 16 bit size */
1132 1 /* limit granularity (byte/page units)*/ },
1133 /* GPRIV_SEL 3 SMP Per-Processor Private Data Descriptor */
1134 { 0x0, /* segment base address */
1135 0xfffff, /* length - all address space */
1136 SDT_MEMRWA
, /* segment type */
1137 0, /* segment descriptor priority level */
1138 1, /* segment descriptor present */
1140 1, /* default 32 vs 16 bit size */
1141 1 /* limit granularity (byte/page units)*/ },
1142 /* GPROC0_SEL 4 Proc 0 Tss Descriptor */
1144 0x0, /* segment base address */
1145 sizeof(struct i386tss
)-1,/* length - all address space */
1146 SDT_SYS386TSS
, /* segment type */
1147 0, /* segment descriptor priority level */
1148 1, /* segment descriptor present */
1150 0, /* unused - default 32 vs 16 bit size */
1151 0 /* limit granularity (byte/page units)*/ },
1152 /* GLDT_SEL 5 LDT Descriptor */
1153 { (int) ldt
, /* segment base address */
1154 sizeof(ldt
)-1, /* length - all address space */
1155 SDT_SYSLDT
, /* segment type */
1156 SEL_UPL
, /* segment descriptor priority level */
1157 1, /* segment descriptor present */
1159 0, /* unused - default 32 vs 16 bit size */
1160 0 /* limit granularity (byte/page units)*/ },
1161 /* GUSERLDT_SEL 6 User LDT Descriptor per process */
1162 { (int) ldt
, /* segment base address */
1163 (512 * sizeof(union descriptor
)-1), /* length */
1164 SDT_SYSLDT
, /* segment type */
1165 0, /* segment descriptor priority level */
1166 1, /* segment descriptor present */
1168 0, /* unused - default 32 vs 16 bit size */
1169 0 /* limit granularity (byte/page units)*/ },
1170 /* GTGATE_SEL 7 Null Descriptor - Placeholder */
1171 { 0x0, /* segment base address */
1172 0x0, /* length - all address space */
1173 0, /* segment type */
1174 0, /* segment descriptor priority level */
1175 0, /* segment descriptor present */
1177 0, /* default 32 vs 16 bit size */
1178 0 /* limit granularity (byte/page units)*/ },
1179 /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
1180 { 0x400, /* segment base address */
1181 0xfffff, /* length */
1182 SDT_MEMRWA
, /* segment type */
1183 0, /* segment descriptor priority level */
1184 1, /* segment descriptor present */
1186 1, /* default 32 vs 16 bit size */
1187 1 /* limit granularity (byte/page units)*/ },
1188 /* GPANIC_SEL 9 Panic Tss Descriptor */
1189 { (int) &dblfault_tss
, /* segment base address */
1190 sizeof(struct i386tss
)-1,/* length - all address space */
1191 SDT_SYS386TSS
, /* segment type */
1192 0, /* segment descriptor priority level */
1193 1, /* segment descriptor present */
1195 0, /* unused - default 32 vs 16 bit size */
1196 0 /* limit granularity (byte/page units)*/ },
1197 /* GBIOSCODE32_SEL 10 BIOS 32-bit interface (32bit Code) */
1198 { 0, /* segment base address (overwritten) */
1199 0xfffff, /* length */
1200 SDT_MEMERA
, /* segment type */
1201 0, /* segment descriptor priority level */
1202 1, /* segment descriptor present */
1204 0, /* default 32 vs 16 bit size */
1205 1 /* limit granularity (byte/page units)*/ },
1206 /* GBIOSCODE16_SEL 11 BIOS 32-bit interface (16bit Code) */
1207 { 0, /* segment base address (overwritten) */
1208 0xfffff, /* length */
1209 SDT_MEMERA
, /* segment type */
1210 0, /* segment descriptor priority level */
1211 1, /* segment descriptor present */
1213 0, /* default 32 vs 16 bit size */
1214 1 /* limit granularity (byte/page units)*/ },
1215 /* GBIOSDATA_SEL 12 BIOS 32-bit interface (Data) */
1216 { 0, /* segment base address (overwritten) */
1217 0xfffff, /* length */
1218 SDT_MEMRWA
, /* segment type */
1219 0, /* segment descriptor priority level */
1220 1, /* segment descriptor present */
1222 1, /* default 32 vs 16 bit size */
1223 1 /* limit granularity (byte/page units)*/ },
1224 /* GBIOSUTIL_SEL 13 BIOS 16-bit interface (Utility) */
1225 { 0, /* segment base address (overwritten) */
1226 0xfffff, /* length */
1227 SDT_MEMRWA
, /* segment type */
1228 0, /* segment descriptor priority level */
1229 1, /* segment descriptor present */
1231 0, /* default 32 vs 16 bit size */
1232 1 /* limit granularity (byte/page units)*/ },
1233 /* GBIOSARGS_SEL 14 BIOS 16-bit interface (Arguments) */
1234 { 0, /* segment base address (overwritten) */
1235 0xfffff, /* length */
1236 SDT_MEMRWA
, /* segment type */
1237 0, /* segment descriptor priority level */
1238 1, /* segment descriptor present */
1240 0, /* default 32 vs 16 bit size */
1241 1 /* limit granularity (byte/page units)*/ },
1242 /* GTLS_START 15 TLS */
1243 { 0x0, /* segment base address */
1245 0, /* segment type */
1246 0, /* segment descriptor priority level */
1247 0, /* segment descriptor present */
1249 0, /* default 32 vs 16 bit size */
1250 0 /* limit granularity (byte/page units)*/ },
1251 /* GTLS_START+1 16 TLS */
1252 { 0x0, /* segment base address */
1254 0, /* segment type */
1255 0, /* segment descriptor priority level */
1256 0, /* segment descriptor present */
1258 0, /* default 32 vs 16 bit size */
1259 0 /* limit granularity (byte/page units)*/ },
1260 /* GTLS_END 17 TLS */
1261 { 0x0, /* segment base address */
1263 0, /* segment type */
1264 0, /* segment descriptor priority level */
1265 0, /* segment descriptor present */
1267 0, /* default 32 vs 16 bit size */
1268 0 /* limit granularity (byte/page units)*/ },
/*
 * Software prototypes for the boot-time LDT.  Layout visible here:
 * three null slots (later overwritten with call gates), the user code
 * descriptor, one more null slot, and the user data descriptor.  The
 * user code/data limits are finalized later at init time from
 * VM_MAX_USER_ADDRESS (see the ssd_limit assignments further down in
 * this file) before each entry is packed into hardware form by
 * ssdtosd().
 * NOTE(review): garbled extraction -- logical source lines are split
 * across physical lines and the embedded decimal numbers are the
 * original file's line numbers; some originals (e.g. 1278, 1287, the
 * closing "};") are elided.  Code text left byte-identical; comments
 * only added.
 */
1271 static struct soft_segment_descriptor ldt_segs
[] = {
1272 /* Null Descriptor - overwritten by call gate */
1273 { 0x0, /* segment base address */
1274 0x0, /* length - all address space */
1275 0, /* segment type */
1276 0, /* segment descriptor priority level */
1277 0, /* segment descriptor present */
1279 0, /* default 32 vs 16 bit size */
1280 0 /* limit granularity (byte/page units)*/ },
1281 /* Null Descriptor - overwritten by call gate */
1282 { 0x0, /* segment base address */
1283 0x0, /* length - all address space */
1284 0, /* segment type */
1285 0, /* segment descriptor priority level */
1286 0, /* segment descriptor present */
1288 0, /* default 32 vs 16 bit size */
1289 0 /* limit granularity (byte/page units)*/ },
1290 /* Null Descriptor - overwritten by call gate */
1291 { 0x0, /* segment base address */
1292 0x0, /* length - all address space */
1293 0, /* segment type */
1294 0, /* segment descriptor priority level */
1295 0, /* segment descriptor present */
1297 0, /* default 32 vs 16 bit size */
1298 0 /* limit granularity (byte/page units)*/ },
1299 /* Code Descriptor for user */
1300 { 0x0, /* segment base address */
1301 0xfffff, /* length - all address space */
1302 SDT_MEMERA
, /* segment type */
1303 SEL_UPL
, /* segment descriptor priority level */
1304 1, /* segment descriptor present */
1306 1, /* default 32 vs 16 bit size */
1307 1 /* limit granularity (byte/page units)*/ },
1308 /* Null Descriptor - overwritten by call gate */
1309 { 0x0, /* segment base address */
1310 0x0, /* length - all address space */
1311 0, /* segment type */
1312 0, /* segment descriptor priority level */
1313 0, /* segment descriptor present */
1315 0, /* default 32 vs 16 bit size */
1316 0 /* limit granularity (byte/page units)*/ },
1317 /* Data Descriptor for user */
1318 { 0x0, /* segment base address */
1319 0xfffff, /* length - all address space */
1320 SDT_MEMRWA
, /* segment type */
1321 SEL_UPL
, /* segment descriptor priority level */
1322 1, /* segment descriptor present */
1324 1, /* default 32 vs 16 bit size */
1325 1 /* limit granularity (byte/page units)*/ },
/*
 * setidt -- install one IDT entry: point gate 'idx' at handler 'func'
 * with gate type 'typ' (trap/interrupt/task gate constant), descriptor
 * privilege level 'dpl', and code-segment selector 'selec'.  Visible
 * here: the handler offset is split into the gate's low and high
 * 16-bit halves and the selector is stored.
 * NOTE(review): garbled extraction -- original lines 1330, 1332-1333
 * and 1336-1340 (braces, the 'ip' initialization, and the type/dpl/
 * present field assignments) are elided; do not infer their content
 * from this fragment.  Code text left byte-identical; comments only.
 */
1329 setidt(int idx
, inthand_t
*func
, int typ
, int dpl
, int selec
)
1331 struct gate_descriptor
*ip
;
1334 ip
->gd_looffset
= (int)func
;
1335 ip
->gd_selector
= selec
;
1341 ip
->gd_hioffset
= ((int)func
)>>16 ;
1344 #define IDTVEC(name) __CONCAT(X,name)
1347 IDTVEC(div
), IDTVEC(dbg
), IDTVEC(nmi
), IDTVEC(bpt
), IDTVEC(ofl
),
1348 IDTVEC(bnd
), IDTVEC(ill
), IDTVEC(dna
), IDTVEC(fpusegm
),
1349 IDTVEC(tss
), IDTVEC(missing
), IDTVEC(stk
), IDTVEC(prot
),
1350 IDTVEC(page
), IDTVEC(mchk
), IDTVEC(fpu
), IDTVEC(align
),
1351 IDTVEC(xmm
), IDTVEC(syscall
),
1354 IDTVEC(int0x80_syscall
);
1356 #ifdef DEBUG_INTERRUPTS
1357 extern inthand_t
*Xrsvdary
[256];
/*
 * sdtossd -- convert a packed hardware segment descriptor (*sd) into
 * its flat software form (*ssd): the base is reassembled from the
 * hi (<<24) and lo parts, the limit from its hi (<<16) and lo parts,
 * and the remaining attribute bits (type, dpl, present, def32,
 * granularity) are copied 1:1.
 * NOTE(review): garbled extraction -- the return-type line and the
 * enclosing braces are elided; code text left byte-identical,
 * comments only added.
 */
1361 sdtossd(struct segment_descriptor
*sd
, struct soft_segment_descriptor
*ssd
)
1363 ssd
->ssd_base
= (sd
->sd_hibase
<< 24) | sd
->sd_lobase
;
1364 ssd
->ssd_limit
= (sd
->sd_hilimit
<< 16) | sd
->sd_lolimit
;
1365 ssd
->ssd_type
= sd
->sd_type
;
1366 ssd
->ssd_dpl
= sd
->sd_dpl
;
1367 ssd
->ssd_p
= sd
->sd_p
;
1368 ssd
->ssd_def32
= sd
->sd_def32
;
1369 ssd
->ssd_gran
= sd
->sd_gran
;
/*
 * NOTE(review): garbled extraction of getmemsize() -- logical source
 * lines are split across physical lines, the embedded decimal numbers
 * are original line numbers, and many originals are elided (loop
 * bodies, braces, 'continue'/'goto' statements, variable declarations
 * such as 'pa', 'pte', 'smap', 'cp', 'ep', 'page_bad', 'end').  Code
 * text is left byte-identical; comments only added/improved.
 *
 * Probe order visible below: INT 12h base memory (skipped when
 * hw.hasbrokenint12 is set), INT 15h/E820 SMAP scan via vm86 calls,
 * then INT 15h/E801, then the RTC -- finally the physmap[] entries
 * are page-tested and folded into phys_avail[].
 */
1373 * Populate the (physmap) array with base/bound pairs describing the
1374 * available physical memory in the system, then test this memory and
1375 * build the phys_avail array describing the actually-available memory.
1377 * If we cannot accurately determine the physical memory map, then use
1378 * value from the 0xE801 call, and failing that, the RTC.
1380 * Total memory size may be set by the kernel environment variable
1381 * hw.physmem or the compile-time define MAXMEM.
1384 getmemsize(int first
)
1386 int i
, physmap_idx
, pa_indx
;
1388 u_int basemem
, extmem
;
1389 struct vm86frame vmf
;
1390 struct vm86context vmc
;
1392 vm_offset_t physmap
[PHYSMAP_ENTRIES
*2];
1400 quad_t dcons_addr
, dcons_size
;
1403 TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12
);
1404 bzero(&vmf
, sizeof(struct vm86frame
));
1405 bzero(physmap
, sizeof(physmap
));
1409 * Some newer BIOSes have a broken INT 12H implementation which causes
1410 * kernel panic immediately. In this case, we need to scan SMAP
1411 * with INT 15:E820 first, then determine base memory size.
1413 if (hasbrokenint12
) {
1418 * Perform "base memory" related probes & setup. If we get a crazy
1419 * value give the bios some scribble space just in case.
1421 vm86_intcall(0x12, &vmf
);
1422 basemem
= vmf
.vmf_ax
;
1423 if (basemem
> 640) {
1424 kprintf("Preposterous BIOS basemem of %uK, "
1425 "truncating to < 640K\n", basemem
);
1430 * XXX if biosbasemem is now < 640, there is a `hole'
1431 * between the end of base memory and the start of
1432 * ISA memory. The hole may be empty or it may
1433 * contain BIOS code or data. Map it read/write so
1434 * that the BIOS can write to it. (Memory from 0 to
1435 * the physical end of the kernel is mapped read-only
1436 * to begin with and then parts of it are remapped.
1437 * The parts that aren't remapped form holes that
1438 * remain read-only and are unused by the kernel.
1439 * The base memory area is below the physical end of
1440 * the kernel and right now forms a read-only hole.
1441 * The part of it from PAGE_SIZE to
1442 * (trunc_page(biosbasemem * 1024) - 1) will be
1443 * remapped and used by the kernel later.)
1445 * This code is similar to the code used in
1446 * pmap_mapdev, but since no memory needs to be
1447 * allocated we simply change the mapping.
1449 for (pa
= trunc_page(basemem
* 1024);
1450 pa
< ISA_HOLE_START
; pa
+= PAGE_SIZE
) {
1451 pte
= vtopte(pa
+ KERNBASE
);
1452 *pte
= pa
| PG_RW
| PG_V
;
1456 * if basemem != 640, map pages r/w into vm86 page table so
1457 * that the bios can scribble on it.
1460 for (i
= basemem
/ 4; i
< 160; i
++)
1461 pte
[i
] = (i
<< PAGE_SHIFT
) | PG_V
| PG_RW
| PG_U
;
1465 * map page 1 R/W into the kernel page table so we can use it
1466 * as a buffer. The kernel will unmap this page later.
1468 pte
= vtopte(KERNBASE
+ (1 << PAGE_SHIFT
));
1469 *pte
= (1 << PAGE_SHIFT
) | PG_RW
| PG_V
;
1472 * get memory map with INT 15:E820
1474 #define SMAPSIZ sizeof(*smap)
1475 #define SMAP_SIG 0x534D4150 /* 'SMAP' */
1478 smap
= (void *)vm86_addpage(&vmc
, 1, KERNBASE
+ (1 << PAGE_SHIFT
));
1479 vm86_getptr(&vmc
, (vm_offset_t
)smap
, &vmf
.vmf_es
, &vmf
.vmf_di
);
1484 vmf
.vmf_eax
= 0xE820;
1485 vmf
.vmf_edx
= SMAP_SIG
;
1486 vmf
.vmf_ecx
= SMAPSIZ
;
1487 i
= vm86_datacall(0x15, &vmf
, &vmc
);
1488 if (i
|| vmf
.vmf_eax
!= SMAP_SIG
)
1490 if (boothowto
& RB_VERBOSE
)
1491 kprintf("SMAP type=%02x base=%08x %08x len=%08x %08x\n",
1493 *(u_int32_t
*)((char *)&smap
->base
+ 4),
1494 (u_int32_t
)smap
->base
,
1495 *(u_int32_t
*)((char *)&smap
->length
+ 4),
1496 (u_int32_t
)smap
->length
);
1498 if (smap
->type
!= 0x01)
1501 if (smap
->length
== 0)
1504 if (smap
->base
>= 0xffffffff) {
1505 kprintf("%uK of memory above 4GB ignored\n",
1506 (u_int
)(smap
->length
/ 1024));
1510 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1511 if (smap
->base
< physmap
[i
+ 1]) {
1512 if (boothowto
& RB_VERBOSE
)
1514 "Overlapping or non-montonic memory region, ignoring second region\n");
1519 if (smap
->base
== physmap
[physmap_idx
+ 1]) {
1520 physmap
[physmap_idx
+ 1] += smap
->length
;
1525 if (physmap_idx
== PHYSMAP_ENTRIES
*2) {
1527 "Too many segments in the physical address map, giving up\n");
1530 physmap
[physmap_idx
] = smap
->base
;
1531 physmap
[physmap_idx
+ 1] = smap
->base
+ smap
->length
;
1533 ; /* fix GCC3.x warning */
1534 } while (vmf
.vmf_ebx
!= 0);
1537 * Perform "base memory" related probes & setup based on SMAP
1540 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1541 if (physmap
[i
] == 0x00000000) {
1542 basemem
= physmap
[i
+ 1] / 1024;
1551 if (basemem
> 640) {
1552 kprintf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
1557 for (pa
= trunc_page(basemem
* 1024);
1558 pa
< ISA_HOLE_START
; pa
+= PAGE_SIZE
) {
1559 pte
= vtopte(pa
+ KERNBASE
);
1560 *pte
= pa
| PG_RW
| PG_V
;
1564 for (i
= basemem
/ 4; i
< 160; i
++)
1565 pte
[i
] = (i
<< PAGE_SHIFT
) | PG_V
| PG_RW
| PG_U
;
1568 if (physmap
[1] != 0)
1572 * If we failed above, try memory map with INT 15:E801
1574 vmf
.vmf_ax
= 0xE801;
1575 if (vm86_intcall(0x15, &vmf
) == 0) {
1576 extmem
= vmf
.vmf_cx
+ vmf
.vmf_dx
* 64;
1580 vm86_intcall(0x15, &vmf
);
1581 extmem
= vmf
.vmf_ax
;
1584 * Prefer the RTC value for extended memory.
1586 extmem
= rtcin(RTC_EXTLO
) + (rtcin(RTC_EXTHI
) << 8);
1591 * Special hack for chipsets that still remap the 384k hole when
1592 * there's 16MB of memory - this really confuses people that
1593 * are trying to use bus mastering ISA controllers with the
1594 * "16MB limit"; they only have 16MB, but the remapping puts
1595 * them beyond the limit.
1597 * If extended memory is between 15-16MB (16-17MB phys address range),
1600 if ((extmem
> 15 * 1024) && (extmem
< 16 * 1024))
1604 physmap
[1] = basemem
* 1024;
1606 physmap
[physmap_idx
] = 0x100000;
1607 physmap
[physmap_idx
+ 1] = physmap
[physmap_idx
] + extmem
* 1024;
1611 * Now, physmap contains a map of physical memory.
1615 /* make hole for AP bootstrap code YYY */
1616 physmap
[1] = mp_bootaddress(physmap
[1] / 1024);
1618 /* look for the MP hardware - needed for apic addresses */
1623 * Maxmem isn't the "maximum memory", it's one larger than the
1624 * highest page of the physical address space. It should be
1625 * called something like "Maxphyspage". We may adjust this
1626 * based on ``hw.physmem'' and the results of the memory test.
1628 Maxmem
= atop(physmap
[physmap_idx
+ 1]);
1631 Maxmem
= MAXMEM
/ 4;
1635 * hw.physmem is a size in bytes; we also allow k, m, and g suffixes
1636 * for the appropriate modifiers. This overrides MAXMEM.
1638 if ((cp
= kgetenv("hw.physmem")) != NULL
) {
1639 u_int64_t AllowMem
, sanity
;
1642 sanity
= AllowMem
= strtouq(cp
, &ep
, 0);
1643 if ((ep
!= cp
) && (*ep
!= 0)) {
1656 AllowMem
= sanity
= 0;
1658 if (AllowMem
< sanity
)
1662 kprintf("Ignoring invalid memory size of '%s'\n", cp
);
1664 Maxmem
= atop(AllowMem
);
1667 if (atop(physmap
[physmap_idx
+ 1]) != Maxmem
&&
1668 (boothowto
& RB_VERBOSE
))
1669 kprintf("Physical memory use set to %lluK\n", Maxmem
* 4);
1672 * If Maxmem has been increased beyond what the system has detected,
1673 * extend the last memory segment to the new limit.
1675 if (atop(physmap
[physmap_idx
+ 1]) < Maxmem
)
1676 physmap
[physmap_idx
+ 1] = ptoa(Maxmem
);
1678 /* call pmap initialization to make new kernel address space */
1679 pmap_bootstrap(first
, 0);
1682 * Size up each available chunk of physical memory.
1684 physmap
[0] = PAGE_SIZE
; /* mask off page 0 */
1686 phys_avail
[pa_indx
++] = physmap
[0];
1687 phys_avail
[pa_indx
] = physmap
[0];
1691 * Get dcons buffer address
1693 if (kgetenv_quad("dcons.addr", &dcons_addr
) == 0 ||
1694 kgetenv_quad("dcons.size", &dcons_size
) == 0)
1698 * physmap is in bytes, so when converting to page boundaries,
1699 * round up the start address and round down the end address.
1701 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1705 if (physmap
[i
+ 1] < end
)
1706 end
= trunc_page(physmap
[i
+ 1]);
1707 for (pa
= round_page(physmap
[i
]); pa
< end
; pa
+= PAGE_SIZE
) {
1712 int *ptr
= (int *)CADDR1
;
1716 * block out kernel memory as not available.
1718 if (pa
>= 0x100000 && pa
< first
)
1722 * block out dcons buffer
1725 && pa
>= trunc_page(dcons_addr
)
1726 && pa
< dcons_addr
+ dcons_size
)
1732 * map page into kernel: valid, read/write,non-cacheable
1734 *pte
= pa
| PG_V
| PG_RW
| PG_N
;
1739 * Test for alternating 1's and 0's
1741 *(volatile int *)ptr
= 0xaaaaaaaa;
1742 if (*(volatile int *)ptr
!= 0xaaaaaaaa) {
1746 * Test for alternating 0's and 1's
1748 *(volatile int *)ptr
= 0x55555555;
1749 if (*(volatile int *)ptr
!= 0x55555555) {
1755 *(volatile int *)ptr
= 0xffffffff;
1756 if (*(volatile int *)ptr
!= 0xffffffff) {
1762 *(volatile int *)ptr
= 0x0;
1763 if (*(volatile int *)ptr
!= 0x0) {
1767 * Restore original value.
1772 * Adjust array of valid/good pages.
1774 if (page_bad
== TRUE
) {
1778 * If this good page is a continuation of the
1779 * previous set of good pages, then just increase
1780 * the end pointer. Otherwise start a new chunk.
1781 * Note that "end" points one higher than end,
1782 * making the range >= start and < end.
1783 * If we're also doing a speculative memory
1784 * test and we are at or past the end, bump up Maxmem
1785 * so that we keep going. The first bad page
1786 * will terminate the loop.
1788 if (phys_avail
[pa_indx
] == pa
) {
1789 phys_avail
[pa_indx
] += PAGE_SIZE
;
1792 if (pa_indx
>= PHYSMAP_ENTRIES
*2) {
1793 kprintf("Too many holes in the physical address space, giving up\n");
1797 phys_avail
[pa_indx
++] = pa
; /* start */
1798 phys_avail
[pa_indx
] = pa
+ PAGE_SIZE
; /* end */
1808 * The last chunk must contain at least one page plus the message
1809 * buffer to avoid complicating other code (message buffer address
1810 * calculation, etc.).
1812 while (phys_avail
[pa_indx
- 1] + PAGE_SIZE
+
1813 round_page(MSGBUF_SIZE
) >= phys_avail
[pa_indx
]) {
1814 physmem
-= atop(phys_avail
[pa_indx
] - phys_avail
[pa_indx
- 1]);
1815 phys_avail
[pa_indx
--] = 0;
1816 phys_avail
[pa_indx
--] = 0;
1819 Maxmem
= atop(phys_avail
[pa_indx
]);
1821 /* Trim off space for the message buffer. */
1822 phys_avail
[pa_indx
] -= round_page(MSGBUF_SIZE
);
1824 avail_end
= phys_avail
[pa_indx
];
1836 * 7 Device Not Available (x87)
1838 * 9 Coprocessor Segment overrun (unsupported, reserved)
1840 * 11 Segment not present
1842 * 13 General Protection
1845 * 16 x87 FP Exception pending
1846 * 17 Alignment Check
1848 * 19 SIMD floating point
1850 * 32-255 INTn/external sources
1855 struct gate_descriptor
*gdp
;
1856 int gsel_tss
, metadata_missing
, off
, x
;
1857 struct mdglobaldata
*gd
;
1860 * Prevent lowering of the ipl if we call tsleep() early.
1862 gd
= &CPU_prvspace
[0].mdglobaldata
;
1863 bzero(gd
, sizeof(*gd
));
1865 gd
->mi
.gd_curthread
= &thread0
;
1866 thread0
.td_gd
= &gd
->mi
;
1868 atdevbase
= ISA_HOLE_START
+ KERNBASE
;
1870 metadata_missing
= 0;
1871 if (bootinfo
.bi_modulep
) {
1872 preload_metadata
= (caddr_t
)bootinfo
.bi_modulep
+ KERNBASE
;
1873 preload_bootstrap_relocate(KERNBASE
);
1875 metadata_missing
= 1;
1877 if (bootinfo
.bi_envp
)
1878 kern_envp
= (caddr_t
)bootinfo
.bi_envp
+ KERNBASE
;
1881 * start with one cpu. Note: ncpus2_shift and ncpus2_mask are left
1886 /* Init basic tunables, hz etc */
1890 * make gdt memory segments, the code segment goes up to end of the
1891 * page with etext in it, the data segment goes to the end of
1895 * XXX text protection is temporarily (?) disabled. The limit was
1896 * i386_btop(round_page(etext)) - 1.
1898 gdt_segs
[GCODE_SEL
].ssd_limit
= atop(0 - 1);
1899 gdt_segs
[GDATA_SEL
].ssd_limit
= atop(0 - 1);
1901 gdt_segs
[GPRIV_SEL
].ssd_limit
=
1902 atop(sizeof(struct privatespace
) - 1);
1903 gdt_segs
[GPRIV_SEL
].ssd_base
= (int) &CPU_prvspace
[0];
1904 gdt_segs
[GPROC0_SEL
].ssd_base
=
1905 (int) &CPU_prvspace
[0].mdglobaldata
.gd_common_tss
;
1907 gd
->mi
.gd_prvspace
= &CPU_prvspace
[0];
1910 * Note: on both UP and SMP curthread must be set non-NULL
1911 * early in the boot sequence because the system assumes
1912 * that 'curthread' is never NULL.
1915 for (x
= 0; x
< NGDT
; x
++) {
1917 /* avoid overwriting db entries with APM ones */
1918 if (x
>= GAPMCODE32_SEL
&& x
<= GAPMDATA_SEL
)
1921 ssdtosd(&gdt_segs
[x
], &gdt
[x
].sd
);
1924 r_gdt
.rd_limit
= NGDT
* sizeof(gdt
[0]) - 1;
1925 r_gdt
.rd_base
= (int) gdt
;
1928 mi_gdinit(&gd
->mi
, 0);
1930 mi_proc0init(&gd
->mi
, proc0paddr
);
1931 safepri
= TDPRI_MAX
;
1933 /* make ldt memory segments */
1935 * XXX - VM_MAX_USER_ADDRESS is an end address, not a max. And it
1936 * should be spelled ...MAX_USER...
1938 ldt_segs
[LUCODE_SEL
].ssd_limit
= atop(VM_MAX_USER_ADDRESS
- 1);
1939 ldt_segs
[LUDATA_SEL
].ssd_limit
= atop(VM_MAX_USER_ADDRESS
- 1);
1940 for (x
= 0; x
< sizeof ldt_segs
/ sizeof ldt_segs
[0]; x
++)
1941 ssdtosd(&ldt_segs
[x
], &ldt
[x
].sd
);
1943 _default_ldt
= GSEL(GLDT_SEL
, SEL_KPL
);
1945 gd
->gd_currentldt
= _default_ldt
;
1946 /* spinlocks and the BGL */
1950 * Setup the hardware exception table. Most exceptions use
1951 * SDT_SYS386TGT, known as a 'trap gate'. Trap gates leave
1952 * interrupts enabled. VM page faults use SDT_SYS386IGT, known as
1953 * an 'interrupt trap gate', which disables interrupts on entry,
1954 * in order to be able to poll the appropriate CRn register to
1955 * determine the fault address.
1957 for (x
= 0; x
< NIDT
; x
++) {
1958 #ifdef DEBUG_INTERRUPTS
1959 setidt(x
, Xrsvdary
[x
], SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1961 setidt(x
, &IDTVEC(rsvd0
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1964 setidt(0, &IDTVEC(div
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1965 setidt(1, &IDTVEC(dbg
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1966 setidt(2, &IDTVEC(nmi
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1967 setidt(3, &IDTVEC(bpt
), SDT_SYS386TGT
, SEL_UPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1968 setidt(4, &IDTVEC(ofl
), SDT_SYS386TGT
, SEL_UPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1969 setidt(5, &IDTVEC(bnd
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1970 setidt(6, &IDTVEC(ill
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1971 setidt(7, &IDTVEC(dna
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1972 setidt(8, 0, SDT_SYSTASKGT
, SEL_KPL
, GSEL(GPANIC_SEL
, SEL_KPL
));
1973 setidt(9, &IDTVEC(fpusegm
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1974 setidt(10, &IDTVEC(tss
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1975 setidt(11, &IDTVEC(missing
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1976 setidt(12, &IDTVEC(stk
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1977 setidt(13, &IDTVEC(prot
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1978 setidt(14, &IDTVEC(page
), SDT_SYS386IGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1979 setidt(15, &IDTVEC(rsvd0
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1980 setidt(16, &IDTVEC(fpu
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1981 setidt(17, &IDTVEC(align
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1982 setidt(18, &IDTVEC(mchk
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1983 setidt(19, &IDTVEC(xmm
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1984 setidt(0x80, &IDTVEC(int0x80_syscall
),
1985 SDT_SYS386TGT
, SEL_UPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1987 r_idt
.rd_limit
= sizeof(idt0
) - 1;
1988 r_idt
.rd_base
= (int) idt
;
1992 * Initialize the console before we print anything out.
1996 if (metadata_missing
)
1997 kprintf("WARNING: loader(8) metadata is missing!\n");
2006 if (boothowto
& RB_KDB
)
2007 Debugger("Boot flags requested debugger");
2010 finishidentcpu(); /* Final stage of CPU initialization */
2011 setidt(6, &IDTVEC(ill
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2012 setidt(13, &IDTVEC(prot
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2013 initializecpu(); /* Initialize CPU registers */
2016 * make an initial tss so cpu can get interrupt stack on syscall!
2017 * The 16 bytes is to save room for a VM86 context.
2019 gd
->gd_common_tss
.tss_esp0
= (int) thread0
.td_pcb
- 16;
2020 gd
->gd_common_tss
.tss_ss0
= GSEL(GDATA_SEL
, SEL_KPL
) ;
2021 gsel_tss
= GSEL(GPROC0_SEL
, SEL_KPL
);
2022 gd
->gd_tss_gdt
= &gdt
[GPROC0_SEL
].sd
;
2023 gd
->gd_common_tssd
= *gd
->gd_tss_gdt
;
2024 gd
->gd_common_tss
.tss_ioopt
= (sizeof gd
->gd_common_tss
) << 16;
2027 dblfault_tss
.tss_esp
= dblfault_tss
.tss_esp0
= dblfault_tss
.tss_esp1
=
2028 dblfault_tss
.tss_esp2
= (int) &dblfault_stack
[sizeof(dblfault_stack
)];
2029 dblfault_tss
.tss_ss
= dblfault_tss
.tss_ss0
= dblfault_tss
.tss_ss1
=
2030 dblfault_tss
.tss_ss2
= GSEL(GDATA_SEL
, SEL_KPL
);
2031 dblfault_tss
.tss_cr3
= (int)IdlePTD
;
2032 dblfault_tss
.tss_eip
= (int) dblfault_handler
;
2033 dblfault_tss
.tss_eflags
= PSL_KERNEL
;
2034 dblfault_tss
.tss_ds
= dblfault_tss
.tss_es
=
2035 dblfault_tss
.tss_gs
= GSEL(GDATA_SEL
, SEL_KPL
);
2036 dblfault_tss
.tss_fs
= GSEL(GPRIV_SEL
, SEL_KPL
);
2037 dblfault_tss
.tss_cs
= GSEL(GCODE_SEL
, SEL_KPL
);
2038 dblfault_tss
.tss_ldt
= GSEL(GLDT_SEL
, SEL_KPL
);
2042 init_param2(physmem
);
2044 /* now running on new page tables, configured,and u/iom is accessible */
2046 /* Map the message buffer. */
2047 for (off
= 0; off
< round_page(MSGBUF_SIZE
); off
+= PAGE_SIZE
)
2048 pmap_kenter((vm_offset_t
)msgbufp
+ off
, avail_end
+ off
);
2050 msgbufinit(msgbufp
, MSGBUF_SIZE
);
2052 /* make a call gate to reenter kernel with */
2053 gdp
= &ldt
[LSYS5CALLS_SEL
].gd
;
2055 x
= (int) &IDTVEC(syscall
);
2056 gdp
->gd_looffset
= x
++;
2057 gdp
->gd_selector
= GSEL(GCODE_SEL
,SEL_KPL
);
2059 gdp
->gd_type
= SDT_SYS386CGT
;
2060 gdp
->gd_dpl
= SEL_UPL
;
2062 gdp
->gd_hioffset
= ((int) &IDTVEC(syscall
)) >>16;
2064 /* XXX does this work? */
2065 ldt
[LBSDICALLS_SEL
] = ldt
[LSYS5CALLS_SEL
];
2066 ldt
[LSOL26CALLS_SEL
] = ldt
[LSYS5CALLS_SEL
];
2068 /* transfer to user mode */
2070 _ucodesel
= LSEL(LUCODE_SEL
, SEL_UPL
);
2071 _udatasel
= LSEL(LUDATA_SEL
, SEL_UPL
);
2073 /* setup proc 0's pcb */
2074 thread0
.td_pcb
->pcb_flags
= 0;
2075 thread0
.td_pcb
->pcb_cr3
= (int)IdlePTD
; /* should already be setup */
2076 thread0
.td_pcb
->pcb_ext
= 0;
2077 proc0
.p_lwp
.lwp_md
.md_regs
= &proc0_tf
;
/*
 * cpu_gdinit -- per-cpu globaldata/idle-thread setup.  Visible here:
 * the idle thread is made curthread, initialized on the per-cpu
 * private-space idle stack (TDF_MPSAFE), named "idle_%d", given
 * cpu_lwkt_switch as its switch function, and its stack is primed so
 * the first switch lands in cpu_idle_restore.
 * NOTE(review): garbled extraction -- return-type line, braces and
 * some interior lines are elided.  Code text left byte-identical;
 * comments only added/improved.
 */
2081 * Initialize machine-dependent portions of the global data structure.
2082 * Note that the global data area and cpu0's idlestack in the private
2083 * data space were allocated in locore.
2085 * Note: the idlethread's cpl is 0
2087 * WARNING! Called from early boot, 'mycpu' may not work yet.
2090 cpu_gdinit(struct mdglobaldata
*gd
, int cpu
)
2093 gd
->mi
.gd_curthread
= &gd
->mi
.gd_idlethread
;
2095 lwkt_init_thread(&gd
->mi
.gd_idlethread
,
2096 gd
->mi
.gd_prvspace
->idlestack
,
2097 sizeof(gd
->mi
.gd_prvspace
->idlestack
),
2098 TDF_MPSAFE
, &gd
->mi
);
2099 lwkt_set_comm(&gd
->mi
.gd_idlethread
, "idle_%d", cpu
);
2100 gd
->mi
.gd_idlethread
.td_switch
= cpu_lwkt_switch
;
2101 gd
->mi
.gd_idlethread
.td_sp
-= sizeof(void *);
2102 *(void **)gd
->mi
.gd_idlethread
.td_sp
= cpu_idle_restore
;
/*
 * is_globaldata_space -- test whether [saddr, eaddr) lies entirely
 * within the per-cpu private space array (CPU_prvspace[0] up to
 * CPU_prvspace[MAXCPU]).
 * NOTE(review): garbled extraction -- the return type and the
 * return statements of both branches are elided; presumably it
 * returns nonzero inside the range -- confirm against the original.
 * Code text left byte-identical; comments only added.
 */
2106 is_globaldata_space(vm_offset_t saddr
, vm_offset_t eaddr
)
2108 if (saddr
>= (vm_offset_t
)&CPU_prvspace
[0] &&
2109 eaddr
<= (vm_offset_t
)&CPU_prvspace
[MAXCPU
]) {
/*
 * globaldata_find -- return the globaldata (mi) of the given cpu's
 * private space; asserts 0 <= cpu < ncpus.
 * NOTE(review): garbled extraction -- return-type line and braces
 * elided.  Code text left byte-identical; comments only added.
 */
2116 globaldata_find(int cpu
)
2118 KKASSERT(cpu
>= 0 && cpu
< ncpus
);
2119 return(&CPU_prvspace
[cpu
].mdglobaldata
.mi
);
/*
 * f00f_hack -- Pentium F00F erratum workaround, registered via SYSINIT
 * at SI_SUB_INTRINSIC.  Visible here: allocates two pages, relocates
 * the IDT so its first seven entries end at a page boundary, and
 * write-protects the lower page so the lock-prefixed CMPXCHG8B decode
 * bug faults safely.
 * NOTE(review): garbled extraction -- several interior lines (the
 * CPU-type check, 'tmp' declaration, lidt reload) are elided.  Code
 * text left byte-identical; comments only added.
 */
2122 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
2123 static void f00f_hack(void *unused
);
2124 SYSINIT(f00f_hack
, SI_SUB_INTRINSIC
, SI_ORDER_FIRST
, f00f_hack
, NULL
);
2127 f00f_hack(void *unused
)
2129 struct gate_descriptor
*new_idt
;
2135 kprintf("Intel Pentium detected, installing workaround for F00F bug\n");
2137 r_idt
.rd_limit
= sizeof(idt0
) - 1;
2139 tmp
= kmem_alloc(&kernel_map
, PAGE_SIZE
* 2);
2141 panic("kmem_alloc returned 0");
2142 if (((unsigned int)tmp
& (PAGE_SIZE
-1)) != 0)
2143 panic("kmem_alloc returned non-page-aligned memory");
2144 /* Put the first seven entries in the lower page */
2145 new_idt
= (struct gate_descriptor
*)(tmp
+ PAGE_SIZE
- (7*8));
2146 bcopy(idt
, new_idt
, sizeof(idt0
));
2147 r_idt
.rd_base
= (int)new_idt
;
2150 if (vm_map_protect(&kernel_map
, tmp
, tmp
+ PAGE_SIZE
,
2151 VM_PROT_READ
, FALSE
) != KERN_SUCCESS
)
2152 panic("vm_map_protect failed");
2155 #endif /* defined(I586_CPU) && !NO_F00F_HACK */
/*
 * ptrace_set_pc -- set the traced process's program counter by writing
 * 'addr' into the EIP slot of its saved trapframe.
 * NOTE(review): garbled extraction -- return-type line, braces and the
 * return statement are elided.  Code text byte-identical; comments only.
 */
2158 ptrace_set_pc(struct proc
*p
, unsigned long addr
)
2160 p
->p_md
.md_regs
->tf_eip
= addr
;
/*
 * ptrace_single_step -- arm hardware single-stepping for the lwp by
 * setting the trap flag (PSL_T) in its saved EFLAGS.
 * NOTE(review): garbled extraction -- return-type line, braces and the
 * return statement are elided.  Code text byte-identical; comments only.
 */
2165 ptrace_single_step(struct lwp
*lp
)
2167 lp
->lwp_md
.md_regs
->tf_eflags
|= PSL_T
;
/*
 * fill_regs -- copy the lwp's saved trapframe (segment registers,
 * general registers, EIP/CS/EFLAGS/ESP/SS) into the user-visible
 * struct reg for ptrace/core-dump consumers.
 * NOTE(review): garbled extraction -- braces, the 'pcb' declaration
 * and the lines following the final 'pcb = ...' assignment are
 * elided.  Code text left byte-identical; comments only added.
 */
2172 fill_regs(struct lwp
*lp
, struct reg
*regs
)
2175 struct trapframe
*tp
;
2177 tp
= lp
->lwp_md
.md_regs
;
2178 regs
->r_gs
= tp
->tf_gs
;
2179 regs
->r_fs
= tp
->tf_fs
;
2180 regs
->r_es
= tp
->tf_es
;
2181 regs
->r_ds
= tp
->tf_ds
;
2182 regs
->r_edi
= tp
->tf_edi
;
2183 regs
->r_esi
= tp
->tf_esi
;
2184 regs
->r_ebp
= tp
->tf_ebp
;
2185 regs
->r_ebx
= tp
->tf_ebx
;
2186 regs
->r_edx
= tp
->tf_edx
;
2187 regs
->r_ecx
= tp
->tf_ecx
;
2188 regs
->r_eax
= tp
->tf_eax
;
2189 regs
->r_eip
= tp
->tf_eip
;
2190 regs
->r_cs
= tp
->tf_cs
;
2191 regs
->r_eflags
= tp
->tf_eflags
;
2192 regs
->r_esp
= tp
->tf_esp
;
2193 regs
->r_ss
= tp
->tf_ss
;
2194 pcb
= lp
->lwp_thread
->td_pcb
;
/*
 * set_regs -- install a user-supplied struct reg into the lwp's saved
 * trapframe.  EFLAGS and CS are vetted first via EFL_SECURE/CS_SECURE
 * so userland cannot smuggle in privileged flag bits or a kernel code
 * selector.
 * NOTE(review): garbled extraction -- the error-return branch after
 * the security check, braces, the 'pcb' declaration and the function
 * tail are elided.  Code text left byte-identical; comments only.
 */
2199 set_regs(struct lwp
*lp
, struct reg
*regs
)
2202 struct trapframe
*tp
;
2204 tp
= lp
->lwp_md
.md_regs
;
2205 if (!EFL_SECURE(regs
->r_eflags
, tp
->tf_eflags
) ||
2206 !CS_SECURE(regs
->r_cs
))
2208 tp
->tf_gs
= regs
->r_gs
;
2209 tp
->tf_fs
= regs
->r_fs
;
2210 tp
->tf_es
= regs
->r_es
;
2211 tp
->tf_ds
= regs
->r_ds
;
2212 tp
->tf_edi
= regs
->r_edi
;
2213 tp
->tf_esi
= regs
->r_esi
;
2214 tp
->tf_ebp
= regs
->r_ebp
;
2215 tp
->tf_ebx
= regs
->r_ebx
;
2216 tp
->tf_edx
= regs
->r_edx
;
2217 tp
->tf_ecx
= regs
->r_ecx
;
2218 tp
->tf_eax
= regs
->r_eax
;
2219 tp
->tf_eip
= regs
->r_eip
;
2220 tp
->tf_cs
= regs
->r_cs
;
2221 tp
->tf_eflags
= regs
->r_eflags
;
2222 tp
->tf_esp
= regs
->r_esp
;
2223 tp
->tf_ss
= regs
->r_ss
;
2224 pcb
= lp
->lwp_thread
->td_pcb
;
2228 #ifndef CPU_DISABLE_SSE
/*
 * fill_fpregs_xmm -- down-convert an FXSAVE-format FPU save area
 * (struct savexmm) into the legacy FNSAVE layout (struct save87):
 * copies the control/status environment fields 1:1, then each of the
 * eight x87 register accumulators, then the saved exception status.
 * NOTE(review): garbled extraction -- return-type line, braces, the
 * 'i' declaration and a bzero/clear of the destination (orig. lines
 * 2234-2235) are elided.  Code text byte-identical; comments only.
 */
2230 fill_fpregs_xmm(struct savexmm
*sv_xmm
, struct save87
*sv_87
)
2232 struct env87
*penv_87
= &sv_87
->sv_env
;
2233 struct envxmm
*penv_xmm
= &sv_xmm
->sv_env
;
2236 /* FPU control/status */
2237 penv_87
->en_cw
= penv_xmm
->en_cw
;
2238 penv_87
->en_sw
= penv_xmm
->en_sw
;
2239 penv_87
->en_tw
= penv_xmm
->en_tw
;
2240 penv_87
->en_fip
= penv_xmm
->en_fip
;
2241 penv_87
->en_fcs
= penv_xmm
->en_fcs
;
2242 penv_87
->en_opcode
= penv_xmm
->en_opcode
;
2243 penv_87
->en_foo
= penv_xmm
->en_foo
;
2244 penv_87
->en_fos
= penv_xmm
->en_fos
;
2247 for (i
= 0; i
< 8; ++i
)
2248 sv_87
->sv_ac
[i
] = sv_xmm
->sv_fp
[i
].fp_acc
;
2250 sv_87
->sv_ex_sw
= sv_xmm
->sv_ex_sw
;
/*
 * set_fpregs_xmm -- inverse of fill_fpregs_xmm: up-convert a legacy
 * FNSAVE-format save area (struct save87) into the FXSAVE layout
 * (struct savexmm): environment fields, the eight x87 accumulators,
 * then the saved exception status.
 * NOTE(review): garbled extraction -- return-type line, braces and
 * the 'i' declaration are elided.  Code text byte-identical;
 * comments only added.
 */
2254 set_fpregs_xmm(struct save87
*sv_87
, struct savexmm
*sv_xmm
)
2256 struct env87
*penv_87
= &sv_87
->sv_env
;
2257 struct envxmm
*penv_xmm
= &sv_xmm
->sv_env
;
2260 /* FPU control/status */
2261 penv_xmm
->en_cw
= penv_87
->en_cw
;
2262 penv_xmm
->en_sw
= penv_87
->en_sw
;
2263 penv_xmm
->en_tw
= penv_87
->en_tw
;
2264 penv_xmm
->en_fip
= penv_87
->en_fip
;
2265 penv_xmm
->en_fcs
= penv_87
->en_fcs
;
2266 penv_xmm
->en_opcode
= penv_87
->en_opcode
;
2267 penv_xmm
->en_foo
= penv_87
->en_foo
;
2268 penv_xmm
->en_fos
= penv_87
->en_fos
;
2271 for (i
= 0; i
< 8; ++i
)
2272 sv_xmm
->sv_fp
[i
].fp_acc
= sv_87
->sv_ac
[i
];
2274 sv_xmm
->sv_ex_sw
= sv_87
->sv_ex_sw
;
2276 #endif /* CPU_DISABLE_SSE */
/*
 * fill_fpregs -- export the lwp's FPU state into a struct fpreg.
 * When SSE is compiled in, converts from the pcb's FXSAVE area via
 * fill_fpregs_xmm; the bcopy path copies the raw FNSAVE area.
 * NOTE(review): garbled extraction -- return-type line, braces, the
 * SSE runtime check guarding the xmm path and the return statement
 * are elided.  Code text byte-identical; comments only added.
 */
2279 fill_fpregs(struct lwp
*lp
, struct fpreg
*fpregs
)
2281 #ifndef CPU_DISABLE_SSE
2283 fill_fpregs_xmm(&lp
->lwp_thread
->td_pcb
->pcb_save
.sv_xmm
,
2284 (struct save87
*)fpregs
);
2287 #endif /* CPU_DISABLE_SSE */
2288 bcopy(&lp
->lwp_thread
->td_pcb
->pcb_save
.sv_87
, fpregs
, sizeof *fpregs
);
/*
 * set_fpregs -- import a struct fpreg into the lwp's FPU save area.
 * When SSE is compiled in, converts into the pcb's FXSAVE area via
 * set_fpregs_xmm; the bcopy path overwrites the raw FNSAVE area.
 * NOTE(review): garbled extraction -- return-type line, braces, the
 * SSE runtime check guarding the xmm path and the return statement
 * are elided.  Code text byte-identical; comments only added.
 */
2293 set_fpregs(struct lwp
*lp
, struct fpreg
*fpregs
)
2295 #ifndef CPU_DISABLE_SSE
2297 set_fpregs_xmm((struct save87
*)fpregs
,
2298 &lp
->lwp_thread
->td_pcb
->pcb_save
.sv_xmm
);
2301 #endif /* CPU_DISABLE_SSE */
2302 bcopy(fpregs
, &lp
->lwp_thread
->td_pcb
->pcb_save
.sv_87
, sizeof *fpregs
);
/*
 * fill_dbregs -- export debug registers into a struct dbreg.  Two
 * paths are visible: one reads the live DR0-DR7 via rdrN(), the other
 * reads the values saved in the lwp's pcb (pcb_dr0..dr3, dr6, dr7).
 * NOTE(review): garbled extraction -- the branch condition selecting
 * between the live and pcb paths (presumably lp == NULL -- confirm
 * against the original), braces, the 'pcb' declaration, the dr4/dr5
 * handling in the pcb path and the return statements are elided.
 * Code text left byte-identical; comments only added.
 */
2307 fill_dbregs(struct lwp
*lp
, struct dbreg
*dbregs
)
2310 dbregs
->dr0
= rdr0();
2311 dbregs
->dr1
= rdr1();
2312 dbregs
->dr2
= rdr2();
2313 dbregs
->dr3
= rdr3();
2314 dbregs
->dr4
= rdr4();
2315 dbregs
->dr5
= rdr5();
2316 dbregs
->dr6
= rdr6();
2317 dbregs
->dr7
= rdr7();
2321 pcb
= lp
->lwp_thread
->td_pcb
;
2322 dbregs
->dr0
= pcb
->pcb_dr0
;
2323 dbregs
->dr1
= pcb
->pcb_dr1
;
2324 dbregs
->dr2
= pcb
->pcb_dr2
;
2325 dbregs
->dr3
= pcb
->pcb_dr3
;
2328 dbregs
->dr6
= pcb
->pcb_dr6
;
2329 dbregs
->dr7
= pcb
->pcb_dr7
;
/*
 * set_dbregs -- install debug registers from a struct dbreg.  Two
 * paths are visible: one loads the live DR0-DR7 via load_drN(), the
 * other validates the request and stores into the lwp's pcb:
 *   - rejects dr7 bit patterns that are architecturally undefined
 *     (the mask1/mask2 loop walks the per-register R/W field pairs);
 *   - for non-root callers (suser_cred != 0), rejects any enabled
 *     breakpoint whose address is at or above VM_MAX_USER_ADDRESS so
 *     userland cannot plant watchpoints on kernel addresses;
 *   - finally saves dr0-dr3/dr6/dr7 into the pcb and sets PCB_DBREGS.
 * NOTE(review): garbled extraction -- the branch condition selecting
 * between the two paths, the EINVAL returns, braces and several
 * declarations ('i', 'pcb') are elided.  Code text left
 * byte-identical; comments only added.
 */
2335 set_dbregs(struct lwp
*lp
, struct dbreg
*dbregs
)
2338 load_dr0(dbregs
->dr0
);
2339 load_dr1(dbregs
->dr1
);
2340 load_dr2(dbregs
->dr2
);
2341 load_dr3(dbregs
->dr3
);
2342 load_dr4(dbregs
->dr4
);
2343 load_dr5(dbregs
->dr5
);
2344 load_dr6(dbregs
->dr6
);
2345 load_dr7(dbregs
->dr7
);
2348 struct ucred
*ucred
;
2350 uint32_t mask1
, mask2
;
2353 * Don't let an illegal value for dr7 get set. Specifically,
2354 * check for undefined settings. Setting these bit patterns
2355 * result in undefined behaviour and can lead to an unexpected
2358 for (i
= 0, mask1
= 0x3<<16, mask2
= 0x2<<16; i
< 8;
2359 i
++, mask1
<<= 2, mask2
<<= 2)
2360 if ((dbregs
->dr7
& mask1
) == mask2
)
2363 pcb
= lp
->lwp_thread
->td_pcb
;
2364 ucred
= lp
->lwp_proc
->p_ucred
;
2367 * Don't let a process set a breakpoint that is not within the
2368 * process's address space. If a process could do this, it
2369 * could halt the system by setting a breakpoint in the kernel
2370 * (if ddb was enabled). Thus, we need to check to make sure
2371 * that no breakpoints are being enabled for addresses outside
2372 * process's address space, unless, perhaps, we were called by
2375 * XXX - what about when the watched area of the user's
2376 * address space is written into from within the kernel
2377 * ... wouldn't that still cause a breakpoint to be generated
2378 * from within kernel mode?
2381 if (suser_cred(ucred
, 0) != 0) {
2382 if (dbregs
->dr7
& 0x3) {
2383 /* dr0 is enabled */
2384 if (dbregs
->dr0
>= VM_MAX_USER_ADDRESS
)
2388 if (dbregs
->dr7
& (0x3<<2)) {
2389 /* dr1 is enabled */
2390 if (dbregs
->dr1
>= VM_MAX_USER_ADDRESS
)
2394 if (dbregs
->dr7
& (0x3<<4)) {
2395 /* dr2 is enabled */
2396 if (dbregs
->dr2
>= VM_MAX_USER_ADDRESS
)
2400 if (dbregs
->dr7
& (0x3<<6)) {
2401 /* dr3 is enabled */
2402 if (dbregs
->dr3
>= VM_MAX_USER_ADDRESS
)
2407 pcb
->pcb_dr0
= dbregs
->dr0
;
2408 pcb
->pcb_dr1
= dbregs
->dr1
;
2409 pcb
->pcb_dr2
= dbregs
->dr2
;
2410 pcb
->pcb_dr3
= dbregs
->dr3
;
2411 pcb
->pcb_dr6
= dbregs
->dr6
;
2412 pcb
->pcb_dr7
= dbregs
->dr7
;
2414 pcb
->pcb_flags
|= PCB_DBREGS
;
2421 * Return > 0 if a hardware breakpoint has been hit, and the
2422 * breakpoint was in user space. Return 0, otherwise.
2425 user_dbreg_trap(void)
2427 u_int32_t dr7
, dr6
; /* debug registers dr6 and dr7 */
2428 u_int32_t bp
; /* breakpoint bits extracted from dr6 */
2429 int nbp
; /* number of breakpoints that triggered */
2430 caddr_t addr
[4]; /* breakpoint addresses */
2434 if ((dr7
& 0x000000ff) == 0) {
2436 * all GE and LE bits in the dr7 register are zero,
2437 * thus the trap couldn't have been caused by the
2438 * hardware debug registers
2445 bp
= dr6
& 0x0000000f;
2449 * None of the breakpoint bits are set meaning this
2450 * trap was not caused by any of the debug registers
2456 * at least one of the breakpoints were hit, check to see
2457 * which ones and if any of them are user space addresses
2461 addr
[nbp
++] = (caddr_t
)rdr0();
2464 addr
[nbp
++] = (caddr_t
)rdr1();
2467 addr
[nbp
++] = (caddr_t
)rdr2();
2470 addr
[nbp
++] = (caddr_t
)rdr3();
2473 for (i
=0; i
<nbp
; i
++) {
2475 (caddr_t
)VM_MAX_USER_ADDRESS
) {
2477 * addr[i] is in user space
2484 * None of the breakpoints are in user space.
/*
 * Stub debugger entry point: just report that we were called.  The
 * real implementation is provided by DDB when it is configured in.
 */
void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}
2498 #include <sys/disklabel.h>
2501 * Determine the size of the transfer, and make sure it is
2502 * within the boundaries of the partition. Adjust transfer
2503 * if needed, and signal errors or early completion.
2505 * On success a new bio layer is pushed with the translated
2506 * block number, and returned.
2509 bounds_check_with_label(cdev_t dev
, struct bio
*bio
,
2510 struct disklabel
*lp
, int wlabel
)
2513 struct buf
*bp
= bio
->bio_buf
;
2514 struct partition
*p
= lp
->d_partitions
+ dkpart(dev
);
2515 int labelsect
= lp
->d_partitions
[0].p_offset
;
2516 int maxsz
= p
->p_size
,
2517 sz
= (bp
->b_bcount
+ DEV_BSIZE
- 1) >> DEV_BSHIFT
;
2518 daddr_t blkno
= (daddr_t
)(bio
->bio_offset
>> DEV_BSHIFT
);
2520 /* overwriting disk label ? */
2521 /* XXX should also protect bootstrap in first 8K */
2522 if (blkno
+ p
->p_offset
<= LABELSECTOR
+ labelsect
&&
2523 #if LABELSECTOR != 0
2524 blkno
+ p
->p_offset
+ sz
> LABELSECTOR
+ labelsect
&&
2526 bp
->b_cmd
!= BUF_CMD_READ
&& wlabel
== 0) {
2527 bp
->b_error
= EROFS
;
2531 #if defined(DOSBBSECTOR) && defined(notyet)
2532 /* overwriting master boot record? */
2533 if (blkno
+ p
->p_offset
<= DOSBBSECTOR
&&
2534 bp
->b_cmd
!= BUF_CMD_READ
&& wlabel
== 0) {
2535 bp
->b_error
= EROFS
;
2541 * Check for out of bounds, EOF, and EOF clipping.
2543 if (bio
->bio_offset
< 0)
2545 if (blkno
+ sz
> maxsz
) {
2547 * Past EOF or B_BNOCLIP flag was set, the request is bad.
2549 if (blkno
> maxsz
|| (bp
->b_flags
& B_BNOCLIP
))
2553 * If exactly on EOF just complete the I/O with no bytes
2554 * transfered. B_INVAL must be set to throw away the
2555 * contents of the buffer. Otherwise clip b_bcount.
2557 if (blkno
== maxsz
) {
2558 bp
->b_resid
= bp
->b_bcount
;
2559 bp
->b_flags
|= B_INVAL
;
2562 bp
->b_bcount
= (maxsz
- blkno
) << DEV_BSHIFT
;
2564 nbio
= push_bio(bio
);
2565 nbio
->bio_offset
= bio
->bio_offset
+ ((off_t
)p
->p_offset
<< DEV_BSHIFT
);
2569 * The caller is responsible for calling biodone() on the passed bio
2570 * when we return NULL.
2573 bp
->b_error
= EINVAL
;
2575 bp
->b_resid
= bp
->b_bcount
;
2576 bp
->b_flags
|= B_ERROR
| B_INVAL
;
2584 * Provide inb() and outb() as functions. They are normally only
2585 * available as macros calling inlined functions, thus cannot be
2586 * called inside DDB.
2588 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
2594 /* silence compiler warnings */
2596 void outb(u_int
, u_char
);
2603 * We use %%dx and not %1 here because i/o is done at %dx and not at
2604 * %edx, while gcc generates inferior code (movw instead of movl)
2605 * if we tell it to load (u_short) port.
2607 __asm
__volatile("inb %%dx,%0" : "=a" (data
) : "d" (port
));
2612 outb(u_int port
, u_char data
)
2616 * Use an unnecessary assignment to help gcc's register allocator.
2617 * This make a large difference for gcc-1.40 and a tiny difference
2618 * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for
2619 * best results. gcc-2.6.0 can't handle this.
2622 __asm
__volatile("outb %0,%%dx" : : "a" (al
), "d" (port
));
2629 #include "opt_cpu.h"
2633 * initialize all the SMP locks
2636 /* critical region when masking or unmasking interupts */
2637 struct spinlock_deprecated imen_spinlock
;
2639 /* Make FAST_INTR() routines sequential */
2640 struct spinlock_deprecated fast_intr_spinlock
;
2642 /* critical region for old style disable_intr/enable_intr */
2643 struct spinlock_deprecated mpintr_spinlock
;
2645 /* critical region around INTR() routines */
2646 struct spinlock_deprecated intr_spinlock
;
2648 /* lock region used by kernel profiling */
2649 struct spinlock_deprecated mcount_spinlock
;
2651 /* locks com (tty) data/hardware accesses: a FASTINTR() */
2652 struct spinlock_deprecated com_spinlock
;
2654 /* locks kernel kprintfs */
2655 struct spinlock_deprecated cons_spinlock
;
2657 /* lock regions around the clock hardware */
2658 struct spinlock_deprecated clock_spinlock
;
2660 /* lock around the MP rendezvous */
2661 struct spinlock_deprecated smp_rv_spinlock
;
2667 * mp_lock = 0; BSP already owns the MP lock
2670 * Get the initial mp_lock with a count of 1 for the BSP.
2671 * This uses a LOGICAL cpu ID, ie BSP == 0.
2674 cpu_get_initial_mplock();
2677 spin_lock_init(&mcount_spinlock
);
2678 spin_lock_init(&fast_intr_spinlock
);
2679 spin_lock_init(&intr_spinlock
);
2680 spin_lock_init(&mpintr_spinlock
);
2681 spin_lock_init(&imen_spinlock
);
2682 spin_lock_init(&smp_rv_spinlock
);
2683 spin_lock_init(&com_spinlock
);
2684 spin_lock_init(&clock_spinlock
);
2685 spin_lock_init(&cons_spinlock
);
2687 /* our token pool needs to work early */
2688 lwkt_token_pool_init();