/*
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
 * $DragonFly: src/sys/platform/pc32/i386/machdep.c,v 1.125 2007/07/01 01:11:38 dillon Exp $
 */
#include "use_ether.h"
#include "opt_atalk.h"
#include "opt_compat.h"
#include "opt_directio.h"
#include "opt_maxmem.h"
#include "opt_msgbuf.h"
#include "opt_perfmon.h"
#include "opt_userconfig.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/reboot.h>
#include <sys/msgbuf.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/upcall.h>
#include <sys/usched.h>

#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>

#include <machine/cpu.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#include <machine/bootinfo.h>
#include <machine/md_var.h>
#include <machine/pcb_ext.h>		/* pcb.h included via sys/user.h */
#include <machine/globaldata.h>		/* CPU_prvspace */
#include <machine/smp.h>
#include <machine/perfmon.h>
#include <machine/cputypes.h>

#include <bus/isa/i386/isa_device.h>
#include <machine_base/isa/intr_machdep.h>
#include <bus/isa/rtc.h>
#include <machine/vm86.h>
#include <sys/random.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>
#define PHYSMAP_ENTRIES		10

extern void init386(int first);
extern void dblfault_handler(void);

extern void printcpuinfo(void);	/* XXX header file */
extern void finishidentcpu(void);
extern void panicifcpuunsupported(void);
extern void initializecpu(void);

static void cpu_startup(void *);
#ifndef CPU_DISABLE_SSE
static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#endif /* CPU_DISABLE_SSE */
#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */
static void init_locks(void);
SYSINIT(cpu, SI_BOOT2_SMP, SI_ORDER_FIRST, cpu_startup, NULL)

int	_udatasel, _ucodesel;

#ifdef SMP
int64_t tsc_offsets[MAXCPU];
#else
int64_t tsc_offsets[1];
#endif

#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
    CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
    CTLFLAG_RD, &tlb_flush_count, 0, "");
#endif
static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
    int error = sysctl_handle_int(oidp, 0, ctob(physmem), req);
    return (error);
}

SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD,
    0, 0, sysctl_hw_physmem, "IU", "");

static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
    int error = sysctl_handle_int(oidp, 0,
        ctob(physmem - vmstats.v_wire_count), req);
    return (error);
}

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD,
    0, 0, sysctl_hw_usermem, "IU", "");

static int
sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
{
    int error = sysctl_handle_int(oidp, 0,
        i386_btop(avail_end - avail_start), req);
    return (error);
}

SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD,
    0, 0, sysctl_hw_availpages, "I", "");
static int
sysctl_machdep_msgbuf(SYSCTL_HANDLER_ARGS)
{
    int error;

    /*
     * Unwind the buffer, so that it's linear (possibly starting with
     * some initial nulls).
     */
    error = sysctl_handle_opaque(oidp, msgbufp->msg_ptr + msgbufp->msg_bufr,
        msgbufp->msg_size - msgbufp->msg_bufr, req);
    if (error)
        return (error);
    if (msgbufp->msg_bufr > 0) {
        error = sysctl_handle_opaque(oidp, msgbufp->msg_ptr,
            msgbufp->msg_bufr, req);
    }
    return (error);
}

SYSCTL_PROC(_machdep, OID_AUTO, msgbuf, CTLTYPE_STRING|CTLFLAG_RD,
    0, 0, sysctl_machdep_msgbuf, "A", "Contents of kernel message buffer");

static int msgbuf_clear;

static int
sysctl_machdep_msgbuf_clear(SYSCTL_HANDLER_ARGS)
{
    int error;

    error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
        req);
    if (!error && req->newptr) {
        /* Clear the buffer and reset write pointer */
        bzero(msgbufp->msg_ptr, msgbufp->msg_size);
        msgbufp->msg_bufr = msgbufp->msg_bufx = 0;
    }
    return (error);
}

SYSCTL_PROC(_machdep, OID_AUTO, msgbuf_clear, CTLTYPE_INT|CTLFLAG_RW,
    &msgbuf_clear, 0, sysctl_machdep_msgbuf_clear, "I",
    "Clear kernel message buffer");
vm_paddr_t Maxmem = 0;

vm_paddr_t phys_avail[PHYSMAP_ENTRIES*2+2];

static vm_offset_t buffer_sva, buffer_eva;
vm_offset_t clean_sva, clean_eva;
static vm_offset_t pager_sva, pager_eva;
static struct trapframe proc0_tf;
static void
cpu_startup(void *dummy)
{
    caddr_t v;
    vm_size_t size = 0;
    vm_offset_t firstaddr;
    int indx;

    if (boothowto & RB_VERBOSE)
        bootverbose++;

    /*
     * Good {morning,afternoon,evening,night}.
     */
    kprintf("%s", version);
    panicifcpuunsupported();

    kprintf("real memory = %llu (%lluK bytes)\n",
        ptoa(Maxmem), ptoa(Maxmem) / 1024);

    /*
     * Display any holes after the first chunk of extended memory.
     */
    kprintf("Physical memory chunk(s):\n");
    for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
        vm_paddr_t size1 = phys_avail[indx + 1] - phys_avail[indx];

        kprintf("0x%08llx - 0x%08llx, %llu bytes (%llu pages)\n",
            phys_avail[indx], phys_avail[indx + 1] - 1, size1,
            size1 / PAGE_SIZE);
    }

    /*
     * Allocate space for system data structures.
     * The first available kernel virtual address is in "v".
     * As pages of kernel virtual memory are allocated, "v" is incremented.
     * As pages of memory are allocated and cleared,
     * "firstaddr" is incremented.
     * An index into the kernel page table corresponding to the
     * virtual memory address maintained in "v" is kept in "mapaddr".
     */

    /*
     * Make two passes.  The first pass calculates how much memory is
     * needed and allocates it.  The second pass assigns virtual
     * addresses to the various data structures.
     */
    v = (caddr_t)firstaddr;

#define valloc(name, type, num) \
    (name) = (type *)v; v = (caddr_t)((name)+(num))
#define valloclim(name, type, num, lim) \
    (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
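    /*
     * For illustration, valloc(buf, struct buf, nbuf) expands to roughly
     *
     *	buf = (struct buf *)v;  v = (caddr_t)(buf + nbuf);
     *
     * i.e. it carves "num" objects out of the linear region tracked by
     * "v" without touching physical memory; the backing pages are only
     * obtained once the total size is known at the end of the first pass.
     */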
    /*
     * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
     * For the first 64MB of ram nominally allocate sufficient buffers to
     * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
     * buffers to cover 1/20 of our ram over 64MB.  When auto-sizing
     * the buffer cache we limit the eventual kva reservation to
     * maxbcache bytes.
     *
     * factor represents the 1/4 x ram conversion.
     */
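    /*
     * Example (approximate): with 256MB of ram and BKVASIZE of 16KB,
     * factor = 4 * 16384 / 1024 = 64, so the first 64MB contributes
     * 65536 / 64 = 1024 buffers and the remaining 192MB contributes
     * roughly (262144 - 65536) * 2 / (64 * 5) ~= 1228 more, before the
     * maxbcache and KVA caps below are applied.
     */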
    {
        int factor = 4 * BKVASIZE / 1024;
        int kbytes = physmem * (PAGE_SIZE / 1024);

        nbuf += min((kbytes - 4096) / factor, 65536 / factor);
        nbuf += (kbytes - 65536) * 2 / (factor * 5);
        if (maxbcache && nbuf > maxbcache / BKVASIZE)
            nbuf = maxbcache / BKVASIZE;
    }

    /*
     * Do not allow the buffer_map to be more than 1/2 the size of the
     * kernel_map.
     */
    if (nbuf > (virtual_end - virtual_start) / (BKVASIZE * 2)) {
        nbuf = (virtual_end - virtual_start) / (BKVASIZE * 2);
        kprintf("Warning: nbufs capped at %d\n", nbuf);
    }

    nswbuf = max(min(nbuf/4, 256), 16);
    if (nswbuf < NSWBUF_MIN)
        nswbuf = NSWBUF_MIN;

    valloc(swbuf, struct buf, nswbuf);
    valloc(buf, struct buf, nbuf);

    /*
     * End of first pass, size has been calculated so allocate memory
     */
    if (firstaddr == 0) {
        size = (vm_size_t)(v - firstaddr);
        firstaddr = kmem_alloc(&kernel_map, round_page(size));
        if (firstaddr == 0)
            panic("startup: no room for tables");
    }

    /*
     * End of second pass, addresses have been assigned
     */
    if ((vm_size_t)(v - firstaddr) != size)
        panic("startup: table size inconsistency");

    kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva,
        (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size);
    kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva,
        (nbuf*BKVASIZE));
    buffer_map.system_map = 1;
    kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva,
        (nswbuf*MAXPHYS) + pager_map_size);
    pager_map.system_map = 1;

#if defined(USERCONFIG)
    cninit();		/* the preferred console may have changed */
#endif

    kprintf("avail memory = %u (%uK bytes)\n", ptoa(vmstats.v_free_count),
        ptoa(vmstats.v_free_count) / 1024);

    /*
     * Set up buffers, so they can be used to read disk labels.
     */
    vm_pager_bufferinit();

    /*
     * OK, enough kmem_alloc/malloc state should be up, let's get on with it!
     */
    mp_start();			/* fire up the APs and APICs */
}
/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
    struct lwp *lp = curthread->td_lwp;
    struct proc *p = lp->lwp_proc;
    struct trapframe *regs;
    struct sigacts *psp = p->p_sigacts;
    struct sigframe sf, *sfp;
    int oonstack;

    regs = lp->lwp_md.md_regs;
    oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;

    /* save user context */
    bzero(&sf, sizeof(struct sigframe));
    sf.sf_uc.uc_sigmask = *mask;
    sf.sf_uc.uc_stack = lp->lwp_sigstk;
    sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
    bcopy(regs, &sf.sf_uc.uc_mcontext.mc_gs, sizeof(struct trapframe));

    /* make the size of the saved context visible to userland */
    sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);

    /* save mailbox pending state for syscall interlock semantics */
    if (p->p_flag & P_MAILBOX)
        sf.sf_uc.uc_mcontext.mc_xflags |= PGEX_MAILBOX;

    /* Allocate and validate space for the signal handler context. */
    if ((lp->lwp_flag & LWP_ALTSTACK) != 0 && !oonstack &&
        SIGISMEMBER(psp->ps_sigonstack, sig)) {
        sfp = (struct sigframe *)(lp->lwp_sigstk.ss_sp +
            lp->lwp_sigstk.ss_size - sizeof(struct sigframe));
        lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
    } else {
        sfp = (struct sigframe *)regs->tf_esp - 1;
    }

    /* Translate the signal if appropriate */
    if (p->p_sysent->sv_sigtbl) {
        if (sig <= p->p_sysent->sv_sigsize)
            sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
    }

    /* Build the argument list for the signal handler. */
    sf.sf_ucontext = (register_t)&sfp->sf_uc;
    if (SIGISMEMBER(psp->ps_siginfo, sig)) {
        /* Signal handler installed with SA_SIGINFO. */
        sf.sf_siginfo = (register_t)&sfp->sf_si;
        sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

        /* fill siginfo structure */
        sf.sf_si.si_signo = sig;
        sf.sf_si.si_code = code;
        sf.sf_si.si_addr = (void *)regs->tf_err;
    } else {
        /* Old FreeBSD-style arguments. */
        sf.sf_siginfo = code;
        sf.sf_addr = regs->tf_err;
        sf.sf_ahu.sf_handler = catcher;
    }
    /*
     * If we're a vm86 process, we want to save the segment registers.
     * We also change eflags to be our emulated eflags, not the actual
     * eflags.
     */
    if (regs->tf_eflags & PSL_VM) {
        struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
        struct vm86_kernel *vm86 =
            &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

        sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
        sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
        sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
        sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

        if (vm86->vm86_has_vme == 0)
            sf.sf_uc.uc_mcontext.mc_eflags =
                (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
                (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

        /*
         * Clear PSL_NT to inhibit T_TSSFLT faults on return from
         * syscalls made by the signal handler.  This just avoids
         * wasting time for our lazy fixup of such faults.  PSL_NT
         * does nothing in vm86 mode, but vm86 programs can set it
         * almost legitimately in probes for old cpu types.
         */
        tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
    }

    /*
     * Copy the sigframe out to the user's stack.
     */
    if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
        /*
         * Something is wrong with the stack pointer.
         * ...Kill the process.
         */
        sigexit(lp, SIGILL);
    }

    regs->tf_esp = (int)sfp;
    regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
    regs->tf_eflags &= ~PSL_T;
    regs->tf_cs = _ucodesel;
    regs->tf_ds = _udatasel;
    regs->tf_es = _udatasel;

    /*
     * Allow the signal handler to inherit %fs in addition to %gs as
     * the userland program might be using both.
     *
     * However, if a T_PROTFLT occurred the segment registers could be
     * totally broken.  They must be reset in order to be able to
     * return to userland.
     */
    if (regs->tf_trapno == T_PROTFLT) {
        regs->tf_fs = _udatasel;
        regs->tf_gs = _udatasel;
    }
    regs->tf_ss = _udatasel;
}
/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.  Remove any items that would otherwise create a privilege
 * issue.
 *
 * XXX at the moment we allow userland to set the resume flag.  Is this a
 * bad idea?
 */
int
cpu_sanitize_frame(struct trapframe *frame)
{
    frame->tf_cs = _ucodesel;
    frame->tf_ds = _udatasel;
    frame->tf_es = _udatasel;	/* XXX allow userland this one too? */
    frame->tf_fs = _udatasel;
    frame->tf_gs = _udatasel;
    frame->tf_ss = _udatasel;
    frame->tf_eflags &= (PSL_RF | PSL_USERCHANGE);
    frame->tf_eflags |= PSL_RESERVED_DEFAULT | PSL_I;
    return (0);
}
int
cpu_sanitize_tls(struct savetls *tls)
{
    struct segment_descriptor *desc;
    int i;

    for (i = 0; i < NGTLS; ++i) {
        /* accept unused slots, reject anything but 32-bit user r/w data */
        if (desc->sd_dpl == 0 && desc->sd_type == 0)
            continue;
        if (desc->sd_def32 == 0)
            return (EINVAL);
        if (desc->sd_type != SDT_MEMRWA)
            return (EINVAL);
        if (desc->sd_dpl != SEL_UPL)
            return (EINVAL);
        if (desc->sd_xx != 0 || desc->sd_p != 1)
            return (EINVAL);
    }
    return (0);
}
/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 */
#define EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
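/*
 * EFL_SECURE() accepts a new eflags value only if every bit outside the
 * PSL_USERCHANGE mask is identical to the old value (the XOR isolates
 * the changed bits); CS_SECURE() accepts a %cs selector only if its
 * requested privilege level is user (SEL_UPL), so a forged kernel code
 * selector is refused.
 */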
int
sys_sigreturn(struct sigreturn_args *uap)
{
    struct lwp *lp = curthread->td_lwp;
    struct proc *p = lp->lwp_proc;
    struct trapframe *regs;
    ucontext_t *ucp = uap->sigcntxp;
    int cs;
    int eflags;

    if (!useracc((caddr_t)ucp, sizeof(ucontext_t), VM_PROT_READ))
        return (EFAULT);

    regs = lp->lwp_md.md_regs;
    eflags = ucp->uc_mcontext.mc_eflags;

    if (eflags & PSL_VM) {
        struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
        struct vm86_kernel *vm86;

        /*
         * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
         * set up the vm86 area, and we can't enter vm86 mode.
         */
        if (lp->lwp_thread->td_pcb->pcb_ext == 0)
            return (EINVAL);
        vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
        if (vm86->vm86_inited == 0)
            return (EINVAL);

        /* go back to user mode if both flags are set */
        if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
            trapsignal(lp, SIGBUS, 0);

        if (vm86->vm86_has_vme) {
            eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
                (eflags & VME_USERCHANGE) | PSL_VM;
        } else {
            vm86->vm86_eflags = eflags;	/* save VIF, VIP */
            eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
                (eflags & VM_USERCHANGE) | PSL_VM;
        }
        bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
        tf->tf_eflags = eflags;
        tf->tf_vm86_ds = tf->tf_ds;
        tf->tf_vm86_es = tf->tf_es;
        tf->tf_vm86_fs = tf->tf_fs;
        tf->tf_vm86_gs = tf->tf_gs;
        tf->tf_ds = _udatasel;
        tf->tf_es = _udatasel;
        tf->tf_fs = _udatasel;
        tf->tf_gs = _udatasel;
    } else {
        /*
         * Don't allow users to change privileged or reserved flags.
         *
         * XXX do allow users to change the privileged flag PSL_RF.
         * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
         * should sometimes set it there too.  tf_eflags is kept in
         * the signal context during signal handling and there is no
         * other place to remember it, so the PSL_RF bit may be
         * corrupted by the signal handler without us knowing.
         * Corruption of the PSL_RF bit at worst causes one more or
         * one less debugger trap, so allowing it is fairly harmless.
         */
        if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
            kprintf("sigreturn: eflags = 0x%x\n", eflags);
            return (EINVAL);
        }

        /*
         * Don't allow users to load a valid privileged %cs.  Let the
         * hardware check for invalid selectors, excess privilege in
         * other selectors, invalid %eip's and invalid %esp's.
         */
        cs = ucp->uc_mcontext.mc_cs;
        if (!CS_SECURE(cs)) {
            kprintf("sigreturn: cs = 0x%x\n", cs);
            trapsignal(lp, SIGBUS, T_PROTFLT);
            return (EINVAL);
        }
        bcopy(&ucp->uc_mcontext.mc_gs, regs, sizeof(struct trapframe));
    }

    /*
     * Merge saved signal mailbox pending flag to maintain interlock
     * semantics against system calls.
     */
    if (ucp->uc_mcontext.mc_xflags & PGEX_MAILBOX)
        p->p_flag |= P_MAILBOX;

    if (ucp->uc_mcontext.mc_onstack & 1)
        lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
    else
        lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

    lp->lwp_sigmask = ucp->uc_sigmask;
    SIG_CANTMASK(lp->lwp_sigmask);
    return (EJUSTRETURN);
}
/*
 * Stack frame on entry to function.  %eax will contain the function vector,
 * %ecx will contain the function data.  flags, ecx, and eax will have
 * already been pushed on the stack.
 */
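/*
 * The "struct upc_frame" used below mirrors that layout: sendupcall()
 * saves the interrupted eax, ecx, edx, eflags and eip (oldip) into it on
 * the user stack, and fetchupcall() later restores those values when the
 * originally interrupted code is resumed.
 */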
void
sendupcall(struct vmupcall *vu, int morepending)
{
    struct lwp *lp = curthread->td_lwp;
    struct trapframe *regs;
    struct upcall upcall;
    struct upc_frame upc_frame;
    int crit_count;

    /*
     * If we are a virtual kernel running an emulated user process
     * context, switch back to the virtual kernel context before
     * trying to post the signal.
     */
    if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
        lp->lwp_md.md_regs->tf_trapno = 0;
        vkernel_trap(lp, lp->lwp_md.md_regs);
        return;
    }

    /*
     * Get the upcall data structure
     */
    if (copyin(lp->lwp_upcall, &upcall, sizeof(upcall)) ||
        copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int))
    ) {
        kprintf("bad upcall address\n");
        return;
    }

    /*
     * If the data structure is already marked pending or has a critical
     * section count, mark the data structure as pending and return
     * without doing an upcall.  vu_pending is left set.
     */
    if (upcall.upc_pending || crit_count >= vu->vu_pending) {
        if (upcall.upc_pending < vu->vu_pending) {
            upcall.upc_pending = vu->vu_pending;
            copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending,
                sizeof(upcall.upc_pending));
        }
        return;
    }

    /*
     * We can run this upcall now, clear vu_pending.
     *
     * Bump our critical section count and set or clear the
     * user pending flag depending on whether more upcalls are
     * pending.  The user will be responsible for calling
     * upc_dispatch(-1) to process remaining upcalls.
     */
    vu->vu_pending = 0;
    upcall.upc_pending = morepending;
    crit_count += TDPRI_CRIT;
    copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending,
        sizeof(upcall.upc_pending));
    copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff,
        sizeof(int));

    /*
     * Construct a stack frame and issue the upcall
     */
    regs = lp->lwp_md.md_regs;
    upc_frame.eax = regs->tf_eax;
    upc_frame.ecx = regs->tf_ecx;
    upc_frame.edx = regs->tf_edx;
    upc_frame.flags = regs->tf_eflags;
    upc_frame.oldip = regs->tf_eip;
    if (copyout(&upc_frame, (void *)(regs->tf_esp - sizeof(upc_frame)),
        sizeof(upc_frame)) != 0) {
        kprintf("bad stack on upcall\n");
    }
    regs->tf_eax = (register_t)vu->vu_func;
    regs->tf_ecx = (register_t)vu->vu_data;
    regs->tf_edx = (register_t)lp->lwp_upcall;
    regs->tf_eip = (register_t)vu->vu_ctx;
    regs->tf_esp -= sizeof(upc_frame);
}
/*
 * fetchupcall occurs in the context of a system call, which means that
 * we have to return EJUSTRETURN in order to prevent eax and edx from
 * being overwritten by the syscall return value.
 *
 * if vu is not NULL we return the new context in %edx, the new data in %ecx,
 * and the function pointer in %eax.
 */
int
fetchupcall(struct vmupcall *vu, int morepending, void *rsp)
{
    struct upc_frame upc_frame;
    struct lwp *lp = curthread->td_lwp;
    struct trapframe *regs;
    int error;
    int crit_count;
    struct upcall upcall;

    regs = lp->lwp_md.md_regs;

    error = copyout(&morepending, &lp->lwp_upcall->upc_pending, sizeof(int));
    if (vu) {
        /*
         * This jumps us to the next ready context.
         */
        error = copyin(lp->lwp_upcall, &upcall, sizeof(upcall));
        error = copyin((char *)upcall.upc_uthread + upcall.upc_critoff,
            &crit_count, sizeof(int));
        crit_count += TDPRI_CRIT;
        error = copyout(&crit_count,
            (char *)upcall.upc_uthread + upcall.upc_critoff, sizeof(int));
        regs->tf_eax = (register_t)vu->vu_func;
        regs->tf_ecx = (register_t)vu->vu_data;
        regs->tf_edx = (register_t)lp->lwp_upcall;
        regs->tf_eip = (register_t)vu->vu_ctx;
        regs->tf_esp = (register_t)rsp;
    } else {
        /*
         * This returns us to the originally interrupted code.
         */
        error = copyin(rsp, &upc_frame, sizeof(upc_frame));
        regs->tf_eax = upc_frame.eax;
        regs->tf_ecx = upc_frame.ecx;
        regs->tf_edx = upc_frame.edx;
        regs->tf_eflags = (regs->tf_eflags & ~PSL_USERCHANGE) |
            (upc_frame.flags & PSL_USERCHANGE);
        regs->tf_eip = upc_frame.oldip;
        regs->tf_esp = (register_t)((char *)rsp + sizeof(upc_frame));
    }
    if (error == 0)
        error = EJUSTRETURN;
    return (error);
}
/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */

/*
 * Shutdown the CPU as much as possible
 */
void
cpu_halt(void)
{
    for (;;)
        __asm__ __volatile("hlt");
}
/*
 * cpu_idle() represents the idle LWKT.  You cannot return from this function
 * (unless you want to blow things up!).  Instead we look for runnable threads
 * and loop or halt as appropriate.  Giant is not held on entry to the thread.
 *
 * The main loop is entered with a critical section held, we must release
 * the critical section before doing anything else.  lwkt_switch() will
 * check for pending interrupts due to entering and exiting its own
 * critical section.
 *
 * Note on cpu_idle_hlt:  On an SMP system we rely on a scheduler IPI
 * to wake a HLTed cpu up.  However, there are cases where the idlethread
 * will be entered with the possibility that no IPI will occur and in such
 * cases lwkt_switch() sets TDF_IDLE_NOHLT.
 */
static int	cpu_idle_hlt = 1;
static int	cpu_idle_hltcnt;
static int	cpu_idle_spincnt;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hltcnt, CTLFLAG_RW,
    &cpu_idle_hltcnt, 0, "Idle loop entry halts");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_spincnt, CTLFLAG_RW,
    &cpu_idle_spincnt, 0, "Idle loop entry spins");
static void
cpu_idle_default_hook(void)
{
    /*
     * We must guarantee that hlt is exactly the instruction
     * following the sti.
     */
    __asm __volatile("sti; hlt");
}
/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;

void
cpu_idle(void)
{
    struct thread *td = curthread;

    KKASSERT(td->td_pri < TDPRI_CRIT);
    for (;;) {
        /*
         * See if there are any LWKTs ready to go.
         */
        lwkt_switch();

        /*
         * If we are going to halt call splz unconditionally after
         * CLIing to catch any interrupt races.  Note that we are
         * at SPL0 and interrupts are enabled.
         */
        if (cpu_idle_hlt && !lwkt_runnable() &&
            (td->td_flags & TDF_IDLE_NOHLT) == 0) {
            __asm __volatile("cli");
            splz();
            if (!lwkt_runnable())
                cpu_idle_hook();
            else
                __asm __volatile("pause");
            ++cpu_idle_hltcnt;
        } else {
            td->td_flags &= ~TDF_IDLE_NOHLT;
            splz();
#ifdef SMP
            __asm __volatile("sti; pause");
#else
            __asm __volatile("sti");
#endif
            ++cpu_idle_spincnt;
        }
    }
}
/*
 * Clear registers on exec
 */
void
exec_setregs(u_long entry, u_long stack, u_long ps_strings)
{
    struct thread *td = curthread;
    struct lwp *lp = td->td_lwp;
    struct pcb *pcb = td->td_pcb;
    struct trapframe *regs = lp->lwp_md.md_regs;

    /* was i386_user_cleanup() in NetBSD */

    bzero((char *)regs, sizeof(struct trapframe));
    regs->tf_eip = entry;
    regs->tf_esp = stack;
    regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
    regs->tf_ss = _udatasel;
    regs->tf_ds = _udatasel;
    regs->tf_es = _udatasel;
    regs->tf_fs = _udatasel;
    regs->tf_gs = _udatasel;
    regs->tf_cs = _ucodesel;

    /* PS_STRINGS value for BSD/OS binaries.  It is 0 for non-BSD/OS. */
    regs->tf_ebx = ps_strings;

    /*
     * Reset the hardware debug registers if they were in use.
     * They won't have any meaning for the newly exec'd process.
     */
    if (pcb->pcb_flags & PCB_DBREGS) {
        if (pcb == td->td_pcb) {
            /*
             * Clear the debug registers on the running
             * CPU, otherwise they will end up affecting
             * the next process we switch to.
             */
            reset_dbregs();
        }
        pcb->pcb_flags &= ~PCB_DBREGS;
    }

    /*
     * Initialize the math emulator (if any) for the current process.
     * Actually, just clear the bit that says that the emulator has
     * been initialized.  Initialization is delayed until the process
     * traps to the emulator (if it is done at all) mainly because
     * emulators don't provide an entry point for initialization.
     */
    pcb->pcb_flags &= ~FP_SOFTFP;

    /*
     * note: do not set CR0_TS here.  npxinit() must do it after clearing
     * gd_npxthread.  Otherwise a preemptive interrupt thread may panic.
     */
    load_cr0(rcr0() | CR0_MP);

    /* Initialize the npx (if any) for the current process. */
    npxinit(__INITIAL_NPXCW__);

    /*
     * note: linux emulator needs edx to be 0x0 on entry, which is
     * handled in execve simply by setting the 64 bit syscall
     * return value to 0.
     */
}
static void
cpu_setregs(void)
{
    unsigned int cr0;

    cr0 = rcr0();
    cr0 |= CR0_NE;			/* Done by npxinit() */
    cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
    if (cpu_class != CPUCLASS_386)
        cr0 |= CR0_WP | CR0_AM;
    load_cr0(cr0);
}
static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
    int error;

    error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
        req);
    if (!error && req->newptr)
        resettodr();
    return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
    &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
    CTLFLAG_RW, &disable_rtc_set, 0, "");

SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
    CTLFLAG_RD, &bootinfo, bootinfo, "");

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
    CTLFLAG_RW, &wall_cmos_clock, 0, "");

extern u_long bootdev;		/* not a cdev_t - encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
    CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)");
/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

union descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
union descriptor ldt[NLDT];		/* local descriptor table */

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt, r_idt;

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif

static struct i386tss dblfault_tss;
static char dblfault_stack[PAGE_SIZE];

extern struct user *proc0paddr;
/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{   0x0,			/* segment base address  */
    0x0,			/* length */
    0,				/* segment type */
    0,				/* segment descriptor priority level */
    0,				/* segment descriptor present */
    0, 0,
    0,				/* default 32 vs 16 bit size */
    0				/* limit granularity (byte/page units)*/ },
/* GCODE_SEL	1 Code Descriptor for kernel */
{   0x0,			/* segment base address  */
    0xfffff,			/* length - all address space */
    SDT_MEMERA,			/* segment type */
    0,				/* segment descriptor priority level */
    1,				/* segment descriptor present */
    0, 0,
    1,				/* default 32 vs 16 bit size */
    1				/* limit granularity (byte/page units)*/ },
/* GDATA_SEL	2 Data Descriptor for kernel */
{   0x0,			/* segment base address  */
    0xfffff,			/* length - all address space */
    SDT_MEMRWA,			/* segment type */
    0,				/* segment descriptor priority level */
    1,				/* segment descriptor present */
    0, 0,
    1,				/* default 32 vs 16 bit size */
    1				/* limit granularity (byte/page units)*/ },
/* GPRIV_SEL	3 SMP Per-Processor Private Data Descriptor */
{   0x0,			/* segment base address  */
    0xfffff,			/* length - all address space */
    SDT_MEMRWA,			/* segment type */
    0,				/* segment descriptor priority level */
    1,				/* segment descriptor present */
    0, 0,
    1,				/* default 32 vs 16 bit size */
    1				/* limit granularity (byte/page units)*/ },
/* GPROC0_SEL	4 Proc 0 Tss Descriptor */
{
    0x0,			/* segment base address */
    sizeof(struct i386tss)-1,	/* length - all address space */
    SDT_SYS386TSS,		/* segment type */
    0,				/* segment descriptor priority level */
    1,				/* segment descriptor present */
    0, 0,
    0,				/* unused - default 32 vs 16 bit size */
    0				/* limit granularity (byte/page units)*/ },
/* GLDT_SEL	5 LDT Descriptor */
{   (int) ldt,			/* segment base address  */
    sizeof(ldt)-1,		/* length - all address space */
    SDT_SYSLDT,			/* segment type */
    SEL_UPL,			/* segment descriptor priority level */
    1,				/* segment descriptor present */
    0, 0,
    0,				/* unused - default 32 vs 16 bit size */
    0				/* limit granularity (byte/page units)*/ },
/* GUSERLDT_SEL	6 User LDT Descriptor per process */
{   (int) ldt,			/* segment base address  */
    (512 * sizeof(union descriptor)-1),	/* length */
    SDT_SYSLDT,			/* segment type */
    0,				/* segment descriptor priority level */
    1,				/* segment descriptor present */
    0, 0,
    0,				/* unused - default 32 vs 16 bit size */
    0				/* limit granularity (byte/page units)*/ },
/* GTGATE_SEL	7 Null Descriptor - Placeholder */
{   0x0,			/* segment base address  */
    0x0,			/* length - all address space */
    0,				/* segment type */
    0,				/* segment descriptor priority level */
    0,				/* segment descriptor present */
    0, 0,
    0,				/* default 32 vs 16 bit size */
    0				/* limit granularity (byte/page units)*/ },
/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
{   0x400,			/* segment base address */
    0xfffff,			/* length */
    SDT_MEMRWA,			/* segment type */
    0,				/* segment descriptor priority level */
    1,				/* segment descriptor present */
    0, 0,
    1,				/* default 32 vs 16 bit size */
    1				/* limit granularity (byte/page units)*/ },
/* GPANIC_SEL	9 Panic Tss Descriptor */
{   (int) &dblfault_tss,	/* segment base address  */
    sizeof(struct i386tss)-1,	/* length - all address space */
    SDT_SYS386TSS,		/* segment type */
    0,				/* segment descriptor priority level */
    1,				/* segment descriptor present */
    0, 0,
    0,				/* unused - default 32 vs 16 bit size */
    0				/* limit granularity (byte/page units)*/ },
/* GBIOSCODE32_SEL 10 BIOS 32-bit interface (32bit Code) */
{   0,				/* segment base address (overwritten)  */
    0xfffff,			/* length */
    SDT_MEMERA,			/* segment type */
    0,				/* segment descriptor priority level */
    1,				/* segment descriptor present */
    0, 0,
    0,				/* default 32 vs 16 bit size */
    1				/* limit granularity (byte/page units)*/ },
/* GBIOSCODE16_SEL 11 BIOS 32-bit interface (16bit Code) */
{   0,				/* segment base address (overwritten)  */
    0xfffff,			/* length */
    SDT_MEMERA,			/* segment type */
    0,				/* segment descriptor priority level */
    1,				/* segment descriptor present */
    0, 0,
    0,				/* default 32 vs 16 bit size */
    1				/* limit granularity (byte/page units)*/ },
/* GBIOSDATA_SEL 12 BIOS 32-bit interface (Data) */
{   0,				/* segment base address (overwritten) */
    0xfffff,			/* length */
    SDT_MEMRWA,			/* segment type */
    0,				/* segment descriptor priority level */
    1,				/* segment descriptor present */
    0, 0,
    1,				/* default 32 vs 16 bit size */
    1				/* limit granularity (byte/page units)*/ },
/* GBIOSUTIL_SEL 13 BIOS 16-bit interface (Utility) */
{   0,				/* segment base address (overwritten) */
    0xfffff,			/* length */
    SDT_MEMRWA,			/* segment type */
    0,				/* segment descriptor priority level */
    1,				/* segment descriptor present */
    0, 0,
    0,				/* default 32 vs 16 bit size */
    1				/* limit granularity (byte/page units)*/ },
/* GBIOSARGS_SEL 14 BIOS 16-bit interface (Arguments) */
{   0,				/* segment base address (overwritten) */
    0xfffff,			/* length */
    SDT_MEMRWA,			/* segment type */
    0,				/* segment descriptor priority level */
    1,				/* segment descriptor present */
    0, 0,
    0,				/* default 32 vs 16 bit size */
    1				/* limit granularity (byte/page units)*/ },
/* GTLS_START 15 TLS */
{   0x0,			/* segment base address  */
    0x0,			/* length */
    0,				/* segment type */
    0,				/* segment descriptor priority level */
    0,				/* segment descriptor present */
    0, 0,
    0,				/* default 32 vs 16 bit size */
    0				/* limit granularity (byte/page units)*/ },
/* GTLS_START+1 16 TLS */
{   0x0,			/* segment base address  */
    0x0,			/* length */
    0,				/* segment type */
    0,				/* segment descriptor priority level */
    0,				/* segment descriptor present */
    0, 0,
    0,				/* default 32 vs 16 bit size */
    0				/* limit granularity (byte/page units)*/ },
/* GTLS_END 17 TLS */
{   0x0,			/* segment base address  */
    0x0,			/* length */
    0,				/* segment type */
    0,				/* segment descriptor priority level */
    0,				/* segment descriptor present */
    0, 0,
    0,				/* default 32 vs 16 bit size */
    0				/* limit granularity (byte/page units)*/ },
};
static struct soft_segment_descriptor ldt_segs[] = {
	/* Null Descriptor - overwritten by call gate */
{   0x0,			/* segment base address  */
    0x0,			/* length - all address space */
    0,				/* segment type */
    0,				/* segment descriptor priority level */
    0,				/* segment descriptor present */
    0, 0,
    0,				/* default 32 vs 16 bit size */
    0				/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{   0x0,			/* segment base address  */
    0x0,			/* length - all address space */
    0,				/* segment type */
    0,				/* segment descriptor priority level */
    0,				/* segment descriptor present */
    0, 0,
    0,				/* default 32 vs 16 bit size */
    0				/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{   0x0,			/* segment base address  */
    0x0,			/* length - all address space */
    0,				/* segment type */
    0,				/* segment descriptor priority level */
    0,				/* segment descriptor present */
    0, 0,
    0,				/* default 32 vs 16 bit size */
    0				/* limit granularity (byte/page units)*/ },
	/* Code Descriptor for user */
{   0x0,			/* segment base address  */
    0xfffff,			/* length - all address space */
    SDT_MEMERA,			/* segment type */
    SEL_UPL,			/* segment descriptor priority level */
    1,				/* segment descriptor present */
    0, 0,
    1,				/* default 32 vs 16 bit size */
    1				/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{   0x0,			/* segment base address  */
    0x0,			/* length - all address space */
    0,				/* segment type */
    0,				/* segment descriptor priority level */
    0,				/* segment descriptor present */
    0, 0,
    0,				/* default 32 vs 16 bit size */
    0				/* limit granularity (byte/page units)*/ },
	/* Data Descriptor for user */
{   0x0,			/* segment base address  */
    0xfffff,			/* length - all address space */
    SDT_MEMRWA,			/* segment type */
    SEL_UPL,			/* segment descriptor priority level */
    1,				/* segment descriptor present */
    0, 0,
    1,				/* default 32 vs 16 bit size */
    1				/* limit granularity (byte/page units)*/ },
};
void
setidt(int idx, inthand_t *func, int typ, int dpl, int selec)
{
    struct gate_descriptor *ip;

    ip = idt + idx;
    ip->gd_looffset = (int)func;
    ip->gd_selector = selec;
    ip->gd_type = typ;
    ip->gd_dpl = dpl;
    ip->gd_p = 1;
    ip->gd_hioffset = ((int)func)>>16 ;
}
#define IDTVEC(name)	__CONCAT(X,name)
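/*
 * IDTVEC(div) therefore expands to the symbol Xdiv, i.e. the assembler
 * entry points for the exception vectors declared below all carry an
 * "X" prefix.
 */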
extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(syscall),
	IDTVEC(rsvd0);
extern inthand_t
	IDTVEC(int0x80_syscall);

#ifdef DEBUG_INTERRUPTS
extern inthand_t *Xrsvdary[256];
#endif
void
sdtossd(struct segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{
    ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
    ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
    ssd->ssd_type  = sd->sd_type;
    ssd->ssd_dpl   = sd->sd_dpl;
    ssd->ssd_p     = sd->sd_p;
    ssd->ssd_def32 = sd->sd_def32;
    ssd->ssd_gran  = sd->sd_gran;
}
/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 */
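/*
 * A typical SMAP-derived physmap for a machine with 640K of base memory
 * and RAM up to 256MB might look roughly like
 *
 *	physmap[0] = 0x00000000  physmap[1] = 0x000a0000
 *	physmap[2] = 0x00100000  physmap[3] = 0x10000000
 *
 * i.e. pairs of (start, end) addresses with the 384K ISA hole between
 * the two chunks; the exact layout is entirely BIOS dependent.
 */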
static void
getmemsize(int first)
{
    int i, physmap_idx, pa_indx;
    int hasbrokenint12 = 0;
    u_int basemem, extmem;
    struct vm86frame vmf;
    struct vm86context vmc;
    vm_offset_t physmap[PHYSMAP_ENTRIES*2];
    vm_paddr_t pa;
    pt_entry_t *pte;
    char *cp, *ep;
    struct bios_smap *smap;
    quad_t dcons_addr, dcons_size;

    TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
    bzero(&vmf, sizeof(struct vm86frame));
    bzero(physmap, sizeof(physmap));

    /*
     * Some newer BIOSes have a broken INT 12H implementation which
     * causes a kernel panic immediately.  In this case, we need to
     * scan the SMAP with INT 15:E820 first, then determine base
     * memory size.
     */
    if (hasbrokenint12) {
        goto int15;
    }

    /*
     * Perform "base memory" related probes & setup.  If we get a crazy
     * value give the bios some scribble space just in case.
     */
    vm86_intcall(0x12, &vmf);
    basemem = vmf.vmf_ax;
    if (basemem > 640) {
        kprintf("Preposterous BIOS basemem of %uK, "
            "truncating to < 640K\n", basemem);
    }

    /*
     * XXX if biosbasemem is now < 640, there is a `hole'
     * between the end of base memory and the start of
     * ISA memory.  The hole may be empty or it may
     * contain BIOS code or data.  Map it read/write so
     * that the BIOS can write to it.  (Memory from 0 to
     * the physical end of the kernel is mapped read-only
     * to begin with and then parts of it are remapped.
     * The parts that aren't remapped form holes that
     * remain read-only and are unused by the kernel.
     * The base memory area is below the physical end of
     * the kernel and right now forms a read-only hole.
     * The part of it from PAGE_SIZE to
     * (trunc_page(biosbasemem * 1024) - 1) will be
     * remapped and used by the kernel later.)
     *
     * This code is similar to the code used in
     * pmap_mapdev, but since no memory needs to be
     * allocated we simply change the mapping.
     */
    for (pa = trunc_page(basemem * 1024);
         pa < ISA_HOLE_START; pa += PAGE_SIZE) {
        pte = vtopte(pa + KERNBASE);
        *pte = pa | PG_RW | PG_V;
    }

    /*
     * if basemem != 640, map pages r/w into vm86 page table so
     * that the bios can scribble on it.
     */
    for (i = basemem / 4; i < 160; i++)
        pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;

    /*
     * map page 1 R/W into the kernel page table so we can use it
     * as a buffer.  The kernel will unmap this page later.
     */
    pte = vtopte(KERNBASE + (1 << PAGE_SHIFT));
    *pte = (1 << PAGE_SHIFT) | PG_RW | PG_V;
int15:
    /*
     * get memory map with INT 15:E820
     */
#define SMAPSIZ		sizeof(*smap)
#define SMAP_SIG	0x534D4150			/* 'SMAP' */

    smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
    vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);

    physmap_idx = 0;
    do {
        vmf.vmf_eax = 0xE820;
        vmf.vmf_edx = SMAP_SIG;
        vmf.vmf_ecx = SMAPSIZ;
        i = vm86_datacall(0x15, &vmf, &vmc);
        if (i || vmf.vmf_eax != SMAP_SIG)
            break;
        if (boothowto & RB_VERBOSE)
            kprintf("SMAP type=%02x base=%08x %08x len=%08x %08x\n",
                smap->type,
                *(u_int32_t *)((char *)&smap->base + 4),
                (u_int32_t)smap->base,
                *(u_int32_t *)((char *)&smap->length + 4),
                (u_int32_t)smap->length);

        if (smap->type != 0x01)
            goto next_run;

        if (smap->length == 0)
            goto next_run;

        if (smap->base >= 0xffffffff) {
            kprintf("%uK of memory above 4GB ignored\n",
                (u_int)(smap->length / 1024));
            goto next_run;
        }

        for (i = 0; i <= physmap_idx; i += 2) {
            if (smap->base < physmap[i + 1]) {
                if (boothowto & RB_VERBOSE)
                    kprintf(
    "Overlapping or non-monotonic memory region, ignoring second region\n");
                goto next_run;
            }
        }

        if (smap->base == physmap[physmap_idx + 1]) {
            physmap[physmap_idx + 1] += smap->length;
            goto next_run;
        }

        physmap_idx += 2;
        if (physmap_idx == PHYSMAP_ENTRIES*2) {
            kprintf(
    "Too many segments in the physical address map, giving up\n");
            break;
        }
        physmap[physmap_idx] = smap->base;
        physmap[physmap_idx + 1] = smap->base + smap->length;
next_run:
        ;	/* fix GCC3.x warning */
    } while (vmf.vmf_ebx != 0);
    /*
     * Perform "base memory" related probes & setup based on SMAP
     */
    for (i = 0; i <= physmap_idx; i += 2) {
        if (physmap[i] == 0x00000000) {
            basemem = physmap[i + 1] / 1024;
            break;
        }
    }

    if (basemem > 640) {
        kprintf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
            basemem);
    }

    for (pa = trunc_page(basemem * 1024);
         pa < ISA_HOLE_START; pa += PAGE_SIZE) {
        pte = vtopte(pa + KERNBASE);
        *pte = pa | PG_RW | PG_V;
    }

    for (i = basemem / 4; i < 160; i++)
        pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;

    if (physmap[1] != 0)
        goto physmap_done;

    /*
     * If we failed above, try memory map with INT 15:E801
     */
    vmf.vmf_ax = 0xE801;
    if (vm86_intcall(0x15, &vmf) == 0) {
        extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
    } else {
#if 0
        vm86_intcall(0x15, &vmf);
        extmem = vmf.vmf_ax;
#else
        /*
         * Prefer the RTC value for extended memory.
         */
        extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
#endif
    }

    /*
     * Special hack for chipsets that still remap the 384k hole when
     * there's 16MB of memory - this really confuses people that
     * are trying to use bus mastering ISA controllers with the
     * "16MB limit"; they only have 16MB, but the remapping puts
     * them beyond the limit.
     *
     * If extended memory is between 15-16MB (16-17MB phys address range),
     * chop it to 15MB.
     */
    if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
        extmem = 15 * 1024;

    physmap[1] = basemem * 1024;
    physmap_idx = 2;
    physmap[physmap_idx] = 0x100000;
    physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;

physmap_done:
    /*
     * Now, physmap contains a map of physical memory.
     */

    /* make hole for AP bootstrap code YYY */
    physmap[1] = mp_bootaddress(physmap[1] / 1024);

    /* look for the MP hardware - needed for apic addresses */

    /*
     * Maxmem isn't the "maximum memory", it's one larger than the
     * highest page of the physical address space.  It should be
     * called something like "Maxphyspage".  We may adjust this
     * based on ``hw.physmem'' and the results of the memory test.
     */
    Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
    Maxmem = MAXMEM / 4;
#endif
    /*
     * hw.physmem is a size in bytes; we also allow k, m, and g suffixes
     * for the appropriate modifiers.  This overrides MAXMEM.
     */
    if ((cp = kgetenv("hw.physmem")) != NULL) {
        u_int64_t AllowMem, sanity;

        sanity = AllowMem = strtouq(cp, &ep, 0);
        if ((ep != cp) && (*ep != 0)) {
            AllowMem = sanity = 0;
        }
        if (AllowMem < sanity)
            AllowMem = 0;
        if (AllowMem == 0)
            kprintf("Ignoring invalid memory size of '%s'\n", cp);
        else
            Maxmem = atop(AllowMem);
    }
    if (atop(physmap[physmap_idx + 1]) != Maxmem &&
        (boothowto & RB_VERBOSE))
        kprintf("Physical memory use set to %lluK\n", Maxmem * 4);

    /*
     * If Maxmem has been increased beyond what the system has detected,
     * extend the last memory segment to the new limit.
     */
    if (atop(physmap[physmap_idx + 1]) < Maxmem)
        physmap[physmap_idx + 1] = ptoa(Maxmem);

    /* call pmap initialization to make new kernel address space */
    pmap_bootstrap(first, 0);
    /*
     * Size up each available chunk of physical memory.
     */
    physmap[0] = PAGE_SIZE;		/* mask off page 0 */
    pa_indx = 0;
    phys_avail[pa_indx++] = physmap[0];
    phys_avail[pa_indx] = physmap[0];

    /*
     * Get dcons buffer address
     */
    if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
        kgetenv_quad("dcons.size", &dcons_size) == 0)
        dcons_addr = 0;

    /*
     * physmap is in bytes, so when converting to page boundaries,
     * round up the start address and round down the end address.
     */
    for (i = 0; i <= physmap_idx; i += 2) {
        vm_offset_t end;

        end = ptoa(Maxmem);
        if (physmap[i + 1] < end)
            end = trunc_page(physmap[i + 1]);
        for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
            int page_bad = FALSE;
            int *ptr = (int *)CADDR1;

            /*
             * block out kernel memory as not available.
             */
            if (pa >= 0x100000 && pa < first)
                continue;

            /*
             * block out dcons buffer
             */
            if (dcons_addr > 0
                && pa >= trunc_page(dcons_addr)
                && pa < dcons_addr + dcons_size)
                continue;

            /*
             * map page into kernel: valid, read/write, non-cacheable
             */
            *pte = pa | PG_V | PG_RW | PG_N;

            /*
             * Test for alternating 1's and 0's
             */
            *(volatile int *)ptr = 0xaaaaaaaa;
            if (*(volatile int *)ptr != 0xaaaaaaaa) {
                page_bad = TRUE;
            }
            /*
             * Test for alternating 0's and 1's
             */
            *(volatile int *)ptr = 0x55555555;
            if (*(volatile int *)ptr != 0x55555555) {
                page_bad = TRUE;
            }
            *(volatile int *)ptr = 0xffffffff;
            if (*(volatile int *)ptr != 0xffffffff) {
                page_bad = TRUE;
            }
            *(volatile int *)ptr = 0x0;
            if (*(volatile int *)ptr != 0x0) {
                page_bad = TRUE;
            }
            /*
             * Restore original value.
             */

            /*
             * Adjust array of valid/good pages.
             */
            if (page_bad == TRUE) {
                continue;
            }
            /*
             * If this good page is a continuation of the
             * previous set of good pages, then just increase
             * the end pointer.  Otherwise start a new chunk.
             * Note that "end" points one higher than end,
             * making the range >= start and < end.
             * If we're also doing a speculative memory
             * test and we at or past the end, bump up Maxmem
             * so that we keep going.  The first bad page
             * will terminate the loop.
             */
            if (phys_avail[pa_indx] == pa) {
                phys_avail[pa_indx] += PAGE_SIZE;
            } else {
                pa_indx++;
                if (pa_indx >= PHYSMAP_ENTRIES*2) {
                    kprintf(
    "Too many holes in the physical address space, giving up\n");
                    pa_indx--;
                    break;
                }
                phys_avail[pa_indx++] = pa;	/* start */
                phys_avail[pa_indx] = pa + PAGE_SIZE;	/* end */
            }
        }
    }

    /*
     * The last chunk must contain at least one page plus the message
     * buffer to avoid complicating other code (message buffer address
     * calculation, etc.).
     */
    while (phys_avail[pa_indx - 1] + PAGE_SIZE +
        round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) {
        physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
        phys_avail[pa_indx--] = 0;
        phys_avail[pa_indx--] = 0;
    }

    Maxmem = atop(phys_avail[pa_indx]);

    /* Trim off space for the message buffer. */
    phys_avail[pa_indx] -= round_page(MSGBUF_SIZE);

    avail_end = phys_avail[pa_indx];
}
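/*
 * On return from getmemsize(), Maxmem, physmem and the phys_avail[] pairs
 * reflect only pages that passed the write/read-back test above, and
 * avail_end deliberately sits below the space reserved for the kernel
 * message buffer that init386() maps just past it.
 */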
/*
 * IDT VECTORS:
 *	0	Divide by zero
 *	1	Debug
 *	2	NMI
 *	3	BreakPoint
 *	4	OverFlow
 *	5	Bound Range
 *	6	Invalid OpCode
 *	7	Device Not Available (x87)
 *	8	Double Fault
 *	9	Coprocessor Segment overrun (unsupported, reserved)
 *	10	Invalid TSS
 *	11	Segment not present
 *	12	Stack
 *	13	General Protection
 *	14	Page Fault
 *	15	Reserved
 *	16	x87 FP Exception pending
 *	17	Alignment Check
 *	18	Machine Check
 *	19	SIMD floating point
 *	20-31	reserved
 *	32-255	INTn/external sources
 */
void
init386(int first)
{
    struct gate_descriptor *gdp;
    int gsel_tss, metadata_missing, off, x;
    struct mdglobaldata *gd;

    /*
     * Prevent lowering of the ipl if we call tsleep() early.
     */
    gd = &CPU_prvspace[0].mdglobaldata;
    bzero(gd, sizeof(*gd));

    gd->mi.gd_curthread = &thread0;
    thread0.td_gd = &gd->mi;

    atdevbase = ISA_HOLE_START + KERNBASE;

    metadata_missing = 0;
    if (bootinfo.bi_modulep) {
        preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
        preload_bootstrap_relocate(KERNBASE);
    } else {
        metadata_missing = 1;
    }
    if (bootinfo.bi_envp)
        kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
    /*
     * start with one cpu.  Note: with one cpu, ncpus2_shift, ncpus2_mask,
     * and ncpus_fit_mask remain 0.
     */
    ncpus = 1;

    /* Init basic tunables, hz etc */

    /*
     * make gdt memory segments, the code segment goes up to end of the
     * page with etext in it, the data segment goes to the end of
     * the address space.
     *
     * XXX text protection is temporarily (?) disabled.  The limit was
     * i386_btop(round_page(etext)) - 1.
     */
    gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
    gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);

    gdt_segs[GPRIV_SEL].ssd_limit =
        atop(sizeof(struct privatespace) - 1);
    gdt_segs[GPRIV_SEL].ssd_base = (int) &CPU_prvspace[0];
    gdt_segs[GPROC0_SEL].ssd_base =
        (int) &CPU_prvspace[0].mdglobaldata.gd_common_tss;

    gd->mi.gd_prvspace = &CPU_prvspace[0];

    /*
     * Note: on both UP and SMP curthread must be set non-NULL
     * early in the boot sequence because the system assumes
     * that 'curthread' is never NULL.
     */
    for (x = 0; x < NGDT; x++) {
        /* avoid overwriting db entries with APM ones */
        if (x >= GAPMCODE32_SEL && x <= GAPMDATA_SEL)
            continue;
        ssdtosd(&gdt_segs[x], &gdt[x].sd);
    }

    r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
    r_gdt.rd_base = (int) gdt;

    mi_gdinit(&gd->mi, 0);
    mi_proc0init(&gd->mi, proc0paddr);
    safepri = TDPRI_MAX;
    /* make ldt memory segments */
    /*
     * XXX - VM_MAX_USER_ADDRESS is an end address, not a max.  And it
     * should be spelled ...MAX_USER...
     */
    ldt_segs[LUCODE_SEL].ssd_limit = atop(VM_MAX_USER_ADDRESS - 1);
    ldt_segs[LUDATA_SEL].ssd_limit = atop(VM_MAX_USER_ADDRESS - 1);
    for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
        ssdtosd(&ldt_segs[x], &ldt[x].sd);

    _default_ldt = GSEL(GLDT_SEL, SEL_KPL);
    gd->gd_currentldt = _default_ldt;

    /* spinlocks and the BGL */
    init_locks();
    /*
     * Setup the hardware exception table.  Most exceptions use
     * SDT_SYS386TGT, known as a 'trap gate'.  Trap gates leave
     * interrupts enabled.  VM page faults use SDT_SYS386IGT, known as
     * an 'interrupt trap gate', which disables interrupts on entry,
     * in order to be able to poll the appropriate CRn register to
     * determine the fault address.
     */
    for (x = 0; x < NIDT; x++) {
#ifdef DEBUG_INTERRUPTS
        setidt(x, Xrsvdary[x], SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
#else
        setidt(x, &IDTVEC(rsvd0), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
#endif
    }
    setidt(0, &IDTVEC(div),  SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(1, &IDTVEC(dbg),  SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(2, &IDTVEC(nmi),  SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(3, &IDTVEC(bpt),  SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(4, &IDTVEC(ofl),  SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(5, &IDTVEC(bnd),  SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(6, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(7, &IDTVEC(dna),  SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(8, 0,  SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
    setidt(9, &IDTVEC(fpusegm),  SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(10, &IDTVEC(tss),  SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(11, &IDTVEC(missing),  SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(12, &IDTVEC(stk),  SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(13, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(14, &IDTVEC(page),  SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(15, &IDTVEC(rsvd0),  SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(16, &IDTVEC(fpu),  SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(18, &IDTVEC(mchk),  SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(19, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(0x80, &IDTVEC(int0x80_syscall),
        SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));

    r_idt.rd_limit = sizeof(idt0) - 1;
    r_idt.rd_base = (int) idt;
    /*
     * Initialize the console before we print anything out.
     */
    cninit();

    if (metadata_missing)
        kprintf("WARNING: loader(8) metadata is missing!\n");

    if (boothowto & RB_KDB)
        Debugger("Boot flags requested debugger");

    finishidentcpu();	/* Final stage of CPU initialization */
    setidt(6, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    setidt(13, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
    initializecpu();	/* Initialize CPU registers */
    /*
     * make an initial tss so cpu can get interrupt stack on syscall!
     * The 16 bytes is to save room for a VM86 context.
     */
    gd->gd_common_tss.tss_esp0 = (int) thread0.td_pcb - 16;
    gd->gd_common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
    gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
    gd->gd_tss_gdt = &gdt[GPROC0_SEL].sd;
    gd->gd_common_tssd = *gd->gd_tss_gdt;
    gd->gd_common_tss.tss_ioopt = (sizeof gd->gd_common_tss) << 16;

    dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
        dblfault_tss.tss_esp2 = (int) &dblfault_stack[sizeof(dblfault_stack)];
    dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
        dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
    dblfault_tss.tss_cr3 = (int)IdlePTD;
    dblfault_tss.tss_eip = (int) dblfault_handler;
    dblfault_tss.tss_eflags = PSL_KERNEL;
    dblfault_tss.tss_ds = dblfault_tss.tss_es =
        dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
    dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
    dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
    dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);

    init_param2(physmem);

    /* now running on new page tables, configured, and u/iom is accessible */
    /* Map the message buffer. */
    for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
        pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);

    msgbufinit(msgbufp, MSGBUF_SIZE);

    /* make a call gate to reenter kernel with */
    gdp = &ldt[LSYS5CALLS_SEL].gd;

    x = (int) &IDTVEC(syscall);
    gdp->gd_looffset = x++;
    gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
    gdp->gd_type = SDT_SYS386CGT;
    gdp->gd_dpl = SEL_UPL;
    gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16;

    /* XXX does this work? */
    ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
    ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];
    /* transfer to user mode */
    _ucodesel = LSEL(LUCODE_SEL, SEL_UPL);
    _udatasel = LSEL(LUDATA_SEL, SEL_UPL);

    /* setup proc 0's pcb */
    thread0.td_pcb->pcb_flags = 0;
    thread0.td_pcb->pcb_cr3 = (int)IdlePTD;	/* should already be setup */
    thread0.td_pcb->pcb_ext = 0;
    lwp0.lwp_md.md_regs = &proc0_tf;
}
/*
 * Initialize machine-dependent portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
 */
cpu_gdinit(struct mdglobaldata *gd, int cpu)
        gd->mi.gd_curthread = &gd->mi.gd_idlethread;
        lwkt_init_thread(&gd->mi.gd_idlethread,
                        gd->mi.gd_prvspace->idlestack,
                        sizeof(gd->mi.gd_prvspace->idlestack),
                        TDF_MPSAFE, &gd->mi);
        lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
        gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
        gd->mi.gd_idlethread.td_sp -= sizeof(void *);
        *(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
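        /*
         * Priming td_sp this way means the idle thread's very first
         * context switch simply "returns" into cpu_idle_restore: the
         * stack pointer is backed up by one slot and that slot is filled
         * with the function address, roughly mimicking the frame the
         * switch code expects to pop.
         */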
is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)

        if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
            eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
globaldata_find(int cpu)

        KKASSERT(cpu >= 0 && cpu < ncpus);
        return(&CPU_prvspace[cpu].mdglobaldata.mi);
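/*
 * Each CPU owns one fixed-size slot in the CPU_prvspace[] array, so both
 * the range test in is_globaldata_space() and the lookup above reduce to
 * plain array arithmetic on the logical CPU id.
 */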
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_BOOT2_BIOS, SI_ORDER_ANY, f00f_hack, NULL);

f00f_hack(void *unused)

        struct gate_descriptor *new_idt;
2133 kprintf("Intel Pentium detected, installing workaround for F00F bug\n");
2135 r_idt
.rd_limit
= sizeof(idt0
) - 1;
2137 tmp
= kmem_alloc(&kernel_map
, PAGE_SIZE
* 2);
2139 panic("kmem_alloc returned 0");
2140 if (((unsigned int)tmp
& (PAGE_SIZE
-1)) != 0)
2141 panic("kmem_alloc returned non-page-aligned memory");
2142 /* Put the first seven entries in the lower page */
2143 new_idt
= (struct gate_descriptor
*)(tmp
+ PAGE_SIZE
- (7*8));
2144 bcopy(idt
, new_idt
, sizeof(idt0
));
2145 r_idt
.rd_base
= (int)new_idt
;
2148 if (vm_map_protect(&kernel_map
, tmp
, tmp
+ PAGE_SIZE
,
2149 VM_PROT_READ
, FALSE
) != KERN_SUCCESS
)
2150 panic("vm_map_protect failed");
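        /*
         * Layout note: new_idt is chosen so the first seven gate
         * descriptors (7 * 8 = 56 bytes) land at the very end of the page
         * that was just remapped read-only, with the rest of the table
         * spilling into the writable page after it.  The intent of the
         * published F00F workaround is that the errant locked IDT access
         * then faults and can be recovered from instead of hanging the
         * processor.
         */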
#endif /* defined(I586_CPU) && !NO_F00F_HACK */
ptrace_set_pc(struct lwp *lp, unsigned long addr)

        lp->lwp_md.md_regs->tf_eip = addr;
ptrace_single_step(struct lwp *lp)

        lp->lwp_md.md_regs->tf_eflags |= PSL_T;
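/*
 * PSL_T set above is the i386 trap flag: once the lwp resumes with it in
 * the saved eflags, the CPU raises a debug trap after each instruction,
 * which is exactly what single-stepping needs.
 */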
fill_regs(struct lwp *lp, struct reg *regs)

        struct trapframe *tp;

        tp = lp->lwp_md.md_regs;
        regs->r_gs = tp->tf_gs;
        regs->r_fs = tp->tf_fs;
        regs->r_es = tp->tf_es;
        regs->r_ds = tp->tf_ds;
        regs->r_edi = tp->tf_edi;
        regs->r_esi = tp->tf_esi;
        regs->r_ebp = tp->tf_ebp;
        regs->r_ebx = tp->tf_ebx;
        regs->r_edx = tp->tf_edx;
        regs->r_ecx = tp->tf_ecx;
        regs->r_eax = tp->tf_eax;
        regs->r_eip = tp->tf_eip;
        regs->r_cs = tp->tf_cs;
        regs->r_eflags = tp->tf_eflags;
        regs->r_esp = tp->tf_esp;
        regs->r_ss = tp->tf_ss;
        pcb = lp->lwp_thread->td_pcb;
set_regs(struct lwp *lp, struct reg *regs)

        struct trapframe *tp;

        tp = lp->lwp_md.md_regs;
        if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
            !CS_SECURE(regs->r_cs))
                return (EINVAL);
        tp->tf_gs = regs->r_gs;
        tp->tf_fs = regs->r_fs;
        tp->tf_es = regs->r_es;
        tp->tf_ds = regs->r_ds;
        tp->tf_edi = regs->r_edi;
        tp->tf_esi = regs->r_esi;
        tp->tf_ebp = regs->r_ebp;
        tp->tf_ebx = regs->r_ebx;
        tp->tf_edx = regs->r_edx;
        tp->tf_ecx = regs->r_ecx;
        tp->tf_eax = regs->r_eax;
        tp->tf_eip = regs->r_eip;
        tp->tf_cs = regs->r_cs;
        tp->tf_eflags = regs->r_eflags;
        tp->tf_esp = regs->r_esp;
        tp->tf_ss = regs->r_ss;
        pcb = lp->lwp_thread->td_pcb;
#ifndef CPU_DISABLE_SSE

fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)

        struct env87 *penv_87 = &sv_87->sv_env;
        struct envxmm *penv_xmm = &sv_xmm->sv_env;

        /* FPU control/status */
        penv_87->en_cw = penv_xmm->en_cw;
        penv_87->en_sw = penv_xmm->en_sw;
        penv_87->en_tw = penv_xmm->en_tw;
        penv_87->en_fip = penv_xmm->en_fip;
        penv_87->en_fcs = penv_xmm->en_fcs;
        penv_87->en_opcode = penv_xmm->en_opcode;
        penv_87->en_foo = penv_xmm->en_foo;
        penv_87->en_fos = penv_xmm->en_fos;

        for (i = 0; i < 8; ++i)
                sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;

        sv_87->sv_ex_sw = sv_xmm->sv_ex_sw;
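/*
 * save87 is the legacy fnsave image and savexmm the fxsave image.  The
 * environment words copied above exist in both layouts, and the eight
 * FP accumulators differ only in padding (each fxsave slot is 16 bytes,
 * with fp_acc holding the 80-bit value), which is what the simple
 * field-by-field and per-register copy here relies on.
 */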
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)

        struct env87 *penv_87 = &sv_87->sv_env;
        struct envxmm *penv_xmm = &sv_xmm->sv_env;

        /* FPU control/status */
        penv_xmm->en_cw = penv_87->en_cw;
        penv_xmm->en_sw = penv_87->en_sw;
        penv_xmm->en_tw = penv_87->en_tw;
        penv_xmm->en_fip = penv_87->en_fip;
        penv_xmm->en_fcs = penv_87->en_fcs;
        penv_xmm->en_opcode = penv_87->en_opcode;
        penv_xmm->en_foo = penv_87->en_foo;
        penv_xmm->en_fos = penv_87->en_fos;

        for (i = 0; i < 8; ++i)
                sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];

        sv_xmm->sv_ex_sw = sv_87->sv_ex_sw;

#endif /* CPU_DISABLE_SSE */
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)

#ifndef CPU_DISABLE_SSE
        fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
                        (struct save87 *)fpregs);
#endif /* CPU_DISABLE_SSE */
        bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
set_fpregs(struct lwp *lp, struct fpreg *fpregs)

#ifndef CPU_DISABLE_SSE
        set_fpregs_xmm((struct save87 *)fpregs,
                       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
#endif /* CPU_DISABLE_SSE */
        bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)

        dbregs->dr0 = rdr0();
        dbregs->dr1 = rdr1();
        dbregs->dr2 = rdr2();
        dbregs->dr3 = rdr3();
        dbregs->dr4 = rdr4();
        dbregs->dr5 = rdr5();
        dbregs->dr6 = rdr6();
        dbregs->dr7 = rdr7();

        pcb = lp->lwp_thread->td_pcb;
        dbregs->dr0 = pcb->pcb_dr0;
        dbregs->dr1 = pcb->pcb_dr1;
        dbregs->dr2 = pcb->pcb_dr2;
        dbregs->dr3 = pcb->pcb_dr3;
        dbregs->dr6 = pcb->pcb_dr6;
        dbregs->dr7 = pcb->pcb_dr7;
set_dbregs(struct lwp *lp, struct dbreg *dbregs)

        load_dr0(dbregs->dr0);
        load_dr1(dbregs->dr1);
        load_dr2(dbregs->dr2);
        load_dr3(dbregs->dr3);
        load_dr4(dbregs->dr4);
        load_dr5(dbregs->dr5);
        load_dr6(dbregs->dr6);
        load_dr7(dbregs->dr7);
        struct ucred *ucred;
        uint32_t mask1, mask2;

        /*
         * Don't let an illegal value for dr7 get set.  Specifically,
         * check for undefined settings.  Setting these bit patterns
         * results in undefined behaviour and can lead to an unexpected
         * reset.
         */
        for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 8;
             i++, mask1 <<= 2, mask2 <<= 2)
                if ((dbregs->dr7 & mask1) == mask2)
                        return (EINVAL);
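        /*
         * Bits 16..31 of %dr7 hold a 2-bit R/W field and a 2-bit LEN field
         * for each of the four breakpoints.  The loop above walks those
         * eight 2-bit fields (mask1 selects a field, mask2 is the 10b
         * pattern within it) and rejects the 10b encodings, which are the
         * undefined ones on a plain i386; e.g. dr7 = 0x00020000 would be
         * caught on the very first iteration.
         */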
        pcb = lp->lwp_thread->td_pcb;
        ucred = lp->lwp_proc->p_ucred;
        /*
         * Don't let a process set a breakpoint that is not within the
         * process's address space.  If a process could do this, it
         * could halt the system by setting a breakpoint in the kernel
         * (if ddb was enabled).  Thus, we need to check to make sure
         * that no breakpoints are being enabled for addresses outside
         * the process's address space, unless, perhaps, we were called
         * by root.
         *
         * XXX - what about when the watched area of the user's
         * address space is written into from within the kernel
         * ... wouldn't that still cause a breakpoint to be generated
         * from within kernel mode?
         */
        if (suser_cred(ucred, 0) != 0) {
                if (dbregs->dr7 & 0x3) {
                        /* dr0 is enabled */
                        if (dbregs->dr0 >= VM_MAX_USER_ADDRESS)
                                return (EINVAL);
                }
                if (dbregs->dr7 & (0x3<<2)) {
                        /* dr1 is enabled */
                        if (dbregs->dr1 >= VM_MAX_USER_ADDRESS)
                                return (EINVAL);
                }
                if (dbregs->dr7 & (0x3<<4)) {
                        /* dr2 is enabled */
                        if (dbregs->dr2 >= VM_MAX_USER_ADDRESS)
                                return (EINVAL);
                }
                if (dbregs->dr7 & (0x3<<6)) {
                        /* dr3 is enabled */
                        if (dbregs->dr3 >= VM_MAX_USER_ADDRESS)
                                return (EINVAL);
                }
        }
        pcb->pcb_dr0 = dbregs->dr0;
        pcb->pcb_dr1 = dbregs->dr1;
        pcb->pcb_dr2 = dbregs->dr2;
        pcb->pcb_dr3 = dbregs->dr3;
        pcb->pcb_dr6 = dbregs->dr6;
        pcb->pcb_dr7 = dbregs->dr7;

        pcb->pcb_flags |= PCB_DBREGS;
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
user_dbreg_trap(void)

        u_int32_t dr7, dr6;     /* debug registers dr6 and dr7 */
        u_int32_t bp;           /* breakpoint bits extracted from dr6 */
        int nbp;                /* number of breakpoints that triggered */
        caddr_t addr[4];        /* breakpoint addresses */
        if ((dr7 & 0x000000ff) == 0) {
                /*
                 * all GE and LE bits in the dr7 register are zero,
                 * thus the trap couldn't have been caused by the
                 * hardware debug registers
                 */
        bp = dr6 & 0x0000000f;

        /*
         * None of the breakpoint bits are set, meaning this
         * trap was not caused by any of the debug registers
         */

        /*
         * at least one of the breakpoints was hit, check to see
         * which ones and if any of them are user space addresses
         */
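        /*
         * The low four bits of %dr6 (bp here) are the B0..B3 status bits,
         * one per debug register: e.g. bp = 0x5 would mean dr0 and dr2
         * matched, so those are the addresses collected below and checked
         * against the user address limit.
         */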
        if (bp & 0x01) {
                addr[nbp++] = (caddr_t)rdr0();
        }
        if (bp & 0x02) {
                addr[nbp++] = (caddr_t)rdr1();
        }
        if (bp & 0x04) {
                addr[nbp++] = (caddr_t)rdr2();
        }
        if (bp & 0x08) {
                addr[nbp++] = (caddr_t)rdr3();
        }

        for (i = 0; i < nbp; i++) {
                if (addr[i] <
                    (caddr_t)VM_MAX_USER_ADDRESS) {
                        /*
                         * addr[i] is in user space
                         */

        /*
         * None of the breakpoints are in user space.
         */
Debugger(const char *msg)

        kprintf("Debugger(\"%s\") called.\n", msg);
/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called inside DDB.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */

/* silence compiler warnings */
void outb(u_int, u_char);
        /*
         * We use %%dx and not %1 here because i/o is done at %dx and not at
         * %edx, while gcc generates inferior code (movw instead of movl)
         * if we tell it to load (u_short) port.
         */
        __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
outb(u_int port, u_char data)

        /*
         * Use an unnecessary assignment to help gcc's register allocator.
         * This makes a large difference for gcc-1.40 and a tiny difference
         * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
         * best results.  gcc-2.6.0 can't handle this.
         */
        __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
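/*
 * Having real symbols also means ddb's "call" command can poke at I/O
 * ports while debugging, e.g. something along the lines of
 * "call inb(0x61)" to peek at the system control port.
 */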
#include "opt_cpu.h"

/*
 * initialize all the SMP locks
 */

/* critical region when masking or unmasking interrupts */
struct spinlock_deprecated imen_spinlock;
/* Make FAST_INTR() routines sequential */
struct spinlock_deprecated fast_intr_spinlock;

/* critical region for old style disable_intr/enable_intr */
struct spinlock_deprecated mpintr_spinlock;

/* critical region around INTR() routines */
struct spinlock_deprecated intr_spinlock;

/* lock region used by kernel profiling */
struct spinlock_deprecated mcount_spinlock;

/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct spinlock_deprecated com_spinlock;

/* locks kernel kprintfs */
struct spinlock_deprecated cons_spinlock;

/* lock regions around the clock hardware */
struct spinlock_deprecated clock_spinlock;

/* lock around the MP rendezvous */
struct spinlock_deprecated smp_rv_spinlock;
        /* mp_lock = 0; BSP already owns the MP lock */

        /*
         * Get the initial mp_lock with a count of 1 for the BSP.
         * This uses a LOGICAL cpu ID, ie BSP == 0.
         */
        cpu_get_initial_mplock();
        spin_lock_init(&mcount_spinlock);
        spin_lock_init(&fast_intr_spinlock);
        spin_lock_init(&intr_spinlock);
        spin_lock_init(&mpintr_spinlock);
        spin_lock_init(&imen_spinlock);
        spin_lock_init(&smp_rv_spinlock);
        spin_lock_init(&com_spinlock);
        spin_lock_init(&clock_spinlock);
        spin_lock_init(&cons_spinlock);
        /* our token pool needs to work early */
        lwkt_token_pool_init();