2 * Copyright (c) 1992 Terrence R. Lambert.
3 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
6 * This code is derived from software contributed to Berkeley by
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the University of
20 * California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
38 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
39 * $DragonFly: src/sys/platform/pc32/i386/machdep.c,v 1.124 2007/06/29 21:54:10 dillon Exp $
43 #include "use_ether.h"
46 #include "opt_atalk.h"
47 #include "opt_compat.h"
50 #include "opt_directio.h"
53 #include "opt_maxmem.h"
54 #include "opt_msgbuf.h"
55 #include "opt_perfmon.h"
57 #include "opt_userconfig.h"
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/sysproto.h>
62 #include <sys/signalvar.h>
63 #include <sys/kernel.h>
64 #include <sys/linker.h>
65 #include <sys/malloc.h>
68 #include <sys/reboot.h>
70 #include <sys/msgbuf.h>
71 #include <sys/sysent.h>
72 #include <sys/sysctl.h>
73 #include <sys/vmmeter.h>
75 #include <sys/upcall.h>
76 #include <sys/usched.h>
80 #include <vm/vm_param.h>
82 #include <vm/vm_kern.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_page.h>
85 #include <vm/vm_map.h>
86 #include <vm/vm_pager.h>
87 #include <vm/vm_extern.h>
89 #include <sys/thread2.h>
97 #include <machine/cpu.h>
98 #include <machine/clock.h>
99 #include <machine/specialreg.h>
100 #include <machine/bootinfo.h>
101 #include <machine/md_var.h>
102 #include <machine/pcb_ext.h> /* pcb.h included via sys/user.h */
103 #include <machine/globaldata.h> /* CPU_prvspace */
104 #include <machine/smp.h>
106 #include <machine/perfmon.h>
108 #include <machine/cputypes.h>
111 #include <bus/isa/i386/isa_device.h>
113 #include <machine_base/isa/intr_machdep.h>
114 #include <bus/isa/rtc.h>
115 #include <machine/vm86.h>
116 #include <sys/random.h>
117 #include <sys/ptrace.h>
118 #include <machine/sigframe.h>
120 #define PHYSMAP_ENTRIES 10
122 extern void init386 (int first
);
123 extern void dblfault_handler (void);
125 extern void printcpuinfo(void); /* XXX header file */
126 extern void finishidentcpu(void);
127 extern void panicifcpuunsupported(void);
128 extern void initializecpu(void);
130 static void cpu_startup (void *);
131 #ifndef CPU_DISABLE_SSE
132 static void set_fpregs_xmm (struct save87
*, struct savexmm
*);
133 static void fill_fpregs_xmm (struct savexmm
*, struct save87
*);
134 #endif /* CPU_DISABLE_SSE */
136 extern void ffs_rawread_setup(void);
137 #endif /* DIRECTIO */
138 static void init_locks(void);
140 SYSINIT(cpu
, SI_BOOT2_SMP
, SI_ORDER_FIRST
, cpu_startup
, NULL
)
142 int _udatasel
, _ucodesel
;
145 int64_t tsc_offsets
[MAXCPU
];
147 int64_t tsc_offsets
[1];
150 #if defined(SWTCH_OPTIM_STATS)
151 extern int swtch_optim_stats
;
152 SYSCTL_INT(_debug
, OID_AUTO
, swtch_optim_stats
,
153 CTLFLAG_RD
, &swtch_optim_stats
, 0, "");
154 SYSCTL_INT(_debug
, OID_AUTO
, tlb_flush_count
,
155 CTLFLAG_RD
, &tlb_flush_count
, 0, "");
161 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS
)
163 int error
= sysctl_handle_int(oidp
, 0, ctob(physmem
), req
);
167 SYSCTL_PROC(_hw
, HW_PHYSMEM
, physmem
, CTLTYPE_INT
|CTLFLAG_RD
,
168 0, 0, sysctl_hw_physmem
, "IU", "");
171 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS
)
173 int error
= sysctl_handle_int(oidp
, 0,
174 ctob(physmem
- vmstats
.v_wire_count
), req
);
178 SYSCTL_PROC(_hw
, HW_USERMEM
, usermem
, CTLTYPE_INT
|CTLFLAG_RD
,
179 0, 0, sysctl_hw_usermem
, "IU", "");
182 sysctl_hw_availpages(SYSCTL_HANDLER_ARGS
)
184 int error
= sysctl_handle_int(oidp
, 0,
185 i386_btop(avail_end
- avail_start
), req
);
189 SYSCTL_PROC(_hw
, OID_AUTO
, availpages
, CTLTYPE_INT
|CTLFLAG_RD
,
190 0, 0, sysctl_hw_availpages
, "I", "");
193 sysctl_machdep_msgbuf(SYSCTL_HANDLER_ARGS
)
197 /* Unwind the buffer, so that it's linear (possibly starting with
198 * some initial nulls).
200 error
=sysctl_handle_opaque(oidp
,msgbufp
->msg_ptr
+msgbufp
->msg_bufr
,
201 msgbufp
->msg_size
-msgbufp
->msg_bufr
,req
);
202 if(error
) return(error
);
203 if(msgbufp
->msg_bufr
>0) {
204 error
=sysctl_handle_opaque(oidp
,msgbufp
->msg_ptr
,
205 msgbufp
->msg_bufr
,req
);
210 SYSCTL_PROC(_machdep
, OID_AUTO
, msgbuf
, CTLTYPE_STRING
|CTLFLAG_RD
,
211 0, 0, sysctl_machdep_msgbuf
, "A","Contents of kernel message buffer");
213 static int msgbuf_clear
;
216 sysctl_machdep_msgbuf_clear(SYSCTL_HANDLER_ARGS
)
219 error
= sysctl_handle_int(oidp
, oidp
->oid_arg1
, oidp
->oid_arg2
,
221 if (!error
&& req
->newptr
) {
222 /* Clear the buffer and reset write pointer */
223 bzero(msgbufp
->msg_ptr
,msgbufp
->msg_size
);
224 msgbufp
->msg_bufr
=msgbufp
->msg_bufx
=0;
230 SYSCTL_PROC(_machdep
, OID_AUTO
, msgbuf_clear
, CTLTYPE_INT
|CTLFLAG_RW
,
231 &msgbuf_clear
, 0, sysctl_machdep_msgbuf_clear
, "I",
232 "Clear kernel message buffer");
234 vm_paddr_t Maxmem
= 0;
236 vm_paddr_t phys_avail
[PHYSMAP_ENTRIES
*2+2];
238 static vm_offset_t buffer_sva
, buffer_eva
;
239 vm_offset_t clean_sva
, clean_eva
;
240 static vm_offset_t pager_sva
, pager_eva
;
241 static struct trapframe proc0_tf
;
244 cpu_startup(void *dummy
)
248 vm_offset_t firstaddr
;
250 if (boothowto
& RB_VERBOSE
)
254 * Good {morning,afternoon,evening,night}.
256 kprintf("%s", version
);
259 panicifcpuunsupported();
263 kprintf("real memory = %llu (%lluK bytes)\n", ptoa(Maxmem
), ptoa(Maxmem
) / 1024);
265 * Display any holes after the first chunk of extended memory.
270 kprintf("Physical memory chunk(s):\n");
271 for (indx
= 0; phys_avail
[indx
+ 1] != 0; indx
+= 2) {
272 vm_paddr_t size1
= phys_avail
[indx
+ 1] - phys_avail
[indx
];
274 kprintf("0x%08llx - 0x%08llx, %llu bytes (%llu pages)\n",
275 phys_avail
[indx
], phys_avail
[indx
+ 1] - 1, size1
,
281 * Allocate space for system data structures.
282 * The first available kernel virtual address is in "v".
283 * As pages of kernel virtual memory are allocated, "v" is incremented.
284 * As pages of memory are allocated and cleared,
285 * "firstaddr" is incremented.
286 * An index into the kernel page table corresponding to the
287 * virtual memory address maintained in "v" is kept in "mapaddr".
291 * Make two passes. The first pass calculates how much memory is
292 * needed and allocates it. The second pass assigns virtual
293 * addresses to the various data structures.
297 v
= (caddr_t
)firstaddr
;
299 #define valloc(name, type, num) \
300 (name) = (type *)v; v = (caddr_t)((name)+(num))
301 #define valloclim(name, type, num, lim) \
302 (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
305 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
306 * For the first 64MB of ram nominally allocate sufficient buffers to
307 * cover 1/4 of our ram. Beyond the first 64MB allocate additional
308 * buffers to cover 1/20 of our ram over 64MB. When auto-sizing
309 * the buffer cache we limit the eventual kva reservation to
312 * factor represents the 1/4 x ram conversion.
315 int factor
= 4 * BKVASIZE
/ 1024;
316 int kbytes
= physmem
* (PAGE_SIZE
/ 1024);
320 nbuf
+= min((kbytes
- 4096) / factor
, 65536 / factor
);
322 nbuf
+= (kbytes
- 65536) * 2 / (factor
* 5);
323 if (maxbcache
&& nbuf
> maxbcache
/ BKVASIZE
)
324 nbuf
= maxbcache
/ BKVASIZE
;
328 * Do not allow the buffer_map to be more then 1/2 the size of the
331 if (nbuf
> (virtual_end
- virtual_start
) / (BKVASIZE
* 2)) {
332 nbuf
= (virtual_end
- virtual_start
) / (BKVASIZE
* 2);
333 kprintf("Warning: nbufs capped at %d\n", nbuf
);
336 nswbuf
= max(min(nbuf
/4, 256), 16);
338 if (nswbuf
< NSWBUF_MIN
)
345 valloc(swbuf
, struct buf
, nswbuf
);
346 valloc(buf
, struct buf
, nbuf
);
349 * End of first pass, size has been calculated so allocate memory
351 if (firstaddr
== 0) {
352 size
= (vm_size_t
)(v
- firstaddr
);
353 firstaddr
= kmem_alloc(&kernel_map
, round_page(size
));
355 panic("startup: no room for tables");
360 * End of second pass, addresses have been assigned
362 if ((vm_size_t
)(v
- firstaddr
) != size
)
363 panic("startup: table size inconsistency");
365 kmem_suballoc(&kernel_map
, &clean_map
, &clean_sva
, &clean_eva
,
366 (nbuf
*BKVASIZE
) + (nswbuf
*MAXPHYS
) + pager_map_size
);
367 kmem_suballoc(&clean_map
, &buffer_map
, &buffer_sva
, &buffer_eva
,
369 buffer_map
.system_map
= 1;
370 kmem_suballoc(&clean_map
, &pager_map
, &pager_sva
, &pager_eva
,
371 (nswbuf
*MAXPHYS
) + pager_map_size
);
372 pager_map
.system_map
= 1;
374 #if defined(USERCONFIG)
376 cninit(); /* the preferred console may have changed */
379 kprintf("avail memory = %u (%uK bytes)\n", ptoa(vmstats
.v_free_count
),
380 ptoa(vmstats
.v_free_count
) / 1024);
383 * Set up buffers, so they can be used to read disk labels.
386 vm_pager_bufferinit();
390 * OK, enough kmem_alloc/malloc state should be up, lets get on with it!
392 mp_start(); /* fire up the APs and APICs */
399 * Send an interrupt to process.
401 * Stack is set up to allow sigcode stored
402 * at top to call routine, followed by kcall
403 * to sigreturn routine below. After sigreturn
404 * resets the signal mask, the stack, and the
405 * frame pointer, it returns to the user
409 sendsig(sig_t catcher
, int sig
, sigset_t
*mask
, u_long code
)
411 struct lwp
*lp
= curthread
->td_lwp
;
412 struct proc
*p
= lp
->lwp_proc
;
413 struct trapframe
*regs
;
414 struct sigacts
*psp
= p
->p_sigacts
;
415 struct sigframe sf
, *sfp
;
418 regs
= lp
->lwp_md
.md_regs
;
419 oonstack
= (lp
->lwp_sigstk
.ss_flags
& SS_ONSTACK
) ? 1 : 0;
421 /* save user context */
422 bzero(&sf
, sizeof(struct sigframe
));
423 sf
.sf_uc
.uc_sigmask
= *mask
;
424 sf
.sf_uc
.uc_stack
= lp
->lwp_sigstk
;
425 sf
.sf_uc
.uc_mcontext
.mc_onstack
= oonstack
;
426 bcopy(regs
, &sf
.sf_uc
.uc_mcontext
.mc_gs
, sizeof(struct trapframe
));
428 /* make the size of the saved context visible to userland */
429 sf
.sf_uc
.uc_mcontext
.mc_len
= sizeof(sf
.sf_uc
.uc_mcontext
);
431 /* save mailbox pending state for syscall interlock semantics */
432 if (p
->p_flag
& P_MAILBOX
)
433 sf
.sf_uc
.uc_mcontext
.mc_xflags
|= PGEX_MAILBOX
;
435 /* Allocate and validate space for the signal handler context. */
436 if ((lp
->lwp_flag
& LWP_ALTSTACK
) != 0 && !oonstack
&&
437 SIGISMEMBER(psp
->ps_sigonstack
, sig
)) {
438 sfp
= (struct sigframe
*)(lp
->lwp_sigstk
.ss_sp
+
439 lp
->lwp_sigstk
.ss_size
- sizeof(struct sigframe
));
440 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
442 sfp
= (struct sigframe
*)regs
->tf_esp
- 1;
445 /* Translate the signal is appropriate */
446 if (p
->p_sysent
->sv_sigtbl
) {
447 if (sig
<= p
->p_sysent
->sv_sigsize
)
448 sig
= p
->p_sysent
->sv_sigtbl
[_SIG_IDX(sig
)];
451 /* Build the argument list for the signal handler. */
453 sf
.sf_ucontext
= (register_t
)&sfp
->sf_uc
;
454 if (SIGISMEMBER(psp
->ps_siginfo
, sig
)) {
455 /* Signal handler installed with SA_SIGINFO. */
456 sf
.sf_siginfo
= (register_t
)&sfp
->sf_si
;
457 sf
.sf_ahu
.sf_action
= (__siginfohandler_t
*)catcher
;
459 /* fill siginfo structure */
460 sf
.sf_si
.si_signo
= sig
;
461 sf
.sf_si
.si_code
= code
;
462 sf
.sf_si
.si_addr
= (void*)regs
->tf_err
;
465 /* Old FreeBSD-style arguments. */
466 sf
.sf_siginfo
= code
;
467 sf
.sf_addr
= regs
->tf_err
;
468 sf
.sf_ahu
.sf_handler
= catcher
;
472 * If we're a vm86 process, we want to save the segment registers.
473 * We also change eflags to be our emulated eflags, not the actual
476 if (regs
->tf_eflags
& PSL_VM
) {
477 struct trapframe_vm86
*tf
= (struct trapframe_vm86
*)regs
;
478 struct vm86_kernel
*vm86
= &lp
->lwp_thread
->td_pcb
->pcb_ext
->ext_vm86
;
480 sf
.sf_uc
.uc_mcontext
.mc_gs
= tf
->tf_vm86_gs
;
481 sf
.sf_uc
.uc_mcontext
.mc_fs
= tf
->tf_vm86_fs
;
482 sf
.sf_uc
.uc_mcontext
.mc_es
= tf
->tf_vm86_es
;
483 sf
.sf_uc
.uc_mcontext
.mc_ds
= tf
->tf_vm86_ds
;
485 if (vm86
->vm86_has_vme
== 0)
486 sf
.sf_uc
.uc_mcontext
.mc_eflags
=
487 (tf
->tf_eflags
& ~(PSL_VIF
| PSL_VIP
)) |
488 (vm86
->vm86_eflags
& (PSL_VIF
| PSL_VIP
));
491 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
492 * syscalls made by the signal handler. This just avoids
493 * wasting time for our lazy fixup of such faults. PSL_NT
494 * does nothing in vm86 mode, but vm86 programs can set it
495 * almost legitimately in probes for old cpu types.
497 tf
->tf_eflags
&= ~(PSL_VM
| PSL_NT
| PSL_VIF
| PSL_VIP
);
501 * Copy the sigframe out to the user's stack.
503 if (copyout(&sf
, sfp
, sizeof(struct sigframe
)) != 0) {
505 * Something is wrong with the stack pointer.
506 * ...Kill the process.
511 regs
->tf_esp
= (int)sfp
;
512 regs
->tf_eip
= PS_STRINGS
- *(p
->p_sysent
->sv_szsigcode
);
513 regs
->tf_eflags
&= ~PSL_T
;
514 regs
->tf_cs
= _ucodesel
;
515 regs
->tf_ds
= _udatasel
;
516 regs
->tf_es
= _udatasel
;
519 * Allow the signal handler to inherit %fs in addition to %gs as
520 * the userland program might be using both.
522 * However, if a T_PROTFLT occured the segment registers could be
523 * totally broken. They must be reset in order to be able to
524 * return to userland.
526 if (regs
->tf_trapno
== T_PROTFLT
) {
527 regs
->tf_fs
= _udatasel
;
528 regs
->tf_gs
= _udatasel
;
530 regs
->tf_ss
= _udatasel
;
534 * Sanitize the trapframe for a virtual kernel passing control to a custom
535 * VM context. Remove any items that would otherwise create a privilage
538 * XXX at the moment we allow userland to set the resume flag. Is this a
542 cpu_sanitize_frame(struct trapframe
*frame
)
544 frame
->tf_cs
= _ucodesel
;
545 frame
->tf_ds
= _udatasel
;
546 frame
->tf_es
= _udatasel
; /* XXX allow userland this one too? */
548 frame
->tf_fs
= _udatasel
;
549 frame
->tf_gs
= _udatasel
;
551 frame
->tf_ss
= _udatasel
;
552 frame
->tf_eflags
&= (PSL_RF
| PSL_USERCHANGE
);
553 frame
->tf_eflags
|= PSL_RESERVED_DEFAULT
| PSL_I
;
558 cpu_sanitize_tls(struct savetls
*tls
)
560 struct segment_descriptor
*desc
;
563 for (i
= 0; i
< NGTLS
; ++i
) {
565 if (desc
->sd_dpl
== 0 && desc
->sd_type
== 0)
567 if (desc
->sd_def32
== 0)
569 if (desc
->sd_type
!= SDT_MEMRWA
)
571 if (desc
->sd_dpl
!= SEL_UPL
)
573 if (desc
->sd_xx
!= 0 || desc
->sd_p
!= 1)
580 * sigreturn(ucontext_t *sigcntxp)
582 * System call to cleanup state after a signal
583 * has been taken. Reset signal mask and
584 * stack state from context left by sendsig (above).
585 * Return to previous pc and psl as specified by
586 * context left by sendsig. Check carefully to
587 * make sure that the user has not modified the
588 * state to gain improper privileges.
590 #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
591 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
594 sys_sigreturn(struct sigreturn_args
*uap
)
596 struct lwp
*lp
= curthread
->td_lwp
;
597 struct proc
*p
= lp
->lwp_proc
;
598 struct trapframe
*regs
;
604 if (!useracc((caddr_t
)ucp
, sizeof(ucontext_t
), VM_PROT_READ
))
607 regs
= lp
->lwp_md
.md_regs
;
608 eflags
= ucp
->uc_mcontext
.mc_eflags
;
610 if (eflags
& PSL_VM
) {
611 struct trapframe_vm86
*tf
= (struct trapframe_vm86
*)regs
;
612 struct vm86_kernel
*vm86
;
615 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
616 * set up the vm86 area, and we can't enter vm86 mode.
618 if (lp
->lwp_thread
->td_pcb
->pcb_ext
== 0)
620 vm86
= &lp
->lwp_thread
->td_pcb
->pcb_ext
->ext_vm86
;
621 if (vm86
->vm86_inited
== 0)
624 /* go back to user mode if both flags are set */
625 if ((eflags
& PSL_VIP
) && (eflags
& PSL_VIF
))
626 trapsignal(lp
, SIGBUS
, 0);
628 if (vm86
->vm86_has_vme
) {
629 eflags
= (tf
->tf_eflags
& ~VME_USERCHANGE
) |
630 (eflags
& VME_USERCHANGE
) | PSL_VM
;
632 vm86
->vm86_eflags
= eflags
; /* save VIF, VIP */
633 eflags
= (tf
->tf_eflags
& ~VM_USERCHANGE
) |
634 (eflags
& VM_USERCHANGE
) | PSL_VM
;
636 bcopy(&ucp
->uc_mcontext
.mc_gs
, tf
, sizeof(struct trapframe
));
637 tf
->tf_eflags
= eflags
;
638 tf
->tf_vm86_ds
= tf
->tf_ds
;
639 tf
->tf_vm86_es
= tf
->tf_es
;
640 tf
->tf_vm86_fs
= tf
->tf_fs
;
641 tf
->tf_vm86_gs
= tf
->tf_gs
;
642 tf
->tf_ds
= _udatasel
;
643 tf
->tf_es
= _udatasel
;
645 tf
->tf_fs
= _udatasel
;
646 tf
->tf_gs
= _udatasel
;
650 * Don't allow users to change privileged or reserved flags.
653 * XXX do allow users to change the privileged flag PSL_RF.
654 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers
655 * should sometimes set it there too. tf_eflags is kept in
656 * the signal context during signal handling and there is no
657 * other place to remember it, so the PSL_RF bit may be
658 * corrupted by the signal handler without us knowing.
659 * Corruption of the PSL_RF bit at worst causes one more or
660 * one less debugger trap, so allowing it is fairly harmless.
662 if (!EFL_SECURE(eflags
& ~PSL_RF
, regs
->tf_eflags
& ~PSL_RF
)) {
663 kprintf("sigreturn: eflags = 0x%x\n", eflags
);
668 * Don't allow users to load a valid privileged %cs. Let the
669 * hardware check for invalid selectors, excess privilege in
670 * other selectors, invalid %eip's and invalid %esp's.
672 cs
= ucp
->uc_mcontext
.mc_cs
;
673 if (!CS_SECURE(cs
)) {
674 kprintf("sigreturn: cs = 0x%x\n", cs
);
675 trapsignal(lp
, SIGBUS
, T_PROTFLT
);
678 bcopy(&ucp
->uc_mcontext
.mc_gs
, regs
, sizeof(struct trapframe
));
682 * Merge saved signal mailbox pending flag to maintain interlock
683 * semantics against system calls.
685 if (ucp
->uc_mcontext
.mc_xflags
& PGEX_MAILBOX
)
686 p
->p_flag
|= P_MAILBOX
;
688 if (ucp
->uc_mcontext
.mc_onstack
& 1)
689 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
691 lp
->lwp_sigstk
.ss_flags
&= ~SS_ONSTACK
;
693 lp
->lwp_sigmask
= ucp
->uc_sigmask
;
694 SIG_CANTMASK(lp
->lwp_sigmask
);
699 * Stack frame on entry to function. %eax will contain the function vector,
700 * %ecx will contain the function data. flags, ecx, and eax will have
701 * already been pushed on the stack.
712 sendupcall(struct vmupcall
*vu
, int morepending
)
714 struct lwp
*lp
= curthread
->td_lwp
;
715 struct proc
*p
= lp
->lwp_proc
;
716 struct trapframe
*regs
;
717 struct upcall upcall
;
718 struct upc_frame upc_frame
;
722 * If we are a virtual kernel running an emulated user process
723 * context, switch back to the virtual kernel context before
724 * trying to post the signal.
727 lp
->lwp_md
.md_regs
->tf_trapno
= 0;
728 vkernel_trap(lp
, lp
->lwp_md
.md_regs
);
732 * Get the upcall data structure
734 if (copyin(lp
->lwp_upcall
, &upcall
, sizeof(upcall
)) ||
735 copyin((char *)upcall
.upc_uthread
+ upcall
.upc_critoff
, &crit_count
, sizeof(int))
738 kprintf("bad upcall address\n");
743 * If the data structure is already marked pending or has a critical
744 * section count, mark the data structure as pending and return
745 * without doing an upcall. vu_pending is left set.
747 if (upcall
.upc_pending
|| crit_count
>= vu
->vu_pending
) {
748 if (upcall
.upc_pending
< vu
->vu_pending
) {
749 upcall
.upc_pending
= vu
->vu_pending
;
750 copyout(&upcall
.upc_pending
, &lp
->lwp_upcall
->upc_pending
,
751 sizeof(upcall
.upc_pending
));
757 * We can run this upcall now, clear vu_pending.
759 * Bump our critical section count and set or clear the
760 * user pending flag depending on whether more upcalls are
761 * pending. The user will be responsible for calling
762 * upc_dispatch(-1) to process remaining upcalls.
765 upcall
.upc_pending
= morepending
;
766 crit_count
+= TDPRI_CRIT
;
767 copyout(&upcall
.upc_pending
, &lp
->lwp_upcall
->upc_pending
,
768 sizeof(upcall
.upc_pending
));
769 copyout(&crit_count
, (char *)upcall
.upc_uthread
+ upcall
.upc_critoff
,
773 * Construct a stack frame and issue the upcall
775 regs
= lp
->lwp_md
.md_regs
;
776 upc_frame
.eax
= regs
->tf_eax
;
777 upc_frame
.ecx
= regs
->tf_ecx
;
778 upc_frame
.edx
= regs
->tf_edx
;
779 upc_frame
.flags
= regs
->tf_eflags
;
780 upc_frame
.oldip
= regs
->tf_eip
;
781 if (copyout(&upc_frame
, (void *)(regs
->tf_esp
- sizeof(upc_frame
)),
782 sizeof(upc_frame
)) != 0) {
783 kprintf("bad stack on upcall\n");
785 regs
->tf_eax
= (register_t
)vu
->vu_func
;
786 regs
->tf_ecx
= (register_t
)vu
->vu_data
;
787 regs
->tf_edx
= (register_t
)lp
->lwp_upcall
;
788 regs
->tf_eip
= (register_t
)vu
->vu_ctx
;
789 regs
->tf_esp
-= sizeof(upc_frame
);
794 * fetchupcall occurs in the context of a system call, which means that
795 * we have to return EJUSTRETURN in order to prevent eax and edx from
796 * being overwritten by the syscall return value.
798 * if vu is not NULL we return the new context in %edx, the new data in %ecx,
799 * and the function pointer in %eax.
802 fetchupcall (struct vmupcall
*vu
, int morepending
, void *rsp
)
804 struct upc_frame upc_frame
;
805 struct lwp
*lp
= curthread
->td_lwp
;
806 struct trapframe
*regs
;
808 struct upcall upcall
;
811 regs
= lp
->lwp_md
.md_regs
;
813 error
= copyout(&morepending
, &lp
->lwp_upcall
->upc_pending
, sizeof(int));
817 * This jumps us to the next ready context.
820 error
= copyin(lp
->lwp_upcall
, &upcall
, sizeof(upcall
));
823 error
= copyin((char *)upcall
.upc_uthread
+ upcall
.upc_critoff
, &crit_count
, sizeof(int));
824 crit_count
+= TDPRI_CRIT
;
826 error
= copyout(&crit_count
, (char *)upcall
.upc_uthread
+ upcall
.upc_critoff
, sizeof(int));
827 regs
->tf_eax
= (register_t
)vu
->vu_func
;
828 regs
->tf_ecx
= (register_t
)vu
->vu_data
;
829 regs
->tf_edx
= (register_t
)lp
->lwp_upcall
;
830 regs
->tf_eip
= (register_t
)vu
->vu_ctx
;
831 regs
->tf_esp
= (register_t
)rsp
;
834 * This returns us to the originally interrupted code.
836 error
= copyin(rsp
, &upc_frame
, sizeof(upc_frame
));
837 regs
->tf_eax
= upc_frame
.eax
;
838 regs
->tf_ecx
= upc_frame
.ecx
;
839 regs
->tf_edx
= upc_frame
.edx
;
840 regs
->tf_eflags
= (regs
->tf_eflags
& ~PSL_USERCHANGE
) |
841 (upc_frame
.flags
& PSL_USERCHANGE
);
842 regs
->tf_eip
= upc_frame
.oldip
;
843 regs
->tf_esp
= (register_t
)((char *)rsp
+ sizeof(upc_frame
));
852 * Machine dependent boot() routine
854 * I haven't seen anything to put here yet
855 * Possibly some stuff might be grafted back here from boot()
863 * Shutdown the CPU as much as possible
869 __asm__
__volatile("hlt");
873 * cpu_idle() represents the idle LWKT. You cannot return from this function
874 * (unless you want to blow things up!). Instead we look for runnable threads
875 * and loop or halt as appropriate. Giant is not held on entry to the thread.
877 * The main loop is entered with a critical section held, we must release
878 * the critical section before doing anything else. lwkt_switch() will
879 * check for pending interrupts due to entering and exiting its own
882 * Note on cpu_idle_hlt: On an SMP system we rely on a scheduler IPI
883 * to wake a HLTed cpu up. However, there are cases where the idlethread
884 * will be entered with the possibility that no IPI will occur and in such
885 * cases lwkt_switch() sets TDF_IDLE_NOHLT.
887 static int cpu_idle_hlt
= 1;
888 static int cpu_idle_hltcnt
;
889 static int cpu_idle_spincnt
;
890 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_hlt
, CTLFLAG_RW
,
891 &cpu_idle_hlt
, 0, "Idle loop HLT enable");
892 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_hltcnt
, CTLFLAG_RW
,
893 &cpu_idle_hltcnt
, 0, "Idle loop entry halts");
894 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_spincnt
, CTLFLAG_RW
,
895 &cpu_idle_spincnt
, 0, "Idle loop entry spins");
/*
 * Default idle hook: enable interrupts and halt until the next interrupt.
 */
static void
cpu_idle_default_hook(void)
{
	/*
	 * We must guarantee that hlt is exactly the instruction
	 * following the sti: otherwise an interrupt arriving between
	 * the two could be serviced and we would halt with nothing
	 * left to wake us.
	 */
	__asm __volatile("sti; hlt");
}

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;
913 struct thread
*td
= curthread
;
916 KKASSERT(td
->td_pri
< TDPRI_CRIT
);
919 * See if there are any LWKTs ready to go.
924 * If we are going to halt call splz unconditionally after
925 * CLIing to catch any interrupt races. Note that we are
926 * at SPL0 and interrupts are enabled.
928 if (cpu_idle_hlt
&& !lwkt_runnable() &&
929 (td
->td_flags
& TDF_IDLE_NOHLT
) == 0) {
930 __asm
__volatile("cli");
932 if (!lwkt_runnable())
936 __asm
__volatile("pause");
940 td
->td_flags
&= ~TDF_IDLE_NOHLT
;
943 __asm
__volatile("sti; pause");
945 __asm
__volatile("sti");
953 * Clear registers on exec
956 exec_setregs(u_long entry
, u_long stack
, u_long ps_strings
)
958 struct thread
*td
= curthread
;
959 struct lwp
*lp
= td
->td_lwp
;
960 struct pcb
*pcb
= td
->td_pcb
;
961 struct trapframe
*regs
= lp
->lwp_md
.md_regs
;
963 /* was i386_user_cleanup() in NetBSD */
966 bzero((char *)regs
, sizeof(struct trapframe
));
967 regs
->tf_eip
= entry
;
968 regs
->tf_esp
= stack
;
969 regs
->tf_eflags
= PSL_USER
| (regs
->tf_eflags
& PSL_T
);
970 regs
->tf_ss
= _udatasel
;
971 regs
->tf_ds
= _udatasel
;
972 regs
->tf_es
= _udatasel
;
973 regs
->tf_fs
= _udatasel
;
974 regs
->tf_gs
= _udatasel
;
975 regs
->tf_cs
= _ucodesel
;
977 /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */
978 regs
->tf_ebx
= ps_strings
;
981 * Reset the hardware debug registers if they were in use.
982 * They won't have any meaning for the newly exec'd process.
984 if (pcb
->pcb_flags
& PCB_DBREGS
) {
991 if (pcb
== td
->td_pcb
) {
993 * Clear the debug registers on the running
994 * CPU, otherwise they will end up affecting
995 * the next process we switch to.
999 pcb
->pcb_flags
&= ~PCB_DBREGS
;
1003 * Initialize the math emulator (if any) for the current process.
1004 * Actually, just clear the bit that says that the emulator has
1005 * been initialized. Initialization is delayed until the process
1006 * traps to the emulator (if it is done at all) mainly because
1007 * emulators don't provide an entry point for initialization.
1009 pcb
->pcb_flags
&= ~FP_SOFTFP
;
1012 * note: do not set CR0_TS here. npxinit() must do it after clearing
1013 * gd_npxthread. Otherwise a preemptive interrupt thread may panic
1017 load_cr0(rcr0() | CR0_MP
);
1020 /* Initialize the npx (if any) for the current process. */
1021 npxinit(__INITIAL_NPXCW__
);
1026 * note: linux emulator needs edx to be 0x0 on entry, which is
1027 * handled in execve simply by setting the 64 bit syscall
1028 * return value to 0.
1038 cr0
|= CR0_NE
; /* Done by npxinit() */
1039 cr0
|= CR0_MP
| CR0_TS
; /* Done at every execve() too. */
1041 if (cpu_class
!= CPUCLASS_386
)
1043 cr0
|= CR0_WP
| CR0_AM
;
1049 sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS
)
1052 error
= sysctl_handle_int(oidp
, oidp
->oid_arg1
, oidp
->oid_arg2
,
1054 if (!error
&& req
->newptr
)
1059 SYSCTL_PROC(_machdep
, CPU_ADJKERNTZ
, adjkerntz
, CTLTYPE_INT
|CTLFLAG_RW
,
1060 &adjkerntz
, 0, sysctl_machdep_adjkerntz
, "I", "");
1062 SYSCTL_INT(_machdep
, CPU_DISRTCSET
, disable_rtc_set
,
1063 CTLFLAG_RW
, &disable_rtc_set
, 0, "");
1065 SYSCTL_STRUCT(_machdep
, CPU_BOOTINFO
, bootinfo
,
1066 CTLFLAG_RD
, &bootinfo
, bootinfo
, "");
1068 SYSCTL_INT(_machdep
, CPU_WALLCLOCK
, wall_cmos_clock
,
1069 CTLFLAG_RW
, &wall_cmos_clock
, 0, "");
1071 extern u_long bootdev
; /* not a cdev_t - encoding is different */
1072 SYSCTL_ULONG(_machdep
, OID_AUTO
, guessed_bootdev
,
1073 CTLFLAG_RD
, &bootdev
, 0, "Boot device (not in cdev_t format)");
1076 * Initialize 386 and configure to run kernel
1080 * Initialize segments & interrupt table
1084 union descriptor gdt
[NGDT
* MAXCPU
]; /* global descriptor table */
1085 static struct gate_descriptor idt0
[NIDT
];
1086 struct gate_descriptor
*idt
= &idt0
[0]; /* interrupt descriptor table */
1087 union descriptor ldt
[NLDT
]; /* local descriptor table */
1089 /* table descriptors - used to load tables by cpu */
1090 struct region_descriptor r_gdt
, r_idt
;
1092 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
1093 extern int has_f00f_bug
;
1096 static struct i386tss dblfault_tss
;
1097 static char dblfault_stack
[PAGE_SIZE
];
1099 extern struct user
*proc0paddr
;
1102 /* software prototypes -- in more palatable form */
1103 struct soft_segment_descriptor gdt_segs
[] = {
1104 /* GNULL_SEL 0 Null Descriptor */
1105 { 0x0, /* segment base address */
1107 0, /* segment type */
1108 0, /* segment descriptor priority level */
1109 0, /* segment descriptor present */
1111 0, /* default 32 vs 16 bit size */
1112 0 /* limit granularity (byte/page units)*/ },
1113 /* GCODE_SEL 1 Code Descriptor for kernel */
1114 { 0x0, /* segment base address */
1115 0xfffff, /* length - all address space */
1116 SDT_MEMERA
, /* segment type */
1117 0, /* segment descriptor priority level */
1118 1, /* segment descriptor present */
1120 1, /* default 32 vs 16 bit size */
1121 1 /* limit granularity (byte/page units)*/ },
1122 /* GDATA_SEL 2 Data Descriptor for kernel */
1123 { 0x0, /* segment base address */
1124 0xfffff, /* length - all address space */
1125 SDT_MEMRWA
, /* segment type */
1126 0, /* segment descriptor priority level */
1127 1, /* segment descriptor present */
1129 1, /* default 32 vs 16 bit size */
1130 1 /* limit granularity (byte/page units)*/ },
1131 /* GPRIV_SEL 3 SMP Per-Processor Private Data Descriptor */
1132 { 0x0, /* segment base address */
1133 0xfffff, /* length - all address space */
1134 SDT_MEMRWA
, /* segment type */
1135 0, /* segment descriptor priority level */
1136 1, /* segment descriptor present */
1138 1, /* default 32 vs 16 bit size */
1139 1 /* limit granularity (byte/page units)*/ },
1140 /* GPROC0_SEL 4 Proc 0 Tss Descriptor */
1142 0x0, /* segment base address */
1143 sizeof(struct i386tss
)-1,/* length - all address space */
1144 SDT_SYS386TSS
, /* segment type */
1145 0, /* segment descriptor priority level */
1146 1, /* segment descriptor present */
1148 0, /* unused - default 32 vs 16 bit size */
1149 0 /* limit granularity (byte/page units)*/ },
1150 /* GLDT_SEL 5 LDT Descriptor */
1151 { (int) ldt
, /* segment base address */
1152 sizeof(ldt
)-1, /* length - all address space */
1153 SDT_SYSLDT
, /* segment type */
1154 SEL_UPL
, /* segment descriptor priority level */
1155 1, /* segment descriptor present */
1157 0, /* unused - default 32 vs 16 bit size */
1158 0 /* limit granularity (byte/page units)*/ },
1159 /* GUSERLDT_SEL 6 User LDT Descriptor per process */
1160 { (int) ldt
, /* segment base address */
1161 (512 * sizeof(union descriptor
)-1), /* length */
1162 SDT_SYSLDT
, /* segment type */
1163 0, /* segment descriptor priority level */
1164 1, /* segment descriptor present */
1166 0, /* unused - default 32 vs 16 bit size */
1167 0 /* limit granularity (byte/page units)*/ },
1168 /* GTGATE_SEL 7 Null Descriptor - Placeholder */
1169 { 0x0, /* segment base address */
1170 0x0, /* length - all address space */
1171 0, /* segment type */
1172 0, /* segment descriptor priority level */
1173 0, /* segment descriptor present */
1175 0, /* default 32 vs 16 bit size */
1176 0 /* limit granularity (byte/page units)*/ },
1177 /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
1178 { 0x400, /* segment base address */
1179 0xfffff, /* length */
1180 SDT_MEMRWA
, /* segment type */
1181 0, /* segment descriptor priority level */
1182 1, /* segment descriptor present */
1184 1, /* default 32 vs 16 bit size */
1185 1 /* limit granularity (byte/page units)*/ },
1186 /* GPANIC_SEL 9 Panic Tss Descriptor */
1187 { (int) &dblfault_tss
, /* segment base address */
1188 sizeof(struct i386tss
)-1,/* length - all address space */
1189 SDT_SYS386TSS
, /* segment type */
1190 0, /* segment descriptor priority level */
1191 1, /* segment descriptor present */
1193 0, /* unused - default 32 vs 16 bit size */
1194 0 /* limit granularity (byte/page units)*/ },
1195 /* GBIOSCODE32_SEL 10 BIOS 32-bit interface (32bit Code) */
1196 { 0, /* segment base address (overwritten) */
1197 0xfffff, /* length */
1198 SDT_MEMERA
, /* segment type */
1199 0, /* segment descriptor priority level */
1200 1, /* segment descriptor present */
1202 0, /* default 32 vs 16 bit size */
1203 1 /* limit granularity (byte/page units)*/ },
1204 /* GBIOSCODE16_SEL 11 BIOS 32-bit interface (16bit Code) */
1205 { 0, /* segment base address (overwritten) */
1206 0xfffff, /* length */
1207 SDT_MEMERA
, /* segment type */
1208 0, /* segment descriptor priority level */
1209 1, /* segment descriptor present */
1211 0, /* default 32 vs 16 bit size */
1212 1 /* limit granularity (byte/page units)*/ },
1213 /* GBIOSDATA_SEL 12 BIOS 32-bit interface (Data) */
1214 { 0, /* segment base address (overwritten) */
1215 0xfffff, /* length */
1216 SDT_MEMRWA
, /* segment type */
1217 0, /* segment descriptor priority level */
1218 1, /* segment descriptor present */
1220 1, /* default 32 vs 16 bit size */
1221 1 /* limit granularity (byte/page units)*/ },
1222 /* GBIOSUTIL_SEL 13 BIOS 16-bit interface (Utility) */
1223 { 0, /* segment base address (overwritten) */
1224 0xfffff, /* length */
1225 SDT_MEMRWA
, /* segment type */
1226 0, /* segment descriptor priority level */
1227 1, /* segment descriptor present */
1229 0, /* default 32 vs 16 bit size */
1230 1 /* limit granularity (byte/page units)*/ },
1231 /* GBIOSARGS_SEL 14 BIOS 16-bit interface (Arguments) */
1232 { 0, /* segment base address (overwritten) */
1233 0xfffff, /* length */
1234 SDT_MEMRWA
, /* segment type */
1235 0, /* segment descriptor priority level */
1236 1, /* segment descriptor present */
1238 0, /* default 32 vs 16 bit size */
1239 1 /* limit granularity (byte/page units)*/ },
1240 /* GTLS_START 15 TLS */
1241 { 0x0, /* segment base address */
1243 0, /* segment type */
1244 0, /* segment descriptor priority level */
1245 0, /* segment descriptor present */
1247 0, /* default 32 vs 16 bit size */
1248 0 /* limit granularity (byte/page units)*/ },
1249 /* GTLS_START+1 16 TLS */
1250 { 0x0, /* segment base address */
1252 0, /* segment type */
1253 0, /* segment descriptor priority level */
1254 0, /* segment descriptor present */
1256 0, /* default 32 vs 16 bit size */
1257 0 /* limit granularity (byte/page units)*/ },
1258 /* GTLS_END 17 TLS */
1259 { 0x0, /* segment base address */
1261 0, /* segment type */
1262 0, /* segment descriptor priority level */
1263 0, /* segment descriptor present */
1265 0, /* default 32 vs 16 bit size */
1266 0 /* limit granularity (byte/page units)*/ },
1269 static struct soft_segment_descriptor ldt_segs
[] = {
1270 /* Null Descriptor - overwritten by call gate */
1271 { 0x0, /* segment base address */
1272 0x0, /* length - all address space */
1273 0, /* segment type */
1274 0, /* segment descriptor priority level */
1275 0, /* segment descriptor present */
1277 0, /* default 32 vs 16 bit size */
1278 0 /* limit granularity (byte/page units)*/ },
1279 /* Null Descriptor - overwritten by call gate */
1280 { 0x0, /* segment base address */
1281 0x0, /* length - all address space */
1282 0, /* segment type */
1283 0, /* segment descriptor priority level */
1284 0, /* segment descriptor present */
1286 0, /* default 32 vs 16 bit size */
1287 0 /* limit granularity (byte/page units)*/ },
1288 /* Null Descriptor - overwritten by call gate */
1289 { 0x0, /* segment base address */
1290 0x0, /* length - all address space */
1291 0, /* segment type */
1292 0, /* segment descriptor priority level */
1293 0, /* segment descriptor present */
1295 0, /* default 32 vs 16 bit size */
1296 0 /* limit granularity (byte/page units)*/ },
1297 /* Code Descriptor for user */
1298 { 0x0, /* segment base address */
1299 0xfffff, /* length - all address space */
1300 SDT_MEMERA
, /* segment type */
1301 SEL_UPL
, /* segment descriptor priority level */
1302 1, /* segment descriptor present */
1304 1, /* default 32 vs 16 bit size */
1305 1 /* limit granularity (byte/page units)*/ },
1306 /* Null Descriptor - overwritten by call gate */
1307 { 0x0, /* segment base address */
1308 0x0, /* length - all address space */
1309 0, /* segment type */
1310 0, /* segment descriptor priority level */
1311 0, /* segment descriptor present */
1313 0, /* default 32 vs 16 bit size */
1314 0 /* limit granularity (byte/page units)*/ },
1315 /* Data Descriptor for user */
1316 { 0x0, /* segment base address */
1317 0xfffff, /* length - all address space */
1318 SDT_MEMRWA
, /* segment type */
1319 SEL_UPL
, /* segment descriptor priority level */
1320 1, /* segment descriptor present */
1322 1, /* default 32 vs 16 bit size */
1323 1 /* limit granularity (byte/page units)*/ },
1327 setidt(int idx
, inthand_t
*func
, int typ
, int dpl
, int selec
)
1329 struct gate_descriptor
*ip
;
1332 ip
->gd_looffset
= (int)func
;
1333 ip
->gd_selector
= selec
;
1339 ip
->gd_hioffset
= ((int)func
)>>16 ;
1342 #define IDTVEC(name) __CONCAT(X,name)
1345 IDTVEC(div
), IDTVEC(dbg
), IDTVEC(nmi
), IDTVEC(bpt
), IDTVEC(ofl
),
1346 IDTVEC(bnd
), IDTVEC(ill
), IDTVEC(dna
), IDTVEC(fpusegm
),
1347 IDTVEC(tss
), IDTVEC(missing
), IDTVEC(stk
), IDTVEC(prot
),
1348 IDTVEC(page
), IDTVEC(mchk
), IDTVEC(fpu
), IDTVEC(align
),
1349 IDTVEC(xmm
), IDTVEC(syscall
),
1352 IDTVEC(int0x80_syscall
);
1354 #ifdef DEBUG_INTERRUPTS
1355 extern inthand_t
*Xrsvdary
[256];
1359 sdtossd(struct segment_descriptor
*sd
, struct soft_segment_descriptor
*ssd
)
1361 ssd
->ssd_base
= (sd
->sd_hibase
<< 24) | sd
->sd_lobase
;
1362 ssd
->ssd_limit
= (sd
->sd_hilimit
<< 16) | sd
->sd_lolimit
;
1363 ssd
->ssd_type
= sd
->sd_type
;
1364 ssd
->ssd_dpl
= sd
->sd_dpl
;
1365 ssd
->ssd_p
= sd
->sd_p
;
1366 ssd
->ssd_def32
= sd
->sd_def32
;
1367 ssd
->ssd_gran
= sd
->sd_gran
;
1371 * Populate the (physmap) array with base/bound pairs describing the
1372 * available physical memory in the system, then test this memory and
1373 * build the phys_avail array describing the actually-available memory.
1375 * If we cannot accurately determine the physical memory map, then use
1376 * value from the 0xE801 call, and failing that, the RTC.
1378 * Total memory size may be set by the kernel environment variable
1379 * hw.physmem or the compile-time define MAXMEM.
1382 getmemsize(int first
)
1384 int i
, physmap_idx
, pa_indx
;
1386 u_int basemem
, extmem
;
1387 struct vm86frame vmf
;
1388 struct vm86context vmc
;
1390 vm_offset_t physmap
[PHYSMAP_ENTRIES
*2];
1398 quad_t dcons_addr
, dcons_size
;
1401 TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12
);
1402 bzero(&vmf
, sizeof(struct vm86frame
));
1403 bzero(physmap
, sizeof(physmap
));
1407 * Some newer BIOSes has broken INT 12H implementation which cause
1408 * kernel panic immediately. In this case, we need to scan SMAP
1409 * with INT 15:E820 first, then determine base memory size.
1411 if (hasbrokenint12
) {
1416 * Perform "base memory" related probes & setup. If we get a crazy
1417 * value give the bios some scribble space just in case.
1419 vm86_intcall(0x12, &vmf
);
1420 basemem
= vmf
.vmf_ax
;
1421 if (basemem
> 640) {
1422 kprintf("Preposterous BIOS basemem of %uK, "
1423 "truncating to < 640K\n", basemem
);
1428 * XXX if biosbasemem is now < 640, there is a `hole'
1429 * between the end of base memory and the start of
1430 * ISA memory. The hole may be empty or it may
1431 * contain BIOS code or data. Map it read/write so
1432 * that the BIOS can write to it. (Memory from 0 to
1433 * the physical end of the kernel is mapped read-only
1434 * to begin with and then parts of it are remapped.
1435 * The parts that aren't remapped form holes that
1436 * remain read-only and are unused by the kernel.
1437 * The base memory area is below the physical end of
1438 * the kernel and right now forms a read-only hole.
1439 * The part of it from PAGE_SIZE to
1440 * (trunc_page(biosbasemem * 1024) - 1) will be
1441 * remapped and used by the kernel later.)
1443 * This code is similar to the code used in
1444 * pmap_mapdev, but since no memory needs to be
1445 * allocated we simply change the mapping.
1447 for (pa
= trunc_page(basemem
* 1024);
1448 pa
< ISA_HOLE_START
; pa
+= PAGE_SIZE
) {
1449 pte
= vtopte(pa
+ KERNBASE
);
1450 *pte
= pa
| PG_RW
| PG_V
;
1454 * if basemem != 640, map pages r/w into vm86 page table so
1455 * that the bios can scribble on it.
1458 for (i
= basemem
/ 4; i
< 160; i
++)
1459 pte
[i
] = (i
<< PAGE_SHIFT
) | PG_V
| PG_RW
| PG_U
;
1463 * map page 1 R/W into the kernel page table so we can use it
1464 * as a buffer. The kernel will unmap this page later.
1466 pte
= vtopte(KERNBASE
+ (1 << PAGE_SHIFT
));
1467 *pte
= (1 << PAGE_SHIFT
) | PG_RW
| PG_V
;
1470 * get memory map with INT 15:E820
1472 #define SMAPSIZ sizeof(*smap)
1473 #define SMAP_SIG 0x534D4150 /* 'SMAP' */
1476 smap
= (void *)vm86_addpage(&vmc
, 1, KERNBASE
+ (1 << PAGE_SHIFT
));
1477 vm86_getptr(&vmc
, (vm_offset_t
)smap
, &vmf
.vmf_es
, &vmf
.vmf_di
);
1482 vmf
.vmf_eax
= 0xE820;
1483 vmf
.vmf_edx
= SMAP_SIG
;
1484 vmf
.vmf_ecx
= SMAPSIZ
;
1485 i
= vm86_datacall(0x15, &vmf
, &vmc
);
1486 if (i
|| vmf
.vmf_eax
!= SMAP_SIG
)
1488 if (boothowto
& RB_VERBOSE
)
1489 kprintf("SMAP type=%02x base=%08x %08x len=%08x %08x\n",
1491 *(u_int32_t
*)((char *)&smap
->base
+ 4),
1492 (u_int32_t
)smap
->base
,
1493 *(u_int32_t
*)((char *)&smap
->length
+ 4),
1494 (u_int32_t
)smap
->length
);
1496 if (smap
->type
!= 0x01)
1499 if (smap
->length
== 0)
1502 if (smap
->base
>= 0xffffffff) {
1503 kprintf("%uK of memory above 4GB ignored\n",
1504 (u_int
)(smap
->length
/ 1024));
1508 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1509 if (smap
->base
< physmap
[i
+ 1]) {
1510 if (boothowto
& RB_VERBOSE
)
1512 "Overlapping or non-montonic memory region, ignoring second region\n");
1517 if (smap
->base
== physmap
[physmap_idx
+ 1]) {
1518 physmap
[physmap_idx
+ 1] += smap
->length
;
1523 if (physmap_idx
== PHYSMAP_ENTRIES
*2) {
1525 "Too many segments in the physical address map, giving up\n");
1528 physmap
[physmap_idx
] = smap
->base
;
1529 physmap
[physmap_idx
+ 1] = smap
->base
+ smap
->length
;
1531 ; /* fix GCC3.x warning */
1532 } while (vmf
.vmf_ebx
!= 0);
1535 * Perform "base memory" related probes & setup based on SMAP
1538 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1539 if (physmap
[i
] == 0x00000000) {
1540 basemem
= physmap
[i
+ 1] / 1024;
1549 if (basemem
> 640) {
1550 kprintf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
1555 for (pa
= trunc_page(basemem
* 1024);
1556 pa
< ISA_HOLE_START
; pa
+= PAGE_SIZE
) {
1557 pte
= vtopte(pa
+ KERNBASE
);
1558 *pte
= pa
| PG_RW
| PG_V
;
1562 for (i
= basemem
/ 4; i
< 160; i
++)
1563 pte
[i
] = (i
<< PAGE_SHIFT
) | PG_V
| PG_RW
| PG_U
;
1566 if (physmap
[1] != 0)
1570 * If we failed above, try memory map with INT 15:E801
1572 vmf
.vmf_ax
= 0xE801;
1573 if (vm86_intcall(0x15, &vmf
) == 0) {
1574 extmem
= vmf
.vmf_cx
+ vmf
.vmf_dx
* 64;
1578 vm86_intcall(0x15, &vmf
);
1579 extmem
= vmf
.vmf_ax
;
1582 * Prefer the RTC value for extended memory.
1584 extmem
= rtcin(RTC_EXTLO
) + (rtcin(RTC_EXTHI
) << 8);
1589 * Special hack for chipsets that still remap the 384k hole when
1590 * there's 16MB of memory - this really confuses people that
1591 * are trying to use bus mastering ISA controllers with the
1592 * "16MB limit"; they only have 16MB, but the remapping puts
1593 * them beyond the limit.
1595 * If extended memory is between 15-16MB (16-17MB phys address range),
1598 if ((extmem
> 15 * 1024) && (extmem
< 16 * 1024))
1602 physmap
[1] = basemem
* 1024;
1604 physmap
[physmap_idx
] = 0x100000;
1605 physmap
[physmap_idx
+ 1] = physmap
[physmap_idx
] + extmem
* 1024;
1609 * Now, physmap contains a map of physical memory.
1613 /* make hole for AP bootstrap code YYY */
1614 physmap
[1] = mp_bootaddress(physmap
[1] / 1024);
1616 /* look for the MP hardware - needed for apic addresses */
1621 * Maxmem isn't the "maximum memory", it's one larger than the
1622 * highest page of the physical address space. It should be
1623 * called something like "Maxphyspage". We may adjust this
1624 * based on ``hw.physmem'' and the results of the memory test.
1626 Maxmem
= atop(physmap
[physmap_idx
+ 1]);
1629 Maxmem
= MAXMEM
/ 4;
1633 * hw.physmem is a size in bytes; we also allow k, m, and g suffixes
1634 * for the appropriate modifiers. This overrides MAXMEM.
1636 if ((cp
= kgetenv("hw.physmem")) != NULL
) {
1637 u_int64_t AllowMem
, sanity
;
1640 sanity
= AllowMem
= strtouq(cp
, &ep
, 0);
1641 if ((ep
!= cp
) && (*ep
!= 0)) {
1654 AllowMem
= sanity
= 0;
1656 if (AllowMem
< sanity
)
1660 kprintf("Ignoring invalid memory size of '%s'\n", cp
);
1662 Maxmem
= atop(AllowMem
);
1665 if (atop(physmap
[physmap_idx
+ 1]) != Maxmem
&&
1666 (boothowto
& RB_VERBOSE
))
1667 kprintf("Physical memory use set to %lluK\n", Maxmem
* 4);
1670 * If Maxmem has been increased beyond what the system has detected,
1671 * extend the last memory segment to the new limit.
1673 if (atop(physmap
[physmap_idx
+ 1]) < Maxmem
)
1674 physmap
[physmap_idx
+ 1] = ptoa(Maxmem
);
1676 /* call pmap initialization to make new kernel address space */
1677 pmap_bootstrap(first
, 0);
1680 * Size up each available chunk of physical memory.
1682 physmap
[0] = PAGE_SIZE
; /* mask off page 0 */
1684 phys_avail
[pa_indx
++] = physmap
[0];
1685 phys_avail
[pa_indx
] = physmap
[0];
1689 * Get dcons buffer address
1691 if (kgetenv_quad("dcons.addr", &dcons_addr
) == 0 ||
1692 kgetenv_quad("dcons.size", &dcons_size
) == 0)
1696 * physmap is in bytes, so when converting to page boundaries,
1697 * round up the start address and round down the end address.
1699 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1703 if (physmap
[i
+ 1] < end
)
1704 end
= trunc_page(physmap
[i
+ 1]);
1705 for (pa
= round_page(physmap
[i
]); pa
< end
; pa
+= PAGE_SIZE
) {
1710 int *ptr
= (int *)CADDR1
;
1714 * block out kernel memory as not available.
1716 if (pa
>= 0x100000 && pa
< first
)
1720 * block out dcons buffer
1723 && pa
>= trunc_page(dcons_addr
)
1724 && pa
< dcons_addr
+ dcons_size
)
1730 * map page into kernel: valid, read/write,non-cacheable
1732 *pte
= pa
| PG_V
| PG_RW
| PG_N
;
1737 * Test for alternating 1's and 0's
1739 *(volatile int *)ptr
= 0xaaaaaaaa;
1740 if (*(volatile int *)ptr
!= 0xaaaaaaaa) {
1744 * Test for alternating 0's and 1's
1746 *(volatile int *)ptr
= 0x55555555;
1747 if (*(volatile int *)ptr
!= 0x55555555) {
1753 *(volatile int *)ptr
= 0xffffffff;
1754 if (*(volatile int *)ptr
!= 0xffffffff) {
1760 *(volatile int *)ptr
= 0x0;
1761 if (*(volatile int *)ptr
!= 0x0) {
1765 * Restore original value.
1770 * Adjust array of valid/good pages.
1772 if (page_bad
== TRUE
) {
1776 * If this good page is a continuation of the
1777 * previous set of good pages, then just increase
1778 * the end pointer. Otherwise start a new chunk.
1779 * Note that "end" points one higher than end,
1780 * making the range >= start and < end.
1781 * If we're also doing a speculative memory
1782 * test and we at or past the end, bump up Maxmem
1783 * so that we keep going. The first bad page
1784 * will terminate the loop.
1786 if (phys_avail
[pa_indx
] == pa
) {
1787 phys_avail
[pa_indx
] += PAGE_SIZE
;
1790 if (pa_indx
>= PHYSMAP_ENTRIES
*2) {
1791 kprintf("Too many holes in the physical address space, giving up\n");
1795 phys_avail
[pa_indx
++] = pa
; /* start */
1796 phys_avail
[pa_indx
] = pa
+ PAGE_SIZE
; /* end */
1806 * The last chunk must contain at least one page plus the message
1807 * buffer to avoid complicating other code (message buffer address
1808 * calculation, etc.).
1810 while (phys_avail
[pa_indx
- 1] + PAGE_SIZE
+
1811 round_page(MSGBUF_SIZE
) >= phys_avail
[pa_indx
]) {
1812 physmem
-= atop(phys_avail
[pa_indx
] - phys_avail
[pa_indx
- 1]);
1813 phys_avail
[pa_indx
--] = 0;
1814 phys_avail
[pa_indx
--] = 0;
1817 Maxmem
= atop(phys_avail
[pa_indx
]);
1819 /* Trim off space for the message buffer. */
1820 phys_avail
[pa_indx
] -= round_page(MSGBUF_SIZE
);
1822 avail_end
= phys_avail
[pa_indx
];
1834 * 7 Device Not Available (x87)
1836 * 9 Coprocessor Segment overrun (unsupported, reserved)
1838 * 11 Segment not present
1840 * 13 General Protection
1843 * 16 x87 FP Exception pending
1844 * 17 Alignment Check
1846 * 19 SIMD floating point
1848 * 32-255 INTn/external sources
1853 struct gate_descriptor
*gdp
;
1854 int gsel_tss
, metadata_missing
, off
, x
;
1855 struct mdglobaldata
*gd
;
1858 * Prevent lowering of the ipl if we call tsleep() early.
1860 gd
= &CPU_prvspace
[0].mdglobaldata
;
1861 bzero(gd
, sizeof(*gd
));
1863 gd
->mi
.gd_curthread
= &thread0
;
1864 thread0
.td_gd
= &gd
->mi
;
1866 atdevbase
= ISA_HOLE_START
+ KERNBASE
;
1868 metadata_missing
= 0;
1869 if (bootinfo
.bi_modulep
) {
1870 preload_metadata
= (caddr_t
)bootinfo
.bi_modulep
+ KERNBASE
;
1871 preload_bootstrap_relocate(KERNBASE
);
1873 metadata_missing
= 1;
1875 if (bootinfo
.bi_envp
)
1876 kern_envp
= (caddr_t
)bootinfo
.bi_envp
+ KERNBASE
;
1879 * start with one cpu. Note: with one cpu, ncpus2_shift, ncpus2_mask,
1880 * and ncpus_fit_mask remain 0.
1885 /* Init basic tunables, hz etc */
1889 * make gdt memory segments, the code segment goes up to end of the
1890 * page with etext in it, the data segment goes to the end of
1894 * XXX text protection is temporarily (?) disabled. The limit was
1895 * i386_btop(round_page(etext)) - 1.
1897 gdt_segs
[GCODE_SEL
].ssd_limit
= atop(0 - 1);
1898 gdt_segs
[GDATA_SEL
].ssd_limit
= atop(0 - 1);
1900 gdt_segs
[GPRIV_SEL
].ssd_limit
=
1901 atop(sizeof(struct privatespace
) - 1);
1902 gdt_segs
[GPRIV_SEL
].ssd_base
= (int) &CPU_prvspace
[0];
1903 gdt_segs
[GPROC0_SEL
].ssd_base
=
1904 (int) &CPU_prvspace
[0].mdglobaldata
.gd_common_tss
;
1906 gd
->mi
.gd_prvspace
= &CPU_prvspace
[0];
1909 * Note: on both UP and SMP curthread must be set non-NULL
1910 * early in the boot sequence because the system assumes
1911 * that 'curthread' is never NULL.
1914 for (x
= 0; x
< NGDT
; x
++) {
1916 /* avoid overwriting db entries with APM ones */
1917 if (x
>= GAPMCODE32_SEL
&& x
<= GAPMDATA_SEL
)
1920 ssdtosd(&gdt_segs
[x
], &gdt
[x
].sd
);
1923 r_gdt
.rd_limit
= NGDT
* sizeof(gdt
[0]) - 1;
1924 r_gdt
.rd_base
= (int) gdt
;
1927 mi_gdinit(&gd
->mi
, 0);
1929 mi_proc0init(&gd
->mi
, proc0paddr
);
1930 safepri
= TDPRI_MAX
;
1932 /* make ldt memory segments */
1934 * XXX - VM_MAX_USER_ADDRESS is an end address, not a max. And it
1935 * should be spelled ...MAX_USER...
1937 ldt_segs
[LUCODE_SEL
].ssd_limit
= atop(VM_MAX_USER_ADDRESS
- 1);
1938 ldt_segs
[LUDATA_SEL
].ssd_limit
= atop(VM_MAX_USER_ADDRESS
- 1);
1939 for (x
= 0; x
< sizeof ldt_segs
/ sizeof ldt_segs
[0]; x
++)
1940 ssdtosd(&ldt_segs
[x
], &ldt
[x
].sd
);
1942 _default_ldt
= GSEL(GLDT_SEL
, SEL_KPL
);
1944 gd
->gd_currentldt
= _default_ldt
;
1945 /* spinlocks and the BGL */
1949 * Setup the hardware exception table. Most exceptions use
1950 * SDT_SYS386TGT, known as a 'trap gate'. Trap gates leave
1951 * interrupts enabled. VM page faults use SDT_SYS386IGT, known as
1952 * an 'interrupt trap gate', which disables interrupts on entry,
1953 * in order to be able to poll the appropriate CRn register to
1954 * determine the fault address.
1956 for (x
= 0; x
< NIDT
; x
++) {
1957 #ifdef DEBUG_INTERRUPTS
1958 setidt(x
, Xrsvdary
[x
], SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1960 setidt(x
, &IDTVEC(rsvd0
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1963 setidt(0, &IDTVEC(div
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1964 setidt(1, &IDTVEC(dbg
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1965 setidt(2, &IDTVEC(nmi
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1966 setidt(3, &IDTVEC(bpt
), SDT_SYS386TGT
, SEL_UPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1967 setidt(4, &IDTVEC(ofl
), SDT_SYS386TGT
, SEL_UPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1968 setidt(5, &IDTVEC(bnd
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1969 setidt(6, &IDTVEC(ill
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1970 setidt(7, &IDTVEC(dna
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1971 setidt(8, 0, SDT_SYSTASKGT
, SEL_KPL
, GSEL(GPANIC_SEL
, SEL_KPL
));
1972 setidt(9, &IDTVEC(fpusegm
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1973 setidt(10, &IDTVEC(tss
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1974 setidt(11, &IDTVEC(missing
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1975 setidt(12, &IDTVEC(stk
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1976 setidt(13, &IDTVEC(prot
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1977 setidt(14, &IDTVEC(page
), SDT_SYS386IGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1978 setidt(15, &IDTVEC(rsvd0
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1979 setidt(16, &IDTVEC(fpu
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1980 setidt(17, &IDTVEC(align
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1981 setidt(18, &IDTVEC(mchk
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1982 setidt(19, &IDTVEC(xmm
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1983 setidt(0x80, &IDTVEC(int0x80_syscall
),
1984 SDT_SYS386TGT
, SEL_UPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1986 r_idt
.rd_limit
= sizeof(idt0
) - 1;
1987 r_idt
.rd_base
= (int) idt
;
1991 * Initialize the console before we print anything out.
1995 if (metadata_missing
)
1996 kprintf("WARNING: loader(8) metadata is missing!\n");
2005 if (boothowto
& RB_KDB
)
2006 Debugger("Boot flags requested debugger");
2009 finishidentcpu(); /* Final stage of CPU initialization */
2010 setidt(6, &IDTVEC(ill
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2011 setidt(13, &IDTVEC(prot
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
2012 initializecpu(); /* Initialize CPU registers */
2015 * make an initial tss so cpu can get interrupt stack on syscall!
2016 * The 16 bytes is to save room for a VM86 context.
2018 gd
->gd_common_tss
.tss_esp0
= (int) thread0
.td_pcb
- 16;
2019 gd
->gd_common_tss
.tss_ss0
= GSEL(GDATA_SEL
, SEL_KPL
) ;
2020 gsel_tss
= GSEL(GPROC0_SEL
, SEL_KPL
);
2021 gd
->gd_tss_gdt
= &gdt
[GPROC0_SEL
].sd
;
2022 gd
->gd_common_tssd
= *gd
->gd_tss_gdt
;
2023 gd
->gd_common_tss
.tss_ioopt
= (sizeof gd
->gd_common_tss
) << 16;
2026 dblfault_tss
.tss_esp
= dblfault_tss
.tss_esp0
= dblfault_tss
.tss_esp1
=
2027 dblfault_tss
.tss_esp2
= (int) &dblfault_stack
[sizeof(dblfault_stack
)];
2028 dblfault_tss
.tss_ss
= dblfault_tss
.tss_ss0
= dblfault_tss
.tss_ss1
=
2029 dblfault_tss
.tss_ss2
= GSEL(GDATA_SEL
, SEL_KPL
);
2030 dblfault_tss
.tss_cr3
= (int)IdlePTD
;
2031 dblfault_tss
.tss_eip
= (int) dblfault_handler
;
2032 dblfault_tss
.tss_eflags
= PSL_KERNEL
;
2033 dblfault_tss
.tss_ds
= dblfault_tss
.tss_es
=
2034 dblfault_tss
.tss_gs
= GSEL(GDATA_SEL
, SEL_KPL
);
2035 dblfault_tss
.tss_fs
= GSEL(GPRIV_SEL
, SEL_KPL
);
2036 dblfault_tss
.tss_cs
= GSEL(GCODE_SEL
, SEL_KPL
);
2037 dblfault_tss
.tss_ldt
= GSEL(GLDT_SEL
, SEL_KPL
);
2041 init_param2(physmem
);
2043 /* now running on new page tables, configured,and u/iom is accessible */
2045 /* Map the message buffer. */
2046 for (off
= 0; off
< round_page(MSGBUF_SIZE
); off
+= PAGE_SIZE
)
2047 pmap_kenter((vm_offset_t
)msgbufp
+ off
, avail_end
+ off
);
2049 msgbufinit(msgbufp
, MSGBUF_SIZE
);
2051 /* make a call gate to reenter kernel with */
2052 gdp
= &ldt
[LSYS5CALLS_SEL
].gd
;
2054 x
= (int) &IDTVEC(syscall
);
2055 gdp
->gd_looffset
= x
++;
2056 gdp
->gd_selector
= GSEL(GCODE_SEL
,SEL_KPL
);
2058 gdp
->gd_type
= SDT_SYS386CGT
;
2059 gdp
->gd_dpl
= SEL_UPL
;
2061 gdp
->gd_hioffset
= ((int) &IDTVEC(syscall
)) >>16;
2063 /* XXX does this work? */
2064 ldt
[LBSDICALLS_SEL
] = ldt
[LSYS5CALLS_SEL
];
2065 ldt
[LSOL26CALLS_SEL
] = ldt
[LSYS5CALLS_SEL
];
2067 /* transfer to user mode */
2069 _ucodesel
= LSEL(LUCODE_SEL
, SEL_UPL
);
2070 _udatasel
= LSEL(LUDATA_SEL
, SEL_UPL
);
2072 /* setup proc 0's pcb */
2073 thread0
.td_pcb
->pcb_flags
= 0;
2074 thread0
.td_pcb
->pcb_cr3
= (int)IdlePTD
; /* should already be setup */
2075 thread0
.td_pcb
->pcb_ext
= 0;
2076 lwp0
.lwp_md
.md_regs
= &proc0_tf
;
2080 * Initialize machine-dependant portions of the global data structure.
2081 * Note that the global data area and cpu0's idlestack in the private
2082 * data space were allocated in locore.
2084 * Note: the idlethread's cpl is 0
2086 * WARNING! Called from early boot, 'mycpu' may not work yet.
2089 cpu_gdinit(struct mdglobaldata
*gd
, int cpu
)
2092 gd
->mi
.gd_curthread
= &gd
->mi
.gd_idlethread
;
2094 lwkt_init_thread(&gd
->mi
.gd_idlethread
,
2095 gd
->mi
.gd_prvspace
->idlestack
,
2096 sizeof(gd
->mi
.gd_prvspace
->idlestack
),
2097 TDF_MPSAFE
, &gd
->mi
);
2098 lwkt_set_comm(&gd
->mi
.gd_idlethread
, "idle_%d", cpu
);
2099 gd
->mi
.gd_idlethread
.td_switch
= cpu_lwkt_switch
;
2100 gd
->mi
.gd_idlethread
.td_sp
-= sizeof(void *);
2101 *(void **)gd
->mi
.gd_idlethread
.td_sp
= cpu_idle_restore
;
2105 is_globaldata_space(vm_offset_t saddr
, vm_offset_t eaddr
)
2107 if (saddr
>= (vm_offset_t
)&CPU_prvspace
[0] &&
2108 eaddr
<= (vm_offset_t
)&CPU_prvspace
[MAXCPU
]) {
2115 globaldata_find(int cpu
)
2117 KKASSERT(cpu
>= 0 && cpu
< ncpus
);
2118 return(&CPU_prvspace
[cpu
].mdglobaldata
.mi
);
2121 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
2122 static void f00f_hack(void *unused
);
2123 SYSINIT(f00f_hack
, SI_BOOT2_BIOS
, SI_ORDER_ANY
, f00f_hack
, NULL
);
2126 f00f_hack(void *unused
)
2128 struct gate_descriptor
*new_idt
;
2134 kprintf("Intel Pentium detected, installing workaround for F00F bug\n");
2136 r_idt
.rd_limit
= sizeof(idt0
) - 1;
2138 tmp
= kmem_alloc(&kernel_map
, PAGE_SIZE
* 2);
2140 panic("kmem_alloc returned 0");
2141 if (((unsigned int)tmp
& (PAGE_SIZE
-1)) != 0)
2142 panic("kmem_alloc returned non-page-aligned memory");
2143 /* Put the first seven entries in the lower page */
2144 new_idt
= (struct gate_descriptor
*)(tmp
+ PAGE_SIZE
- (7*8));
2145 bcopy(idt
, new_idt
, sizeof(idt0
));
2146 r_idt
.rd_base
= (int)new_idt
;
2149 if (vm_map_protect(&kernel_map
, tmp
, tmp
+ PAGE_SIZE
,
2150 VM_PROT_READ
, FALSE
) != KERN_SUCCESS
)
2151 panic("vm_map_protect failed");
2154 #endif /* defined(I586_CPU) && !NO_F00F_HACK */
2157 ptrace_set_pc(struct lwp
*lp
, unsigned long addr
)
2159 lp
->lwp_md
.md_regs
->tf_eip
= addr
;
2164 ptrace_single_step(struct lwp
*lp
)
2166 lp
->lwp_md
.md_regs
->tf_eflags
|= PSL_T
;
2171 fill_regs(struct lwp
*lp
, struct reg
*regs
)
2174 struct trapframe
*tp
;
2176 tp
= lp
->lwp_md
.md_regs
;
2177 regs
->r_gs
= tp
->tf_gs
;
2178 regs
->r_fs
= tp
->tf_fs
;
2179 regs
->r_es
= tp
->tf_es
;
2180 regs
->r_ds
= tp
->tf_ds
;
2181 regs
->r_edi
= tp
->tf_edi
;
2182 regs
->r_esi
= tp
->tf_esi
;
2183 regs
->r_ebp
= tp
->tf_ebp
;
2184 regs
->r_ebx
= tp
->tf_ebx
;
2185 regs
->r_edx
= tp
->tf_edx
;
2186 regs
->r_ecx
= tp
->tf_ecx
;
2187 regs
->r_eax
= tp
->tf_eax
;
2188 regs
->r_eip
= tp
->tf_eip
;
2189 regs
->r_cs
= tp
->tf_cs
;
2190 regs
->r_eflags
= tp
->tf_eflags
;
2191 regs
->r_esp
= tp
->tf_esp
;
2192 regs
->r_ss
= tp
->tf_ss
;
2193 pcb
= lp
->lwp_thread
->td_pcb
;
2198 set_regs(struct lwp
*lp
, struct reg
*regs
)
2201 struct trapframe
*tp
;
2203 tp
= lp
->lwp_md
.md_regs
;
2204 if (!EFL_SECURE(regs
->r_eflags
, tp
->tf_eflags
) ||
2205 !CS_SECURE(regs
->r_cs
))
2207 tp
->tf_gs
= regs
->r_gs
;
2208 tp
->tf_fs
= regs
->r_fs
;
2209 tp
->tf_es
= regs
->r_es
;
2210 tp
->tf_ds
= regs
->r_ds
;
2211 tp
->tf_edi
= regs
->r_edi
;
2212 tp
->tf_esi
= regs
->r_esi
;
2213 tp
->tf_ebp
= regs
->r_ebp
;
2214 tp
->tf_ebx
= regs
->r_ebx
;
2215 tp
->tf_edx
= regs
->r_edx
;
2216 tp
->tf_ecx
= regs
->r_ecx
;
2217 tp
->tf_eax
= regs
->r_eax
;
2218 tp
->tf_eip
= regs
->r_eip
;
2219 tp
->tf_cs
= regs
->r_cs
;
2220 tp
->tf_eflags
= regs
->r_eflags
;
2221 tp
->tf_esp
= regs
->r_esp
;
2222 tp
->tf_ss
= regs
->r_ss
;
2223 pcb
= lp
->lwp_thread
->td_pcb
;
2227 #ifndef CPU_DISABLE_SSE
2229 fill_fpregs_xmm(struct savexmm
*sv_xmm
, struct save87
*sv_87
)
2231 struct env87
*penv_87
= &sv_87
->sv_env
;
2232 struct envxmm
*penv_xmm
= &sv_xmm
->sv_env
;
2235 /* FPU control/status */
2236 penv_87
->en_cw
= penv_xmm
->en_cw
;
2237 penv_87
->en_sw
= penv_xmm
->en_sw
;
2238 penv_87
->en_tw
= penv_xmm
->en_tw
;
2239 penv_87
->en_fip
= penv_xmm
->en_fip
;
2240 penv_87
->en_fcs
= penv_xmm
->en_fcs
;
2241 penv_87
->en_opcode
= penv_xmm
->en_opcode
;
2242 penv_87
->en_foo
= penv_xmm
->en_foo
;
2243 penv_87
->en_fos
= penv_xmm
->en_fos
;
2246 for (i
= 0; i
< 8; ++i
)
2247 sv_87
->sv_ac
[i
] = sv_xmm
->sv_fp
[i
].fp_acc
;
2249 sv_87
->sv_ex_sw
= sv_xmm
->sv_ex_sw
;
2253 set_fpregs_xmm(struct save87
*sv_87
, struct savexmm
*sv_xmm
)
2255 struct env87
*penv_87
= &sv_87
->sv_env
;
2256 struct envxmm
*penv_xmm
= &sv_xmm
->sv_env
;
2259 /* FPU control/status */
2260 penv_xmm
->en_cw
= penv_87
->en_cw
;
2261 penv_xmm
->en_sw
= penv_87
->en_sw
;
2262 penv_xmm
->en_tw
= penv_87
->en_tw
;
2263 penv_xmm
->en_fip
= penv_87
->en_fip
;
2264 penv_xmm
->en_fcs
= penv_87
->en_fcs
;
2265 penv_xmm
->en_opcode
= penv_87
->en_opcode
;
2266 penv_xmm
->en_foo
= penv_87
->en_foo
;
2267 penv_xmm
->en_fos
= penv_87
->en_fos
;
2270 for (i
= 0; i
< 8; ++i
)
2271 sv_xmm
->sv_fp
[i
].fp_acc
= sv_87
->sv_ac
[i
];
2273 sv_xmm
->sv_ex_sw
= sv_87
->sv_ex_sw
;
2275 #endif /* CPU_DISABLE_SSE */
2278 fill_fpregs(struct lwp
*lp
, struct fpreg
*fpregs
)
2280 #ifndef CPU_DISABLE_SSE
2282 fill_fpregs_xmm(&lp
->lwp_thread
->td_pcb
->pcb_save
.sv_xmm
,
2283 (struct save87
*)fpregs
);
2286 #endif /* CPU_DISABLE_SSE */
2287 bcopy(&lp
->lwp_thread
->td_pcb
->pcb_save
.sv_87
, fpregs
, sizeof *fpregs
);
2292 set_fpregs(struct lwp
*lp
, struct fpreg
*fpregs
)
2294 #ifndef CPU_DISABLE_SSE
2296 set_fpregs_xmm((struct save87
*)fpregs
,
2297 &lp
->lwp_thread
->td_pcb
->pcb_save
.sv_xmm
);
2300 #endif /* CPU_DISABLE_SSE */
2301 bcopy(fpregs
, &lp
->lwp_thread
->td_pcb
->pcb_save
.sv_87
, sizeof *fpregs
);
2306 fill_dbregs(struct lwp
*lp
, struct dbreg
*dbregs
)
2309 dbregs
->dr0
= rdr0();
2310 dbregs
->dr1
= rdr1();
2311 dbregs
->dr2
= rdr2();
2312 dbregs
->dr3
= rdr3();
2313 dbregs
->dr4
= rdr4();
2314 dbregs
->dr5
= rdr5();
2315 dbregs
->dr6
= rdr6();
2316 dbregs
->dr7
= rdr7();
2320 pcb
= lp
->lwp_thread
->td_pcb
;
2321 dbregs
->dr0
= pcb
->pcb_dr0
;
2322 dbregs
->dr1
= pcb
->pcb_dr1
;
2323 dbregs
->dr2
= pcb
->pcb_dr2
;
2324 dbregs
->dr3
= pcb
->pcb_dr3
;
2327 dbregs
->dr6
= pcb
->pcb_dr6
;
2328 dbregs
->dr7
= pcb
->pcb_dr7
;
/*
 * set_dbregs -- validate and install user-supplied debug-register
 * state (*dbregs) for an lwp.
 *
 * NOTE(review): shattered extraction with dropped lines (numbering
 * jumps 2334 -> 2337, 2344 -> 2347, 2359 -> 2362 ...): the return
 * type, braces, the guard selecting between the two arms (likely
 * lp == NULL), the error returns (presumably EINVAL) and the final
 * return are all missing.  Verify against the full source.
 */
2334 set_dbregs(struct lwp
*lp
, struct dbreg
*dbregs
)
/*
 * First (visible) arm: load the hardware debug registers directly
 * from the caller-supplied values.
 */
2337 load_dr0(dbregs
->dr0
);
2338 load_dr1(dbregs
->dr1
);
2339 load_dr2(dbregs
->dr2
);
2340 load_dr3(dbregs
->dr3
);
2341 load_dr4(dbregs
->dr4
);
2342 load_dr5(dbregs
->dr5
);
2343 load_dr6(dbregs
->dr6
);
2344 load_dr7(dbregs
->dr7
);
/* Locals for the second arm (declarations for pcb/i dropped). */
2347 struct ucred
*ucred
;
2349 uint32_t mask1
, mask2
;
2352 * Don't let an illegal value for dr7 get set. Specifically,
2353 * check for undefined settings. Setting these bit patterns
2354 * result in undefined behaviour and can lead to an unexpected
/*
 * Scan the eight 2-bit control fields in dr7[16..31] (mask1 starts at
 * 0x3<<16, advancing 2 bits per iteration); mask2 == 0x2<<16 shifted
 * likewise detects the reserved '10' bit pattern.  The action taken on
 * a match is on a dropped line (presumably return EINVAL -- TODO
 * confirm).
 */
2357 for (i
= 0, mask1
= 0x3<<16, mask2
= 0x2<<16; i
< 8;
2358 i
++, mask1
<<= 2, mask2
<<= 2)
2359 if ((dbregs
->dr7
& mask1
) == mask2
)
2362 pcb
= lp
->lwp_thread
->td_pcb
;
2363 ucred
= lp
->lwp_proc
->p_ucred
;
2366 * Don't let a process set a breakpoint that is not within the
2367 * process's address space. If a process could do this, it
2368 * could halt the system by setting a breakpoint in the kernel
2369 * (if ddb was enabled). Thus, we need to check to make sure
2370 * that no breakpoints are being enabled for addresses outside
2371 * process's address space, unless, perhaps, we were called by
2374 * XXX - what about when the watched area of the user's
2375 * address space is written into from within the kernel
2376 * ... wouldn't that still cause a breakpoint to be generated
2377 * from within kernel mode?
/*
 * Non-superuser (suser_cred != 0): every enabled breakpoint address
 * must lie below VM_MAX_USER_ADDRESS.  dr7 bits 0-7 hold the local/
 * global enable pair for each of dr0..dr3 (0x3 << 2*n).  The failure
 * action after each range check is on a dropped line (presumably
 * return EINVAL -- TODO confirm).
 */
2380 if (suser_cred(ucred
, 0) != 0) {
2381 if (dbregs
->dr7
& 0x3) {
2382 /* dr0 is enabled */
2383 if (dbregs
->dr0
>= VM_MAX_USER_ADDRESS
)
2387 if (dbregs
->dr7
& (0x3<<2)) {
2388 /* dr1 is enabled */
2389 if (dbregs
->dr1
>= VM_MAX_USER_ADDRESS
)
2393 if (dbregs
->dr7
& (0x3<<4)) {
2394 /* dr2 is enabled */
2395 if (dbregs
->dr2
>= VM_MAX_USER_ADDRESS
)
2399 if (dbregs
->dr7
& (0x3<<6)) {
2400 /* dr3 is enabled */
2401 if (dbregs
->dr3
>= VM_MAX_USER_ADDRESS
)
/*
 * Validation passed: commit the values into the PCB and mark it with
 * PCB_DBREGS (presumably so the saved registers are reloaded on
 * context switch -- the consumer is not visible here).
 */
2406 pcb
->pcb_dr0
= dbregs
->dr0
;
2407 pcb
->pcb_dr1
= dbregs
->dr1
;
2408 pcb
->pcb_dr2
= dbregs
->dr2
;
2409 pcb
->pcb_dr3
= dbregs
->dr3
;
2410 pcb
->pcb_dr6
= dbregs
->dr6
;
2411 pcb
->pcb_dr7
= dbregs
->dr7
;
2413 pcb
->pcb_flags
|= PCB_DBREGS
;
/*
 * NOTE(review): shattered extraction with dropped lines: the loads of
 * dr7/dr6 (rdr7()/rdr6()), the per-bit guards (bp & 0x01 etc.) before
 * each addr[] append, the nbp initialization, the comparison LHS in
 * the final loop, and every return statement are missing.  Verify
 * against the full source before editing.
 */
2420 * Return > 0 if a hardware breakpoint has been hit, and the
2421 * breakpoint was in user space. Return 0, otherwise.
2424 user_dbreg_trap(void)
2426 u_int32_t dr7
, dr6
; /* debug registers dr6 and dr7 */
2427 u_int32_t bp
; /* breakpoint bits extracted from dr6 */
2428 int nbp
; /* number of breakpoints that triggered */
2429 caddr_t addr
[4]; /* breakpoint addresses */
/* dr7 is tested here; its load (rdr7()) is on a dropped line. */
2433 if ((dr7
& 0x000000ff) == 0) {
2435 * all GE and LE bits in the dr7 register are zero,
2436 * thus the trap couldn't have been caused by the
2437 * hardware debug registers
/* Low 4 bits of dr6 are the B0-B3 breakpoint-hit flags. */
2444 bp
= dr6
& 0x0000000f;
2448 * None of the breakpoint bits are set meaning this
2449 * trap was not caused by any of the debug registers
2455 * at least one of the breakpoints were hit, check to see
2456 * which ones and if any of them are user space addresses
/*
 * For each set Bn flag (guards dropped), record the corresponding
 * breakpoint address from dr0..dr3.
 */
2460 addr
[nbp
++] = (caddr_t
)rdr0();
2463 addr
[nbp
++] = (caddr_t
)rdr1();
2466 addr
[nbp
++] = (caddr_t
)rdr2();
2469 addr
[nbp
++] = (caddr_t
)rdr3();
/*
 * Compare each triggered address against VM_MAX_USER_ADDRESS; the
 * left-hand side of the comparison (addr[i] <) is on a dropped line.
 */
2472 for (i
=0; i
<nbp
; i
++) {
2474 (caddr_t
)VM_MAX_USER_ADDRESS
) {
2476 * addr[i] is in user space
2483 * None of the breakpoints are in user space.
/*
 * Debugger -- stub invoked when a kernel-debugger entry is requested:
 * just logs the call site's message via kprintf().  Presumably the
 * no-DDB build of Debugger() (the surrounding #ifdef and the function's
 * return type/braces are on dropped lines) -- TODO confirm.
 */
2491 Debugger(const char *msg
)
2493 kprintf("Debugger(\"%s\") called.\n", msg
);
/*
 * NOTE(review): shattered extraction -- inb()'s own function header
 * (around original line 2526), its return type, the `data` local
 * declaration and the return statement are all on dropped lines; only
 * the commentary, an outb() prototype, and the asm statement survive.
 */
2500 * Provide inb() and outb() as functions. They are normally only
2501 * available as macros calling inlined functions, thus cannot be
2502 * called inside DDB.
2504 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
2510 /* silence compiler warnings */
2512 void outb(u_int
, u_char
);
2519 * We use %%dx and not %1 here because i/o is done at %dx and not at
2520 * %edx, while gcc generates inferior code (movw instead of movl)
2521 * if we tell it to load (u_short) port.
/* Read one byte from I/O port `port` (in %dx) into `data` (%al). */
2523 __asm
__volatile("inb %%dx,%0" : "=a" (data
) : "d" (port
));
/*
 * outb -- write byte `data` to I/O port `port`.
 *
 * NOTE(review): shattered extraction -- the return type, braces, the
 * `al` local declaration and its assignment from `data` (the
 * "unnecessary assignment" the comment below refers to) are on
 * dropped lines.
 */
2528 outb(u_int port
, u_char data
)
2532 * Use an unnecessary assignment to help gcc's register allocator.
2533 * This makes a large difference for gcc-1.40 and a tiny difference
2534 * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for
2535 * best results. gcc-2.6.0 can't handle this.
/* Emit the byte in `al` (%al) to the port in `port` (%dx). */
2538 __asm
__volatile("outb %0,%%dx" : : "a" (al
), "d" (port
));
/*
 * File-scope declarations of the deprecated SMP-era spinlocks,
 * initialized by the lock-setup routine that follows (its definition
 * is only partially visible in this extraction).
 */
2545 #include "opt_cpu.h"
2549 * initialize all the SMP locks
2552 /* critical region when masking or unmasking interrupts */
2553 struct spinlock_deprecated imen_spinlock
;
2555 /* Make FAST_INTR() routines sequential */
2556 struct spinlock_deprecated fast_intr_spinlock
;
2558 /* critical region for old style disable_intr/enable_intr */
2559 struct spinlock_deprecated mpintr_spinlock
;
2561 /* critical region around INTR() routines */
2562 struct spinlock_deprecated intr_spinlock
;
2564 /* lock region used by kernel profiling */
2565 struct spinlock_deprecated mcount_spinlock
;
2567 /* locks com (tty) data/hardware accesses: a FASTINTR() */
2568 struct spinlock_deprecated com_spinlock
;
2570 /* locks kernel kprintfs */
2571 struct spinlock_deprecated cons_spinlock
;
2573 /* lock regions around the clock hardware */
2574 struct spinlock_deprecated clock_spinlock
;
2576 /* lock around the MP rendezvous */
2577 struct spinlock_deprecated smp_rv_spinlock
;
2583 * mp_lock = 0; BSP already owns the MP lock
2586 * Get the initial mp_lock with a count of 1 for the BSP.
2587 * This uses a LOGICAL cpu ID, ie BSP == 0.
2590 cpu_get_initial_mplock();
2593 spin_lock_init(&mcount_spinlock
);
2594 spin_lock_init(&fast_intr_spinlock
);
2595 spin_lock_init(&intr_spinlock
);
2596 spin_lock_init(&mpintr_spinlock
);
2597 spin_lock_init(&imen_spinlock
);
2598 spin_lock_init(&smp_rv_spinlock
);
2599 spin_lock_init(&com_spinlock
);
2600 spin_lock_init(&clock_spinlock
);
2601 spin_lock_init(&cons_spinlock
);
2603 /* our token pool needs to work early */
2604 lwkt_token_pool_init();