/*
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
 * $DragonFly: src/sys/platform/pc32/i386/machdep.c,v 1.135 2008/08/02 01:14:43 dillon Exp $
 */
#include "use_ether.h"
#include "opt_atalk.h"
#include "opt_compat.h"
#include "opt_directio.h"
#include "opt_maxmem.h"
#include "opt_msgbuf.h"
#include "opt_perfmon.h"
#include "opt_userconfig.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/reboot.h>
#include <sys/msgbuf.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/upcall.h>
#include <sys/usched.h>

#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

#include <machine/cpu.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#include <machine/bootinfo.h>
#include <machine/md_var.h>
#include <machine/pcb_ext.h>		/* pcb.h included via sys/user.h */
#include <machine/globaldata.h>		/* CPU_prvspace */
#include <machine/smp.h>
#include <machine/perfmon.h>
#include <machine/cputypes.h>

#include <bus/isa/isa_device.h>
#include <machine_base/isa/intr_machdep.h>
#include <bus/isa/rtc.h>
#include <machine/vm86.h>
#include <sys/random.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>
#define PHYSMAP_ENTRIES		10

extern void init386(int first);
extern void dblfault_handler(void);

extern void printcpuinfo(void);	/* XXX header file */
extern void finishidentcpu(void);
extern void panicifcpuunsupported(void);
extern void initializecpu(void);

static void cpu_startup(void *);
#ifndef CPU_DISABLE_SSE
static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#endif /* CPU_DISABLE_SSE */
#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */
static void init_locks(void);
SYSINIT(cpu, SI_BOOT2_SMP, SI_ORDER_FIRST, cpu_startup, NULL)

int	_udatasel, _ucodesel;

#ifdef SMP
int64_t tsc_offsets[MAXCPU];
#else
int64_t tsc_offsets[1];
#endif
#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
	CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
	CTLFLAG_RD, &tlb_flush_count, 0, "");
#endif

u_long ebda_addr = 0;
static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0, ctob(physmem), req);

	return (error);
}

SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_hw_physmem, "IU", "");

static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
		ctob(physmem - vmstats.v_wire_count), req);

	return (error);
}

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_hw_usermem, "IU", "");

static int
sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
		i386_btop(avail_end - avail_start), req);

	return (error);
}

SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_hw_availpages, "I", "");
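
/*
 * Illustrative sketch (not part of the original source): the handlers above
 * surface as the read-only nodes "hw.physmem", "hw.usermem" and
 * "hw.availpages".  A userland program can query them through sysctl(3);
 * the variable names below are hypothetical.
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	unsigned int mem;
 *	size_t len = sizeof(mem);
 *
 *	if (sysctlbyname("hw.physmem", &mem, &len, NULL, 0) == 0)
 *		printf("physical memory: %u bytes\n", mem);
 */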
vm_paddr_t Maxmem = 0;

vm_paddr_t phys_avail[PHYSMAP_ENTRIES*2+2];
vm_paddr_t dump_avail[PHYSMAP_ENTRIES*2+2];

static vm_offset_t buffer_sva, buffer_eva;
vm_offset_t clean_sva, clean_eva;
static vm_offset_t pager_sva, pager_eva;
static struct trapframe proc0_tf;
static void
cpu_startup(void *dummy)
{
	caddr_t v;
	vm_size_t size = 0;
	vm_offset_t firstaddr;
	int indx;

	if (boothowto & RB_VERBOSE)
		bootverbose++;

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	kprintf("%s", version);
	panicifcpuunsupported();

	kprintf("real memory = %ju (%ju MB)\n",
		(intmax_t)ptoa(Maxmem), (intmax_t)ptoa(Maxmem) / 1024 / 1024);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	kprintf("Physical memory chunk(s):\n");
	for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
		vm_paddr_t size1 = phys_avail[indx + 1] - phys_avail[indx];

		kprintf("0x%08llx - 0x%08llx, %llu bytes (%llu pages)\n",
		    phys_avail[indx], phys_avail[indx + 1] - 1, size1,
		    size1 / PAGE_SIZE);
	}

	/*
	 * Allocate space for system data structures.
	 * The first available kernel virtual address is in "v".
	 * As pages of kernel virtual memory are allocated, "v" is incremented.
	 * As pages of memory are allocated and cleared,
	 * "firstaddr" is incremented.
	 * An index into the kernel page table corresponding to the
	 * virtual memory address maintained in "v" is kept in "mapaddr".
	 */

	/*
	 * Make two passes.  The first pass calculates how much memory is
	 * needed and allocates it.  The second pass assigns virtual
	 * addresses to the various data structures.
	 */
	firstaddr = 0;
again:
	v = (caddr_t)firstaddr;

#define	valloc(name, type, num) \
	    (name) = (type *)v; v = (caddr_t)((name)+(num))
#define	valloclim(name, type, num, lim) \
	    (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
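
/*
 * Worked expansion (not part of the original source): valloc() casts the
 * linear cursor 'v' to the requested type and advances it past 'num'
 * elements, so successive calls carve adjacent, typed arrays out of a
 * single allocation.  For example, valloc(buf, struct buf, nbuf) expands
 * to:
 *
 *	(buf) = (struct buf *)v;
 *	v = (caddr_t)((buf) + (nbuf));
 *
 * No memory is touched by the macro itself: the first pass over these
 * calls only measures how far 'v' advances, and the second pass repeats
 * the walk across the memory actually allocated.
 */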
	/*
	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
	 * For the first 64MB of ram nominally allocate sufficient buffers to
	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
	 * buffers to cover 1/20 of our ram over 64MB.  When auto-sizing
	 * the buffer cache we limit the eventual kva reservation to
	 * maxbcache bytes.
	 *
	 * factor represents the 1/4 x ram conversion.
	 */
	if (nbuf == 0) {
		int factor = 4 * BKVASIZE / 1024;
		int kbytes = physmem * (PAGE_SIZE / 1024);

		nbuf = 50;
		if (kbytes > 4096)
			nbuf += min((kbytes - 4096) / factor, 65536 / factor);
		if (kbytes > 65536)
			nbuf += (kbytes - 65536) * 2 / (factor * 5);
		if (maxbcache && nbuf > maxbcache / BKVASIZE)
			nbuf = maxbcache / BKVASIZE;
	}

	/*
	 * Do not allow the buffer_map to be more than 1/2 the size of the
	 * kernel_map.
	 */
	if (nbuf > (virtual_end - virtual_start) / (BKVASIZE * 2)) {
		nbuf = (virtual_end - virtual_start) / (BKVASIZE * 2);
		kprintf("Warning: nbufs capped at %d\n", nbuf);
	}

	nswbuf = max(min(nbuf/4, 256), 16);
#ifdef NSWBUF_MIN
	if (nswbuf < NSWBUF_MIN)
		nswbuf = NSWBUF_MIN;
#endif

	valloc(swbuf, struct buf, nswbuf);
	valloc(buf, struct buf, nbuf);

	/*
	 * End of first pass, size has been calculated so allocate memory
	 */
	if (firstaddr == 0) {
		size = (vm_size_t)(v - firstaddr);
		firstaddr = kmem_alloc(&kernel_map, round_page(size));
		if (firstaddr == 0)
			panic("startup: no room for tables");
		goto again;
	}

	/*
	 * End of second pass, addresses have been assigned
	 */
	if ((vm_size_t)(v - firstaddr) != size)
		panic("startup: table size inconsistency");

	kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva,
		      (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size);
	kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva,
		      (nbuf*BKVASIZE));
	buffer_map.system_map = 1;
	kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva,
		      (nswbuf*MAXPHYS) + pager_map_size);
	pager_map.system_map = 1;

#if defined(USERCONFIG)
	userconfig();
	cninit();		/* the preferred console may have changed */
#endif

	kprintf("avail memory = %ju (%ju MB)\n",
		(intmax_t)ptoa(vmstats.v_free_count),
		(intmax_t)ptoa(vmstats.v_free_count) / 1024 / 1024);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	/*
	 * OK, enough kmem_alloc/malloc state should be up, let's get on with it!
	 */
	mp_start();			/* fire up the APs and APICs */
	cpu_setregs();
}
/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	struct sigacts *psp = p->p_sigacts;
	struct sigframe sf, *sfp;
	int oonstack;

	regs = lp->lwp_md.md_regs;
	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;

	/* save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = lp->lwp_sigstk;
	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_gs, sizeof(struct trapframe));

	/* make the size of the saved context visible to userland */
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);

	/* save mailbox pending state for syscall interlock semantics */
	if (p->p_flag & P_MAILBOX)
		sf.sf_uc.uc_mcontext.mc_xflags |= PGEX_MAILBOX;

	/* Allocate and validate space for the signal handler context. */
	if ((lp->lwp_flag & LWP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sfp = (struct sigframe *)(lp->lwp_sigstk.ss_sp +
		    lp->lwp_sigstk.ss_size - sizeof(struct sigframe));
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		sfp = (struct sigframe *)regs->tf_esp - 1;
	}

	/* Translate the signal if appropriate */
	if (p->p_sysent->sv_sigtbl) {
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
	}

	/* Build the argument list for the signal handler. */
	sf.sf_ucontext = (register_t)&sfp->sf_uc;
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		sf.sf_siginfo = (register_t)&sfp->sf_si;
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* fill siginfo structure */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_err;
	} else {
		/* Old FreeBSD-style arguments. */
		sf.sf_siginfo = code;
		sf.sf_addr = regs->tf_err;
		sf.sf_ahu.sf_handler = catcher;
	}

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}

	/*
	 * Save the FPU state and reinit the FP unit
	 */
	npxpush(&sf.sf_uc.uc_mcontext);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(lp, SIGILL);
	}

	regs->tf_esp = (int)sfp;
	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);

	/*
	 * i386 abi specifies that the direction flag must be cleared
	 * on function entry
	 */
	regs->tf_eflags &= ~(PSL_T|PSL_D);

	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;

	/*
	 * Allow the signal handler to inherit %fs in addition to %gs as
	 * the userland program might be using both.
	 *
	 * However, if a T_PROTFLT occurred the segment registers could be
	 * totally broken.  They must be reset in order to be able to
	 * return to userland.
	 */
	if (regs->tf_trapno == T_PROTFLT) {
		regs->tf_fs = _udatasel;
		regs->tf_gs = _udatasel;
	}
	regs->tf_ss = _udatasel;
}
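
/*
 * Illustrative sketch (not part of the original source): the SA_SIGINFO
 * branch above is what a userland handler registered as follows ends up
 * receiving; sendsig() fills sfp->sf_si and aims %eip at the signal
 * trampoline.  The handler name is hypothetical.
 *
 *	#include <signal.h>
 *
 *	static void
 *	handler(int sig, siginfo_t *si, void *ucp)
 *	{
 *		// si->si_addr was taken from tf_err by sendsig()
 *	}
 *
 *	struct sigaction sa;
 *
 *	sa.sa_sigaction = handler;
 *	sa.sa_flags = SA_SIGINFO;
 *	sigemptyset(&sa.sa_mask);
 *	sigaction(SIGSEGV, &sa, NULL);
 */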
/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.  Remove any items that would otherwise create a privilege
 * issue.
 *
 * XXX at the moment we allow userland to set the resume flag.  Is this a
 * bad idea?
 */
int
cpu_sanitize_frame(struct trapframe *frame)
{
	frame->tf_cs = _ucodesel;
	frame->tf_ds = _udatasel;
	frame->tf_es = _udatasel;	/* XXX allow userland this one too? */
#if 0
	frame->tf_fs = _udatasel;
	frame->tf_gs = _udatasel;
#endif
	frame->tf_ss = _udatasel;
	frame->tf_eflags &= (PSL_RF | PSL_USERCHANGE);
	frame->tf_eflags |= PSL_RESERVED_DEFAULT | PSL_I;
	return(0);
}
int
cpu_sanitize_tls(struct savetls *tls)
{
	struct segment_descriptor *desc;
	int i;

	for (i = 0; i < NGTLS; ++i) {
		desc = &tls->tls[i];
		if (desc->sd_dpl == 0 && desc->sd_type == 0)
			continue;
		if (desc->sd_def32 == 0)
			return(ENXIO);
		if (desc->sd_type != SDT_MEMRWA)
			return(ENXIO);
		if (desc->sd_dpl != SEL_UPL)
			return(ENXIO);
		if (desc->sd_xx != 0 || desc->sd_p != 1)
			return(ENXIO);
	}
	return(0);
}
/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to clean up state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
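
/*
 * Worked example (not part of the original source): EFL_SECURE() accepts a
 * new eflags value only if every bit outside PSL_USERCHANGE is unchanged,
 * and CS_SECURE() accepts only selectors whose privilege level is SEL_UPL.
 * So an attempt to raise IOPL from a signal context is rejected:
 *
 *	EFL_SECURE(oef | PSL_IOPL, oef);  // 0: PSL_IOPL is not user-changeable
 *	CS_SECURE(GSEL(GCODE_SEL, SEL_KPL));   // 0: kernel selector, ISPL == 0
 *	CS_SECURE(LSEL(LUCODE_SEL, SEL_UPL));  // 1: user code selector, RPL 3
 */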
int
sys_sigreturn(struct sigreturn_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	ucontext_t uc;
	ucontext_t *ucp;
	int cs;
	int eflags;
	int error;

	/*
	 * We have to copy the information into kernel space so userland
	 * can't modify it while we are sniffing it.
	 */
	regs = lp->lwp_md.md_regs;
	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error)
		return (error);
	ucp = &uc;
	eflags = ucp->uc_mcontext.mc_eflags;

	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
			trapsignal(lp, SIGBUS, 0);
		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			trapsignal(lp, SIGBUS, 0);

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(lp, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = tf->tf_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
#if 0
		tf->tf_fs = _udatasel;
		tf->tf_gs = _udatasel;
#endif
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
		/*
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
			kprintf("sigreturn: eflags = 0x%x\n", eflags);
			return(EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			kprintf("sigreturn: cs = 0x%x\n", cs);
			trapsignal(lp, SIGBUS, T_PROTFLT);
			return(EINVAL);
		}
		bcopy(&ucp->uc_mcontext.mc_gs, regs, sizeof(struct trapframe));
	}

	/*
	 * Restore the FPU state from the frame
	 */
	npxpop(&ucp->uc_mcontext);

	/*
	 * Merge saved signal mailbox pending flag to maintain interlock
	 * semantics against system calls.
	 */
	if (ucp->uc_mcontext.mc_xflags & PGEX_MAILBOX)
		p->p_flag |= P_MAILBOX;

	if (ucp->uc_mcontext.mc_onstack & 1)
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	else
		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

	lp->lwp_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(lp->lwp_sigmask);
	return(EJUSTRETURN);
}
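
/*
 * Illustrative sketch (not part of the original source): userland normally
 * reaches sys_sigreturn() via the sigcode trampoline that sendsig()
 * installed, so a handler can redirect execution simply by editing the
 * ucontext it was handed before returning; the checks above then vet the
 * modified frame.  Names below are hypothetical.
 *
 *	static void
 *	handler(int sig, siginfo_t *si, void *arg)
 *	{
 *		ucontext_t *uc = arg;
 *
 *		uc->uc_mcontext.mc_eip = (int)recovery_point;
 *		// returning runs the trampoline, which calls sigreturn(uc)
 *	}
 */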
/*
 * Stack frame on entry to function.  %eax will contain the function vector,
 * %ecx will contain the function data.  flags, ecx, and eax will have
 * already been pushed on the stack.
 */
struct upc_frame {
	register_t	eax;
	register_t	ecx;
	register_t	edx;
	register_t	flags;
	register_t	oldip;
};

void
sendupcall(struct vmupcall *vu, int morepending)
{
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;
	struct upcall upcall;
	struct upc_frame upc_frame;
	int	crit_count;

	/*
	 * If we are a virtual kernel running an emulated user process
	 * context, switch back to the virtual kernel context before
	 * trying to post the signal.
	 */
	if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
		lp->lwp_md.md_regs->tf_trapno = 0;
		vkernel_trap(lp, lp->lwp_md.md_regs);
	}

	/*
	 * Get the upcall data structure
	 */
	if (copyin(lp->lwp_upcall, &upcall, sizeof(upcall)) ||
	    copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int))
	) {
		vu->vu_pending = 0;
		kprintf("bad upcall address\n");
		return;
	}

	/*
	 * If the data structure is already marked pending or has a critical
	 * section count, mark the data structure as pending and return
	 * without doing an upcall.  vu_pending is left set.
	 */
	if (upcall.upc_pending || crit_count >= vu->vu_pending) {
		if (upcall.upc_pending < vu->vu_pending) {
			upcall.upc_pending = vu->vu_pending;
			copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending,
				sizeof(upcall.upc_pending));
		}
		return;
	}

	/*
	 * We can run this upcall now, clear vu_pending.
	 *
	 * Bump our critical section count and set or clear the
	 * user pending flag depending on whether more upcalls are
	 * pending.  The user will be responsible for calling
	 * upc_dispatch(-1) to process remaining upcalls.
	 */
	vu->vu_pending = 0;
	upcall.upc_pending = morepending;
	crit_count += TDPRI_CRIT;
	copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending,
		sizeof(upcall.upc_pending));
	copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff,
		sizeof(int));

	/*
	 * Construct a stack frame and issue the upcall
	 */
	regs = lp->lwp_md.md_regs;
	upc_frame.eax = regs->tf_eax;
	upc_frame.ecx = regs->tf_ecx;
	upc_frame.edx = regs->tf_edx;
	upc_frame.flags = regs->tf_eflags;
	upc_frame.oldip = regs->tf_eip;
	if (copyout(&upc_frame, (void *)(regs->tf_esp - sizeof(upc_frame)),
		    sizeof(upc_frame)) != 0) {
		kprintf("bad stack on upcall\n");
	} else {
		regs->tf_eax = (register_t)vu->vu_func;
		regs->tf_ecx = (register_t)vu->vu_data;
		regs->tf_edx = (register_t)lp->lwp_upcall;
		regs->tf_eip = (register_t)vu->vu_ctx;
		regs->tf_esp -= sizeof(upc_frame);
	}
}
/*
 * fetchupcall occurs in the context of a system call, which means that
 * we have to return EJUSTRETURN in order to prevent eax and edx from
 * being overwritten by the syscall return value.
 *
 * If vu is not NULL we return the new context in %edx, the new data in %ecx,
 * and the function pointer in %eax.
 */
int
fetchupcall(struct vmupcall *vu, int morepending, void *rsp)
{
	struct upc_frame upc_frame;
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;
	int error;
	struct upcall upcall;
	int crit_count;

	regs = lp->lwp_md.md_regs;

	error = copyout(&morepending, &lp->lwp_upcall->upc_pending, sizeof(int));
	if (error == 0) {
	    if (vu) {
		/*
		 * This jumps us to the next ready context.
		 */
		vu->vu_pending = 0;
		error = copyin(lp->lwp_upcall, &upcall, sizeof(upcall));
		crit_count = 0;
		if (error == 0)
			error = copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int));
		crit_count += TDPRI_CRIT;
		if (error == 0)
			error = copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, sizeof(int));
		regs->tf_eax = (register_t)vu->vu_func;
		regs->tf_ecx = (register_t)vu->vu_data;
		regs->tf_edx = (register_t)lp->lwp_upcall;
		regs->tf_eip = (register_t)vu->vu_ctx;
		regs->tf_esp = (register_t)rsp;
	    } else {
		/*
		 * This returns us to the originally interrupted code.
		 */
		error = copyin(rsp, &upc_frame, sizeof(upc_frame));
		regs->tf_eax = upc_frame.eax;
		regs->tf_ecx = upc_frame.ecx;
		regs->tf_edx = upc_frame.edx;
		regs->tf_eflags = (regs->tf_eflags & ~PSL_USERCHANGE) |
				(upc_frame.flags & PSL_USERCHANGE);
		regs->tf_eip = upc_frame.oldip;
		regs->tf_esp = (register_t)((char *)rsp + sizeof(upc_frame));
	    }
	}
	if (error == 0)
		error = EJUSTRETURN;
	return(error);
}
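
/*
 * Illustrative sketch (not part of the original source): given the register
 * protocol above, a userland upcall entry stub receives the function vector
 * in %eax, its data argument in %ecx and the struct upcall pointer in %edx,
 * while the interrupted eax/ecx/edx/eflags/eip sit in a upc_frame just
 * below the new stack pointer.  A minimal C-level dispatcher might look
 * like this (the stub name is hypothetical; upc_dispatch(-1) is the
 * follow-up call the comment in sendupcall() prescribes):
 *
 *	void
 *	upc_entry(void (*func)(void *), void *data, struct upcall *upc)
 *	{
 *		func(data);		// run the posted upcall
 *		upc_dispatch(-1);	// drain remaining upcalls; the
 *					// fetchupcall(NULL, ...) path then
 *					// pops the saved upc_frame and
 *					// resumes the interrupted code
 *	}
 */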
/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Shut down the CPU as much as possible
 */
void
cpu_halt(void)
{
	for (;;)
		__asm__ __volatile("hlt");
}
/*
 * cpu_idle() represents the idle LWKT.  You cannot return from this function
 * (unless you want to blow things up!).  Instead we look for runnable threads
 * and loop or halt as appropriate.  Giant is not held on entry to the thread.
 *
 * The main loop is entered with a critical section held, we must release
 * the critical section before doing anything else.  lwkt_switch() will
 * check for pending interrupts due to entering and exiting its own
 * critical section.
 *
 * Note on cpu_idle_hlt:  On an SMP system we rely on a scheduler IPI
 * to wake a HLTed cpu up.  However, there are cases where the idlethread
 * will be entered with the possibility that no IPI will occur and in such
 * cases lwkt_switch() sets TDF_IDLE_NOHLT.
 */
static int	cpu_idle_hlt = 1;
static int	cpu_idle_hltcnt;
static int	cpu_idle_spincnt;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hltcnt, CTLFLAG_RW,
    &cpu_idle_hltcnt, 0, "Idle loop entry halts");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_spincnt, CTLFLAG_RW,
    &cpu_idle_spincnt, 0, "Idle loop entry spins");
static void
cpu_idle_default_hook(void)
{
	/*
	 * We must guarantee that hlt is exactly the instruction
	 * following the sti.
	 */
	__asm __volatile("sti; hlt");
}

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;
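
/*
 * Illustrative sketch (not part of the original source): a subsystem such
 * as ACPI can substitute a deeper-sleep routine at attach time; the
 * function below is hypothetical.
 *
 *	static void
 *	acpi_cpu_idle(void)
 *	{
 *		// enter a C-state instead of plain "sti; hlt"
 *	}
 *
 *	cpu_idle_hook = acpi_cpu_idle;
 *
 * Note that cpu_idle() below invokes the hook after a "cli" and a final
 * lwkt_runnable() check, so a replacement hook, like the default one, is
 * responsible for re-enabling interrupts.
 */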
void
cpu_idle(void)
{
	struct thread *td = curthread;

	crit_exit();
	KKASSERT(td->td_pri < TDPRI_CRIT);
	for (;;) {
		/*
		 * See if there are any LWKTs ready to go.
		 */
		lwkt_switch();

		/*
		 * If we are going to halt call splz unconditionally after
		 * CLIing to catch any interrupt races.  Note that we are
		 * at SPL0 and interrupts are enabled.
		 */
		if (cpu_idle_hlt && !lwkt_runnable() &&
		    (td->td_flags & TDF_IDLE_NOHLT) == 0) {
			__asm __volatile("cli");
			splz();
			if (!lwkt_runnable())
				cpu_idle_hook();
#ifdef SMP
			else
				__asm __volatile("pause");
#endif
			++cpu_idle_hltcnt;
		} else {
			td->td_flags &= ~TDF_IDLE_NOHLT;
			splz();
#ifdef SMP
			__asm __volatile("sti; pause");
#else
			__asm __volatile("sti");
#endif
			++cpu_idle_spincnt;
		}
	}
}
/*
 * This routine is called when the only runnable threads require
 * the MP lock, and the scheduler couldn't get it.  On a real cpu
 * we let the scheduler spin.
 */
void
cpu_mplock_contested(void)
{
	cpu_pause();
}

/*
 * This routine is called if a spinlock has been held through the
 * exponential backoff period and is seriously contested.  On a real cpu
 * we let it spin.
 */
void
cpu_spinlock_contested(void)
{
	cpu_pause();
}
/*
 * Clear registers on exec
 */
void
exec_setregs(u_long entry, u_long stack, u_long ps_strings)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct pcb *pcb = td->td_pcb;
	struct trapframe *regs = lp->lwp_md.md_regs;

	/* was i386_user_cleanup() in NetBSD */
	user_ldt_free(pcb);

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_eip = entry;
	regs->tf_esp = stack;
	regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_gs = _udatasel;
	regs->tf_cs = _ucodesel;

	/* PS_STRINGS value for BSD/OS binaries.  It is 0 for non-BSD/OS. */
	regs->tf_ebx = ps_strings;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == td->td_pcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}

	/*
	 * Initialize the math emulator (if any) for the current process.
	 * Actually, just clear the bit that says that the emulator has
	 * been initialized.  Initialization is delayed until the process
	 * traps to the emulator (if it is done at all) mainly because
	 * emulators don't provide an entry point for initialization.
	 */
	pcb->pcb_flags &= ~FP_SOFTFP;

	/*
	 * note: do not set CR0_TS here.  npxinit() must do it after clearing
	 * gd_npxthread.  Otherwise a preemptive interrupt thread may panic
	 * in npxdna().
	 */
	load_cr0(rcr0() | CR0_MP);

	/* Initialize the npx (if any) for the current process. */
	npxinit(__INITIAL_NPXCW__);

	/*
	 * note: linux emulator needs edx to be 0x0 on entry, which is
	 * handled in execve simply by setting the 64 bit syscall
	 * return value to 0.
	 */
}

static void
cpu_setregs(void)
{
	unsigned int cr0;

	cr0 = rcr0();
	cr0 |= CR0_NE;			/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
	cr0 |= CR0_WP | CR0_AM;
	load_cr0(cr0);
}
static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
		req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
	CTLFLAG_RW, &disable_rtc_set, 0, "");

SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
	CTLFLAG_RD, &bootinfo, bootinfo, "");

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
	CTLFLAG_RW, &wall_cmos_clock, 0, "");

extern u_long bootdev;		/* not a cdev_t - encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
	CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)");
/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

union descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
union descriptor ldt[NLDT];		/* local descriptor table */

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt, r_idt;

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif

static struct i386tss dblfault_tss;
static char dblfault_stack[PAGE_SIZE];

extern  struct user *proc0paddr;

/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	0x0,			/* segment base address */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GCODE_SEL	1 Code Descriptor for kernel */
{	0x0,			/* segment base address */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GDATA_SEL	2 Data Descriptor for kernel */
{	0x0,			/* segment base address */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GPRIV_SEL	3 SMP Per-Processor Private Data Descriptor */
{	0x0,			/* segment base address */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GPROC0_SEL	4 Proc 0 Tss Descriptor */
{
	0x0,			/* segment base address */
	sizeof(struct i386tss)-1,/* length - all address space */
	SDT_SYS386TSS,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GLDT_SEL	5 LDT Descriptor */
{	(int) ldt,		/* segment base address */
	sizeof(ldt)-1,		/* length - all address space */
	SDT_SYSLDT,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GUSERLDT_SEL	6 User LDT Descriptor per process */
{	(int) ldt,		/* segment base address */
	(512 * sizeof(union descriptor)-1),	/* length */
	SDT_SYSLDT,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GTGATE_SEL	7 Null Descriptor - Placeholder */
{	0x0,			/* segment base address */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
{	0x400,			/* segment base address */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GPANIC_SEL	9 Panic Tss Descriptor */
{	(int) &dblfault_tss,	/* segment base address */
	sizeof(struct i386tss)-1,/* length - all address space */
	SDT_SYS386TSS,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GBIOSCODE32_SEL 10 BIOS 32-bit interface (32bit Code) */
{	0,			/* segment base address (overwritten) */
	0xfffff,		/* length */
	SDT_MEMERA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GBIOSCODE16_SEL 11 BIOS 32-bit interface (16bit Code) */
{	0,			/* segment base address (overwritten) */
	0xfffff,		/* length */
	SDT_MEMERA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GBIOSDATA_SEL 12 BIOS 32-bit interface (Data) */
{	0,			/* segment base address (overwritten) */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GBIOSUTIL_SEL 13 BIOS 16-bit interface (Utility) */
{	0,			/* segment base address (overwritten) */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GBIOSARGS_SEL 14 BIOS 16-bit interface (Arguments) */
{	0,			/* segment base address (overwritten) */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GTLS_START 15 TLS */
{	0x0,			/* segment base address */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GTLS_START+1 16 TLS */
{	0x0,			/* segment base address */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GTLS_END 17 TLS */
{	0x0,			/* segment base address */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
};
static struct soft_segment_descriptor ldt_segs[] = {
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
	/* Code Descriptor for user */
{	0x0,			/* segment base address */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
	/* Data Descriptor for user */
{	0x0,			/* segment base address */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
};
void
setidt(int idx, inthand_t *func, int typ, int dpl, int selec)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (int)func;
	ip->gd_selector = selec;
	ip->gd_stkcpy = 0;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((int)func)>>16 ;
}
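
/*
 * Usage note (not part of the original source): the dpl argument decides
 * who may invoke the gate and the typ argument decides whether interrupts
 * stay enabled.  init386() below uses exactly these patterns:
 *
 *	// vector 3 (breakpoint): a trap gate callable from user mode
 *	setidt(3, &IDTVEC(bpt), SDT_SYS386TGT, SEL_UPL,
 *	       GSEL(GCODE_SEL, SEL_KPL));
 *
 *	// vector 14 (page fault): an interrupt gate, so %cr2 can be
 *	// sampled before interrupts are re-enabled
 *	setidt(14, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL,
 *	       GSEL(GCODE_SEL, SEL_KPL));
 */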
#define	IDTVEC(name)	__CONCAT(X,name)

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(syscall),
	IDTVEC(rsvd0);
extern inthand_t
	IDTVEC(int0x80_syscall);

#ifdef DEBUG_INTERRUPTS
extern inthand_t *Xrsvdary[256];
#endif
void
sdtossd(struct segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}
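
/*
 * Worked example (not part of the original source): sdtossd() undoes the
 * split that the hardware descriptor layout imposes (ssdtosd(), used in
 * init386() below, performs the opposite packing).  For a descriptor with
 * sd_hibase == 0x12 and sd_lobase == 0x345678:
 *
 *	ssd_base = (0x12 << 24) | 0x345678;	// == 0x12345678
 *
 * and ssd_limit likewise recombines the 4-bit sd_hilimit with the 16-bit
 * sd_lolimit.
 */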
/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * the value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 */
static void
getmemsize(int first)
{
	int i, physmap_idx, pa_indx, da_indx;
	int hasbrokenint12;
	u_int basemem, extmem;
	struct vm86frame vmf;
	struct vm86context vmc;
	vm_offset_t physmap[PHYSMAP_ENTRIES*2];
	vm_paddr_t pa;
	pt_entry_t *pte;
	quad_t maxmem;
	struct {
		u_int64_t base;
		u_int64_t length;
		u_int32_t type;
	} *smap;
	quad_t dcons_addr, dcons_size;

	bzero(&vmf, sizeof(struct vm86frame));
	bzero(physmap, sizeof(physmap));
	basemem = 0;
	/*
	 * Some newer BIOSes have a broken INT 12H implementation which
	 * causes a kernel panic immediately.  In this case, we need to
	 * scan the SMAP with INT 15:E820 first, then determine base
	 * memory size.
	 */
	hasbrokenint12 = 0;
	TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
	if (hasbrokenint12) {
		goto int15e820;
	}

	/*
	 * Perform "base memory" related probes & setup.  If we get a crazy
	 * value give the bios some scribble space just in case.
	 */
	vm86_intcall(0x12, &vmf);
	basemem = vmf.vmf_ax;
	if (basemem > 640) {
		kprintf("Preposterous BIOS basemem of %uK, "
			"truncating to < 640K\n", basemem);
		basemem = 636;
	}

	/*
	 * XXX if biosbasemem is now < 640, there is a `hole'
	 * between the end of base memory and the start of
	 * ISA memory.  The hole may be empty or it may
	 * contain BIOS code or data.  Map it read/write so
	 * that the BIOS can write to it.  (Memory from 0 to
	 * the physical end of the kernel is mapped read-only
	 * to begin with and then parts of it are remapped.
	 * The parts that aren't remapped form holes that
	 * remain read-only and are unused by the kernel.
	 * The base memory area is below the physical end of
	 * the kernel and right now forms a read-only hole.
	 * The part of it from PAGE_SIZE to
	 * (trunc_page(biosbasemem * 1024) - 1) will be
	 * remapped and used by the kernel later.)
	 *
	 * This code is similar to the code used in
	 * pmap_mapdev, but since no memory needs to be
	 * allocated we simply change the mapping.
	 */
	for (pa = trunc_page(basemem * 1024);
	     pa < ISA_HOLE_START; pa += PAGE_SIZE) {
		pte = vtopte(pa + KERNBASE);
		*pte = pa | PG_RW | PG_V;
	}

	/*
	 * if basemem != 640, map pages r/w into vm86 page table so
	 * that the bios can scribble on it.
	 */
	pte = (pt_entry_t *)vm86paddr;
	for (i = basemem / 4; i < 160; i++)
		pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;

	/*
	 * map page 1 R/W into the kernel page table so we can use it
	 * as a buffer.  The kernel will unmap this page later.
	 */
	pte = vtopte(KERNBASE + (1 << PAGE_SHIFT));
	*pte = (1 << PAGE_SHIFT) | PG_RW | PG_V;
int15e820:
	/*
	 * get memory map with INT 15:E820
	 */
#define SMAPSIZ 	sizeof(*smap)
#define SMAP_SIG	0x534D4150			/* 'SMAP' */

	vmc.npages = 0;
	smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
	vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);

	physmap_idx = 0;
	vmf.vmf_ebx = 0;
	do {
		vmf.vmf_eax = 0xE820;
		vmf.vmf_edx = SMAP_SIG;
		vmf.vmf_ecx = SMAPSIZ;
		i = vm86_datacall(0x15, &vmf, &vmc);
		if (i || vmf.vmf_eax != SMAP_SIG)
			break;
		if (boothowto & RB_VERBOSE)
			kprintf("SMAP type=%02x base=%08x %08x len=%08x %08x\n",
				smap->type,
				*(u_int32_t *)((char *)&smap->base + 4),
				(u_int32_t)smap->base,
				*(u_int32_t *)((char *)&smap->length + 4),
				(u_int32_t)smap->length);

		if (smap->type != 0x01)
			goto next_run;

		if (smap->length == 0)
			goto next_run;

		if (smap->base >= 0xffffffff) {
			kprintf("%ju MB of memory above 4GB ignored\n",
				(uintmax_t)(smap->length / 1024 / 1024));
			goto next_run;
		}

		for (i = 0; i <= physmap_idx; i += 2) {
			if (smap->base < physmap[i + 1]) {
				if (boothowto & RB_VERBOSE)
					kprintf(
	"Overlapping or non-monotonic memory region, ignoring second region\n");
				goto next_run;
			}
		}

		if (smap->base == physmap[physmap_idx + 1]) {
			physmap[physmap_idx + 1] += smap->length;
			goto next_run;
		}

		physmap_idx += 2;
		if (physmap_idx == PHYSMAP_ENTRIES*2) {
			kprintf(
	"Too many segments in the physical address map, giving up\n");
			break;
		}
		physmap[physmap_idx] = smap->base;
		physmap[physmap_idx + 1] = smap->base + smap->length;
next_run:
		;	/* fix GCC3.x warning */
	} while (vmf.vmf_ebx != 0);
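
	/*
	 * Illustrative data (not part of the original source): each
	 * successful INT 15:E820 iteration deposits one SMAP entry in the
	 * vm86 buffer.  A typical 128MB machine might report:
	 *
	 *	{ base = 0x0000000000000000, length = 0x9fc00,   type = 1 }  usable
	 *	{ base = 0x000000000009fc00, length = 0x400,     type = 2 }  reserved
	 *	{ base = 0x0000000000100000, length = 0x7f00000, type = 1 }  usable
	 *
	 * Only type 0x01 ("usable") entries survive the filtering above and
	 * are merged into physmap[]; %ebx carries the BIOS continuation
	 * cookie, so the loop ends when the BIOS hands back zero.
	 */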
	/*
	 * Perform "base memory" related probes & setup based on SMAP
	 */
	if (basemem == 0) {
		for (i = 0; i <= physmap_idx; i += 2) {
			if (physmap[i] == 0x00000000) {
				basemem = physmap[i + 1] / 1024;
				break;
			}
		}

		if (basemem == 0)
			basemem = 640;

		if (basemem > 640) {
			kprintf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
				basemem);
			basemem = 640;
		}

		for (pa = trunc_page(basemem * 1024);
		     pa < ISA_HOLE_START; pa += PAGE_SIZE) {
			pte = vtopte(pa + KERNBASE);
			*pte = pa | PG_RW | PG_V;
		}

		pte = (pt_entry_t *)vm86paddr;
		for (i = basemem / 4; i < 160; i++)
			pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
	}

	if (physmap[1] != 0)
		goto physmap_done;
	/*
	 * If we failed above, try memory map with INT 15:E801
	 */
	vmf.vmf_ax = 0xE801;
	if (vm86_intcall(0x15, &vmf) == 0) {
		extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
	} else {
#if 0
		vmf.vmf_ah = 0x88;
		vm86_intcall(0x15, &vmf);
		extmem = vmf.vmf_ax;
#else
		/*
		 * Prefer the RTC value for extended memory.
		 */
		extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
#endif
	}

	/*
	 * Special hack for chipsets that still remap the 384k hole when
	 * there's 16MB of memory - this really confuses people that
	 * are trying to use bus mastering ISA controllers with the
	 * "16MB limit"; they only have 16MB, but the remapping puts
	 * them beyond the limit.
	 *
	 * If extended memory is between 15-16MB (16-17MB phys address range),
	 *	chop it to 15MB.
	 */
	if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
		extmem = 15 * 1024;

	physmap[0] = 0;
	physmap[1] = basemem * 1024;
	physmap_idx = 2;
	physmap[physmap_idx] = 0x100000;
	physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;
physmap_done:
	/*
	 * Now, physmap contains a map of physical memory.
	 */

	/* make hole for AP bootstrap code YYY */
	physmap[1] = mp_bootaddress(physmap[1]);

	/* Save EBDA address, if any */
	ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e));
	ebda_addr <<= 4;
	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (kgetenv_quad("hw.physmem", &maxmem))
		Maxmem = atop(maxmem);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		kprintf("Physical memory use set to %lluK\n", Maxmem * 4);

	/*
	 * If Maxmem has been increased beyond what the system has detected,
	 * extend the last memory segment to the new limit.
	 */
	if (atop(physmap[physmap_idx + 1]) < Maxmem)
		physmap[physmap_idx + 1] = ptoa(Maxmem);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(first, 0);
	/*
	 * Size up each available chunk of physical memory.
	 */
	physmap[0] = PAGE_SIZE;		/* mask off page 0 */
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    kgetenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;
	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa(Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= 0x100000 && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_N;
			cpu_invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa) {
				page_bad = TRUE;
			}
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555) {
				page_bad = TRUE;
			}
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff) {
				page_bad = TRUE;
			}
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0) {
				page_bad = TRUE;
			}
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE) {
				continue;
			}
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx >= PHYSMAP_ENTRIES*2) {
					kprintf("Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE;	/* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx >= PHYSMAP_ENTRIES*2) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa;	/* start */
				dump_avail[da_indx] = pa + PAGE_SIZE;	/* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	cpu_invltlb();
	/*
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(MSGBUF_SIZE);

	avail_end = phys_avail[pa_indx];
}
/*
 * IDT VECTORS:
 *	0	Divide by zero
 *	1	Debug
 *	2	NMI
 *	3	BreakPoint
 *	4	OverFlow
 *	5	Bound-Range
 *	6	Invalid OpCode
 *	7	Device Not Available (x87)
 *	8	Double-Fault
 *	9	Coprocessor Segment overrun (unsupported, reserved)
 *	10	Invalid-TSS
 *	11	Segment not present
 *	12	Stack
 *	13	General Protection
 *	14	Page Fault
 *	15	Reserved
 *	16	x87 FP Exception pending
 *	17	Alignment Check
 *	18	Machine Check
 *	19	SIMD floating point
 *	20-31	reserved
 *	32-255	INTn/external sources
 */
void
init386(int first)
{
	struct gate_descriptor *gdp;
	int gsel_tss, metadata_missing, off, x;
	struct mdglobaldata *gd;

	/*
	 * Prevent lowering of the ipl if we call tsleep() early.
	 */
	gd = &CPU_prvspace[0].mdglobaldata;
	bzero(gd, sizeof(*gd));

	gd->mi.gd_curthread = &thread0;
	thread0.td_gd = &gd->mi;

	atdevbase = ISA_HOLE_START + KERNBASE;

	metadata_missing = 0;
	if (bootinfo.bi_modulep) {
		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}
	if (bootinfo.bi_envp)
		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;

	/*
	 * start with one cpu.  Note: with one cpu, ncpus2_shift, ncpus2_mask,
	 * and ncpus_fit_mask remain 0.
	 */
	ncpus = 1;
	ncpus2 = 1;
	ncpus_fit = 1;
	/* Init basic tunables, hz etc */
	init_param1();
	/*
	 * make gdt memory segments, the code segment goes up to end of the
	 * page with etext in it, the data segment goes to the end of
	 * the address space.
	 */
	/*
	 * XXX text protection is temporarily (?) disabled.  The limit was
	 * i386_btop(round_page(etext)) - 1.
	 */
	gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);

	gdt_segs[GPRIV_SEL].ssd_limit =
		atop(sizeof(struct privatespace) - 1);
	gdt_segs[GPRIV_SEL].ssd_base = (int) &CPU_prvspace[0];
	gdt_segs[GPROC0_SEL].ssd_base =
		(int) &CPU_prvspace[0].mdglobaldata.gd_common_tss;

	gd->mi.gd_prvspace = &CPU_prvspace[0];

	/*
	 * Note: on both UP and SMP curthread must be set non-NULL
	 * early in the boot sequence because the system assumes
	 * that 'curthread' is never NULL.
	 */

	for (x = 0; x < NGDT; x++) {
#ifdef BDE_DEBUGGER
		/* avoid overwriting db entries with APM ones */
		if (x >= GAPMCODE32_SEL && x <= GAPMDATA_SEL)
			continue;
#endif
		ssdtosd(&gdt_segs[x], &gdt[x].sd);
	}

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (int) gdt;
	lgdt(&r_gdt);

	mi_gdinit(&gd->mi, 0);
	cpu_gdinit(gd, 0);
	mi_proc0init(&gd->mi, proc0paddr);
	safepri = TDPRI_MAX;

	/* make ldt memory segments */
	/*
	 * XXX - VM_MAX_USER_ADDRESS is an end address, not a max.  And it
	 * should be spelled ...MAX_USER...
	 */
	ldt_segs[LUCODE_SEL].ssd_limit = atop(VM_MAX_USER_ADDRESS - 1);
	ldt_segs[LUDATA_SEL].ssd_limit = atop(VM_MAX_USER_ADDRESS - 1);
	for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
		ssdtosd(&ldt_segs[x], &ldt[x].sd);

	_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
	lldt(_default_ldt);
	gd->gd_currentldt = _default_ldt;
	/* spinlocks and the BGL */
	init_locks();
	/*
	 * Set up the hardware exception table.  Most exceptions use
	 * SDT_SYS386TGT, known as a 'trap gate'.  Trap gates leave
	 * interrupts enabled.  VM page faults use SDT_SYS386IGT, known as
	 * an 'interrupt trap gate', which disables interrupts on entry,
	 * in order to be able to poll the appropriate CRn register to
	 * determine the fault address.
	 */
	for (x = 0; x < NIDT; x++) {
#ifdef DEBUG_INTERRUPTS
		setidt(x, Xrsvdary[x], SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
#else
		setidt(x, &IDTVEC(rsvd0), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
#endif
	}
	setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(1, &IDTVEC(dbg), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(3, &IDTVEC(bpt), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
	setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(14, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(15, &IDTVEC(rsvd0), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(18, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(19, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(0x80, &IDTVEC(int0x80_syscall),
			SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));

	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (int) idt;
	lidt(&r_idt);
	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

	if (metadata_missing)
		kprintf("WARNING: loader(8) metadata is missing!\n");

#if defined(DDB)
	kdb_init();
	if (boothowto & RB_KDB)
		Debugger("Boot flags requested debugger");
#endif

	finishidentcpu();	/* Final stage of CPU initialization */
	setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	initializecpu();	/* Initialize CPU registers */
	/*
	 * make an initial tss so cpu can get interrupt stack on syscall!
	 * The 16 bytes is to save room for a VM86 context.
	 */
	gd->gd_common_tss.tss_esp0 = (int) thread0.td_pcb - 16;
	gd->gd_common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ;
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gd->gd_tss_gdt = &gdt[GPROC0_SEL].sd;
	gd->gd_common_tssd = *gd->gd_tss_gdt;
	gd->gd_common_tss.tss_ioopt = (sizeof gd->gd_common_tss) << 16;
	ltr(gsel_tss);

	dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
	    dblfault_tss.tss_esp2 = (int) &dblfault_stack[sizeof(dblfault_stack)];
	dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
	    dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
	dblfault_tss.tss_cr3 = (int)IdlePTD;
	dblfault_tss.tss_eip = (int) dblfault_handler;
	dblfault_tss.tss_eflags = PSL_KERNEL;
	dblfault_tss.tss_ds = dblfault_tss.tss_es =
	    dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
	dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
	dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
	dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
	vm86_initialize();
	getmemsize(first);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

	/* Map the message buffer. */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);

	msgbufinit(msgbufp, MSGBUF_SIZE);
	/* make a call gate to reenter kernel with */
	gdp = &ldt[LSYS5CALLS_SEL].gd;

	x = (int) &IDTVEC(syscall);
	gdp->gd_looffset = x++;
	gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
	gdp->gd_stkcpy = 1;
	gdp->gd_type = SDT_SYS386CGT;
	gdp->gd_dpl = SEL_UPL;
	gdp->gd_p = 1;
	gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16;

	/* XXX does this work? */
	ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
	ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];
	/* transfer to user mode */

	_ucodesel = LSEL(LUCODE_SEL, SEL_UPL);
	_udatasel = LSEL(LUDATA_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_pcb->pcb_cr3 = (int)IdlePTD;	/* should already be setup */
	thread0.td_pcb->pcb_ext = 0;
	lwp0.lwp_md.md_regs = &proc0_tf;
}
/*
 * Initialize machine-dependent portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
 */
void
cpu_gdinit(struct mdglobaldata *gd, int cpu)
{
	if (cpu)
		gd->mi.gd_curthread = &gd->mi.gd_idlethread;

	lwkt_init_thread(&gd->mi.gd_idlethread,
			 gd->mi.gd_prvspace->idlestack,
			 sizeof(gd->mi.gd_prvspace->idlestack),
			 TDF_MPSAFE, &gd->mi);
	lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
	gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
	gd->mi.gd_idlethread.td_sp -= sizeof(void *);
	*(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
}
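/*
 * The td_sp manipulation above seeds the idle thread's empty stack with a
 * single "return address": the first switch into the thread pops
 * cpu_idle_restore and begins executing there, as if the idle loop had
 * called the switcher itself.
 */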
int
is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
{
	if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
	    eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
		return (TRUE);
	}
	return (FALSE);
}

struct globaldata *
globaldata_find(int cpu)
{
	KKASSERT(cpu >= 0 && cpu < ncpus);
	return (&CPU_prvspace[cpu].mdglobaldata.mi);
}
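/*
 * Note that &CPU_prvspace[MAXCPU] serves as an exclusive upper bound in
 * is_globaldata_space(): the per-cpu private pages form one contiguous
 * array, so a [saddr, eaddr) range qualifies iff it lies inside
 * [&CPU_prvspace[0], &CPU_prvspace[MAXCPU]).
 */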
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_BOOT2_BIOS, SI_ORDER_ANY, f00f_hack, NULL);

static void
f00f_hack(void *unused)
{
	struct gate_descriptor *new_idt;
	vm_offset_t tmp;

	if (!has_f00f_bug)
		return;

	kprintf("Intel Pentium detected, installing workaround for F00F bug\n");

	r_idt.rd_limit = sizeof(idt0) - 1;

	tmp = kmem_alloc(&kernel_map, PAGE_SIZE * 2);
	if (tmp == 0)
		panic("kmem_alloc returned 0");
	if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0)
		panic("kmem_alloc returned non-page-aligned memory");
	/* Put the first seven entries in the lower page */
	new_idt = (struct gate_descriptor *)(tmp + PAGE_SIZE - (7*8));
	bcopy(idt, new_idt, sizeof(idt0));
	r_idt.rd_base = (int)new_idt;
	lidt(&r_idt);
	idt = new_idt;
	if (vm_map_protect(&kernel_map, tmp, tmp + PAGE_SIZE,
			   VM_PROT_READ, FALSE) != KERN_SUCCESS)
		panic("vm_map_protect failed");
}
#endif /* defined(I586_CPU) && !defined(NO_F00F_HACK) */
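/*
 * Why the layout above helps (summary of the standard workaround; the
 * original comments do not spell it out): the "f00f" lockup occurs while
 * the CPU performs a locked IDT access to deliver #UD (vector 6).  With
 * descriptors 0-6 packed at the end of a read-only page, that locked
 * access faults instead, and the #PF gate (vector 14) lives on the
 * second, still-writable page, so the fault is deliverable and the trap
 * code can vector it back to the intended exception handler.
 */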
int
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
	lp->lwp_md.md_regs->tf_eip = addr;
	return (0);
}

int
ptrace_single_step(struct lwp *lp)
{
	lp->lwp_md.md_regs->tf_eflags |= PSL_T;
	return (0);
}
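/*
 * PSL_T is the eflags trace flag: once set, the CPU raises a debug trap
 * after the next instruction retires, which the trap handler reports to
 * the tracing parent as a single-step event.
 */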
int
fill_regs(struct lwp *lp, struct reg *regs)
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	regs->r_gs = tp->tf_gs;
	regs->r_fs = tp->tf_fs;
	regs->r_es = tp->tf_es;
	regs->r_ds = tp->tf_ds;
	regs->r_edi = tp->tf_edi;
	regs->r_esi = tp->tf_esi;
	regs->r_ebp = tp->tf_ebp;
	regs->r_ebx = tp->tf_ebx;
	regs->r_edx = tp->tf_edx;
	regs->r_ecx = tp->tf_ecx;
	regs->r_eax = tp->tf_eax;
	regs->r_eip = tp->tf_eip;
	regs->r_cs = tp->tf_cs;
	regs->r_eflags = tp->tf_eflags;
	regs->r_esp = tp->tf_esp;
	regs->r_ss = tp->tf_ss;
	pcb = lp->lwp_thread->td_pcb;
	return (0);
}
int
set_regs(struct lwp *lp, struct reg *regs)
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_gs = regs->r_gs;
	tp->tf_fs = regs->r_fs;
	tp->tf_es = regs->r_es;
	tp->tf_ds = regs->r_ds;
	tp->tf_edi = regs->r_edi;
	tp->tf_esi = regs->r_esi;
	tp->tf_ebp = regs->r_ebp;
	tp->tf_ebx = regs->r_ebx;
	tp->tf_edx = regs->r_edx;
	tp->tf_ecx = regs->r_ecx;
	tp->tf_eax = regs->r_eax;
	tp->tf_eip = regs->r_eip;
	tp->tf_cs = regs->r_cs;
	tp->tf_eflags = regs->r_eflags;
	tp->tf_esp = regs->r_esp;
	tp->tf_ss = regs->r_ss;
	pcb = lp->lwp_thread->td_pcb;
	return (0);
}
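/*
 * The EFL_SECURE()/CS_SECURE() test above carries the security burden of
 * set_regs(): it rejects eflags writes that would change privileged bits
 * such as IOPL, and %cs values that do not name a ring-3 code segment, so
 * a debugger cannot promote its target to kernel privilege via ptrace.
 */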
#ifndef CPU_DISABLE_SSE
static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;

	sv_87->sv_ex_sw = sv_xmm->sv_ex_sw;
}
static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];

	sv_xmm->sv_ex_sw = sv_87->sv_ex_sw;
}
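/*
 * Caveat (not noted in the original): fxsave's envxmm keeps an abridged
 * one-bit-per-register tag word while save87's en_tw is the full two-bit
 * form, so the straight en_tw copies in the two converters above carry
 * the historical behaviour rather than translating between encodings.
 */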
#endif /* CPU_DISABLE_SSE */
int
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
#ifndef CPU_DISABLE_SSE
	if (cpu_fxsr) {
		fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
				(struct save87 *)fpregs);
		return (0);
	}
#endif /* CPU_DISABLE_SSE */
	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
	return (0);
}
int
set_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
#ifndef CPU_DISABLE_SSE
	if (cpu_fxsr) {
		set_fpregs_xmm((struct save87 *)fpregs,
			       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
		return (0);
	}
#endif /* CPU_DISABLE_SSE */
	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
	return (0);
}
int
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (lp == NULL) {
		dbregs->dr0 = rdr0();
		dbregs->dr1 = rdr1();
		dbregs->dr2 = rdr2();
		dbregs->dr3 = rdr3();
		dbregs->dr4 = rdr4();
		dbregs->dr5 = rdr5();
		dbregs->dr6 = rdr6();
		dbregs->dr7 = rdr7();
	} else {
		pcb = lp->lwp_thread->td_pcb;
		dbregs->dr0 = pcb->pcb_dr0;
		dbregs->dr1 = pcb->pcb_dr1;
		dbregs->dr2 = pcb->pcb_dr2;
		dbregs->dr3 = pcb->pcb_dr3;
		dbregs->dr4 = 0;
		dbregs->dr5 = 0;
		dbregs->dr6 = pcb->pcb_dr6;
		dbregs->dr7 = pcb->pcb_dr7;
	}
	return (0);
}
int
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	if (lp == NULL) {
		load_dr0(dbregs->dr0);
		load_dr1(dbregs->dr1);
		load_dr2(dbregs->dr2);
		load_dr3(dbregs->dr3);
		load_dr4(dbregs->dr4);
		load_dr5(dbregs->dr5);
		load_dr6(dbregs->dr6);
		load_dr7(dbregs->dr7);
	} else {
		struct pcb *pcb;
		struct ucred *ucred;
		int i;
		uint32_t mask1, mask2;

		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected TRCTRAP.
		 */
		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 8;
		     i++, mask1 <<= 2, mask2 <<= 2) {
			if ((dbregs->dr7 & mask1) == mask2)
				return (EINVAL);
		}

		pcb = lp->lwp_thread->td_pcb;
		ucred = lp->lwp_proc->p_ucred;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * the process's address space, unless, perhaps, we were
		 * called by uid 0.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */
		if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
			if (dbregs->dr7 & 0x3) {
				/* dr0 is enabled */
				if (dbregs->dr0 >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr7 & (0x3<<2)) {
				/* dr1 is enabled */
				if (dbregs->dr1 >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr7 & (0x3<<4)) {
				/* dr2 is enabled */
				if (dbregs->dr2 >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr7 & (0x3<<6)) {
				/* dr3 is enabled */
				if (dbregs->dr3 >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
		}

		pcb->pcb_dr0 = dbregs->dr0;
		pcb->pcb_dr1 = dbregs->dr1;
		pcb->pcb_dr2 = dbregs->dr2;
		pcb->pcb_dr3 = dbregs->dr3;
		pcb->pcb_dr6 = dbregs->dr6;
		pcb->pcb_dr7 = dbregs->dr7;

		pcb->pcb_flags |= PCB_DBREGS;
	}
	return (0);
}
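/*
 * Shape of the dr7 check in set_dbregs() (illustrative): bits 16..31 are
 * eight 2-bit fields, R/W0,LEN0 .. R/W3,LEN3.  The loop slides mask1
 * (11b) and mask2 (10b) across them and rejects any field equal to 10b:
 * as an R/W value that selects I/O breakpoints (undefined without
 * CR4.DE), and as a LEN value an undefined 8-byte length on these CPUs.
 * For example, dr7 = 0x00020000 is refused while 0x00030000 is accepted.
 */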
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int32_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int32_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0x000000ff) == 0) {
		/*
		 * all GE and LE bits in the dr7 register are zero,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers
		 */
		return 0;
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0x0000000f;

	if (!bp) {
		/*
		 * None of the breakpoint bits are set, meaning this
		 * trap was not caused by any of the debug registers
		 */
		return 0;
	}

	/*
	 * at least one of the breakpoints was hit, check to see
	 * which ones and if any of them are user space addresses
	 */

	if (bp & 0x01)
		addr[nbp++] = (caddr_t)rdr0();
	if (bp & 0x02)
		addr[nbp++] = (caddr_t)rdr1();
	if (bp & 0x04)
		addr[nbp++] = (caddr_t)rdr2();
	if (bp & 0x08)
		addr[nbp++] = (caddr_t)rdr3();

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}
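/*
 * The debug-trap path in trap() consults user_dbreg_trap(); a positive
 * return (the count of user-space breakpoints that fired) means the event
 * is delivered to the process as a debugger trap rather than being
 * treated as a stray kernel debug exception.
 */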
void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}
#ifdef DDB

/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called inside DDB.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */

#undef inb
#undef outb

/* silence compiler warnings */
u_char inb(u_int);
void outb(u_int, u_char);

u_char
inb(u_int port)
{
	u_char	data;
	/*
	 * We use %%dx and not %1 here because i/o is done at %dx and not at
	 * %edx, while gcc generates inferior code (movw instead of movl)
	 * if we tell it to load (u_short) port.
	 */
	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
	return (data);
}

void
outb(u_int port, u_char data)
{
	u_char	al;
	/*
	 * Use an unnecessary assignment to help gcc's register allocator.
	 * This makes a large difference for gcc-1.40 and a tiny difference
	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
	 * best results.  gcc-2.6.0 can't handle this.
	 */
	al = data;
	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
}

#endif /* DDB */
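/*
 * Usage sketch: with the out-of-line inb()/outb() above available, one
 * can poke at I/O ports from the kernel debugger prompt, e.g.
 *
 *	db> call inb(0x61)
 *
 * which the inline macro versions cannot support.
 */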
#include "opt_cpu.h"

/*
 * initialize all the SMP locks
 */

/* critical region when masking or unmasking interrupts */
struct spinlock_deprecated imen_spinlock;

/* Make FAST_INTR() routines sequential */
struct spinlock_deprecated fast_intr_spinlock;

/* critical region for old style disable_intr/enable_intr */
struct spinlock_deprecated mpintr_spinlock;

/* critical region around INTR() routines */
struct spinlock_deprecated intr_spinlock;

/* lock region used by kernel profiling */
struct spinlock_deprecated mcount_spinlock;

/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct spinlock_deprecated com_spinlock;

/* locks kernel kprintfs */
struct spinlock_deprecated cons_spinlock;

/* lock regions around the clock hardware */
struct spinlock_deprecated clock_spinlock;

/* lock around the MP rendezvous */
struct spinlock_deprecated smp_rv_spinlock;

static void
init_locks(void)
{
	/*
	 * mp_lock = 0;	BSP already owns the MP lock
	 */
	/*
	 * Get the initial mp_lock with a count of 1 for the BSP.
	 * This uses a LOGICAL cpu ID, ie BSP == 0.
	 */
#ifdef SMP
	cpu_get_initial_mplock();
#endif
	spin_lock_init(&mcount_spinlock);
	spin_lock_init(&fast_intr_spinlock);
	spin_lock_init(&intr_spinlock);
	spin_lock_init(&mpintr_spinlock);
	spin_lock_init(&imen_spinlock);
	spin_lock_init(&smp_rv_spinlock);
	spin_lock_init(&com_spinlock);
	spin_lock_init(&clock_spinlock);
	spin_lock_init(&cons_spinlock);

	/* our token pool needs to work early */
	lwkt_token_pool_init();
}