2 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
3 * Copyright (c) 1992 Terrence R. Lambert.
4 * Copyright (c) 2003 Peter Wemm.
5 * Copyright (c) 2008 The DragonFly Project.
8 * This code is derived from software contributed to Berkeley by
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the University of
22 * California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
40 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
41 * $DragonFly: src/sys/platform/pc64/amd64/machdep.c,v 1.1 2008/08/29 17:07:10 dillon Exp $
44 #include "use_ether.h"
45 //#include "use_npx.h"
47 #include "opt_atalk.h"
48 #include "opt_compat.h"
51 #include "opt_directio.h"
54 #include "opt_msgbuf.h"
57 #include <sys/param.h>
58 #include <sys/systm.h>
59 #include <sys/sysproto.h>
60 #include <sys/signalvar.h>
61 #include <sys/kernel.h>
62 #include <sys/linker.h>
63 #include <sys/malloc.h>
67 #include <sys/reboot.h>
69 #include <sys/msgbuf.h>
70 #include <sys/sysent.h>
71 #include <sys/sysctl.h>
72 #include <sys/vmmeter.h>
74 #include <sys/upcall.h>
75 #include <sys/usched.h>
79 #include <vm/vm_param.h>
81 #include <vm/vm_kern.h>
82 #include <vm/vm_object.h>
83 #include <vm/vm_page.h>
84 #include <vm/vm_map.h>
85 #include <vm/vm_pager.h>
86 #include <vm/vm_extern.h>
88 #include <sys/thread2.h>
96 #include <machine/cpu.h>
97 #include <machine/clock.h>
98 #include <machine/specialreg.h>
100 #include <machine/bootinfo.h>
102 #include <machine/md_var.h>
103 #include <machine/metadata.h>
104 #include <machine/pc/bios.h>
105 #include <machine/pcb_ext.h> /* pcb.h included via sys/user.h */
106 #include <machine/globaldata.h> /* CPU_prvspace */
107 #include <machine/smp.h>
109 #include <machine/perfmon.h>
111 #include <machine/cputypes.h>
114 #include <bus/isa/isa_device.h>
116 #include <machine_base/isa/intr_machdep.h>
117 #include <bus/isa/rtc.h>
118 #include <sys/random.h>
119 #include <sys/ptrace.h>
120 #include <machine/sigframe.h>
122 #define PHYSMAP_ENTRIES 10
124 extern void init386(int first
);
125 extern void dblfault_handler(void);
126 extern u_int64_t
hammer_time(u_int64_t
, u_int64_t
);
128 extern void printcpuinfo(void); /* XXX header file */
129 extern void identify_cpu(void);
131 extern void finishidentcpu(void);
133 extern void panicifcpuunsupported(void);
135 static void cpu_startup(void *);
136 #ifndef CPU_DISABLE_SSE
137 static void set_fpregs_xmm(struct save87
*, struct savexmm
*);
138 static void fill_fpregs_xmm(struct savexmm
*, struct save87
*);
139 #endif /* CPU_DISABLE_SSE */
141 extern void ffs_rawread_setup(void);
142 #endif /* DIRECTIO */
143 static void init_locks(void);
145 SYSINIT(cpu
, SI_BOOT2_SMP
, SI_ORDER_FIRST
, cpu_startup
, NULL
)
148 extern vm_offset_t ksym_start
, ksym_end
;
156 struct privatespace CPU_prvspace
[MAXCPU
];
158 int _udatasel
, _ucodesel
, _ucode32sel
;
161 int64_t tsc_offsets
[MAXCPU
];
163 int64_t tsc_offsets
[1];
166 #if defined(SWTCH_OPTIM_STATS)
167 extern int swtch_optim_stats
;
168 SYSCTL_INT(_debug
, OID_AUTO
, swtch_optim_stats
,
169 CTLFLAG_RD
, &swtch_optim_stats
, 0, "");
170 SYSCTL_INT(_debug
, OID_AUTO
, tlb_flush_count
,
171 CTLFLAG_RD
, &tlb_flush_count
, 0, "");
/*
 * Sysctl handler registered just below under the _hw parent as
 * "physmem" (HW_PHYSMEM, CTLTYPE_INT | CTLFLAG_RD, format "IU").
 *
 * It hands ctob(physmem) to sysctl_handle_int() as the value argument
 * (arg1 is 0) and keeps the result in "error"; what is done with
 * "error" afterwards is not visible in this view.
 *
 * NOTE(review): the value is funnelled through an int-typed handler;
 * presumably it truncates once ctob(physmem) no longer fits in an int --
 * confirm against the sysctl_handle_int() contract.
 */
177 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS
)
179 int error
= sysctl_handle_int(oidp
, 0, ctob(physmem
), req
);
183 SYSCTL_PROC(_hw
, HW_PHYSMEM
, physmem
, CTLTYPE_INT
|CTLFLAG_RD
,
184 0, 0, sysctl_hw_physmem
, "IU", "");
/*
 * Sysctl handler registered just below under the _hw parent as
 * "usermem" (HW_USERMEM, CTLTYPE_INT | CTLFLAG_RD, format "IU").
 *
 * The reported value is ctob(physmem - vmstats.v_wire_count), i.e. the
 * physmem page count minus the wired page count, converted with ctob()
 * and passed to sysctl_handle_int() as the value argument (arg1 is 0).
 *
 * NOTE(review): as with hw.physmem the value goes through an int-typed
 * handler; presumably large values truncate -- confirm.
 */
187 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS
)
189 int error
= sysctl_handle_int(oidp
, 0,
190 ctob(physmem
- vmstats
.v_wire_count
), req
);
194 SYSCTL_PROC(_hw
, HW_USERMEM
, usermem
, CTLTYPE_INT
|CTLFLAG_RD
,
195 0, 0, sysctl_hw_usermem
, "IU", "");
/*
 * Sysctl handler registered just below under the _hw parent as
 * "availpages" (OID_AUTO, CTLTYPE_INT | CTLFLAG_RD, format "I").
 *
 * The reported value is amd64_btop(avail_end - avail_start): the span
 * between avail_start and avail_end converted with amd64_btop() and
 * passed to sysctl_handle_int() as the value argument (arg1 is 0).
 */
198 sysctl_hw_availpages(SYSCTL_HANDLER_ARGS
)
200 int error
= sysctl_handle_int(oidp
, 0,
201 amd64_btop(avail_end
- avail_start
), req
);
205 SYSCTL_PROC(_hw
, OID_AUTO
, availpages
, CTLTYPE_INT
|CTLFLAG_RD
,
206 0, 0, sysctl_hw_availpages
, "I", "");
/*
 * Sysctl handler registered just below under the _machdep parent as
 * "msgbuf" (OID_AUTO, CTLTYPE_STRING | CTLFLAG_RD, format "A",
 * description "Contents of kernel message buffer").
 *
 * The buffer described by msgbufp is exported in two pieces through
 * sysctl_handle_opaque():
 *   1. msg_ptr + msg_bufr, for (msg_size - msg_bufr) bytes;
 *   2. only when msg_bufr > 0: msg_ptr, for msg_bufr bytes.
 * A non-zero result from the first call is returned immediately.
 * How the result of the second call is propagated is not visible in
 * this view.
 */
209 sysctl_machdep_msgbuf(SYSCTL_HANDLER_ARGS
)
213 /* Unwind the buffer, so that it's linear (possibly starting with
214 * some initial nulls).
216 error
=sysctl_handle_opaque(oidp
,msgbufp
->msg_ptr
+msgbufp
->msg_bufr
,
217 msgbufp
->msg_size
-msgbufp
->msg_bufr
,req
);
218 if(error
) return(error
);
219 if(msgbufp
->msg_bufr
>0) {
220 error
=sysctl_handle_opaque(oidp
,msgbufp
->msg_ptr
,
221 msgbufp
->msg_bufr
,req
);
226 SYSCTL_PROC(_machdep
, OID_AUTO
, msgbuf
, CTLTYPE_STRING
|CTLFLAG_RD
,
227 0, 0, sysctl_machdep_msgbuf
, "A","Contents of kernel message buffer");
/*
 * msgbuf_clear is the backing integer for the sysctl registered just
 * below under the _machdep parent as "msgbuf_clear" (OID_AUTO,
 * CTLTYPE_INT | CTLFLAG_RW, format "I", description "Clear kernel
 * message buffer"); its address is passed as the OID's arg1.
 *
 * The handler forwards oidp->oid_arg1 / oidp->oid_arg2 to
 * sysctl_handle_int().  When that reports no error and req->newptr is
 * set (a new value was supplied), it zeroes msg_size bytes at
 * msgbufp->msg_ptr and resets both msg_bufr and msg_bufx to 0.
 * The value written is not inspected in the lines visible here.
 */
229 static int msgbuf_clear
;
232 sysctl_machdep_msgbuf_clear(SYSCTL_HANDLER_ARGS
)
235 error
= sysctl_handle_int(oidp
, oidp
->oid_arg1
, oidp
->oid_arg2
,
237 if (!error
&& req
->newptr
) {
238 /* Clear the buffer and reset write pointer */
239 bzero(msgbufp
->msg_ptr
,msgbufp
->msg_size
);
240 msgbufp
->msg_bufr
=msgbufp
->msg_bufx
=0;
246 SYSCTL_PROC(_machdep
, OID_AUTO
, msgbuf_clear
, CTLTYPE_INT
|CTLFLAG_RW
,
247 &msgbuf_clear
, 0, sysctl_machdep_msgbuf_clear
, "I",
248 "Clear kernel message buffer");
250 vm_paddr_t Maxmem
= 0;
253 * The number of PHYSMAP entries must be one less than the number of
254 * PHYSSEG entries because the PHYSMAP entry that spans the largest
255 * physical address that is accessible by ISA DMA is split into two
258 #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
260 vm_paddr_t phys_avail
[PHYSMAP_SIZE
+ 2];
261 vm_paddr_t dump_avail
[PHYSMAP_SIZE
+ 2];
263 /* must be 2 less so 0 0 can signal end of chunks */
264 #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
265 #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
267 static vm_offset_t buffer_sva
, buffer_eva
;
268 vm_offset_t clean_sva
, clean_eva
;
269 static vm_offset_t pager_sva
, pager_eva
;
270 static struct trapframe proc0_tf
;
273 cpu_startup(void *dummy
)
277 vm_offset_t firstaddr
;
279 if (boothowto
& RB_VERBOSE
)
283 * Good {morning,afternoon,evening,night}.
285 kprintf("%s", version
);
288 panicifcpuunsupported();
292 kprintf("real memory = %ju (%juK bytes)\n",
293 (intmax_t)ptoa(Maxmem
),
294 (intmax_t)ptoa(Maxmem
) / 1024);
296 * Display any holes after the first chunk of extended memory.
301 kprintf("Physical memory chunk(s):\n");
302 for (indx
= 0; phys_avail
[indx
+ 1] != 0; indx
+= 2) {
303 vm_paddr_t size1
= phys_avail
[indx
+ 1] - phys_avail
[indx
];
305 kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
306 (intmax_t)phys_avail
[indx
],
307 (intmax_t)phys_avail
[indx
+ 1] - 1,
309 (intmax_t)(size1
/ PAGE_SIZE
));
314 * Allocate space for system data structures.
315 * The first available kernel virtual address is in "v".
316 * As pages of kernel virtual memory are allocated, "v" is incremented.
317 * As pages of memory are allocated and cleared,
318 * "firstaddr" is incremented.
319 * An index into the kernel page table corresponding to the
320 * virtual memory address maintained in "v" is kept in "mapaddr".
324 * Make two passes. The first pass calculates how much memory is
325 * needed and allocates it. The second pass assigns virtual
326 * addresses to the various data structures.
330 v
= (caddr_t
)firstaddr
;
332 #define valloc(name, type, num) \
333 (name) = (type *)v; v = (caddr_t)((name)+(num))
334 #define valloclim(name, type, num, lim) \
335 (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
338 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
339 * For the first 64MB of ram nominally allocate sufficient buffers to
340 * cover 1/4 of our ram. Beyond the first 64MB allocate additional
341 * buffers to cover 1/20 of our ram over 64MB. When auto-sizing
342 * the buffer cache we limit the eventual kva reservation to
345 * factor represents the 1/4 x ram conversion.
348 int factor
= 4 * BKVASIZE
/ 1024;
349 int kbytes
= physmem
* (PAGE_SIZE
/ 1024);
353 nbuf
+= min((kbytes
- 4096) / factor
, 65536 / factor
);
355 nbuf
+= (kbytes
- 65536) * 2 / (factor
* 5);
356 if (maxbcache
&& nbuf
> maxbcache
/ BKVASIZE
)
357 nbuf
= maxbcache
/ BKVASIZE
;
361 * Do not allow the buffer_map to be more than 1/2 the size of the
364 if (nbuf
> (virtual_end
- virtual_start
) / (BKVASIZE
* 2)) {
365 nbuf
= (virtual_end
- virtual_start
) / (BKVASIZE
* 2);
366 kprintf("Warning: nbufs capped at %d\n", nbuf
);
369 nswbuf
= max(min(nbuf
/4, 256), 16);
371 if (nswbuf
< NSWBUF_MIN
)
378 valloc(swbuf
, struct buf
, nswbuf
);
379 valloc(buf
, struct buf
, nbuf
);
382 * End of first pass, size has been calculated so allocate memory
384 if (firstaddr
== 0) {
385 size
= (vm_size_t
)(v
- firstaddr
);
386 firstaddr
= kmem_alloc(&kernel_map
, round_page(size
));
388 panic("startup: no room for tables");
393 * End of second pass, addresses have been assigned
395 if ((vm_size_t
)(v
- firstaddr
) != size
)
396 panic("startup: table size inconsistency");
398 kmem_suballoc(&kernel_map
, &clean_map
, &clean_sva
, &clean_eva
,
399 (nbuf
*BKVASIZE
) + (nswbuf
*MAXPHYS
) + pager_map_size
);
400 kmem_suballoc(&clean_map
, &buffer_map
, &buffer_sva
, &buffer_eva
,
402 buffer_map
.system_map
= 1;
403 kmem_suballoc(&clean_map
, &pager_map
, &pager_sva
, &pager_eva
,
404 (nswbuf
*MAXPHYS
) + pager_map_size
);
405 pager_map
.system_map
= 1;
407 #if defined(USERCONFIG)
409 cninit(); /* the preferred console may have changed */
412 kprintf("avail memory = %lu (%luK bytes)\n",
413 ptoa(vmstats
.v_free_count
),
414 ptoa(vmstats
.v_free_count
) / 1024);
417 * Set up buffers, so they can be used to read disk labels.
420 vm_pager_bufferinit();
424 * OK, enough kmem_alloc/malloc state should be up, lets get on with it!
426 mp_start(); /* fire up the APs and APICs */
433 * Send an interrupt to process.
435 * Stack is set up to allow sigcode stored
436 * at top to call routine, followed by kcall
437 * to sigreturn routine below. After sigreturn
438 * resets the signal mask, the stack, and the
439 * frame pointer, it returns to the user
443 sendsig(sig_t catcher
, int sig
, sigset_t
*mask
, u_long code
)
445 struct lwp
*lp
= curthread
->td_lwp
;
446 struct proc
*p
= lp
->lwp_proc
;
447 struct trapframe
*regs
;
448 struct sigacts
*psp
= p
->p_sigacts
;
449 struct sigframe sf
, *sfp
;
453 regs
= lp
->lwp_md
.md_regs
;
454 oonstack
= (lp
->lwp_sigstk
.ss_flags
& SS_ONSTACK
) ? 1 : 0;
456 /* Save user context */
457 bzero(&sf
, sizeof(struct sigframe
));
458 sf
.sf_uc
.uc_sigmask
= *mask
;
459 sf
.sf_uc
.uc_stack
= lp
->lwp_sigstk
;
460 sf
.sf_uc
.uc_mcontext
.mc_onstack
= oonstack
;
461 KKASSERT(__offsetof(struct trapframe
, tf_rdi
) == 0);
462 bcopy(regs
, &sf
.sf_uc
.uc_mcontext
.mc_rdi
, sizeof(struct trapframe
));
464 /* Make the size of the saved context visible to userland */
465 sf
.sf_uc
.uc_mcontext
.mc_len
= sizeof(sf
.sf_uc
.uc_mcontext
);
467 /* Save mailbox pending state for syscall interlock semantics */
468 if (p
->p_flag
& P_MAILBOX
)
469 sf
.sf_uc
.uc_mcontext
.mc_xflags
|= PGEX_MAILBOX
;
471 /* Allocate and validate space for the signal handler context. */
472 if ((lp
->lwp_flag
& LWP_ALTSTACK
) != 0 && !oonstack
&&
473 SIGISMEMBER(psp
->ps_sigonstack
, sig
)) {
474 sp
= (char *)(lp
->lwp_sigstk
.ss_sp
+ lp
->lwp_sigstk
.ss_size
-
475 sizeof(struct sigframe
));
476 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
478 /* We take red zone into account */
479 sp
= (char *)regs
->tf_rsp
- sizeof(struct sigframe
) - 128;
482 /* Align to 16 bytes */
483 sfp
= (struct sigframe
*)((intptr_t)sp
& ~0xFUL
);
485 /* Translate the signal if appropriate */
486 if (p
->p_sysent
->sv_sigtbl
) {
487 if (sig
<= p
->p_sysent
->sv_sigsize
)
488 sig
= p
->p_sysent
->sv_sigtbl
[_SIG_IDX(sig
)];
492 * Build the argument list for the signal handler.
494 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
496 regs
->tf_rdi
= sig
; /* argument 1 */
497 regs
->tf_rdx
= (register_t
)&sfp
->sf_uc
; /* argument 3 */
499 if (SIGISMEMBER(psp
->ps_siginfo
, sig
)) {
501 * Signal handler installed with SA_SIGINFO.
503 * action(signo, siginfo, ucontext)
505 regs
->tf_rsi
= (register_t
)&sfp
->sf_si
; /* argument 2 */
506 regs
->tf_rcx
= (register_t
)regs
->tf_err
; /* argument 4 */
507 sf
.sf_ahu
.sf_action
= (__siginfohandler_t
*)catcher
;
509 /* fill siginfo structure */
510 sf
.sf_si
.si_signo
= sig
;
511 sf
.sf_si
.si_code
= code
;
512 sf
.sf_si
.si_addr
= (void *)regs
->tf_err
;
515 * Old FreeBSD-style arguments.
517 * handler (signo, code, [uc], addr)
519 regs
->tf_rsi
= (register_t
)code
; /* argument 2 */
520 regs
->tf_rcx
= (register_t
)regs
->tf_err
; /* argument 4 */
521 sf
.sf_ahu
.sf_handler
= catcher
;
525 * If we're a vm86 process, we want to save the segment registers.
526 * We also change eflags to be our emulated eflags, not the actual
530 if (regs
->tf_eflags
& PSL_VM
) {
531 struct trapframe_vm86
*tf
= (struct trapframe_vm86
*)regs
;
532 struct vm86_kernel
*vm86
= &lp
->lwp_thread
->td_pcb
->pcb_ext
->ext_vm86
;
534 sf
.sf_uc
.uc_mcontext
.mc_gs
= tf
->tf_vm86_gs
;
535 sf
.sf_uc
.uc_mcontext
.mc_fs
= tf
->tf_vm86_fs
;
536 sf
.sf_uc
.uc_mcontext
.mc_es
= tf
->tf_vm86_es
;
537 sf
.sf_uc
.uc_mcontext
.mc_ds
= tf
->tf_vm86_ds
;
539 if (vm86
->vm86_has_vme
== 0)
540 sf
.sf_uc
.uc_mcontext
.mc_eflags
=
541 (tf
->tf_eflags
& ~(PSL_VIF
| PSL_VIP
)) |
542 (vm86
->vm86_eflags
& (PSL_VIF
| PSL_VIP
));
545 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
546 * syscalls made by the signal handler. This just avoids
547 * wasting time for our lazy fixup of such faults. PSL_NT
548 * does nothing in vm86 mode, but vm86 programs can set it
549 * almost legitimately in probes for old cpu types.
551 tf
->tf_eflags
&= ~(PSL_VM
| PSL_NT
| PSL_VIF
| PSL_VIP
);
556 * Save the FPU state and reinit the FP unit
558 npxpush(&sf
.sf_uc
.uc_mcontext
);
561 * Copy the sigframe out to the user's stack.
563 if (copyout(&sf
, sfp
, sizeof(struct sigframe
)) != 0) {
565 * Something is wrong with the stack pointer.
566 * ...Kill the process.
571 regs
->tf_rsp
= (register_t
)sfp
;
572 regs
->tf_rip
= PS_STRINGS
- *(p
->p_sysent
->sv_szsigcode
);
575 * i386 abi specifies that the direction flag must be cleared
578 regs
->tf_rflags
&= ~(PSL_T
|PSL_D
);
581 * 64 bit mode has a code and stack selector but
582 * no data or extra selector. %fs and %gs are not
585 regs
->tf_cs
= _ucodesel
;
586 regs
->tf_ss
= _udatasel
;
590 * Sanitize the trapframe for a virtual kernel passing control to a custom
591 * VM context. Remove any items that would otherwise create a privilege
594 * XXX at the moment we allow userland to set the resume flag. Is this a
/*
 * cpu_sanitize_frame(frame): rewrite the caller-supplied trapframe in
 * place:
 *   - tf_cs is forced to _ucodesel and tf_ss to _udatasel;
 *   - tf_rflags is masked down to the bits in
 *     (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
 *   - PSL_RESERVED_DEFAULT and PSL_I are then OR-ed in, so PSL_I is
 *     always set in the resulting flags.
 * The return type and return statement are not visible in this view.
 */
598 cpu_sanitize_frame(struct trapframe
*frame
)
600 frame
->tf_cs
= _ucodesel
;
601 frame
->tf_ss
= _udatasel
;
602 /* XXX VM (8086) mode not supported? */
603 frame
->tf_rflags
&= (PSL_RF
| PSL_USERCHANGE
| PSL_VM_UNSUPP
);
604 frame
->tf_rflags
|= PSL_RESERVED_DEFAULT
| PSL_I
;
610 * Sanitize the tls so loading the descriptor does not blow up
611 * on us. For AMD64 we don't have to do anything.
614 cpu_sanitize_tls(struct savetls
*tls
)
620 * sigreturn(ucontext_t *sigcntxp)
622 * System call to cleanup state after a signal
623 * has been taken. Reset signal mask and
624 * stack state from context left by sendsig (above).
625 * Return to previous pc and psl as specified by
626 * context left by sendsig. Check carefully to
627 * make sure that the user has not modified the
628 * state to gain improper privileges.
630 #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
631 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
634 sys_sigreturn(struct sigreturn_args
*uap
)
636 struct lwp
*lp
= curthread
->td_lwp
;
637 struct proc
*p
= lp
->lwp_proc
;
638 struct trapframe
*regs
;
646 * We have to copy the information into kernel space so userland
647 * can't modify it while we are sniffing it.
649 regs
= lp
->lwp_md
.md_regs
;
650 error
= copyin(uap
->sigcntxp
, &uc
, sizeof(uc
));
654 rflags
= ucp
->uc_mcontext
.mc_rflags
;
656 /* VM (8086) mode not supported */
657 rflags
&= ~PSL_VM_UNSUPP
;
660 if (eflags
& PSL_VM
) {
661 struct trapframe_vm86
*tf
= (struct trapframe_vm86
*)regs
;
662 struct vm86_kernel
*vm86
;
665 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
666 * set up the vm86 area, and we can't enter vm86 mode.
668 if (lp
->lwp_thread
->td_pcb
->pcb_ext
== 0)
670 vm86
= &lp
->lwp_thread
->td_pcb
->pcb_ext
->ext_vm86
;
671 if (vm86
->vm86_inited
== 0)
674 /* go back to user mode if both flags are set */
675 if ((eflags
& PSL_VIP
) && (eflags
& PSL_VIF
))
676 trapsignal(lp
, SIGBUS
, 0);
678 if (vm86
->vm86_has_vme
) {
679 eflags
= (tf
->tf_eflags
& ~VME_USERCHANGE
) |
680 (eflags
& VME_USERCHANGE
) | PSL_VM
;
682 vm86
->vm86_eflags
= eflags
; /* save VIF, VIP */
683 eflags
= (tf
->tf_eflags
& ~VM_USERCHANGE
) |
684 (eflags
& VM_USERCHANGE
) | PSL_VM
;
686 bcopy(&ucp
->uc_mcontext
.mc_gs
, tf
, sizeof(struct trapframe
));
687 tf
->tf_eflags
= eflags
;
688 tf
->tf_vm86_ds
= tf
->tf_ds
;
689 tf
->tf_vm86_es
= tf
->tf_es
;
690 tf
->tf_vm86_fs
= tf
->tf_fs
;
691 tf
->tf_vm86_gs
= tf
->tf_gs
;
692 tf
->tf_ds
= _udatasel
;
693 tf
->tf_es
= _udatasel
;
694 tf
->tf_fs
= _udatasel
;
695 tf
->tf_gs
= _udatasel
;
700 * Don't allow users to change privileged or reserved flags.
703 * XXX do allow users to change the privileged flag PSL_RF.
704 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers
705 * should sometimes set it there too. tf_eflags is kept in
706 * the signal context during signal handling and there is no
707 * other place to remember it, so the PSL_RF bit may be
708 * corrupted by the signal handler without us knowing.
709 * Corruption of the PSL_RF bit at worst causes one more or
710 * one less debugger trap, so allowing it is fairly harmless.
712 if (!EFL_SECURE(rflags
& ~PSL_RF
, regs
->tf_rflags
& ~PSL_RF
)) {
713 kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags
);
718 * Don't allow users to load a valid privileged %cs. Let the
719 * hardware check for invalid selectors, excess privilege in
720 * other selectors, invalid %eip's and invalid %esp's.
722 cs
= ucp
->uc_mcontext
.mc_cs
;
723 if (!CS_SECURE(cs
)) {
724 kprintf("sigreturn: cs = 0x%x\n", cs
);
725 trapsignal(lp
, SIGBUS
, T_PROTFLT
);
728 bcopy(&ucp
->uc_mcontext
.mc_rdi
, regs
, sizeof(struct trapframe
));
732 * Restore the FPU state from the frame
734 npxpop(&ucp
->uc_mcontext
);
737 * Merge saved signal mailbox pending flag to maintain interlock
738 * semantics against system calls.
740 if (ucp
->uc_mcontext
.mc_xflags
& PGEX_MAILBOX
)
741 p
->p_flag
|= P_MAILBOX
;
743 if (ucp
->uc_mcontext
.mc_onstack
& 1)
744 lp
->lwp_sigstk
.ss_flags
|= SS_ONSTACK
;
746 lp
->lwp_sigstk
.ss_flags
&= ~SS_ONSTACK
;
748 lp
->lwp_sigmask
= ucp
->uc_sigmask
;
749 SIG_CANTMASK(lp
->lwp_sigmask
);
754 * Stack frame on entry to function. %rax will contain the function vector,
755 * %rcx will contain the function data. flags, rcx, and rax will have
756 * already been pushed on the stack.
767 sendupcall(struct vmupcall
*vu
, int morepending
)
769 struct lwp
*lp
= curthread
->td_lwp
;
770 struct trapframe
*regs
;
771 struct upcall upcall
;
772 struct upc_frame upc_frame
;
776 * If we are a virtual kernel running an emulated user process
777 * context, switch back to the virtual kernel context before
778 * trying to post the signal.
780 if (lp
->lwp_vkernel
&& lp
->lwp_vkernel
->ve
) {
781 lp
->lwp_md
.md_regs
->tf_trapno
= 0;
782 vkernel_trap(lp
, lp
->lwp_md
.md_regs
);
786 * Get the upcall data structure
788 if (copyin(lp
->lwp_upcall
, &upcall
, sizeof(upcall
)) ||
789 copyin((char *)upcall
.upc_uthread
+ upcall
.upc_critoff
, &crit_count
, sizeof(int))
792 kprintf("bad upcall address\n");
797 * If the data structure is already marked pending or has a critical
798 * section count, mark the data structure as pending and return
799 * without doing an upcall. vu_pending is left set.
801 if (upcall
.upc_pending
|| crit_count
>= vu
->vu_pending
) {
802 if (upcall
.upc_pending
< vu
->vu_pending
) {
803 upcall
.upc_pending
= vu
->vu_pending
;
804 copyout(&upcall
.upc_pending
, &lp
->lwp_upcall
->upc_pending
,
805 sizeof(upcall
.upc_pending
));
811 * We can run this upcall now, clear vu_pending.
813 * Bump our critical section count and set or clear the
814 * user pending flag depending on whether more upcalls are
815 * pending. The user will be responsible for calling
816 * upc_dispatch(-1) to process remaining upcalls.
819 upcall
.upc_pending
= morepending
;
820 crit_count
+= TDPRI_CRIT
;
821 copyout(&upcall
.upc_pending
, &lp
->lwp_upcall
->upc_pending
,
822 sizeof(upcall
.upc_pending
));
823 copyout(&crit_count
, (char *)upcall
.upc_uthread
+ upcall
.upc_critoff
,
827 * Construct a stack frame and issue the upcall
829 regs
= lp
->lwp_md
.md_regs
;
830 upc_frame
.rax
= regs
->tf_rax
;
831 upc_frame
.rcx
= regs
->tf_rcx
;
832 upc_frame
.rdx
= regs
->tf_rdx
;
833 upc_frame
.flags
= regs
->tf_rflags
;
834 upc_frame
.oldip
= regs
->tf_rip
;
835 if (copyout(&upc_frame
, (void *)(regs
->tf_rsp
- sizeof(upc_frame
)),
836 sizeof(upc_frame
)) != 0) {
837 kprintf("bad stack on upcall\n");
839 regs
->tf_rax
= (register_t
)vu
->vu_func
;
840 regs
->tf_rcx
= (register_t
)vu
->vu_data
;
841 regs
->tf_rdx
= (register_t
)lp
->lwp_upcall
;
842 regs
->tf_rip
= (register_t
)vu
->vu_ctx
;
843 regs
->tf_rsp
-= sizeof(upc_frame
);
848 * fetchupcall occurs in the context of a system call, which means that
849 * we have to return EJUSTRETURN in order to prevent eax and edx from
850 * being overwritten by the syscall return value.
852 * if vu is not NULL we return the new context in %edx, the new data in %ecx,
853 * and the function pointer in %eax.
856 fetchupcall(struct vmupcall
*vu
, int morepending
, void *rsp
)
858 struct upc_frame upc_frame
;
859 struct lwp
*lp
= curthread
->td_lwp
;
860 struct trapframe
*regs
;
862 struct upcall upcall
;
865 regs
= lp
->lwp_md
.md_regs
;
867 error
= copyout(&morepending
, &lp
->lwp_upcall
->upc_pending
, sizeof(int));
871 * This jumps us to the next ready context.
874 error
= copyin(lp
->lwp_upcall
, &upcall
, sizeof(upcall
));
877 error
= copyin((char *)upcall
.upc_uthread
+ upcall
.upc_critoff
, &crit_count
, sizeof(int));
878 crit_count
+= TDPRI_CRIT
;
880 error
= copyout(&crit_count
, (char *)upcall
.upc_uthread
+ upcall
.upc_critoff
, sizeof(int));
881 regs
->tf_rax
= (register_t
)vu
->vu_func
;
882 regs
->tf_rcx
= (register_t
)vu
->vu_data
;
883 regs
->tf_rdx
= (register_t
)lp
->lwp_upcall
;
884 regs
->tf_rip
= (register_t
)vu
->vu_ctx
;
885 regs
->tf_rsp
= (register_t
)rsp
;
888 * This returns us to the originally interrupted code.
890 error
= copyin(rsp
, &upc_frame
, sizeof(upc_frame
));
891 regs
->tf_rax
= upc_frame
.rax
;
892 regs
->tf_rcx
= upc_frame
.rcx
;
893 regs
->tf_rdx
= upc_frame
.rdx
;
894 regs
->tf_rflags
= (regs
->tf_rflags
& ~PSL_USERCHANGE
) |
895 (upc_frame
.flags
& PSL_USERCHANGE
);
896 regs
->tf_rip
= upc_frame
.oldip
;
897 regs
->tf_rsp
= (register_t
)((char *)rsp
+ sizeof(upc_frame
));
906 * Machine dependent boot() routine
908 * I haven't seen anything to put here yet
909 * Possibly some stuff might be grafted back here from boot()
917 * Shutdown the CPU as much as possible
923 __asm__
__volatile("hlt");
927 * cpu_idle() represents the idle LWKT. You cannot return from this function
928 * (unless you want to blow things up!). Instead we look for runnable threads
929 * and loop or halt as appropriate. Giant is not held on entry to the thread.
931 * The main loop is entered with a critical section held, we must release
932 * the critical section before doing anything else. lwkt_switch() will
933 * check for pending interrupts due to entering and exiting its own
936 * Note on cpu_idle_hlt: On an SMP system we rely on a scheduler IPI
937 * to wake a HLTed cpu up. However, there are cases where the idlethread
938 * will be entered with the possibility that no IPI will occur and in such
939 * cases lwkt_switch() sets TDF_IDLE_NOHLT.
941 static int cpu_idle_hlt
= 1;
942 static int cpu_idle_hltcnt
;
943 static int cpu_idle_spincnt
;
944 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_hlt
, CTLFLAG_RW
,
945 &cpu_idle_hlt
, 0, "Idle loop HLT enable");
946 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_hltcnt
, CTLFLAG_RW
,
947 &cpu_idle_hltcnt
, 0, "Idle loop entry halts");
948 SYSCTL_INT(_machdep
, OID_AUTO
, cpu_idle_spincnt
, CTLFLAG_RW
,
949 &cpu_idle_spincnt
, 0, "Idle loop entry spins");
952 cpu_idle_default_hook(void)
955 * We must guarantee that hlt is exactly the instruction
958 __asm
__volatile("sti; hlt");
961 /* Other subsystems (e.g., ACPI) can hook this later. */
962 void (*cpu_idle_hook
)(void) = cpu_idle_default_hook
;
967 struct thread
*td
= curthread
;
970 KKASSERT(td
->td_pri
< TDPRI_CRIT
);
973 * See if there are any LWKTs ready to go.
978 * If we are going to halt call splz unconditionally after
979 * CLIing to catch any interrupt races. Note that we are
980 * at SPL0 and interrupts are enabled.
982 if (cpu_idle_hlt
&& !lwkt_runnable() &&
983 (td
->td_flags
& TDF_IDLE_NOHLT
) == 0) {
984 __asm
__volatile("cli");
986 if (!lwkt_runnable())
990 __asm
__volatile("pause");
994 td
->td_flags
&= ~TDF_IDLE_NOHLT
;
997 __asm
__volatile("sti; pause");
999 __asm
__volatile("sti");
1007 * This routine is called when the only runnable threads require
1008 * the MP lock, and the scheduler couldn't get it. On a real cpu
1009 * we let the scheduler spin.
1012 cpu_mplock_contested(void)
1018 * This routine is called if a spinlock has been held through the
1019 * exponential backoff period and is seriously contested. On a real cpu
1023 cpu_spinlock_contested(void)
1029 * Clear registers on exec
/*
 * exec_setregs(entry, stack, ps_strings): reset the current thread's
 * user register state, operating on curthread, its lwp, its pcb and
 * the trapframe at lp->lwp_md.md_regs.
 *
 * Steps visible in this view, in order:
 *   - zero the whole trapframe;
 *   - tf_rip = entry;
 *   - tf_rsp = ((stack - 8) & ~0xF) + 8, i.e. the resulting stack
 *     pointer is 8 modulo 16;
 *   - tf_rdi = stack, tf_rbx = ps_strings;
 *   - tf_rflags = PSL_USER | (tf_rflags & PSL_T);
 *   - tf_ss = _udatasel, tf_cs = _ucodesel;
 *   - if PCB_DBREGS is set in pcb_flags: clear pcb_dr7 and then the
 *     PCB_DBREGS flag;
 *   - clear FP_SOFTFP in pcb_flags;
 *   - set CR0_MP in %cr0 via load_cr0(rcr0() | CR0_MP);
 *   - zero pcb_fsbase/pcb_gsbase, the mdcpu gd_user_fs/gd_user_gs
 *     cache, and the MSR_FSBASE / MSR_KGSBASE MSRs;
 *   - npxinit(__INITIAL_NPXCW__);
 *   - set pcb_ds/pcb_es/pcb_fs/pcb_gs to _udatasel.
 *
 * NOTE(review): as visible here the trapframe is zeroed before
 * tf_rflags is read, so the (tf_rflags & PSL_T) term appears to always
 * be 0 -- confirm no statement between the bzero() and the assignment
 * was lost from this view.
 */
1032 exec_setregs(u_long entry
, u_long stack
, u_long ps_strings
)
1034 struct thread
*td
= curthread
;
1035 struct lwp
*lp
= td
->td_lwp
;
1036 struct pcb
*pcb
= td
->td_pcb
;
1037 struct trapframe
*regs
= lp
->lwp_md
.md_regs
;
1039 /* was i386_user_cleanup() in NetBSD */
1042 bzero((char *)regs
, sizeof(struct trapframe
));
1043 regs
->tf_rip
= entry
;
1044 regs
->tf_rsp
= ((stack
- 8) & ~0xFul
) + 8; /* align the stack */
1045 regs
->tf_rdi
= stack
; /* argv */
1046 regs
->tf_rflags
= PSL_USER
| (regs
->tf_rflags
& PSL_T
);
1047 regs
->tf_ss
= _udatasel
;
1048 regs
->tf_cs
= _ucodesel
;
1049 regs
->tf_rbx
= ps_strings
;
1052 * Reset the hardware debug registers if they were in use.
1053 * They won't have any meaning for the newly exec'd process.
1055 if (pcb
->pcb_flags
& PCB_DBREGS
) {
1061 pcb
->pcb_dr7
= 0; /* JG set bit 10? */
1062 if (pcb
== td
->td_pcb
) {
1064 * Clear the debug registers on the running
1065 * CPU, otherwise they will end up affecting
1066 * the next process we switch to.
1070 pcb
->pcb_flags
&= ~PCB_DBREGS
;
1074 * Initialize the math emulator (if any) for the current process.
1075 * Actually, just clear the bit that says that the emulator has
1076 * been initialized. Initialization is delayed until the process
1077 * traps to the emulator (if it is done at all) mainly because
1078 * emulators don't provide an entry point for initialization.
1080 pcb
->pcb_flags
&= ~FP_SOFTFP
;
1083 * NOTE: do not set CR0_TS here. npxinit() must do it after clearing
1084 * gd_npxthread. Otherwise a preemptive interrupt thread
1085 * may panic in npxdna().
1088 load_cr0(rcr0() | CR0_MP
);
1091 * NOTE: The MSR values must be correct so we can return to
1092 * userland. gd_user_fs/gs must be correct so the switch
1093 * code knows what the current MSR values are.
1095 pcb
->pcb_fsbase
= 0; /* Values loaded from PCB on switch */
1096 pcb
->pcb_gsbase
= 0;
1097 mdcpu
->gd_user_fs
= 0; /* Cache of current MSR values */
1098 mdcpu
->gd_user_gs
= 0;
1099 wrmsr(MSR_FSBASE
, 0); /* Set MSR values for return to userland */
1100 wrmsr(MSR_KGSBASE
, 0);
1102 /* Initialize the npx (if any) for the current process. */
1103 npxinit(__INITIAL_NPXCW__
);
1106 pcb
->pcb_ds
= _udatasel
;
1107 pcb
->pcb_es
= _udatasel
;
1108 pcb
->pcb_fs
= _udatasel
;
1109 pcb
->pcb_gs
= _udatasel
;
1118 cr0
|= CR0_NE
; /* Done by npxinit() */
1119 cr0
|= CR0_MP
| CR0_TS
; /* Done at every execve() too. */
1120 cr0
|= CR0_WP
| CR0_AM
;
/*
 * Sysctl handler registered just below under the _machdep parent as
 * "adjkerntz" (CPU_ADJKERNTZ, CTLTYPE_INT | CTLFLAG_RW, format "I"),
 * with &adjkerntz passed as the OID's arg1.
 *
 * The handler forwards oidp->oid_arg1 / oidp->oid_arg2 to
 * sysctl_handle_int() and then tests for "no error and req->newptr
 * set" (a new value was supplied).  The action taken when that test
 * succeeds, and the return statement, are not visible in this view.
 */
1126 sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS
)
1129 error
= sysctl_handle_int(oidp
, oidp
->oid_arg1
, oidp
->oid_arg2
,
1131 if (!error
&& req
->newptr
)
1136 SYSCTL_PROC(_machdep
, CPU_ADJKERNTZ
, adjkerntz
, CTLTYPE_INT
|CTLFLAG_RW
,
1137 &adjkerntz
, 0, sysctl_machdep_adjkerntz
, "I", "");
1140 SYSCTL_INT(_machdep
, CPU_DISRTCSET
, disable_rtc_set
,
1141 CTLFLAG_RW
, &disable_rtc_set
, 0, "");
1145 SYSCTL_STRUCT(_machdep
, CPU_BOOTINFO
, bootinfo
,
1146 CTLFLAG_RD
, &bootinfo
, bootinfo
, "");
1149 SYSCTL_INT(_machdep
, CPU_WALLCLOCK
, wall_cmos_clock
,
1150 CTLFLAG_RW
, &wall_cmos_clock
, 0, "");
1152 extern u_long bootdev
; /* not a cdev_t - encoding is different */
1153 SYSCTL_ULONG(_machdep
, OID_AUTO
, guessed_bootdev
,
1154 CTLFLAG_RD
, &bootdev
, 0, "Boot device (not in cdev_t format)");
1157 * Initialize 386 and configure to run kernel
1161 * Initialize segments & interrupt table
1165 struct user_segment_descriptor gdt
[NGDT
* MAXCPU
]; /* global descriptor table */
1166 static struct gate_descriptor idt0
[NIDT
];
1167 struct gate_descriptor
*idt
= &idt0
[0]; /* interrupt descriptor table */
1169 union descriptor ldt
[NLDT
]; /* local descriptor table */
1172 /* table descriptors - used to load tables by cpu */
1173 struct region_descriptor r_gdt
, r_idt
;
1175 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
1176 extern int has_f00f_bug
;
1179 static char dblfault_stack
[PAGE_SIZE
] __aligned(16);
1181 /* JG proc0paddr is a virtual address */
1184 char proc0paddr_buff
[LWKT_THREAD_STACK
];
1187 /* software prototypes -- in more palatable form */
1188 struct soft_segment_descriptor gdt_segs
[] = {
1189 /* GNULL_SEL 0 Null Descriptor */
1190 { 0x0, /* segment base address */
1192 0, /* segment type */
1193 0, /* segment descriptor priority level */
1194 0, /* segment descriptor present */
1196 0, /* default 32 vs 16 bit size */
1197 0 /* limit granularity (byte/page units)*/ },
1198 /* GCODE_SEL 1 Code Descriptor for kernel */
1199 { 0x0, /* segment base address */
1200 0xfffff, /* length - all address space */
1201 SDT_MEMERA
, /* segment type */
1202 SEL_KPL
, /* segment descriptor priority level */
1203 1, /* segment descriptor present */
1205 0, /* default 32 vs 16 bit size */
1206 1 /* limit granularity (byte/page units)*/ },
1207 /* GDATA_SEL 2 Data Descriptor for kernel */
1208 { 0x0, /* segment base address */
1209 0xfffff, /* length - all address space */
1210 SDT_MEMRWA
, /* segment type */
1211 SEL_KPL
, /* segment descriptor priority level */
1212 1, /* segment descriptor present */
1214 0, /* default 32 vs 16 bit size */
1215 1 /* limit granularity (byte/page units)*/ },
1216 /* GUCODE32_SEL 3 32 bit Code Descriptor for user */
1217 { 0x0, /* segment base address */
1218 0xfffff, /* length - all address space */
1219 SDT_MEMERA
, /* segment type */
1220 SEL_UPL
, /* segment descriptor priority level */
1221 1, /* segment descriptor present */
1223 1, /* default 32 vs 16 bit size */
1224 1 /* limit granularity (byte/page units)*/ },
1225 /* GUDATA_SEL 4 32/64 bit Data Descriptor for user */
1226 { 0x0, /* segment base address */
1227 0xfffff, /* length - all address space */
1228 SDT_MEMRWA
, /* segment type */
1229 SEL_UPL
, /* segment descriptor priority level */
1230 1, /* segment descriptor present */
1232 1, /* default 32 vs 16 bit size */
1233 1 /* limit granularity (byte/page units)*/ },
1234 /* GUCODE_SEL 5 64 bit Code Descriptor for user */
1235 { 0x0, /* segment base address */
1236 0xfffff, /* length - all address space */
1237 SDT_MEMERA
, /* segment type */
1238 SEL_UPL
, /* segment descriptor priority level */
1239 1, /* segment descriptor present */
1241 0, /* default 32 vs 16 bit size */
1242 1 /* limit granularity (byte/page units)*/ },
1243 /* GPROC0_SEL 6 Proc 0 Tss Descriptor */
1245 0x0, /* segment base address */
1246 sizeof(struct amd64tss
)-1,/* length - all address space */
1247 SDT_SYSTSS
, /* segment type */
1248 SEL_KPL
, /* segment descriptor priority level */
1249 1, /* segment descriptor present */
1251 0, /* unused - default 32 vs 16 bit size */
1252 0 /* limit granularity (byte/page units)*/ },
1253 /* Actually, the TSS is a system descriptor which is double size */
1254 { 0x0, /* segment base address */
1256 0, /* segment type */
1257 0, /* segment descriptor priority level */
1258 0, /* segment descriptor present */
1260 0, /* default 32 vs 16 bit size */
1261 0 /* limit granularity (byte/page units)*/ },
1262 /* GUGS32_SEL 8 32 bit GS Descriptor for user */
1263 { 0x0, /* segment base address */
1264 0xfffff, /* length - all address space */
1265 SDT_MEMRWA
, /* segment type */
1266 SEL_UPL
, /* segment descriptor priority level */
1267 1, /* segment descriptor present */
1269 1, /* default 32 vs 16 bit size */
1270 1 /* limit granularity (byte/page units)*/ },
1274 setidt(int idx
, inthand_t
*func
, int typ
, int dpl
, int ist
)
1276 struct gate_descriptor
*ip
;
1279 ip
->gd_looffset
= (uintptr_t)func
;
1280 ip
->gd_selector
= GSEL(GCODE_SEL
, SEL_KPL
);
1286 ip
->gd_hioffset
= ((uintptr_t)func
)>>16 ;
1289 #define IDTVEC(name) __CONCAT(X,name)
1292 IDTVEC(div
), IDTVEC(dbg
), IDTVEC(nmi
), IDTVEC(bpt
), IDTVEC(ofl
),
1293 IDTVEC(bnd
), IDTVEC(ill
), IDTVEC(dna
), IDTVEC(fpusegm
),
1294 IDTVEC(tss
), IDTVEC(missing
), IDTVEC(stk
), IDTVEC(prot
),
1295 IDTVEC(page
), IDTVEC(mchk
), IDTVEC(rsvd
), IDTVEC(fpu
), IDTVEC(align
),
1296 IDTVEC(xmm
), IDTVEC(dblfault
),
1297 IDTVEC(fast_syscall
), IDTVEC(fast_syscall32
);
1299 #ifdef DEBUG_INTERRUPTS
1300 extern inthand_t
*Xrsvdary
[256];
1304 sdtossd(struct user_segment_descriptor
*sd
, struct soft_segment_descriptor
*ssd
)
1306 ssd
->ssd_base
= (sd
->sd_hibase
<< 24) | sd
->sd_lobase
;
1307 ssd
->ssd_limit
= (sd
->sd_hilimit
<< 16) | sd
->sd_lolimit
;
1308 ssd
->ssd_type
= sd
->sd_type
;
1309 ssd
->ssd_dpl
= sd
->sd_dpl
;
1310 ssd
->ssd_p
= sd
->sd_p
;
1311 ssd
->ssd_def32
= sd
->sd_def32
;
1312 ssd
->ssd_gran
= sd
->sd_gran
;
1316 ssdtosd(struct soft_segment_descriptor
*ssd
, struct user_segment_descriptor
*sd
)
1319 sd
->sd_lobase
= (ssd
->ssd_base
) & 0xffffff;
1320 sd
->sd_hibase
= (ssd
->ssd_base
>> 24) & 0xff;
1321 sd
->sd_lolimit
= (ssd
->ssd_limit
) & 0xffff;
1322 sd
->sd_hilimit
= (ssd
->ssd_limit
>> 16) & 0xf;
1323 sd
->sd_type
= ssd
->ssd_type
;
1324 sd
->sd_dpl
= ssd
->ssd_dpl
;
1325 sd
->sd_p
= ssd
->ssd_p
;
1326 sd
->sd_long
= ssd
->ssd_long
;
1327 sd
->sd_def32
= ssd
->ssd_def32
;
1328 sd
->sd_gran
= ssd
->ssd_gran
;
1332 ssdtosyssd(struct soft_segment_descriptor
*ssd
,
1333 struct system_segment_descriptor
*sd
)
1336 sd
->sd_lobase
= (ssd
->ssd_base
) & 0xffffff;
1337 sd
->sd_hibase
= (ssd
->ssd_base
>> 24) & 0xfffffffffful
;
1338 sd
->sd_lolimit
= (ssd
->ssd_limit
) & 0xffff;
1339 sd
->sd_hilimit
= (ssd
->ssd_limit
>> 16) & 0xf;
1340 sd
->sd_type
= ssd
->ssd_type
;
1341 sd
->sd_dpl
= ssd
->ssd_dpl
;
1342 sd
->sd_p
= ssd
->ssd_p
;
1343 sd
->sd_gran
= ssd
->ssd_gran
;
1349 * Populate the (physmap) array with base/bound pairs describing the
1350 * available physical memory in the system, then test this memory and
1351 * build the phys_avail array describing the actually-available memory.
1353 * If we cannot accurately determine the physical memory map, then use
1354 * value from the 0xE801 call, and failing that, the RTC.
1356 * Total memory size may be set by the kernel environment variable
1357 * hw.physmem or the compile-time define MAXMEM.
1359 * XXX first should be vm_paddr_t.
1362 getmemsize(caddr_t kmdp
, u_int64_t first
)
1364 int i
, off
, physmap_idx
, pa_indx
, da_indx
;
1365 vm_paddr_t pa
, physmap
[PHYSMAP_SIZE
];
1366 u_long physmem_tunable
;
1368 struct bios_smap
*smapbase
, *smap
, *smapend
;
1370 quad_t dcons_addr
, dcons_size
;
1372 bzero(physmap
, sizeof(physmap
));
1377 * get memory map from INT 15:E820, kindly supplied by the loader.
1379 * subr_module.c says:
1380 * "Consumer may safely assume that size value precedes data."
1381 * ie: an int32_t immediately precedes smap.
1383 smapbase
= (struct bios_smap
*)preload_search_info(kmdp
,
1384 MODINFO_METADATA
| MODINFOMD_SMAP
);
1385 if (smapbase
== NULL
)
1386 panic("No BIOS smap info from loader!");
1388 smapsize
= *((u_int32_t
*)smapbase
- 1);
1389 smapend
= (struct bios_smap
*)((uintptr_t)smapbase
+ smapsize
);
1391 for (smap
= smapbase
; smap
< smapend
; smap
++) {
1392 if (boothowto
& RB_VERBOSE
)
1393 kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
1394 smap
->type
, smap
->base
, smap
->length
);
1396 if (smap
->type
!= SMAP_TYPE_MEMORY
)
1399 if (smap
->length
== 0)
1402 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1403 if (smap
->base
< physmap
[i
+ 1]) {
1404 if (boothowto
& RB_VERBOSE
)
1406 "Overlapping or non-monotonic memory region, ignoring second region\n");
1411 if (smap
->base
== physmap
[physmap_idx
+ 1]) {
1412 physmap
[physmap_idx
+ 1] += smap
->length
;
1417 if (physmap_idx
== PHYSMAP_SIZE
) {
1419 "Too many segments in the physical address map, giving up\n");
1422 physmap
[physmap_idx
] = smap
->base
;
1423 physmap
[physmap_idx
+ 1] = smap
->base
+ smap
->length
;
1427 * Find the 'base memory' segment for SMP
1430 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1431 if (physmap
[i
] == 0x00000000) {
1432 basemem
= physmap
[i
+ 1] / 1024;
1437 panic("BIOS smap did not include a basemem segment!");
1440 /* make hole for AP bootstrap code */
1441 physmap
[1] = mp_bootaddress(physmap
[1] / 1024);
1443 /* look for the MP hardware - needed for apic addresses */
1448 * Maxmem isn't the "maximum memory", it's one larger than the
1449 * highest page of the physical address space. It should be
1450 * called something like "Maxphyspage". We may adjust this
1451 * based on ``hw.physmem'' and the results of the memory test.
1453 Maxmem
= atop(physmap
[physmap_idx
+ 1]);
1456 Maxmem
= MAXMEM
/ 4;
1459 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable
))
1460 Maxmem
= atop(physmem_tunable
);
1463 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
1466 if (Maxmem
> atop(physmap
[physmap_idx
+ 1]))
1467 Maxmem
= atop(physmap
[physmap_idx
+ 1]);
1469 if (atop(physmap
[physmap_idx
+ 1]) != Maxmem
&&
1470 (boothowto
& RB_VERBOSE
))
1471 kprintf("Physical memory use set to %ldK\n", Maxmem
* 4);
1473 /* call pmap initialization to make new kernel address space */
1474 pmap_bootstrap(&first
);
1477 * Size up each available chunk of physical memory.
1479 physmap
[0] = PAGE_SIZE
; /* mask off page 0 */
1482 phys_avail
[pa_indx
++] = physmap
[0];
1483 phys_avail
[pa_indx
] = physmap
[0];
1484 dump_avail
[da_indx
] = physmap
[0];
1488 * Get dcons buffer address
1490 if (kgetenv_quad("dcons.addr", &dcons_addr
) == 0 ||
1491 kgetenv_quad("dcons.size", &dcons_size
) == 0)
1495 * physmap is in bytes, so when converting to page boundaries,
1496 * round up the start address and round down the end address.
1498 for (i
= 0; i
<= physmap_idx
; i
+= 2) {
1501 end
= ptoa((vm_paddr_t
)Maxmem
);
1502 if (physmap
[i
+ 1] < end
)
1503 end
= trunc_page(physmap
[i
+ 1]);
1504 for (pa
= round_page(physmap
[i
]); pa
< end
; pa
+= PAGE_SIZE
) {
1505 int tmp
, page_bad
, full
;
1506 int *ptr
= (int *)CADDR1
;
1510 * block out kernel memory as not available.
1512 if (pa
>= 0x100000 && pa
< first
)
1516 * block out dcons buffer
1519 && pa
>= trunc_page(dcons_addr
)
1520 && pa
< dcons_addr
+ dcons_size
)
1526 * map page into kernel: valid, read/write,non-cacheable
1528 *pte
= pa
| PG_V
| PG_RW
| PG_N
;
1533 * Test for alternating 1's and 0's
1535 *(volatile int *)ptr
= 0xaaaaaaaa;
1536 if (*(volatile int *)ptr
!= 0xaaaaaaaa)
1539 * Test for alternating 0's and 1's
1541 *(volatile int *)ptr
= 0x55555555;
1542 if (*(volatile int *)ptr
!= 0x55555555)
1547 *(volatile int *)ptr
= 0xffffffff;
1548 if (*(volatile int *)ptr
!= 0xffffffff)
1553 *(volatile int *)ptr
= 0x0;
1554 if (*(volatile int *)ptr
!= 0x0)
1557 * Restore original value.
1562 * Adjust array of valid/good pages.
1564 if (page_bad
== TRUE
)
1567 * If this good page is a continuation of the
1568 * previous set of good pages, then just increase
1569 * the end pointer. Otherwise start a new chunk.
1570 * Note that "end" points one higher than end,
1571 * making the range >= start and < end.
1572 * If we're also doing a speculative memory
1573 * test and we are at or past the end, bump up Maxmem
1574 * so that we keep going. The first bad page
1575 * will terminate the loop.
1577 if (phys_avail
[pa_indx
] == pa
) {
1578 phys_avail
[pa_indx
] += PAGE_SIZE
;
1581 if (pa_indx
== PHYS_AVAIL_ARRAY_END
) {
1583 "Too many holes in the physical address space, giving up\n");
1588 phys_avail
[pa_indx
++] = pa
; /* start */
1589 phys_avail
[pa_indx
] = pa
+ PAGE_SIZE
; /* end */
1593 if (dump_avail
[da_indx
] == pa
) {
1594 dump_avail
[da_indx
] += PAGE_SIZE
;
1597 if (da_indx
== DUMP_AVAIL_ARRAY_END
) {
1601 dump_avail
[da_indx
++] = pa
; /* start */
1602 dump_avail
[da_indx
] = pa
+ PAGE_SIZE
; /* end */
1614 * The last chunk must contain at least one page plus the message
1615 * buffer to avoid complicating other code (message buffer address
1616 * calculation, etc.).
1618 while (phys_avail
[pa_indx
- 1] + PAGE_SIZE
+
1619 round_page(MSGBUF_SIZE
) >= phys_avail
[pa_indx
]) {
1620 physmem
-= atop(phys_avail
[pa_indx
] - phys_avail
[pa_indx
- 1]);
1621 phys_avail
[pa_indx
--] = 0;
1622 phys_avail
[pa_indx
--] = 0;
1625 Maxmem
= atop(phys_avail
[pa_indx
]);
1627 /* Trim off space for the message buffer. */
1628 phys_avail
[pa_indx
] -= round_page(MSGBUF_SIZE
);
1630 avail_end
= phys_avail
[pa_indx
];
1632 /* Map the message buffer. */
1633 for (off
= 0; off
< round_page(MSGBUF_SIZE
); off
+= PAGE_SIZE
)
1634 pmap_kenter((vm_offset_t
)msgbufp
+ off
, phys_avail
[pa_indx
] +
1647 * 7 Device Not Available (x87)
1649 * 9 Coprocessor Segment overrun (unsupported, reserved)
1651 * 11 Segment not present
1653 * 13 General Protection
1656 * 16 x87 FP Exception pending
1657 * 17 Alignment Check
1659 * 19 SIMD floating point
1661 * 32-255 INTn/external sources
1664 hammer_time(u_int64_t modulep
, u_int64_t physfree
)
1669 int metadata_missing
, off
;
1671 struct mdglobaldata
*gd
;
1677 * This must be done before the first references
1678 * to CPU_prvspace[0] are made.
1680 init_paging(&physfree
);
1684 * Prevent lowering of the ipl if we call tsleep() early.
1686 gd
= &CPU_prvspace
[0].mdglobaldata
;
1687 bzero(gd
, sizeof(*gd
));
1690 * Note: on both UP and SMP curthread must be set non-NULL
1691 * early in the boot sequence because the system assumes
1692 * that 'curthread' is never NULL.
1695 gd
->mi
.gd_curthread
= &thread0
;
1696 thread0
.td_gd
= &gd
->mi
;
1698 atdevbase
= ISA_HOLE_START
+ PTOV_OFFSET
;
1701 metadata_missing
= 0;
1702 if (bootinfo
.bi_modulep
) {
1703 preload_metadata
= (caddr_t
)bootinfo
.bi_modulep
+ KERNBASE
;
1704 preload_bootstrap_relocate(KERNBASE
);
1706 metadata_missing
= 1;
1708 if (bootinfo
.bi_envp
)
1709 kern_envp
= (caddr_t
)bootinfo
.bi_envp
+ KERNBASE
;
1712 preload_metadata
= (caddr_t
)(uintptr_t)(modulep
+ PTOV_OFFSET
);
1713 preload_bootstrap_relocate(PTOV_OFFSET
);
1714 kmdp
= preload_search_by_type("elf kernel");
1716 kmdp
= preload_search_by_type("elf64 kernel");
1717 boothowto
= MD_FETCH(kmdp
, MODINFOMD_HOWTO
, int);
1718 kern_envp
= MD_FETCH(kmdp
, MODINFOMD_ENVP
, char *) + PTOV_OFFSET
;
1720 ksym_start
= MD_FETCH(kmdp
, MODINFOMD_SSYM
, uintptr_t);
1721 ksym_end
= MD_FETCH(kmdp
, MODINFOMD_ESYM
, uintptr_t);
1725 * start with one cpu. Note: with one cpu, ncpus2_shift, ncpus2_mask,
1726 * and ncpus_fit_mask remain 0.
1731 /* Init basic tunables, hz etc */
1735 * make gdt memory segments
1737 gdt_segs
[GPROC0_SEL
].ssd_base
=
1738 (uintptr_t) &CPU_prvspace
[0].mdglobaldata
.gd_common_tss
;
1740 gd
->mi
.gd_prvspace
= &CPU_prvspace
[0];
1742 for (x
= 0; x
< NGDT
; x
++) {
1743 if (x
!= GPROC0_SEL
&& x
!= (GPROC0_SEL
+ 1))
1744 ssdtosd(&gdt_segs
[x
], &gdt
[x
]);
1746 ssdtosyssd(&gdt_segs
[GPROC0_SEL
],
1747 (struct system_segment_descriptor
*)&gdt
[GPROC0_SEL
]);
1749 r_gdt
.rd_limit
= NGDT
* sizeof(gdt
[0]) - 1;
1750 r_gdt
.rd_base
= (long) gdt
;
1753 wrmsr(MSR_FSBASE
, 0); /* User value */
1754 wrmsr(MSR_GSBASE
, (u_int64_t
)&gd
->mi
);
1755 wrmsr(MSR_KGSBASE
, 0); /* User value while in the kernel */
1757 mi_gdinit(&gd
->mi
, 0);
1759 proc0paddr
= proc0paddr_buff
;
1760 mi_proc0init(&gd
->mi
, proc0paddr
);
1761 safepri
= TDPRI_MAX
;
1763 /* spinlocks and the BGL */
1767 for (x
= 0; x
< NIDT
; x
++)
1768 setidt(x
, &IDTVEC(rsvd
), SDT_SYSIGT
, SEL_KPL
, 0);
1769 setidt(IDT_DE
, &IDTVEC(div
), SDT_SYSIGT
, SEL_KPL
, 0);
1770 setidt(IDT_DB
, &IDTVEC(dbg
), SDT_SYSIGT
, SEL_KPL
, 0);
1771 setidt(IDT_NMI
, &IDTVEC(nmi
), SDT_SYSIGT
, SEL_KPL
, 1);
1772 setidt(IDT_BP
, &IDTVEC(bpt
), SDT_SYSIGT
, SEL_UPL
, 0);
1773 setidt(IDT_OF
, &IDTVEC(ofl
), SDT_SYSIGT
, SEL_KPL
, 0);
1774 setidt(IDT_BR
, &IDTVEC(bnd
), SDT_SYSIGT
, SEL_KPL
, 0);
1775 setidt(IDT_UD
, &IDTVEC(ill
), SDT_SYSIGT
, SEL_KPL
, 0);
1776 setidt(IDT_NM
, &IDTVEC(dna
), SDT_SYSIGT
, SEL_KPL
, 0);
1777 setidt(IDT_DF
, &IDTVEC(dblfault
), SDT_SYSIGT
, SEL_KPL
, 1);
1778 setidt(IDT_FPUGP
, &IDTVEC(fpusegm
), SDT_SYSIGT
, SEL_KPL
, 0);
1779 setidt(IDT_TS
, &IDTVEC(tss
), SDT_SYSIGT
, SEL_KPL
, 0);
1780 setidt(IDT_NP
, &IDTVEC(missing
), SDT_SYSIGT
, SEL_KPL
, 0);
1781 setidt(IDT_SS
, &IDTVEC(stk
), SDT_SYSIGT
, SEL_KPL
, 0);
1782 setidt(IDT_GP
, &IDTVEC(prot
), SDT_SYSIGT
, SEL_KPL
, 0);
1783 setidt(IDT_PF
, &IDTVEC(page
), SDT_SYSIGT
, SEL_KPL
, 0);
1784 setidt(IDT_MF
, &IDTVEC(fpu
), SDT_SYSIGT
, SEL_KPL
, 0);
1785 setidt(IDT_AC
, &IDTVEC(align
), SDT_SYSIGT
, SEL_KPL
, 0);
1786 setidt(IDT_MC
, &IDTVEC(mchk
), SDT_SYSIGT
, SEL_KPL
, 0);
1787 setidt(IDT_XF
, &IDTVEC(xmm
), SDT_SYSIGT
, SEL_KPL
, 0);
1789 r_idt
.rd_limit
= sizeof(idt0
) - 1;
1790 r_idt
.rd_base
= (long) idt
;
1794 * Initialize the console before we print anything out.
1799 if (metadata_missing
)
1800 kprintf("WARNING: loader(8) metadata is missing!\n");
1810 if (boothowto
& RB_KDB
)
1811 Debugger("Boot flags requested debugger");
1815 finishidentcpu(); /* Final stage of CPU initialization */
1816 setidt(6, &IDTVEC(ill
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1817 setidt(13, &IDTVEC(prot
), SDT_SYS386TGT
, SEL_KPL
, GSEL(GCODE_SEL
, SEL_KPL
));
1819 identify_cpu(); /* Final stage of CPU initialization */
1820 initializecpu(); /* Initialize CPU registers */
1822 /* make an initial tss so cpu can get interrupt stack on syscall! */
1823 gd
->gd_common_tss
.tss_rsp0
=
1824 (register_t
)(thread0
.td_kstack
+
1825 KSTACK_PAGES
* PAGE_SIZE
- sizeof(struct pcb
));
1826 /* Ensure the stack is aligned to 16 bytes */
1827 gd
->gd_common_tss
.tss_rsp0
&= ~0xFul
;
1828 gd
->gd_rsp0
= gd
->gd_common_tss
.tss_rsp0
;
1830 /* doublefault stack space, runs on ist1 */
1831 gd
->gd_common_tss
.tss_ist1
= (long)&dblfault_stack
[sizeof(dblfault_stack
)];
1833 /* Set the IO permission bitmap (empty due to tss seg limit) */
1834 gd
->gd_common_tss
.tss_iobase
= sizeof(struct amd64tss
);
1836 gsel_tss
= GSEL(GPROC0_SEL
, SEL_KPL
);
1837 gd
->gd_tss_gdt
= &gdt
[GPROC0_SEL
];
1838 gd
->gd_common_tssd
= *gd
->gd_tss_gdt
;
1841 /* Set up the fast syscall stuff */
1842 msr
= rdmsr(MSR_EFER
) | EFER_SCE
;
1843 wrmsr(MSR_EFER
, msr
);
1844 wrmsr(MSR_LSTAR
, (u_int64_t
)IDTVEC(fast_syscall
));
1845 wrmsr(MSR_CSTAR
, (u_int64_t
)IDTVEC(fast_syscall32
));
1846 msr
= ((u_int64_t
)GSEL(GCODE_SEL
, SEL_KPL
) << 32) |
1847 ((u_int64_t
)GSEL(GUCODE32_SEL
, SEL_UPL
) << 48);
1848 wrmsr(MSR_STAR
, msr
);
1849 wrmsr(MSR_SF_MASK
, PSL_NT
|PSL_T
|PSL_I
|PSL_C
|PSL_D
);
1851 getmemsize(kmdp
, physfree
);
1852 init_param2(physmem
);
1854 /* now running on new page tables, configured,and u/iom is accessible */
1856 /* Map the message buffer. */
1858 for (off
= 0; off
< round_page(MSGBUF_SIZE
); off
+= PAGE_SIZE
)
1859 pmap_kenter((vm_offset_t
)msgbufp
+ off
, avail_end
+ off
);
1862 msgbufinit(msgbufp
, MSGBUF_SIZE
);
1865 /* transfer to user mode */
1867 _ucodesel
= GSEL(GUCODE_SEL
, SEL_UPL
);
1868 _udatasel
= GSEL(GUDATA_SEL
, SEL_UPL
);
1869 _ucode32sel
= GSEL(GUCODE32_SEL
, SEL_UPL
);
1875 /* setup proc 0's pcb */
1876 thread0
.td_pcb
->pcb_flags
= 0;
1877 thread0
.td_pcb
->pcb_cr3
= KPML4phys
;
1878 thread0
.td_pcb
->pcb_ext
= 0;
1879 lwp0
.lwp_md
.md_regs
= &proc0_tf
;
1880 env
= kgetenv("kernelname");
1882 strlcpy(kernelname
, env
, sizeof(kernelname
));
1884 /* Location of kernel stack for locore */
1885 return ((u_int64_t
)thread0
.td_pcb
);
1889 * Initialize machine-dependent portions of the global data structure.
1890 * Note that the global data area and cpu0's idlestack in the private
1891 * data space were allocated in locore.
1893 * Note: the idlethread's cpl is 0
1895 * WARNING! Called from early boot, 'mycpu' may not work yet.
1898 cpu_gdinit(struct mdglobaldata
*gd
, int cpu
)
1901 gd
->mi
.gd_curthread
= &gd
->mi
.gd_idlethread
;
1903 lwkt_init_thread(&gd
->mi
.gd_idlethread
,
1904 gd
->mi
.gd_prvspace
->idlestack
,
1905 sizeof(gd
->mi
.gd_prvspace
->idlestack
),
1906 TDF_MPSAFE
, &gd
->mi
);
1907 lwkt_set_comm(&gd
->mi
.gd_idlethread
, "idle_%d", cpu
);
1908 gd
->mi
.gd_idlethread
.td_switch
= cpu_lwkt_switch
;
1909 gd
->mi
.gd_idlethread
.td_sp
-= sizeof(void *);
1910 *(void **)gd
->mi
.gd_idlethread
.td_sp
= cpu_idle_restore
;
1914 is_globaldata_space(vm_offset_t saddr
, vm_offset_t eaddr
)
1916 if (saddr
>= (vm_offset_t
)&CPU_prvspace
[0] &&
1917 eaddr
<= (vm_offset_t
)&CPU_prvspace
[MAXCPU
]) {
1924 globaldata_find(int cpu
)
1926 KKASSERT(cpu
>= 0 && cpu
< ncpus
);
1927 return(&CPU_prvspace
[cpu
].mdglobaldata
.mi
);
1930 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
1931 static void f00f_hack(void *unused
);
1932 SYSINIT(f00f_hack
, SI_BOOT2_BIOS
, SI_ORDER_ANY
, f00f_hack
, NULL
);
1935 f00f_hack(void *unused
)
1937 struct gate_descriptor
*new_idt
;
1943 kprintf("Intel Pentium detected, installing workaround for F00F bug\n");
1945 r_idt
.rd_limit
= sizeof(idt0
) - 1;
1947 tmp
= kmem_alloc(&kernel_map
, PAGE_SIZE
* 2);
1949 panic("kmem_alloc returned 0");
1950 if (((unsigned int)tmp
& (PAGE_SIZE
-1)) != 0)
1951 panic("kmem_alloc returned non-page-aligned memory");
1952 /* Put the first seven entries in the lower page */
1953 new_idt
= (struct gate_descriptor
*)(tmp
+ PAGE_SIZE
- (7*8));
1954 bcopy(idt
, new_idt
, sizeof(idt0
));
1955 r_idt
.rd_base
= (int)new_idt
;
1958 if (vm_map_protect(&kernel_map
, tmp
, tmp
+ PAGE_SIZE
,
1959 VM_PROT_READ
, FALSE
) != KERN_SUCCESS
)
1960 panic("vm_map_protect failed");
1963 #endif /* defined(I586_CPU) && !NO_F00F_HACK */
1966 ptrace_set_pc(struct lwp
*lp
, unsigned long addr
)
1968 lp
->lwp_md
.md_regs
->tf_rip
= addr
;
1973 ptrace_single_step(struct lwp
*lp
)
1975 lp
->lwp_md
.md_regs
->tf_rflags
|= PSL_T
;
1980 fill_regs(struct lwp
*lp
, struct reg
*regs
)
1983 struct trapframe
*tp
;
1985 tp
= lp
->lwp_md
.md_regs
;
1986 bcopy(&tp
->tf_rdi
, ®s
->r_rdi
, sizeof(*regs
));
1988 pcb
= lp
->lwp_thread
->td_pcb
;
1993 set_regs(struct lwp
*lp
, struct reg
*regs
)
1996 struct trapframe
*tp
;
1998 tp
= lp
->lwp_md
.md_regs
;
1999 if (!EFL_SECURE(regs
->r_rflags
, tp
->tf_rflags
) ||
2000 !CS_SECURE(regs
->r_cs
))
2002 bcopy(®s
->r_rdi
, &tp
->tf_rdi
, sizeof(*regs
));
2003 pcb
= lp
->lwp_thread
->td_pcb
;
2007 #ifndef CPU_DISABLE_SSE
2009 fill_fpregs_xmm(struct savexmm
*sv_xmm
, struct save87
*sv_87
)
2011 struct env87
*penv_87
= &sv_87
->sv_env
;
2012 struct envxmm
*penv_xmm
= &sv_xmm
->sv_env
;
2015 /* FPU control/status */
2016 penv_87
->en_cw
= penv_xmm
->en_cw
;
2017 penv_87
->en_sw
= penv_xmm
->en_sw
;
2018 penv_87
->en_tw
= penv_xmm
->en_tw
;
2019 penv_87
->en_fip
= penv_xmm
->en_fip
;
2020 penv_87
->en_fcs
= penv_xmm
->en_fcs
;
2021 penv_87
->en_opcode
= penv_xmm
->en_opcode
;
2022 penv_87
->en_foo
= penv_xmm
->en_foo
;
2023 penv_87
->en_fos
= penv_xmm
->en_fos
;
2026 for (i
= 0; i
< 8; ++i
)
2027 sv_87
->sv_ac
[i
] = sv_xmm
->sv_fp
[i
].fp_acc
;
2029 sv_87
->sv_ex_sw
= sv_xmm
->sv_ex_sw
;
2033 set_fpregs_xmm(struct save87
*sv_87
, struct savexmm
*sv_xmm
)
2035 struct env87
*penv_87
= &sv_87
->sv_env
;
2036 struct envxmm
*penv_xmm
= &sv_xmm
->sv_env
;
2039 /* FPU control/status */
2040 penv_xmm
->en_cw
= penv_87
->en_cw
;
2041 penv_xmm
->en_sw
= penv_87
->en_sw
;
2042 penv_xmm
->en_tw
= penv_87
->en_tw
;
2043 penv_xmm
->en_fip
= penv_87
->en_fip
;
2044 penv_xmm
->en_fcs
= penv_87
->en_fcs
;
2045 penv_xmm
->en_opcode
= penv_87
->en_opcode
;
2046 penv_xmm
->en_foo
= penv_87
->en_foo
;
2047 penv_xmm
->en_fos
= penv_87
->en_fos
;
2050 for (i
= 0; i
< 8; ++i
)
2051 sv_xmm
->sv_fp
[i
].fp_acc
= sv_87
->sv_ac
[i
];
2053 sv_xmm
->sv_ex_sw
= sv_87
->sv_ex_sw
;
2055 #endif /* CPU_DISABLE_SSE */
2058 fill_fpregs(struct lwp
*lp
, struct fpreg
*fpregs
)
2060 #ifndef CPU_DISABLE_SSE
2062 fill_fpregs_xmm(&lp
->lwp_thread
->td_pcb
->pcb_save
.sv_xmm
,
2063 (struct save87
*)fpregs
);
2066 #endif /* CPU_DISABLE_SSE */
2067 bcopy(&lp
->lwp_thread
->td_pcb
->pcb_save
.sv_87
, fpregs
, sizeof *fpregs
);
2072 set_fpregs(struct lwp
*lp
, struct fpreg
*fpregs
)
2074 #ifndef CPU_DISABLE_SSE
2076 set_fpregs_xmm((struct save87
*)fpregs
,
2077 &lp
->lwp_thread
->td_pcb
->pcb_save
.sv_xmm
);
2080 #endif /* CPU_DISABLE_SSE */
2081 bcopy(fpregs
, &lp
->lwp_thread
->td_pcb
->pcb_save
.sv_87
, sizeof *fpregs
);
2086 fill_dbregs(struct lwp
*lp
, struct dbreg
*dbregs
)
2089 dbregs
->dr
[0] = rdr0();
2090 dbregs
->dr
[1] = rdr1();
2091 dbregs
->dr
[2] = rdr2();
2092 dbregs
->dr
[3] = rdr3();
2093 dbregs
->dr
[4] = rdr4();
2094 dbregs
->dr
[5] = rdr5();
2095 dbregs
->dr
[6] = rdr6();
2096 dbregs
->dr
[7] = rdr7();
2100 pcb
= lp
->lwp_thread
->td_pcb
;
2101 dbregs
->dr
[0] = pcb
->pcb_dr0
;
2102 dbregs
->dr
[1] = pcb
->pcb_dr1
;
2103 dbregs
->dr
[2] = pcb
->pcb_dr2
;
2104 dbregs
->dr
[3] = pcb
->pcb_dr3
;
2107 dbregs
->dr
[6] = pcb
->pcb_dr6
;
2108 dbregs
->dr
[7] = pcb
->pcb_dr7
;
2114 set_dbregs(struct lwp
*lp
, struct dbreg
*dbregs
)
2117 load_dr0(dbregs
->dr
[0]);
2118 load_dr1(dbregs
->dr
[1]);
2119 load_dr2(dbregs
->dr
[2]);
2120 load_dr3(dbregs
->dr
[3]);
2121 load_dr4(dbregs
->dr
[4]);
2122 load_dr5(dbregs
->dr
[5]);
2123 load_dr6(dbregs
->dr
[6]);
2124 load_dr7(dbregs
->dr
[7]);
2127 struct ucred
*ucred
;
2129 uint64_t mask1
, mask2
;
2132 * Don't let an illegal value for dr7 get set. Specifically,
2133 * check for undefined settings. Setting these bit patterns
2134 * result in undefined behaviour and can lead to an unexpected
2137 /* JG this loop looks unreadable */
2138 /* Check 4 2-bit fields for invalid patterns.
2139 * These fields are R/Wi, for i = 0..3
2141 /* Is 10 in LENi allowed when running in compatibility mode? */
2142 /* Pattern 10 in R/Wi might be used to indicate
2143 * breakpoint on I/O. Further analysis should be
2144 * carried to decide if it is safe and useful to
2145 * provide access to that capability
2147 for (i
= 0, mask1
= 0x3<<16, mask2
= 0x2<<16; i
< 4;
2148 i
++, mask1
<<= 4, mask2
<<= 4)
2149 if ((dbregs
->dr
[7] & mask1
) == mask2
)
2152 pcb
= lp
->lwp_thread
->td_pcb
;
2153 ucred
= lp
->lwp_proc
->p_ucred
;
2156 * Don't let a process set a breakpoint that is not within the
2157 * process's address space. If a process could do this, it
2158 * could halt the system by setting a breakpoint in the kernel
2159 * (if ddb was enabled). Thus, we need to check to make sure
2160 * that no breakpoints are being enabled for addresses outside
2161 * process's address space, unless, perhaps, we were called by
2164 * XXX - what about when the watched area of the user's
2165 * address space is written into from within the kernel
2166 * ... wouldn't that still cause a breakpoint to be generated
2167 * from within kernel mode?
2170 if (priv_check_cred(ucred
, PRIV_ROOT
, 0) != 0) {
2171 if (dbregs
->dr
[7] & 0x3) {
2172 /* dr0 is enabled */
2173 if (dbregs
->dr
[0] >= VM_MAX_USER_ADDRESS
)
2177 if (dbregs
->dr
[7] & (0x3<<2)) {
2178 /* dr1 is enabled */
2179 if (dbregs
->dr
[1] >= VM_MAX_USER_ADDRESS
)
2183 if (dbregs
->dr
[7] & (0x3<<4)) {
2184 /* dr2 is enabled */
2185 if (dbregs
->dr
[2] >= VM_MAX_USER_ADDRESS
)
2189 if (dbregs
->dr
[7] & (0x3<<6)) {
2190 /* dr3 is enabled */
2191 if (dbregs
->dr
[3] >= VM_MAX_USER_ADDRESS
)
2196 pcb
->pcb_dr0
= dbregs
->dr
[0];
2197 pcb
->pcb_dr1
= dbregs
->dr
[1];
2198 pcb
->pcb_dr2
= dbregs
->dr
[2];
2199 pcb
->pcb_dr3
= dbregs
->dr
[3];
2200 pcb
->pcb_dr6
= dbregs
->dr
[6];
2201 pcb
->pcb_dr7
= dbregs
->dr
[7];
2203 pcb
->pcb_flags
|= PCB_DBREGS
;
2210 * Return > 0 if a hardware breakpoint has been hit, and the
2211 * breakpoint was in user space. Return 0, otherwise.
2214 user_dbreg_trap(void)
2216 u_int64_t dr7
, dr6
; /* debug registers dr6 and dr7 */
2217 u_int64_t bp
; /* breakpoint bits extracted from dr6 */
2218 int nbp
; /* number of breakpoints that triggered */
2219 caddr_t addr
[4]; /* breakpoint addresses */
2223 if ((dr7
& 0xff) == 0) {
2225 * all GE and LE bits in the dr7 register are zero,
2226 * thus the trap couldn't have been caused by the
2227 * hardware debug registers
2238 * None of the breakpoint bits are set meaning this
2239 * trap was not caused by any of the debug registers
2245 * at least one of the breakpoints were hit, check to see
2246 * which ones and if any of them are user space addresses
2250 addr
[nbp
++] = (caddr_t
)rdr0();
2253 addr
[nbp
++] = (caddr_t
)rdr1();
2256 addr
[nbp
++] = (caddr_t
)rdr2();
2259 addr
[nbp
++] = (caddr_t
)rdr3();
2262 for (i
=0; i
<nbp
; i
++) {
2264 (caddr_t
)VM_MAX_USER_ADDRESS
) {
2266 * addr[i] is in user space
2273 * None of the breakpoints are in user space.
2281 Debugger(const char *msg
)
2283 kprintf("Debugger(\"%s\") called.\n", msg
);
2290 * Provide inb() and outb() as functions. They are normally only
2291 * available as macros calling inlined functions, thus cannot be
2292 * called inside DDB.
2294 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
2300 /* silence compiler warnings */
2302 void outb(u_int
, u_char
);
2309 * We use %%dx and not %1 here because i/o is done at %dx and not at
2310 * %edx, while gcc generates inferior code (movw instead of movl)
2311 * if we tell it to load (u_short) port.
2313 __asm
__volatile("inb %%dx,%0" : "=a" (data
) : "d" (port
));
2318 outb(u_int port
, u_char data
)
2322 * Use an unnecessary assignment to help gcc's register allocator.
2323 * This makes a large difference for gcc-1.40 and a tiny difference
2324 * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for
2325 * best results. gcc-2.6.0 can't handle this.
2328 __asm
__volatile("outb %0,%%dx" : : "a" (al
), "d" (port
));
2335 #include "opt_cpu.h"
2339 * initialize all the SMP locks
2342 /* critical region when masking or unmasking interrupts */
2343 struct spinlock_deprecated imen_spinlock
;
2345 /* Make FAST_INTR() routines sequential */
2346 struct spinlock_deprecated fast_intr_spinlock
;
2348 /* critical region for old style disable_intr/enable_intr */
2349 struct spinlock_deprecated mpintr_spinlock
;
2351 /* critical region around INTR() routines */
2352 struct spinlock_deprecated intr_spinlock
;
2354 /* lock region used by kernel profiling */
2355 struct spinlock_deprecated mcount_spinlock
;
2357 /* locks com (tty) data/hardware accesses: a FASTINTR() */
2358 struct spinlock_deprecated com_spinlock
;
2360 /* locks kernel kprintfs */
2361 struct spinlock_deprecated cons_spinlock
;
2363 /* lock regions around the clock hardware */
2364 struct spinlock_deprecated clock_spinlock
;
2366 /* lock around the MP rendezvous */
2367 struct spinlock_deprecated smp_rv_spinlock
;
2373 * mp_lock = 0; BSP already owns the MP lock
2376 * Get the initial mp_lock with a count of 1 for the BSP.
2377 * This uses a LOGICAL cpu ID, ie BSP == 0.
2380 cpu_get_initial_mplock();
2383 spin_lock_init(&mcount_spinlock
);
2384 spin_lock_init(&fast_intr_spinlock
);
2385 spin_lock_init(&intr_spinlock
);
2386 spin_lock_init(&mpintr_spinlock
);
2387 spin_lock_init(&imen_spinlock
);
2388 spin_lock_init(&smp_rv_spinlock
);
2389 spin_lock_init(&com_spinlock
);
2390 spin_lock_init(&clock_spinlock
);
2391 spin_lock_init(&cons_spinlock
);
2393 /* our token pool needs to work early */
2394 lwkt_token_pool_init();