/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright (c) 2016, Joyent, Inc. All rights reserved.
 */

/*	Copyright (c) 1990, 1991 UNIX System Laboratories, Inc.	*/
/*	Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T		*/
/*	  All Rights Reserved						*/

/*	Copyright (c) 1987, 1988 Microsoft Corporation			*/
/*	  All Rights Reserved						*/
#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/psw.h>
#include <sys/reboot.h>
#include <sys/x86_archext.h>
#include <sys/machparam.h>

#include <sys/segments.h>
#include <sys/pcb.h>
#include <sys/trap.h>
#include <sys/ftrace.h>
#include <sys/traptrace.h>
#include <sys/clock.h>
#include <sys/cmn_err.h>
#include <sys/pit.h>
#include <sys/panic.h>

#include "assym.h"
#if !defined(__amd64)
#error "unsupported architecture"
#endif
/*
 * Our assumptions:
 *	- We are running in protected-paged mode.
 *	- Interrupts are disabled.
 *	- The GDT and IDT are the caller's; we need our own copies.
 *	- The kernel's text, initialized data and bss are mapped.
 *
 * Our actions:
 *	- Save arguments
 *	- Initialize our stack pointer to the thread 0 stack (t0stack)
 *	  and leave room for a phony "struct regs".
 *	- Our GDT and IDT need to get munged.
 *	- Since we are using the boot's GDT descriptors, we need
 *	  to copy them into our GDT before we switch to ours.
 *	- We start using our GDT by loading correct values in the
 *	  selector registers (cs=KCS_SEL, ds=es=ss=KDS_SEL, fs=KFS_SEL,
 *	  gs=KGS_SEL).
 *	- The default LDT entry for syscall is set.
 *	- We load the default LDT into the hardware LDT register.
 *	- We load the default TSS into the hardware task register.
 *	- mlsetup(%esp) gets called.
 *	- We change our appearance to look like the real thread 0.
 *	  (NOTE: making ourselves into a real thread may be a no-op.)
 *	- main() gets called.  (NOTE: main() never returns.)
 *
 * NOW, the real code!
 */
/*
 * The very first thing in the kernel's text segment must be a jump
 * to the os/fakebop.c startup code.
 */
	.text
	jmp	_start
	/*
	 * Globals:
	 */
	.globl	_locore_start
	.globl	mlsetup
	.globl	main
	.globl	panic
	.globl	t0stack
	.globl	t0
	.globl	sysp
	.globl	edata

	/*
	 * call back into boot - sysp (bootsvcs.h) and bootops (bootconf.h)
	 */
	.globl	bootops
	.globl	bootopsp
	/*
	 * NOTE: t0stack should be the first thing in the data section so that
	 * if it ever overflows, it will fault on the last kernel text page.
	 */
	.data
	.comm	t0stack, DEFAULTSTKSZ, 32
	.comm	t0, 4094, 32
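	/*
	 * (For reference: the .comm operands are symbol, size in bytes, and
	 * byte alignment, so t0stack is a DEFAULTSTKSZ-byte, 32-byte-aligned
	 * BSS block, and t0, the thread 0 structure, is 4094 bytes.)
	 */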
/*
 * kobj_init() vectors us back to here with (note) a slightly different
 * set of arguments than _start is given.
 *
 * XXX	Make this less vile, please.
 */
	ENTRY_NP(_locore_start)
	/*
	 * %rdi = boot services (should die someday)
	 * %rdx = bootops
	 */
	leaq	edata(%rip), %rbp	/* reference edata for ksyms */
	movq	$0, (%rbp)		/* limit stack back trace */
	/*
	 * Initialize our stack pointer to the thread 0 stack (t0stack)
	 * and leave room for a "struct regs" for lwp0.  Note that the
	 * stack doesn't actually align to a 16-byte boundary until just
	 * before we call mlsetup because we want to use %rsp to point at
	 * our regs structure.
	 */
	leaq	t0stack(%rip), %rsp
	addq	$_CONST(DEFAULTSTKSZ - REGSIZE), %rsp
#if (REGSIZE & 15) == 0
	subq	$8, %rsp
#endif
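	/*
	 * (Stack arithmetic, for reference, assuming DEFAULTSTKSZ is a
	 * multiple of 16: t0stack is 32-byte aligned, so after the addq
	 * above %rsp == t0stack + DEFAULTSTKSZ - REGSIZE.  REGSIZE is a
	 * multiple of 8; if it is also a multiple of 16 we subtract 8, so
	 * either way %rsp % 16 == 8 here, and the single pushq %rbp done
	 * just before calling mlsetup() below leaves the stack 16-byte
	 * aligned at the call, as the amd64 ABI requires.)
	 */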
	/*
	 * Save call back for special x86 boot services vector
	 */
	movq	%rdi, sysp(%rip)

	movq	%rdx, bootops(%rip)		/* save bootops */
	movq	$bootops, bootopsp(%rip)
	/*
	 * Save arguments and flags, if only for debugging ..
	 */
	movq	%rdi, REGOFF_RDI(%rsp)
	movq	%rsi, REGOFF_RSI(%rsp)
	movq	%rdx, REGOFF_RDX(%rsp)
	movq	%rcx, REGOFF_RCX(%rsp)
	movq	%r8, REGOFF_R8(%rsp)
	movq	%r9, REGOFF_R9(%rsp)
	pushf
	popq	%r11
	movq	%r11, REGOFF_RFL(%rsp)
	/*
	 * Enable write protect and alignment check faults.
	 */
	movq	%cr0, %rax
	orq	$_CONST(CR0_WP|CR0_AM), %rax
	andq	$_BITNOT(CR0_WT|CR0_CE), %rax
	movq	%rax, %cr0
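	/*
	 * (In rough C, the %cr0 update above is:
	 *
	 *	cr0 = (cr0 | CR0_WP | CR0_AM) & ~(CR0_WT | CR0_CE);
	 *
	 * i.e. make the kernel honor page-level write protection, enable
	 * alignment-check faults, and clear the legacy cache-control bits.)
	 */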
	/*
	 * (We just assert this works by virtue of being here)
	 */
	bts	$X86FSET_CPUID, x86_featureset(%rip)
	/*
	 * mlsetup() gets called with a struct regs as argument, while
	 * main takes no args and should never return.
	 */
	xorl	%ebp, %ebp
	movq	%rsp, %rdi
	pushq	%rbp
	/* (stack pointer now aligned on 16-byte boundary right here) */
	movq	%rsp, %rbp
	call	mlsetup
	call	main
	/* NOTREACHED */
	leaq	__return_from_main(%rip), %rdi
	xorl	%eax, %eax
	call	panic
	SET_SIZE(_locore_start)
__return_from_main:
	.string	"main() returned"
/*
 * For stack layout, see privregs.h
 * When cmntrap gets called, the error code and trap number have been pushed.
 * When cmntrap_pushed gets called, the entire struct regs has been pushed.
 */
	.globl	trap		/* C handler called below */

	ENTRY_NP2(cmntrap, _cmntrap)

	INTR_PUSH

	ALTENTRY(cmntrap_pushed)

	movq	%rsp, %rbp
	/*
	 * - if this is a #pf i.e. T_PGFLT, %r15 is live
	 *   and contains the faulting address i.e. a copy of %cr2
	 *
	 * - if this is a #db i.e. T_SGLSTP, %r15 is live
	 *   and contains the value of %db6
	 */
	TRACE_PTR(%rdi, %rbx, %ebx, %rcx, $TT_TRAP)	/* Uses labels 8 and 9 */
	TRACE_REGS(%rdi, %rsp, %rbx, %rcx)		/* Uses label 9 */
	TRACE_STAMP(%rdi)		/* Clobbers %eax, %edx, uses 9 */
	/*
	 * We must first check if DTrace has set its NOFAULT bit.  This
	 * regrettably must happen before the trap stack is recorded, because
	 * this requires a call to getpcstack() and may induce recursion if an
	 * fbt::getpcstack: enabling is inducing the bad load.
	 */
	movl	%gs:CPU_ID, %eax
	shlq	$CPU_CORE_SHIFT, %rax
	leaq	cpu_core(%rip), %r8
	addq	%r8, %rax
	movw	CPUC_DTRACE_FLAGS(%rax), %cx
	testw	$CPU_DTRACE_NOFAULT, %cx
	jnz	.dtrace_induced
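	/*
	 * (The address computation above is, in rough C:
	 *
	 *	cpu_core_t *cpuc = &cpu_core[CPU->cpu_id];
	 *	if (cpuc->cpuc_dtrace_flags & CPU_DTRACE_NOFAULT)
	 *		goto dtrace_induced;
	 *
	 * with the array index done as a shift by CPU_CORE_SHIFT, since
	 * sizeof (cpu_core_t) is a power of two.)
	 */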
	TRACE_STACK(%rdi)

	movq	%rbp, %rdi
	movq	%r15, %rsi
	movl	%gs:CPU_ID, %edx

	/*
	 * We know that this isn't a DTrace non-faulting load; we can now safely
	 * reenable interrupts.  (In the case of pagefaults, we enter through an
	 * interrupt gate.)
	 */
	ENABLE_INTR_FLAGS

	call	trap		/* trap(rp, addr, cpuid) handles all traps */
	jmp	_sys_rtt
.dtrace_induced:
	cmpw	$KCS_SEL, REGOFF_CS(%rbp)	/* test CS for user-mode trap */
	jne	3f				/* if from user, panic */

	cmpl	$T_PGFLT, REGOFF_TRAPNO(%rbp)
	je	1f

	cmpl	$T_GPFLT, REGOFF_TRAPNO(%rbp)
	je	0f

	cmpl	$T_ILLINST, REGOFF_TRAPNO(%rbp)
	je	0f

	cmpl	$T_ZERODIV, REGOFF_TRAPNO(%rbp)
	jne	4f				/* if not PF/GP/UD/DE, panic */

	orw	$CPU_DTRACE_DIVZERO, %cx
	movw	%cx, CPUC_DTRACE_FLAGS(%rax)
	jmp	2f
0:
	/*
	 * If we've taken a GPF, we don't (unfortunately) have the address that
	 * induced the fault.  So instead of setting the fault to BADADDR,
	 * we'll set the fault to ILLOP.
	 */
	orw	$CPU_DTRACE_ILLOP, %cx
	movw	%cx, CPUC_DTRACE_FLAGS(%rax)
	jmp	2f
1:
	orw	$CPU_DTRACE_BADADDR, %cx
	movw	%cx, CPUC_DTRACE_FLAGS(%rax)	/* set fault to bad addr */
	movq	%r15, CPUC_DTRACE_ILLVAL(%rax)
					/* fault addr is illegal value */
2:
	movq	REGOFF_RIP(%rbp), %rdi
	movq	%rdi, %r12
	call	dtrace_instr_size
	addq	%rax, %r12
	movq	%r12, REGOFF_RIP(%rbp)
	INTR_POP
	IRET
	/*NOTREACHED*/
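	/*
	 * (In rough C, the fixup above is:
	 *
	 *	rp->r_rip += dtrace_instr_size((uchar_t *)rp->r_rip);
	 *
	 * i.e. skip over the faulting instruction and resume, with the
	 * CPUC_DTRACE_* flags left set for the probe context to inspect.)
	 */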
3:
	leaq	dtrace_badflags(%rip), %rdi
	xorl	%eax, %eax
	call	panic
4:
	leaq	dtrace_badtrap(%rip), %rdi
	xorl	%eax, %eax
	call	panic
	SET_SIZE(cmntrap)
	SET_SIZE(_cmntrap)
/*
 * Declare a uintptr_t which has the size of _cmntrap to enable stack
 * traceback code to know when a regs structure is on the stack.
 */
	.globl	_cmntrap_size
	.align	CLONGSIZE
_cmntrap_size:
	.NWORD	. - _cmntrap
	.type	_cmntrap_size, @object
dtrace_badflags:
	.string	"bad DTrace flags"

dtrace_badtrap:
	.string	"bad DTrace trap"
	.globl	trap		/* C handler called below */

	ENTRY_NP(cmninttrap)

	INTR_PUSH
	INTGATE_INIT_KERNEL_FLAGS

	TRACE_PTR(%rdi, %rbx, %ebx, %rcx, $TT_TRAP)	/* Uses labels 8 and 9 */
	TRACE_REGS(%rdi, %rsp, %rbx, %rcx)		/* Uses label 9 */
	TRACE_STAMP(%rdi)		/* Clobbers %eax, %edx, uses 9 */

	movq	%rsp, %rbp

	movl	%gs:CPU_ID, %edx
	xorl	%esi, %esi
	movq	%rsp, %rdi
	call	trap		/* trap(rp, addr, cpuid) handles all traps */
	jmp	_sys_rtt
	SET_SIZE(cmninttrap)
/*
 * Handle traps early in boot.  Just revectors into C quickly as
 * these are always fatal errors.
 *
 * Adjust %rsp to get same stack layout as in 32bit mode for bop_trap().
 */
	ENTRY(bop_trap_handler)
	movq	%rsp, %rdi
	sub	$8, %rsp
	call	bop_trap
	SET_SIZE(bop_trap_handler)
	.globl	dtrace_user_probe

	ENTRY_NP(dtrace_trap)

	INTR_PUSH

	TRACE_PTR(%rdi, %rbx, %ebx, %rcx, $TT_TRAP)	/* Uses labels 8 and 9 */
	TRACE_REGS(%rdi, %rsp, %rbx, %rcx)		/* Uses label 9 */
	TRACE_STAMP(%rdi)		/* Clobbers %eax, %edx, uses 9 */

	movq	%rsp, %rbp

	movl	%gs:CPU_ID, %edx
	movq	%cr2, %rsi
	movq	%rsp, %rdi

	ENABLE_INTR_FLAGS

	call	dtrace_user_probe	/* dtrace_user_probe(rp, addr, cpuid) */
	jmp	_sys_rtt

	SET_SIZE(dtrace_trap)
/*
 * Return from _sys_trap routine.
 */
	ENTRY_NP(lwp_rtt_initial)
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp	/* switch to the thread stack */
	movq	%rsp, %rbp
	call	__dtrace_probe___proc_start
	jmp	_lwp_rtt

	ENTRY_NP(lwp_rtt)

	/*
	 * r14	lwp
	 * rdx	lwp->lwp_procp
	 * r15	curthread
	 */
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp	/* switch to the thread stack */
	movq	%rsp, %rbp
_lwp_rtt:
	call	__dtrace_probe___proc_lwp__start
	movq	%gs:CPU_LWP, %r14
	movq	LWP_PROCP(%r14), %rdx
	/*
	 * XX64	Is the stack misaligned correctly at this point?
	 *	If not, we need to do a push before calling anything ..
	 */

#if defined(DEBUG)
	/*
	 * If we were to run lwp_savectx at this point -without-
	 * pcb_rupdate being set to 1, we'd end up sampling the hardware
	 * state left by the previous running lwp, rather than setting
	 * the values requested by the lwp creator.  Bad.
	 */
	testb	$0x1, PCB_RUPDATE(%r14)
	jne	1f
	leaq	_no_pending_updates(%rip), %rdi
	movl	$__LINE__, %esi
	movq	%r14, %rdx
	xorl	%eax, %eax
	call	panic
_no_pending_updates:
	.string	"locore.s:%d lwp_rtt(lwp %p) but pcb_rupdate != 1"
1:
#endif
	/*
	 * If agent lwp, clear %fs and %gs
	 */
	cmpq	%r15, P_AGENTTP(%rdx)
	jne	1f
	xorl	%ecx, %ecx
	movq	%rcx, REGOFF_FS(%rsp)
	movq	%rcx, REGOFF_GS(%rsp)
	movw	%cx, LWP_PCB_FS(%r14)
	movw	%cx, LWP_PCB_GS(%r14)
1:
	call	dtrace_systrace_rtt
	movq	REGOFF_RDX(%rsp), %rsi
	movq	REGOFF_RAX(%rsp), %rdi
	call	post_syscall		/* post_syscall(rval1, rval2) */

	/*
	 * set up to take fault on first use of fp
	 */
	STTS(%rdi)
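	/*
	 * (STTS sets CR0.TS, so the lwp's first FP/SIMD instruction will
	 * raise #NM and FPU state can be restored lazily at that point.)
	 */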
	/*
	 * XXX - may want a fast path that avoids sys_rtt_common in the
	 * most common case.
	 */
	ALTENTRY(_sys_rtt)
	CLI(%rax)			/* disable interrupts */
	ALTENTRY(_sys_rtt_ints_disabled)
	movq	%rsp, %rdi		/* pass rp to sys_rtt_common */
	call	sys_rtt_common		/* do common sys_rtt tasks */
	testq	%rax, %rax		/* returning to userland? */
	jz	sr_sup

	/*
	 * Return to user
	 */
	ASSERT_UPCALL_MASK_IS_SET
	cmpw	$UCS_SEL, REGOFF_CS(%rsp)	/* test for native (64-bit) lwp? */
	je	sys_rtt_syscall
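	/*
	 * (A native 64-bit lwp returns to a %cs of UCS_SEL; a 32-bit lwp
	 * runs on a different, compatibility-mode code selector, so any
	 * other value falls through to the 32-bit return path below.)
	 */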
	/*
	 * Return to 32-bit userland
	 */
	ALTENTRY(sys_rtt_syscall32)
	USER32_POP
	IRET
	/*NOTREACHED*/

	ALTENTRY(sys_rtt_syscall)
	/*
	 * Return to 64-bit userland
	 */
	USER_POP
	ALTENTRY(nopop_sys_rtt_syscall)
	IRET
	/*NOTREACHED*/
	SET_SIZE(nopop_sys_rtt_syscall)
	/*
	 * Return to supervisor
	 * NOTE: to make the check in trap() that tests if we are executing
	 * segment register fixup/restore code work properly, sr_sup MUST be
	 * after _sys_rtt .
	 */
	ALTENTRY(sr_sup)
	/*
	 * Restore regs before doing iretq to kernel mode
	 */
	INTR_POP
	IRET
	.globl	_sys_rtt_end
_sys_rtt_end:
	/*NOTREACHED*/
	SET_SIZE(sr_sup)
	SET_SIZE(_sys_rtt_end)
	SET_SIZE(lwp_rtt)
	SET_SIZE(lwp_rtt_initial)
	SET_SIZE(_sys_rtt_ints_disabled)
	SET_SIZE(_sys_rtt)
	SET_SIZE(sys_rtt_syscall)
	SET_SIZE(sys_rtt_syscall32)
/*
 * So why do we have to deal with all this crud in the world of ia32?
 *
 * Basically there are four classes of ia32 implementations, those that do not
 * have a TSC, those that have a marginal TSC that is broken to the extent
 * that it is useless, those that have a marginal TSC that is not quite so
 * horribly broken and can be used with some care, and those that have a
 * reliable TSC.  This crud has to be here in order to sift through all the
 * variants.
 */

/*
 * XX64	quick and dirty port from the i386 version.  Since we
 *	believe the amd64 tsc is more reliable, could this code be
 *	simpler?
 */
	ENTRY_NP(freq_tsc)
	pushq	%rbp
	movq	%rsp, %rbp
	movq	%rdi, %r9	/* save pit_counter */
	pushq	%rbx
/ We have a TSC, but we have no way in general to know how reliable it is.
/ Usually a marginal TSC behaves appropriately unless not enough time
/ elapses between reads.  A reliable TSC can be read as often and as rapidly
/ as desired.  The simplistic approach of reading the TSC counter and
/ correlating to the PIT counter cannot be naively followed.  Instead estimates
/ have to be taken to successively refine a guess at the speed of the cpu
/ and then the TSC and PIT counter are correlated.  In practice very rarely
/ is more than one quick loop required for an estimate.  Measures have to be
/ taken to prevent the PIT counter from wrapping beyond its resolution and for
/ measuring the clock rate of very fast processors.
/ The following constant can be tuned.  It should be such that the loop does
/ not take too many nor too few PIT counts to execute.  If this value is too
/ large, then on slow machines the loop will take a long time, or the PIT
/ counter may even wrap.  If this value is too small, then on fast machines
/ the PIT counter may count so few ticks that the resolution of the PIT
/ itself causes a bad guess.  Because this code is used in machines with
/ marginal TSC's and/or IO, if this value is too small on those, it may
/ cause the calculated cpu frequency to vary slightly from boot to boot.
/
/ In all cases even if this constant is set inappropriately, the algorithm
/ will still work and the caller should be able to handle variances in the
/ calculation of cpu frequency, but the calculation will be inefficient and
/ take a disproportionate amount of time relative to a well selected value.
/ As the slowest supported cpu becomes faster, this constant should be
/ carefully increased.
	movl	$0x8000, %ecx

	/ to make sure the instruction cache has been warmed
	clc

	jmp	freq_tsc_loop
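/ Rough scale, assuming the standard 1.193182 MHz PIT input clock: one PIT
/ tick is ~838 ns, so the full 16-bit counter wraps after ~54.9 ms.  The
/ refinement step below steers the loop toward ~0xf000 (61440) PIT ticks,
/ i.e. a measurement interval of roughly 51.5 ms, safely inside one wrap.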
/ The following block of code up to and including the latching of the PIT
/ counter after freq_tsc_perf_loop is very critical and very carefully
/ written, it should only be modified with great care.  freq_tsc_loop to
/ freq_tsc_perf_loop fits exactly in 16 bytes as do the instructions in
/ freq_tsc_perf_loop up to the unlatching of the PIT counter.

	.align	32
freq_tsc_loop:
	/ save the loop count in %ebx
	movl	%ecx, %ebx

	/ initialize the PIT counter and start a count down
	movb	$PIT_LOADMODE, %al
	outb	$PITCTL_PORT
	movb	$0xff, %al
	outb	$PITCTR0_PORT
	outb	$PITCTR0_PORT

	/ read the TSC and store the TS in %edi:%esi
	rdtsc
	movl	%eax, %esi

freq_tsc_perf_loop:
	movl	%edx, %edi
	movl	%eax, %esi
	movl	%edx, %edi
	loop	freq_tsc_perf_loop
	/ read the TSC and store the LSW in %ecx
	rdtsc
	movl	%eax, %ecx

	/ latch the PIT counter and status
	movb	$_CONST(PIT_READBACK|PIT_READBACKC0), %al
	outb	$PITCTL_PORT

	/ remember if the icache has been warmed
	setc	%ah

	/ read the PIT status
	inb	$PITCTR0_PORT
	shll	$8, %eax

	/ read PIT count
	inb	$PITCTR0_PORT
	shll	$8, %eax
	inb	$PITCTR0_PORT
	bswap	%eax
	/ check to see if the PIT count was loaded into the CE
	btw	$_CONST(PITSTAT_NULLCNT+8), %ax
	jc	freq_tsc_increase_count

	/ check to see if PIT counter wrapped
	btw	$_CONST(PITSTAT_OUTPUT+8), %ax
	jnc	freq_tsc_pit_did_not_wrap
	/ halve count
	shrl	$1, %ebx
	movl	%ebx, %ecx

	/ the instruction cache has been warmed
	stc

	jmp	freq_tsc_loop
freq_tsc_increase_count:
	shll	$1, %ebx
	jc	freq_tsc_too_fast

	movl	%ebx, %ecx

	/ the instruction cache has been warmed
	stc

	jmp	freq_tsc_loop
freq_tsc_pit_did_not_wrap:
	roll	$16, %eax

	cmpw	$0x2000, %ax
	notw	%ax
	jb	freq_tsc_sufficient_duration
freq_tsc_calculate:
	/ in mode 0, the PIT loads the count into the CE on the first CLK pulse,
	/ then on the second CLK pulse the CE is decremented, therefore mode 0
	/ is really a (count + 1) counter, ugh
	xorl	%esi, %esi
	movw	%ax, %si
	incl	%esi

	movl	$0xf000, %eax
	mull	%ebx

	/ tuck away (target_pit_count * loop_count)
	movl	%edx, %ecx
	movl	%eax, %ebx

	movl	%esi, %eax
	movl	$0xffffffff, %edx
	mull	%edx

	addl	%esi, %eax
	adcl	$0, %edx

	cmpl	%ecx, %edx
	ja	freq_tsc_div_safe
	jb	freq_tsc_too_fast

	cmpl	%ebx, %eax
	jbe	freq_tsc_too_fast
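/ A sketch of the arithmetic being guarded: the refined loop count is
/ (0xf000 * loop_count) / actual_pit_count, a 64-bit by 32-bit divide.
/ divl raises #DE if the quotient overflows 32 bits, so the comparisons
/ above first establish that actual_pit_count * 2^32 (formed as
/ pit_count * 0xffffffff + pit_count) exceeds the 64-bit product.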
freq_tsc_div_safe:
	movl	%ecx, %edx
	movl	%ebx, %eax

	movl	%esi, %ecx
	divl	%ecx

	movl	%eax, %ecx

	/ the instruction cache has been warmed
	stc

	jmp	freq_tsc_loop
freq_tsc_sufficient_duration:
	/ test to see if the icache has been warmed
	btl	$16, %eax
	jnc	freq_tsc_calculate

	/ recall mode 0 is a (count + 1) counter
	andl	$0xffff, %eax
	incl	%eax

	/ save the number of PIT counts
	movl	%eax, (%r9)

	/ calculate the number of TS's that elapsed
	movl	%ecx, %eax
	subl	%esi, %eax
	sbbl	%edi, %edx

	jmp	freq_tsc_end
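/ (The subtraction above forms the 64-bit delta end - start in %edx:%eax;
/ the inb/shll/bswap sequence after the final rdtsc clobbered only %eax,
/ so %edx still holds the high word of the ending timestamp.)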
freq_tsc_too_fast:
	/ return 0 as a 64 bit quantity
	xorl	%eax, %eax
	xorl	%edx, %edx

freq_tsc_end:
	shlq	$32, %rdx
	orq	%rdx, %rax
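/ (Return convention: the 64-bit TSC delta, or 0 if the cpu was deemed too
/ fast to measure this way, comes back in %rax; the matching PIT count was
/ already stored through the pit_counter pointer saved in %r9 at entry.)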
	popq	%rbx
	leaveq
	ret
	SET_SIZE(freq_tsc)