4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
26 * Copyright (c) 2016, Joyent, Inc. All rights reserved.
29 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
30 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
31 /* All Rights Reserved */
33 /* Copyright (c) 1987, 1988 Microsoft Corporation */
34 /* All Rights Reserved */
37 #include <sys/asm_linkage.h>
38 #include <sys/asm_misc.h>
39 #include <sys/regset.h>
40 #include <sys/privregs.h>
42 #include <sys/reboot.h>
43 #include <sys/x86_archext.h>
44 #include <sys/machparam.h>
47 #include <sys/segments.h>
50 #include <sys/ftrace.h>
51 #include <sys/traptrace.h>
52 #include <sys/clock.h>
53 #include <sys/cmn_err.h>
55 #include <sys/panic.h>
61 #error "unsupported architecture"
66 * - We are running in protected-paged mode.
67 * - Interrupts are disabled.
68 * - The GDT and IDT are the callers; we need our copies.
69 * - The kernel's text, initialized data and bss are mapped.
73 * - Initialize our stack pointer to the thread 0 stack (t0stack)
74 * and leave room for a phony "struct regs".
75 * - Our GDT and IDT need to get munged.
76 * - Since we are using the boot's GDT descriptors, we need
77 * to copy them into our GDT before we switch to ours.
78 * - We start using our GDT by loading correct values in the
79 * selector registers (cs=KCS_SEL, ds=es=ss=KDS_SEL, fs=KFS_SEL,
81 * - The default LDT entry for syscall is set.
82 * - We load the default LDT into the hardware LDT register.
83 * - We load the default TSS into the hardware task register.
84 * - mlsetup(%esp) gets called.
85 * - We change our appearance to look like the real thread 0.
86 * (NOTE: making ourselves to be a real thread may be a noop)
87 * - main() gets called. (NOTE: main() never returns).
92 * The very first thing in the kernel's text segment must be a jump
93 * to the os/fakebop.c startup code.
111 * call back into boot - sysp (bootsvcs.h) and bootops (bootconf.h)
117 * NOTE: t0stack should be the first thing in the data section so that
118 * if it ever overflows, it will fault on the last kernel text page.
121 .comm t0stack, DEFAULTSTKSZ, 32
126 * kobj_init() vectors us back to here with (note) a slightly different
127 * set of arguments than _start is given.
129 * XXX Make this less vile, please.
131 ENTRY_NP
(_locore_start
)
134 * %rdi = boot services (should die someday)
139 leaq edata
(%rip
), %rbp
/* reference edata for ksyms */
140 movq $
0, (%rbp
) /* limit stack back trace */
143 * Initialize our stack pointer to the thread 0 stack (t0stack)
144 * and leave room for a "struct regs" for lwp0. Note that the
145 * stack doesn't actually align to a 16-byte boundary until just
146 * before we call mlsetup because we want to use %rsp to point at
147 * our regs structure.
149 leaq t0stack
(%rip
), %rsp
150 addq $_CONST
(DEFAULTSTKSZ
- REGSIZE
), %rsp
151 #if (REGSIZE & 15) == 0
155 * Save call back for special x86 boot services vector
157 movq
%rdi
, sysp
(%rip
)
159 movq
%rdx
, bootops
(%rip
) /* save bootops */
160 movq $bootops
, bootopsp
(%rip
)
163 * Save arguments and flags, if only for debugging ..
165 movq
%rdi
, REGOFF_RDI
(%rsp
)
166 movq
%rsi
, REGOFF_RSI
(%rsp
)
167 movq
%rdx
, REGOFF_RDX
(%rsp
)
168 movq
%rcx
, REGOFF_RCX
(%rsp
)
169 movq
%r8, REGOFF_R8
(%rsp
)
170 movq
%r9, REGOFF_R9
(%rsp
)
173 movq
%r11, REGOFF_RFL
(%rsp
)
176 * Enable write protect and alignment check faults.
179 orq $_CONST
(CR0_WP|CR0_AM
), %rax
180 andq $_BITNOT
(CR0_WT|CR0_CE
), %rax
184 * (We just assert this works by virtue of being here)
186 bts $X86FSET_CPUID
, x86_featureset
(%rip
)
189 * mlsetup() gets called with a struct regs as argument, while
190 * main takes no args and should never return.
195 /* (stack pointer now aligned on 16-byte boundary right here) */
200 leaq __return_from_main
(%rip
), %rdi
203 SET_SIZE
(_locore_start
)
207 .string "main() returned"
211 * For stack layout, see privregs.h
212 * When cmntrap gets called, the error code and trap number have been pushed.
213 * When cmntrap_pushed gets called, the entire struct regs has been pushed.
215 .globl trap /* C handler called below */
217 ENTRY_NP2
(cmntrap
, _cmntrap
)
221 ALTENTRY
(cmntrap_pushed
)
226 * - if this is a #pf i.e. T_PGFLT, %r15 is live
227 * and contains the faulting address i.e. a copy of %cr2
229 * - if this is a #db i.e. T_SGLSTP, %r15 is live
230 * and contains the value of %db6
233 TRACE_PTR
(%rdi
, %rbx
, %ebx
, %rcx
, $TT_TRAP
) /* Uses labels 8 and 9 */
234 TRACE_REGS
(%rdi
, %rsp
, %rbx
, %rcx
) /* Uses label 9 */
235 TRACE_STAMP
(%rdi
) /* Clobbers %eax, %edx, uses 9 */
238 * We must first check if DTrace has set its NOFAULT bit. This
239 * regrettably must happen before the trap stack is recorded, because
240 * this requires a call to getpcstack() and may induce recursion if an
241 * fbt::getpcstack: enabling is inducing the bad load.
243 movl
%gs
:CPU_ID
, %eax
244 shlq $CPU_CORE_SHIFT
, %rax
245 leaq cpu_core
(%rip
), %r8
247 movw CPUC_DTRACE_FLAGS
(%rax
), %cx
248 testw $CPU_DTRACE_NOFAULT
, %cx
255 movl
%gs
:CPU_ID
, %edx
258 * We know that this isn't a DTrace non-faulting load; we can now safely
259 * reenable interrupts. (In the case of pagefaults, we enter through an
264 call
trap /* trap(rp, addr, cpuid) handles all traps */
268 cmpw $KCS_SEL
, REGOFF_CS
(%rbp
) /* test CS for user-mode trap */
269 jne
3f
/* if from user, panic */
271 cmpl $T_PGFLT
, REGOFF_TRAPNO
(%rbp
)
274 cmpl $T_GPFLT
, REGOFF_TRAPNO
(%rbp
)
277 cmpl $T_ILLINST
, REGOFF_TRAPNO
(%rbp
)
280 cmpl $T_ZERODIV
, REGOFF_TRAPNO
(%rbp
)
281 jne
4f
/* if not PF/GP/UD/DE, panic */
283 orw $CPU_DTRACE_DIVZERO
, %cx
284 movw
%cx
, CPUC_DTRACE_FLAGS
(%rax
)
288 * If we've taken a GPF, we don't (unfortunately) have the address that
289 * induced the fault. So instead of setting the fault to BADADDR,
290 * we'll set the fault to ILLOP.
293 orw $CPU_DTRACE_ILLOP
, %cx
294 movw
%cx
, CPUC_DTRACE_FLAGS
(%rax
)
297 orw $CPU_DTRACE_BADADDR
, %cx
298 movw
%cx
, CPUC_DTRACE_FLAGS
(%rax
) /* set fault to bad addr */
299 movq
%r15, CPUC_DTRACE_ILLVAL
(%rax
)
300 /* fault addr is illegal value */
302 movq REGOFF_RIP
(%rbp
), %rdi
304 call dtrace_instr_size
306 movq
%r12, REGOFF_RIP
(%rbp
)
311 leaq dtrace_badflags
(%rip
), %rdi
315 leaq dtrace_badtrap
(%rip
), %rdi
322 * Declare a uintptr_t which has the size of _cmntrap to enable stack
323 * traceback code to know when a regs structure is on the stack.
329 .type _cmntrap_size, @object
332 .string "bad DTrace flags"
335 .string "bad DTrace trap"
339 .globl trap /* C handler called below */
344 INTGATE_INIT_KERNEL_FLAGS
346 TRACE_PTR
(%rdi
, %rbx
, %ebx
, %rcx
, $TT_TRAP
) /* Uses labels 8 and 9 */
347 TRACE_REGS
(%rdi
, %rsp
, %rbx
, %rcx
) /* Uses label 9 */
348 TRACE_STAMP
(%rdi
) /* Clobbers %eax, %edx, uses 9 */
352 movl
%gs
:CPU_ID
, %edx
355 call
trap /* trap(rp, addr, cpuid) handles all traps */
360 * Handle traps early in boot. Just revectors into C quickly as
361 * these are always fatal errors.
363 * Adjust %rsp to get same stack layout as in 32bit mode for bop_trap().
365 ENTRY
(bop_trap_handler
)
369 SET_SIZE
(bop_trap_handler
)
372 .globl dtrace_user_probe
374 ENTRY_NP
(dtrace_trap
)
378 TRACE_PTR
(%rdi
, %rbx
, %ebx
, %rcx
, $TT_TRAP
) /* Uses labels 8 and 9 */
379 TRACE_REGS
(%rdi
, %rsp
, %rbx
, %rcx
) /* Uses label 9 */
380 TRACE_STAMP
(%rdi
) /* Clobbers %eax, %edx, uses 9 */
384 movl
%gs
:CPU_ID
, %edx
390 call dtrace_user_probe
/* dtrace_user_probe(rp, addr, cpuid) */
393 SET_SIZE
(dtrace_trap
)
397 * Return from _sys_trap routine.
400 ENTRY_NP
(lwp_rtt_initial
)
401 movq
%gs
:CPU_THREAD
, %r15
402 movq T_STACK
(%r15), %rsp
/* switch to the thread stack */
404 call __dtrace_probe___proc_start
415 movq
%gs
:CPU_THREAD
, %r15
416 movq T_STACK
(%r15), %rsp
/* switch to the thread stack */
419 call __dtrace_probe___proc_lwp__start
420 movq
%gs
:CPU_LWP
, %r14
421 movq LWP_PROCP
(%r14), %rdx
424 * XX64 Is the stack misaligned correctly at this point?
425 * If not, we need to do a push before calling anything ..
430 * If we were to run lwp_savectx at this point -without-
431 * pcb_rupdate being set to 1, we'd end up sampling the hardware
432 * state left by the previous running lwp, rather than setting
433 * the values requested by the lwp creator. Bad.
435 testb $
0x1, PCB_RUPDATE
(%r14)
437 leaq _no_pending_updates
(%rip
), %rdi
443 .string "locore.s:%d lwp_rtt(lwp %p) but pcb_rupdate != 1"
448 * If agent lwp, clear %fs and %gs
450 cmpq
%r15, P_AGENTTP
(%rdx
)
453 movq
%rcx
, REGOFF_FS
(%rsp
)
454 movq
%rcx
, REGOFF_GS
(%rsp
)
455 movw
%cx
, LWP_PCB_FS
(%r14)
456 movw
%cx
, LWP_PCB_GS
(%r14)
458 call dtrace_systrace_rtt
459 movq REGOFF_RDX
(%rsp
), %rsi
460 movq REGOFF_RAX
(%rsp
), %rdi
461 call post_syscall
/* post_syscall(rval1, rval2) */
464 * set up to take fault on first use of fp
469 * XXX - may want a fast path that avoids sys_rtt_common in the
473 CLI(%rax
) /* disable interrupts */
474 ALTENTRY
(_sys_rtt_ints_disabled
)
475 movq
%rsp
, %rdi
/* pass rp to sys_rtt_common */
476 call sys_rtt_common
/* do common sys_rtt tasks */
477 testq
%rax
, %rax
/* returning to userland? */
483 ASSERT_UPCALL_MASK_IS_SET
484 cmpw $UCS_SEL
, REGOFF_CS
(%rsp
) /* test for native (64-bit) lwp? */
488 * Return to 32-bit userland
490 ALTENTRY
(sys_rtt_syscall32
)
495 ALTENTRY
(sys_rtt_syscall
)
497 * Return to 64-bit userland
500 ALTENTRY
(nopop_sys_rtt_syscall
)
503 SET_SIZE
(nopop_sys_rtt_syscall
)
506 * Return to supervisor
507 * NOTE: to make the check in trap() that tests if we are executing
508 * segment register fixup/restore code work properly, sr_sup MUST be
513 * Restore regs before doing iretq to kernel mode
521 SET_SIZE
(_sys_rtt_end
)
523 SET_SIZE
(lwp_rtt_initial
)
524 SET_SIZE
(_sys_rtt_ints_disabled
)
526 SET_SIZE
(sys_rtt_syscall
)
527 SET_SIZE
(sys_rtt_syscall32
)
531 * So why do we have to deal with all this crud in the world of ia32?
533 * Basically there are four classes of ia32 implementations, those that do not
534 * have a TSC, those that have a marginal TSC that is broken to the extent
535 * that it is useless, those that have a marginal TSC that is not quite so
536 * horribly broken and can be used with some care, and those that have a
537 * reliable TSC. This crud has to be here in order to sift through all the
542 * XX64 quick and dirty port from the i386 version. Since we
543 * believe the amd64 tsc is more reliable, could this code be
549 movq
%rdi
, %r9 /* save pit_counter */
/ We have a TSC, but we have no way in general to know how reliable it is.
/ Usually a marginal TSC behaves appropriately unless not enough time
/ elapses between reads. A reliable TSC can be read as often and as rapidly
/ as desired. The simplistic approach of reading the TSC counter and
/ correlating to the PIT counter cannot be naively followed. Instead estimates
/ have to be taken to successively refine a guess at the speed of the cpu
/ and then the TSC and PIT counter are correlated. In practice very rarely
/ is more than one quick loop required for an estimate. Measures have to be
/ taken to prevent the PIT counter from wrapping beyond its resolution and for
/ measuring the clock rate of very fast processors.
/ The following constant can be tuned. It should be such that the loop does
/ not take too many nor too few PIT counts to execute. If this value is too
/ large, then on slow machines the loop will take a long time, or the PIT
/ counter may even wrap. If this value is too small, then on fast machines
/ the PIT counter may count so few ticks that the resolution of the PIT
/ itself causes a bad guess. Because this code is used in machines with
/ marginal TSC's and/or IO, if this value is too small on those, it may
/ cause the calculated cpu frequency to vary slightly from boot to boot.
572 / In all cases even if this constant is set inappropriately, the algorithm
573 / will still work and the caller should be able to handle variances in the
574 / calculation of cpu frequency, but the calculation will be inefficient and
575 / take a disproportionate amount of time relative to a well selected value.
576 / As the slowest supported cpu becomes faster, this constant should be
577 / carefully increased.
581 / to make sure the instruction cache has been warmed
586 / The following block of code up to and including the latching of the PIT
587 / counter after freq_tsc_perf_loop is very critical and very carefully
588 / written, it should only be modified with great care. freq_tsc_loop to
589 / freq_tsc_perf_loop fits exactly in 16 bytes as do the instructions in
590 / freq_tsc_perf_loop up to the unlatching of the PIT counter.
594 / save the loop count in %ebx
597 / initialize the PIT counter and start a count down
598 movb $PIT_LOADMODE, %al
604 / read the TSC and store the TS in %edi:%esi
612 loop freq_tsc_perf_loop
614 / read the TSC and store the LSW in %ecx
618 / latch the PIT counter and status
619 movb $_CONST(PIT_READBACK|PIT_READBACKC0), %al
622 / remember if the icache has been warmed
625 / read the PIT status
635 / check to see if the PIT count was loaded into the CE
636 btw $_CONST(PITSTAT_NULLCNT+8), %ax
637 jc freq_tsc_increase_count
639 / check to see if PIT counter wrapped
640 btw $_CONST(PITSTAT_OUTPUT+8), %ax
641 jnc freq_tsc_pit_did_not_wrap
647 / the instruction cache has been warmed
652 freq_tsc_increase_count:
658 / the instruction cache has been warmed
663 freq_tsc_pit_did_not_wrap:
668 jb freq_tsc_sufficient_duration
671 / in mode 0, the PIT loads the count into the CE on the first CLK pulse,
672 / then on the second CLK pulse the CE is decremented, therefore mode 0
673 / is really a (count + 1) counter, ugh
681 / tuck away (target_pit_count * loop_count)
686 movl $0xffffffff, %edx
697 jbe freq_tsc_too_fast
708 / the instruction cache has been warmed
713 freq_tsc_sufficient_duration:
714 / test to see if the icache has been warmed
716 jnc freq_tsc_calculate
718 / recall mode 0 is a (count + 1) counter
722 / save the number of PIT counts
725 / calculate the number of TS's that elapsed
/ return 0 as a 64 bit quantity