/*
 * linux/arch/x86_64/entry.S
 *
 * Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
 * Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * NOTE: This code handles signal recognition, which happens every time
 * after an interrupt and after each system call.
 *
 * Normal syscalls and interrupts don't save a full stack frame; this is
 * only done for syscall tracing, signals or fork/exec et al.
 *
 * A note on terminology:
 * - top of stack: Architecture-defined interrupt frame from SS to RIP
 *   at the top of the kernel process stack.
 * - partial stack frame: partially saved registers up to R11.
 * - full stack frame: Like partial stack frame, but all registers saved.
 *
 * Some macro usage:
 * - CFI macros are used to generate dwarf2 unwind information for better
 *   backtraces. They don't change any code.
 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
 *   There are unfortunately lots of special cases where some registers
 *   are not touched. The macro is a big mess that should be cleaned up.
 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
 *   Gives a full stack frame.
 * - ENTRY/END - Define functions in the symbol table.
 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
 *   frame that is otherwise undefined after a SYSCALL.
 * - TRACE_IRQS_* - Trace hard interrupt state for lock debugging.
 * - errorentry/paranoidentry/zeroentry - Define exception entry points.
 */
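
/*
 * For reference, the "top of stack" mentioned above is the hardware
 * interrupt frame the CPU pushes, annotated here with the pt_regs
 * field names used throughout this file (a sketch; the authoritative
 * offsets come from asm-offsets):
 *
 *	SS	saved stack segment		<- highest address
 *	RSP	saved user stack pointer
 *	EFLAGS	saved rflags
 *	CS	saved code segment
 *	RIP	saved instruction pointer	<- "top of stack"
 */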
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/dwarf2.h>
#include <asm/calling.h>
#include <asm/asm-offsets.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/ftrace.h>

/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
#include <linux/elf-em.h>
#define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#define __AUDIT_ARCH_64BIT	0x80000000
#define __AUDIT_ARCH_LE		0x40000000
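
/*
 * EM_X86_64 is 62 (0x3e), so AUDIT_ARCH_X86_64 above works out to
 * 0x3e|0x80000000|0x40000000 = 0xc000003e: 64-bit, little-endian
 * x86-64 as the audit subsystem sees it.
 */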
#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
ENTRY(ftrace_caller)
	cmpl $0, function_trace_stop
	jne  ftrace_stub

	/* taken from glibc */
	subq $MCOUNT_INSN_SIZE, %rdi

.globl ftrace_stub
ftrace_stub:
	retq
END(ftrace_caller)

#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(mcount)
	cmpl $0, function_trace_stop
	jne  ftrace_stub

	cmpq $ftrace_stub, ftrace_trace_function
	jnz  trace

.globl ftrace_stub
ftrace_stub:
	retq

trace:
	/* taken from glibc */
	movq 0x38(%rsp), %rdi
	subq $MCOUNT_INSN_SIZE, %rdi

	call   *ftrace_trace_function

	jmp ftrace_stub
END(mcount)
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* CONFIG_FUNCTION_TRACER */
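
/*
 * Background for the MCOUNT_INSN_SIZE adjustments above: with
 * CONFIG_FUNCTION_TRACER the compiler emits a "call mcount" at the
 * start of every traced function, so subtracting MCOUNT_INSN_SIZE
 * from the return address recovers the address of that call site.
 */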
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
#endif

#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
	swapgs
	sysretq
#endif /* CONFIG_PARAVIRT */


	.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
#ifdef CONFIG_TRACE_IRQFLAGS
	bt   $9,EFLAGS-\offset(%rsp)	/* interrupts off? */
	jnc  1f
	TRACE_IRQS_ON
1:
#endif
	.endm
/*
 * C code is not supposed to know about undefined top of stack. Every time
 * a C function with a pt_regs argument is called from the SYSCALL based
 * fast path FIXUP_TOP_OF_STACK is needed.
 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
 * manipulation.
 */

	/* %rsp:at FRAMEEND */
	.macro FIXUP_TOP_OF_STACK tmp
	movq	%gs:pda_oldrsp,\tmp
	movq	\tmp,RSP(%rsp)
	movq	$__USER_DS,SS(%rsp)
	movq	$__USER_CS,CS(%rsp)
	movq	R11(%rsp),\tmp	/* get eflags */
	movq	\tmp,EFLAGS(%rsp)
	.endm

	.macro RESTORE_TOP_OF_STACK tmp,offset=0
	movq	RSP-\offset(%rsp),\tmp
	movq	\tmp,%gs:pda_oldrsp
	movq	EFLAGS-\offset(%rsp),\tmp
	movq	\tmp,R11-\offset(%rsp)
	.endm
	.macro FAKE_STACK_FRAME child_rip
	/* push in order ss, rsp, eflags, cs, rip */
	xorl %eax, %eax
	pushq $__KERNEL_DS /* ss */
	CFI_ADJUST_CFA_OFFSET	8
	/*CFI_REL_OFFSET	ss,0*/
	pushq %rax /* rsp */
	CFI_ADJUST_CFA_OFFSET	8
	CFI_REL_OFFSET	rsp,0
	pushq $(1<<9) /* eflags - interrupts on */
	CFI_ADJUST_CFA_OFFSET	8
	/*CFI_REL_OFFSET	rflags,0*/
	pushq $__KERNEL_CS /* cs */
	CFI_ADJUST_CFA_OFFSET	8
	/*CFI_REL_OFFSET	cs,0*/
	pushq \child_rip /* rip */
	CFI_ADJUST_CFA_OFFSET	8
	CFI_REL_OFFSET	rip,0
	pushq %rax /* orig rax */
	CFI_ADJUST_CFA_OFFSET	8
	.endm

	.macro UNFAKE_STACK_FRAME
	addq $8*6, %rsp
	CFI_ADJUST_CFA_OFFSET	-(6*8)
	.endm
	.macro	CFI_DEFAULT_STACK start=1
	.if \start
	CFI_STARTPROC	simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA	rsp,SS+8
	.else
	CFI_DEF_CFA_OFFSET SS+8
	.endif
	CFI_REL_OFFSET	r15,R15
	CFI_REL_OFFSET	r14,R14
	CFI_REL_OFFSET	r13,R13
	CFI_REL_OFFSET	r12,R12
	CFI_REL_OFFSET	rbp,RBP
	CFI_REL_OFFSET	rbx,RBX
	CFI_REL_OFFSET	r11,R11
	CFI_REL_OFFSET	r10,R10
	CFI_REL_OFFSET	r9,R9
	CFI_REL_OFFSET	r8,R8
	CFI_REL_OFFSET	rax,RAX
	CFI_REL_OFFSET	rcx,RCX
	CFI_REL_OFFSET	rdx,RDX
	CFI_REL_OFFSET	rsi,RSI
	CFI_REL_OFFSET	rdi,RDI
	CFI_REL_OFFSET	rip,RIP
	/*CFI_REL_OFFSET	cs,CS*/
	/*CFI_REL_OFFSET	rflags,EFLAGS*/
	CFI_REL_OFFSET	rsp,RSP
	/*CFI_REL_OFFSET	ss,SS*/
	.endm
/*
 * A newly forked process directly context switches into this.
 */
/* rdi:	prev */
ENTRY(ret_from_fork)
	CFI_DEFAULT_STACK
	push kernel_eflags(%rip)
	CFI_ADJUST_CFA_OFFSET 8
	popf				# reset kernel eflags
	CFI_ADJUST_CFA_OFFSET -8
	call schedule_tail
	GET_THREAD_INFO(%rcx)
	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
	jnz rff_trace
rff_action:
	RESTORE_REST
	testl $3,CS-ARGOFFSET(%rsp)	# from kernel_thread?
	je   int_ret_from_sys_call
	testl $_TIF_IA32,TI_flags(%rcx)
	jnz  int_ret_from_sys_call
	RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
	jmp ret_from_sys_call
rff_trace:
	movq %rsp,%rdi
	call syscall_trace_leave
	GET_THREAD_INFO(%rcx)
	jmp rff_action
	CFI_ENDPROC
END(ret_from_fork)
/*
 * System call entry. Up to 6 arguments in registers are supported.
 *
 * SYSCALL does not save anything on the stack and does not change the
 * stack pointer.
 *
 * Register setup:
 * rax  system call number
 * rdi  arg0
 * rcx  return address for syscall/sysret, C arg3
 * rsi  arg1
 * rdx  arg2
 * r10  arg3	(--> moved to rcx for C)
 * r8   arg4
 * r9   arg5
 * r11  eflags for syscall/sysret, temporary for C
 * r12-r15,rbp,rbx saved by C code, not touched.
 *
 * Interrupts are off on entry.
 * Only called from user space.
 *
 * XXX	if we had a free scratch register we could save the RSP into the stack frame
 *	and report it properly in ps. Unfortunately we don't have one.
 *
 * When the user can change the frame, always force IRET: it deals with
 * non-canonical addresses better. SYSRET has trouble with them due to
 * bugs in both AMD and Intel CPUs.
 */
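
/*
 * For illustration, a user-space sketch (not part of this file) of how
 * a 3-argument call such as write(fd, buf, count) arrives here:
 *
 *	movq $__NR_write,%rax	# system call number (1 on x86-64)
 *	movq $1,%rdi		# arg0: fd
 *	leaq msg(%rip),%rsi	# arg1: buf
 *	movq $len,%rdx		# arg2: count
 *	syscall			# rcx <- return RIP, r11 <- rflags
 */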
ENTRY(system_call)
	CFI_STARTPROC	simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA	rsp,PDA_STACKOFFSET
	CFI_REGISTER	rip,rcx
	/*CFI_REGISTER	rflags,r11*/
	SWAPGS_UNSAFE_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
	 * for the guest and jump here on syscall.
	 */
ENTRY(system_call_after_swapgs)

	movq	%rsp,%gs:pda_oldrsp
	movq	%gs:pda_kernelstack,%rsp
	/*
	 * No need to follow this irqs off/on section - it's straight
	 * and short:
	 */
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_ARGS 8,1
	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
	movq  %rcx,RIP-ARGOFFSET(%rsp)
	CFI_REL_OFFSET rip,RIP-ARGOFFSET
	GET_THREAD_INFO(%rcx)
	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
	jnz tracesys
system_call_fastpath:
	cmpq $__NR_syscall_max,%rax
	ja badsys
	movq %r10,%rcx
	call *sys_call_table(,%rax,8)	# XXX:	rip relative
	movq %rax,RAX-ARGOFFSET(%rsp)
/*
 * Syscall return path ending with SYSRET (fast path).
 * Has incomplete stack frame and undefined top of stack.
 */
ret_from_sys_call:
	movl $_TIF_ALLWORK_MASK,%edi
	/* edi:	flagmask */
sysret_check:
	LOCKDEP_SYS_EXIT
	GET_THREAD_INFO(%rcx)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	movl TI_flags(%rcx),%edx
	andl %edi,%edx
	jnz  sysret_careful
	CFI_REMEMBER_STATE
	/*
	 * sysretq will re-enable interrupts:
	 */
	TRACE_IRQS_ON
	movq RIP-ARGOFFSET(%rsp),%rcx
	CFI_REGISTER	rip,rcx
	RESTORE_ARGS 0,-ARG_SKIP,1
	/*CFI_REGISTER	rflags,r11*/
	movq	%gs:pda_oldrsp, %rsp
	USERGS_SYSRET64
	/* Handle reschedules */
	/* edx:	work, edi: workmask */
sysret_careful:
	CFI_RESTORE_STATE
	bt $TIF_NEED_RESCHED,%edx
	jnc sysret_signal
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq %rdi
	CFI_ADJUST_CFA_OFFSET 8
	call schedule
	popq  %rdi
	CFI_ADJUST_CFA_OFFSET -8
	jmp sysret_check

	/* Handle a signal */
sysret_signal:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
#ifdef CONFIG_AUDITSYSCALL
	bt $TIF_SYSCALL_AUDIT,%edx
	jc sysret_audit
#endif
	/* edx:	work flags (arg3) */
	leaq do_notify_resume(%rip),%rax
	leaq -ARGOFFSET(%rsp),%rdi	# &pt_regs -> arg1
	xorl %esi,%esi			# oldset -> arg2
	call ptregscall_common
	movl $_TIF_WORK_MASK,%edi
	/* Use IRET because user could have changed frame. This
	   works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp int_with_check

badsys:
	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
	jmp ret_from_sys_call
#ifdef CONFIG_AUDITSYSCALL
	/*
	 * Fast path for syscall audit without full syscall trace.
	 * We just call audit_syscall_entry() directly, and then
	 * jump back to the normal fast path.
	 */
auditsys:
	movq %r10,%r9			/* 6th arg: 4th syscall arg */
	movq %rdx,%r8			/* 5th arg: 3rd syscall arg */
	movq %rsi,%rcx			/* 4th arg: 2nd syscall arg */
	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
	movq %rax,%rsi			/* 2nd arg: syscall number */
	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
	call audit_syscall_entry
	LOAD_ARGS 0			/* reload call-clobbered registers */
	jmp system_call_fastpath
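
	/*
	 * The register shuffle above matches audit_syscall_entry()'s C
	 * prototype, roughly (six arguments: the audit arch, the syscall
	 * number, then the first four syscall arguments):
	 *
	 *	void audit_syscall_entry(int arch, int major,
	 *				 unsigned long a1, unsigned long a2,
	 *				 unsigned long a3, unsigned long a4);
	 */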
	/*
	 * Return fast path for syscall audit. Call audit_syscall_exit()
	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
	 * masked off.
	 */
sysret_audit:
	movq %rax,%rsi		/* second arg: syscall return value */
	cmpq $0,%rax		/* is it < 0? */
	setl %al		/* 1 if so, 0 if not */
	movzbl %al,%edi		/* zero-extend that into %edi */
	inc %edi		/* first arg: 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
	call audit_syscall_exit
	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
	jmp sysret_check
#endif /* CONFIG_AUDITSYSCALL */
	/* Do syscall tracing */
tracesys:
#ifdef CONFIG_AUDITSYSCALL
	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
	jz auditsys
#endif
	SAVE_REST
	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
	FIXUP_TOP_OF_STACK %rdi
	movq %rsp,%rdi
	call syscall_trace_enter
	/*
	 * Reload arg registers from stack in case ptrace changed them.
	 * We don't reload %rax because syscall_trace_enter() returned
	 * the value it wants us to use in the table lookup.
	 */
	LOAD_ARGS ARGOFFSET, 1
	RESTORE_REST
	cmpq $__NR_syscall_max,%rax
	ja   int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
	movq %r10,%rcx			/* fixup for C */
	call *sys_call_table(,%rax,8)
	movq %rax,RAX-ARGOFFSET(%rsp)
	/* Use IRET because user could have changed frame */
/*
 * Syscall return path ending with IRET.
 * Has correct top of stack, but partial stack frame.
 */
	.globl int_ret_from_sys_call
	.globl int_with_check
int_ret_from_sys_call:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl $3,CS-ARGOFFSET(%rsp)
	je retint_restore_args
	movl $_TIF_ALLWORK_MASK,%edi
	/* edi:	mask to check */
int_with_check:
	LOCKDEP_SYS_EXIT_IRQ
	GET_THREAD_INFO(%rcx)
	movl TI_flags(%rcx),%edx
	andl %edi,%edx
	jnz  int_careful
	andl $~TS_COMPAT,TI_status(%rcx)
	jmp  retint_swapgs

	/* Either reschedule or signal or syscall exit tracking needed. */
	/* First do a reschedule test. */
	/* edx:	work, edi: workmask */
int_careful:
	bt $TIF_NEED_RESCHED,%edx
	jnc  int_very_careful
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq %rdi
	CFI_ADJUST_CFA_OFFSET 8
	call schedule
	popq %rdi
	CFI_ADJUST_CFA_OFFSET -8
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp int_with_check

	/* handle signals and tracing -- both require a full stack frame */
int_very_careful:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_REST
	/* Check for syscall exit trace */
	testl $_TIF_WORK_SYSCALL_EXIT,%edx
	jz int_signal
	pushq %rdi
	CFI_ADJUST_CFA_OFFSET 8
	leaq 8(%rsp),%rdi	# &ptregs -> arg1
	call syscall_trace_leave
	popq %rdi
	CFI_ADJUST_CFA_OFFSET -8
	andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
	jmp int_restore_rest

int_signal:
	testl $_TIF_DO_NOTIFY_MASK,%edx
	jz 1f
	movq %rsp,%rdi		# &ptregs -> arg1
	xorl %esi,%esi		# oldset -> arg2
	call do_notify_resume
1:	movl $_TIF_WORK_MASK,%edi
int_restore_rest:
	RESTORE_REST
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp int_with_check
	CFI_ENDPROC
END(system_call)
/*
 * Certain special system calls need to save a complete full stack frame.
 */
	.macro PTREGSCALL label,func,arg
	.globl \label
\label:
	leaq	\func(%rip),%rax
	leaq    -ARGOFFSET+8(%rsp),\arg	/* 8 for return address */
	jmp	ptregscall_common
	.endm

	CFI_STARTPROC

	PTREGSCALL stub_clone, sys_clone, %r8
	PTREGSCALL stub_fork, sys_fork, %rdi
	PTREGSCALL stub_vfork, sys_vfork, %rdi
	PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
	PTREGSCALL stub_iopl, sys_iopl, %rsi
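
	/*
	 * For example, "PTREGSCALL stub_clone, sys_clone, %r8" above
	 * expands to roughly:
	 *
	 *	.globl stub_clone
	 * stub_clone:
	 *	leaq	sys_clone(%rip),%rax
	 *	leaq	-ARGOFFSET+8(%rsp),%r8
	 *	jmp	ptregscall_common
	 */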
ENTRY(ptregscall_common)
	popq %r11
	CFI_ADJUST_CFA_OFFSET -8
	CFI_REGISTER rip, r11
	SAVE_REST
	movq %r11, %r15
	CFI_REGISTER rip, r15
	FIXUP_TOP_OF_STACK %r11
	call *%rax
	RESTORE_TOP_OF_STACK %r11
	movq %r15, %r11
	CFI_REGISTER rip, r11
	RESTORE_REST
	pushq %r11
	CFI_ADJUST_CFA_OFFSET 8
	CFI_REL_OFFSET rip, 0
	ret
	CFI_ENDPROC
END(ptregscall_common)
ENTRY(stub_execve)
	CFI_STARTPROC
	popq %r11
	CFI_ADJUST_CFA_OFFSET -8
	CFI_REGISTER rip, r11
	SAVE_REST
	FIXUP_TOP_OF_STACK %r11
	movq %rsp, %rcx
	call sys_execve
	RESTORE_TOP_OF_STACK %r11
	movq %rax,RAX(%rsp)
	RESTORE_REST
	jmp int_ret_from_sys_call
	CFI_ENDPROC
END(stub_execve)
/*
 * sigreturn is special because it needs to restore all registers on return.
 * This cannot be done with SYSRET, so use the IRET return path instead.
 */
ENTRY(stub_rt_sigreturn)
	CFI_STARTPROC
	addq $8, %rsp
	CFI_ADJUST_CFA_OFFSET	-8
	SAVE_REST
	movq %rsp,%rdi
	FIXUP_TOP_OF_STACK %r11
	call sys_rt_sigreturn
	movq %rax,RAX(%rsp)	# fixme, this could be done at the higher layer
	RESTORE_REST
	jmp int_ret_from_sys_call
	CFI_ENDPROC
END(stub_rt_sigreturn)
/*
 * initial frame state for interrupts and exceptions
 */
	.macro _frame ref
	CFI_STARTPROC simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA rsp,SS+8-\ref
	/*CFI_REL_OFFSET ss,SS-\ref*/
	CFI_REL_OFFSET rsp,RSP-\ref
	/*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
	/*CFI_REL_OFFSET cs,CS-\ref*/
	CFI_REL_OFFSET rip,RIP-\ref
	.endm

/* initial frame state for interrupts (and exceptions without error code) */
#define INTR_FRAME	_frame RIP
/* initial frame state for exceptions with error code (and interrupts with
   vector already pushed) */
#define XCPT_FRAME	_frame ORIG_RAX
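
/*
 * Worked example: with the usual pt_regs layout, SS sits 4*8 bytes
 * above RIP, so INTR_FRAME sets CFA = rsp + (SS+8-RIP) = rsp + 40,
 * i.e. just past the five-word hardware iret frame.
 */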
/*
 * Interrupt entry/exit.
 *
 * Interrupt entry points save only callee-clobbered registers in fast path.
 *
 * Entry runs with interrupts off.
 */

/* 0(%rsp): interrupt number */
	.macro interrupt func
	cld
	SAVE_ARGS
	leaq -ARGOFFSET(%rsp),%rdi	# arg1 for handler
	pushq %rbp
	/*
	 * Save rbp twice: One is for marking the stack frame, as usual, and the
	 * other to fill pt_regs properly. This is because bx comes right
	 * before the last saved register in that structure, and not bp. If the
	 * base pointer were in the place bx is today, this would not be needed.
	 */
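	/*
	 * (For reference, a sketch of the pt_regs save order, low to high:
	 * r15 r14 r13 r12 rbp rbx r11 r10 r9 r8 rax rcx rdx rsi rdi
	 * orig_rax rip cs eflags rsp ss - note rbx, not rbp, adjoins r11.)
	 */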
	CFI_ADJUST_CFA_OFFSET	8
	CFI_REL_OFFSET		rbp, 0
	movq %rsp,%rbp
	CFI_DEF_CFA_REGISTER	rbp
	testl $3,CS(%rdi)
	je 1f
	SWAPGS
	/* irqcount is used to check if a CPU is already on an interrupt
	   stack or not. While this is essentially redundant with preempt_count
	   it is a little cheaper to use a separate counter in the PDA
	   (short of moving irq_enter into assembly, which would be too
	    much work) */
1:	incl	%gs:pda_irqcount
	cmoveq %gs:pda_irqstackptr,%rsp
	push    %rbp			# backlink for old unwinder
	/*
	 * We entered an interrupt context - irqs are off:
	 */
	TRACE_IRQS_OFF
	call \func
	.endm
ENTRY(common_interrupt)
	XCPT_FRAME
	interrupt do_IRQ
	/* 0(%rsp): oldrsp-ARGOFFSET */
ret_from_intr:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	decl %gs:pda_irqcount
	leaveq
	CFI_DEF_CFA_REGISTER	rsp
	CFI_ADJUST_CFA_OFFSET	-8
exit_intr:
	GET_THREAD_INFO(%rcx)
	testl $3,CS-ARGOFFSET(%rsp)
	je retint_kernel

	/* Interrupt came from user space */
	/*
	 * Has a correct top of stack, but a partial stack frame.
	 * %rcx: thread info. Interrupts are off.
	 */
retint_with_reschedule:
	movl $_TIF_WORK_MASK,%edi
retint_check:
	LOCKDEP_SYS_EXIT_IRQ
	movl TI_flags(%rcx),%edx
	andl %edi,%edx
	CFI_REMEMBER_STATE
	jnz  retint_careful

retint_swapgs:		/* return to user-space */
	/*
	 * The iretq could re-enable interrupts:
	 */
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_IRETQ
	SWAPGS
	jmp restore_args

retint_restore_args:	/* return to kernel space */
	DISABLE_INTERRUPTS(CLBR_ANY)
	/*
	 * The iretq could re-enable interrupts:
	 */
	TRACE_IRQS_IRETQ
restore_args:
	RESTORE_ARGS 0,8,0
irq_return:
	INTERRUPT_RETURN

	.section __ex_table, "a"
	.quad irq_return, bad_iret
	.previous

#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
	iretq

	.section __ex_table,"a"
	.quad native_iret, bad_iret
	.previous
#endif

	.section .fixup,"ax"
bad_iret:
	/*
	 * The iret traps when the %cs or %ss being restored is bogus.
	 * We've lost the original trap vector and error code.
	 * #GPF is the most likely one to get for an invalid selector.
	 * So pretend we completed the iret and took the #GPF in user mode.
	 *
	 * We are now running with the kernel GS after exception recovery.
	 * But error_entry expects us to have user GS to match the user %cs,
	 * so swap back.
	 */
	pushq $0

	SWAPGS
	jmp general_protection

	.previous
	/* edi: workmask, edx: work */
retint_careful:
	CFI_RESTORE_STATE
	bt    $TIF_NEED_RESCHED,%edx
	jnc   retint_signal
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq %rdi
	CFI_ADJUST_CFA_OFFSET	8
	call  schedule
	popq %rdi
	CFI_ADJUST_CFA_OFFSET	-8
	GET_THREAD_INFO(%rcx)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp retint_check

retint_signal:
	testl $_TIF_DO_NOTIFY_MASK,%edx
	jz    retint_swapgs
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_REST
	movq $-1,ORIG_RAX(%rsp)
	xorl %esi,%esi		# oldset
	movq %rsp,%rdi		# &pt_regs
	call do_notify_resume
	RESTORE_REST
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	GET_THREAD_INFO(%rcx)
	jmp retint_with_reschedule

#ifdef CONFIG_PREEMPT
	/* Returning to kernel space. Check if we need preemption */
	/* rcx:	thread_info. Interrupts are off. */
ENTRY(retint_kernel)
	cmpl $0,TI_preempt_count(%rcx)
	jnz  retint_restore_args
	bt  $TIF_NEED_RESCHED,TI_flags(%rcx)
	jnc  retint_restore_args
	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
	jnc  retint_restore_args
	call preempt_schedule_irq
	jmp exit_intr
#endif

	CFI_ENDPROC
END(common_interrupt)
/*
 * APIC interrupts.
 */
	.macro apicinterrupt num,func
	INTR_FRAME
	pushq $~(\num)
	CFI_ADJUST_CFA_OFFSET 8
	interrupt \func
	jmp ret_from_intr
	CFI_ENDPROC
	.endm

ENTRY(thermal_interrupt)
	apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
END(thermal_interrupt)

ENTRY(threshold_interrupt)
	apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
END(threshold_interrupt)

#ifdef CONFIG_SMP
ENTRY(reschedule_interrupt)
	apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
END(reschedule_interrupt)
	.macro INVALIDATE_ENTRY num
ENTRY(invalidate_interrupt\num)
	apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
END(invalidate_interrupt\num)
	.endm
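
	/* Instantiate one stub per TLB-invalidate vector. */
	INVALIDATE_ENTRY 0
	INVALIDATE_ENTRY 1
	INVALIDATE_ENTRY 2
	INVALIDATE_ENTRY 3
	INVALIDATE_ENTRY 4
	INVALIDATE_ENTRY 5
	INVALIDATE_ENTRY 6
	INVALIDATE_ENTRY 7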
ENTRY(call_function_interrupt)
	apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
END(call_function_interrupt)
ENTRY(call_function_single_interrupt)
	apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
END(call_function_single_interrupt)
ENTRY(irq_move_cleanup_interrupt)
	apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
END(irq_move_cleanup_interrupt)
#endif /* CONFIG_SMP */

ENTRY(apic_timer_interrupt)
	apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
END(apic_timer_interrupt)

ENTRY(uv_bau_message_intr1)
	apicinterrupt 220,uv_bau_message_interrupt
END(uv_bau_message_intr1)

ENTRY(error_interrupt)
	apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
END(error_interrupt)

ENTRY(spurious_interrupt)
	apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
END(spurious_interrupt)
/*
 * Exception entry points.
 */
	.macro zeroentry sym
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	pushq $0	/* push error code/oldrax */
	CFI_ADJUST_CFA_OFFSET 8
	pushq %rax	/* push real oldrax to the rdi slot */
	CFI_ADJUST_CFA_OFFSET 8
	CFI_REL_OFFSET rax,0
	leaq  \sym(%rip),%rax
	jmp error_entry
	CFI_ENDPROC
	.endm

	.macro errorentry sym
	XCPT_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	pushq %rax
	CFI_ADJUST_CFA_OFFSET 8
	CFI_REL_OFFSET rax,0
	leaq  \sym(%rip),%rax
	jmp error_entry
	CFI_ENDPROC
	.endm
	/* error code is on the stack already */
	/* handle NMI-like exceptions that can happen everywhere */
	.macro paranoidentry sym, ist=0, irqtrace=1
	SAVE_ALL
	cld
	movl $1,%ebx
	movl  $MSR_GS_BASE,%ecx
	rdmsr
	testl %edx,%edx
	js    1f
	SWAPGS
	xorl  %ebx,%ebx
1:
	.if \ist
	movq	%gs:pda_data_offset, %rbp
	.endif
	.if \irqtrace
	TRACE_IRQS_OFF
	.endif
	movq %rsp,%rdi
	movq ORIG_RAX(%rsp),%rsi
	movq $-1,ORIG_RAX(%rsp)
	.if \ist
	subq	$EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
	.endif
	call \sym
	.if \ist
	addq	$EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
	.endif
	DISABLE_INTERRUPTS(CLBR_NONE)
	.if \irqtrace
	TRACE_IRQS_OFF
	.endif
	.endm
941 * "Paranoid" exit path from exception stack.
942 * Paranoid because this is used by NMIs and cannot take
943 * any kernel state for granted.
944 * We don't do kernel preemption checks here, because only
945 * NMI should be common and it does not enable IRQs and
946 * cannot get reschedule ticks.
948 * "trace" is 0 for the NMI handler only, because irq-tracing
949 * is fundamentally NMI-unsafe. (we cannot change the soft and
950 * hard flags at once, atomically)
952 .macro paranoidexit trace=1
953 /* ebx: no swapgs flag */
955 testl %ebx,%ebx /* swapgs needed? */
956 jnz paranoid_restore\trace
958 jnz paranoid_userspace\trace
959 paranoid_swapgs\trace:
964 paranoid_restore\trace:
967 paranoid_userspace\trace:
968 GET_THREAD_INFO(%rcx)
969 movl TI_flags(%rcx),%ebx
970 andl $_TIF_WORK_MASK,%ebx
971 jz paranoid_swapgs\trace
972 movq %rsp,%rdi /* &pt_regs */
974 movq %rax,%rsp /* switch stack for scheduling */
975 testl $_TIF_NEED_RESCHED,%ebx
976 jnz paranoid_schedule\trace
977 movl %ebx,%edx /* arg3: thread flags */
981 ENABLE_INTERRUPTS(CLBR_NONE)
982 xorl %esi,%esi /* arg2: oldset */
983 movq %rsp,%rdi /* arg1: &pt_regs */
984 call do_notify_resume
985 DISABLE_INTERRUPTS(CLBR_NONE)
989 jmp paranoid_userspace\trace
990 paranoid_schedule\trace:
994 ENABLE_INTERRUPTS(CLBR_ANY)
996 DISABLE_INTERRUPTS(CLBR_ANY)
1000 jmp paranoid_userspace\trace
/*
 * Exception entry point. This expects an error code/orig_rax on the stack
 * and the exception handler in %rax.
 */
KPROBE_ENTRY(error_entry)
	_frame RDI
	CFI_REL_OFFSET rax,0
	/* rdi slot contains rax, oldrax contains error code */
	cld
	subq  $14*8,%rsp
	CFI_ADJUST_CFA_OFFSET	(14*8)
	movq %rsi,13*8(%rsp)
	CFI_REL_OFFSET	rsi,RSI
	movq 14*8(%rsp),%rsi	/* load rax from rdi slot */
	CFI_REGISTER	rax,rsi
	movq %rdx,12*8(%rsp)
	CFI_REL_OFFSET	rdx,RDX
	movq %rcx,11*8(%rsp)
	CFI_REL_OFFSET	rcx,RCX
	movq %rsi,10*8(%rsp)	/* store rax */
	CFI_REL_OFFSET	rax,RAX
	movq %r8, 9*8(%rsp)
	CFI_REL_OFFSET	r8,R8
	movq %r9, 8*8(%rsp)
	CFI_REL_OFFSET	r9,R9
	movq %r10,7*8(%rsp)
	CFI_REL_OFFSET	r10,R10
	movq %r11,6*8(%rsp)
	CFI_REL_OFFSET	r11,R11
	movq %rbx,5*8(%rsp)
	CFI_REL_OFFSET	rbx,RBX
	movq %rbp,4*8(%rsp)
	CFI_REL_OFFSET	rbp,RBP
	movq %r12,3*8(%rsp)
	CFI_REL_OFFSET	r12,R12
	movq %r13,2*8(%rsp)
	CFI_REL_OFFSET	r13,R13
	movq %r14,1*8(%rsp)
	CFI_REL_OFFSET	r14,R14
	movq %r15,(%rsp)
	CFI_REL_OFFSET	r15,R15
	xorl %ebx,%ebx
	testl $3,CS(%rsp)
	je  error_kernelspace
error_swapgs:
	SWAPGS
error_sti:
	movq %rdi,RDI(%rsp)
	CFI_REL_OFFSET	rdi,RDI
	movq %rsp,%rdi
	movq ORIG_RAX(%rsp),%rsi	/* get error code */
	movq $-1,ORIG_RAX(%rsp)
	call *%rax
	/* ebx:	no swapgs flag (1: don't need swapgs, 0: need it) */
error_exit:
	movl %ebx,%eax
	RESTORE_REST
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	GET_THREAD_INFO(%rcx)
	testl %eax,%eax
	jne  retint_kernel
	LOCKDEP_SYS_EXIT_IRQ
	movl TI_flags(%rcx),%edx
	movl $_TIF_WORK_MASK,%edi
	andl %edi,%edx
	jnz  retint_careful
	jmp retint_swapgs
	CFI_ENDPROC
error_kernelspace:
	incl %ebx
	/* There are two places in the kernel that can potentially fault with
	   usergs. Handle them here. The exception handlers after
	   iret run with kernel gs again, so don't set the user space flag.
	   B stepping K8s sometimes report a truncated RIP for IRET
	   exceptions returning to compat mode. Check for these here too. */
	leaq irq_return(%rip),%rcx
	cmpq %rcx,RIP(%rsp)
	je   error_swapgs
	movl %ecx,%ecx	/* zero extend */
	cmpq %rcx,RIP(%rsp)
	je   error_swapgs
	cmpq $gs_change,RIP(%rsp)
	je   error_swapgs
	jmp  error_sti
KPROBE_END(error_entry)
	/* Reload gs selector with exception handling */
	/* edi:	new selector */
ENTRY(native_load_gs_index)
	CFI_STARTPROC
	pushf
	CFI_ADJUST_CFA_OFFSET 8
	DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
	SWAPGS
gs_change:
	movl %edi,%gs
2:	mfence		/* workaround */
	SWAPGS
	popf
	CFI_ADJUST_CFA_OFFSET -8
	ret
	CFI_ENDPROC
ENDPROC(native_load_gs_index)

	.section __ex_table,"a"
	.align 8
	.quad gs_change,bad_gs
	.previous
	.section .fixup,"ax"
	/* running with kernelgs */
bad_gs:
	SWAPGS			/* switch back to user gs */
	movl $0,%eax
	movl %eax,%gs
	jmp  2b
	.previous
/*
 * Create a kernel thread.
 *
 * C extern interface:
 *	extern long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 *
 * asm input arguments:
 *	rdi: fn, rsi: arg, rdx: flags
 */
ENTRY(kernel_thread)
	CFI_STARTPROC
	FAKE_STACK_FRAME $child_rip
	SAVE_ALL

	# rdi: flags, rsi: usp, rdx: will be &pt_regs
	movq %rdx,%rdi
	orq  kernel_thread_flags(%rip),%rdi
	movq $-1, %rsi
	movq %rsp, %rdx

	xorl %r8d,%r8d
	xorl %r9d,%r9d

	# clone now
	call do_fork
	movq %rax,RAX(%rsp)
	xorl %edi,%edi

	/*
	 * It isn't worth checking for a reschedule here,
	 * so internally to the x86_64 port you can rely on kernel_thread()
	 * not to reschedule the child before returning; this avoids the need
	 * for hacks, for example to fork off the per-CPU idle tasks.
	 * [Hopefully no generic code relies on the reschedule -AK]
	 */
	RESTORE_ALL
	UNFAKE_STACK_FRAME
	ret
	CFI_ENDPROC
ENDPROC(kernel_thread)
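
/*
 * Typical usage, for illustration (flags vary by caller; my_thread_fn
 * is a hypothetical int (*)(void *) thread body):
 *
 *	kernel_thread(my_thread_fn, NULL, CLONE_FS | CLONE_FILES | SIGCHLD);
 */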
child_rip:
	pushq $0		# fake return address
	/*
	 * Here we are in the child and the registers are set as they were
	 * at kernel_thread() invocation in the parent.
	 */
	movq %rdi, %rax		# fn
	movq %rsi, %rdi		# arg
	call *%rax
	movl %eax, %edi		# exit code
	call do_exit
/*
 * execve(). This function needs to use IRET, not SYSRET, to set up all
 * state properly.
 *
 * C extern interface:
 *	extern long execve(char *name, char **argv, char **envp)
 *
 * asm input arguments:
 *	rdi: name, rsi: argv, rdx: envp
 *
 * We want to fall back into:
 *	extern long sys_execve(char *name, char **argv, char **envp, struct pt_regs *regs)
 *
 * do_sys_execve asm fallback arguments:
 *	rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
 */
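
/*
 * For illustration: the kernel itself uses this path to start the first
 * user process, e.g. kernel_execve("/sbin/init", argv_init, envp_init)
 * via run_init_process() in init/main.c.
 */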
ENTRY(kernel_execve)
	CFI_STARTPROC
	FAKE_STACK_FRAME $0
	SAVE_ALL
	movq %rsp,%rcx
	call sys_execve
	movq %rax, RAX(%rsp)
	RESTORE_REST
	testq %rax,%rax
	je int_ret_from_sys_call
	RESTORE_ARGS
	UNFAKE_STACK_FRAME
	ret
	CFI_ENDPROC
ENDPROC(kernel_execve)
KPROBE_ENTRY(page_fault)
	errorentry do_page_fault
KPROBE_END(page_fault)

ENTRY(coprocessor_error)
	zeroentry do_coprocessor_error
END(coprocessor_error)

ENTRY(simd_coprocessor_error)
	zeroentry do_simd_coprocessor_error
END(simd_coprocessor_error)

ENTRY(device_not_available)
	zeroentry do_device_not_available
END(device_not_available)
	/* runs on exception stack */
KPROBE_ENTRY(debug)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	pushq $0
	CFI_ADJUST_CFA_OFFSET 8
	paranoidentry do_debug, DEBUG_STACK
	paranoidexit
KPROBE_END(debug)

	/* runs on exception stack */
KPROBE_ENTRY(nmi)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	pushq $-1
	CFI_ADJUST_CFA_OFFSET 8
	paranoidentry do_nmi, 0, 0
#ifdef CONFIG_TRACE_IRQFLAGS
	paranoidexit 0
#else
	jmp paranoid_exit1
	CFI_ENDPROC
#endif
KPROBE_END(nmi)

KPROBE_ENTRY(int3)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	pushq $0
	CFI_ADJUST_CFA_OFFSET 8
	paranoidentry do_int3, DEBUG_STACK
	jmp paranoid_exit1
	CFI_ENDPROC
KPROBE_END(int3)
ENTRY(overflow)
	zeroentry do_overflow
END(overflow)

ENTRY(invalid_op)
	zeroentry do_invalid_op
END(invalid_op)

ENTRY(coprocessor_segment_overrun)
	zeroentry do_coprocessor_segment_overrun
END(coprocessor_segment_overrun)

	/* runs on exception stack */
ENTRY(double_fault)
	XCPT_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	paranoidentry do_double_fault
	jmp paranoid_exit1
	CFI_ENDPROC
END(double_fault)

ENTRY(invalid_TSS)
	errorentry do_invalid_TSS
END(invalid_TSS)

ENTRY(segment_not_present)
	errorentry do_segment_not_present
END(segment_not_present)

	/* runs on exception stack */
ENTRY(stack_segment)
	XCPT_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	paranoidentry do_stack_segment
	jmp paranoid_exit1
	CFI_ENDPROC
END(stack_segment)

KPROBE_ENTRY(general_protection)
	errorentry do_general_protection
KPROBE_END(general_protection)

ENTRY(alignment_check)
	errorentry do_alignment_check
END(alignment_check)

ENTRY(divide_error)
	zeroentry do_divide_error
END(divide_error)

ENTRY(spurious_interrupt_bug)
	zeroentry do_spurious_interrupt_bug
END(spurious_interrupt_bug)
#ifdef CONFIG_X86_MCE
	/* runs on exception stack */
ENTRY(machine_check)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	pushq $0
	CFI_ADJUST_CFA_OFFSET 8
	paranoidentry do_machine_check
	jmp paranoid_exit1
	CFI_ENDPROC
END(machine_check)
#endif
	/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(call_softirq)
	CFI_STARTPROC
	push %rbp
	CFI_ADJUST_CFA_OFFSET	8
	CFI_REL_OFFSET rbp,0
	mov  %rsp,%rbp
	CFI_DEF_CFA_REGISTER rbp
	incl %gs:pda_irqcount
	cmove %gs:pda_irqstackptr,%rsp
	push  %rbp			# backlink for old unwinder
	call __do_softirq
	leaveq
	CFI_DEF_CFA_REGISTER	rsp
	CFI_ADJUST_CFA_OFFSET	-8
	decl %gs:pda_irqcount
	ret
	CFI_ENDPROC
ENDPROC(call_softirq)
KPROBE_ENTRY(ignore_sysret)
	CFI_STARTPROC
	mov $-ENOSYS,%eax
	sysret
	CFI_ENDPROC
ENDPROC(ignore_sysret)
#ifdef CONFIG_XEN
ENTRY(xen_hypervisor_callback)
	zeroentry xen_do_hypervisor_callback
END(xen_hypervisor_callback)

/*
# A note on the "critical region" in our callback handler.
# We want to avoid stacking callback handlers due to events occurring
# during handling of the last event. To do this, we keep events disabled
# until we've done all processing. HOWEVER, we must enable events before
# popping the stack frame (can't be done atomically) and so it would still
# be possible to get enough handler activations to overflow the stack.
# Although unlikely, bugs of that kind are hard to track down, so we'd
# like to avoid the possibility.
# So, on entry to the handler we detect whether we interrupted an
# existing activation in its critical region -- if so, we pop the current
# activation and restart the handler using the previous one.
*/
ENTRY(xen_do_hypervisor_callback)	# do_hypervisor_callback(struct *pt_regs)
	CFI_STARTPROC
/* Since we don't modify %rdi, xen_evtchn_do_upcall(struct *pt_regs) will
   see the correct pointer to the pt_regs */
	movq %rdi, %rsp			# we don't return, adjust the stack frame
	CFI_ENDPROC
	CFI_DEFAULT_STACK
11:	incl %gs:pda_irqcount
	movq %rsp,%rbp
	CFI_DEF_CFA_REGISTER rbp
	cmovzq %gs:pda_irqstackptr,%rsp
	pushq %rbp			# backlink for old unwinder
	call xen_evtchn_do_upcall
	popq %rsp
	CFI_DEF_CFA_REGISTER rsp
	decl %gs:pda_irqcount
	jmp  error_exit
	CFI_ENDPROC
END(xen_do_hypervisor_callback)
/*
# Hypervisor uses this for application faults while it executes.
# We get here for two reasons:
#  1. Fault while reloading DS, ES, FS or GS
#  2. Fault while executing IRET
# Category 1 we do not need to fix up as Xen has already reloaded all segment
# registers that could be reloaded and zeroed the others.
# Category 2 we fix up by killing the current process. We cannot use the
# normal Linux return path in this case because if we use the IRET hypercall
# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
# We distinguish between categories by comparing each saved segment register
# with its current contents: any discrepancy means we are in category 1.
*/
ENTRY(xen_failsafe_callback)
	framesz = (RIP-0x30)	/* workaround buggy gas */
	_frame framesz
	CFI_REL_OFFSET rcx, 0
	CFI_REL_OFFSET r11, 8
	movw %ds,%cx
	cmpw %cx,0x10(%rsp)
	CFI_REMEMBER_STATE
	jne 1f
	movw %es,%cx
	cmpw %cx,0x18(%rsp)
	jne 1f
	movw %fs,%cx
	cmpw %cx,0x20(%rsp)
	jne 1f
	movw %gs,%cx
	cmpw %cx,0x28(%rsp)
	jne 1f
	/* All segments match their saved values => Category 2 (Bad IRET). */
	movq (%rsp),%rcx
	CFI_RESTORE rcx
	movq 8(%rsp),%r11
	CFI_RESTORE r11
	addq $0x30,%rsp
	CFI_ADJUST_CFA_OFFSET -0x30
	pushq $0
	CFI_ADJUST_CFA_OFFSET 8
	pushq %r11
	CFI_ADJUST_CFA_OFFSET 8
	pushq %rcx
	CFI_ADJUST_CFA_OFFSET 8
	jmp general_protection
	CFI_RESTORE_STATE
1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
	movq (%rsp),%rcx
	CFI_RESTORE rcx
	movq 8(%rsp),%r11
	CFI_RESTORE r11
	addq $0x30,%rsp
	CFI_ADJUST_CFA_OFFSET -0x30
	pushq $0
	CFI_ADJUST_CFA_OFFSET 8
	SAVE_ALL
	jmp error_exit
	CFI_ENDPROC
END(xen_failsafe_callback)
#endif /* CONFIG_XEN */