/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/kprobes.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm/kdebug.h>
#include <asm-generic/sections.h>

/* Page fault error code bits */
#define PF_PROT		(1<<0)	/* or no page found */
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
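
/*
 * These bits come from the hardware error code pushed on a page fault:
 * PF_PROT set means the page was present but the access violated its
 * protections (clear means no page was found), PF_WRITE means a write
 * access, PF_USER means the fault happened in user mode, PF_RSVD means
 * a reserved bit was set in a page-table entry, and PF_INSTR means the
 * fault was on an instruction fetch.
 */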

#ifdef CONFIG_KPROBES
ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);

/* Hook to register for page fault notifications */
int register_page_fault_notifier(struct notifier_block *nb)
{
	vmalloc_sync_all();
	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
}

int unregister_page_fault_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
}
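
/*
 * Called on every fault so that registered notifiers (kprobes, debuggers)
 * get a chance to claim it; NOTIFY_STOP from the chain means the fault has
 * been handled and do_page_fault() can return early.
 */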
static inline int notify_page_fault(enum die_val val, const char *str,
			struct pt_regs *regs, long err, int trap, int sig)
{
	struct die_args args = {
		.regs = regs,
		.str = str,
		.err = err,
		.trapnr = trap,
		.signr = sig
	};
	return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
}
#else
static inline int notify_page_fault(enum die_val val, const char *str,
			struct pt_regs *regs, long err, int trap, int sig)
{
	return NOTIFY_DONE;
}
#endif

void bust_spinlocks(int yes)
{
	int loglevel_save = console_loglevel;

	if (yes) {
		oops_in_progress = 1;
	} else {
#ifdef CONFIG_VT
		unblank_screen();
#endif
		oops_in_progress = 0;
		/*
		 * OK, the message is on the console.  Now we call printk()
		 * without oops_in_progress set so that printk will give klogd
		 * a poke.  Hold onto your hats...
		 */
		console_loglevel = 15;	/* NMI oopser may have shut the console up */
		printk(" ");
		console_loglevel = loglevel_save;
	}
}

/* Sometimes the CPU reports invalid exceptions on prefetch.
   Check that here and ignore them.
   Opcode checker based on code by Richard Brunner. */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
				unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/* If it was an exec fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_rip_to_linear(current, regs);
	max_instr = instr + 15;
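	/* x86 instructions are at most 15 bytes long, so the opcode scan
	   below never needs to look more than 15 bytes past the fault RIP. */
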
	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (__get_user(opcode, instr))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/* Values 0x26,0x2E,0x36,0x3E are valid x86
			   prefixes.  In long mode, the CPU will signal
			   invalid opcode if some of these prefixes are
			   present so we will never get here anyway. */
			scan_more = ((instr_lo & 7) == 0x6);
			break;

		case 0x40:
			/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes.
			   Need to figure out under what instruction mode the
			   instruction was issued ... */
			/* Could check the LDT for lm, but for now it's good
			   enough to assume that long mode only uses well known
			   segments or kernel. */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;

		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;
			if (__get_user(opcode, instr))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

static int bad_address(void *p)
{
	unsigned long dummy;
	return __get_user(dummy, (unsigned long *)p);
}

void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	asm("movq %%cr3,%0" : "=r" (pgd));
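
	/* CR3 holds the physical address of the top-level page directory
	   (PGD); mask off the low control bits and convert to a kernel
	   virtual address before indexing into it. */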
	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}

static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here. */

static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	if (address != regs->rip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		printk(errata93_warning);
		regs->rip = address;
		return 1;
	}
	return 0;
}
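
/*
 * True when the task would not catch the signal: it is not being ptraced
 * and its handler for 'sig' is SIG_IGN or SIG_DFL.  Used below to decide
 * whether a user segfault is worth logging to the console.
 */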
int unhandled_signal(struct task_struct *tsk, int sig)
{
	if (tsk->ptrace & PT_PTRACED)
		return 0;
	return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
		(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
}

static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Bad pagetable", regs, error_code);
	oops_end(flags);
	do_exit(SIGKILL);
}

/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */
	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared. */
	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}

int page_fault_trace = 0;
int exception_trace = 1;
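
/*
 * exception_trace (on by default) makes do_page_fault() log unhandled user
 * segfaults; page_fault_trace logs every fault and is switched on with the
 * "pagefaulttrace" boot parameter handled at the bottom of this file.
 */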

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	const struct exception_table_entry *fixup;
	int write;
	unsigned long flags;
	siginfo_t info;

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	__asm__("movq %%cr2,%0":"=r" (address));
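	/* CR2 holds the faulting linear address; it is read first, before
	   anything else here can fault and clobber it. */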

	info.si_code = SEGV_MAPERR;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		      ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
						SIGSEGV) == NOTIFY_STOP)
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
					SIGSEGV) == NOTIFY_STOP)
		return;

	if (likely(regs->eflags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(page_fault_trace))
		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
		       regs->rip, regs->rsp, regs->cs, regs->ss, address, error_code);

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

 again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->rip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & 4) {
		/* Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work.
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
			goto bad_area;
	}
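	/* 65536 + 32 * sizeof(unsigned long) above is roughly the worst case
	   the 'enter' instruction can touch below the user stack pointer:
	   a 16-bit frame size plus up to 32 pushed frame pointers. */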
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
			goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
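	/*
	 * handle_mm_fault() returns VM_FAULT_MINOR or VM_FAULT_MAJOR when it
	 * managed to map the page (a major fault means it had to do I/O),
	 * VM_FAULT_SIGBUS for an access the backing object cannot satisfy,
	 * and VM_FAULT_OOM when it ran out of memory.
	 */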
	switch (handle_mm_fault(mm, vma, address, write)) {
	case VM_FAULT_MINOR:
		tsk->min_flt++;
		break;
	case VM_FAULT_MAJOR:
		tsk->maj_flt++;
		break;
	case VM_FAULT_SIGBUS:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100: K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable. Just detect this
		   case and return.  Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
524 "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
525 tsk
->pid
> 1 ? KERN_INFO
: KERN_EMERG
,
526 tsk
->comm
, tsk
->pid
, address
, regs
->rip
,
527 regs
->rsp
, error_code
);

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void __user *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}

no_context:

	/* Are we prepared to handle this kernel fault? */
	fixup = search_exception_tables(regs->rip);
	if (fixup) {
		regs->rip = fixup->fixup;
		return;
	}

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->rip);
	printk("\n");
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Oops", regs, error_code);
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags);
	do_exit(SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (current->pid == 1) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & 4)
		do_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void __user *)address;
	force_sig_info(SIGBUS, &info, tsk);
	return;
}

DEFINE_SPINLOCK(pgd_lock);
struct page *pgd_list;

void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			for (page = pgd_list; page;
			     page = (struct page *)page->index) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
				(__START_KERNEL & PGDIR_MASK)));
}

static int __init enable_pagefaulttrace(char *str)
{
	page_fault_trace = 1;
	return 1;
}
__setup("pagefaulttrace", enable_pagefaulttrace);