/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */
#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/kprobes.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm/kdebug.h>
#include <asm-generic/sections.h>
/* Page fault error code bits */
#define PF_PROT		(1<<0)		/* or no page found */
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
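/*
 * These bits mirror the hardware error code pushed by the CPU on a page
 * fault: PF_PROT set means a protection violation on a present page
 * (clear means the page was not present), PF_WRITE means the access was
 * a write, PF_USER means the fault came from user mode, PF_RSVD means a
 * reserved bit was set in a page table entry, and PF_INSTR means the
 * fault was on an instruction fetch.
 */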
#ifdef CONFIG_KPROBES
ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);

/* Hook to register for page fault notifications */
int register_page_fault_notifier(struct notifier_block *nb)
{
	vmalloc_sync_all();
	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
}

int unregister_page_fault_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
}
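/*
 * Illustrative sketch (not part of this file): a kprobes-style client
 * would hook the chain roughly like this, where my_fault_notify, my_nb
 * and my_handle are hypothetical names:
 *
 *	static int my_fault_notify(struct notifier_block *self,
 *				   unsigned long val, void *data)
 *	{
 *		struct die_args *args = data;
 *
 *		if (val == DIE_PAGE_FAULT && my_handle(args->regs))
 *			return NOTIFY_STOP;	// fault consumed
 *		return NOTIFY_DONE;		// let do_page_fault() continue
 *	}
 *	static struct notifier_block my_nb = { .notifier_call = my_fault_notify };
 *	...
 *	register_page_fault_notifier(&my_nb);
 */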
static inline int notify_page_fault(enum die_val val, const char *str,
			struct pt_regs *regs, long err, int trap, int sig)
{
	struct die_args args = {
		.regs = regs,
		.str = str,
		.err = err,
		.trapnr = trap,
		.signr = sig
	};
	return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
}
#else
static inline int notify_page_fault(enum die_val val, const char *str,
			struct pt_regs *regs, long err, int trap, int sig)
{
	return NOTIFY_DONE;
}
#endif
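/*
 * Let oops/panic output get through: with "yes", printk stops worrying
 * about console locking (oops_in_progress); with "no", the screen is
 * unblanked and a dummy printk pokes klogd so the buffered oops text is
 * picked up.
 */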
void bust_spinlocks(int yes)
{
	int loglevel_save = console_loglevel;
	if (yes) {
		oops_in_progress = 1;
	} else {
#ifdef CONFIG_VT
		unblank_screen();
#endif
		oops_in_progress = 0;
		/*
		 * OK, the message is on the console.  Now we call printk()
		 * without oops_in_progress set so that printk will give klogd
		 * a poke.  Hold onto your hats...
		 */
		console_loglevel = 15;	/* NMI oopser may have shut the console up */
		printk(" ");
		console_loglevel = loglevel_save;
	}
}
/* Sometimes the CPU reports invalid exceptions on prefetch.
   Check that here and ignore it.
   Opcode checker based on code by Richard Brunner */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
				unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/* If it was an exec fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_rip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (__get_user(opcode, instr))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/* Values 0x26,0x2E,0x36,0x3E are valid x86
			   prefixes.  In long mode, the CPU will signal
			   invalid opcode if some of these prefixes are
			   present so we will never get here anyway. */
			scan_more = ((instr_lo & 7) == 0x6);
			break;

		case 0x40:
			/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes.
			   Need to figure out under what instruction mode the
			   instruction was issued ... */
			/* Could check the LDT for lm, but for now it's good
			   enough to assume that long mode only uses well-known
			   segments or the kernel. */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;

		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;
			if (__get_user(opcode, instr))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
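/*
 * Probe whether a (possibly unmapped) kernel address can be dereferenced
 * without faulting, so that dump_pagetable() below cannot itself crash on
 * a corrupted page table.
 */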
static int bad_address(void *p)
{
	unsigned long dummy;
	return __get_user(dummy, (unsigned long *)p);
}
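/*
 * Walk the page tables for 'address' by hand, starting from CR3, and
 * print each level (PGD/PUD/PMD/PTE); used from the oops paths below.
 */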
void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	asm("movq %%cr3,%0" : "=r" (pgd));

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in the kernel here. */

static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;
	if (address != regs->rip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->rip = address;
		return 1;
	}
	return 0;
}
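/*
 * Decide whether a fatal signal would go unhandled by the task: init
 * (pid 1) always counts, ptraced tasks never do, and otherwise a signal
 * whose handler is SIG_IGN or SIG_DFL is considered unhandled.  Used to
 * decide whether a segfault is worth logging.
 */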
int unhandled_signal(struct task_struct *tsk, int sig)
{
	if (tsk->pid == 1)
		return 1;
	if (tsk->ptrace & PT_PTRACED)
		return 0;
	return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
		(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
}
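/*
 * A reserved bit was set in a page table entry: the page tables are
 * corrupted, so dump them and die.
 */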
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Bad pagetable", regs, error_code);
	oops_end(flags);
	do_exit(SIGKILL);
}
/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);

	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}
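/*
 * Debug knobs: page_fault_trace is switched on by the "pagefaulttrace"
 * boot option (see enable_pagefaulttrace() at the bottom of this file)
 * and makes do_page_fault() log every fault; exception_trace controls
 * whether unhandled user segfaults are logged.
 */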
int page_fault_trace = 0;
int exception_trace = 1;
/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	const struct exception_table_entry *fixup;
	int write;
	unsigned long flags;
	siginfo_t info;

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	__asm__("movq %%cr2,%0":"=r" (address));

	info.si_code = SEGV_MAPERR;
	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
						SIGSEGV) == NOTIFY_STOP)
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
					SIGSEGV) == NOTIFY_STOP)
		return;

	if (likely(regs->eflags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(page_fault_trace))
		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
		       regs->rip, regs->rsp, regs->cs, regs->ss, address, error_code);

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;
again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->rip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}
	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & 4) {
		/* Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work.
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:		/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
			goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case VM_FAULT_MINOR:
		tsk->min_flt++;
		break;
	case VM_FAULT_MAJOR:
		tsk->maj_flt++;
		break;
	case VM_FAULT_SIGBUS:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	up_read(&mm->mmap_sem);
	return;
/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100: K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable. Just detect this
		   case and return.  Any code segment in the LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
			printk(
		       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
					tsk->comm, tsk->pid, address, regs->rip,
					regs->rsp, error_code);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void __user *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}
no_context:
	/* Are we prepared to handle this kernel fault? */
	fixup = search_exception_tables(regs->rip);
	if (fixup) {
		regs->rip = fixup->fixup;
		return;
	}

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->rip);
	printk("\n");
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Oops", regs, error_code);
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags);
	do_exit(SIGKILL);
/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (current->pid == 1) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & 4)
		do_exit(SIGKILL);
	goto no_context;
do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void __user *)address;
	force_sig_info(SIGBUS, &info, tsk);
	return;
}
DEFINE_SPINLOCK(pgd_lock);
struct page *pgd_list;
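/*
 * Sync the vmalloc part of the kernel page tables from init_mm into
 * every process PGD on pgd_list.  register_page_fault_notifier() calls
 * this so that a registered notifier can never itself trigger a vmalloc
 * fault and recurse into do_page_fault().
 */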
void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			for (page = pgd_list; page;
			     page = (struct page *)page->index) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
				(__START_KERNEL & PGDIR_MASK)));
}
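/* Boot option "pagefaulttrace": turn on the verbose fault logging above. */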
static int __init enable_pagefaulttrace(char *str)
{
	page_fault_trace = 1;
	return 1;
}
__setup("pagefaulttrace", enable_pagefaulttrace);