x86: do_page_fault small unification
arch/x86/mm/fault_64.c  (linux-2.6/x86.git, blob c6b3ad515cf12c5e071c0e2761fd049db4be914f)
/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
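
/*
 * Illustrative sketch, not part of the original file: how the PF_* bits
 * above combine into the hardware error code.  A user-mode write to a
 * not-present page arrives as error_code == (PF_USER|PF_WRITE) == 0x6,
 * while an NX violation on a user instruction fetch arrives as
 * (PF_PROT|PF_USER|PF_INSTR) == 0x15.  The helper name below is ours,
 * purely for illustration; the real handler tests the bits directly.
 */
static inline void example_describe_error_code(unsigned long error_code)
{
	printk(KERN_DEBUG "fault: %s %s in %s mode%s%s\n",
	       (error_code & PF_PROT)  ? "protection violation" : "page not present",
	       (error_code & PF_WRITE) ? "on write" : "on read",
	       (error_code & PF_USER)  ? "user" : "kernel",
	       (error_code & PF_RSVD)  ? ", reserved bit set" : "",
	       (error_code & PF_INSTR) ? ", instruction fetch" : "");
}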

static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

#ifdef CONFIG_X86_32
	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
		     boot_cpu_data.x86 >= 6)) {
		/* Catch an obscure case of prefetch inside an NX page. */
		if (nx_enabled && (error_code & PF_INSTR))
			return 0;
	} else {
		return 0;
	}
#else
	/* If it was an exec fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;
#endif

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
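
/*
 * Illustrative walk-through, not part of the original file: for the SSE
 * instruction "prefetchnta (%rax)" the bytes are 0x0f 0x18 0x00.  The
 * scanner above reads 0x0f (instr_hi 0x00, instr_lo 0x0f), takes the 0x00
 * case, peeks at the following byte (0x18) and sets prefetch = 1, so a
 * spurious fault reported on that prefetch is silently ignored.
 */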

static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}

void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}
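
/*
 * Illustrative note, not part of the original file: the routine above prints
 * one entry per level and stops at the first level that is not present, e.g.
 * (hypothetical values) "PGD 203067 PUD 203063 PMD 0" for an address whose
 * PMD entry is clear, or ends with "BAD" if a table entry could not be read.
 */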

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Work around K8 erratum #93 and buggy BIOSes.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64-bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel space here. */

static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
	return 0;
}
#endif
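
/*
 * Illustrative example, not part of the original file (addresses are
 * hypothetical): if the kernel faults at RIP 0x80212345 while the kernel
 * text spans 0xffffffff80200000-0xffffffff80400000, the check above ORs the
 * upper 32 bits back in, rewrites regs->ip to 0xffffffff80212345 and
 * resumes, instead of oopsing on the truncated RIP left behind by a buggy
 * SMM BIOS.
 */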

static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}

/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}
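
/*
 * Illustrative note, not part of the original file: the typical hit here is
 * a task whose pgd predates a recent vmalloc()/ioremap() mapping.  The
 * missing top-level entry is copied from the init_mm reference page table
 * and the faulting instruction is simply restarted; only the top level may
 * legitimately differ, which is why mismatches further down are BUG()s.
 */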

int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, fault;
	unsigned long flags;
	int si_code;

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;

	if (notify_page_fault(regs))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space,
	 * (error_code & PF_USER) == 0, and that the fault was not a
	 * protection or reserved-bit error, (error_code & (PF_PROT|PF_RSVD)) == 0.
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;

again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/* Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work.
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}
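
	/*
	 * Illustrative summary, not part of the original file: the two bits
	 * tested by the switch above decode as
	 *   0                 read,  not present - needs VM_READ, VM_WRITE or VM_EXEC
	 *   PF_PROT           read,  present     - always a bad area
	 *   PF_WRITE          write, not present - needs VM_WRITE
	 *   PF_PROT|PF_WRITE  write, present     - needs VM_WRITE
	 */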

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100: K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable.  Just detect this
		   case and return.  Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
				tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
				tsk->comm, tsk->pid, address, regs->ip,
				regs->sp, error_code);
		}
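
		/*
		 * Illustrative note, not part of the original file (values are
		 * hypothetical): the ratelimited printk above produces a log
		 * line such as
		 *   a.out[1523]: segfault at 0 ip 400562 sp 7fff1c0a2d40 error 4
		 * where "error 4" is PF_USER alone, i.e. a user-mode read of a
		 * not-present page.
		 */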

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;

		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

no_context:
	/* Are we prepared to handle this kernel fault?  */
	if (fixup_exception(regs))
		return;

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */
	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->ip, regs->bp);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(current)) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
	return;
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
}