x86: cosmetic fixes fault_{32|64}.c
arch/x86/mm/fault_64.c
/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
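
/*
 * For example, a user-mode write to a not-present page arrives with
 * error_code == (PF_USER | PF_WRITE): bit 0 clear (no page found),
 * bit 1 set (write), bit 2 set (user mode).
 */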

static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}
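
/*
 * Note: 14 is the x86 page-fault exception vector, which is why
 * kprobe_fault_handler() is called with trapnr 14 above and why trap_no
 * is set to 14 in the fault paths below.
 */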

/* Sometimes the CPU reports invalid exceptions on prefetch.
   Check that here and ignore.
   Opcode checker based on code by Richard Brunner */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
				unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/* If it was an exec fault ignore */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}

void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}

static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here. */

static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
	return 0;
}
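
/*
 * Illustrative case (hypothetical addresses): if SMM corruption turns a
 * kernel-text RIP such as 0xffffffff80212345 into 0x0000000080212345, the
 * fault address equals the truncated RIP and its upper 32 bits are zero;
 * OR-ing 0xffffffff00000000 back in lands inside _stext.._etext, so the
 * function patches regs->ip and reports the fault as handled.
 */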

static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}

/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}
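
/*
 * Note: vmalloc_fault() returns 0 once the missing kernel mapping has been
 * copied from the reference page table (init_mm), and -1 when init_mm has
 * no mapping for the address either, in which case the caller falls through
 * to normal fault handling.
 */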

int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, fault;
	unsigned long flags;
	siginfo_t info;

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	info.si_code = SEGV_MAPERR;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
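	/*
	 * In terms of the bit names above: 4 is PF_USER and 9 is
	 * PF_RSVD | PF_PROT, so only kernel-mode, non-protection,
	 * non-reserved-bit faults take the vmalloc path below.
	 */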
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		if (notify_page_fault(regs))
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (notify_page_fault(regs))
		return;

	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;

again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/* Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work.
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
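	/*
	 * The slack allowed above is a conservative bound: 'enter' takes a
	 * 16-bit frame size (up to 65535 bytes) and a nesting level of up to
	 * 31, each level pushing one more saved frame pointer, hence the
	 * 65536 + 32 * sizeof(unsigned long) of room below the stack pointer.
	 */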
	if (expand_stack(vma, address))
		goto bad_area;
	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
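	/*
	 * VM_FAULT_MAJOR means the fault needed I/O (e.g. reading the page
	 * from disk) and is charged to maj_flt; anything satisfied without
	 * I/O counts as a minor fault in min_flt.
	 */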
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;
	up_read(&mm->mmap_sem);
	return;

	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100: the K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable.  Just detect this
		   case and return.  Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;
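		/*
		 * Note on the check above: bit 2 of a segment selector is
		 * the Table Indicator, so (regs->cs & (1<<2)) is true for
		 * any code segment loaded from the LDT.
		 */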

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
		       "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
					tsk->comm, tsk->pid, address, regs->ip,
					regs->sp, error_code);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void __user *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}

no_context:
	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */

	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->ip);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);

	/*
	 * We ran out of memory, or some other thing happened to us that made
	 * us unable to handle the page fault gracefully.
	 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(current)) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void __user *)address;
	force_sig_info(SIGBUS, &info, tsk);
	return;
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
}