x86: simplify __change_page_attr()
[linux-2.6.git] / arch/x86/mm/fault_64.c
/*
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>
/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
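/*
 * Example of how the bits combine: a user-mode write to a present but
 * read-only page arrives with error_code = PF_PROT|PF_WRITE|PF_USER (0x7),
 * while a user-mode read of an unmapped page arrives with just PF_USER (0x4).
 */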
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
        int ret = 0;

        /* kprobe_running() needs smp_processor_id() */
        if (!user_mode(regs)) {
                preempt_disable();
                if (kprobe_running() && kprobe_fault_handler(regs, 14))
                        ret = 1;
                preempt_enable();
        }

        return ret;
#else
        return 0;
#endif
}
/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
                       unsigned long error_code)
{
        unsigned char *instr;
        int scan_more = 1;
        int prefetch = 0;
        unsigned char *max_instr;

#ifdef CONFIG_X86_32
        if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
                     boot_cpu_data.x86 >= 6)) {
                /* Catch an obscure case of prefetch inside an NX page. */
                if (nx_enabled && (error_code & PF_INSTR))
                        return 0;
        } else {
                return 0;
        }
#else
        /* If it was an exec fault, ignore it. */
        if (error_code & PF_INSTR)
                return 0;
#endif
        instr = (unsigned char *)convert_ip_to_linear(current, regs);
        max_instr = instr + 15;

        if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
                return 0;
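        /*
         * An x86 instruction is at most 15 bytes long, so the prefix scan
         * below never needs to look further than that past the faulting ip.
         */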
        while (scan_more && instr < max_instr) {
                unsigned char opcode;
                unsigned char instr_hi;
                unsigned char instr_lo;

                if (probe_kernel_address(instr, opcode))
                        break;

                instr_hi = opcode & 0xf0;
                instr_lo = opcode & 0x0f;
                instr++;

                switch (instr_hi) {
                case 0x20:
                case 0x30:
                        /*
                         * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
                         * In X86_64 long mode, the CPU will signal invalid
                         * opcode if some of these prefixes are present so
                         * X86_64 will never get here anyway
                         */
                        scan_more = ((instr_lo & 7) == 0x6);
                        break;
#ifdef CONFIG_X86_64
                case 0x40:
                        /*
                         * In AMD64 long mode 0x40..0x4F are valid REX prefixes
                         * Need to figure out under what instruction mode the
                         * instruction was issued. Could check the LDT for lm,
                         * but for now it's good enough to assume that long
                         * mode only uses well known segments or kernel.
                         */
                        scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
                        break;
#endif
                case 0x60:
                        /* 0x64 thru 0x67 are valid prefixes in all modes. */
                        scan_more = (instr_lo & 0xC) == 0x4;
                        break;
                case 0xF0:
                        /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
                        scan_more = !instr_lo || (instr_lo>>1) == 1;
                        break;
                case 0x00:
                        /* Prefetch instruction is 0x0F0D or 0x0F18 */
                        scan_more = 0;

                        if (probe_kernel_address(instr, opcode))
                                break;
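                        /*
                         * Second opcode byte: 0x0F 0x0D is AMD's 3DNow!
                         * PREFETCH/PREFETCHW, 0x0F 0x18 is the SSE
                         * PREFETCHNTA/T0/T1/T2 group.
                         */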
                        prefetch = (instr_lo == 0xF) &&
                                (opcode == 0x0D || opcode == 0x18);
                        break;
                default:
                        scan_more = 0;
                        break;
                }
        }
        return prefetch;
}
static void force_sig_info_fault(int si_signo, int si_code,
                                 unsigned long address, struct task_struct *tsk)
{
        siginfo_t info;

        info.si_signo = si_signo;
        info.si_errno = 0;
        info.si_code = si_code;
        info.si_addr = (void __user *)address;
        force_sig_info(si_signo, &info, tsk);
}
static int bad_address(void *p)
{
        unsigned long dummy;
        return probe_kernel_address((unsigned long *)p, dummy);
}
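/*
 * Walk the four-level page table for the given address, starting from the
 * pgd in CR3, and print each level's entry; the walk stops early at a
 * missing entry or at a large-page mapping.
 */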
void dump_pagetable(unsigned long address)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = (pgd_t *)read_cr3();

        pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
        pgd += pgd_index(address);
        if (bad_address(pgd))
                goto bad;
        printk("PGD %lx ", pgd_val(*pgd));
        if (!pgd_present(*pgd))
                goto ret;

        pud = pud_offset(pgd, address);
        if (bad_address(pud))
                goto bad;
        printk("PUD %lx ", pud_val(*pud));
        if (!pud_present(*pud))
                goto ret;

        pmd = pmd_offset(pud, address);
        if (bad_address(pmd))
                goto bad;
        printk("PMD %lx ", pmd_val(*pmd));
        if (!pmd_present(*pmd) || pmd_large(*pmd))
                goto ret;

        pte = pte_offset_kernel(pmd, address);
        if (bad_address(pte))
                goto bad;
        printk("PTE %lx", pte_val(*pte));
ret:
        printk("\n");
        return;
bad:
        printk("BAD\n");
}
#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif
/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in the kernel here.
   Does nothing for X86_32. */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
        static int warned;
        if (address != regs->ip)
                return 0;
        if ((address >> 32) != 0)
                return 0;
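        /*
         * The erratum clears the upper 32 bits of RIP; put them back and
         * check whether the result lands in kernel text or the module area.
         */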
        address |= 0xffffffffUL << 32;
        if ((address >= (u64)_stext && address <= (u64)_etext) ||
            (address >= MODULES_VADDR && address <= MODULES_END)) {
                if (!warned) {
                        printk(errata93_warning);
                        warned = 1;
                }
                regs->ip = address;
                return 1;
        }
#endif
        return 0;
}
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
                                 unsigned long error_code)
{
        unsigned long flags = oops_begin();
        struct task_struct *tsk;

        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
               current->comm, address);
        dump_pagetable(address);
        tsk = current;
        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        if (__die("Bad pagetable", regs, error_code))
                regs = NULL;
        oops_end(flags, regs, SIGKILL);
}
/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
        unsigned long pgd_paddr;
        pmd_t *pmd_k;
        pte_t *pte_k;
        /*
         * Synchronize this task's top level page-table
         * with the 'reference' page table.
         *
         * Do _not_ use "current" here. We might be inside
         * an interrupt in the middle of a task switch..
         */
        pgd_paddr = read_cr3();
        pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
        if (!pmd_k)
                return -1;
        pte_k = pte_offset_kernel(pmd_k, address);
        if (!pte_present(*pte_k))
                return -1;
        return 0;
#else
        pgd_t *pgd, *pgd_ref;
        pud_t *pud, *pud_ref;
        pmd_t *pmd, *pmd_ref;
        pte_t *pte, *pte_ref;

        /* Copy kernel mappings over when needed. This can also
           happen within a race in page table update. In the latter
           case just flush. */
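        /*
         * Per-process pgds get a copy of the kernel half of init_mm's pgd
         * when they are created, so a vmalloc mapping added afterwards can
         * leave a hole at the top level; only that pgd entry may
         * legitimately be missing here and is synced from the reference
         * table below.
         */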
        pgd = pgd_offset(current->mm ?: &init_mm, address);
        pgd_ref = pgd_offset_k(address);
        if (pgd_none(*pgd_ref))
                return -1;
        if (pgd_none(*pgd))
                set_pgd(pgd, *pgd_ref);
        else
                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

        /* Below here mismatches are bugs because these lower tables
           are shared */

        pud = pud_offset(pgd, address);
        pud_ref = pud_offset(pgd_ref, address);
        if (pud_none(*pud_ref))
                return -1;
        if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
                BUG();
        pmd = pmd_offset(pud, address);
        pmd_ref = pmd_offset(pud_ref, address);
        if (pmd_none(*pmd_ref))
                return -1;
        if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
                BUG();
        pte_ref = pte_offset_kernel(pmd_ref, address);
        if (!pte_present(*pte_ref))
                return -1;
        pte = pte_offset_kernel(pmd, address);
        /* Don't use pte_page here, because the mappings can point
           outside mem_map, and the NUMA hash lookup cannot handle
           that. */
        if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
                BUG();
        return 0;
#endif
}
int show_unhandled_signals = 1;
/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
                                        unsigned long error_code)
{
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        unsigned long address;
        int write, fault;
        unsigned long flags;
        int si_code;

        /*
         * We can fault from pretty much anywhere, with unknown IRQ state.
         */
        trace_hardirqs_fixup();

        tsk = current;
        mm = tsk->mm;
        prefetchw(&mm->mmap_sem);

        /* get the address */
        address = read_cr2();

        si_code = SEGV_MAPERR;

        if (notify_page_fault(regs))
                return;
        /*
         * We fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         *
         * This verifies that the fault happens in kernel space
         * (error_code & 4) == 0, and that the fault was not a
         * protection error (error_code & 9) == 0.
         */
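        /* In PF_* terms: (error_code & PF_USER) == 0 and
           (error_code & (PF_PROT|PF_RSVD)) == 0. */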
        if (unlikely(address >= TASK_SIZE64)) {
                /*
                 * Don't check for the module range here: its PML4
                 * is always initialized because it's shared with the main
                 * kernel text. Only vmalloc may need PML4 syncups.
                 */
                if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
                    ((address >= VMALLOC_START && address < VMALLOC_END))) {
                        if (vmalloc_fault(address) >= 0)
                                return;
                }
                /*
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock.
                 */
                goto bad_area_nosemaphore;
        }
        if (likely(regs->flags & X86_EFLAGS_IF))
                local_irq_enable();

        if (unlikely(error_code & PF_RSVD))
                pgtable_bad(address, regs, error_code);

        /*
         * If we're in an interrupt, have no user context or are running in an
         * atomic region then we must not take the fault.
         */
        if (unlikely(in_atomic() || !mm))
                goto bad_area_nosemaphore;

        /*
         * User-mode registers count as a user access even for any
         * potential system fault or CPU buglet.
         */
        if (user_mode_vm(regs))
                error_code |= PF_USER;
again:
        /*
         * When running in the kernel we expect faults to occur only to
         * addresses in user space. All other faults represent errors in the
         * kernel and should generate an OOPS. Unfortunately, in the case of an
         * erroneous fault occurring in a code path which already holds mmap_sem
         * we will deadlock attempting to validate the fault against the
         * address space. Luckily the kernel only validly references user
         * space from well defined areas of code, which are listed in the
         * exceptions table.
         *
         * As the vast majority of faults will be valid we will only perform
         * the source reference check when there is a possibility of a deadlock.
         * Attempt to lock the address space, if we cannot we then validate the
         * source. If this is invalid we can skip the address space check,
         * thus avoiding the deadlock.
         */
        if (!down_read_trylock(&mm->mmap_sem)) {
                if ((error_code & PF_USER) == 0 &&
                    !search_exception_tables(regs->ip))
                        goto bad_area_nosemaphore;
                down_read(&mm->mmap_sem);
        }
        vma = find_vma(mm, address);
        if (!vma)
                goto bad_area;
        if (likely(vma->vm_start <= address))
                goto good_area;
        if (!(vma->vm_flags & VM_GROWSDOWN))
                goto bad_area;
        if (error_code & PF_USER) {
                /*
                 * Accessing the stack below %sp is always a bug.
                 * The large cushion allows instructions like enter
                 * and pusha to work. ("enter $65535,$31" pushes
                 * 32 pointers and then decrements %sp by 65535.)
                 */
                if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
                        goto bad_area;
        }
        if (expand_stack(vma, address))
                goto bad_area;
        /*
         * Ok, we have a good vm_area for this memory access, so
         * we can handle it..
         */
good_area:
        si_code = SEGV_ACCERR;
        write = 0;
        switch (error_code & (PF_PROT|PF_WRITE)) {
        default:	/* 3: write, present */
                /* fall through */
        case PF_WRITE:		/* write, not present */
                if (!(vma->vm_flags & VM_WRITE))
                        goto bad_area;
                write++;
                break;
        case PF_PROT:		/* read, present */
                goto bad_area;
        case 0:			/* read, not present */
                if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
                        goto bad_area;
        }
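        /*
         * Example: a user-mode write to a read-only mapping arrives as
         * PF_PROT|PF_WRITE (or just PF_WRITE if the page is not present),
         * fails the VM_WRITE test above and is reported below as a SIGSEGV
         * with si_code SEGV_ACCERR.
         */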
        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.
         */
        fault = handle_mm_fault(mm, vma, address, write);
        if (unlikely(fault & VM_FAULT_ERROR)) {
                if (fault & VM_FAULT_OOM)
                        goto out_of_memory;
                else if (fault & VM_FAULT_SIGBUS)
                        goto do_sigbus;
                BUG();
        }
        if (fault & VM_FAULT_MAJOR)
                tsk->maj_flt++;
        else
                tsk->min_flt++;
#ifdef CONFIG_X86_32
        /*
         * Did it hit the DOS screen memory VA from vm86 mode?
         */
        if (v8086_mode(regs)) {
                unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
                if (bit < 32)
                        tsk->thread.screen_bitmap |= 1 << bit;
        }
#endif
        up_read(&mm->mmap_sem);
        return;
        /*
         * Something tried to access memory that isn't in our memory map..
         * Fix it, but check if it's kernel or user first..
         */
bad_area:
        up_read(&mm->mmap_sem);

bad_area_nosemaphore:
        /* User mode accesses just cause a SIGSEGV */
        if (error_code & PF_USER) {
                /*
                 * It's possible to have interrupts off here.
                 */
                local_irq_enable();

                if (is_prefetch(regs, address, error_code))
                        return;

                /* Work around K8 erratum #100: K8 in compat mode
                   occasionally jumps to illegal addresses >4GB. We
                   catch this here in the page fault handler because
                   these addresses are not reachable. Just detect this
                   case and return. Any code segment in LDT is
                   compatibility mode. */
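                /* Selector bit 2 is the Table Indicator: set means the code
                   segment comes from the LDT. */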
                if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
                    (address >> 32))
                        return;
                if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
                    printk_ratelimit()) {
                        printk(
#ifdef CONFIG_X86_32
                        "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
#else
                        "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
#endif
                        task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
                        tsk->comm, task_pid_nr(tsk), address, regs->ip,
                        regs->sp, error_code);
                        print_vma_addr(" in ", regs->ip);
                        printk("\n");
                }

                tsk->thread.cr2 = address;
                /* Kernel addresses are always protection faults */
                tsk->thread.error_code = error_code | (address >= TASK_SIZE);
                tsk->thread.trap_no = 14;

                force_sig_info_fault(SIGSEGV, si_code, address, tsk);
                return;
        }
no_context:
        /* Are we prepared to handle this kernel fault? */
        if (fixup_exception(regs))
                return;

        /*
         * Hall of shame of CPU/BIOS bugs.
         */
        if (is_prefetch(regs, address, error_code))
                return;

        if (is_errata93(regs, address))
                return;
        /*
         * Oops. The kernel tried to access some bad page. We'll have to
         * terminate things with extreme prejudice.
         */

        flags = oops_begin();

        if (address < PAGE_SIZE)
                printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
        else
                printk(KERN_ALERT "Unable to handle kernel paging request");
        printk(" at %016lx RIP: \n" KERN_ALERT, address);
        printk_address(regs->ip, 1);
        dump_pagetable(address);
        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        if (__die("Oops", regs, error_code))
                regs = NULL;
        /* Executive summary in case the body of the oops scrolled away */
        printk(KERN_EMERG "CR2: %016lx\n", address);
        oops_end(flags, regs, SIGKILL);
        /*
         * We ran out of memory, or some other thing happened to us that made
         * us unable to handle the page fault gracefully.
         */
out_of_memory:
        up_read(&mm->mmap_sem);
        if (is_global_init(current)) {
                yield();
                goto again;
        }
        printk("VM: killing process %s\n", tsk->comm);
        if (error_code & PF_USER)
                do_group_exit(SIGKILL);
        goto no_context;
do_sigbus:
        up_read(&mm->mmap_sem);

        /* Kernel mode? Handle exceptions or die */
        if (!(error_code & PF_USER))
                goto no_context;

        tsk->thread.cr2 = address;
        tsk->thread.error_code = error_code;
        tsk->thread.trap_no = 14;
        force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
        return;
}
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);
void vmalloc_sync_all(void)
{
        /*
         * Note that races in the updates of insync and start aren't
         * problematic: insync can only get set bits added, and updates to
         * start are only improving performance (without affecting correctness
         * if undone).
         */
        static DECLARE_BITMAP(insync, PTRS_PER_PGD);
        static unsigned long start = VMALLOC_START & PGDIR_MASK;
        unsigned long address;
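        /*
         * Walk the vmalloc range one pgd entry at a time; any entry that is
         * present in the reference (init_mm) table but missing from a pgd on
         * pgd_list is copied over, so every process sees the mapping.
         */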
        for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
                if (!test_bit(pgd_index(address), insync)) {
                        const pgd_t *pgd_ref = pgd_offset_k(address);
                        struct page *page;

                        if (pgd_none(*pgd_ref))
                                continue;
                        spin_lock(&pgd_lock);
                        list_for_each_entry(page, &pgd_list, lru) {
                                pgd_t *pgd;
                                pgd = (pgd_t *)page_address(page) + pgd_index(address);
                                if (pgd_none(*pgd))
                                        set_pgd(pgd, *pgd_ref);
                                else
                                        BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
                        }
                        spin_unlock(&pgd_lock);
                        set_bit(pgd_index(address), insync);
                }
                if (address == start)
                        start = address + PGDIR_SIZE;
        }
        /* Check that there is no need to do the same for the modules area. */
        BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
        BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
                        (__START_KERNEL & PGDIR_MASK)));
}