[PATCH] pidspace: is_init()
[linux-2.6/linux-acpi-2.6/ibm-acpi-2.6.git] arch/x86_64/mm/fault.c
blob 3751b4788e288748112c05df0121a1186aac5fab
/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>              /* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/kprobes.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm/kdebug.h>
#include <asm-generic/sections.h>
/* Page fault error code bits */
#define PF_PROT         (1<<0)          /* or no page found */
#define PF_WRITE        (1<<1)
#define PF_USER         (1<<2)
#define PF_RSVD         (1<<3)
#define PF_INSTR        (1<<4)
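/*
 * These match the hardware error code the CPU pushes on a page fault:
 * bit 0 set means a protection violation (clear means the page was simply
 * not present), bit 1 a write access, bit 2 a fault taken in user mode,
 * bit 3 a reserved bit set in a paging-structure entry, and bit 4 an
 * instruction fetch.
 */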

static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);

/* Hook to register for page fault notifications */
int register_page_fault_notifier(struct notifier_block *nb)
{
        vmalloc_sync_all();
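        /*
         * Syncing the kernel page tables here presumably guarantees that a
         * notifier living in module/vmalloc space will not itself take an
         * unsynced vmalloc fault once it can be called from the fault path.
         */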
        return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
}
EXPORT_SYMBOL_GPL(register_page_fault_notifier);

int unregister_page_fault_notifier(struct notifier_block *nb)
{
        return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
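
/*
 * Illustrative sketch of a hypothetical client (not part of this file): a
 * registered notifier receives the struct die_args that notify_page_fault()
 * builds below.
 *
 *      static int my_pf_notify(struct notifier_block *self,
 *                              unsigned long val, void *data)
 *      {
 *              struct die_args *args = data;
 *              if (val == DIE_PAGE_FAULT)
 *                      printk("fault at rip %lx\n", args->regs->rip);
 *              return NOTIFY_DONE;
 *      }
 *
 *      static struct notifier_block my_pf_nb = {
 *              .notifier_call = my_pf_notify,
 *      };
 *      register_page_fault_notifier(&my_pf_nb);
 */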

static inline int notify_page_fault(enum die_val val, const char *str,
                        struct pt_regs *regs, long err, int trap, int sig)
{
        struct die_args args = {
                .regs = regs,
                .str = str,
                .err = err,
                .trapnr = trap,
                .signr = sig
        };
        return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
}
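
/*
 * bust_spinlocks(1) marks an oops as in progress so printk() stops worrying
 * about console locking; bust_spinlocks(0) unblanks the screen, clears the
 * flag and re-issues a printk() at high loglevel so klogd gets poked and the
 * message actually reaches the console.
 */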
void bust_spinlocks(int yes)
{
        int loglevel_save = console_loglevel;

        if (yes) {
                oops_in_progress = 1;
        } else {
#ifdef CONFIG_VT
                unblank_screen();
#endif
                oops_in_progress = 0;
                /*
                 * OK, the message is on the console.  Now we call printk()
                 * without oops_in_progress set so that printk will give klogd
                 * a poke.  Hold onto your hats...
                 */
                console_loglevel = 15;  /* NMI oopser may have shut the console up */
                printk(" ");
                console_loglevel = loglevel_save;
        }
}

/* Sometimes the CPU reports invalid exceptions on prefetch.
   Check that here and ignore it.
   Opcode checker based on code by Richard Brunner */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
                                unsigned long error_code)
{
        unsigned char __user *instr;
        int scan_more = 1;
        int prefetch = 0;
        unsigned char *max_instr;

        /* If it was an exec fault, ignore it. */
        if (error_code & PF_INSTR)
                return 0;

        instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
        max_instr = instr + 15;

        if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
                return 0;

        while (scan_more && instr < max_instr) {
                unsigned char opcode;
                unsigned char instr_hi;
                unsigned char instr_lo;

                if (__get_user(opcode, (char __user *)instr))
                        break;

                instr_hi = opcode & 0xf0;
                instr_lo = opcode & 0x0f;
                instr++;

                switch (instr_hi) {
                case 0x20:
                case 0x30:
                        /* Values 0x26,0x2E,0x36,0x3E are valid x86
                           prefixes.  In long mode, the CPU will signal
                           invalid opcode if some of these prefixes are
                           present, so we will never get here anyway. */
                        scan_more = ((instr_lo & 7) == 0x6);
                        break;

                case 0x40:
                        /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes.
                           Need to figure out under what instruction mode the
                           instruction was issued ... */
                        /* Could check the LDT for lm, but for now it's good
                           enough to assume that long mode only uses well-known
                           segments or the kernel. */
                        scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
                        break;

                case 0x60:
                        /* 0x64 thru 0x67 are valid prefixes in all modes. */
                        scan_more = (instr_lo & 0xC) == 0x4;
                        break;
                case 0xF0:
                        /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
                        scan_more = !instr_lo || (instr_lo>>1) == 1;
                        break;
                case 0x00:
                        /* Prefetch instruction is 0x0F0D or 0x0F18 */
                        scan_more = 0;
                        if (__get_user(opcode, (char __user *)instr))
                                break;
                        prefetch = (instr_lo == 0xF) &&
                                (opcode == 0x0D || opcode == 0x18);
                        break;
                default:
                        scan_more = 0;
                        break;
                }
        }
        return prefetch;
}

static int bad_address(void *p)
{
        unsigned long dummy;
        return __get_user(dummy, (unsigned long __user *)p);
}
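
/*
 * Walk and print the page-table entries (PGD/PUD/PMD/PTE) that map the given
 * address, reading the current CR3 directly; used from the oops paths below
 * so the fault report shows how far the translation got.
 */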
void dump_pagetable(unsigned long address)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        asm("movq %%cr3,%0" : "=r" (pgd));

        pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
        pgd += pgd_index(address);
        if (bad_address(pgd)) goto bad;
        printk("PGD %lx ", pgd_val(*pgd));
        if (!pgd_present(*pgd)) goto ret;

        pud = pud_offset(pgd, address);
        if (bad_address(pud)) goto bad;
        printk("PUD %lx ", pud_val(*pud));
        if (!pud_present(*pud)) goto ret;

        pmd = pmd_offset(pud, address);
        if (bad_address(pmd)) goto bad;
        printk("PMD %lx ", pmd_val(*pmd));
        if (!pmd_present(*pmd)) goto ret;

        pte = pte_offset_kernel(pmd, address);
        if (bad_address(pte)) goto bad;
        printk("PTE %lx", pte_val(*pte));
ret:
        printk("\n");
        return;
bad:
        printk("BAD\n");
}

static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 and buggy BIOSes.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64-bit RIP register on C-stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in the kernel here. */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
        static int warned;

        if (address != regs->rip)
                return 0;
        if ((address >> 32) != 0)
                return 0;
        address |= 0xffffffffUL << 32;
        if ((address >= (u64)_stext && address <= (u64)_etext) ||
            (address >= MODULES_VADDR && address <= MODULES_END)) {
                if (!warned) {
                        printk(errata93_warning);
                        warned = 1;
                }
                regs->rip = address;
                return 1;
        }
        return 0;
}
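
/*
 * Decide whether a signal would go unhandled and is therefore worth
 * logging: init always counts, ptraced tasks never do, and otherwise a
 * signal is "unhandled" when its disposition is SIG_IGN or SIG_DFL.
 */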
int unhandled_signal(struct task_struct *tsk, int sig)
{
        if (is_init(tsk))
                return 1;
        if (tsk->ptrace & PT_PTRACED)
                return 0;
        return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
                (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
}
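
/*
 * A fault with the reserved bit set in the error code means a page-table
 * entry itself is corrupted; dump the page tables and kill the offending
 * task, since there is no sensible way to recover.
 */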
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
                                 unsigned long error_code)
{
        unsigned long flags = oops_begin();
        struct task_struct *tsk;

        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
               current->comm, address);
        dump_pagetable(address);
        tsk = current;
        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        __die("Bad pagetable", regs, error_code);
        oops_end(flags);
        do_exit(SIGKILL);
}

/*
 * Handle a fault on the vmalloc area.
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
        pgd_t *pgd, *pgd_ref;
        pud_t *pud, *pud_ref;
        pmd_t *pmd, *pmd_ref;
        pte_t *pte, *pte_ref;

        /* Copy kernel mappings over when needed. This can also
           happen within a race in page table update. In the latter
           case just flush. */

        pgd = pgd_offset(current->mm ?: &init_mm, address);
        pgd_ref = pgd_offset_k(address);
        if (pgd_none(*pgd_ref))
                return -1;
        if (pgd_none(*pgd))
                set_pgd(pgd, *pgd_ref);
        else
                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

        /* Below here mismatches are bugs because these lower tables
           are shared. */

        pud = pud_offset(pgd, address);
        pud_ref = pud_offset(pgd_ref, address);
        if (pud_none(*pud_ref))
                return -1;
        if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
                BUG();
        pmd = pmd_offset(pud, address);
        pmd_ref = pmd_offset(pud_ref, address);
        if (pmd_none(*pmd_ref))
                return -1;
        if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
                BUG();
        pte_ref = pte_offset_kernel(pmd_ref, address);
        if (!pte_present(*pte_ref))
                return -1;
        pte = pte_offset_kernel(pmd, address);

        /* Don't use pte_page here, because the mappings can point
           outside mem_map, and the NUMA hash lookup cannot handle
           that. */
        if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
                BUG();

        return 0;
}
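
/*
 * page_fault_trace is switched on by the "pagefaulttrace" boot parameter
 * (see the __setup() hook at the bottom of this file) and makes every fault
 * print a one-line trace; exception_trace controls whether unhandled
 * user-space SIGSEGVs are logged.
 */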
int page_fault_trace = 0;
int exception_trace = 1;

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
                                        unsigned long error_code)
{
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        unsigned long address;
        const struct exception_table_entry *fixup;
        int write;
        unsigned long flags;
        siginfo_t info;

        tsk = current;
        mm = tsk->mm;
        prefetchw(&mm->mmap_sem);

        /* get the address */
        __asm__("movq %%cr2,%0":"=r" (address));

        info.si_code = SEGV_MAPERR;

        /*
         * We fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         *
         * This verifies that the fault happens in kernel space
         * (error_code & 4) == 0, and that the fault was not a
         * protection error (error_code & 9) == 0.
         */
        if (unlikely(address >= TASK_SIZE64)) {
                /*
                 * Don't check for the module range here: its PML4
                 * is always initialized because it's shared with the main
                 * kernel text. Only vmalloc may need PML4 syncups.
                 */
                if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
                    ((address >= VMALLOC_START && address < VMALLOC_END))) {
                        if (vmalloc_fault(address) >= 0)
                                return;
                }
                if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs,
                                      error_code, 14, SIGSEGV) == NOTIFY_STOP)
                        return;
                /*
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock.
                 */
                goto bad_area_nosemaphore;
        }

        if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs,
                              error_code, 14, SIGSEGV) == NOTIFY_STOP)
                return;

        if (likely(regs->eflags & X86_EFLAGS_IF))
                local_irq_enable();

        if (unlikely(page_fault_trace))
                printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
                       regs->rip, regs->rsp, regs->cs, regs->ss, address, error_code);

        if (unlikely(error_code & PF_RSVD))
                pgtable_bad(address, regs, error_code);

        /*
         * If we're in an interrupt or have no user
         * context, we must not take the fault..
         */
        if (unlikely(in_atomic() || !mm))
                goto bad_area_nosemaphore;
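
        /*
         * The out_of_memory path below jumps back to this label after
         * yield()ing when the failing task is init, so the whole VMA lookup
         * is retried rather than killing init.
         */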
again:
        /* When running in the kernel we expect faults to occur only to
         * addresses in user space.  All other faults represent errors in the
         * kernel and should generate an OOPS.  Unfortunately, in the case of an
         * erroneous fault occurring in a code path which already holds mmap_sem
         * we will deadlock attempting to validate the fault against the
         * address space.  Luckily the kernel only validly references user
         * space from well defined areas of code, which are listed in the
         * exception tables.
         *
         * As the vast majority of faults will be valid we will only perform
         * the source reference check when there is a possibility of a deadlock.
         * Attempt to lock the address space; if we cannot, we then validate the
         * source.  If this is invalid we can skip the address space check,
         * thus avoiding the deadlock.
         */
        if (!down_read_trylock(&mm->mmap_sem)) {
                if ((error_code & PF_USER) == 0 &&
                    !search_exception_tables(regs->rip))
                        goto bad_area_nosemaphore;
                down_read(&mm->mmap_sem);
        }

        vma = find_vma(mm, address);
        if (!vma)
                goto bad_area;
        if (likely(vma->vm_start <= address))
                goto good_area;
        if (!(vma->vm_flags & VM_GROWSDOWN))
                goto bad_area;
        if (error_code & PF_USER) {
                /* Allow userspace just enough access below the stack pointer
                 * to let the 'enter' instruction work.
                 */
                if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
                        goto bad_area;
        }
        if (expand_stack(vma, address))
                goto bad_area;

/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
        info.si_code = SEGV_ACCERR;
        write = 0;
        switch (error_code & (PF_PROT|PF_WRITE)) {
        default:        /* 3: write, present */
                /* fall through */
        case PF_WRITE:  /* write, not present */
                if (!(vma->vm_flags & VM_WRITE))
                        goto bad_area;
                write++;
                break;
        case PF_PROT:   /* read, present */
                goto bad_area;
        case 0:         /* read, not present */
                if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
                        goto bad_area;
        }

        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.
         */
        switch (handle_mm_fault(mm, vma, address, write)) {
        case VM_FAULT_MINOR:
                tsk->min_flt++;
                break;
        case VM_FAULT_MAJOR:
                tsk->maj_flt++;
                break;
        case VM_FAULT_SIGBUS:
                goto do_sigbus;
        default:
                goto out_of_memory;
        }

        up_read(&mm->mmap_sem);
        return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
        up_read(&mm->mmap_sem);

bad_area_nosemaphore:
        /* User mode accesses just cause a SIGSEGV */
        if (error_code & PF_USER) {
                if (is_prefetch(regs, address, error_code))
                        return;

                /* Work around K8 erratum #100: a K8 in compat mode
                   occasionally jumps to illegal addresses >4GB.  We
                   catch this here in the page fault handler because
                   these addresses are not reachable.  Just detect this
                   case and return.  Any code segment in the LDT is
                   compatibility mode. */
                if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
                    (address >> 32))
                        return;

                if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
                        printk("%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
                               tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
                               tsk->comm, tsk->pid, address, regs->rip,
                               regs->rsp, error_code);
                }

                tsk->thread.cr2 = address;
                /* Kernel addresses are always protection faults */
                tsk->thread.error_code = error_code | (address >= TASK_SIZE);
                tsk->thread.trap_no = 14;
                info.si_signo = SIGSEGV;
                info.si_errno = 0;
                /* info.si_code has been set above */
                info.si_addr = (void __user *)address;
                force_sig_info(SIGSEGV, &info, tsk);
                return;
        }
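
        /*
         * From here on the fault was taken in kernel mode; this label is
         * also re-entered from the out_of_memory and do_sigbus paths below
         * when the fault did not originate in user space.
         */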
no_context:
        /* Are we prepared to handle this kernel fault? */
        fixup = search_exception_tables(regs->rip);
        if (fixup) {
                regs->rip = fixup->fixup;
                return;
        }

        /*
         * Hall of shame of CPU/BIOS bugs.
         */
        if (is_prefetch(regs, address, error_code))
                return;

        if (is_errata93(regs, address))
                return;

        /*
         * Oops. The kernel tried to access some bad page. We'll have to
         * terminate things with extreme prejudice.
         */
        flags = oops_begin();

        if (address < PAGE_SIZE)
                printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
        else
                printk(KERN_ALERT "Unable to handle kernel paging request");
        printk(" at %016lx RIP: \n" KERN_ALERT, address);
        printk_address(regs->rip);
        dump_pagetable(address);
        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        __die("Oops", regs, error_code);
        /* Executive summary in case the body of the oops scrolled away */
        printk(KERN_EMERG "CR2: %016lx\n", address);
        oops_end(flags);
        do_exit(SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
        up_read(&mm->mmap_sem);
        if (is_init(current)) {
                yield();
                goto again;
        }
        printk("VM: killing process %s\n", tsk->comm);
        if (error_code & PF_USER)
                do_exit(SIGKILL);
        goto no_context;

do_sigbus:
        up_read(&mm->mmap_sem);

        /* Kernel mode? Handle exceptions or die */
        if (!(error_code & PF_USER))
                goto no_context;

        tsk->thread.cr2 = address;
        tsk->thread.error_code = error_code;
        tsk->thread.trap_no = 14;
        info.si_signo = SIGBUS;
        info.si_errno = 0;
        info.si_code = BUS_ADRERR;
        info.si_addr = (void __user *)address;
        force_sig_info(SIGBUS, &info, tsk);
        return;
}

DEFINE_SPINLOCK(pgd_lock);
struct page *pgd_list;
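
/*
 * pgd_list chains together the PGD pages of all processes, linked through
 * page->index; vmalloc_sync_all() walks it so that a kernel PGD entry
 * created after a process's page table was set up gets copied into every
 * per-process PGD.
 */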

void vmalloc_sync_all(void)
{
        /* Note that races in the updates of insync and start aren't
           problematic:
           insync can only get set bits added, and updates to start are only
           improving performance (without affecting correctness if undone). */
        static DECLARE_BITMAP(insync, PTRS_PER_PGD);
        static unsigned long start = VMALLOC_START & PGDIR_MASK;
        unsigned long address;

        for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
                if (!test_bit(pgd_index(address), insync)) {
                        const pgd_t *pgd_ref = pgd_offset_k(address);
                        struct page *page;

                        if (pgd_none(*pgd_ref))
                                continue;
                        spin_lock(&pgd_lock);
                        for (page = pgd_list; page;
                             page = (struct page *)page->index) {
                                pgd_t *pgd;

                                pgd = (pgd_t *)page_address(page) + pgd_index(address);
                                if (pgd_none(*pgd))
                                        set_pgd(pgd, *pgd_ref);
                                else
                                        BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
                        }
                        spin_unlock(&pgd_lock);
                        set_bit(pgd_index(address), insync);
                }
                if (address == start)
                        start = address + PGDIR_SIZE;
        }
        /* Check that there is no need to do the same for the modules area. */
        BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
        BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
                       (__START_KERNEL & PGDIR_MASK)));
}
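
/*
 * Booting with "pagefaulttrace" on the kernel command line flips
 * page_fault_trace on, which makes do_page_fault() print a one-line trace
 * for every fault it handles.
 */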
static int __init enable_pagefaulttrace(char *str)
{
        page_fault_trace = 1;
        return 1;
}
__setup("pagefaulttrace", enable_pagefaulttrace);