/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */
6 #include <linux/signal.h>
7 #include <linux/sched.h>
8 #include <linux/kernel.h>
9 #include <linux/errno.h>
10 #include <linux/string.h>
11 #include <linux/types.h>
12 #include <linux/ptrace.h>
13 #include <linux/mman.h>
15 #include <linux/smp.h>
16 #include <linux/interrupt.h>
17 #include <linux/init.h>
18 #include <linux/tty.h>
19 #include <linux/vt_kern.h> /* For unblank_screen() */
20 #include <linux/compiler.h>
21 #include <linux/vmalloc.h>
22 #include <linux/module.h>
23 #include <linux/kprobes.h>
24 #include <linux/uaccess.h>
25 #include <linux/kdebug.h>
27 #include <asm/system.h>
28 #include <asm/pgalloc.h>
30 #include <asm/tlbflush.h>
31 #include <asm/proto.h>
32 #include <asm-generic/sections.h>
/*
 * Page fault error code bits (hardware error code pushed by the CPU
 * for #PF, as tested throughout this file):
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
/*
 * Give kprobes a chance to claim the fault before the normal page-fault
 * path runs.  Returns nonzero when a kprobe fault handler consumed the
 * fault (caller must then return without further processing).
 *
 * NOTE(review): the ret variable and preempt_disable/enable pairing were
 * reconstructed from the surviving fragments — confirm against upstream.
 */
static inline int notify_page_fault(struct pt_regs *regs)
{
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
}
69 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
70 * Check that here and ignore it.
73 * Sometimes the CPU reports invalid exceptions on prefetch.
74 * Check that here and ignore it.
76 * Opcode checker based on code by Richard Brunner
78 static int is_prefetch(struct pt_regs
*regs
, unsigned long addr
,
79 unsigned long error_code
)
84 unsigned char *max_instr
;
87 if (unlikely(boot_cpu_data
.x86_vendor
== X86_VENDOR_AMD
&&
88 boot_cpu_data
.x86
>= 6)) {
89 /* Catch an obscure case of prefetch inside an NX page. */
90 if (nx_enabled
&& (error_code
& PF_INSTR
))
96 /* If it was a exec fault ignore */
97 if (error_code
& PF_INSTR
)
101 instr
= (unsigned char *)convert_ip_to_linear(current
, regs
);
102 max_instr
= instr
+ 15;
104 if (user_mode(regs
) && instr
>= (unsigned char *)TASK_SIZE
)
107 while (scan_more
&& instr
< max_instr
) {
108 unsigned char opcode
;
109 unsigned char instr_hi
;
110 unsigned char instr_lo
;
112 if (probe_kernel_address(instr
, opcode
))
115 instr_hi
= opcode
& 0xf0;
116 instr_lo
= opcode
& 0x0f;
123 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
124 * In X86_64 long mode, the CPU will signal invalid
125 * opcode if some of these prefixes are present so
126 * X86_64 will never get here anyway
128 scan_more
= ((instr_lo
& 7) == 0x6);
133 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
134 * Need to figure out under what instruction mode the
135 * instruction was issued. Could check the LDT for lm,
136 * but for now it's good enough to assume that long
137 * mode only uses well known segments or kernel.
139 scan_more
= (!user_mode(regs
)) || (regs
->cs
== __USER_CS
);
143 /* 0x64 thru 0x67 are valid prefixes in all modes. */
144 scan_more
= (instr_lo
& 0xC) == 0x4;
147 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
148 scan_more
= !instr_lo
|| (instr_lo
>>1) == 1;
151 /* Prefetch instruction is 0x0F0D or 0x0F18 */
154 if (probe_kernel_address(instr
, opcode
))
156 prefetch
= (instr_lo
== 0xF) &&
157 (opcode
== 0x0D || opcode
== 0x18);
167 static void force_sig_info_fault(int si_signo
, int si_code
,
168 unsigned long address
, struct task_struct
*tsk
)
172 info
.si_signo
= si_signo
;
174 info
.si_code
= si_code
;
175 info
.si_addr
= (void __user
*)address
;
176 force_sig_info(si_signo
, &info
, tsk
);
/*
 * Returns nonzero if the page-table entry at *p cannot be safely read
 * (probe_kernel_address reads into a dummy and reports failure).
 */
static int bad_address(void *p)
{
	unsigned long dummy;	/* NOTE(review): declaration reconstructed */
	return probe_kernel_address((unsigned long *)p, dummy);
}
185 void dump_pagetable(unsigned long address
)
192 pgd
= (pgd_t
*)read_cr3();
194 pgd
= __va((unsigned long)pgd
& PHYSICAL_PAGE_MASK
);
195 pgd
+= pgd_index(address
);
196 if (bad_address(pgd
)) goto bad
;
197 printk("PGD %lx ", pgd_val(*pgd
));
198 if (!pgd_present(*pgd
)) goto ret
;
200 pud
= pud_offset(pgd
, address
);
201 if (bad_address(pud
)) goto bad
;
202 printk("PUD %lx ", pud_val(*pud
));
203 if (!pud_present(*pud
)) goto ret
;
205 pmd
= pmd_offset(pud
, address
);
206 if (bad_address(pmd
)) goto bad
;
207 printk("PMD %lx ", pmd_val(*pmd
));
208 if (!pmd_present(*pmd
) || pmd_large(*pmd
)) goto ret
;
210 pte
= pte_offset_kernel(pmd
, address
);
211 if (bad_address(pte
)) goto bad
;
212 printk("PTE %lx", pte_val(*pte
));
221 static const char errata93_warning
[] =
222 KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
223 KERN_ERR
"******* Working around it, but it may cause SEGVs or burn power.\n"
224 KERN_ERR
"******* Please consider a BIOS update.\n"
225 KERN_ERR
"******* Disabling USB legacy in the BIOS may also help.\n";
227 /* Workaround for K8 erratum #93 & buggy BIOS.
228 BIOS SMM functions are required to use a specific workaround
229 to avoid corruption of the 64bit RIP register on C stepping K8.
230 A lot of BIOS that didn't get tested properly miss this.
231 The OS sees this as a page fault with the upper 32bits of RIP cleared.
232 Try to work around it here.
233 Note we only handle faults in kernel here. */
235 static int is_errata93(struct pt_regs
*regs
, unsigned long address
)
238 if (address
!= regs
->ip
)
240 if ((address
>> 32) != 0)
242 address
|= 0xffffffffUL
<< 32;
243 if ((address
>= (u64
)_stext
&& address
<= (u64
)_etext
) ||
244 (address
>= MODULES_VADDR
&& address
<= MODULES_END
)) {
246 printk(errata93_warning
);
256 static noinline
void pgtable_bad(unsigned long address
, struct pt_regs
*regs
,
257 unsigned long error_code
)
259 unsigned long flags
= oops_begin();
260 struct task_struct
*tsk
;
262 printk(KERN_ALERT
"%s: Corrupted page table at address %lx\n",
263 current
->comm
, address
);
264 dump_pagetable(address
);
266 tsk
->thread
.cr2
= address
;
267 tsk
->thread
.trap_no
= 14;
268 tsk
->thread
.error_code
= error_code
;
269 if (__die("Bad pagetable", regs
, error_code
))
271 oops_end(flags
, regs
, SIGKILL
);
275 * Handle a fault on the vmalloc area
277 * This assumes no large pages in there.
279 static int vmalloc_fault(unsigned long address
)
281 pgd_t
*pgd
, *pgd_ref
;
282 pud_t
*pud
, *pud_ref
;
283 pmd_t
*pmd
, *pmd_ref
;
284 pte_t
*pte
, *pte_ref
;
286 /* Copy kernel mappings over when needed. This can also
287 happen within a race in page table update. In the later
290 pgd
= pgd_offset(current
->mm
?: &init_mm
, address
);
291 pgd_ref
= pgd_offset_k(address
);
292 if (pgd_none(*pgd_ref
))
295 set_pgd(pgd
, *pgd_ref
);
297 BUG_ON(pgd_page_vaddr(*pgd
) != pgd_page_vaddr(*pgd_ref
));
299 /* Below here mismatches are bugs because these lower tables
302 pud
= pud_offset(pgd
, address
);
303 pud_ref
= pud_offset(pgd_ref
, address
);
304 if (pud_none(*pud_ref
))
306 if (pud_none(*pud
) || pud_page_vaddr(*pud
) != pud_page_vaddr(*pud_ref
))
308 pmd
= pmd_offset(pud
, address
);
309 pmd_ref
= pmd_offset(pud_ref
, address
);
310 if (pmd_none(*pmd_ref
))
312 if (pmd_none(*pmd
) || pmd_page(*pmd
) != pmd_page(*pmd_ref
))
314 pte_ref
= pte_offset_kernel(pmd_ref
, address
);
315 if (!pte_present(*pte_ref
))
317 pte
= pte_offset_kernel(pmd
, address
);
318 /* Don't use pte_page here, because the mappings can point
319 outside mem_map, and the NUMA hash lookup cannot handle
321 if (!pte_present(*pte
) || pte_pfn(*pte
) != pte_pfn(*pte_ref
))
/* Sysctl-visible switch: log unhandled user-space segfaults (default on). */
int show_unhandled_signals = 1;
329 * This routine handles page faults. It determines the address,
330 * and the problem, and then passes it off to one of the appropriate
333 asmlinkage
void __kprobes
do_page_fault(struct pt_regs
*regs
,
334 unsigned long error_code
)
336 struct task_struct
*tsk
;
337 struct mm_struct
*mm
;
338 struct vm_area_struct
*vma
;
339 unsigned long address
;
345 * We can fault from pretty much anywhere, with unknown IRQ state.
347 trace_hardirqs_fixup();
351 prefetchw(&mm
->mmap_sem
);
353 /* get the address */
354 address
= read_cr2();
356 si_code
= SEGV_MAPERR
;
358 if (notify_page_fault(regs
))
362 * We fault-in kernel-space virtual memory on-demand. The
363 * 'reference' page table is init_mm.pgd.
365 * NOTE! We MUST NOT take any locks for this case. We may
366 * be in an interrupt or a critical region, and should
367 * only copy the information from the master page table,
370 * This verifies that the fault happens in kernel space
371 * (error_code & 4) == 0, and that the fault was not a
372 * protection error (error_code & 9) == 0.
374 if (unlikely(address
>= TASK_SIZE64
)) {
376 * Don't check for the module range here: its PML4
377 * is always initialized because it's shared with the main
378 * kernel text. Only vmalloc may need PML4 syncups.
380 if (!(error_code
& (PF_RSVD
|PF_USER
|PF_PROT
)) &&
381 ((address
>= VMALLOC_START
&& address
< VMALLOC_END
))) {
382 if (vmalloc_fault(address
) >= 0)
386 * Don't take the mm semaphore here. If we fixup a prefetch
387 * fault we could otherwise deadlock.
389 goto bad_area_nosemaphore
;
392 if (likely(regs
->flags
& X86_EFLAGS_IF
))
395 if (unlikely(error_code
& PF_RSVD
))
396 pgtable_bad(address
, regs
, error_code
);
399 * If we're in an interrupt, have no user context or are running in an
400 * atomic region then we must not take the fault.
402 if (unlikely(in_atomic() || !mm
))
403 goto bad_area_nosemaphore
;
406 * User-mode registers count as a user access even for any
407 * potential system fault or CPU buglet.
409 if (user_mode_vm(regs
))
410 error_code
|= PF_USER
;
413 /* When running in the kernel we expect faults to occur only to
414 * addresses in user space. All other faults represent errors in the
415 * kernel and should generate an OOPS. Unfortunately, in the case of an
416 * erroneous fault occurring in a code path which already holds mmap_sem
417 * we will deadlock attempting to validate the fault against the
418 * address space. Luckily the kernel only validly references user
419 * space from well defined areas of code, which are listed in the
422 * As the vast majority of faults will be valid we will only perform
423 * the source reference check when there is a possibility of a deadlock.
424 * Attempt to lock the address space, if we cannot we then validate the
425 * source. If this is invalid we can skip the address space check,
426 * thus avoiding the deadlock.
428 if (!down_read_trylock(&mm
->mmap_sem
)) {
429 if ((error_code
& PF_USER
) == 0 &&
430 !search_exception_tables(regs
->ip
))
431 goto bad_area_nosemaphore
;
432 down_read(&mm
->mmap_sem
);
435 vma
= find_vma(mm
, address
);
438 if (likely(vma
->vm_start
<= address
))
440 if (!(vma
->vm_flags
& VM_GROWSDOWN
))
442 if (error_code
& PF_USER
) {
443 /* Allow userspace just enough access below the stack pointer
444 * to let the 'enter' instruction work.
446 if (address
+ 65536 + 32 * sizeof(unsigned long) < regs
->sp
)
449 if (expand_stack(vma
, address
))
452 * Ok, we have a good vm_area for this memory access, so
456 si_code
= SEGV_ACCERR
;
458 switch (error_code
& (PF_PROT
|PF_WRITE
)) {
459 default: /* 3: write, present */
461 case PF_WRITE
: /* write, not present */
462 if (!(vma
->vm_flags
& VM_WRITE
))
466 case PF_PROT
: /* read, present */
468 case 0: /* read, not present */
469 if (!(vma
->vm_flags
& (VM_READ
| VM_EXEC
| VM_WRITE
)))
474 * If for any reason at all we couldn't handle the fault,
475 * make sure we exit gracefully rather than endlessly redo
478 fault
= handle_mm_fault(mm
, vma
, address
, write
);
479 if (unlikely(fault
& VM_FAULT_ERROR
)) {
480 if (fault
& VM_FAULT_OOM
)
482 else if (fault
& VM_FAULT_SIGBUS
)
486 if (fault
& VM_FAULT_MAJOR
)
490 up_read(&mm
->mmap_sem
);
494 * Something tried to access memory that isn't in our memory map..
495 * Fix it, but check if it's kernel or user first..
498 up_read(&mm
->mmap_sem
);
500 bad_area_nosemaphore
:
501 /* User mode accesses just cause a SIGSEGV */
502 if (error_code
& PF_USER
) {
505 * It's possible to have interrupts off here.
509 if (is_prefetch(regs
, address
, error_code
))
512 /* Work around K8 erratum #100 K8 in compat mode
513 occasionally jumps to illegal addresses >4GB. We
514 catch this here in the page fault handler because
515 these addresses are not reachable. Just detect this
516 case and return. Any code segment in LDT is
517 compatibility mode. */
518 if ((regs
->cs
== __USER32_CS
|| (regs
->cs
& (1<<2))) &&
522 if (show_unhandled_signals
&& unhandled_signal(tsk
, SIGSEGV
) &&
523 printk_ratelimit()) {
525 "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
526 tsk
->pid
> 1 ? KERN_INFO
: KERN_EMERG
,
527 tsk
->comm
, tsk
->pid
, address
, regs
->ip
,
528 regs
->sp
, error_code
);
531 tsk
->thread
.cr2
= address
;
532 /* Kernel addresses are always protection faults */
533 tsk
->thread
.error_code
= error_code
| (address
>= TASK_SIZE
);
534 tsk
->thread
.trap_no
= 14;
536 force_sig_info_fault(SIGSEGV
, si_code
, address
, tsk
);
541 /* Are we prepared to handle this kernel fault? */
542 if (fixup_exception(regs
))
546 * Hall of shame of CPU/BIOS bugs.
549 if (is_prefetch(regs
, address
, error_code
))
552 if (is_errata93(regs
, address
))
556 * Oops. The kernel tried to access some bad page. We'll have to
557 * terminate things with extreme prejudice.
560 flags
= oops_begin();
562 if (address
< PAGE_SIZE
)
563 printk(KERN_ALERT
"Unable to handle kernel NULL pointer dereference");
565 printk(KERN_ALERT
"Unable to handle kernel paging request");
566 printk(" at %016lx RIP: \n" KERN_ALERT
, address
);
567 printk_address(regs
->ip
, regs
->bp
);
568 dump_pagetable(address
);
569 tsk
->thread
.cr2
= address
;
570 tsk
->thread
.trap_no
= 14;
571 tsk
->thread
.error_code
= error_code
;
572 if (__die("Oops", regs
, error_code
))
574 /* Executive summary in case the body of the oops scrolled away */
575 printk(KERN_EMERG
"CR2: %016lx\n", address
);
576 oops_end(flags
, regs
, SIGKILL
);
579 * We ran out of memory, or some other thing happened to us that made
580 * us unable to handle the page fault gracefully.
583 up_read(&mm
->mmap_sem
);
584 if (is_global_init(current
)) {
588 printk("VM: killing process %s\n", tsk
->comm
);
589 if (error_code
& PF_USER
)
590 do_group_exit(SIGKILL
);
594 up_read(&mm
->mmap_sem
);
596 /* Kernel mode? Handle exceptions or die */
597 if (!(error_code
& PF_USER
))
600 tsk
->thread
.cr2
= address
;
601 tsk
->thread
.error_code
= error_code
;
602 tsk
->thread
.trap_no
= 14;
603 force_sig_info_fault(SIGBUS
, BUS_ADRERR
, address
, tsk
);
/* Protects the pgd_list walk in vmalloc_sync_all(). */
DEFINE_SPINLOCK(pgd_lock);
610 void vmalloc_sync_all(void)
612 /* Note that races in the updates of insync and start aren't
614 insync can only get set bits added, and updates to start are only
615 improving performance (without affecting correctness if undone). */
616 static DECLARE_BITMAP(insync
, PTRS_PER_PGD
);
617 static unsigned long start
= VMALLOC_START
& PGDIR_MASK
;
618 unsigned long address
;
620 for (address
= start
; address
<= VMALLOC_END
; address
+= PGDIR_SIZE
) {
621 if (!test_bit(pgd_index(address
), insync
)) {
622 const pgd_t
*pgd_ref
= pgd_offset_k(address
);
625 if (pgd_none(*pgd_ref
))
627 spin_lock(&pgd_lock
);
628 list_for_each_entry(page
, &pgd_list
, lru
) {
630 pgd
= (pgd_t
*)page_address(page
) + pgd_index(address
);
632 set_pgd(pgd
, *pgd_ref
);
634 BUG_ON(pgd_page_vaddr(*pgd
) != pgd_page_vaddr(*pgd_ref
));
636 spin_unlock(&pgd_lock
);
637 set_bit(pgd_index(address
), insync
);
639 if (address
== start
)
640 start
= address
+ PGDIR_SIZE
;
642 /* Check that there is no need to do the same for the modules area. */
643 BUILD_BUG_ON(!(MODULES_VADDR
> __START_KERNEL
));
644 BUILD_BUG_ON(!(((MODULES_END
- 1) & PGDIR_MASK
) ==
645 (__START_KERNEL
& PGDIR_MASK
)));