/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>
/* Page fault error code bits */
#define PF_PROT		(1<<0)	/* or no page found */
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
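
/*
 * Editor's note: these are the bits of the hardware error code the CPU
 * pushes for a page fault (vector 14). For example, a user-mode write to
 * an unmapped page arrives with error_code == (PF_USER|PF_WRITE) == 0x6,
 * while a user-mode write to a present but read-only page adds PF_PROT,
 * giving 0x7.
 */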
#ifdef CONFIG_KPROBES
static inline int notify_page_fault(struct pt_regs *regs)
{
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
}
#else
static inline int notify_page_fault(struct pt_regs *regs)
{
	return 0;
}
#endif
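
/*
 * Editor's note: the 14 passed to kprobe_fault_handler() is the x86 page
 * fault vector. kprobe_running() reads per-CPU state via
 * smp_processor_id(), which is why the check runs with preemption
 * disabled.
 */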
/* Sometimes the CPU reports invalid exceptions on prefetch.
   Check that here and ignore it.
   Opcode checker based on code by Richard Brunner */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
				unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/* If it was an exec fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_rip_to_linear(current, regs);
	max_instr = instr + 15;
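
	/*
	 * Editor's note: an x86 instruction is at most 15 bytes long, so
	 * scanning [instr, instr + 15) is guaranteed to cover the whole
	 * faulting instruction.
	 */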
	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86
			 * prefixes. In long mode, the CPU will signal
			 * invalid opcode if some of these prefixes are
			 * present, so we will never get here anyway.
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;

		case 0x40:
			/*
			 * In AMD64 long mode 0x40 to 0x4F are valid REX
			 * prefixes. We would need to figure out under what
			 * instruction mode the instruction was issued.
			 * Could check the LDT for lm, but for now it's good
			 * enough to assume that long mode only uses well
			 * known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;

		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
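		/*
		 * Editor's note: 0x0F 0x0D is the AMD 3DNow!
		 * PREFETCH/PREFETCHW pair and 0x0F 0x18 the SSE PREFETCHh
		 * group. Both only hint the cache, so a fault reported on
		 * their operand address is spurious and safe to ignore.
		 */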
		default:
			scan_more = 0;
			break;
		}
	}

	return prefetch;
}
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
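
/*
 * Editor's note: probe_kernel_address() returns nonzero if reading the
 * word at p would fault, so dump_pagetable() below can walk possibly
 * corrupt page tables without itself taking an unhandled fault on a bad
 * intermediate pointer.
 */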
void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64-bit RIP register on C-stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel mode here. */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;
	if (address != regs->rip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->rip = address;
		return 1;
	}
	return 0;
}
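
/*
 * Editor's note, with a hypothetical address for illustration: a kernel
 * RIP such as 0xffffffff80212345 comes back from a buggy SMM handler as
 * 0x0000000080212345. OR-ing the upper 32 bits back on recovers the
 * original address, and the kernel-text/module range check guards against
 * false positives before the fixup is applied.
 */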
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Bad pagetable", regs, error_code);
	oops_end(flags);
	do_exit(SIGKILL);
}
/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen as the result of a race in a page table update;
	   in the latter case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}
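
/*
 * Editor's note: only the top-level (pgd) entry can legitimately be
 * missing, because each process has a private pgd page while all
 * lower-level kernel page tables are shared with init_mm. Copying the
 * single pgd entry from the reference table is therefore enough to repair
 * a vmalloc-area fault.
 */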
int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	const struct exception_table_entry *fixup;
	int write, fault;
	unsigned long flags;
	siginfo_t info;

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);
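	/*
	 * Editor's note: prefetchw() pulls the mmap_sem cacheline in early
	 * with intent to write, so the down_read_trylock() further down does
	 * not stall on a cache miss in the common case.
	 */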

	/* get the address */
	address = read_cr2();

	info.si_code = SEGV_MAPERR;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * ((error_code & PF_USER) == 0) and that the fault was not a
	 * protection or reserved-bit error
	 * ((error_code & (PF_PROT|PF_RSVD)) == 0).
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		      ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		if (notify_page_fault(regs))
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (notify_page_fault(regs))
		return;

	if (likely(regs->eflags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;

again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space. All other faults represent errors in the
	 * kernel and should generate an OOPS. Unfortunately, in the case of
	 * an erroneous fault occurring in a code path which already holds
	 * mmap_sem we will deadlock attempting to validate the fault against
	 * the address space. Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a
	 * deadlock. Attempt to lock the address space; if we cannot, we then
	 * validate the source. If this is invalid we can skip the address
	 * space check, thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->rip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/* Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work: ENTER can allocate up
		 * to 64kB of locals plus up to 31 nested frame pointers
		 * below the current stack pointer.
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}
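	/*
	 * Editor's note: the switch above decodes the four combinations of
	 * PF_PROT and PF_WRITE: 0 = read of a non-present page, PF_PROT =
	 * read protection fault (always bad), PF_WRITE = write to a
	 * non-present page, and PF_PROT|PF_WRITE = write protection fault
	 * (e.g. copy-on-write); the last two both require VM_WRITE.
	 */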

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100: K8 in compat mode
		   occasionally jumps to illegal addresses >4GB. We
		   catch this here in the page fault handler because
		   these addresses are not reachable. Just detect this
		   case and return. Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;
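		/*
		 * Editor's note: (1<<2) is the Table Indicator bit of the
		 * selector in CS; when set, the code segment lives in the
		 * LDT, which (per the comment above) this kernel only uses
		 * for compatibility-mode segments.
		 */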

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
		       "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n",
					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
					tsk->comm, tsk->pid, address, regs->rip,
					regs->rsp, error_code);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void __user *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}

no_context:
	/* Are we prepared to handle this kernel fault? */
	fixup = search_exception_tables(regs->rip);
	if (fixup) {
		regs->rip = fixup->fixup;
		return;
	}

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */

	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */

	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->rip);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Oops", regs, error_code);
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags);
	do_exit(SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(current)) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void __user *)address;
	force_sig_info(SIGBUS, &info, tsk);
	return;
}
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
				(__START_KERNEL & PGDIR_MASK)));
}
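
/*
 * Editor's note: the two BUILD_BUG_ON()s are compile-time assertions that
 * the modules area lies above the kernel image and that its last address
 * shares a top-level (pgd) entry with kernel text, so vmalloc_sync_all()
 * never needs to sync module mappings separately.
 */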