x86: do_page_fault small unification
arch/x86/mm/fault_64.c  (linux-2.6/x86.git, blob c6b3ad515cf12c5e071c0e2761fd049db4be914f)
/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
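
/*
 * Illustrative sketch, not part of the original file: how the PF_* bits
 * above combine into the hardware error code.  A user-mode write to a
 * not-present page arrives as error_code == (PF_USER|PF_WRITE) == 0x6,
 * while an NX violation on a user instruction fetch arrives as
 * (PF_PROT|PF_USER|PF_INSTR) == 0x15.  The helper name below is ours,
 * purely for illustration; the real handler tests the bits directly.
 */
static inline void example_describe_error_code(unsigned long error_code)
{
	printk(KERN_DEBUG "fault: %s %s in %s mode%s%s\n",
	       (error_code & PF_PROT)  ? "protection violation" : "page not present",
	       (error_code & PF_WRITE) ? "on write" : "on read",
	       (error_code & PF_USER)  ? "user" : "kernel",
	       (error_code & PF_RSVD)  ? ", reserved bit set" : "",
	       (error_code & PF_INSTR) ? ", instruction fetch" : "");
}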

static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

#ifdef CONFIG_X86_32
	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
		     boot_cpu_data.x86 >= 6)) {
		/* Catch an obscure case of prefetch inside an NX page. */
		if (nx_enabled && (error_code & PF_INSTR))
			return 0;
	} else {
		return 0;
	}
#else
	/* If it was an exec fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;
#endif

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
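
/*
 * Illustrative walk-through, not part of the original file: for the SSE
 * instruction "prefetchnta (%rax)" the bytes are 0x0f 0x18 0x00.  The
 * scanner above reads 0x0f (instr_hi 0x00, instr_lo 0x0f), takes the 0x00
 * case, peeks at the following byte (0x18) and sets prefetch = 1, so a
 * spurious fault reported on that prefetch is silently ignored.
 */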

static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}

void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}
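
/*
 * Illustrative note, not part of the original file: the routine above prints
 * one entry per level and stops at the first level that is not present, e.g.
 * (hypothetical values) "PGD 203067 PUD 203063 PMD 0" for an address whose
 * PMD entry is clear, or ends with "BAD" if a table entry could not be read.
 */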

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Work around K8 erratum #93 and buggy BIOSes.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64-bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel space here. */

static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
	return 0;
}
#endif
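
/*
 * Illustrative example, not part of the original file (addresses are
 * hypothetical): if the kernel faults at RIP 0x80212345 while the kernel
 * text spans 0xffffffff80200000-0xffffffff80400000, the check above ORs the
 * upper 32 bits back in, rewrites regs->ip to 0xffffffff80212345 and
 * resumes, instead of oopsing on the truncated RIP left behind by a buggy
 * SMM BIOS.
 */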

static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}

/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}
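
/*
 * Illustrative note, not part of the original file: the typical hit here is
 * a task whose pgd predates a recent vmalloc()/ioremap() mapping.  The
 * missing top-level entry is copied from the init_mm reference page table
 * and the faulting instruction is simply restarted; only the top level may
 * legitimately differ, which is why mismatches further down are BUG()s.
 */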

int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, fault;
	unsigned long flags;
	int si_code;

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;

	if (notify_page_fault(regs))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space,
	 * (error_code & PF_USER) == 0, and that the fault was not a
	 * protection or reserved-bit error, (error_code & (PF_PROT|PF_RSVD)) == 0.
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;

again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/* Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work.
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}
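
	/*
	 * Illustrative summary, not part of the original file: the two bits
	 * tested by the switch above decode as
	 *   0                 read,  not present - needs VM_READ, VM_WRITE or VM_EXEC
	 *   PF_PROT           read,  present     - always a bad area
	 *   PF_WRITE          write, not present - needs VM_WRITE
	 *   PF_PROT|PF_WRITE  write, present     - needs VM_WRITE
	 */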

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100: K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable.  Just detect this
		   case and return.  Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
				tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
				tsk->comm, tsk->pid, address, regs->ip,
				regs->sp, error_code);
		}
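
		/*
		 * Illustrative note, not part of the original file (values are
		 * hypothetical): the ratelimited printk above produces a log
		 * line such as
		 *   a.out[1523]: segfault at 0 ip 400562 sp 7fff1c0a2d40 error 4
		 * where "error 4" is PF_USER alone, i.e. a user-mode read of a
		 * not-present page.
		 */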

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;

		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

no_context:
	/* Are we prepared to handle this kernel fault?  */
	if (fixup_exception(regs))
		return;

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */
	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->ip, regs->bp);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(current)) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
	return;
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
}