x86: cosmetic fixes fault_{32|64}.c
arch/x86/mm/fault_64.c
/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
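
/*
 * For example, a user-mode write to a not-present page arrives with
 * error_code == (PF_USER | PF_WRITE): bit 0 clear (no page found),
 * bit 1 set (write), bit 2 set (user mode).
 */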

static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}
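
/*
 * Note: 14 is the x86 page-fault exception vector, which is why
 * kprobe_fault_handler() is called with trapnr 14 above and why trap_no
 * is set to 14 in the fault paths below.
 */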

/* Sometimes the CPU reports invalid exceptions on prefetch.
   Check that here and ignore.
   Opcode checker based on code by Richard Brunner */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
				unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/* If it was an exec fault ignore */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}

void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}

static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here. */

static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
	return 0;
}
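
/*
 * Illustrative case (hypothetical addresses): if SMM corruption turns a
 * kernel-text RIP such as 0xffffffff80212345 into 0x0000000080212345, the
 * fault address equals the truncated RIP and its upper 32 bits are zero;
 * OR-ing 0xffffffff00000000 back in lands inside _stext.._etext, so the
 * function patches regs->ip and reports the fault as handled.
 */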

static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}

/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}
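
/*
 * Note: vmalloc_fault() returns 0 once the missing kernel mapping has been
 * copied from the reference page table (init_mm), and -1 when init_mm has
 * no mapping for the address either, in which case the caller falls through
 * to normal fault handling.
 */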

int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, fault;
	unsigned long flags;
	siginfo_t info;

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	info.si_code = SEGV_MAPERR;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
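	/*
	 * In terms of the bit names above: 4 is PF_USER and 9 is
	 * PF_RSVD | PF_PROT, so only kernel-mode, non-protection,
	 * non-reserved-bit faults take the vmalloc path below.
	 */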
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		if (notify_page_fault(regs))
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (notify_page_fault(regs))
		return;

	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;

again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/* Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work.
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
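	/*
	 * The slack allowed above is a conservative bound: 'enter' takes a
	 * 16-bit frame size (up to 65535 bytes) and a nesting level of up to
	 * 31, each level pushing one more saved frame pointer, hence the
	 * 65536 + 32 * sizeof(unsigned long) of room below the stack pointer.
	 */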
	if (expand_stack(vma, address))
		goto bad_area;
	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
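	/*
	 * VM_FAULT_MAJOR means the fault needed I/O (e.g. reading the page
	 * from disk) and is charged to maj_flt; anything satisfied without
	 * I/O counts as a minor fault in min_flt.
	 */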
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;
	up_read(&mm->mmap_sem);
	return;

	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100: the K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable.  Just detect this
		   case and return.  Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;
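		/*
		 * Note on the check above: bit 2 of a segment selector is
		 * the Table Indicator, so (regs->cs & (1<<2)) is true for
		 * any code segment loaded from the LDT.
		 */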

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
		       "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
					tsk->comm, tsk->pid, address, regs->ip,
					regs->sp, error_code);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void __user *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}

no_context:
	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */

	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->ip);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);

	/*
	 * We ran out of memory, or some other thing happened to us that made
	 * us unable to handle the page fault gracefully.
	 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(current)) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void __user *)address;
	force_sig_info(SIGBUS, &info, tsk);
	return;
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
}