arch/powerpc/kvm/book3s_64_mmu_hv.c

   1 /*
   2  * This program is free software; you can redistribute it and/or modify
   3  * it under the terms of the GNU General Public License, version 2, as
   4  * published by the Free Software Foundation.
   5  *
   6  * This program is distributed in the hope that it will be useful,
   7  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   8  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   9  * GNU General Public License for more details.
  10  *
  11  * You should have received a copy of the GNU General Public License
  12  * along with this program; if not, write to the Free Software
  13  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  14  *
  15  * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  16  */
  17
  18 #include <linux/types.h>
  19 #include <linux/string.h>
  20 #include <linux/kvm.h>
  21 #include <linux/kvm_host.h>
  22 #include <linux/highmem.h>
  23 #include <linux/gfp.h>
  24 #include <linux/slab.h>
  25 #include <linux/hugetlb.h>
  26 #include <linux/vmalloc.h>
  27
  28 #include <asm/tlbflush.h>
  29 #include <asm/kvm_ppc.h>
  30 #include <asm/kvm_book3s.h>
  31 #include <asm/mmu-hash64.h>
  32 #include <asm/hvcall.h>
  33 #include <asm/synch.h>
  34 #include <asm/ppc-opcode.h>
  35 #include <asm/cputable.h>
  36
  37 /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
     /* LPID that kvmppc_mmu_hv_init() reserves on PPC970 (no CPU_FTR_ARCH_206) */
  38 #define MAX_LPID_970    63
     /* Number of bits in the LPID bitmap: one for each LPID in 0..LPID_RSVD */
  39 #define NR_LPIDS        (LPID_RSVD + 1)
     /*
      * Bitmap of LPIDs that are taken.  Bits are set by kvmppc_mmu_hv_init()
      * for the host and reserved LPIDs and by kvmppc_alloc_hpt() when a guest
      * is given an LPID; kvmppc_free_hpt() clears the guest's bit again.
      */
  40 unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)];
  41
  42 long kvmppc_alloc_hpt(struct kvm *kvm)
  43 {
  44         unsigned long hpt;
  45         unsigned long lpid;
  46         struct revmap_entry *rev;
  47
  48         /* Allocate guest's hashed page table */
  49         hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
  50                                HPT_ORDER - PAGE_SHIFT);
  51         if (!hpt) {
  52                 pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
  53                 return -ENOMEM;
  54         }
  55         kvm->arch.hpt_virt = hpt;
  56
  57         /* Allocate reverse map array */
  58         rev = vmalloc(sizeof(struct revmap_entry) * HPT_NPTE);
  59         if (!rev) {
  60                 pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
  61                 goto out_freehpt;
  62         }
  63         kvm->arch.revmap = rev;
  64
  65         /* Allocate the guest's logical partition ID */
  66         do {
  67                 lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS);
  68                 if (lpid >= NR_LPIDS) {
  69                         pr_err("kvm_alloc_hpt: No LPIDs free\n");
  70                         goto out_freeboth;
  71                 }
  72         } while (test_and_set_bit(lpid, lpid_inuse));
  73
  74         kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
  75         kvm->arch.lpid = lpid;
  76
  77         pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
  78         return 0;
  79
  80  out_freeboth:
  81         vfree(rev);
  82  out_freehpt:
  83         free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
  84         return -ENOMEM;
  85 }
  86
  87 void kvmppc_free_hpt(struct kvm *kvm)
  88 {
  89         clear_bit(kvm->arch.lpid, lpid_inuse);
  90         vfree(kvm->arch.revmap);
  91         free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
  92 }
  93
  94 /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
  95 static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
  96 {
  97         return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
  98 }
  99
 /*
  * Page-size bits for the second HPTE doubleword (page sizes 4k, 64k or
  * 16M): 0x1000 for a 64k page, zero for every other size.
  */
 static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
 {
         if (pgsize == 0x10000)
                 return 0x1000;
         return 0;
 }
 105
 106 void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
 107                      unsigned long porder)
 108 {
 109         unsigned long i;
 110         unsigned long npages;
 111         unsigned long hp_v, hp_r;
 112         unsigned long addr, hash;
 113         unsigned long psize;
 114         unsigned long hp0, hp1;
 115         long ret;
 116
 117         psize = 1ul << porder;
 118         npages = memslot->npages >> (porder - PAGE_SHIFT);
 119
 120         /* VRMA can't be > 1TB */
 121         if (npages > 1ul << (40 - porder))
 122                 npages = 1ul << (40 - porder);
 123         /* Can't use more than 1 HPTE per HPTEG */
 124         if (npages > HPT_NPTEG)
 125                 npages = HPT_NPTEG;
 126
 127         hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
 128                 HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
 129         hp1 = hpte1_pgsize_encoding(psize) |
 130                 HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
 131
 132         for (i = 0; i < npages; ++i) {
 133                 addr = i << porder;
 134                 /* can't use hpt_hash since va > 64 bits */
 135                 hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
 136                 /*
 137                  * We assume that the hash table is empty and no
 138                  * vcpus are using it at this stage.  Since we create
 139                  * at most one HPTE per HPTEG, we just assume entry 7
 140                  * is available and use it.
 141                  */
 142                 hash = (hash << 3) + 7;
 143                 hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
 144                 hp_r = hp1 | addr;
 145                 ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r);
 146                 if (ret != H_SUCCESS) {
 147                         pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
 148                                addr, ret);
 149                         break;
 150                 }
 151         }
 152 }
 153
 154 int kvmppc_mmu_hv_init(void)
 155 {
 156         unsigned long host_lpid, rsvd_lpid;
 157
 158         if (!cpu_has_feature(CPU_FTR_HVMODE))
 159                 return -EINVAL;
 160
 161         memset(lpid_inuse, 0, sizeof(lpid_inuse));
 162
 163         if (cpu_has_feature(CPU_FTR_ARCH_206)) {
 164                 host_lpid = mfspr(SPRN_LPID);   /* POWER7 */
 165                 rsvd_lpid = LPID_RSVD;
 166         } else {
 167                 host_lpid = 0;                  /* PPC970 */
 168                 rsvd_lpid = MAX_LPID_970;
 169         }
 170
 171         set_bit(host_lpid, lpid_inuse);
 172         /* rsvd_lpid is reserved for use in partition switching */
 173         set_bit(rsvd_lpid, lpid_inuse);
 174
 175         return 0;
 176 }
 177
 /* MMU teardown hook for a vcpu; the body is empty, so it does nothing. */
 178 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 179 {
 180 }
 181
 182 static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
 183 {
 184         kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
 185 }
 186
 187 /*
 188  * This is called to get a reference to a guest page if there isn't
 189  * one already in the kvm->arch.slot_phys[][] arrays.
      *
      * gfn is the guest frame to look up, memslot the slot containing it
      * and psize the page size (in bytes) the caller needs.
      *
      * Returns 0 if an entry for gfn was already present or has been
      * filled in.  Returns -EINVAL if the slot has no slot_phys array, if
      * the host page backing gfn is smaller than psize, or if the page
      * could not be pinned and is not covered by a suitably aligned
      * VM_PFNMAP mapping.
 190  */
 191 static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
 192                                   struct kvm_memory_slot *memslot,
 193                                   unsigned long psize)
 194 {
 195         unsigned long start;
 196         long np, err;
 197         struct page *page, *hpage, *pages[1];
 198         unsigned long s, pgsize;
 199         unsigned long *physp;
 200         unsigned int is_io, got, pgorder;
 201         struct vm_area_struct *vma;
 202         unsigned long pfn, i, npages;
 203
 204         physp = kvm->arch.slot_phys[memslot->id];
 205         if (!physp)
 206                 return -EINVAL;
             /* A non-zero entry means this gfn has already been recorded */
 207         if (physp[gfn - memslot->base_gfn])
 208                 return 0;
 209
 210         is_io = 0;
 211         got = 0;
 212         page = NULL;
 213         pgsize = psize;
 214         err = -EINVAL;
 215         start = gfn_to_hva_memslot(memslot, gfn);
 216
 217         /* Instantiate and get the page we want access to */
 218         np = get_user_pages_fast(start, 1, 1, pages);
 219         if (np != 1) {
                     /*
                      * Pinning failed.  Accept the address only if the whole
                      * range [start, start + psize) lies inside a single
                      * VM_PFNMAP vma; the pfn is then computed from the vma
                      * offset and no page reference is taken (got stays 0).
                      */
 220                 /* Look up the vma for the page */
 221                 down_read(&current->mm->mmap_sem);
 222                 vma = find_vma(current->mm, start);
 223                 if (!vma || vma->vm_start > start ||
 224                     start + psize > vma->vm_end ||
 225                     !(vma->vm_flags & VM_PFNMAP))
 226                         goto up_err;
                     /* Cache bits from the vma's protection; added to each entry below */
 227                 is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
 228                 pfn = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 229                 /* check alignment of pfn vs. requested page size */
 230                 if (psize > PAGE_SIZE && (pfn & ((psize >> PAGE_SHIFT) - 1)))
 231                         goto up_err;
 232                 up_read(&current->mm->mmap_sem);
 233
 234         } else {
 235                 page = pages[0];
                     /* Remember that we hold a reference that must be recorded or dropped */
 236                 got = KVMPPC_GOT_PAGE;
 237
 238                 /* See if this is a large page */
 239                 s = PAGE_SIZE;
 240                 if (PageHuge(page)) {
 241                         hpage = compound_head(page);
 242                         s <<= compound_order(hpage);
 243                         /* Get the whole large page if slot alignment is ok */
 244                         if (s > psize && slot_is_aligned(memslot, s) &&
 245                             !(memslot->userspace_addr & (s - 1))) {
 246                                 start &= ~(s - 1);
 247                                 pgsize = s;
 248                                 page = hpage;
 249                         }
 250                 }
                     /* Host page is smaller than requested: fail with err == -EINVAL */
 251                 if (s < psize)
 252                         goto out;
 253                 pfn = page_to_pfn(page);
 254         }
 255
 256         npages = pgsize >> PAGE_SHIFT;
 257         pgorder = __ilog2(npages);
             /* Step to the first entry of the npages-aligned group containing gfn */
 258         physp += (gfn - memslot->base_gfn) & ~(npages - 1);
 259         spin_lock(&kvm->arch.slot_phys_lock);
             /*
              * Fill in only the entries that are still empty.  got is added
              * to the first entry written and then zeroed, so at most one
              * entry carries KVMPPC_GOT_PAGE.  If no entry was empty, got
              * is still set after the loop and the reference is dropped at
              * the out label.
              */
 260         for (i = 0; i < npages; ++i) {
 261                 if (!physp[i]) {
 262                         physp[i] = ((pfn + i) << PAGE_SHIFT) +
 263                                 got + is_io + pgorder;
 264                         got = 0;
 265                 }
 266         }
 267         spin_unlock(&kvm->arch.slot_phys_lock);
 268         err = 0;
 269
 270  out:
             /* got still set: our page reference was not recorded, so release it */
 271         if (got) {
 272                 if (PageHuge(page))
 273                         page = compound_head(page);
 274                 put_page(page);
 275         }
 276         return err;
 277
         /* Error exit for the paths that still hold mmap_sem for reading */
 278  up_err:
 279         up_read(&current->mm->mmap_sem);
 280         return err;
 281 }
 282
 283 /*
 284  * We come here on a H_ENTER call from the guest when we are not
 285  * using mmu notifiers and we don't have the requested page pinned
 286  * already.
 287  */
 288 long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 289                         long pte_index, unsigned long pteh, unsigned long ptel)
 290 {
 291         struct kvm *kvm = vcpu->kvm;
 292         unsigned long psize, gpa, gfn;
 293         struct kvm_memory_slot *memslot;
 294         long ret;
 295
 296         if (kvm->arch.using_mmu_notifiers)
 297                 goto do_insert;
 298
 299         psize = hpte_page_size(pteh, ptel);
 300         if (!psize)
 301                 return H_PARAMETER;
 302
 303         pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
 304
 305         /* Find the memslot (if any) for this address */
 306         gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
 307         gfn = gpa >> PAGE_SHIFT;
 308         memslot = gfn_to_memslot(kvm, gfn);
 309         if (memslot && !(memslot->flags & KVM_MEMSLOT_INVALID)) {
 310                 if (!slot_is_aligned(memslot, psize))
 311                         return H_PARAMETER;
 312                 if (kvmppc_get_guest_page(kvm, gfn, memslot, psize) < 0)
 313                         return H_PARAMETER;
 314         }
 315
 316  do_insert:
 317         /* Protect linux PTE lookup from page table destruction */
 318         rcu_read_lock_sched();  /* this disables preemption too */
 319         vcpu->arch.pgdir = current->mm->pgd;
 320         ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel);
 321         rcu_read_unlock_sched();
 322         if (ret == H_TOO_HARD) {
 323                 /* this can't happen */
 324                 pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
 325                 ret = H_RESOURCE;       /* or something */
 326         }
 327         return ret;
 328
 329 }
 330
 331 static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
 332                                                          gva_t eaddr)
 333 {
 334         u64 mask;
 335         int i;
 336
 337         for (i = 0; i < vcpu->arch.slb_nr; i++) {
 338                 if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
 339                         continue;
 340
 341                 if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
 342                         mask = ESID_MASK_1T;
 343                 else
 344                         mask = ESID_MASK;
 345
 346                 if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
 347                         return &vcpu->arch.slb[i];
 348         }
 349         return NULL;
 350 }
 351
 352 static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
 353                         unsigned long ea)
 354 {
 355         unsigned long ra_mask;
 356
 357         ra_mask = hpte_page_size(v, r) - 1;
 358         return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
 359 }
 360
 361 static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 362                         struct kvmppc_pte *gpte, bool data)
 363 {
 364         struct kvm *kvm = vcpu->kvm;
 365         struct kvmppc_slb *slbe;
 366         unsigned long slb_v;
 367         unsigned long pp, key;
 368         unsigned long v, gr;
 369         unsigned long *hptep;
 370         int index;
 371         int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
 372
 373         /* Get SLB entry */
 374         if (virtmode) {
 375                 slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
 376                 if (!slbe)
 377                         return -EINVAL;
 378                 slb_v = slbe->origv;
 379         } else {
 380                 /* real mode access */
 381                 slb_v = vcpu->kvm->arch.vrma_slb_v;
 382         }
 383
 384         /* Find the HPTE in the hash table */
 385         index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
 386                                          HPTE_V_VALID | HPTE_V_ABSENT);
 387         if (index < 0)
 388                 return -ENOENT;
 389         hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
 390         v = hptep[0] & ~HPTE_V_HVLOCK;
 391         gr = kvm->arch.revmap[index].guest_rpte;
 392
 393         /* Unlock the HPTE */
 394         asm volatile("lwsync" : : : "memory");
 395         hptep[0] = v;
 396
 397         gpte->eaddr = eaddr;
 398         gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);
 399
 400         /* Get PP bits and key for permission check */
 401         pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
 402         key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
 403         key &= slb_v;
 404
 405         /* Calculate permissions */
 406         gpte->may_read = hpte_read_permission(pp, key);
 407         gpte->may_write = hpte_write_permission(pp, key);
 408         gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));
 409
 410         /* Storage key permission check for POWER7 */
 411         if (data && virtmode && cpu_has_feature(CPU_FTR_ARCH_206)) {
 412                 int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);
 413                 if (amrfield & 1)
 414                         gpte->may_read = 0;
 415                 if (amrfield & 2)
 416                         gpte->may_write = 0;
 417         }
 418
 419         /* Get the guest physical address */
 420         gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);
 421         return 0;
 422 }
 423
 /*
  * Cheap load-versus-store classification of an instruction word.
  * For an instruction that is a load or a store, the result says which
  * one it is (non-zero for a store), at least on server processors.
  * Embedded processors have some external PID instructions that break
  * the rule used here.  For any other instruction the result is not
  * meaningful.
  */
 static int instruction_is_store(unsigned int instr)
 {
         unsigned int store_bit;

         if ((instr & 0xfc000000) == 0x7c000000)
                 store_bit = 0x100;              /* major opcode 31 */
         else
                 store_bit = 0x10000000;

         return !!(instr & store_bit);
 }
 441
 442 static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
 443                                   unsigned long gpa, int is_store)
 444 {
 445         int ret;
 446         u32 last_inst;
 447         unsigned long srr0 = kvmppc_get_pc(vcpu);
 448
 449         /* We try to load the last instruction.  We don't let
 450          * emulate_instruction do it as it doesn't check what
 451          * kvmppc_ld returns.
 452          * If we fail, we just return to the guest and try executing it again.
 453          */
 454         if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED) {
 455                 ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
 456                 if (ret != EMULATE_DONE || last_inst == KVM_INST_FETCH_FAILED)
 457                         return RESUME_GUEST;
 458                 vcpu->arch.last_inst = last_inst;
 459         }
 460
 461         /*
 462          * WARNING: We do not know for sure whether the instruction we just
 463          * read from memory is the same that caused the fault in the first
 464          * place.  If the instruction we read is neither an load or a store,
 465          * then it can't access memory, so we don't need to worry about
 466          * enforcing access permissions.  So, assuming it is a load or
 467          * store, we just check that its direction (load or store) is
 468          * consistent with the original fault, since that's what we
 469          * checked the access permissions against.  If there is a mismatch
 470          * we just return and retry the instruction.
 471          */
 472
 473         if (instruction_is_store(vcpu->arch.last_inst) != !!is_store)
 474                 return RESUME_GUEST;
 475
 476         /*
 477          * Emulated accesses are emulated by looking at the hash for
 478          * translation once, then performing the access later. The
 479          * translation could be invalidated in the meantime in which
 480          * point performing the subsequent memory access on the old
 481          * physical address could possibly be a security hole for the
 482          * guest (but not the host).
 483          *
 484          * This is less of an issue for MMIO stores since they aren't
 485          * globally visible. It could be an issue for MMIO loads to
 486          * a certain extent but we'll ignore it for now.
 487          */
 488
 489         vcpu->arch.paddr_accessed = gpa;
 490         return kvmppc_emulate_mmio(run, vcpu);
 491 }
 492
 /*
  * Handle a guest page fault at effective address ea (dsisr holds the
  * fault status, DSISR_ISSTORE marking a store) for the HPTE that
  * real-mode code recorded in vcpu->arch.pgfault_index/pgfault_hpte.
  *
  * If the HPTE maps an address with no valid memslot, the access is
  * handed to kvmppc_hv_emulate_mmio().  Otherwise the backing host page
  * is looked up and the HPTE is rewritten to point at it and made valid.
  *
  * Returns RESUME_GUEST when the HPTE was updated or when the guest
  * should simply retry, -EFAULT when no usable host page exists (or when
  * mmu notifiers are not in use), or the MMIO emulation result.
  */
 493 int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 494                                 unsigned long ea, unsigned long dsisr)
 495 {
 496         struct kvm *kvm = vcpu->kvm;
 497         unsigned long *hptep, hpte[3], r;
 498         unsigned long mmu_seq, psize, pte_size;
 499         unsigned long gfn, hva, pfn;
 500         struct kvm_memory_slot *memslot;
 501         unsigned long *rmap;
 502         struct revmap_entry *rev;
 503         struct page *page, *pages[1];
 504         long index, ret, npages;
 505         unsigned long is_io;
 506         unsigned int writing, write_ok;
 507         struct vm_area_struct *vma;
 508         unsigned long rcbits;
 509
 510         /*
 511          * Real-mode code has already searched the HPT and found the
 512          * entry we're interested in.  Lock the entry and check that
 513          * it hasn't changed.  If it has, just return and re-execute the
 514          * instruction.
 515          */
 516         if (ea != vcpu->arch.pgfault_addr)
 517                 return RESUME_GUEST;
 518         index = vcpu->arch.pgfault_index;
 519         hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
 520         rev = &kvm->arch.revmap[index];
 521         preempt_disable();
 522         while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
 523                 cpu_relax();
             /*
              * Snapshot both HPTE words and the guest's view of the second
              * word while the entry is locked; storing hpte[0] (which has
              * HPTE_V_HVLOCK cleared) back releases the lock.
              */
 524         hpte[0] = hptep[0] & ~HPTE_V_HVLOCK;
 525         hpte[1] = hptep[1];
 526         hpte[2] = r = rev->guest_rpte;
 527         asm volatile("lwsync" : : : "memory");
 528         hptep[0] = hpte[0];
 529         preempt_enable();
 530
             /* The entry no longer matches what real-mode code saw: retry */
 531         if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
 532             hpte[1] != vcpu->arch.pgfault_hpte[1])
 533                 return RESUME_GUEST;
 534
 535         /* Translate the logical address and get the page */
 536         psize = hpte_page_size(hpte[0], r);
 537         gfn = hpte_rpn(r, psize);
 538         memslot = gfn_to_memslot(kvm, gfn);
 539
 540         /* No memslot means it's an emulated MMIO region */
 541         if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
 542                 unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1));
 543                 return kvmppc_hv_emulate_mmio(run, vcpu, gpa,
 544                                               dsisr & DSISR_ISSTORE);
 545         }
 546
 547         if (!kvm->arch.using_mmu_notifiers)
 548                 return -EFAULT;         /* should never get here */
 549
 550         /* used to check for invalidations in progress */
 551         mmu_seq = kvm->mmu_notifier_seq;
 552         smp_rmb();
 553
 554         is_io = 0;
 555         pfn = 0;
 556         page = NULL;
 557         pte_size = PAGE_SIZE;
 558         writing = (dsisr & DSISR_ISSTORE) != 0;
 559         /* If writing != 0, then the HPTE must allow writing, if we get here */
 560         write_ok = writing;
 561         hva = gfn_to_hva_memslot(memslot, gfn);
 562         npages = get_user_pages_fast(hva, 1, writing, pages);
 563         if (npages < 1) {
 564                 /* Check if it's an I/O mapping */
 565                 down_read(&current->mm->mmap_sem);
 566                 vma = find_vma(current->mm, hva);
 567                 if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
 568                     (vma->vm_flags & VM_PFNMAP)) {
 569                         pfn = vma->vm_pgoff +
 570                                 ((hva - vma->vm_start) >> PAGE_SHIFT);
 571                         pte_size = psize;
 572                         is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
 573                         write_ok = vma->vm_flags & VM_WRITE;
 574                 }
 575                 up_read(&current->mm->mmap_sem);
                     /* pfn is still 0 when no suitable VM_PFNMAP vma was found */
 576                 if (!pfn)
 577                         return -EFAULT;
 578         } else {
 579                 page = pages[0];
 580                 if (PageHuge(page)) {
 581                         page = compound_head(page);
 582                         pte_size <<= compound_order(page);
 583                 }
 584                 /* if the guest wants write access, see if that is OK */
                     /*
                      * This is a read fault (writing == 0) on an HPTE whose
                      * guest view allows writes: only leave the HPTE writable
                      * if the Linux PTE is present and writable.
                      */
 585                 if (!writing && hpte_is_writable(r)) {
 586                         pte_t *ptep, pte;
 587
 588                         /*
 589                          * We need to protect against page table destruction
 590                          * while looking up and updating the pte.
 591                          */
 592                         rcu_read_lock_sched();
 593                         ptep = find_linux_pte_or_hugepte(current->mm->pgd,
 594                                                          hva, NULL);
 595                         if (ptep && pte_present(*ptep)) {
 596                                 pte = kvmppc_read_update_linux_pte(ptep, 1);
 597                                 if (pte_write(pte))
 598                                         write_ok = 1;
 599                         }
 600                         rcu_read_unlock_sched();
 601                 }
 602                 pfn = page_to_pfn(page);
 603         }
 604
 605         ret = -EFAULT;
             /* The host mapping is smaller than the page size the HPTE asks for */
 606         if (psize > pte_size)
 607                 goto out_put;
 608
 609         /* Check WIMG vs. the actual page we're accessing */
 610         if (!hpte_cache_flags_ok(r, is_io)) {
                     /*
                      * is_io is only non-zero on the VM_PFNMAP path above,
                      * where page is still NULL, so returning here without
                      * going through out_put does not leak a page reference.
                      */
 611                 if (is_io)
 612                         return -EFAULT;
 613                 /*
 614                  * Allow guest to map emulated device memory as
 615                  * uncacheable, but actually make it cacheable.
 616                  */
 617                 r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
 618         }
 619
 620         /* Set the HPTE to point to pfn */
 621         r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT);
 622         if (hpte_is_writable(r) && !write_ok)
 623                 r = hpte_make_readonly(r);
 624         ret = RESUME_GUEST;
 625         preempt_disable();
 626         while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
 627                 cpu_relax();
             /* Re-check against the snapshot now that the entry is locked again */
 628         if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] ||
 629             rev->guest_rpte != hpte[2])
 630                 /* HPTE has been changed under us; let the guest retry */
 631                 goto out_unlock;
 632         hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
 633
 634         rmap = &memslot->rmap[gfn - memslot->base_gfn];
 635         lock_rmap(rmap);
 636
 637         /* Check if we might have been invalidated; let the guest retry if so */
 638         ret = RESUME_GUEST;
 639         if (mmu_notifier_retry(vcpu, mmu_seq)) {
 640                 unlock_rmap(rmap);
 641                 goto out_unlock;
 642         }
 643
 644         /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
 645         rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
 646         r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
 647
 648         if (hptep[0] & HPTE_V_VALID) {
 649                 /* HPTE was previously valid, so we need to invalidate it */
 650                 unlock_rmap(rmap);
 651                 hptep[0] |= HPTE_V_ABSENT;
 652                 kvmppc_invalidate_hpte(kvm, hptep, index);
 653                 /* don't lose previous R and C bits */
 654                 r |= hptep[1] & (HPTE_R_R | HPTE_R_C);
 655         } else {
                     /*
                      * NOTE(review): this branch never calls unlock_rmap()
                      * itself; presumably kvmppc_add_revmap_chain() releases
                      * the rmap lock - confirm against its definition.
                      */
 656                 kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
 657         }
 658
             /*
              * Write the second word first and the first word last, with
              * eieio in between.  hpte[0] does not contain HPTE_V_HVLOCK,
              * so the final store also releases the HPTE lock.
              */
 659         hptep[1] = r;
 660         eieio();
 661         hptep[0] = hpte[0];
 662         asm volatile("ptesync" : : : "memory");
 663         preempt_enable();
 664         if (page && hpte_is_writable(r))
 665                 SetPageDirty(page);
 666
         /* Common exit: drop the reference from get_user_pages_fast(), if any */
 667  out_put:
 668         if (page)
 669                 put_page(page);
 670         return ret;
 671
         /* Retry exit: release the HPTE lock taken above, then leave via out_put */
 672  out_unlock:
 673         hptep[0] &= ~HPTE_V_HVLOCK;
 674         preempt_enable();
 675         goto out_put;
 676 }
 677
 678 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 679                           int (*handler)(struct kvm *kvm, unsigned long *rmapp,
 680                                          unsigned long gfn))
 681 {
 682         int ret;
 683         int retval = 0;
 684         struct kvm_memslots *slots;
 685         struct kvm_memory_slot *memslot;
 686
 687         slots = kvm_memslots(kvm);
 688         kvm_for_each_memslot(memslot, slots) {
 689                 unsigned long start = memslot->userspace_addr;
 690                 unsigned long end;
 691
 692                 end = start + (memslot->npages << PAGE_SHIFT);
 693                 if (hva >= start && hva < end) {
 694                         gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
 695
 696                         ret = handler(kvm, &memslot->rmap[gfn_offset],
 697                                       memslot->base_gfn + gfn_offset);
 698                         retval |= ret;
 699                 }
 700         }
 701
 702         return retval;
 703 }
 704
 /*
  * Empty the reverse-map chain headed by *rmapp for guest frame gfn.
  * Each HPTE on the chain is unlinked; an HPTE that is still valid and
  * maps gfn is marked absent and invalidated, and its R and C bits are
  * saved in *rmapp and in the guest's view of the HPTE.  Always
  * returns 0.
  */
 705 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 706                            unsigned long gfn)
 707 {
 708         struct revmap_entry *rev = kvm->arch.revmap;
 709         unsigned long h, i, j;
 710         unsigned long *hptep;
 711         unsigned long ptel, psize, rcbits;
 712
             /* One chain entry is removed per iteration, until none is left */
 713         for (;;) {
 714                 lock_rmap(rmapp);
 715                 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
 716                         unlock_rmap(rmapp);
 717                         break;
 718                 }
 719
 720                 /*
 721                  * To avoid an ABBA deadlock with the HPTE lock bit,
 722                  * we can't spin on the HPTE lock while holding the
 723                  * rmap chain lock.
 724                  */
 725                 i = *rmapp & KVMPPC_RMAP_INDEX;
 726                 hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
 727                 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
 728                         /* unlock rmap before spinning on the HPTE lock */
 729                         unlock_rmap(rmapp);
 730                         while (hptep[0] & HPTE_V_HVLOCK)
 731                                 cpu_relax();
                             /* Start over: the chain head may have changed meanwhile */
 732                         continue;
 733                 }
                     /* rev[] links the chain through forw/back; i == forw means i is alone */
 734                 j = rev[i].forw;
 735                 if (j == i) {
 736                         /* chain is now empty */
 737                         *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
 738                 } else {
 739                         /* remove i from chain */
 740                         h = rev[i].back;
 741                         rev[h].forw = j;
 742                         rev[j].back = h;
 743                         rev[i].forw = rev[i].back = i;
                             /* j becomes the new head of the chain */
 744                         *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
 745                 }
 746
 747                 /* Now check and modify the HPTE */
 748                 ptel = rev[i].guest_rpte;
 749                 psize = hpte_page_size(hptep[0], ptel);
 750                 if ((hptep[0] & HPTE_V_VALID) &&
 751                     hpte_rpn(ptel, psize) == gfn) {
 752                         hptep[0] |= HPTE_V_ABSENT;
 753                         kvmppc_invalidate_hpte(kvm, hptep, i);
 754                         /* Harvest R and C */
 755                         rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
 756                         *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
 757                         rev[i].guest_rpte = ptel | rcbits;
 758                 }
                     /* Release the rmap lock first, then the HPTE lock bit */
 759                 unlock_rmap(rmapp);
 760                 hptep[0] &= ~HPTE_V_HVLOCK;
 761         }
 762         return 0;
 763 }
 764
 765 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
 766 {
 767         if (kvm->arch.using_mmu_notifiers)
 768                 kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
 769         return 0;
 770 }
 771
 772 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 773                          unsigned long gfn)
 774 {
 775         if (!kvm->arch.using_mmu_notifiers)
 776                 return 0;
 777         if (!(*rmapp & KVMPPC_RMAP_REFERENCED))
 778                 return 0;
 779         kvm_unmap_rmapp(kvm, rmapp, gfn);
 780         while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp))
 781                 cpu_relax();
 782         *rmapp &= ~KVMPPC_RMAP_REFERENCED;
 783         __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp);
 784         return 1;
 785 }
 786
 787 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
 788 {
 789         if (!kvm->arch.using_mmu_notifiers)
 790                 return 0;
 791         return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
 792 }
 793
 794 static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 795                               unsigned long gfn)
 796 {
 797         return !!(*rmapp & KVMPPC_RMAP_REFERENCED);
 798 }
 799
 800 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
 801 {
 802         if (!kvm->arch.using_mmu_notifiers)
 803                 return 0;
 804         return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
 805 }
 806
 807 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 808 {
 809         if (!kvm->arch.using_mmu_notifiers)
 810                 return;
 811         kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
 812 }
 813
 814 void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
 815                             unsigned long *nb_ret)
 816 {
 817         struct kvm_memory_slot *memslot;
 818         unsigned long gfn = gpa >> PAGE_SHIFT;
 819         struct page *page, *pages[1];
 820         int npages;
 821         unsigned long hva, psize, offset;
 822         unsigned long pa;
 823         unsigned long *physp;
 824
 825         memslot = gfn_to_memslot(kvm, gfn);
 826         if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
 827                 return NULL;
 828         if (!kvm->arch.using_mmu_notifiers) {
 829                 physp = kvm->arch.slot_phys[memslot->id];
 830                 if (!physp)
 831                         return NULL;
 832                 physp += gfn - memslot->base_gfn;
 833                 pa = *physp;
 834                 if (!pa) {
 835                         if (kvmppc_get_guest_page(kvm, gfn, memslot,
 836                                                   PAGE_SIZE) < 0)
 837                                 return NULL;
 838                         pa = *physp;
 839                 }
 840                 page = pfn_to_page(pa >> PAGE_SHIFT);
 841         } else {
 842                 hva = gfn_to_hva_memslot(memslot, gfn);
 843                 npages = get_user_pages_fast(hva, 1, 1, pages);
 844                 if (npages < 1)
 845                         return NULL;
 846                 page = pages[0];
 847         }
 848         psize = PAGE_SIZE;
 849         if (PageHuge(page)) {
 850                 page = compound_head(page);
 851                 psize <<= compound_order(page);
 852         }
 853         if (!kvm->arch.using_mmu_notifiers)
 854                 get_page(page);
 855         offset = gpa & (psize - 1);
 856         if (nb_ret)
 857                 *nb_ret = psize - offset;
 858         return page_address(page) + offset;
 859 }
 860
 /*
  * Drop the page reference for kernel virtual address va; the reference
  * is released on the compound head of the page containing va.
  */
 void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
 {
         put_page(compound_head(virt_to_page(va)));
 }
 868
 869 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
 870 {
 871         struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
 872
 873         if (cpu_has_feature(CPU_FTR_ARCH_206))
 874                 vcpu->arch.slb_nr = 32;         /* POWER7 */
 875         else
 876                 vcpu->arch.slb_nr = 64;
 877
 878         mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
 879         mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
 880
 881         vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
 882 }