memory: unify loops to sync dirty log bitmap
[qemu/ar7.git] / target/ppc/kvm.c (blob 9842b3bb124e7f03443a0817fe2885f9a2a16f33)
/*
 * PowerPC implementation of KVM hooks
 *
 * Copyright IBM Corp. 2007
 * Copyright (C) 2011 Freescale Semiconductor, Inc.
 *
 * Authors:
 *  Jerone Young <jyoung5@us.ibm.com>
 *  Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
 *  Hollis Blanchard <hollisb@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include <dirent.h>
#include <sys/ioctl.h>
#include <sys/vfs.h>

#include <linux/kvm.h>

#include "qemu-common.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "cpu.h"
#include "cpu-models.h"
#include "qemu/timer.h"
#include "sysemu/sysemu.h"
#include "sysemu/hw_accel.h"
#include "kvm_ppc.h"
#include "sysemu/cpus.h"
#include "sysemu/device_tree.h"
#include "mmu-hash64.h"

#include "hw/sysbus.h"
#include "hw/ppc/spapr.h"
#include "hw/ppc/spapr_vio.h"
#include "hw/ppc/spapr_cpu_core.h"
#include "hw/ppc/ppc.h"
#include "sysemu/watchdog.h"
#include "trace.h"
#include "exec/gdbstub.h"
#include "exec/memattrs.h"
#include "exec/ram_addr.h"
#include "sysemu/hostmem.h"
#include "qemu/cutils.h"
#include "qemu/mmap-alloc.h"
#include "elf.h"
#include "sysemu/kvm_int.h"

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif
#define PROC_DEVTREE_CPU      "/proc/device-tree/cpus/"

const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
    KVM_CAP_LAST_INFO
};

static int cap_interrupt_unset = false;
static int cap_interrupt_level = false;
static int cap_segstate;
static int cap_booke_sregs;
static int cap_ppc_smt;
static int cap_ppc_smt_possible;
static int cap_ppc_rma;
static int cap_spapr_tce;
static int cap_spapr_tce_64;
static int cap_spapr_multitce;
static int cap_spapr_vfio;
static int cap_hior;
static int cap_one_reg;
static int cap_epr;
static int cap_ppc_watchdog;
static int cap_papr;
static int cap_htab_fd;
static int cap_fixup_hcalls;
static int cap_htm;             /* Hardware transactional memory support */
static int cap_mmu_radix;
static int cap_mmu_hash_v3;
static int cap_resize_hpt;
static int cap_ppc_pvr_compat;
static int cap_ppc_safe_cache;
static int cap_ppc_safe_bounds_check;
static int cap_ppc_safe_indirect_branch;

static uint32_t debug_inst_opcode;
/* XXX We have a race condition where we actually have a level triggered
 *     interrupt, but the infrastructure can't expose that yet, so the guest
 *     takes but ignores it, goes to sleep and never gets notified that there's
 *     still an interrupt pending.
 *
 *     As a quick workaround, let's just wake up again 20 ms after we injected
 *     an interrupt. That way we can assure that we're always reinjecting
 *     interrupts in case the guest swallowed them.
 */
static QEMUTimer *idle_timer;

static void kvm_kick_cpu(void *opaque)
{
    PowerPCCPU *cpu = opaque;

    qemu_cpu_kick(CPU(cpu));
}

/* Check whether we are running with KVM-PR (instead of KVM-HV). This
 * should only be used for fallback tests - generally we should use
 * explicit capabilities for the features we want, rather than
 * assuming what is/isn't available depending on the KVM variant. */
static bool kvmppc_is_pr(KVMState *ks)
{
    /* Assume KVM-PR if the GET_PVINFO capability is available */
    return kvm_vm_check_extension(ks, KVM_CAP_PPC_GET_PVINFO) != 0;
}

static int kvm_ppc_register_host_cpu_type(MachineState *ms);
static void kvmppc_get_cpu_characteristics(KVMState *s);
int kvm_arch_init(MachineState *ms, KVMState *s)
{
    cap_interrupt_unset = kvm_check_extension(s, KVM_CAP_PPC_UNSET_IRQ);
    cap_interrupt_level = kvm_check_extension(s, KVM_CAP_PPC_IRQ_LEVEL);
    cap_segstate = kvm_check_extension(s, KVM_CAP_PPC_SEGSTATE);
    cap_booke_sregs = kvm_check_extension(s, KVM_CAP_PPC_BOOKE_SREGS);
    cap_ppc_smt_possible = kvm_vm_check_extension(s, KVM_CAP_PPC_SMT_POSSIBLE);
    cap_ppc_rma = kvm_check_extension(s, KVM_CAP_PPC_RMA);
    cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE);
    cap_spapr_tce_64 = kvm_check_extension(s, KVM_CAP_SPAPR_TCE_64);
    cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE);
    cap_spapr_vfio = kvm_vm_check_extension(s, KVM_CAP_SPAPR_TCE_VFIO);
    cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG);
    cap_hior = kvm_check_extension(s, KVM_CAP_PPC_HIOR);
    cap_epr = kvm_check_extension(s, KVM_CAP_PPC_EPR);
    cap_ppc_watchdog = kvm_check_extension(s, KVM_CAP_PPC_BOOKE_WATCHDOG);
    /* Note: we don't set cap_papr here, because this capability is
     * only activated after this by kvmppc_set_papr() */
    cap_htab_fd = kvm_vm_check_extension(s, KVM_CAP_PPC_HTAB_FD);
    cap_fixup_hcalls = kvm_check_extension(s, KVM_CAP_PPC_FIXUP_HCALL);
    cap_ppc_smt = kvm_vm_check_extension(s, KVM_CAP_PPC_SMT);
    cap_htm = kvm_vm_check_extension(s, KVM_CAP_PPC_HTM);
    cap_mmu_radix = kvm_vm_check_extension(s, KVM_CAP_PPC_MMU_RADIX);
    cap_mmu_hash_v3 = kvm_vm_check_extension(s, KVM_CAP_PPC_MMU_HASH_V3);
    cap_resize_hpt = kvm_vm_check_extension(s, KVM_CAP_SPAPR_RESIZE_HPT);
    kvmppc_get_cpu_characteristics(s);
    /*
     * Note: setting it to false because there is no such capability
     * in KVM at this moment.
     *
     * TODO: call kvm_vm_check_extension() with the right capability
     * after the kernel starts implementing it.
     */
    cap_ppc_pvr_compat = false;

    if (!cap_interrupt_level) {
        fprintf(stderr, "KVM: Couldn't find level irq capability. Expect the "
                        "VM to stall at times!\n");
    }

    kvm_ppc_register_host_cpu_type(ms);

    return 0;
}
int kvm_arch_irqchip_create(MachineState *ms, KVMState *s)
{
    return 0;
}

static int kvm_arch_sync_sregs(PowerPCCPU *cpu)
{
    CPUPPCState *cenv = &cpu->env;
    CPUState *cs = CPU(cpu);
    struct kvm_sregs sregs;
    int ret;

    if (cenv->excp_model == POWERPC_EXCP_BOOKE) {
        /* What we're really trying to say is "if we're on BookE, we use
           the native PVR for now". This is the only sane way to check
           it though, so we potentially confuse users that they can run
           BookE guests on BookS. Let's hope nobody dares enough :) */
        return 0;
    } else {
        if (!cap_segstate) {
            fprintf(stderr, "kvm error: missing PVR setting capability\n");
            return -ENOSYS;
        }
    }

    ret = kvm_vcpu_ioctl(cs, KVM_GET_SREGS, &sregs);
    if (ret) {
        return ret;
    }

    sregs.pvr = cenv->spr[SPR_PVR];
    return kvm_vcpu_ioctl(cs, KVM_SET_SREGS, &sregs);
}
/* Set up a shared TLB array with KVM */
static int kvm_booke206_tlb_init(PowerPCCPU *cpu)
{
    CPUPPCState *env = &cpu->env;
    CPUState *cs = CPU(cpu);
    struct kvm_book3e_206_tlb_params params = {};
    struct kvm_config_tlb cfg = {};
    unsigned int entries = 0;
    int ret, i;

    if (!kvm_enabled() ||
        !kvm_check_extension(cs->kvm_state, KVM_CAP_SW_TLB)) {
        return 0;
    }

    assert(ARRAY_SIZE(params.tlb_sizes) == BOOKE206_MAX_TLBN);

    for (i = 0; i < BOOKE206_MAX_TLBN; i++) {
        params.tlb_sizes[i] = booke206_tlb_size(env, i);
        params.tlb_ways[i] = booke206_tlb_ways(env, i);
        entries += params.tlb_sizes[i];
    }

    assert(entries == env->nb_tlb);
    assert(sizeof(struct kvm_book3e_206_tlb_entry) == sizeof(ppcmas_tlb_t));

    env->tlb_dirty = true;

    cfg.array = (uintptr_t)env->tlb.tlbm;
    cfg.array_len = sizeof(ppcmas_tlb_t) * entries;
    cfg.params = (uintptr_t)&params;
    cfg.mmu_type = KVM_MMU_FSL_BOOKE_NOHV;

    ret = kvm_vcpu_enable_cap(cs, KVM_CAP_SW_TLB, 0, (uintptr_t)&cfg);
    if (ret < 0) {
        fprintf(stderr, "%s: couldn't enable KVM_CAP_SW_TLB: %s\n",
                __func__, strerror(-ret));
        return ret;
    }

    env->kvm_sw_tlb = true;
    return 0;
}
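
/*
 * Note on the shared-TLB flow: after kvm_booke206_tlb_init() registers
 * env->tlb.tlbm with KVM, QEMU only marks env->tlb_dirty when it touches
 * the TLB; the actual flush to the kernel happens in kvm_sw_tlb_put(),
 * called from kvm_arch_put_registers() further down in this file.
 */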
#if defined(TARGET_PPC64)
static void kvm_get_fallback_smmu_info(PowerPCCPU *cpu,
                                       struct kvm_ppc_smmu_info *info)
{
    CPUPPCState *env = &cpu->env;
    CPUState *cs = CPU(cpu);

    memset(info, 0, sizeof(*info));

    /* We don't have the new KVM_PPC_GET_SMMU_INFO ioctl, so
     * need to "guess" what the supported page sizes are.
     *
     * For that to work we make a few assumptions:
     *
     * - Check whether we are running "PR" KVM which only supports 4K
     *   and 16M pages, but supports them regardless of the backing
     *   store characteristics. We also don't support 1T segments.
     *
     *   This is safe as if HV KVM ever supports that capability or PR
     *   KVM grows support for more page/segment sizes, those versions
     *   will have implemented KVM_CAP_PPC_GET_SMMU_INFO and thus we
     *   will not hit this fallback
     *
     * - Else we are running HV KVM. This means we only support page
     *   sizes that fit in the backing store. Additionally we only
     *   advertise 64K pages if the processor is ARCH 2.06 and we assume
     *   P7 encodings for the SLB and hash table. Here too, we assume
     *   support for any newer processor will mean a kernel that
     *   implements KVM_CAP_PPC_GET_SMMU_INFO and thus doesn't hit
     *   this fallback.
     */
    if (kvmppc_is_pr(cs->kvm_state)) {
        /* No flags */
        info->flags = 0;
        info->slb_size = 64;

        /* Standard 4k base page size segment */
        info->sps[0].page_shift = 12;
        info->sps[0].slb_enc = 0;
        info->sps[0].enc[0].page_shift = 12;
        info->sps[0].enc[0].pte_enc = 0;

        /* Standard 16M large page size segment */
        info->sps[1].page_shift = 24;
        info->sps[1].slb_enc = SLB_VSID_L;
        info->sps[1].enc[0].page_shift = 24;
        info->sps[1].enc[0].pte_enc = 0;
    } else {
        int i = 0;

        /* HV KVM has backing store size restrictions */
        info->flags = KVM_PPC_PAGE_SIZES_REAL;

        if (env->mmu_model & POWERPC_MMU_1TSEG) {
            info->flags |= KVM_PPC_1T_SEGMENTS;
        }

        if (POWERPC_MMU_VER(env->mmu_model) == POWERPC_MMU_VER_2_06 ||
            POWERPC_MMU_VER(env->mmu_model) == POWERPC_MMU_VER_2_07) {
            info->slb_size = 32;
        } else {
            info->slb_size = 64;
        }

        /* Standard 4k base page size segment */
        info->sps[i].page_shift = 12;
        info->sps[i].slb_enc = 0;
        info->sps[i].enc[0].page_shift = 12;
        info->sps[i].enc[0].pte_enc = 0;
        i++;

        /* 64K on MMU 2.06 and later */
        if (POWERPC_MMU_VER(env->mmu_model) == POWERPC_MMU_VER_2_06 ||
            POWERPC_MMU_VER(env->mmu_model) == POWERPC_MMU_VER_2_07) {
            info->sps[i].page_shift = 16;
            info->sps[i].slb_enc = 0x110;
            info->sps[i].enc[0].page_shift = 16;
            info->sps[i].enc[0].pte_enc = 1;
            i++;
        }

        /* Standard 16M large page size segment */
        info->sps[i].page_shift = 24;
        info->sps[i].slb_enc = SLB_VSID_L;
        info->sps[i].enc[0].page_shift = 24;
        info->sps[i].enc[0].pte_enc = 0;
    }
}
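
/*
 * The slb_enc values above are the SLB_VSID_* encodings from mmu-hash64.h;
 * the hard-coded 0x110 for the 64K case is presumably SLB_VSID_L |
 * SLB_VSID_LP_01, with pte_enc = 1 being the matching hash-PTE LP encoding
 * on P7-style MMUs.
 */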
static void kvm_get_smmu_info(PowerPCCPU *cpu, struct kvm_ppc_smmu_info *info)
{
    CPUState *cs = CPU(cpu);
    int ret;

    if (kvm_check_extension(cs->kvm_state, KVM_CAP_PPC_GET_SMMU_INFO)) {
        ret = kvm_vm_ioctl(cs->kvm_state, KVM_PPC_GET_SMMU_INFO, info);
        if (ret == 0) {
            return;
        }
    }

    kvm_get_fallback_smmu_info(cpu, info);
}

struct ppc_radix_page_info *kvm_get_radix_page_info(void)
{
    KVMState *s = KVM_STATE(current_machine->accelerator);
    struct ppc_radix_page_info *radix_page_info;
    struct kvm_ppc_rmmu_info rmmu_info;
    int i;

    if (!kvm_check_extension(s, KVM_CAP_PPC_MMU_RADIX)) {
        return NULL;
    }
    if (kvm_vm_ioctl(s, KVM_PPC_GET_RMMU_INFO, &rmmu_info)) {
        return NULL;
    }
    radix_page_info = g_malloc0(sizeof(*radix_page_info));
    radix_page_info->count = 0;
    for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
        if (rmmu_info.ap_encodings[i]) {
            radix_page_info->entries[i] = rmmu_info.ap_encodings[i];
            radix_page_info->count++;
        }
    }
    return radix_page_info;
}

target_ulong kvmppc_configure_v3_mmu(PowerPCCPU *cpu,
                                     bool radix, bool gtse,
                                     uint64_t proc_tbl)
{
    CPUState *cs = CPU(cpu);
    int ret;
    uint64_t flags = 0;
    struct kvm_ppc_mmuv3_cfg cfg = {
        .process_table = proc_tbl,
    };

    if (radix) {
        flags |= KVM_PPC_MMUV3_RADIX;
    }
    if (gtse) {
        flags |= KVM_PPC_MMUV3_GTSE;
    }
    cfg.flags = flags;
    ret = kvm_vm_ioctl(cs->kvm_state, KVM_PPC_CONFIGURE_V3_MMU, &cfg);
    switch (ret) {
    case 0:
        return H_SUCCESS;
    case -EINVAL:
        return H_PARAMETER;
    case -ENODEV:
        return H_NOT_AVAILABLE;
    default:
        return H_HARDWARE;
    }
}

static bool kvm_valid_page_size(uint32_t flags, long rampgsize, uint32_t shift)
{
    if (!(flags & KVM_PPC_PAGE_SIZES_REAL)) {
        return true;
    }

    return (1ul << shift) <= rampgsize;
}
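
/*
 * Worked example: with KVM_PPC_PAGE_SIZES_REAL set (HV KVM) and guest RAM
 * backed by 64K pages (rampgsize == 0x10000), a 16M segment page size
 * (shift == 24) is rejected because (1ul << 24) > 0x10000, while 4K and
 * 64K (shifts 12 and 16) pass. Without the flag (PR KVM) every size passes.
 */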
static long max_cpu_page_size;

static void kvm_fixup_page_sizes(PowerPCCPU *cpu)
{
    static struct kvm_ppc_smmu_info smmu_info;
    static bool has_smmu_info;
    CPUPPCState *env = &cpu->env;
    int iq, ik, jq, jk;
    bool has_64k_pages = false;

    /* We only handle page sizes for 64-bit server guests for now */
    if (!(env->mmu_model & POWERPC_MMU_64)) {
        return;
    }

    /* Collect MMU info from kernel if not already */
    if (!has_smmu_info) {
        kvm_get_smmu_info(cpu, &smmu_info);
        has_smmu_info = true;
    }

    if (!max_cpu_page_size) {
        max_cpu_page_size = qemu_getrampagesize();
    }

    /* Convert to QEMU form */
    memset(&env->sps, 0, sizeof(env->sps));

    /* If we have HV KVM, we need to forbid CI large pages if our
     * host page size is smaller than 64K.
     */
    if (smmu_info.flags & KVM_PPC_PAGE_SIZES_REAL) {
        env->ci_large_pages = getpagesize() >= 0x10000;
    }

    /*
     * XXX This loop should be an entry wide AND of the capabilities that
     *     the selected CPU has with the capabilities that KVM supports.
     */
    for (ik = iq = 0; ik < KVM_PPC_PAGE_SIZES_MAX_SZ; ik++) {
        struct ppc_one_seg_page_size *qsps = &env->sps.sps[iq];
        struct kvm_ppc_one_seg_page_size *ksps = &smmu_info.sps[ik];

        if (!kvm_valid_page_size(smmu_info.flags, max_cpu_page_size,
                                 ksps->page_shift)) {
            continue;
        }
        qsps->page_shift = ksps->page_shift;
        qsps->slb_enc = ksps->slb_enc;
        for (jk = jq = 0; jk < KVM_PPC_PAGE_SIZES_MAX_SZ; jk++) {
            if (!kvm_valid_page_size(smmu_info.flags, max_cpu_page_size,
                                     ksps->enc[jk].page_shift)) {
                continue;
            }
            if (ksps->enc[jk].page_shift == 16) {
                has_64k_pages = true;
            }
            qsps->enc[jq].page_shift = ksps->enc[jk].page_shift;
            qsps->enc[jq].pte_enc = ksps->enc[jk].pte_enc;
            if (++jq >= PPC_PAGE_SIZES_MAX_SZ) {
                break;
            }
        }
        if (++iq >= PPC_PAGE_SIZES_MAX_SZ) {
            break;
        }
    }
    env->slb_nr = smmu_info.slb_size;
    if (!(smmu_info.flags & KVM_PPC_1T_SEGMENTS)) {
        env->mmu_model &= ~POWERPC_MMU_1TSEG;
    }
    if (!has_64k_pages) {
        env->mmu_model &= ~POWERPC_MMU_64K;
    }
}

bool kvmppc_is_mem_backend_page_size_ok(const char *obj_path)
{
    Object *mem_obj = object_resolve_path(obj_path, NULL);
    char *mempath = object_property_get_str(mem_obj, "mem-path", NULL);
    long pagesize;

    if (mempath) {
        pagesize = qemu_mempath_getpagesize(mempath);
        g_free(mempath);
    } else {
        pagesize = getpagesize();
    }

    return pagesize >= max_cpu_page_size;
}
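
/*
 * Example: if guest RAM was sized against 16M hugepages
 * (max_cpu_page_size == 16M), a memory backend without a mem-path only
 * offers getpagesize() (e.g. 64K) and is rejected here, while a
 * hugetlbfs-backed mem-path with pages of at least 16M is accepted.
 */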
#else /* defined (TARGET_PPC64) */

static inline void kvm_fixup_page_sizes(PowerPCCPU *cpu)
{
}

bool kvmppc_is_mem_backend_page_size_ok(const char *obj_path)
{
    return true;
}

#endif /* !defined (TARGET_PPC64) */

unsigned long kvm_arch_vcpu_id(CPUState *cpu)
{
    return POWERPC_CPU(cpu)->vcpu_id;
}

/* e500 supports 2 h/w breakpoints and 2 watchpoints.
 * book3s supports only 1 watchpoint, so an array size
 * of 4 is sufficient for now.
 */
#define MAX_HW_BKPTS 4

static struct HWBreakpoint {
    target_ulong addr;
    int type;
} hw_debug_points[MAX_HW_BKPTS];

static CPUWatchpoint hw_watchpoint;

/* By default no breakpoints or watchpoints are supported */
static int max_hw_breakpoint;
static int max_hw_watchpoint;
static int nb_hw_breakpoint;
static int nb_hw_watchpoint;

static void kvmppc_hw_debug_points_init(CPUPPCState *cenv)
{
    if (cenv->excp_model == POWERPC_EXCP_BOOKE) {
        max_hw_breakpoint = 2;
        max_hw_watchpoint = 2;
    }

    if ((max_hw_breakpoint + max_hw_watchpoint) > MAX_HW_BKPTS) {
        fprintf(stderr, "Error initializing h/w breakpoints\n");
        return;
    }
}
int kvm_arch_init_vcpu(CPUState *cs)
{
    PowerPCCPU *cpu = POWERPC_CPU(cs);
    CPUPPCState *cenv = &cpu->env;
    int ret;

    /* Gather server mmu info from KVM and update the CPU state */
    kvm_fixup_page_sizes(cpu);

    /* Synchronize sregs with kvm */
    ret = kvm_arch_sync_sregs(cpu);
    if (ret) {
        if (ret == -EINVAL) {
            error_report("Register sync failed... If you're using kvm-hv.ko,"
                         " only \"-cpu host\" is possible");
        }
        return ret;
    }

    idle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, kvm_kick_cpu, cpu);

    switch (cenv->mmu_model) {
    case POWERPC_MMU_BOOKE206:
        /* This target supports access to KVM's guest TLB */
        ret = kvm_booke206_tlb_init(cpu);
        break;
    case POWERPC_MMU_2_07:
        if (!cap_htm && !kvmppc_is_pr(cs->kvm_state)) {
            /* KVM-HV has transactional memory on POWER8 also without the
             * KVM_CAP_PPC_HTM extension, so enable it here instead as
             * long as it's available to userspace on the host. */
            if (qemu_getauxval(AT_HWCAP2) & PPC_FEATURE2_HAS_HTM) {
                cap_htm = true;
            }
        }
        break;
    default:
        break;
    }

    kvm_get_one_reg(cs, KVM_REG_PPC_DEBUG_INST, &debug_inst_opcode);
    kvmppc_hw_debug_points_init(cenv);

    return ret;
}
static void kvm_sw_tlb_put(PowerPCCPU *cpu)
{
    CPUPPCState *env = &cpu->env;
    CPUState *cs = CPU(cpu);
    struct kvm_dirty_tlb dirty_tlb;
    unsigned char *bitmap;
    int ret;

    if (!env->kvm_sw_tlb) {
        return;
    }

    bitmap = g_malloc((env->nb_tlb + 7) / 8);
    memset(bitmap, 0xFF, (env->nb_tlb + 7) / 8);

    dirty_tlb.bitmap = (uintptr_t)bitmap;
    dirty_tlb.num_dirty = env->nb_tlb;

    ret = kvm_vcpu_ioctl(cs, KVM_DIRTY_TLB, &dirty_tlb);
    if (ret) {
        fprintf(stderr, "%s: KVM_DIRTY_TLB: %s\n",
                __func__, strerror(-ret));
    }

    g_free(bitmap);
}
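
/*
 * The all-ones bitmap above deliberately marks every one of env->nb_tlb
 * entries dirty, so KVM_DIRTY_TLB reloads the complete shadow array;
 * (env->nb_tlb + 7) / 8 is just the entry count rounded up to whole bytes.
 */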
static void kvm_get_one_spr(CPUState *cs, uint64_t id, int spr)
{
    PowerPCCPU *cpu = POWERPC_CPU(cs);
    CPUPPCState *env = &cpu->env;
    union {
        uint32_t u32;
        uint64_t u64;
    } val;
    struct kvm_one_reg reg = {
        .id = id,
        .addr = (uintptr_t) &val,
    };
    int ret;

    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
    if (ret != 0) {
        trace_kvm_failed_spr_get(spr, strerror(errno));
    } else {
        switch (id & KVM_REG_SIZE_MASK) {
        case KVM_REG_SIZE_U32:
            env->spr[spr] = val.u32;
            break;

        case KVM_REG_SIZE_U64:
            env->spr[spr] = val.u64;
            break;

        default:
            /* Don't handle this size yet */
            abort();
        }
    }
}

static void kvm_put_one_spr(CPUState *cs, uint64_t id, int spr)
{
    PowerPCCPU *cpu = POWERPC_CPU(cs);
    CPUPPCState *env = &cpu->env;
    union {
        uint32_t u32;
        uint64_t u64;
    } val;
    struct kvm_one_reg reg = {
        .id = id,
        .addr = (uintptr_t) &val,
    };
    int ret;

    switch (id & KVM_REG_SIZE_MASK) {
    case KVM_REG_SIZE_U32:
        val.u32 = env->spr[spr];
        break;

    case KVM_REG_SIZE_U64:
        val.u64 = env->spr[spr];
        break;

    default:
        /* Don't handle this size yet */
        abort();
    }

    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
    if (ret != 0) {
        trace_kvm_failed_spr_set(spr, strerror(errno));
    }
}
static int kvm_put_fp(CPUState *cs)
{
    PowerPCCPU *cpu = POWERPC_CPU(cs);
    CPUPPCState *env = &cpu->env;
    struct kvm_one_reg reg;
    int i;
    int ret;

    if (env->insns_flags & PPC_FLOAT) {
        uint64_t fpscr = env->fpscr;
        bool vsx = !!(env->insns_flags2 & PPC2_VSX);

        reg.id = KVM_REG_PPC_FPSCR;
        reg.addr = (uintptr_t)&fpscr;
        ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
        if (ret < 0) {
            DPRINTF("Unable to set FPSCR to KVM: %s\n", strerror(errno));
            return ret;
        }

        for (i = 0; i < 32; i++) {
            uint64_t vsr[2];

#ifdef HOST_WORDS_BIGENDIAN
            vsr[0] = float64_val(env->fpr[i]);
            vsr[1] = env->vsr[i];
#else
            vsr[0] = env->vsr[i];
            vsr[1] = float64_val(env->fpr[i]);
#endif
            reg.addr = (uintptr_t) &vsr;
            reg.id = vsx ? KVM_REG_PPC_VSR(i) : KVM_REG_PPC_FPR(i);

            ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
            if (ret < 0) {
                DPRINTF("Unable to set %s%d to KVM: %s\n", vsx ? "VSR" : "FPR",
                        i, strerror(errno));
                return ret;
            }
        }
    }

    if (env->insns_flags & PPC_ALTIVEC) {
        reg.id = KVM_REG_PPC_VSCR;
        reg.addr = (uintptr_t)&env->vscr;
        ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
        if (ret < 0) {
            DPRINTF("Unable to set VSCR to KVM: %s\n", strerror(errno));
            return ret;
        }

        for (i = 0; i < 32; i++) {
            reg.id = KVM_REG_PPC_VR(i);
            reg.addr = (uintptr_t)&env->avr[i];
            ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
            if (ret < 0) {
                DPRINTF("Unable to set VR%d to KVM: %s\n", i, strerror(errno));
                return ret;
            }
        }
    }

    return 0;
}
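
/*
 * Layout reminder for the vsr[2] pairs above: a 128-bit VSR is transferred
 * as two 64-bit doublewords, with FPR[i] occupying the most significant
 * doubleword. The #ifdef picks the element order so that the buffer KVM
 * sees is the same regardless of host endianness.
 */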
static int kvm_get_fp(CPUState *cs)
{
    PowerPCCPU *cpu = POWERPC_CPU(cs);
    CPUPPCState *env = &cpu->env;
    struct kvm_one_reg reg;
    int i;
    int ret;

    if (env->insns_flags & PPC_FLOAT) {
        uint64_t fpscr;
        bool vsx = !!(env->insns_flags2 & PPC2_VSX);

        reg.id = KVM_REG_PPC_FPSCR;
        reg.addr = (uintptr_t)&fpscr;
        ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
        if (ret < 0) {
            DPRINTF("Unable to get FPSCR from KVM: %s\n", strerror(errno));
            return ret;
        } else {
            env->fpscr = fpscr;
        }

        for (i = 0; i < 32; i++) {
            uint64_t vsr[2];

            reg.addr = (uintptr_t) &vsr;
            reg.id = vsx ? KVM_REG_PPC_VSR(i) : KVM_REG_PPC_FPR(i);

            ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
            if (ret < 0) {
                DPRINTF("Unable to get %s%d from KVM: %s\n",
                        vsx ? "VSR" : "FPR", i, strerror(errno));
                return ret;
            } else {
#ifdef HOST_WORDS_BIGENDIAN
                env->fpr[i] = vsr[0];
                if (vsx) {
                    env->vsr[i] = vsr[1];
                }
#else
                env->fpr[i] = vsr[1];
                if (vsx) {
                    env->vsr[i] = vsr[0];
                }
#endif
            }
        }
    }

    if (env->insns_flags & PPC_ALTIVEC) {
        reg.id = KVM_REG_PPC_VSCR;
        reg.addr = (uintptr_t)&env->vscr;
        ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
        if (ret < 0) {
            DPRINTF("Unable to get VSCR from KVM: %s\n", strerror(errno));
            return ret;
        }

        for (i = 0; i < 32; i++) {
            reg.id = KVM_REG_PPC_VR(i);
            reg.addr = (uintptr_t)&env->avr[i];
            ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
            if (ret < 0) {
                DPRINTF("Unable to get VR%d from KVM: %s\n",
                        i, strerror(errno));
                return ret;
            }
        }
    }

    return 0;
}
#if defined(TARGET_PPC64)
static int kvm_get_vpa(CPUState *cs)
{
    PowerPCCPU *cpu = POWERPC_CPU(cs);
    CPUPPCState *env = &cpu->env;
    struct kvm_one_reg reg;
    int ret;

    reg.id = KVM_REG_PPC_VPA_ADDR;
    reg.addr = (uintptr_t)&env->vpa_addr;
    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
    if (ret < 0) {
        DPRINTF("Unable to get VPA address from KVM: %s\n", strerror(errno));
        return ret;
    }

    assert((uintptr_t)&env->slb_shadow_size
           == ((uintptr_t)&env->slb_shadow_addr + 8));
    reg.id = KVM_REG_PPC_VPA_SLB;
    reg.addr = (uintptr_t)&env->slb_shadow_addr;
    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
    if (ret < 0) {
        DPRINTF("Unable to get SLB shadow state from KVM: %s\n",
                strerror(errno));
        return ret;
    }

    assert((uintptr_t)&env->dtl_size == ((uintptr_t)&env->dtl_addr + 8));
    reg.id = KVM_REG_PPC_VPA_DTL;
    reg.addr = (uintptr_t)&env->dtl_addr;
    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
    if (ret < 0) {
        DPRINTF("Unable to get dispatch trace log state from KVM: %s\n",
                strerror(errno));
        return ret;
    }

    return 0;
}

static int kvm_put_vpa(CPUState *cs)
{
    PowerPCCPU *cpu = POWERPC_CPU(cs);
    CPUPPCState *env = &cpu->env;
    struct kvm_one_reg reg;
    int ret;

    /* SLB shadow or DTL can't be registered unless a master VPA is
     * registered. That means when restoring state, if a VPA *is*
     * registered, we need to set that up first. If not, we need to
     * deregister the others before deregistering the master VPA */
    assert(env->vpa_addr || !(env->slb_shadow_addr || env->dtl_addr));

    if (env->vpa_addr) {
        reg.id = KVM_REG_PPC_VPA_ADDR;
        reg.addr = (uintptr_t)&env->vpa_addr;
        ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
        if (ret < 0) {
            DPRINTF("Unable to set VPA address to KVM: %s\n", strerror(errno));
            return ret;
        }
    }

    assert((uintptr_t)&env->slb_shadow_size
           == ((uintptr_t)&env->slb_shadow_addr + 8));
    reg.id = KVM_REG_PPC_VPA_SLB;
    reg.addr = (uintptr_t)&env->slb_shadow_addr;
    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
    if (ret < 0) {
        DPRINTF("Unable to set SLB shadow state to KVM: %s\n", strerror(errno));
        return ret;
    }

    assert((uintptr_t)&env->dtl_size == ((uintptr_t)&env->dtl_addr + 8));
    reg.id = KVM_REG_PPC_VPA_DTL;
    reg.addr = (uintptr_t)&env->dtl_addr;
    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
    if (ret < 0) {
        DPRINTF("Unable to set dispatch trace log state to KVM: %s\n",
                strerror(errno));
        return ret;
    }

    if (!env->vpa_addr) {
        reg.id = KVM_REG_PPC_VPA_ADDR;
        reg.addr = (uintptr_t)&env->vpa_addr;
        ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
        if (ret < 0) {
            DPRINTF("Unable to set VPA address to KVM: %s\n", strerror(errno));
            return ret;
        }
    }

    return 0;
}
#endif /* TARGET_PPC64 */
int kvmppc_put_books_sregs(PowerPCCPU *cpu)
{
    CPUPPCState *env = &cpu->env;
    struct kvm_sregs sregs;
    int i;

    sregs.pvr = env->spr[SPR_PVR];

    if (cpu->vhyp) {
        PPCVirtualHypervisorClass *vhc =
            PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
        sregs.u.s.sdr1 = vhc->encode_hpt_for_kvm_pr(cpu->vhyp);
    } else {
        sregs.u.s.sdr1 = env->spr[SPR_SDR1];
    }

    /* Sync SLB */
#ifdef TARGET_PPC64
    for (i = 0; i < ARRAY_SIZE(env->slb); i++) {
        sregs.u.s.ppc64.slb[i].slbe = env->slb[i].esid;
        if (env->slb[i].esid & SLB_ESID_V) {
            sregs.u.s.ppc64.slb[i].slbe |= i;
        }
        sregs.u.s.ppc64.slb[i].slbv = env->slb[i].vsid;
    }
#endif

    /* Sync SRs */
    for (i = 0; i < 16; i++) {
        sregs.u.s.ppc32.sr[i] = env->sr[i];
    }

    /* Sync BATs */
    for (i = 0; i < 8; i++) {
        /* Beware. We have to swap upper and lower bits here */
        sregs.u.s.ppc32.dbat[i] = ((uint64_t)env->DBAT[0][i] << 32)
            | env->DBAT[1][i];
        sregs.u.s.ppc32.ibat[i] = ((uint64_t)env->IBAT[0][i] << 32)
            | env->IBAT[1][i];
    }

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
}
int kvm_arch_put_registers(CPUState *cs, int level)
{
    PowerPCCPU *cpu = POWERPC_CPU(cs);
    CPUPPCState *env = &cpu->env;
    struct kvm_regs regs;
    int ret;
    int i;

    ret = kvm_vcpu_ioctl(cs, KVM_GET_REGS, &regs);
    if (ret < 0) {
        return ret;
    }

    regs.ctr = env->ctr;
    regs.lr  = env->lr;
    regs.xer = cpu_read_xer(env);
    regs.msr = env->msr;
    regs.pc = env->nip;

    regs.srr0 = env->spr[SPR_SRR0];
    regs.srr1 = env->spr[SPR_SRR1];

    regs.sprg0 = env->spr[SPR_SPRG0];
    regs.sprg1 = env->spr[SPR_SPRG1];
    regs.sprg2 = env->spr[SPR_SPRG2];
    regs.sprg3 = env->spr[SPR_SPRG3];
    regs.sprg4 = env->spr[SPR_SPRG4];
    regs.sprg5 = env->spr[SPR_SPRG5];
    regs.sprg6 = env->spr[SPR_SPRG6];
    regs.sprg7 = env->spr[SPR_SPRG7];

    regs.pid = env->spr[SPR_BOOKE_PID];

    for (i = 0; i < 32; i++) {
        regs.gpr[i] = env->gpr[i];
    }

    regs.cr = 0;
    for (i = 0; i < 8; i++) {
        regs.cr |= (env->crf[i] & 15) << (4 * (7 - i));
    }

    ret = kvm_vcpu_ioctl(cs, KVM_SET_REGS, &regs);
    if (ret < 0) {
        return ret;
    }

    kvm_put_fp(cs);

    if (env->tlb_dirty) {
        kvm_sw_tlb_put(cpu);
        env->tlb_dirty = false;
    }

    if (cap_segstate && (level >= KVM_PUT_RESET_STATE)) {
        ret = kvmppc_put_books_sregs(cpu);
        if (ret < 0) {
            return ret;
        }
    }

    if (cap_hior && (level >= KVM_PUT_RESET_STATE)) {
        kvm_put_one_spr(cs, KVM_REG_PPC_HIOR, SPR_HIOR);
    }

    if (cap_one_reg) {
        int i;

        /* We deliberately ignore errors here, for kernels which have
         * the ONE_REG calls, but don't support the specific
         * registers, there's a reasonable chance things will still
         * work, at least until we try to migrate. */
        for (i = 0; i < 1024; i++) {
            uint64_t id = env->spr_cb[i].one_reg_id;

            if (id != 0) {
                kvm_put_one_spr(cs, id, i);
            }
        }

#ifdef TARGET_PPC64
        if (msr_ts) {
            for (i = 0; i < ARRAY_SIZE(env->tm_gpr); i++) {
                kvm_set_one_reg(cs, KVM_REG_PPC_TM_GPR(i), &env->tm_gpr[i]);
            }
            for (i = 0; i < ARRAY_SIZE(env->tm_vsr); i++) {
                kvm_set_one_reg(cs, KVM_REG_PPC_TM_VSR(i), &env->tm_vsr[i]);
            }
            kvm_set_one_reg(cs, KVM_REG_PPC_TM_CR, &env->tm_cr);
            kvm_set_one_reg(cs, KVM_REG_PPC_TM_LR, &env->tm_lr);
            kvm_set_one_reg(cs, KVM_REG_PPC_TM_CTR, &env->tm_ctr);
            kvm_set_one_reg(cs, KVM_REG_PPC_TM_FPSCR, &env->tm_fpscr);
            kvm_set_one_reg(cs, KVM_REG_PPC_TM_AMR, &env->tm_amr);
            kvm_set_one_reg(cs, KVM_REG_PPC_TM_PPR, &env->tm_ppr);
            kvm_set_one_reg(cs, KVM_REG_PPC_TM_VRSAVE, &env->tm_vrsave);
            kvm_set_one_reg(cs, KVM_REG_PPC_TM_VSCR, &env->tm_vscr);
            kvm_set_one_reg(cs, KVM_REG_PPC_TM_DSCR, &env->tm_dscr);
            kvm_set_one_reg(cs, KVM_REG_PPC_TM_TAR, &env->tm_tar);
        }

        if (cap_papr) {
            if (kvm_put_vpa(cs) < 0) {
                DPRINTF("Warning: Unable to set VPA information to KVM\n");
            }
        }

        kvm_set_one_reg(cs, KVM_REG_PPC_TB_OFFSET, &env->tb_env->tb_offset);
#endif /* TARGET_PPC64 */
    }

    return ret;
}
static void kvm_sync_excp(CPUPPCState *env, int vector, int ivor)
{
    env->excp_vectors[vector] = env->spr[ivor] + env->spr[SPR_BOOKE_IVPR];
}

static int kvmppc_get_booke_sregs(PowerPCCPU *cpu)
{
    CPUPPCState *env = &cpu->env;
    struct kvm_sregs sregs;
    int ret;

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
    if (ret < 0) {
        return ret;
    }

    if (sregs.u.e.features & KVM_SREGS_E_BASE) {
        env->spr[SPR_BOOKE_CSRR0] = sregs.u.e.csrr0;
        env->spr[SPR_BOOKE_CSRR1] = sregs.u.e.csrr1;
        env->spr[SPR_BOOKE_ESR] = sregs.u.e.esr;
        env->spr[SPR_BOOKE_DEAR] = sregs.u.e.dear;
        env->spr[SPR_BOOKE_MCSR] = sregs.u.e.mcsr;
        env->spr[SPR_BOOKE_TSR] = sregs.u.e.tsr;
        env->spr[SPR_BOOKE_TCR] = sregs.u.e.tcr;
        env->spr[SPR_DECR] = sregs.u.e.dec;
        env->spr[SPR_TBL] = sregs.u.e.tb & 0xffffffff;
        env->spr[SPR_TBU] = sregs.u.e.tb >> 32;
        env->spr[SPR_VRSAVE] = sregs.u.e.vrsave;
    }

    if (sregs.u.e.features & KVM_SREGS_E_ARCH206) {
        env->spr[SPR_BOOKE_PIR] = sregs.u.e.pir;
        env->spr[SPR_BOOKE_MCSRR0] = sregs.u.e.mcsrr0;
        env->spr[SPR_BOOKE_MCSRR1] = sregs.u.e.mcsrr1;
        env->spr[SPR_BOOKE_DECAR] = sregs.u.e.decar;
        env->spr[SPR_BOOKE_IVPR] = sregs.u.e.ivpr;
    }

    if (sregs.u.e.features & KVM_SREGS_E_64) {
        env->spr[SPR_BOOKE_EPCR] = sregs.u.e.epcr;
    }

    if (sregs.u.e.features & KVM_SREGS_E_SPRG8) {
        env->spr[SPR_BOOKE_SPRG8] = sregs.u.e.sprg8;
    }

    if (sregs.u.e.features & KVM_SREGS_E_IVOR) {
        env->spr[SPR_BOOKE_IVOR0] = sregs.u.e.ivor_low[0];
        kvm_sync_excp(env, POWERPC_EXCP_CRITICAL, SPR_BOOKE_IVOR0);
        env->spr[SPR_BOOKE_IVOR1] = sregs.u.e.ivor_low[1];
        kvm_sync_excp(env, POWERPC_EXCP_MCHECK, SPR_BOOKE_IVOR1);
        env->spr[SPR_BOOKE_IVOR2] = sregs.u.e.ivor_low[2];
        kvm_sync_excp(env, POWERPC_EXCP_DSI, SPR_BOOKE_IVOR2);
        env->spr[SPR_BOOKE_IVOR3] = sregs.u.e.ivor_low[3];
        kvm_sync_excp(env, POWERPC_EXCP_ISI, SPR_BOOKE_IVOR3);
        env->spr[SPR_BOOKE_IVOR4] = sregs.u.e.ivor_low[4];
        kvm_sync_excp(env, POWERPC_EXCP_EXTERNAL, SPR_BOOKE_IVOR4);
        env->spr[SPR_BOOKE_IVOR5] = sregs.u.e.ivor_low[5];
        kvm_sync_excp(env, POWERPC_EXCP_ALIGN, SPR_BOOKE_IVOR5);
        env->spr[SPR_BOOKE_IVOR6] = sregs.u.e.ivor_low[6];
        kvm_sync_excp(env, POWERPC_EXCP_PROGRAM, SPR_BOOKE_IVOR6);
        env->spr[SPR_BOOKE_IVOR7] = sregs.u.e.ivor_low[7];
        kvm_sync_excp(env, POWERPC_EXCP_FPU, SPR_BOOKE_IVOR7);
        env->spr[SPR_BOOKE_IVOR8] = sregs.u.e.ivor_low[8];
        kvm_sync_excp(env, POWERPC_EXCP_SYSCALL, SPR_BOOKE_IVOR8);
        env->spr[SPR_BOOKE_IVOR9] = sregs.u.e.ivor_low[9];
        kvm_sync_excp(env, POWERPC_EXCP_APU, SPR_BOOKE_IVOR9);
        env->spr[SPR_BOOKE_IVOR10] = sregs.u.e.ivor_low[10];
        kvm_sync_excp(env, POWERPC_EXCP_DECR, SPR_BOOKE_IVOR10);
        env->spr[SPR_BOOKE_IVOR11] = sregs.u.e.ivor_low[11];
        kvm_sync_excp(env, POWERPC_EXCP_FIT, SPR_BOOKE_IVOR11);
        env->spr[SPR_BOOKE_IVOR12] = sregs.u.e.ivor_low[12];
        kvm_sync_excp(env, POWERPC_EXCP_WDT, SPR_BOOKE_IVOR12);
        env->spr[SPR_BOOKE_IVOR13] = sregs.u.e.ivor_low[13];
        kvm_sync_excp(env, POWERPC_EXCP_DTLB, SPR_BOOKE_IVOR13);
        env->spr[SPR_BOOKE_IVOR14] = sregs.u.e.ivor_low[14];
        kvm_sync_excp(env, POWERPC_EXCP_ITLB, SPR_BOOKE_IVOR14);
        env->spr[SPR_BOOKE_IVOR15] = sregs.u.e.ivor_low[15];
        kvm_sync_excp(env, POWERPC_EXCP_DEBUG, SPR_BOOKE_IVOR15);

        if (sregs.u.e.features & KVM_SREGS_E_SPE) {
            env->spr[SPR_BOOKE_IVOR32] = sregs.u.e.ivor_high[0];
            kvm_sync_excp(env, POWERPC_EXCP_SPEU, SPR_BOOKE_IVOR32);
            env->spr[SPR_BOOKE_IVOR33] = sregs.u.e.ivor_high[1];
            kvm_sync_excp(env, POWERPC_EXCP_EFPDI, SPR_BOOKE_IVOR33);
            env->spr[SPR_BOOKE_IVOR34] = sregs.u.e.ivor_high[2];
            kvm_sync_excp(env, POWERPC_EXCP_EFPRI, SPR_BOOKE_IVOR34);
        }

        if (sregs.u.e.features & KVM_SREGS_E_PM) {
            env->spr[SPR_BOOKE_IVOR35] = sregs.u.e.ivor_high[3];
            kvm_sync_excp(env, POWERPC_EXCP_EPERFM, SPR_BOOKE_IVOR35);
        }

        if (sregs.u.e.features & KVM_SREGS_E_PC) {
            env->spr[SPR_BOOKE_IVOR36] = sregs.u.e.ivor_high[4];
            kvm_sync_excp(env, POWERPC_EXCP_DOORI, SPR_BOOKE_IVOR36);
            env->spr[SPR_BOOKE_IVOR37] = sregs.u.e.ivor_high[5];
            kvm_sync_excp(env, POWERPC_EXCP_DOORCI, SPR_BOOKE_IVOR37);
        }
    }

    if (sregs.u.e.features & KVM_SREGS_E_ARCH206_MMU) {
        env->spr[SPR_BOOKE_MAS0] = sregs.u.e.mas0;
        env->spr[SPR_BOOKE_MAS1] = sregs.u.e.mas1;
        env->spr[SPR_BOOKE_MAS2] = sregs.u.e.mas2;
        env->spr[SPR_BOOKE_MAS3] = sregs.u.e.mas7_3 & 0xffffffff;
        env->spr[SPR_BOOKE_MAS4] = sregs.u.e.mas4;
        env->spr[SPR_BOOKE_MAS6] = sregs.u.e.mas6;
        env->spr[SPR_BOOKE_MAS7] = sregs.u.e.mas7_3 >> 32;
        env->spr[SPR_MMUCFG] = sregs.u.e.mmucfg;
        env->spr[SPR_BOOKE_TLB0CFG] = sregs.u.e.tlbcfg[0];
        env->spr[SPR_BOOKE_TLB1CFG] = sregs.u.e.tlbcfg[1];
    }

    if (sregs.u.e.features & KVM_SREGS_EXP) {
        env->spr[SPR_BOOKE_EPR] = sregs.u.e.epr;
    }

    if (sregs.u.e.features & KVM_SREGS_E_PD) {
        env->spr[SPR_BOOKE_EPLC] = sregs.u.e.eplc;
        env->spr[SPR_BOOKE_EPSC] = sregs.u.e.epsc;
    }

    if (sregs.u.e.impl_id == KVM_SREGS_E_IMPL_FSL) {
        env->spr[SPR_E500_SVR] = sregs.u.e.impl.fsl.svr;
        env->spr[SPR_Exxx_MCAR] = sregs.u.e.impl.fsl.mcar;
        env->spr[SPR_HID0] = sregs.u.e.impl.fsl.hid0;

        if (sregs.u.e.impl.fsl.features & KVM_SREGS_E_FSL_PIDn) {
            env->spr[SPR_BOOKE_PID1] = sregs.u.e.impl.fsl.pid1;
            env->spr[SPR_BOOKE_PID2] = sregs.u.e.impl.fsl.pid2;
        }
    }

    return 0;
}
static int kvmppc_get_books_sregs(PowerPCCPU *cpu)
{
    CPUPPCState *env = &cpu->env;
    struct kvm_sregs sregs;
    int ret;
    int i;

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
    if (ret < 0) {
        return ret;
    }

    if (!cpu->vhyp) {
        ppc_store_sdr1(env, sregs.u.s.sdr1);
    }

    /* Sync SLB */
#ifdef TARGET_PPC64
    /*
     * The packed SLB array we get from KVM_GET_SREGS only contains
     * information about valid entries. So we flush our internal copy
     * to get rid of stale ones, then put all valid SLB entries back
     * in.
     */
    memset(env->slb, 0, sizeof(env->slb));
    for (i = 0; i < ARRAY_SIZE(env->slb); i++) {
        target_ulong rb = sregs.u.s.ppc64.slb[i].slbe;
        target_ulong rs = sregs.u.s.ppc64.slb[i].slbv;
        /*
         * Only restore valid entries
         */
        if (rb & SLB_ESID_V) {
            ppc_store_slb(cpu, rb & 0xfff, rb & ~0xfffULL, rs);
        }
    }
#endif

    /* Sync SRs */
    for (i = 0; i < 16; i++) {
        env->sr[i] = sregs.u.s.ppc32.sr[i];
    }

    /* Sync BATs */
    for (i = 0; i < 8; i++) {
        env->DBAT[0][i] = sregs.u.s.ppc32.dbat[i] & 0xffffffff;
        env->DBAT[1][i] = sregs.u.s.ppc32.dbat[i] >> 32;
        env->IBAT[0][i] = sregs.u.s.ppc32.ibat[i] & 0xffffffff;
        env->IBAT[1][i] = sregs.u.s.ppc32.ibat[i] >> 32;
    }

    return 0;
}
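
/*
 * Note the (rb & 0xfff) above: on the KVM_SET_SREGS side
 * (kvmppc_put_books_sregs) the slot number was OR-ed into the low bits of
 * the esid for valid entries, so the low bits of slbe recover the SLB
 * index to store into.
 */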
int kvm_arch_get_registers(CPUState *cs)
{
    PowerPCCPU *cpu = POWERPC_CPU(cs);
    CPUPPCState *env = &cpu->env;
    struct kvm_regs regs;
    uint32_t cr;
    int i, ret;

    ret = kvm_vcpu_ioctl(cs, KVM_GET_REGS, &regs);
    if (ret < 0) {
        return ret;
    }

    cr = regs.cr;
    for (i = 7; i >= 0; i--) {
        env->crf[i] = cr & 15;
        cr >>= 4;
    }

    env->ctr = regs.ctr;
    env->lr = regs.lr;
    cpu_write_xer(env, regs.xer);
    env->msr = regs.msr;
    env->nip = regs.pc;

    env->spr[SPR_SRR0] = regs.srr0;
    env->spr[SPR_SRR1] = regs.srr1;

    env->spr[SPR_SPRG0] = regs.sprg0;
    env->spr[SPR_SPRG1] = regs.sprg1;
    env->spr[SPR_SPRG2] = regs.sprg2;
    env->spr[SPR_SPRG3] = regs.sprg3;
    env->spr[SPR_SPRG4] = regs.sprg4;
    env->spr[SPR_SPRG5] = regs.sprg5;
    env->spr[SPR_SPRG6] = regs.sprg6;
    env->spr[SPR_SPRG7] = regs.sprg7;

    env->spr[SPR_BOOKE_PID] = regs.pid;

    for (i = 0; i < 32; i++) {
        env->gpr[i] = regs.gpr[i];
    }

    kvm_get_fp(cs);

    if (cap_booke_sregs) {
        ret = kvmppc_get_booke_sregs(cpu);
        if (ret < 0) {
            return ret;
        }
    }

    if (cap_segstate) {
        ret = kvmppc_get_books_sregs(cpu);
        if (ret < 0) {
            return ret;
        }
    }

    if (cap_hior) {
        kvm_get_one_spr(cs, KVM_REG_PPC_HIOR, SPR_HIOR);
    }

    if (cap_one_reg) {
        int i;

        /* We deliberately ignore errors here, for kernels which have
         * the ONE_REG calls, but don't support the specific
         * registers, there's a reasonable chance things will still
         * work, at least until we try to migrate. */
        for (i = 0; i < 1024; i++) {
            uint64_t id = env->spr_cb[i].one_reg_id;

            if (id != 0) {
                kvm_get_one_spr(cs, id, i);
            }
        }

#ifdef TARGET_PPC64
        if (msr_ts) {
            for (i = 0; i < ARRAY_SIZE(env->tm_gpr); i++) {
                kvm_get_one_reg(cs, KVM_REG_PPC_TM_GPR(i), &env->tm_gpr[i]);
            }
            for (i = 0; i < ARRAY_SIZE(env->tm_vsr); i++) {
                kvm_get_one_reg(cs, KVM_REG_PPC_TM_VSR(i), &env->tm_vsr[i]);
            }
            kvm_get_one_reg(cs, KVM_REG_PPC_TM_CR, &env->tm_cr);
            kvm_get_one_reg(cs, KVM_REG_PPC_TM_LR, &env->tm_lr);
            kvm_get_one_reg(cs, KVM_REG_PPC_TM_CTR, &env->tm_ctr);
            kvm_get_one_reg(cs, KVM_REG_PPC_TM_FPSCR, &env->tm_fpscr);
            kvm_get_one_reg(cs, KVM_REG_PPC_TM_AMR, &env->tm_amr);
            kvm_get_one_reg(cs, KVM_REG_PPC_TM_PPR, &env->tm_ppr);
            kvm_get_one_reg(cs, KVM_REG_PPC_TM_VRSAVE, &env->tm_vrsave);
            kvm_get_one_reg(cs, KVM_REG_PPC_TM_VSCR, &env->tm_vscr);
            kvm_get_one_reg(cs, KVM_REG_PPC_TM_DSCR, &env->tm_dscr);
            kvm_get_one_reg(cs, KVM_REG_PPC_TM_TAR, &env->tm_tar);
        }

        if (cap_papr) {
            if (kvm_get_vpa(cs) < 0) {
                DPRINTF("Warning: Unable to get VPA information from KVM\n");
            }
        }

        kvm_get_one_reg(cs, KVM_REG_PPC_TB_OFFSET, &env->tb_env->tb_offset);
#endif
    }

    return 0;
}
int kvmppc_set_interrupt(PowerPCCPU *cpu, int irq, int level)
{
    unsigned virq = level ? KVM_INTERRUPT_SET_LEVEL : KVM_INTERRUPT_UNSET;

    if (irq != PPC_INTERRUPT_EXT) {
        return 0;
    }

    if (!kvm_enabled() || !cap_interrupt_unset || !cap_interrupt_level) {
        return 0;
    }

    kvm_vcpu_ioctl(CPU(cpu), KVM_INTERRUPT, &virq);

    return 0;
}

#if defined(TARGET_PPCEMB)
#define PPC_INPUT_INT PPC40x_INPUT_INT
#elif defined(TARGET_PPC64)
#define PPC_INPUT_INT PPC970_INPUT_INT
#else
#define PPC_INPUT_INT PPC6xx_INPUT_INT
#endif

void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run)
{
    PowerPCCPU *cpu = POWERPC_CPU(cs);
    CPUPPCState *env = &cpu->env;
    int r;
    unsigned irq;

    qemu_mutex_lock_iothread();

    /* PowerPC QEMU tracks the various core input pins (interrupt, critical
     * interrupt, reset, etc) in PPC-specific env->irq_input_state. */
    if (!cap_interrupt_level &&
        run->ready_for_interrupt_injection &&
        (cs->interrupt_request & CPU_INTERRUPT_HARD) &&
        (env->irq_input_state & (1 << PPC_INPUT_INT)))
    {
        /* For now KVM disregards the 'irq' argument. However, in the
         * future KVM could cache it in-kernel to avoid a heavyweight exit
         * when reading the UIC.
         */
        irq = KVM_INTERRUPT_SET;

        DPRINTF("injected interrupt %d\n", irq);
        r = kvm_vcpu_ioctl(cs, KVM_INTERRUPT, &irq);
        if (r < 0) {
            printf("cpu %d fail inject %x\n", cs->cpu_index, irq);
        }

        /* Always wake up soon in case the interrupt was level based */
        timer_mod(idle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
                  (NANOSECONDS_PER_SECOND / 50));
    }

    /* We don't know if there are more interrupts pending after this. However,
     * the guest will return to userspace in the course of handling this one
     * anyways, so we will get a chance to deliver the rest. */

    qemu_mutex_unlock_iothread();
}

MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run)
{
    return MEMTXATTRS_UNSPECIFIED;
}
int kvm_arch_process_async_events(CPUState *cs)
{
    return cs->halted;
}

static int kvmppc_handle_halt(PowerPCCPU *cpu)
{
    CPUState *cs = CPU(cpu);
    CPUPPCState *env = &cpu->env;

    if (!(cs->interrupt_request & CPU_INTERRUPT_HARD) && (msr_ee)) {
        cs->halted = 1;
        cs->exception_index = EXCP_HLT;
    }

    return 0;
}

/* map dcr access to existing qemu dcr emulation */
static int kvmppc_handle_dcr_read(CPUPPCState *env, uint32_t dcrn, uint32_t *data)
{
    if (ppc_dcr_read(env->dcr_env, dcrn, data) < 0) {
        fprintf(stderr, "Read to unhandled DCR (0x%x)\n", dcrn);
    }

    return 0;
}

static int kvmppc_handle_dcr_write(CPUPPCState *env, uint32_t dcrn, uint32_t data)
{
    if (ppc_dcr_write(env->dcr_env, dcrn, data) < 0) {
        fprintf(stderr, "Write to unhandled DCR (0x%x)\n", dcrn);
    }

    return 0;
}
int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
{
    /* Mixed endian case is not handled */
    uint32_t sc = debug_inst_opcode;

    if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn,
                            sizeof(sc), 0) ||
        cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&sc, sizeof(sc), 1)) {
        return -EINVAL;
    }

    return 0;
}

int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
{
    uint32_t sc;

    if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&sc, sizeof(sc), 0) ||
        sc != debug_inst_opcode ||
        cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn,
                            sizeof(sc), 1)) {
        return -EINVAL;
    }

    return 0;
}
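
/*
 * Both helpers above rely on debug_inst_opcode, the trap instruction that
 * the kernel reported through KVM_REG_PPC_DEBUG_INST in
 * kvm_arch_init_vcpu(): insertion saves the original instruction in
 * bp->saved_insn and patches the trap in; removal verifies the trap is
 * still in place before restoring the original bytes.
 */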
static int find_hw_breakpoint(target_ulong addr, int type)
{
    int n;

    assert((nb_hw_breakpoint + nb_hw_watchpoint)
           <= ARRAY_SIZE(hw_debug_points));

    for (n = 0; n < nb_hw_breakpoint + nb_hw_watchpoint; n++) {
        if (hw_debug_points[n].addr == addr &&
             hw_debug_points[n].type == type) {
            return n;
        }
    }

    return -1;
}

static int find_hw_watchpoint(target_ulong addr, int *flag)
{
    int n;

    n = find_hw_breakpoint(addr, GDB_WATCHPOINT_ACCESS);
    if (n >= 0) {
        *flag = BP_MEM_ACCESS;
        return n;
    }

    n = find_hw_breakpoint(addr, GDB_WATCHPOINT_WRITE);
    if (n >= 0) {
        *flag = BP_MEM_WRITE;
        return n;
    }

    n = find_hw_breakpoint(addr, GDB_WATCHPOINT_READ);
    if (n >= 0) {
        *flag = BP_MEM_READ;
        return n;
    }

    return -1;
}

int kvm_arch_insert_hw_breakpoint(target_ulong addr,
                                  target_ulong len, int type)
{
    if ((nb_hw_breakpoint + nb_hw_watchpoint) >= ARRAY_SIZE(hw_debug_points)) {
        return -ENOBUFS;
    }

    hw_debug_points[nb_hw_breakpoint + nb_hw_watchpoint].addr = addr;
    hw_debug_points[nb_hw_breakpoint + nb_hw_watchpoint].type = type;

    switch (type) {
    case GDB_BREAKPOINT_HW:
        if (nb_hw_breakpoint >= max_hw_breakpoint) {
            return -ENOBUFS;
        }

        if (find_hw_breakpoint(addr, type) >= 0) {
            return -EEXIST;
        }

        nb_hw_breakpoint++;
        break;

    case GDB_WATCHPOINT_WRITE:
    case GDB_WATCHPOINT_READ:
    case GDB_WATCHPOINT_ACCESS:
        if (nb_hw_watchpoint >= max_hw_watchpoint) {
            return -ENOBUFS;
        }

        if (find_hw_breakpoint(addr, type) >= 0) {
            return -EEXIST;
        }

        nb_hw_watchpoint++;
        break;

    default:
        return -ENOSYS;
    }

    return 0;
}

int kvm_arch_remove_hw_breakpoint(target_ulong addr,
                                  target_ulong len, int type)
{
    int n;

    n = find_hw_breakpoint(addr, type);
    if (n < 0) {
        return -ENOENT;
    }

    switch (type) {
    case GDB_BREAKPOINT_HW:
        nb_hw_breakpoint--;
        break;

    case GDB_WATCHPOINT_WRITE:
    case GDB_WATCHPOINT_READ:
    case GDB_WATCHPOINT_ACCESS:
        nb_hw_watchpoint--;
        break;

    default:
        return -ENOSYS;
    }
    hw_debug_points[n] = hw_debug_points[nb_hw_breakpoint + nb_hw_watchpoint];

    return 0;
}
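
/*
 * Removal above compacts the array by moving the last valid entry into the
 * freed slot, which is why hw_debug_points[] needs no ordering and lookups
 * always scan the first nb_hw_breakpoint + nb_hw_watchpoint entries.
 */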
void kvm_arch_remove_all_hw_breakpoints(void)
{
    nb_hw_breakpoint = nb_hw_watchpoint = 0;
}

void kvm_arch_update_guest_debug(CPUState *cs, struct kvm_guest_debug *dbg)
{
    int n;

    /* Software Breakpoint updates */
    if (kvm_sw_breakpoints_active(cs)) {
        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
    }

    assert((nb_hw_breakpoint + nb_hw_watchpoint)
           <= ARRAY_SIZE(hw_debug_points));
    assert((nb_hw_breakpoint + nb_hw_watchpoint) <= ARRAY_SIZE(dbg->arch.bp));

    if (nb_hw_breakpoint + nb_hw_watchpoint > 0) {
        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
        memset(dbg->arch.bp, 0, sizeof(dbg->arch.bp));
        for (n = 0; n < nb_hw_breakpoint + nb_hw_watchpoint; n++) {
            switch (hw_debug_points[n].type) {
            case GDB_BREAKPOINT_HW:
                dbg->arch.bp[n].type = KVMPPC_DEBUG_BREAKPOINT;
                break;
            case GDB_WATCHPOINT_WRITE:
                dbg->arch.bp[n].type = KVMPPC_DEBUG_WATCH_WRITE;
                break;
            case GDB_WATCHPOINT_READ:
                dbg->arch.bp[n].type = KVMPPC_DEBUG_WATCH_READ;
                break;
            case GDB_WATCHPOINT_ACCESS:
                dbg->arch.bp[n].type = KVMPPC_DEBUG_WATCH_WRITE |
                                       KVMPPC_DEBUG_WATCH_READ;
                break;
            default:
                cpu_abort(cs, "Unsupported breakpoint type\n");
            }
            dbg->arch.bp[n].addr = hw_debug_points[n].addr;
        }
    }
}
static int kvm_handle_debug(PowerPCCPU *cpu, struct kvm_run *run)
{
    CPUState *cs = CPU(cpu);
    CPUPPCState *env = &cpu->env;
    struct kvm_debug_exit_arch *arch_info = &run->debug.arch;
    int handle = 0;
    int n;
    int flag = 0;

    if (cs->singlestep_enabled) {
        handle = 1;
    } else if (arch_info->status) {
        if (nb_hw_breakpoint + nb_hw_watchpoint > 0) {
            if (arch_info->status & KVMPPC_DEBUG_BREAKPOINT) {
                n = find_hw_breakpoint(arch_info->address, GDB_BREAKPOINT_HW);
                if (n >= 0) {
                    handle = 1;
                }
            } else if (arch_info->status & (KVMPPC_DEBUG_WATCH_READ |
                                            KVMPPC_DEBUG_WATCH_WRITE)) {
                n = find_hw_watchpoint(arch_info->address, &flag);
                if (n >= 0) {
                    handle = 1;
                    cs->watchpoint_hit = &hw_watchpoint;
                    hw_watchpoint.vaddr = hw_debug_points[n].addr;
                    hw_watchpoint.flags = flag;
                }
            }
        }
    } else if (kvm_find_sw_breakpoint(cs, arch_info->address)) {
        handle = 1;
    } else {
        /* QEMU is not able to handle this debug exception, so inject a
         * program exception into the guest;
         * yes, a program exception, NOT a debug exception !!
         * When QEMU uses the debug resources, the debug exception must
         * always be set. To achieve this we set MSR_DE and also set
         * MSRP_DEP so the guest cannot change MSR_DE.
         * When emulating debug resources for the guest, we want the guest
         * to control MSR_DE (enable/disable the debug interrupt on demand).
         * Supporting both configurations at once is NOT possible, so we
         * cannot share debug resources between QEMU and the guest on BookE
         * architecture.
         * In the current design QEMU gets priority over the guest: if QEMU
         * is using the debug resources, the guest cannot use them.
         * For software breakpoints QEMU uses a privileged instruction, so
         * there is no way we can be here because of a guest-set debug
         * exception; the only possibility is that the guest executed a
         * privileged / illegal instruction and that's why we are injecting
         * a program interrupt.
         */
        cpu_synchronize_state(cs);
        /* env->nip is PC, so increment this by 4 to use
         * ppc_cpu_do_interrupt(), which sets srr0 = env->nip - 4.
         */
        env->nip += 4;
        cs->exception_index = POWERPC_EXCP_PROGRAM;
        env->error_code = POWERPC_EXCP_INVAL;
        ppc_cpu_do_interrupt(cs);
    }

    return handle;
}
int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
{
    PowerPCCPU *cpu = POWERPC_CPU(cs);
    CPUPPCState *env = &cpu->env;
    int ret;

    qemu_mutex_lock_iothread();

    switch (run->exit_reason) {
    case KVM_EXIT_DCR:
        if (run->dcr.is_write) {
            DPRINTF("handle dcr write\n");
            ret = kvmppc_handle_dcr_write(env, run->dcr.dcrn, run->dcr.data);
        } else {
            DPRINTF("handle dcr read\n");
            ret = kvmppc_handle_dcr_read(env, run->dcr.dcrn, &run->dcr.data);
        }
        break;
    case KVM_EXIT_HLT:
        DPRINTF("handle halt\n");
        ret = kvmppc_handle_halt(cpu);
        break;
#if defined(TARGET_PPC64)
    case KVM_EXIT_PAPR_HCALL:
        DPRINTF("handle PAPR hypercall\n");
        run->papr_hcall.ret = spapr_hypercall(cpu,
                                              run->papr_hcall.nr,
                                              run->papr_hcall.args);
        ret = 0;
        break;
#endif
    case KVM_EXIT_EPR:
        DPRINTF("handle epr\n");
        run->epr.epr = ldl_phys(cs->as, env->mpic_iack);
        ret = 0;
        break;
    case KVM_EXIT_WATCHDOG:
        DPRINTF("handle watchdog expiry\n");
        watchdog_perform_action();
        ret = 0;
        break;

    case KVM_EXIT_DEBUG:
        DPRINTF("handle debug exception\n");
        if (kvm_handle_debug(cpu, run)) {
            ret = EXCP_DEBUG;
            break;
        }
        /* re-enter, this exception was guest-internal */
        ret = 0;
        break;

    default:
        fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
        ret = -1;
        break;
    }

    qemu_mutex_unlock_iothread();
    return ret;
}
int kvmppc_or_tsr_bits(PowerPCCPU *cpu, uint32_t tsr_bits)
{
    CPUState *cs = CPU(cpu);
    uint32_t bits = tsr_bits;
    struct kvm_one_reg reg = {
        .id = KVM_REG_PPC_OR_TSR,
        .addr = (uintptr_t) &bits,
    };

    return kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
}

int kvmppc_clear_tsr_bits(PowerPCCPU *cpu, uint32_t tsr_bits)
{
    CPUState *cs = CPU(cpu);
    uint32_t bits = tsr_bits;
    struct kvm_one_reg reg = {
        .id = KVM_REG_PPC_CLEAR_TSR,
        .addr = (uintptr_t) &bits,
    };

    return kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
}

int kvmppc_set_tcr(PowerPCCPU *cpu)
{
    CPUState *cs = CPU(cpu);
    CPUPPCState *env = &cpu->env;
    uint32_t tcr = env->spr[SPR_BOOKE_TCR];

    struct kvm_one_reg reg = {
        .id = KVM_REG_PPC_TCR,
        .addr = (uintptr_t) &tcr,
    };

    return kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
}
int kvmppc_booke_watchdog_enable(PowerPCCPU *cpu)
{
    CPUState *cs = CPU(cpu);
    int ret;

    if (!kvm_enabled()) {
        return -1;
    }

    if (!cap_ppc_watchdog) {
        printf("warning: KVM does not support watchdog\n");
        return -1;
    }

    ret = kvm_vcpu_enable_cap(cs, KVM_CAP_PPC_BOOKE_WATCHDOG, 0);
    if (ret < 0) {
        fprintf(stderr, "%s: couldn't enable KVM_CAP_PPC_BOOKE_WATCHDOG: %s\n",
                __func__, strerror(-ret));
        return ret;
    }

    return ret;
}
static int read_cpuinfo(const char *field, char *value, int len)
{
    FILE *f;
    int ret = -1;
    int field_len = strlen(field);
    char line[512];

    f = fopen("/proc/cpuinfo", "r");
    if (!f) {
        return -1;
    }

    do {
        if (!fgets(line, sizeof(line), f)) {
            break;
        }
        if (!strncmp(line, field, field_len)) {
            pstrcpy(value, len, line);
            ret = 0;
            break;
        }
    } while (*line);

    fclose(f);

    return ret;
}
uint32_t kvmppc_get_tbfreq(void)
{
    char line[512];
    char *ns;
    uint32_t retval = NANOSECONDS_PER_SECOND;

    if (read_cpuinfo("timebase", line, sizeof(line))) {
        return retval;
    }

    if (!(ns = strchr(line, ':'))) {
        return retval;
    }

    ns++;

    return atoi(ns);
}
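
/*
 * This parses a /proc/cpuinfo line of the usual PPC form, e.g.
 * "timebase        : 512000000", by skipping past the colon and letting
 * atoi() consume the leading whitespace and digits.
 */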
bool kvmppc_get_host_serial(char **value)
{
    return g_file_get_contents("/proc/device-tree/system-id", value, NULL,
                               NULL);
}

bool kvmppc_get_host_model(char **value)
{
    return g_file_get_contents("/proc/device-tree/model", value, NULL, NULL);
}

/* Try to find a device tree node for a CPU with clock-frequency property */
static int kvmppc_find_cpu_dt(char *buf, int buf_len)
{
    struct dirent *dirp;
    DIR *dp;

    if ((dp = opendir(PROC_DEVTREE_CPU)) == NULL) {
        printf("Can't open directory " PROC_DEVTREE_CPU "\n");
        return -1;
    }

    buf[0] = '\0';
    while ((dirp = readdir(dp)) != NULL) {
        FILE *f;
        snprintf(buf, buf_len, "%s%s/clock-frequency", PROC_DEVTREE_CPU,
                 dirp->d_name);
        f = fopen(buf, "r");
        if (f) {
            snprintf(buf, buf_len, "%s%s", PROC_DEVTREE_CPU, dirp->d_name);
            fclose(f);
            break;
        }
        buf[0] = '\0';
    }
    closedir(dp);
    if (buf[0] == '\0') {
        printf("Unknown host!\n");
        return -1;
    }

    return 0;
}
static uint64_t kvmppc_read_int_dt(const char *filename)
{
    union {
        uint32_t v32;
        uint64_t v64;
    } u;
    FILE *f;
    int len;

    f = fopen(filename, "rb");
    if (!f) {
        return -1;
    }

    len = fread(&u, 1, sizeof(u), f);
    fclose(f);
    switch (len) {
    case 4:
        /* property is a 32-bit quantity */
        return be32_to_cpu(u.v32);
    case 8:
        return be64_to_cpu(u.v64);
    }

    return 0;
}
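
/*
 * Device tree properties are stored big-endian, hence the be32/be64
 * conversions above; a 4-byte read is a single cell, an 8-byte read a
 * two-cell (64-bit) value, and any other length is treated as 0.
 */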
/* Read a CPU node property from the host device tree that's a single
 * integer (32-bit or 64-bit). Returns 0 if anything goes wrong
 * (can't find or open the property, or doesn't understand the
 * format) */
static uint64_t kvmppc_read_int_cpu_dt(const char *propname)
{
    char buf[PATH_MAX], *tmp;
    uint64_t val;

    if (kvmppc_find_cpu_dt(buf, sizeof(buf))) {
        return -1;
    }

    tmp = g_strdup_printf("%s/%s", buf, propname);
    val = kvmppc_read_int_dt(tmp);
    g_free(tmp);

    return val;
}

uint64_t kvmppc_get_clockfreq(void)
{
    return kvmppc_read_int_cpu_dt("clock-frequency");
}

static int kvmppc_get_pvinfo(CPUPPCState *env, struct kvm_ppc_pvinfo *pvinfo)
{
    PowerPCCPU *cpu = ppc_env_get_cpu(env);
    CPUState *cs = CPU(cpu);

    if (kvm_vm_check_extension(cs->kvm_state, KVM_CAP_PPC_GET_PVINFO) &&
        !kvm_vm_ioctl(cs->kvm_state, KVM_PPC_GET_PVINFO, pvinfo)) {
        return 0;
    }

    return 1;
}

int kvmppc_get_hasidle(CPUPPCState *env)
{
    struct kvm_ppc_pvinfo pvinfo;

    if (!kvmppc_get_pvinfo(env, &pvinfo) &&
        (pvinfo.flags & KVM_PPC_PVINFO_FLAGS_EV_IDLE)) {
        return 1;
    }

    return 0;
}
int kvmppc_get_hypercall(CPUPPCState *env, uint8_t *buf, int buf_len)
{
    uint32_t *hc = (uint32_t *)buf;
    struct kvm_ppc_pvinfo pvinfo;

    if (!kvmppc_get_pvinfo(env, &pvinfo)) {
        memcpy(buf, pvinfo.hcall, buf_len);
        return 0;
    }

    /*
     * Fallback to always fail hypercalls regardless of endianness:
     *
     *     tdi 0,r0,72 (becomes b .+8 in wrong endian, nop in good endian)
     *     li r3, -1
     *     b .+8       (becomes nop in wrong endian)
     *     bswap32(li r3, -1)
     */

    hc[0] = cpu_to_be32(0x08000048);
    hc[1] = cpu_to_be32(0x3860ffff);
    hc[2] = cpu_to_be32(0x48000008);
    hc[3] = cpu_to_be32(bswap32(0x3860ffff));

    return 1;
}
static inline int kvmppc_enable_hcall(KVMState *s, target_ulong hcall)
{
    return kvm_vm_enable_cap(s, KVM_CAP_PPC_ENABLE_HCALL, 0, hcall, 1);
}

void kvmppc_enable_logical_ci_hcalls(void)
{
    /*
     * FIXME: it would be nice if we could detect the cases where
     * we're using a device which requires the in kernel
     * implementation of these hcalls, but the kernel lacks them and
     * produce a warning.
     */
    kvmppc_enable_hcall(kvm_state, H_LOGICAL_CI_LOAD);
    kvmppc_enable_hcall(kvm_state, H_LOGICAL_CI_STORE);
}

void kvmppc_enable_set_mode_hcall(void)
{
    kvmppc_enable_hcall(kvm_state, H_SET_MODE);
}

void kvmppc_enable_clear_ref_mod_hcalls(void)
{
    kvmppc_enable_hcall(kvm_state, H_CLEAR_REF);
    kvmppc_enable_hcall(kvm_state, H_CLEAR_MOD);
}

void kvmppc_set_papr(PowerPCCPU *cpu)
{
    CPUState *cs = CPU(cpu);
    int ret;

    ret = kvm_vcpu_enable_cap(cs, KVM_CAP_PPC_PAPR, 0);
    if (ret) {
        error_report("This vCPU type or KVM version does not support PAPR");
        exit(1);
    }

    /* Update the capability flag so we sync the right information
     * with kvm */
    cap_papr = 1;
}
2115 int kvmppc_set_compat(PowerPCCPU *cpu, uint32_t compat_pvr)
2117 return kvm_set_one_reg(CPU(cpu), KVM_REG_PPC_ARCH_COMPAT, &compat_pvr);
2120 void kvmppc_set_mpic_proxy(PowerPCCPU *cpu, int mpic_proxy)
2122 CPUState *cs = CPU(cpu);
2123 int ret;
2125 ret = kvm_vcpu_enable_cap(cs, KVM_CAP_PPC_EPR, 0, mpic_proxy);
2126 if (ret && mpic_proxy) {
2127 error_report("This KVM version does not support EPR");
2128 exit(1);
int kvmppc_smt_threads(void)
{
    return cap_ppc_smt ? cap_ppc_smt : 1;
}

int kvmppc_set_smt_threads(int smt)
{
    int ret;

    ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_PPC_SMT, 0, smt, 0);
    if (!ret) {
        cap_ppc_smt = smt;
    }
    return ret;
}

void kvmppc_hint_smt_possible(Error **errp)
{
    int i;
    GString *g;
    char *s;

    assert(kvm_enabled());
    if (cap_ppc_smt_possible) {
        g = g_string_new("Available VSMT modes:");
        for (i = 63; i >= 0; i--) {
            if ((1UL << i) & cap_ppc_smt_possible) {
                g_string_append_printf(g, " %lu", (1UL << i));
            }
        }
        s = g_string_free(g, false);
        error_append_hint(errp, "%s.\n", s);
        g_free(s);
    } else {
        error_append_hint(errp,
                          "This KVM seems to be too old to support VSMT.\n");
    }
}

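/*
 * Editorial example: with cap_ppc_smt_possible == 0x0f the hint above
 * reads "Available VSMT modes: 8 4 2 1.", listing modes from the
 * highest set bit downwards.
 */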
#ifdef TARGET_PPC64
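/*
 * Editorial note on kvmppc_alloc_rma()'s return convention below:
 * 0 means no contiguous RMA needs to be (or can be) allocated here,
 * -1 means the allocation or mapping failed, and a positive value is
 * the size of the region now mapped at *rma.
 */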
off_t kvmppc_alloc_rma(void **rma)
{
    off_t size;
    int fd;
    struct kvm_allocate_rma ret;

    /* If cap_ppc_rma == 0, contiguous RMA allocation is not supported
     * if cap_ppc_rma == 1, contiguous RMA allocation is supported, but
     * not necessary on this hardware
     * if cap_ppc_rma == 2, contiguous RMA allocation is needed on this hardware
     *
     * FIXME: We should allow the user to force contiguous RMA
     * allocation in the cap_ppc_rma == 1 case.
     */
    if (cap_ppc_rma < 2) {
        return 0;
    }

    fd = kvm_vm_ioctl(kvm_state, KVM_ALLOCATE_RMA, &ret);
    if (fd < 0) {
        fprintf(stderr, "KVM: Error on KVM_ALLOCATE_RMA: %s\n",
                strerror(errno));
        return -1;
    }

    size = MIN(ret.rma_size, 256ul << 20);

    *rma = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
    if (*rma == MAP_FAILED) {
        fprintf(stderr, "KVM: Error mapping RMA: %s\n", strerror(errno));
        return -1;
    }

    return size;
}

uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift)
{
    struct kvm_ppc_smmu_info info;
    long rampagesize, best_page_shift;
    int i;

    if (cap_ppc_rma >= 2) {
        return current_size;
    }

    /* Find the largest hardware supported page size that's less than
     * or equal to the (logical) backing page size of guest RAM */
    kvm_get_smmu_info(POWERPC_CPU(first_cpu), &info);
    rampagesize = qemu_getrampagesize();
    best_page_shift = 0;

    for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
        struct kvm_ppc_one_seg_page_size *sps = &info.sps[i];

        if (!sps->page_shift) {
            continue;
        }

        if ((sps->page_shift > best_page_shift)
            && ((1UL << sps->page_shift) <= rampagesize)) {
            best_page_shift = sps->page_shift;
        }
    }
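    /*
     * Editorial note on the arithmetic below: a hash PTE group (PTEG)
     * is 2^7 = 128 bytes, so a hash table of 2^hash_shift bytes holds
     * 2^(hash_shift - 7) PTEGs. Budgeting on average one page of
     * 2^best_page_shift bytes per PTEG caps the RMA at
     * 2^(best_page_shift + hash_shift - 7) bytes.
     */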
    return MIN(current_size,
               1ULL << (best_page_shift + hash_shift - 7));
}
#endif

bool kvmppc_spapr_use_multitce(void)
{
    return cap_spapr_multitce;
}

int kvmppc_spapr_enable_inkernel_multitce(void)
{
    int ret;

    ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_PPC_ENABLE_HCALL, 0,
                            H_PUT_TCE_INDIRECT, 1);
    if (!ret) {
        ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_PPC_ENABLE_HCALL, 0,
                                H_STUFF_TCE, 1);
    }

    return ret;
}

void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t page_shift,
                              uint64_t bus_offset, uint32_t nb_table,
                              int *pfd, bool need_vfio)
{
    long len;
    int fd;
    void *table;

    /* Must set fd to -1 so we don't try to munmap when called for
     * destroying the table, which the upper layers -will- do
     */
    *pfd = -1;
    if (!cap_spapr_tce || (need_vfio && !cap_spapr_vfio)) {
        return NULL;
    }

    if (cap_spapr_tce_64) {
        struct kvm_create_spapr_tce_64 args = {
            .liobn = liobn,
            .page_shift = page_shift,
            .offset = bus_offset >> page_shift,
            .size = nb_table,
            .flags = 0
        };
        fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE_64, &args);
        if (fd < 0) {
            fprintf(stderr,
                    "KVM: Failed to create TCE64 table for liobn 0x%x\n",
                    liobn);
            return NULL;
        }
    } else if (cap_spapr_tce) {
        uint64_t window_size = (uint64_t) nb_table << page_shift;
        struct kvm_create_spapr_tce args = {
            .liobn = liobn,
            .window_size = window_size,
        };
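        /*
         * Editorial note: window_size in the old ioctl's struct is a
         * 32-bit field, so the assignment above silently truncates;
         * the comparison below catches that truncation, and the old
         * ioctl cannot express a non-zero bus offset at all.
         */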
        if ((window_size != args.window_size) || bus_offset) {
            return NULL;
        }
        fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE, &args);
        if (fd < 0) {
            fprintf(stderr, "KVM: Failed to create TCE table for liobn 0x%x\n",
                    liobn);
            return NULL;
        }
    } else {
        return NULL;
    }

    len = nb_table * sizeof(uint64_t);
    /* FIXME: round this up to page size */

    table = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
    if (table == MAP_FAILED) {
        fprintf(stderr, "KVM: Failed to map TCE table for liobn 0x%x\n",
                liobn);
        close(fd);
        return NULL;
    }

    *pfd = fd;
    return table;
}

int kvmppc_remove_spapr_tce(void *table, int fd, uint32_t nb_table)
{
    long len;

    if (fd < 0) {
        return -1;
    }

    len = nb_table * sizeof(uint64_t);
    if ((munmap(table, len) < 0) ||
        (close(fd) < 0)) {
        fprintf(stderr, "KVM: Unexpected error removing TCE table: %s",
                strerror(errno));
        /* Leak the table */
    }

    return 0;
}

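/*
 * Editorial note on kvmppc_reset_htab()'s return convention below:
 * 0 tells the caller to allocate and manage the hash table itself,
 * a positive value is the shift (log2 size) of a kernel-managed HTAB,
 * and a negative value is an error from the allocation ioctl.
 */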
int kvmppc_reset_htab(int shift_hint)
{
    uint32_t shift = shift_hint;

    if (!kvm_enabled()) {
        /* Full emulation, tell caller to allocate htab itself */
        return 0;
    }
    if (kvm_vm_check_extension(kvm_state, KVM_CAP_PPC_ALLOC_HTAB)) {
        int ret;
        ret = kvm_vm_ioctl(kvm_state, KVM_PPC_ALLOCATE_HTAB, &shift);
        if (ret == -ENOTTY) {
            /* At least some versions of PR KVM advertise the
             * capability, but don't implement the ioctl(). Oops.
             * Return 0 so that we allocate the htab in qemu, as is
             * correct for PR. */
            return 0;
        } else if (ret < 0) {
            return ret;
        }
        return shift;
    }

    /* We have a kernel that predates the htab reset calls. For PR
     * KVM, we need to allocate the htab ourselves; an HV KVM of
     * this era has already allocated a 16MB fixed-size hash table. */
    if (kvmppc_is_pr(kvm_state)) {
        /* PR - tell caller to allocate htab */
        return 0;
    } else {
        /* HV - assume 16MB kernel allocated htab */
        return 24;
    }
}

static inline uint32_t mfpvr(void)
{
    uint32_t pvr;

    asm ("mfpvr %0"
         : "=r"(pvr));
    return pvr;
}

static void alter_insns(uint64_t *word, uint64_t flags, bool on)
{
    if (on) {
        *word |= flags;
    } else {
        *word &= ~flags;
    }
}

static void kvmppc_host_cpu_class_init(ObjectClass *oc, void *data)
{
    PowerPCCPUClass *pcc = POWERPC_CPU_CLASS(oc);
    uint32_t dcache_size = kvmppc_read_int_cpu_dt("d-cache-size");
    uint32_t icache_size = kvmppc_read_int_cpu_dt("i-cache-size");

    /* Now fix up the class with information we can query from the host */
    pcc->pvr = mfpvr();

    alter_insns(&pcc->insns_flags, PPC_ALTIVEC,
                qemu_getauxval(AT_HWCAP) & PPC_FEATURE_HAS_ALTIVEC);
    alter_insns(&pcc->insns_flags2, PPC2_VSX,
                qemu_getauxval(AT_HWCAP) & PPC_FEATURE_HAS_VSX);
    alter_insns(&pcc->insns_flags2, PPC2_DFP,
                qemu_getauxval(AT_HWCAP) & PPC_FEATURE_HAS_DFP);

    if (dcache_size != -1) {
        pcc->l1_dcache_size = dcache_size;
    }

    if (icache_size != -1) {
        pcc->l1_icache_size = icache_size;
    }

#if defined(TARGET_PPC64)
    pcc->radix_page_info = kvm_get_radix_page_info();

    if ((pcc->pvr & 0xffffff00) == CPU_POWERPC_POWER9_DD1) {
        /*
         * POWER9 DD1 has some bugs which make it not really ISA 3.00
         * compliant. More importantly, advertising ISA 3.00
         * architected mode may prevent guests from activating
         * necessary DD1 workarounds.
         */
        pcc->pcr_supported &= ~(PCR_COMPAT_3_00 | PCR_COMPAT_2_07
                                | PCR_COMPAT_2_06 | PCR_COMPAT_2_05);
    }
#endif /* defined(TARGET_PPC64) */
}

bool kvmppc_has_cap_epr(void)
{
    return cap_epr;
}

bool kvmppc_has_cap_fixup_hcalls(void)
{
    return cap_fixup_hcalls;
}

bool kvmppc_has_cap_htm(void)
{
    return cap_htm;
}

bool kvmppc_has_cap_mmu_radix(void)
{
    return cap_mmu_radix;
}

bool kvmppc_has_cap_mmu_hash_v3(void)
{
    return cap_mmu_hash_v3;
}

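/*
 * Editorial note: the cap_ppc_safe_* values set below follow a rough
 * tri-state convention -- 0 means vulnerable or unknown, 1 means a
 * software workaround is available, and 2 means the CPU or hypervisor
 * behaviour makes the issue not applicable.
 */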
static void kvmppc_get_cpu_characteristics(KVMState *s)
{
    struct kvm_ppc_cpu_char c;
    int ret;

    /* Assume broken */
    cap_ppc_safe_cache = 0;
    cap_ppc_safe_bounds_check = 0;
    cap_ppc_safe_indirect_branch = 0;

    ret = kvm_vm_check_extension(s, KVM_CAP_PPC_GET_CPU_CHAR);
    if (!ret) {
        return;
    }
    ret = kvm_vm_ioctl(s, KVM_PPC_GET_CPU_CHAR, &c);
    if (ret < 0) {
        return;
    }
    /* Parse and set cap_ppc_safe_cache */
    if (~c.behaviour & c.behaviour_mask & H_CPU_BEHAV_L1D_FLUSH_PR) {
        cap_ppc_safe_cache = 2;
    } else if ((c.character & c.character_mask & H_CPU_CHAR_L1D_THREAD_PRIV) &&
               (c.character & c.character_mask
                & (H_CPU_CHAR_L1D_FLUSH_ORI30 | H_CPU_CHAR_L1D_FLUSH_TRIG2))) {
        cap_ppc_safe_cache = 1;
    }
    /* Parse and set cap_ppc_safe_bounds_check */
    if (~c.behaviour & c.behaviour_mask & H_CPU_BEHAV_BNDS_CHK_SPEC_BAR) {
        cap_ppc_safe_bounds_check = 2;
    } else if (c.character & c.character_mask & H_CPU_CHAR_SPEC_BAR_ORI31) {
        cap_ppc_safe_bounds_check = 1;
    }
    /* Parse and set cap_ppc_safe_indirect_branch */
    if (c.character & H_CPU_CHAR_BCCTRL_SERIALISED) {
        cap_ppc_safe_indirect_branch = 2;
    }
}

int kvmppc_get_cap_safe_cache(void)
{
    return cap_ppc_safe_cache;
}

int kvmppc_get_cap_safe_bounds_check(void)
{
    return cap_ppc_safe_bounds_check;
}

int kvmppc_get_cap_safe_indirect_branch(void)
{
    return cap_ppc_safe_indirect_branch;
}

bool kvmppc_has_cap_spapr_vfio(void)
{
    return cap_spapr_vfio;
}

PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void)
{
    uint32_t host_pvr = mfpvr();
    PowerPCCPUClass *pvr_pcc;

    pvr_pcc = ppc_cpu_class_by_pvr(host_pvr);
    if (pvr_pcc == NULL) {
        pvr_pcc = ppc_cpu_class_by_pvr_mask(host_pvr);
    }

    return pvr_pcc;
}

static int kvm_ppc_register_host_cpu_type(MachineState *ms)
{
    TypeInfo type_info = {
        .name = TYPE_HOST_POWERPC_CPU,
        .class_init = kvmppc_host_cpu_class_init,
    };
    MachineClass *mc = MACHINE_GET_CLASS(ms);
    PowerPCCPUClass *pvr_pcc;
    ObjectClass *oc;
    DeviceClass *dc;
    int i;

    pvr_pcc = kvm_ppc_get_host_cpu_class();
    if (pvr_pcc == NULL) {
        return -1;
    }
    type_info.parent = object_class_get_name(OBJECT_CLASS(pvr_pcc));
    type_register(&type_info);
    if (object_dynamic_cast(OBJECT(ms), TYPE_SPAPR_MACHINE)) {
        /* override TCG default cpu type with 'host' cpu model */
        mc->default_cpu_type = TYPE_HOST_POWERPC_CPU;
    }

    oc = object_class_by_name(type_info.name);
    g_assert(oc);

    /*
     * Update generic CPU family class alias (e.g. on a POWER8NVL host,
     * we want "POWER8" to be a "family" alias that points to the current
     * host CPU type, too)
     */
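    /*
     * Editorial example: on a POWER8NVL host, dc->desc of the family
     * class is "POWER8", so the loop below repoints the "POWER8" alias
     * at the freshly registered host class; with the type suffix
     * stripped, the alias's model string ends up as plain "host".
     */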
    dc = DEVICE_CLASS(ppc_cpu_get_family_class(pvr_pcc));
    for (i = 0; ppc_cpu_aliases[i].alias != NULL; i++) {
        if (strcasecmp(ppc_cpu_aliases[i].alias, dc->desc) == 0) {
            char *suffix;

            ppc_cpu_aliases[i].model = g_strdup(object_class_get_name(oc));
            suffix = strstr(ppc_cpu_aliases[i].model, POWERPC_CPU_TYPE_SUFFIX);
            if (suffix) {
                *suffix = 0;
            }
            break;
        }
    }

    return 0;
}

int kvmppc_define_rtas_kernel_token(uint32_t token, const char *function)
{
    struct kvm_rtas_token_args args = {
        .token = token,
    };

    if (!kvm_check_extension(kvm_state, KVM_CAP_PPC_RTAS)) {
        return -ENOENT;
    }

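    /*
     * Editorial note: strncpy() will not NUL-terminate if the name
     * fills the whole buffer; this relies on RTAS function names
     * being shorter than sizeof(args.name).
     */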
    strncpy(args.name, function, sizeof(args.name));

    return kvm_vm_ioctl(kvm_state, KVM_PPC_RTAS_DEFINE_TOKEN, &args);
}

int kvmppc_get_htab_fd(bool write, uint64_t index, Error **errp)
{
    struct kvm_get_htab_fd s = {
        .flags = write ? KVM_GET_HTAB_WRITE : 0,
        .start_index = index,
    };
    int ret;

    if (!cap_htab_fd) {
        error_setg(errp, "KVM version doesn't support %s the HPT",
                   write ? "writing" : "reading");
        return -ENOTSUP;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_PPC_GET_HTAB_FD, &s);
    if (ret < 0) {
        error_setg(errp, "Unable to open fd for %s HPT %s KVM: %s",
                   write ? "writing" : "reading", write ? "to" : "from",
                   strerror(errno));
        return -errno;
    }

    return ret;
}

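/*
 * Editorial note on the fd's stream format, as consumed and produced
 * below: a sequence of struct kvm_get_htab_header records, each
 * followed by n_valid HPTEs of HASH_PTE_SIZE_64 bytes; the n_invalid
 * entries after those are to be treated as empty on restore.
 */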
int kvmppc_save_htab(QEMUFile *f, int fd, size_t bufsize, int64_t max_ns)
{
    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    uint8_t buf[bufsize];
    ssize_t rc;

    do {
        rc = read(fd, buf, bufsize);
        if (rc < 0) {
            fprintf(stderr, "Error reading data from KVM HTAB fd: %s\n",
                    strerror(errno));
            return rc;
        } else if (rc) {
            uint8_t *buffer = buf;
            ssize_t n = rc;
            while (n) {
                struct kvm_get_htab_header *head =
                    (struct kvm_get_htab_header *) buffer;
                size_t chunksize = sizeof(*head) +
                    HASH_PTE_SIZE_64 * head->n_valid;

                qemu_put_be32(f, head->index);
                qemu_put_be16(f, head->n_valid);
                qemu_put_be16(f, head->n_invalid);
                qemu_put_buffer(f, (void *)(head + 1),
                                HASH_PTE_SIZE_64 * head->n_valid);

                buffer += chunksize;
                n -= chunksize;
            }
        }
    } while ((rc != 0)
             && ((max_ns < 0)
                 || ((qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime)
                     < max_ns)));

    return (rc == 0) ? 1 : 0;
}

int kvmppc_load_htab_chunk(QEMUFile *f, int fd, uint32_t index,
                           uint16_t n_valid, uint16_t n_invalid)
{
    struct kvm_get_htab_header *buf;
    size_t chunksize = sizeof(*buf) + n_valid * HASH_PTE_SIZE_64;
    ssize_t rc;

    buf = alloca(chunksize);
    buf->index = index;
    buf->n_valid = n_valid;
    buf->n_invalid = n_invalid;

    qemu_get_buffer(f, (void *)(buf + 1), HASH_PTE_SIZE_64 * n_valid);

    rc = write(fd, buf, chunksize);
    if (rc < 0) {
        fprintf(stderr, "Error writing KVM hash table: %s\n",
                strerror(errno));
        return rc;
    }
    if (rc != chunksize) {
        /* We should never get a short write on a single chunk */
        fprintf(stderr, "Short write, restoring KVM hash table\n");
        return -1;
    }
    return 0;
}

bool kvm_arch_stop_on_emulation_error(CPUState *cpu)
{
    return true;
}

void kvm_arch_init_irq_routing(KVMState *s)
{
}

void kvmppc_read_hptes(ppc_hash_pte64_t *hptes, hwaddr ptex, int n)
{
    int fd, rc;
    int i;

    fd = kvmppc_get_htab_fd(false, ptex, &error_abort);

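    /*
     * Read back n HPTEs starting at ptex, with a buffer sized for at
     * most HPTES_PER_GROUP entries per read(); each read() may still
     * return several header+HPTE chunks, which the inner loop below
     * walks one header at a time.
     */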
    i = 0;
    while (i < n) {
        struct kvm_get_htab_header *hdr;
        int m = n < HPTES_PER_GROUP ? n : HPTES_PER_GROUP;
        char buf[sizeof(*hdr) + m * HASH_PTE_SIZE_64];

        rc = read(fd, buf, sizeof(buf));
        if (rc < 0) {
            hw_error("kvmppc_read_hptes: Unable to read HPTEs");
        }

        hdr = (struct kvm_get_htab_header *)buf;
        while ((i < n) && ((char *)hdr < (buf + rc))) {
            int invalid = hdr->n_invalid, valid = hdr->n_valid;

            if (hdr->index != (ptex + i)) {
                hw_error("kvmppc_read_hptes: Unexpected HPTE index %"PRIu32
                         " != (%"HWADDR_PRIu" + %d)", hdr->index, ptex, i);
            }

            if (n - i < valid) {
                valid = n - i;
            }
            memcpy(hptes + i, hdr + 1, HASH_PTE_SIZE_64 * valid);
            i += valid;

            if ((n - i) < invalid) {
                invalid = n - i;
            }
            memset(hptes + i, 0, invalid * HASH_PTE_SIZE_64);
            i += invalid;

            hdr = (struct kvm_get_htab_header *)
                ((char *)(hdr + 1) + HASH_PTE_SIZE_64 * hdr->n_valid);
        }
    }

    close(fd);
}

void kvmppc_write_hpte(hwaddr ptex, uint64_t pte0, uint64_t pte1)
{
    int fd, rc;
    struct {
        struct kvm_get_htab_header hdr;
        uint64_t pte0;
        uint64_t pte1;
    } buf;

    fd = kvmppc_get_htab_fd(true, 0 /* Ignored */, &error_abort);

    buf.hdr.n_valid = 1;
    buf.hdr.n_invalid = 0;
    buf.hdr.index = ptex;
    buf.pte0 = cpu_to_be64(pte0);
    buf.pte1 = cpu_to_be64(pte1);

    rc = write(fd, &buf, sizeof(buf));
    if (rc != sizeof(buf)) {
        hw_error("kvmppc_write_hpte: Unable to update KVM HPT");
    }
    close(fd);
}

int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
                             uint64_t address, uint32_t data, PCIDevice *dev)
{
    return 0;
}

int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route,
                                int vector, PCIDevice *dev)
{
    return 0;
}

int kvm_arch_release_virq_post(int virq)
{
    return 0;
}

int kvm_arch_msi_data_to_gsi(uint32_t data)
{
    return data & 0xffff;
}

int kvmppc_enable_hwrng(void)
{
    if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_PPC_HWRNG)) {
        return -1;
    }

    return kvmppc_enable_hcall(kvm_state, H_RANDOM);
}

void kvmppc_check_papr_resize_hpt(Error **errp)
{
    if (!kvm_enabled()) {
        return; /* No KVM, we're good */
    }

    if (cap_resize_hpt) {
        return; /* Kernel has explicit support, we're good */
    }

    /* Otherwise fall back on looking for PR KVM */
    if (kvmppc_is_pr(kvm_state)) {
        return;
    }

    error_setg(errp,
               "Hash page table resizing not available with this KVM version");
}

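/*
 * Editorial note: HPT resizing is a two-phase protocol -- the guest
 * first asks us to prepare a new hash table of the requested shift,
 * then commits the switch. The two wrappers below simply forward to
 * the corresponding KVM ioctls when the capability is present.
 */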
int kvmppc_resize_hpt_prepare(PowerPCCPU *cpu, target_ulong flags, int shift)
{
    CPUState *cs = CPU(cpu);
    struct kvm_ppc_resize_hpt rhpt = {
        .flags = flags,
        .shift = shift,
    };

    if (!cap_resize_hpt) {
        return -ENOSYS;
    }

    return kvm_vm_ioctl(cs->kvm_state, KVM_PPC_RESIZE_HPT_PREPARE, &rhpt);
}

int kvmppc_resize_hpt_commit(PowerPCCPU *cpu, target_ulong flags, int shift)
{
    CPUState *cs = CPU(cpu);
    struct kvm_ppc_resize_hpt rhpt = {
        .flags = flags,
        .shift = shift,
    };

    if (!cap_resize_hpt) {
        return -ENOSYS;
    }

    return kvm_vm_ioctl(cs->kvm_state, KVM_PPC_RESIZE_HPT_COMMIT, &rhpt);
}

/*
 * This is a helper function to detect a post-migration scenario in
 * which a guest, running as KVM-HV, freezes in cpu_post_load because
 * the guest kernel can't handle a PVR value other than the actual host
 * PVR in KVM_SET_SREGS, even if pvr_match() returns true.
 *
 * If we don't have cap_ppc_pvr_compat and we're not running in PR
 * (so, we're HV), return true. The workaround itself is done in
 * cpu_post_load.
 *
 * The order here is important: we'll only check for KVM PR as a
 * fallback if the guest kernel can't handle the situation itself.
 * We want to avoid, as much as possible, querying the running KVM
 * type at the QEMU level.
 */
bool kvmppc_pvr_workaround_required(PowerPCCPU *cpu)
{
    CPUState *cs = CPU(cpu);

    if (!kvm_enabled()) {
        return false;
    }

    if (cap_ppc_pvr_compat) {
        return false;
    }

    return !kvmppc_is_pr(cs->kvm_state);
}