accel/kvm/kvm-all.c

   1 /*
   2  * QEMU KVM support
   3  *
   4  * Copyright IBM, Corp. 2008
   5  *           Red Hat, Inc. 2008
   6  *
   7  * Authors:
   8  *  Anthony Liguori   <aliguori@us.ibm.com>
   9  *  Glauber Costa     <gcosta@redhat.com>
  10  *
  11  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  12  * See the COPYING file in the top-level directory.
  13  *
  14  */
  15
  16 #include "qemu/osdep.h"
  17 #include <sys/ioctl.h>
  18 #include <poll.h>
  19
  20 #include <linux/kvm.h>
  21
  22 #include "qemu/atomic.h"
  23 #include "qemu/option.h"
  24 #include "qemu/config-file.h"
  25 #include "qemu/error-report.h"
  26 #include "qapi/error.h"
  27 #include "hw/pci/msi.h"
  28 #include "hw/pci/msix.h"
  29 #include "hw/s390x/adapter.h"
  30 #include "exec/gdbstub.h"
  31 #include "sysemu/kvm_int.h"
  32 #include "sysemu/runstate.h"
  33 #include "sysemu/cpus.h"
  34 #include "qemu/bswap.h"
  35 #include "exec/memory.h"
  36 #include "exec/ram_addr.h"
  37 #include "qemu/event_notifier.h"
  38 #include "qemu/main-loop.h"
  39 #include "trace.h"
  40 #include "hw/irq.h"
  41 #include "qapi/visitor.h"
  42 #include "qapi/qapi-types-common.h"
  43 #include "qapi/qapi-visit-common.h"
  44 #include "sysemu/reset.h"
  45 #include "qemu/guest-random.h"
  46 #include "sysemu/hw_accel.h"
  47 #include "kvm-cpus.h"
  48
  49 #include "hw/boards.h"
  50 #include "monitor/stats.h"
  51
  52 /* This check must be after config-host.h is included */
  53 #ifdef CONFIG_EVENTFD
  54 #include <sys/eventfd.h>
  55 #endif
  56
/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
 * need to use the real host PAGE_SIZE, as that's what KVM will use.
 * Any previous definition is dropped and PAGE_SIZE becomes a call to
 * qemu_real_host_page_size(), i.e. it is no longer a compile-time constant.
 */
#ifdef PAGE_SIZE
#undef PAGE_SIZE
#endif
#define PAGE_SIZE qemu_real_host_page_size()

/* Fall back to 0 when the included headers do not define this flag. */
#ifndef KVM_GUESTDBG_BLOCKIRQ
#define KVM_GUESTDBG_BLOCKIRQ 0
#endif

//#define DEBUG_KVM

/*
 * DPRINTF(): printf-style debug output to stderr when DEBUG_KVM is
 * defined; expands to an empty statement otherwise.
 */
#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

/* Number of list heads in KVMState.msi_hashtab[] */
#define KVM_MSI_HASHTAB_SIZE    256
  80
/*
 * Record of a vCPU that went through do_kvm_destroy_vcpu(): its KVM file
 * descriptor is kept on KVMState.kvm_parked_vcpus so that kvm_get_vcpu()
 * can hand it out again for the same vcpu_id.
 */
struct KVMParkedVcpu {
    unsigned long vcpu_id;          /* value of kvm_arch_vcpu_id() */
    int kvm_fd;                     /* the parked vCPU file descriptor */
    QLIST_ENTRY(KVMParkedVcpu) node;
};
  86
/* States stored in KVMDirtyRingReaper.reaper_state. */
enum KVMDirtyRingReaperState {
    /* Initial value (zero) */
    KVM_DIRTY_RING_REAPER_NONE = 0,
    /* The reaper is sleeping */
    KVM_DIRTY_RING_REAPER_WAIT,
    /* The reaper is reaping for dirty pages */
    KVM_DIRTY_RING_REAPER_REAPING,
};
  94
/*
 * KVM reaper instance, responsible for collecting the KVM dirty bits
 * via the dirty ring.  One instance is embedded in KVMState (the
 * "reaper" member).
 */
struct KVMDirtyRingReaper {
    /* The reaper thread */
    QemuThread reaper_thr;
    volatile uint64_t reaper_iteration; /* iteration number of reaper thr */
    volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */
};
 105
/*
 * State of the KVM accelerator.  The instance in use is reachable
 * through the global kvm_state pointer and is embedded in an AccelState
 * (parent_obj).
 */
struct KVMState
{
    AccelState parent_obj;

    /* Number of entries walked in each KVMMemoryListener.slots[] array */
    int nr_slots;
    int fd;
    int vmfd;
    /*
     * When non-zero, used as a page index into the vCPU's kvm_run mapping
     * to locate the coalesced MMIO ring (see kvm_init_vcpu()).
     */
    int coalesced_mmio;
    int coalesced_pio;
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
    bool coalesced_flush_in_progress;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
#endif
    int max_nested_state_len;
    int many_ioeventfds;
    int intx_set_mask;
    int kvm_shadow_mem;
    bool kernel_irqchip_allowed;
    bool kernel_irqchip_required;
    OnOffAuto kernel_irqchip_split;
    bool sync_mmu;
    uint64_t manual_dirty_log_protect;
    /* The man page (and posix) say ioctl numbers are signed int, but
     * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
     * unsigned, and treating them as signed here can break things */
    unsigned irq_set_ioctl;
    unsigned int sigmask_len;
    GHashTable *gsimap;
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing *irq_routes;
    int nr_allocated_irq_routes;
    unsigned long *used_gsi_bitmap;
    unsigned int gsi_count;
    QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
#endif
    KVMMemoryListener memory_listener;
    /* vCPU fds parked by do_kvm_destroy_vcpu(), reused by kvm_get_vcpu() */
    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;

    /* For "info mtree -f" to tell if an MR is registered in KVM */
    int nr_as;
    struct KVMAs {
        KVMMemoryListener *ml;
        AddressSpace *as;
    } *as;
    uint64_t kvm_dirty_ring_bytes;  /* Size of the per-vcpu dirty ring */
    uint32_t kvm_dirty_ring_size;   /* Number of dirty GFNs per ring */
    struct KVMDirtyRingReaper reaper;
};
 158
/* The KVM accelerator state; many helpers in this file read it directly. */
KVMState *kvm_state;
/*
 * Global KVM feature flags.
 * NOTE(review): presumably assigned during accelerator/irqchip setup,
 * which is not part of the code next to these declarations - confirm.
 */
bool kvm_kernel_irqchip;
bool kvm_split_irqchip;
bool kvm_async_interrupts_allowed;
bool kvm_halt_in_kernel_allowed;
bool kvm_eventfds_allowed;
bool kvm_irqfds_allowed;
bool kvm_resamplefds_allowed;
bool kvm_msi_via_irqfd_allowed;
bool kvm_gsi_routing_allowed;
bool kvm_gsi_direct_mapping;
bool kvm_allowed;
/* Gates the use of KVM_MEM_READONLY in kvm_mem_flags() */
bool kvm_readonly_mem_allowed;
bool kvm_vm_attributes_allowed;
bool kvm_direct_msi_allowed;
bool kvm_ioeventfd_any_length_allowed;
bool kvm_msi_use_devid;
bool kvm_has_guest_debug;
int kvm_sstep_flags;
static bool kvm_immediate_exit;
/*
 * Largest size handled as one memslot; sections are walked in chunks of
 * MIN(kvm_max_slot_size, remaining size).  Defaults to all bits set.
 */
static hwaddr kvm_max_slot_size = ~0;

/* Table of capabilities treated as mandatory, ended by KVM_CAP_LAST_INFO. */
static const KVMCapabilityInfo kvm_required_capabilites[] = {
    KVM_CAP_INFO(USER_MEMORY),
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
    KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
    KVM_CAP_LAST_INFO
};

/* Notifiers registered for irqchip changes. */
static NotifierList kvm_irqchip_change_notifiers =
    NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
 190
/*
 * Association between a GSI and the EventNotifier that
 * kvm_resample_fd_notify() signals for it.  Entries live on
 * kvm_resample_fd_list.
 */
struct KVMResampleFd {
    int gsi;                        /* lookup key */
    EventNotifier *resample_event;  /* notifier set on notify; not owned */
    QLIST_ENTRY(KVMResampleFd) node;
};
typedef struct KVMResampleFd KVMResampleFd;
 197
/*
 * Only used with split irqchip where we need to do the resample fd
 * kick for the kernel from userspace.
 */
static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
    QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);

/*
 * Single mutex guarding the memslot arrays; always taken and released
 * through the kvm_slots_lock()/kvm_slots_unlock() wrappers below.
 */
static QemuMutex kml_slots_lock;

#define kvm_slots_lock()    qemu_mutex_lock(&kml_slots_lock)
#define kvm_slots_unlock()  qemu_mutex_unlock(&kml_slots_lock)

/* Defined further down; needed earlier by kvm_slot_update_flags(). */
static void kvm_slot_init_dirty_bitmap(KVMSlot *mem);
 211
 212 static inline void kvm_resample_fd_remove(int gsi)
 213 {
 214     KVMResampleFd *rfd;
 215
 216     QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
 217         if (rfd->gsi == gsi) {
 218             QLIST_REMOVE(rfd, node);
 219             g_free(rfd);
 220             break;
 221         }
 222     }
 223 }
 224
 225 static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
 226 {
 227     KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);
 228
 229     rfd->gsi = gsi;
 230     rfd->resample_event = event;
 231
 232     QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
 233 }
 234
 235 void kvm_resample_fd_notify(int gsi)
 236 {
 237     KVMResampleFd *rfd;
 238
 239     QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
 240         if (rfd->gsi == gsi) {
 241             event_notifier_set(rfd->resample_event);
 242             trace_kvm_resample_fd_notify(gsi);
 243             return;
 244         }
 245     }
 246 }
 247
 248 int kvm_get_max_memslots(void)
 249 {
 250     KVMState *s = KVM_STATE(current_accel());
 251
 252     return s->nr_slots;
 253 }
 254
 255 /* Called with KVMMemoryListener.slots_lock held */
 256 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
 257 {
 258     KVMState *s = kvm_state;
 259     int i;
 260
 261     for (i = 0; i < s->nr_slots; i++) {
 262         if (kml->slots[i].memory_size == 0) {
 263             return &kml->slots[i];
 264         }
 265     }
 266
 267     return NULL;
 268 }
 269
 270 bool kvm_has_free_slot(MachineState *ms)
 271 {
 272     KVMState *s = KVM_STATE(ms->accelerator);
 273     bool result;
 274     KVMMemoryListener *kml = &s->memory_listener;
 275
 276     kvm_slots_lock();
 277     result = !!kvm_get_free_slot(kml);
 278     kvm_slots_unlock();
 279
 280     return result;
 281 }
 282
 283 /* Called with KVMMemoryListener.slots_lock held */
 284 static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
 285 {
 286     KVMSlot *slot = kvm_get_free_slot(kml);
 287
 288     if (slot) {
 289         return slot;
 290     }
 291
 292     fprintf(stderr, "%s: no free slot available\n", __func__);
 293     abort();
 294 }
 295
 296 static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
 297                                          hwaddr start_addr,
 298                                          hwaddr size)
 299 {
 300     KVMState *s = kvm_state;
 301     int i;
 302
 303     for (i = 0; i < s->nr_slots; i++) {
 304         KVMSlot *mem = &kml->slots[i];
 305
 306         if (start_addr == mem->start_addr && size == mem->memory_size) {
 307             return mem;
 308         }
 309     }
 310
 311     return NULL;
 312 }
 313
 314 /*
 315  * Calculate and align the start address and the size of the section.
 316  * Return the size. If the size is 0, the aligned section is empty.
 317  */
 318 static hwaddr kvm_align_section(MemoryRegionSection *section,
 319                                 hwaddr *start)
 320 {
 321     hwaddr size = int128_get64(section->size);
 322     hwaddr delta, aligned;
 323
 324     /* kvm works in page size chunks, but the function may be called
 325        with sub-page size and unaligned start address. Pad the start
 326        address to next and truncate size to previous page boundary. */
 327     aligned = ROUND_UP(section->offset_within_address_space,
 328                        qemu_real_host_page_size());
 329     delta = aligned - section->offset_within_address_space;
 330     *start = aligned;
 331     if (delta > size) {
 332         return 0;
 333     }
 334
 335     return (size - delta) & qemu_real_host_page_mask();
 336 }
 337
 338 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
 339                                        hwaddr *phys_addr)
 340 {
 341     KVMMemoryListener *kml = &s->memory_listener;
 342     int i, ret = 0;
 343
 344     kvm_slots_lock();
 345     for (i = 0; i < s->nr_slots; i++) {
 346         KVMSlot *mem = &kml->slots[i];
 347
 348         if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
 349             *phys_addr = mem->start_addr + (ram - mem->ram);
 350             ret = 1;
 351             break;
 352         }
 353     }
 354     kvm_slots_unlock();
 355
 356     return ret;
 357 }
 358
 359 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
 360 {
 361     KVMState *s = kvm_state;
 362     struct kvm_userspace_memory_region mem;
 363     int ret;
 364
 365     mem.slot = slot->slot | (kml->as_id << 16);
 366     mem.guest_phys_addr = slot->start_addr;
 367     mem.userspace_addr = (unsigned long)slot->ram;
 368     mem.flags = slot->flags;
 369
 370     if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) {
 371         /* Set the slot size to 0 before setting the slot to the desired
 372          * value. This is needed based on KVM commit 75d61fbc. */
 373         mem.memory_size = 0;
 374         ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 375         if (ret < 0) {
 376             goto err;
 377         }
 378     }
 379     mem.memory_size = slot->memory_size;
 380     ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 381     slot->old_flags = mem.flags;
 382 err:
 383     trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr,
 384                               mem.memory_size, mem.userspace_addr, ret);
 385     if (ret < 0) {
 386         error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d,"
 387                      " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s",
 388                      __func__, mem.slot, slot->start_addr,
 389                      (uint64_t)mem.memory_size, strerror(errno));
 390     }
 391     return ret;
 392 }
 393
 394 static int do_kvm_destroy_vcpu(CPUState *cpu)
 395 {
 396     KVMState *s = kvm_state;
 397     long mmap_size;
 398     struct KVMParkedVcpu *vcpu = NULL;
 399     int ret = 0;
 400
 401     DPRINTF("kvm_destroy_vcpu\n");
 402
 403     ret = kvm_arch_destroy_vcpu(cpu);
 404     if (ret < 0) {
 405         goto err;
 406     }
 407
 408     mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
 409     if (mmap_size < 0) {
 410         ret = mmap_size;
 411         DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
 412         goto err;
 413     }
 414
 415     ret = munmap(cpu->kvm_run, mmap_size);
 416     if (ret < 0) {
 417         goto err;
 418     }
 419
 420     if (cpu->kvm_dirty_gfns) {
 421         ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_bytes);
 422         if (ret < 0) {
 423             goto err;
 424         }
 425     }
 426
 427     vcpu = g_malloc0(sizeof(*vcpu));
 428     vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
 429     vcpu->kvm_fd = cpu->kvm_fd;
 430     QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
 431 err:
 432     return ret;
 433 }
 434
 435 void kvm_destroy_vcpu(CPUState *cpu)
 436 {
 437     if (do_kvm_destroy_vcpu(cpu) < 0) {
 438         error_report("kvm_destroy_vcpu failed");
 439         exit(EXIT_FAILURE);
 440     }
 441 }
 442
 443 static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
 444 {
 445     struct KVMParkedVcpu *cpu;
 446
 447     QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
 448         if (cpu->vcpu_id == vcpu_id) {
 449             int kvm_fd;
 450
 451             QLIST_REMOVE(cpu, node);
 452             kvm_fd = cpu->kvm_fd;
 453             g_free(cpu);
 454             return kvm_fd;
 455         }
 456     }
 457
 458     return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
 459 }
 460
 461 int kvm_init_vcpu(CPUState *cpu, Error **errp)
 462 {
 463     KVMState *s = kvm_state;
 464     long mmap_size;
 465     int ret;
 466
 467     trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
 468
 469     ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
 470     if (ret < 0) {
 471         error_setg_errno(errp, -ret, "kvm_init_vcpu: kvm_get_vcpu failed (%lu)",
 472                          kvm_arch_vcpu_id(cpu));
 473         goto err;
 474     }
 475
 476     cpu->kvm_fd = ret;
 477     cpu->kvm_state = s;
 478     cpu->vcpu_dirty = true;
 479     cpu->dirty_pages = 0;
 480
 481     mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
 482     if (mmap_size < 0) {
 483         ret = mmap_size;
 484         error_setg_errno(errp, -mmap_size,
 485                          "kvm_init_vcpu: KVM_GET_VCPU_MMAP_SIZE failed");
 486         goto err;
 487     }
 488
 489     cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
 490                         cpu->kvm_fd, 0);
 491     if (cpu->kvm_run == MAP_FAILED) {
 492         ret = -errno;
 493         error_setg_errno(errp, ret,
 494                          "kvm_init_vcpu: mmap'ing vcpu state failed (%lu)",
 495                          kvm_arch_vcpu_id(cpu));
 496         goto err;
 497     }
 498
 499     if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
 500         s->coalesced_mmio_ring =
 501             (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
 502     }
 503
 504     if (s->kvm_dirty_ring_size) {
 505         /* Use MAP_SHARED to share pages with the kernel */
 506         cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_bytes,
 507                                    PROT_READ | PROT_WRITE, MAP_SHARED,
 508                                    cpu->kvm_fd,
 509                                    PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
 510         if (cpu->kvm_dirty_gfns == MAP_FAILED) {
 511             ret = -errno;
 512             DPRINTF("mmap'ing vcpu dirty gfns failed: %d\n", ret);
 513             goto err;
 514         }
 515     }
 516
 517     ret = kvm_arch_init_vcpu(cpu);
 518     if (ret < 0) {
 519         error_setg_errno(errp, -ret,
 520                          "kvm_init_vcpu: kvm_arch_init_vcpu failed (%lu)",
 521                          kvm_arch_vcpu_id(cpu));
 522     }
 523 err:
 524     return ret;
 525 }
 526
 527 /*
 528  * dirty pages logging control
 529  */
 530
 531 static int kvm_mem_flags(MemoryRegion *mr)
 532 {
 533     bool readonly = mr->readonly || memory_region_is_romd(mr);
 534     int flags = 0;
 535
 536     if (memory_region_get_dirty_log_mask(mr) != 0) {
 537         flags |= KVM_MEM_LOG_DIRTY_PAGES;
 538     }
 539     if (readonly && kvm_readonly_mem_allowed) {
 540         flags |= KVM_MEM_READONLY;
 541     }
 542     return flags;
 543 }
 544
 545 /* Called with KVMMemoryListener.slots_lock held */
 546 static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
 547                                  MemoryRegion *mr)
 548 {
 549     mem->flags = kvm_mem_flags(mr);
 550
 551     /* If nothing changed effectively, no need to issue ioctl */
 552     if (mem->flags == mem->old_flags) {
 553         return 0;
 554     }
 555
 556     kvm_slot_init_dirty_bitmap(mem);
 557     return kvm_set_user_memory_region(kml, mem, false);
 558 }
 559
 560 static int kvm_section_update_flags(KVMMemoryListener *kml,
 561                                     MemoryRegionSection *section)
 562 {
 563     hwaddr start_addr, size, slot_size;
 564     KVMSlot *mem;
 565     int ret = 0;
 566
 567     size = kvm_align_section(section, &start_addr);
 568     if (!size) {
 569         return 0;
 570     }
 571
 572     kvm_slots_lock();
 573
 574     while (size && !ret) {
 575         slot_size = MIN(kvm_max_slot_size, size);
 576         mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
 577         if (!mem) {
 578             /* We don't have a slot if we want to trap every access. */
 579             goto out;
 580         }
 581
 582         ret = kvm_slot_update_flags(kml, mem, section->mr);
 583         start_addr += slot_size;
 584         size -= slot_size;
 585     }
 586
 587 out:
 588     kvm_slots_unlock();
 589     return ret;
 590 }
 591
 592 static void kvm_log_start(MemoryListener *listener,
 593                           MemoryRegionSection *section,
 594                           int old, int new)
 595 {
 596     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
 597     int r;
 598
 599     if (old != 0) {
 600         return;
 601     }
 602
 603     r = kvm_section_update_flags(kml, section);
 604     if (r < 0) {
 605         abort();
 606     }
 607 }
 608
 609 static void kvm_log_stop(MemoryListener *listener,
 610                           MemoryRegionSection *section,
 611                           int old, int new)
 612 {
 613     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
 614     int r;
 615
 616     if (new != 0) {
 617         return;
 618     }
 619
 620     r = kvm_section_update_flags(kml, section);
 621     if (r < 0) {
 622         abort();
 623     }
 624 }
 625
 626 /* get kvm's dirty pages bitmap and update qemu's */
 627 static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
 628 {
 629     ram_addr_t start = slot->ram_start_offset;
 630     ram_addr_t pages = slot->memory_size / qemu_real_host_page_size();
 631
 632     cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
 633 }
 634
 635 static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
 636 {
 637     memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
 638 }
 639
 640 #define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
 641
 642 /* Allocate the dirty bitmap for a slot  */
 643 static void kvm_slot_init_dirty_bitmap(KVMSlot *mem)
 644 {
 645     if (!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) || mem->dirty_bmap) {
 646         return;
 647     }
 648
 649     /*
 650      * XXX bad kernel interface alert
 651      * For dirty bitmap, kernel allocates array of size aligned to
 652      * bits-per-long.  But for case when the kernel is 64bits and
 653      * the userspace is 32bits, userspace can't align to the same
 654      * bits-per-long, since sizeof(long) is different between kernel
 655      * and user space.  This way, userspace will provide buffer which
 656      * may be 4 bytes less than the kernel will use, resulting in
 657      * userspace memory corruption (which is not detectable by valgrind
 658      * too, in most cases).
 659      * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
 660      * a hope that sizeof(long) won't become >8 any time soon.
 661      *
 662      * Note: the granule of kvm dirty log is qemu_real_host_page_size.
 663      * And mem->memory_size is aligned to it (otherwise this mem can't
 664      * be registered to KVM).
 665      */
 666     hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size(),
 667                                         /*HOST_LONG_BITS*/ 64) / 8;
 668     mem->dirty_bmap = g_malloc0(bitmap_size);
 669     mem->dirty_bmap_size = bitmap_size;
 670 }
 671
 672 /*
 673  * Sync dirty bitmap from kernel to KVMSlot.dirty_bmap, return true if
 674  * succeeded, false otherwise
 675  */
 676 static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot)
 677 {
 678     struct kvm_dirty_log d = {};
 679     int ret;
 680
 681     d.dirty_bitmap = slot->dirty_bmap;
 682     d.slot = slot->slot | (slot->as_id << 16);
 683     ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d);
 684
 685     if (ret == -ENOENT) {
 686         /* kernel does not have dirty bitmap in this slot */
 687         ret = 0;
 688     }
 689     if (ret) {
 690         error_report_once("%s: KVM_GET_DIRTY_LOG failed with %d",
 691                           __func__, ret);
 692     }
 693     return ret == 0;
 694 }
 695
 696 /* Should be with all slots_lock held for the address spaces. */
 697 static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
 698                                      uint32_t slot_id, uint64_t offset)
 699 {
 700     KVMMemoryListener *kml;
 701     KVMSlot *mem;
 702
 703     if (as_id >= s->nr_as) {
 704         return;
 705     }
 706
 707     kml = s->as[as_id].ml;
 708     mem = &kml->slots[slot_id];
 709
 710     if (!mem->memory_size || offset >=
 711         (mem->memory_size / qemu_real_host_page_size())) {
 712         return;
 713     }
 714
 715     set_bit(offset, mem->dirty_bmap);
 716 }
 717
 718 static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
 719 {
 720     return gfn->flags == KVM_DIRTY_GFN_F_DIRTY;
 721 }
 722
 723 static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
 724 {
 725     gfn->flags = KVM_DIRTY_GFN_F_RESET;
 726 }
 727
 728 /*
 729  * Should be with all slots_lock held for the address spaces.  It returns the
 730  * dirty page we've collected on this dirty ring.
 731  */
 732 static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
 733 {
 734     struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
 735     uint32_t ring_size = s->kvm_dirty_ring_size;
 736     uint32_t count = 0, fetch = cpu->kvm_fetch_index;
 737
 738     assert(dirty_gfns && ring_size);
 739     trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);
 740
 741     while (true) {
 742         cur = &dirty_gfns[fetch % ring_size];
 743         if (!dirty_gfn_is_dirtied(cur)) {
 744             break;
 745         }
 746         kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
 747                                  cur->offset);
 748         dirty_gfn_set_collected(cur);
 749         trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
 750         fetch++;
 751         count++;
 752     }
 753     cpu->kvm_fetch_index = fetch;
 754     cpu->dirty_pages += count;
 755
 756     return count;
 757 }
 758
 759 /* Must be with slots_lock held */
 760 static uint64_t kvm_dirty_ring_reap_locked(KVMState *s)
 761 {
 762     int ret;
 763     CPUState *cpu;
 764     uint64_t total = 0;
 765     int64_t stamp;
 766
 767     stamp = get_clock();
 768
 769     CPU_FOREACH(cpu) {
 770         total += kvm_dirty_ring_reap_one(s, cpu);
 771     }
 772
 773     if (total) {
 774         ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
 775         assert(ret == total);
 776     }
 777
 778     stamp = get_clock() - stamp;
 779
 780     if (total) {
 781         trace_kvm_dirty_ring_reap(total, stamp / 1000);
 782     }
 783
 784     return total;
 785 }
 786
 787 /*
 788  * Currently for simplicity, we must hold BQL before calling this.  We can
 789  * consider to drop the BQL if we're clear with all the race conditions.
 790  */
 791 static uint64_t kvm_dirty_ring_reap(KVMState *s)
 792 {
 793     uint64_t total;
 794
 795     /*
 796      * We need to lock all kvm slots for all address spaces here,
 797      * because:
 798      *
 799      * (1) We need to mark dirty for dirty bitmaps in multiple slots
 800      *     and for tons of pages, so it's better to take the lock here
 801      *     once rather than once per page.  And more importantly,
 802      *
 803      * (2) We must _NOT_ publish dirty bits to the other threads
 804      *     (e.g., the migration thread) via the kvm memory slot dirty
 805      *     bitmaps before correctly re-protect those dirtied pages.
 806      *     Otherwise we can have potential risk of data corruption if
 807      *     the page data is read in the other thread before we do
 808      *     reset below.
 809      */
 810     kvm_slots_lock();
 811     total = kvm_dirty_ring_reap_locked(s);
 812     kvm_slots_unlock();
 813
 814     return total;
 815 }
 816
/*
 * Deliberately empty work item passed to run_on_cpu() by
 * kvm_cpu_synchronize_kick_all(); both arguments are unused.
 */
static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
{
    /* No need to do anything */
}
 821
 822 /*
 823  * Kick all vcpus out in a synchronized way.  When returned, we
 824  * guarantee that every vcpu has been kicked and at least returned to
 825  * userspace once.
 826  */
 827 static void kvm_cpu_synchronize_kick_all(void)
 828 {
 829     CPUState *cpu;
 830
 831     CPU_FOREACH(cpu) {
 832         run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
 833     }
 834 }
 835
/*
 * Flush all the existing dirty pages to the KVM slot buffers.  When
 * this call returns, we guarantee that all the touched dirty pages
 * before calling this function have been put into the per-kvmslot
 * dirty bitmap.
 *
 * The work is bracketed by two trace events (argument 0 on entry, 1 on
 * exit).
 *
 * This function must be called with BQL held.
 */
static void kvm_dirty_ring_flush(void)
{
    trace_kvm_dirty_ring_flush(0);
    /*
     * The function needs to be serialized.  Since this function
     * should always be with BQL held, serialization is guaranteed.
     * However, let's be sure of it.
     */
    assert(qemu_mutex_iothread_locked());
    /*
     * First make sure to flush the hardware buffers by kicking all
     * vcpus out in a synchronous way.
     */
    kvm_cpu_synchronize_kick_all();
    /* Then collect whatever the rings now contain into the slot bitmaps. */
    kvm_dirty_ring_reap(kvm_state);
    trace_kvm_dirty_ring_flush(1);
}
 861
 862 /**
 863  * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
 864  *
 865  * This function will first try to fetch dirty bitmap from the kernel,
 866  * and then updates qemu's dirty bitmap.
 867  *
 868  * NOTE: caller must be with kml->slots_lock held.
 869  *
 870  * @kml: the KVM memory listener object
 871  * @section: the memory section to sync the dirty bitmap with
 872  */
 873 static void kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
 874                                            MemoryRegionSection *section)
 875 {
 876     KVMState *s = kvm_state;
 877     KVMSlot *mem;
 878     hwaddr start_addr, size;
 879     hwaddr slot_size;
 880
 881     size = kvm_align_section(section, &start_addr);
 882     while (size) {
 883         slot_size = MIN(kvm_max_slot_size, size);
 884         mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
 885         if (!mem) {
 886             /* We don't have a slot if we want to trap every access. */
 887             return;
 888         }
 889         if (kvm_slot_get_dirty_log(s, mem)) {
 890             kvm_slot_sync_dirty_pages(mem);
 891         }
 892         start_addr += slot_size;
 893         size -= slot_size;
 894     }
 895 }
 896
/*
 * Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages.
 *
 * KVM_CLEAR_LOG_ALIGN is that alignment in bytes (64 host pages) and
 * KVM_CLEAR_LOG_MASK the mask that rounds a byte offset down to it.
 * Both evaluate qemu_real_host_page_size() at each use.
 */
#define KVM_CLEAR_LOG_SHIFT  6
#define KVM_CLEAR_LOG_ALIGN  (qemu_real_host_page_size() << KVM_CLEAR_LOG_SHIFT)
#define KVM_CLEAR_LOG_MASK   (-KVM_CLEAR_LOG_ALIGN)
 901
/*
 * Clear the dirty bits of a byte range of one memslot, both in the
 * kernel (KVM_CLEAR_DIRTY_LOG) and in the slot's cached dirty bitmap.
 *
 * @mem:   the memslot; its dirty_bmap must already exist
 * @as_id: address space id, encoded in bits 16 and up of the kernel slot
 * @start: byte offset of the range from the start of the memslot
 * @size:  length of the range in bytes
 *
 * The range handed to the kernel is widened to satisfy its 64-page
 * alignment rules, but only bits inside [@start, @start + @size) that
 * are set in the cached bitmap are requested to be cleared.
 *
 * Returns 0 on success (-ENOENT from the kernel counts as success),
 * otherwise the negative ioctl result.  The cached bitmap range is
 * cleared in either case.
 */
static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
                                  uint64_t size)
{
    KVMState *s = kvm_state;
    uint64_t end, bmap_start, start_delta, bmap_npages;
    struct kvm_clear_dirty_log d;
    /* Note: psize is a plain unsigned long, only bmap_clear is a pointer */
    unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size();
    int ret;

    /*
     * We need to extend either the start or the size or both to
     * satisfy the KVM interface requirement.  Firstly, do the start
     * page alignment on 64 host pages
     */
    bmap_start = start & KVM_CLEAR_LOG_MASK;
    start_delta = start - bmap_start;
    /* From here on bmap_start is a page index, start_delta still bytes */
    bmap_start /= psize;

    /*
     * The kernel interface has restriction on the size too, that either:
     *
     * (1) the size is 64 host pages aligned (just like the start), or
     * (2) the size fills up until the end of the KVM memslot.
     */
    bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN)
        << KVM_CLEAR_LOG_SHIFT;
    end = mem->memory_size / psize;
    if (bmap_npages > end - bmap_start) {
        bmap_npages = end - bmap_start;
    }
    /* start_delta becomes a page count as well */
    start_delta /= psize;

    /*
     * Prepare the bitmap to clear dirty bits.  Here we must guarantee
     * that we won't clear any unknown dirty bits otherwise we might
     * accidentally clear some set bits which are not yet synced from
     * the kernel into QEMU's bitmap, then we'll lose track of the
     * guest modifications upon those pages (which can directly lead
     * to guest data loss or panic after migration).
     *
     * Layout of the KVMSlot.dirty_bmap:
     *
     *                   |<-------- bmap_npages -----------..>|
     *                                                     [1]
     *                     start_delta         size
     *  |----------------|-------------|------------------|------------|
     *  ^                ^             ^                               ^
     *  |                |             |                               |
     * start          bmap_start     (start)                         end
     * of memslot                                             of memslot
     *
     * [1] bmap_npages can be aligned to either 64 pages or the end of slot
     */

    assert(bmap_start % BITS_PER_LONG == 0);
    /* We should never do log_clear before log_sync */
    assert(mem->dirty_bmap);
    if (start_delta || bmap_npages - size / psize) {
        /* Slow path - we need to manipulate a temp bitmap */
        bmap_clear = bitmap_new(bmap_npages);
        bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap,
                                    bmap_start, start_delta + size / psize);
        /*
         * We need to fill the holes at start because that was not
         * specified by the caller and we extended the bitmap only for
         * 64 pages alignment
         */
        bitmap_clear(bmap_clear, 0, start_delta);
        d.dirty_bitmap = bmap_clear;
    } else {
        /*
         * Fast path - both start and size align well with BITS_PER_LONG
         * (or the end of memory slot)
         */
        d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start);
    }

    d.first_page = bmap_start;
    /* It should never overflow.  If it happens, say something */
    assert(bmap_npages <= UINT32_MAX);
    d.num_pages = bmap_npages;
    d.slot = mem->slot | (as_id << 16);

    ret = kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d);
    if (ret < 0 && ret != -ENOENT) {
        error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, "
                     "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d",
                     __func__, d.slot, (uint64_t)d.first_page,
                     (uint32_t)d.num_pages, ret);
    } else {
        /* Success, or -ENOENT which is not treated as an error */
        ret = 0;
        trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages);
    }

    /*
     * After we have updated the remote dirty bitmap, we update the
     * cached bitmap as well for the memslot, then if another user
     * clears the same region we know we shouldn't clear it again on
     * the remote otherwise it's data loss as well.
     */
    bitmap_clear(mem->dirty_bmap, bmap_start + start_delta,
                 size / psize);
    /* This handles the NULL case well */
    g_free(bmap_clear);
    return ret;
}
1008
1009
1010 /**
1011  * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range
1012  *
1013  * NOTE: this will be a no-op if we haven't enabled manual dirty log
1014  * protection in the host kernel because in that case this operation
1015  * will be done within log_sync().
1016  *
1017  * @kml:     the kvm memory listener
1018  * @section: the memory range to clear dirty bitmap
1019  */
1020 static int kvm_physical_log_clear(KVMMemoryListener *kml,
1021                                   MemoryRegionSection *section)
1022 {
1023     KVMState *s = kvm_state;
1024     uint64_t start, size, offset, count;
1025     KVMSlot *mem;
1026     int ret = 0, i;
1027
1028     if (!s->manual_dirty_log_protect) {
1029         /* No need to do explicit clear */
1030         return ret;
1031     }
1032
1033     start = section->offset_within_address_space;
1034     size = int128_get64(section->size);
1035
1036     if (!size) {
1037         /* Nothing more we can do... */
1038         return ret;
1039     }
1040
1041     kvm_slots_lock();
1042
1043     for (i = 0; i < s->nr_slots; i++) {
1044         mem = &kml->slots[i];
1045         /* Discard slots that are empty or do not overlap the section */
1046         if (!mem->memory_size ||
1047             mem->start_addr > start + size - 1 ||
1048             start > mem->start_addr + mem->memory_size - 1) {
1049             continue;
1050         }
1051
1052         if (start >= mem->start_addr) {
1053             /* The slot starts before section or is aligned to it.  */
1054             offset = start - mem->start_addr;
1055             count = MIN(mem->memory_size - offset, size);
1056         } else {
1057             /* The slot starts after section.  */
1058             offset = 0;
1059             count = MIN(mem->memory_size, size - (mem->start_addr - start));
1060         }
1061         ret = kvm_log_clear_one_slot(mem, kml->as_id, offset, count);
1062         if (ret < 0) {
1063             break;
1064         }
1065     }
1066
1067     kvm_slots_unlock();
1068
1069     return ret;
1070 }
1071
1072 static void kvm_coalesce_mmio_region(MemoryListener *listener,
1073                                      MemoryRegionSection *secion,
1074                                      hwaddr start, hwaddr size)
1075 {
1076     KVMState *s = kvm_state;
1077
1078     if (s->coalesced_mmio) {
1079         struct kvm_coalesced_mmio_zone zone;
1080
1081         zone.addr = start;
1082         zone.size = size;
1083         zone.pad = 0;
1084
1085         (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
1086     }
1087 }
1088
1089 static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
1090                                        MemoryRegionSection *secion,
1091                                        hwaddr start, hwaddr size)
1092 {
1093     KVMState *s = kvm_state;
1094
1095     if (s->coalesced_mmio) {
1096         struct kvm_coalesced_mmio_zone zone;
1097
1098         zone.addr = start;
1099         zone.size = size;
1100         zone.pad = 0;
1101
1102         (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
1103     }
1104 }
1105
1106 static void kvm_coalesce_pio_add(MemoryListener *listener,
1107                                 MemoryRegionSection *section,
1108                                 hwaddr start, hwaddr size)
1109 {
1110     KVMState *s = kvm_state;
1111
1112     if (s->coalesced_pio) {
1113         struct kvm_coalesced_mmio_zone zone;
1114
1115         zone.addr = start;
1116         zone.size = size;
1117         zone.pio = 1;
1118
1119         (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
1120     }
1121 }
1122
1123 static void kvm_coalesce_pio_del(MemoryListener *listener,
1124                                 MemoryRegionSection *section,
1125                                 hwaddr start, hwaddr size)
1126 {
1127     KVMState *s = kvm_state;
1128
1129     if (s->coalesced_pio) {
1130         struct kvm_coalesced_mmio_zone zone;
1131
1132         zone.addr = start;
1133         zone.size = size;
1134         zone.pio = 1;
1135
1136         (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
1137      }
1138 }
1139
1140 static MemoryListener kvm_coalesced_pio_listener = {
1141     .name = "kvm-coalesced-pio",
1142     .coalesced_io_add = kvm_coalesce_pio_add,
1143     .coalesced_io_del = kvm_coalesce_pio_del,
1144 };
1145
1146 int kvm_check_extension(KVMState *s, unsigned int extension)
1147 {
1148     int ret;
1149
1150     ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
1151     if (ret < 0) {
1152         ret = 0;
1153     }
1154
1155     return ret;
1156 }
1157
1158 int kvm_vm_check_extension(KVMState *s, unsigned int extension)
1159 {
1160     int ret;
1161
1162     ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
1163     if (ret < 0) {
1164         /* VM wide version not implemented, use global one instead */
1165         ret = kvm_check_extension(s, extension);
1166     }
1167
1168     return ret;
1169 }
1170
1171 typedef struct HWPoisonPage {
1172     ram_addr_t ram_addr;
1173     QLIST_ENTRY(HWPoisonPage) list;
1174 } HWPoisonPage;
1175
1176 static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
1177     QLIST_HEAD_INITIALIZER(hwpoison_page_list);
1178
1179 static void kvm_unpoison_all(void *param)
1180 {
1181     HWPoisonPage *page, *next_page;
1182
1183     QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
1184         QLIST_REMOVE(page, list);
1185         qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
1186         g_free(page);
1187     }
1188 }
1189
1190 void kvm_hwpoison_page_add(ram_addr_t ram_addr)
1191 {
1192     HWPoisonPage *page;
1193
1194     QLIST_FOREACH(page, &hwpoison_page_list, list) {
1195         if (page->ram_addr == ram_addr) {
1196             return;
1197         }
1198     }
1199     page = g_new(HWPoisonPage, 1);
1200     page->ram_addr = ram_addr;
1201     QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
1202 }
1203
/*
 * Convert an ioeventfd match value from target to host endianness.
 *
 * When host and target endianness differ, a 2-byte value is swapped with
 * bswap16() and a 4-byte value with bswap32(); every other @size, and every
 * value when the endianness matches, is returned unchanged.
 */
static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
{
#if HOST_BIG_ENDIAN != TARGET_BIG_ENDIAN
    /* The kernel expects ioeventfd values in HOST_BIG_ENDIAN
     * endianness, but the memory core hands them in target endianness.
     * For example, PPC is always treated as big-endian even if running
     * on KVM and on PPC64LE.  Correct here.
     */
    if (size == 2) {
        return bswap16(val);
    }
    if (size == 4) {
        return bswap32(val);
    }
#endif
    return val;
}
1223
1224 static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
1225                                   bool assign, uint32_t size, bool datamatch)
1226 {
1227     int ret;
1228     struct kvm_ioeventfd iofd = {
1229         .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
1230         .addr = addr,
1231         .len = size,
1232         .flags = 0,
1233         .fd = fd,
1234     };
1235
1236     trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size,
1237                                  datamatch);
1238     if (!kvm_enabled()) {
1239         return -ENOSYS;
1240     }
1241
1242     if (datamatch) {
1243         iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
1244     }
1245     if (!assign) {
1246         iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
1247     }
1248
1249     ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
1250
1251     if (ret < 0) {
1252         return -errno;
1253     }
1254
1255     return 0;
1256 }
1257
1258 static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
1259                                  bool assign, uint32_t size, bool datamatch)
1260 {
1261     struct kvm_ioeventfd kick = {
1262         .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
1263         .addr = addr,
1264         .flags = KVM_IOEVENTFD_FLAG_PIO,
1265         .len = size,
1266         .fd = fd,
1267     };
1268     int r;
1269     trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch);
1270     if (!kvm_enabled()) {
1271         return -ENOSYS;
1272     }
1273     if (datamatch) {
1274         kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
1275     }
1276     if (!assign) {
1277         kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
1278     }
1279     r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
1280     if (r < 0) {
1281         return r;
1282     }
1283     return 0;
1284 }
1285
1286
/*
 * Probe whether the host accepts seven simultaneous PIO ioeventfds.
 *
 * Returns 1 when all seven could be created and assigned, 0 otherwise
 * (always 0 without CONFIG_EVENTFD).  Every probe eventfd that was
 * assigned is deassigned and closed again before returning.
 */
static int kvm_check_many_ioeventfds(void)
{
    /* Userspace can use ioeventfd for io notification.  This requires a host
     * that supports eventfd(2) and an I/O thread; since eventfd does not
     * support SIGIO it cannot interrupt the vcpu.
     *
     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
     * can avoid creating too many ioeventfds.
     */
#if defined(CONFIG_EVENTFD)
    int fds[7];
    int nr_ok = 0;
    int many;
    int k;

    while (nr_ok < ARRAY_SIZE(fds)) {
        fds[nr_ok] = eventfd(0, EFD_CLOEXEC);
        if (fds[nr_ok] < 0) {
            break;
        }
        if (kvm_set_ioeventfd_pio(fds[nr_ok], 0, nr_ok, true, 2, true) < 0) {
            /* Not assigned, so it only needs closing */
            close(fds[nr_ok]);
            break;
        }
        nr_ok++;
    }

    /* Decide whether many devices are supported or not */
    many = (nr_ok == ARRAY_SIZE(fds));

    /* Tear down in reverse order of creation */
    for (k = nr_ok - 1; k >= 0; k--) {
        kvm_set_ioeventfd_pio(fds[k], 0, k, false, 2, true);
        close(fds[k]);
    }
    return many;
#else
    return 0;
#endif
}
1323
1324 static const KVMCapabilityInfo *
1325 kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
1326 {
1327     while (list->name) {
1328         if (!kvm_check_extension(s, list->value)) {
1329             return list;
1330         }
1331         list++;
1332     }
1333     return NULL;
1334 }
1335
1336 void kvm_set_max_memslot_size(hwaddr max_slot_size)
1337 {
1338     g_assert(
1339         ROUND_UP(max_slot_size, qemu_real_host_page_size()) == max_slot_size
1340     );
1341     kvm_max_slot_size = max_slot_size;
1342 }
1343
1344 static void kvm_set_phys_mem(KVMMemoryListener *kml,
1345                              MemoryRegionSection *section, bool add)
1346 {
1347     KVMSlot *mem;
1348     int err;
1349     MemoryRegion *mr = section->mr;
1350     bool writable = !mr->readonly && !mr->rom_device;
1351     hwaddr start_addr, size, slot_size, mr_offset;
1352     ram_addr_t ram_start_offset;
1353     void *ram;
1354
1355     if (!memory_region_is_ram(mr)) {
1356         if (writable || !kvm_readonly_mem_allowed) {
1357             return;
1358         } else if (!mr->romd_mode) {
1359             /* If the memory device is not in romd_mode, then we actually want
1360              * to remove the kvm memory slot so all accesses will trap. */
1361             add = false;
1362         }
1363     }
1364
1365     size = kvm_align_section(section, &start_addr);
1366     if (!size) {
1367         return;
1368     }
1369
1370     /* The offset of the kvmslot within the memory region */
1371     mr_offset = section->offset_within_region + start_addr -
1372         section->offset_within_address_space;
1373
1374     /* use aligned delta to align the ram address and offset */
1375     ram = memory_region_get_ram_ptr(mr) + mr_offset;
1376     ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;
1377
1378     kvm_slots_lock();
1379
1380     if (!add) {
1381         do {
1382             slot_size = MIN(kvm_max_slot_size, size);
1383             mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
1384             if (!mem) {
1385                 goto out;
1386             }
1387             if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1388                 /*
1389                  * NOTE: We should be aware of the fact that here we're only
1390                  * doing a best effort to sync dirty bits.  No matter whether
1391                  * we're using dirty log or dirty ring, we ignored two facts:
1392                  *
1393                  * (1) dirty bits can reside in hardware buffers (PML)
1394                  *
1395                  * (2) after we collected dirty bits here, pages can be dirtied
1396                  * again before we do the final KVM_SET_USER_MEMORY_REGION to
1397                  * remove the slot.
1398                  *
1399                  * Not easy.  Let's cross the fingers until it's fixed.
1400                  */
1401                 if (kvm_state->kvm_dirty_ring_size) {
1402                     kvm_dirty_ring_reap_locked(kvm_state);
1403                 } else {
1404                     kvm_slot_get_dirty_log(kvm_state, mem);
1405                 }
1406                 kvm_slot_sync_dirty_pages(mem);
1407             }
1408
1409             /* unregister the slot */
1410             g_free(mem->dirty_bmap);
1411             mem->dirty_bmap = NULL;
1412             mem->memory_size = 0;
1413             mem->flags = 0;
1414             err = kvm_set_user_memory_region(kml, mem, false);
1415             if (err) {
1416                 fprintf(stderr, "%s: error unregistering slot: %s\n",
1417                         __func__, strerror(-err));
1418                 abort();
1419             }
1420             start_addr += slot_size;
1421             size -= slot_size;
1422         } while (size);
1423         goto out;
1424     }
1425
1426     /* register the new slot */
1427     do {
1428         slot_size = MIN(kvm_max_slot_size, size);
1429         mem = kvm_alloc_slot(kml);
1430         mem->as_id = kml->as_id;
1431         mem->memory_size = slot_size;
1432         mem->start_addr = start_addr;
1433         mem->ram_start_offset = ram_start_offset;
1434         mem->ram = ram;
1435         mem->flags = kvm_mem_flags(mr);
1436         kvm_slot_init_dirty_bitmap(mem);
1437         err = kvm_set_user_memory_region(kml, mem, true);
1438         if (err) {
1439             fprintf(stderr, "%s: error registering slot: %s\n", __func__,
1440                     strerror(-err));
1441             abort();
1442         }
1443         start_addr += slot_size;
1444         ram_start_offset += slot_size;
1445         ram += slot_size;
1446         size -= slot_size;
1447     } while (size);
1448
1449 out:
1450     kvm_slots_unlock();
1451 }
1452
1453 static void *kvm_dirty_ring_reaper_thread(void *data)
1454 {
1455     KVMState *s = data;
1456     struct KVMDirtyRingReaper *r = &s->reaper;
1457
1458     rcu_register_thread();
1459
1460     trace_kvm_dirty_ring_reaper("init");
1461
1462     while (true) {
1463         r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
1464         trace_kvm_dirty_ring_reaper("wait");
1465         /*
1466          * TODO: provide a smarter timeout rather than a constant?
1467          */
1468         sleep(1);
1469
1470         trace_kvm_dirty_ring_reaper("wakeup");
1471         r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
1472
1473         qemu_mutex_lock_iothread();
1474         kvm_dirty_ring_reap(s);
1475         qemu_mutex_unlock_iothread();
1476
1477         r->reaper_iteration++;
1478     }
1479
1480     trace_kvm_dirty_ring_reaper("exit");
1481
1482     rcu_unregister_thread();
1483
1484     return NULL;
1485 }
1486
1487 static int kvm_dirty_ring_reaper_init(KVMState *s)
1488 {
1489     struct KVMDirtyRingReaper *r = &s->reaper;
1490
1491     qemu_thread_create(&r->reaper_thr, "kvm-reaper",
1492                        kvm_dirty_ring_reaper_thread,
1493                        s, QEMU_THREAD_JOINABLE);
1494
1495     return 0;
1496 }
1497
1498 static void kvm_region_add(MemoryListener *listener,
1499                            MemoryRegionSection *section)
1500 {
1501     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1502
1503     memory_region_ref(section->mr);
1504     kvm_set_phys_mem(kml, section, true);
1505 }
1506
1507 static void kvm_region_del(MemoryListener *listener,
1508                            MemoryRegionSection *section)
1509 {
1510     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1511
1512     kvm_set_phys_mem(kml, section, false);
1513     memory_region_unref(section->mr);
1514 }
1515
1516 static void kvm_log_sync(MemoryListener *listener,
1517                          MemoryRegionSection *section)
1518 {
1519     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1520
1521     kvm_slots_lock();
1522     kvm_physical_sync_dirty_bitmap(kml, section);
1523     kvm_slots_unlock();
1524 }
1525
1526 static void kvm_log_sync_global(MemoryListener *l)
1527 {
1528     KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
1529     KVMState *s = kvm_state;
1530     KVMSlot *mem;
1531     int i;
1532
1533     /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
1534     kvm_dirty_ring_flush();
1535
1536     /*
1537      * TODO: make this faster when nr_slots is big while there are
1538      * only a few used slots (small VMs).
1539      */
1540     kvm_slots_lock();
1541     for (i = 0; i < s->nr_slots; i++) {
1542         mem = &kml->slots[i];
1543         if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1544             kvm_slot_sync_dirty_pages(mem);
1545             /*
1546              * This is not needed by KVM_GET_DIRTY_LOG because the
1547              * ioctl will unconditionally overwrite the whole region.
1548              * However kvm dirty ring has no such side effect.
1549              */
1550             kvm_slot_reset_dirty_pages(mem);
1551         }
1552     }
1553     kvm_slots_unlock();
1554 }
1555
1556 static void kvm_log_clear(MemoryListener *listener,
1557                           MemoryRegionSection *section)
1558 {
1559     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1560     int r;
1561
1562     r = kvm_physical_log_clear(kml, section);
1563     if (r < 0) {
1564         error_report_once("%s: kvm log clear failed: mr=%s "
1565                           "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__,
1566                           section->mr->name, section->offset_within_region,
1567                           int128_get64(section->size));
1568         abort();
1569     }
1570 }
1571
1572 static void kvm_mem_ioeventfd_add(MemoryListener *listener,
1573                                   MemoryRegionSection *section,
1574                                   bool match_data, uint64_t data,
1575                                   EventNotifier *e)
1576 {
1577     int fd = event_notifier_get_fd(e);
1578     int r;
1579
1580     r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
1581                                data, true, int128_get64(section->size),
1582                                match_data);
1583     if (r < 0) {
1584         fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1585                 __func__, strerror(-r), -r);
1586         abort();
1587     }
1588 }
1589
1590 static void kvm_mem_ioeventfd_del(MemoryListener *listener,
1591                                   MemoryRegionSection *section,
1592                                   bool match_data, uint64_t data,
1593                                   EventNotifier *e)
1594 {
1595     int fd = event_notifier_get_fd(e);
1596     int r;
1597
1598     r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
1599                                data, false, int128_get64(section->size),
1600                                match_data);
1601     if (r < 0) {
1602         fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1603                 __func__, strerror(-r), -r);
1604         abort();
1605     }
1606 }
1607
1608 static void kvm_io_ioeventfd_add(MemoryListener *listener,
1609                                  MemoryRegionSection *section,
1610                                  bool match_data, uint64_t data,
1611                                  EventNotifier *e)
1612 {
1613     int fd = event_notifier_get_fd(e);
1614     int r;
1615
1616     r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
1617                               data, true, int128_get64(section->size),
1618                               match_data);
1619     if (r < 0) {
1620         fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1621                 __func__, strerror(-r), -r);
1622         abort();
1623     }
1624 }
1625
1626 static void kvm_io_ioeventfd_del(MemoryListener *listener,
1627                                  MemoryRegionSection *section,
1628                                  bool match_data, uint64_t data,
1629                                  EventNotifier *e)
1630
1631 {
1632     int fd = event_notifier_get_fd(e);
1633     int r;
1634
1635     r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
1636                               data, false, int128_get64(section->size),
1637                               match_data);
1638     if (r < 0) {
1639         fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1640                 __func__, strerror(-r), -r);
1641         abort();
1642     }
1643 }
1644
1645 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
1646                                   AddressSpace *as, int as_id, const char *name)
1647 {
1648     int i;
1649
1650     kml->slots = g_new0(KVMSlot, s->nr_slots);
1651     kml->as_id = as_id;
1652
1653     for (i = 0; i < s->nr_slots; i++) {
1654         kml->slots[i].slot = i;
1655     }
1656
1657     kml->listener.region_add = kvm_region_add;
1658     kml->listener.region_del = kvm_region_del;
1659     kml->listener.log_start = kvm_log_start;
1660     kml->listener.log_stop = kvm_log_stop;
1661     kml->listener.priority = 10;
1662     kml->listener.name = name;
1663
1664     if (s->kvm_dirty_ring_size) {
1665         kml->listener.log_sync_global = kvm_log_sync_global;
1666     } else {
1667         kml->listener.log_sync = kvm_log_sync;
1668         kml->listener.log_clear = kvm_log_clear;
1669     }
1670
1671     memory_listener_register(&kml->listener, as);
1672
1673     for (i = 0; i < s->nr_as; ++i) {
1674         if (!s->as[i].as) {
1675             s->as[i].as = as;
1676             s->as[i].ml = kml;
1677             break;
1678         }
1679     }
1680 }
1681
1682 static MemoryListener kvm_io_listener = {
1683     .name = "kvm-io",
1684     .eventfd_add = kvm_io_ioeventfd_add,
1685     .eventfd_del = kvm_io_ioeventfd_del,
1686     .priority = 10,
1687 };
1688
1689 int kvm_set_irq(KVMState *s, int irq, int level)
1690 {
1691     struct kvm_irq_level event;
1692     int ret;
1693
1694     assert(kvm_async_interrupts_enabled());
1695
1696     event.level = level;
1697     event.irq = irq;
1698     ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
1699     if (ret < 0) {
1700         perror("kvm_set_irq");
1701         abort();
1702     }
1703
1704     return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
1705 }
1706
1707 #ifdef KVM_CAP_IRQ_ROUTING
1708 typedef struct KVMMSIRoute {
1709     struct kvm_irq_routing_entry kroute;
1710     QTAILQ_ENTRY(KVMMSIRoute) entry;
1711 } KVMMSIRoute;
1712
1713 static void set_gsi(KVMState *s, unsigned int gsi)
1714 {
1715     set_bit(gsi, s->used_gsi_bitmap);
1716 }
1717
1718 static void clear_gsi(KVMState *s, unsigned int gsi)
1719 {
1720     clear_bit(gsi, s->used_gsi_bitmap);
1721 }
1722
1723 void kvm_init_irq_routing(KVMState *s)
1724 {
1725     int gsi_count, i;
1726
1727     gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
1728     if (gsi_count > 0) {
1729         /* Round up so we can search ints using ffs */
1730         s->used_gsi_bitmap = bitmap_new(gsi_count);
1731         s->gsi_count = gsi_count;
1732     }
1733
1734     s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
1735     s->nr_allocated_irq_routes = 0;
1736
1737     if (!kvm_direct_msi_allowed) {
1738         for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
1739             QTAILQ_INIT(&s->msi_hashtab[i]);
1740         }
1741     }
1742
1743     kvm_arch_init_irq_routing(s);
1744 }
1745
1746 void kvm_irqchip_commit_routes(KVMState *s)
1747 {
1748     int ret;
1749
1750     if (kvm_gsi_direct_mapping()) {
1751         return;
1752     }
1753
1754     if (!kvm_gsi_routing_enabled()) {
1755         return;
1756     }
1757
1758     s->irq_routes->flags = 0;
1759     trace_kvm_irqchip_commit_routes();
1760     ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
1761     assert(ret == 0);
1762 }
1763
1764 static void kvm_add_routing_entry(KVMState *s,
1765                                   struct kvm_irq_routing_entry *entry)
1766 {
1767     struct kvm_irq_routing_entry *new;
1768     int n, size;
1769
1770     if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
1771         n = s->nr_allocated_irq_routes * 2;
1772         if (n < 64) {
1773             n = 64;
1774         }
1775         size = sizeof(struct kvm_irq_routing);
1776         size += n * sizeof(*new);
1777         s->irq_routes = g_realloc(s->irq_routes, size);
1778         s->nr_allocated_irq_routes = n;
1779     }
1780     n = s->irq_routes->nr++;
1781     new = &s->irq_routes->entries[n];
1782
1783     *new = *entry;
1784
1785     set_gsi(s, entry->gsi);
1786 }
1787
1788 static int kvm_update_routing_entry(KVMState *s,
1789                                     struct kvm_irq_routing_entry *new_entry)
1790 {
1791     struct kvm_irq_routing_entry *entry;
1792     int n;
1793
1794     for (n = 0; n < s->irq_routes->nr; n++) {
1795         entry = &s->irq_routes->entries[n];
1796         if (entry->gsi != new_entry->gsi) {
1797             continue;
1798         }
1799
1800         if(!memcmp(entry, new_entry, sizeof *entry)) {
1801             return 0;
1802         }
1803
1804         *entry = *new_entry;
1805
1806         return 0;
1807     }
1808
1809     return -ESRCH;
1810 }
1811
1812 void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
1813 {
1814     struct kvm_irq_routing_entry e = {};
1815
1816     assert(pin < s->gsi_count);
1817
1818     e.gsi = irq;
1819     e.type = KVM_IRQ_ROUTING_IRQCHIP;
1820     e.flags = 0;
1821     e.u.irqchip.irqchip = irqchip;
1822     e.u.irqchip.pin = pin;
1823     kvm_add_routing_entry(s, &e);
1824 }
1825
1826 void kvm_irqchip_release_virq(KVMState *s, int virq)
1827 {
1828     struct kvm_irq_routing_entry *e;
1829     int i;
1830
1831     if (kvm_gsi_direct_mapping()) {
1832         return;
1833     }
1834
1835     for (i = 0; i < s->irq_routes->nr; i++) {
1836         e = &s->irq_routes->entries[i];
1837         if (e->gsi == virq) {
1838             s->irq_routes->nr--;
1839             *e = s->irq_routes->entries[s->irq_routes->nr];
1840         }
1841     }
1842     clear_gsi(s, virq);
1843     kvm_arch_release_virq_post(virq);
1844     trace_kvm_irqchip_release_virq(virq);
1845 }
1846
1847 void kvm_irqchip_add_change_notifier(Notifier *n)
1848 {
1849     notifier_list_add(&kvm_irqchip_change_notifiers, n);
1850 }
1851
1852 void kvm_irqchip_remove_change_notifier(Notifier *n)
1853 {
1854     notifier_remove(n);
1855 }
1856
1857 void kvm_irqchip_change_notify(void)
1858 {
1859     notifier_list_notify(&kvm_irqchip_change_notifiers, NULL);
1860 }
1861
/*
 * Hash an MSI data word into a bucket index: the low 8 bits of @data.
 */
static unsigned int kvm_hash_msi(uint32_t data)
{
    /* This is optimized for IA32 MSI layout. However, no other arch shall
     * repeat the mistake of not providing a direct MSI injection API. */
    const uint32_t low_byte_mask = 0xff;

    return data & low_byte_mask;
}
1868
1869 static void kvm_flush_dynamic_msi_routes(KVMState *s)
1870 {
1871     KVMMSIRoute *route, *next;
1872     unsigned int hash;
1873
1874     for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
1875         QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
1876             kvm_irqchip_release_virq(s, route->kroute.gsi);
1877             QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
1878             g_free(route);
1879         }
1880     }
1881 }
1882
1883 static int kvm_irqchip_get_virq(KVMState *s)
1884 {
1885     int next_virq;
1886
1887     /*
1888      * PIC and IOAPIC share the first 16 GSI numbers, thus the available
1889      * GSI numbers are more than the number of IRQ route. Allocating a GSI
1890      * number can succeed even though a new route entry cannot be added.
1891      * When this happens, flush dynamic MSI entries to free IRQ route entries.
1892      */
1893     if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) {
1894         kvm_flush_dynamic_msi_routes(s);
1895     }
1896
1897     /* Return the lowest unused GSI in the bitmap */
1898     next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
1899     if (next_virq >= s->gsi_count) {
1900         return -ENOSPC;
1901     } else {
1902         return next_virq;
1903     }
1904 }
1905
1906 static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
1907 {
1908     unsigned int hash = kvm_hash_msi(msg.data);
1909     KVMMSIRoute *route;
1910
1911     QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
1912         if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
1913             route->kroute.u.msi.address_hi == (msg.address >> 32) &&
1914             route->kroute.u.msi.data == le32_to_cpu(msg.data)) {
1915             return route;
1916         }
1917     }
1918     return NULL;
1919 }
1920
1921 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
1922 {
1923     struct kvm_msi msi;
1924     KVMMSIRoute *route;
1925
1926     if (kvm_direct_msi_allowed) {
1927         msi.address_lo = (uint32_t)msg.address;
1928         msi.address_hi = msg.address >> 32;
1929         msi.data = le32_to_cpu(msg.data);
1930         msi.flags = 0;
1931         memset(msi.pad, 0, sizeof(msi.pad));
1932
1933         return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
1934     }
1935
1936     route = kvm_lookup_msi_route(s, msg);
1937     if (!route) {
1938         int virq;
1939
1940         virq = kvm_irqchip_get_virq(s);
1941         if (virq < 0) {
1942             return virq;
1943         }
1944
1945         route = g_new0(KVMMSIRoute, 1);
1946         route->kroute.gsi = virq;
1947         route->kroute.type = KVM_IRQ_ROUTING_MSI;
1948         route->kroute.flags = 0;
1949         route->kroute.u.msi.address_lo = (uint32_t)msg.address;
1950         route->kroute.u.msi.address_hi = msg.address >> 32;
1951         route->kroute.u.msi.data = le32_to_cpu(msg.data);
1952
1953         kvm_add_routing_entry(s, &route->kroute);
1954         kvm_irqchip_commit_routes(s);
1955
1956         QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
1957                            entry);
1958     }
1959
1960     assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);
1961
1962     return kvm_set_irq(s, route->kroute.gsi, 1);
1963 }
1964
1965 int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
1966 {
1967     struct kvm_irq_routing_entry kroute = {};
1968     int virq;
1969     KVMState *s = c->s;
1970     MSIMessage msg = {0, 0};
1971
1972     if (pci_available && dev) {
1973         msg = pci_get_msi_message(dev, vector);
1974     }
1975
1976     if (kvm_gsi_direct_mapping()) {
1977         return kvm_arch_msi_data_to_gsi(msg.data);
1978     }
1979
1980     if (!kvm_gsi_routing_enabled()) {
1981         return -ENOSYS;
1982     }
1983
1984     virq = kvm_irqchip_get_virq(s);
1985     if (virq < 0) {
1986         return virq;
1987     }
1988
1989     kroute.gsi = virq;
1990     kroute.type = KVM_IRQ_ROUTING_MSI;
1991     kroute.flags = 0;
1992     kroute.u.msi.address_lo = (uint32_t)msg.address;
1993     kroute.u.msi.address_hi = msg.address >> 32;
1994     kroute.u.msi.data = le32_to_cpu(msg.data);
1995     if (pci_available && kvm_msi_devid_required()) {
1996         kroute.flags = KVM_MSI_VALID_DEVID;
1997         kroute.u.msi.devid = pci_requester_id(dev);
1998     }
1999     if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
2000         kvm_irqchip_release_virq(s, virq);
2001         return -EINVAL;
2002     }
2003
2004     trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
2005                                     vector, virq);
2006
2007     kvm_add_routing_entry(s, &kroute);
2008     kvm_arch_add_msi_route_post(&kroute, vector, dev);
2009     c->changes++;
2010
2011     return virq;
2012 }
2013
2014 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
2015                                  PCIDevice *dev)
2016 {
2017     struct kvm_irq_routing_entry kroute = {};
2018
2019     if (kvm_gsi_direct_mapping()) {
2020         return 0;
2021     }
2022
2023     if (!kvm_irqchip_in_kernel()) {
2024         return -ENOSYS;
2025     }
2026
2027     kroute.gsi = virq;
2028     kroute.type = KVM_IRQ_ROUTING_MSI;
2029     kroute.flags = 0;
2030     kroute.u.msi.address_lo = (uint32_t)msg.address;
2031     kroute.u.msi.address_hi = msg.address >> 32;
2032     kroute.u.msi.data = le32_to_cpu(msg.data);
2033     if (pci_available && kvm_msi_devid_required()) {
2034         kroute.flags = KVM_MSI_VALID_DEVID;
2035         kroute.u.msi.devid = pci_requester_id(dev);
2036     }
2037     if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
2038         return -EINVAL;
2039     }
2040
2041     trace_kvm_irqchip_update_msi_route(virq);
2042
2043     return kvm_update_routing_entry(s, &kroute);
2044 }
2045
2046 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
2047                                     EventNotifier *resample, int virq,
2048                                     bool assign)
2049 {
2050     int fd = event_notifier_get_fd(event);
2051     int rfd = resample ? event_notifier_get_fd(resample) : -1;
2052
2053     struct kvm_irqfd irqfd = {
2054         .fd = fd,
2055         .gsi = virq,
2056         .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
2057     };
2058
2059     if (rfd != -1) {
2060         assert(assign);
2061         if (kvm_irqchip_is_split()) {
2062             /*
2063              * When the slow irqchip (e.g. IOAPIC) is in the
2064              * userspace, KVM kernel resamplefd will not work because
2065              * the EOI of the interrupt will be delivered to userspace
2066              * instead, so the KVM kernel resamplefd kick will be
2067              * skipped.  The userspace here mimics what the kernel
2068              * provides with resamplefd, remember the resamplefd and
2069              * kick it when we receive EOI of this IRQ.
2070              *
2071              * This is hackery because IOAPIC is mostly bypassed
2072              * (except EOI broadcasts) when irqfd is used.  However
2073              * this can bring much performance back for split irqchip
2074              * with INTx IRQs (for VFIO, this gives 93% perf of the
2075              * full fast path, which is 46% perf boost comparing to
2076              * the INTx slow path).
2077              */
2078             kvm_resample_fd_insert(virq, resample);
2079         } else {
2080             irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
2081             irqfd.resamplefd = rfd;
2082         }
2083     } else if (!assign) {
2084         if (kvm_irqchip_is_split()) {
2085             kvm_resample_fd_remove(virq);
2086         }
2087     }
2088
2089     if (!kvm_irqfds_enabled()) {
2090         return -ENOSYS;
2091     }
2092
2093     return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
2094 }
2095
2096 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
2097 {
2098     struct kvm_irq_routing_entry kroute = {};
2099     int virq;
2100
2101     if (!kvm_gsi_routing_enabled()) {
2102         return -ENOSYS;
2103     }
2104
2105     virq = kvm_irqchip_get_virq(s);
2106     if (virq < 0) {
2107         return virq;
2108     }
2109
2110     kroute.gsi = virq;
2111     kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER;
2112     kroute.flags = 0;
2113     kroute.u.adapter.summary_addr = adapter->summary_addr;
2114     kroute.u.adapter.ind_addr = adapter->ind_addr;
2115     kroute.u.adapter.summary_offset = adapter->summary_offset;
2116     kroute.u.adapter.ind_offset = adapter->ind_offset;
2117     kroute.u.adapter.adapter_id = adapter->adapter_id;
2118
2119     kvm_add_routing_entry(s, &kroute);
2120
2121     return virq;
2122 }
2123
2124 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
2125 {
2126     struct kvm_irq_routing_entry kroute = {};
2127     int virq;
2128
2129     if (!kvm_gsi_routing_enabled()) {
2130         return -ENOSYS;
2131     }
2132     if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) {
2133         return -ENOSYS;
2134     }
2135     virq = kvm_irqchip_get_virq(s);
2136     if (virq < 0) {
2137         return virq;
2138     }
2139
2140     kroute.gsi = virq;
2141     kroute.type = KVM_IRQ_ROUTING_HV_SINT;
2142     kroute.flags = 0;
2143     kroute.u.hv_sint.vcpu = vcpu;
2144     kroute.u.hv_sint.sint = sint;
2145
2146     kvm_add_routing_entry(s, &kroute);
2147     kvm_irqchip_commit_routes(s);
2148
2149     return virq;
2150 }
2151
2152 #else /* !KVM_CAP_IRQ_ROUTING */
2153
2154 void kvm_init_irq_routing(KVMState *s)
2155 {
2156 }
2157
2158 void kvm_irqchip_release_virq(KVMState *s, int virq)
2159 {
2160 }
2161
2162 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
2163 {
2164     abort();
2165 }
2166
2167 int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
2168 {
2169     return -ENOSYS;
2170 }
2171
2172 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
2173 {
2174     return -ENOSYS;
2175 }
2176
2177 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
2178 {
2179     return -ENOSYS;
2180 }
2181
2182 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
2183                                     EventNotifier *resample, int virq,
2184                                     bool assign)
2185 {
2186     abort();
2187 }
2188
2189 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
2190 {
2191     return -ENOSYS;
2192 }
2193 #endif /* !KVM_CAP_IRQ_ROUTING */
2194
2195 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
2196                                        EventNotifier *rn, int virq)
2197 {
2198     return kvm_irqchip_assign_irqfd(s, n, rn, virq, true);
2199 }
2200
2201 int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
2202                                           int virq)
2203 {
2204     return kvm_irqchip_assign_irqfd(s, n, NULL, virq, false);
2205 }
2206
2207 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
2208                                    EventNotifier *rn, qemu_irq irq)
2209 {
2210     gpointer key, gsi;
2211     gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
2212
2213     if (!found) {
2214         return -ENXIO;
2215     }
2216     return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
2217 }
2218
2219 int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
2220                                       qemu_irq irq)
2221 {
2222     gpointer key, gsi;
2223     gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
2224
2225     if (!found) {
2226         return -ENXIO;
2227     }
2228     return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
2229 }
2230
2231 void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
2232 {
2233     g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
2234 }
2235
2236 static void kvm_irqchip_create(KVMState *s)
2237 {
2238     int ret;
2239
2240     assert(s->kernel_irqchip_split != ON_OFF_AUTO_AUTO);
2241     if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
2242         ;
2243     } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
2244         ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
2245         if (ret < 0) {
2246             fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
2247             exit(1);
2248         }
2249     } else {
2250         return;
2251     }
2252
2253     /* First probe and see if there's a arch-specific hook to create the
2254      * in-kernel irqchip for us */
2255     ret = kvm_arch_irqchip_create(s);
2256     if (ret == 0) {
2257         if (s->kernel_irqchip_split == ON_OFF_AUTO_ON) {
2258             perror("Split IRQ chip mode not supported.");
2259             exit(1);
2260         } else {
2261             ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
2262         }
2263     }
2264     if (ret < 0) {
2265         fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
2266         exit(1);
2267     }
2268
2269     kvm_kernel_irqchip = true;
2270     /* If we have an in-kernel IRQ chip then we must have asynchronous
2271      * interrupt delivery (though the reverse is not necessarily true)
2272      */
2273     kvm_async_interrupts_allowed = true;
2274     kvm_halt_in_kernel_allowed = true;
2275
2276     kvm_init_irq_routing(s);
2277
2278     s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
2279 }
2280
2281 /* Find number of supported CPUs using the recommended
2282  * procedure from the kernel API documentation to cope with
2283  * older kernels that may be missing capabilities.
2284  */
2285 static int kvm_recommended_vcpus(KVMState *s)
2286 {
2287     int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);
2288     return (ret) ? ret : 4;
2289 }
2290
2291 static int kvm_max_vcpus(KVMState *s)
2292 {
2293     int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
2294     return (ret) ? ret : kvm_recommended_vcpus(s);
2295 }
2296
2297 static int kvm_max_vcpu_id(KVMState *s)
2298 {
2299     int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
2300     return (ret) ? ret : kvm_max_vcpus(s);
2301 }
2302
2303 bool kvm_vcpu_id_is_valid(int vcpu_id)
2304 {
2305     KVMState *s = KVM_STATE(current_accel());
2306     return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
2307 }
2308
2309 bool kvm_dirty_ring_enabled(void)
2310 {
2311     return kvm_state->kvm_dirty_ring_size ? true : false;
2312 }
2313
2314 static void query_stats_cb(StatsResultList **result, StatsTarget target,
2315                            strList *names, strList *targets, Error **errp);
2316 static void query_stats_schemas_cb(StatsSchemaList **result, Error **errp);
2317
2318 static int kvm_init(MachineState *ms)
2319 {
2320     MachineClass *mc = MACHINE_GET_CLASS(ms);
2321     static const char upgrade_note[] =
2322         "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
2323         "(see http://sourceforge.net/projects/kvm).\n";
2324     struct {
2325         const char *name;
2326         int num;
2327     } num_cpus[] = {
2328         { "SMP",          ms->smp.cpus },
2329         { "hotpluggable", ms->smp.max_cpus },
2330         { NULL, }
2331     }, *nc = num_cpus;
2332     int soft_vcpus_limit, hard_vcpus_limit;
2333     KVMState *s;
2334     const KVMCapabilityInfo *missing_cap;
2335     int ret;
2336     int type = 0;
2337     uint64_t dirty_log_manual_caps;
2338
2339     qemu_mutex_init(&kml_slots_lock);
2340
2341     s = KVM_STATE(ms->accelerator);
2342
2343     /*
2344      * On systems where the kernel can support different base page
2345      * sizes, host page size may be different from TARGET_PAGE_SIZE,
2346      * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
2347      * page size for the system though.
2348      */
2349     assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size());
2350
2351     s->sigmask_len = 8;
2352
2353 #ifdef KVM_CAP_SET_GUEST_DEBUG
2354     QTAILQ_INIT(&s->kvm_sw_breakpoints);
2355 #endif
2356     QLIST_INIT(&s->kvm_parked_vcpus);
2357     s->fd = qemu_open_old("/dev/kvm", O_RDWR);
2358     if (s->fd == -1) {
2359         fprintf(stderr, "Could not access KVM kernel module: %m\n");
2360         ret = -errno;
2361         goto err;
2362     }
2363
2364     ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
2365     if (ret < KVM_API_VERSION) {
2366         if (ret >= 0) {
2367             ret = -EINVAL;
2368         }
2369         fprintf(stderr, "kvm version too old\n");
2370         goto err;
2371     }
2372
2373     if (ret > KVM_API_VERSION) {
2374         ret = -EINVAL;
2375         fprintf(stderr, "kvm version not supported\n");
2376         goto err;
2377     }
2378
2379     kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
2380     s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
2381
2382     /* If unspecified, use the default value */
2383     if (!s->nr_slots) {
2384         s->nr_slots = 32;
2385     }
2386
2387     s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE);
2388     if (s->nr_as <= 1) {
2389         s->nr_as = 1;
2390     }
2391     s->as = g_new0(struct KVMAs, s->nr_as);
2392
2393     if (object_property_find(OBJECT(current_machine), "kvm-type")) {
2394         g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine),
2395                                                             "kvm-type",
2396                                                             &error_abort);
2397         type = mc->kvm_type(ms, kvm_type);
2398     } else if (mc->kvm_type) {
2399         type = mc->kvm_type(ms, NULL);
2400     }
2401
2402     do {
2403         ret = kvm_ioctl(s, KVM_CREATE_VM, type);
2404     } while (ret == -EINTR);
2405
2406     if (ret < 0) {
2407         fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret,
2408                 strerror(-ret));
2409
2410 #ifdef TARGET_S390X
2411         if (ret == -EINVAL) {
2412             fprintf(stderr,
2413                     "Host kernel setup problem detected. Please verify:\n");
2414             fprintf(stderr, "- for kernels supporting the switch_amode or"
2415                     " user_mode parameters, whether\n");
2416             fprintf(stderr,
2417                     "  user space is running in primary address space\n");
2418             fprintf(stderr,
2419                     "- for kernels supporting the vm.allocate_pgste sysctl, "
2420                     "whether it is enabled\n");
2421         }
2422 #elif defined(TARGET_PPC)
2423         if (ret == -EINVAL) {
2424             fprintf(stderr,
2425                     "PPC KVM module is not loaded. Try modprobe kvm_%s.\n",
2426                     (type == 2) ? "pr" : "hv");
2427         }
2428 #endif
2429         goto err;
2430     }
2431
2432     s->vmfd = ret;
2433
2434     /* check the vcpu limits */
2435     soft_vcpus_limit = kvm_recommended_vcpus(s);
2436     hard_vcpus_limit = kvm_max_vcpus(s);
2437
2438     while (nc->name) {
2439         if (nc->num > soft_vcpus_limit) {
2440             warn_report("Number of %s cpus requested (%d) exceeds "
2441                         "the recommended cpus supported by KVM (%d)",
2442                         nc->name, nc->num, soft_vcpus_limit);
2443
2444             if (nc->num > hard_vcpus_limit) {
2445                 fprintf(stderr, "Number of %s cpus requested (%d) exceeds "
2446                         "the maximum cpus supported by KVM (%d)\n",
2447                         nc->name, nc->num, hard_vcpus_limit);
2448                 exit(1);
2449             }
2450         }
2451         nc++;
2452     }
2453
2454     missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
2455     if (!missing_cap) {
2456         missing_cap =
2457             kvm_check_extension_list(s, kvm_arch_required_capabilities);
2458     }
2459     if (missing_cap) {
2460         ret = -EINVAL;
2461         fprintf(stderr, "kvm does not support %s\n%s",
2462                 missing_cap->name, upgrade_note);
2463         goto err;
2464     }
2465
2466     s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
2467     s->coalesced_pio = s->coalesced_mmio &&
2468                        kvm_check_extension(s, KVM_CAP_COALESCED_PIO);
2469
2470     /*
2471      * Enable KVM dirty ring if supported, otherwise fall back to
2472      * dirty logging mode
2473      */
2474     if (s->kvm_dirty_ring_size > 0) {
2475         uint64_t ring_bytes;
2476
2477         ring_bytes = s->kvm_dirty_ring_size * sizeof(struct kvm_dirty_gfn);
2478
2479         /* Read the max supported pages */
2480         ret = kvm_vm_check_extension(s, KVM_CAP_DIRTY_LOG_RING);
2481         if (ret > 0) {
2482             if (ring_bytes > ret) {
2483                 error_report("KVM dirty ring size %" PRIu32 " too big "
2484                              "(maximum is %ld).  Please use a smaller value.",
2485                              s->kvm_dirty_ring_size,
2486                              (long)ret / sizeof(struct kvm_dirty_gfn));
2487                 ret = -EINVAL;
2488                 goto err;
2489             }
2490
2491             ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING, 0, ring_bytes);
2492             if (ret) {
2493                 error_report("Enabling of KVM dirty ring failed: %s. "
2494                              "Suggested minimum value is 1024.", strerror(-ret));
2495                 goto err;
2496             }
2497
2498             s->kvm_dirty_ring_bytes = ring_bytes;
2499          } else {
2500              warn_report("KVM dirty ring not available, using bitmap method");
2501              s->kvm_dirty_ring_size = 0;
2502         }
2503     }
2504
2505     /*
2506      * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
2507      * enabled.  More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
2508      * page is wr-protected initially, which is against how kvm dirty ring is
2509      * usage - kvm dirty ring requires all pages are wr-protected at the very
2510      * beginning.  Enabling this feature for dirty ring causes data corruption.
2511      *
2512      * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log,
2513      * we may expect a higher stall time when starting the migration.  In the
2514      * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too:
2515      * instead of clearing dirty bit, it can be a way to explicitly wr-protect
2516      * guest pages.
2517      */
2518     if (!s->kvm_dirty_ring_size) {
2519         dirty_log_manual_caps =
2520             kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
2521         dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
2522                                   KVM_DIRTY_LOG_INITIALLY_SET);
2523         s->manual_dirty_log_protect = dirty_log_manual_caps;
2524         if (dirty_log_manual_caps) {
2525             ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
2526                                     dirty_log_manual_caps);
2527             if (ret) {
2528                 warn_report("Trying to enable capability %"PRIu64" of "
2529                             "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. "
2530                             "Falling back to the legacy mode. ",
2531                             dirty_log_manual_caps);
2532                 s->manual_dirty_log_protect = 0;
2533             }
2534         }
2535     }
2536
2537 #ifdef KVM_CAP_VCPU_EVENTS
2538     s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
2539 #endif
2540
2541     s->robust_singlestep =
2542         kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
2543
2544 #ifdef KVM_CAP_DEBUGREGS
2545     s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
2546 #endif
2547
2548     s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);
2549
2550 #ifdef KVM_CAP_IRQ_ROUTING
2551     kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
2552 #endif
2553
2554     s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);
2555
2556     s->irq_set_ioctl = KVM_IRQ_LINE;
2557     if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
2558         s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
2559     }
2560
2561     kvm_readonly_mem_allowed =
2562         (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
2563
2564     kvm_eventfds_allowed =
2565         (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0);
2566
2567     kvm_irqfds_allowed =
2568         (kvm_check_extension(s, KVM_CAP_IRQFD) > 0);
2569
2570     kvm_resamplefds_allowed =
2571         (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);
2572
2573     kvm_vm_attributes_allowed =
2574         (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);
2575
2576     kvm_ioeventfd_any_length_allowed =
2577         (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0);
2578
2579 #ifdef KVM_CAP_SET_GUEST_DEBUG
2580     kvm_has_guest_debug =
2581         (kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG) > 0);
2582 #endif
2583
2584     kvm_sstep_flags = 0;
2585     if (kvm_has_guest_debug) {
2586         kvm_sstep_flags = SSTEP_ENABLE;
2587
2588 #if defined KVM_CAP_SET_GUEST_DEBUG2
2589         int guest_debug_flags =
2590             kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG2);
2591
2592         if (guest_debug_flags & KVM_GUESTDBG_BLOCKIRQ) {
2593             kvm_sstep_flags |= SSTEP_NOIRQ;
2594         }
2595 #endif
2596     }
2597
2598     kvm_state = s;
2599
2600     ret = kvm_arch_init(ms, s);
2601     if (ret < 0) {
2602         goto err;
2603     }
2604
2605     if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
2606         s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
2607     }
2608
2609     qemu_register_reset(kvm_unpoison_all, NULL);
2610
2611     if (s->kernel_irqchip_allowed) {
2612         kvm_irqchip_create(s);
2613     }
2614
2615     if (kvm_eventfds_allowed) {
2616         s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
2617         s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
2618     }
2619     s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region;
2620     s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region;
2621
2622     kvm_memory_listener_register(s, &s->memory_listener,
2623                                  &address_space_memory, 0, "kvm-memory");
2624     if (kvm_eventfds_allowed) {
2625         memory_listener_register(&kvm_io_listener,
2626                                  &address_space_io);
2627     }
2628     memory_listener_register(&kvm_coalesced_pio_listener,
2629                              &address_space_io);
2630
2631     s->many_ioeventfds = kvm_check_many_ioeventfds();
2632
2633     s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
2634     if (!s->sync_mmu) {
2635         ret = ram_block_discard_disable(true);
2636         assert(!ret);
2637     }
2638
2639     if (s->kvm_dirty_ring_size) {
2640         ret = kvm_dirty_ring_reaper_init(s);
2641         if (ret) {
2642             goto err;
2643         }
2644     }
2645
2646     if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) {
2647         add_stats_callbacks(STATS_PROVIDER_KVM, query_stats_cb,
2648                             query_stats_schemas_cb);
2649     }
2650
2651     return 0;
2652
2653 err:
2654     assert(ret < 0);
2655     if (s->vmfd >= 0) {
2656         close(s->vmfd);
2657     }
2658     if (s->fd != -1) {
2659         close(s->fd);
2660     }
2661     g_free(s->memory_listener.slots);
2662
2663     return ret;
2664 }
2665
2666 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
2667 {
2668     s->sigmask_len = sigmask_len;
2669 }
2670
2671 static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
2672                           int size, uint32_t count)
2673 {
2674     int i;
2675     uint8_t *ptr = data;
2676
2677     for (i = 0; i < count; i++) {
2678         address_space_rw(&address_space_io, port, attrs,
2679                          ptr, size,
2680                          direction == KVM_EXIT_IO_OUT);
2681         ptr += size;
2682     }
2683 }
2684
2685 static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
2686 {
2687     fprintf(stderr, "KVM internal error. Suberror: %d\n",
2688             run->internal.suberror);
2689
2690     if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
2691         int i;
2692
2693         for (i = 0; i < run->internal.ndata; ++i) {
2694             fprintf(stderr, "extra data[%d]: 0x%016"PRIx64"\n",
2695                     i, (uint64_t)run->internal.data[i]);
2696         }
2697     }
2698     if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
2699         fprintf(stderr, "emulation failure\n");
2700         if (!kvm_arch_stop_on_emulation_error(cpu)) {
2701             cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2702             return EXCP_INTERRUPT;
2703         }
2704     }
2705     /* FIXME: Should trigger a qmp message to let management know
2706      * something went wrong.
2707      */
2708     return -1;
2709 }
2710
2711 void kvm_flush_coalesced_mmio_buffer(void)
2712 {
2713     KVMState *s = kvm_state;
2714
2715     if (s->coalesced_flush_in_progress) {
2716         return;
2717     }
2718
2719     s->coalesced_flush_in_progress = true;
2720
2721     if (s->coalesced_mmio_ring) {
2722         struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
2723         while (ring->first != ring->last) {
2724             struct kvm_coalesced_mmio *ent;
2725
2726             ent = &ring->coalesced_mmio[ring->first];
2727
2728             if (ent->pio == 1) {
2729                 address_space_write(&address_space_io, ent->phys_addr,
2730                                     MEMTXATTRS_UNSPECIFIED, ent->data,
2731                                     ent->len);
2732             } else {
2733                 cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
2734             }
2735             smp_wmb();
2736             ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
2737         }
2738     }
2739
2740     s->coalesced_flush_in_progress = false;
2741 }
2742
/* Forward the question to the architecture-specific implementation. */
bool kvm_cpu_check_are_resettable(void)
{
    bool resettable = kvm_arch_cpu_check_are_resettable();

    return resettable;
}
2747
2748 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2749 {
2750     if (!cpu->vcpu_dirty) {
2751         kvm_arch_get_registers(cpu);
2752         cpu->vcpu_dirty = true;
2753     }
2754 }
2755
2756 void kvm_cpu_synchronize_state(CPUState *cpu)
2757 {
2758     if (!cpu->vcpu_dirty) {
2759         run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
2760     }
2761 }
2762
2763 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
2764 {
2765     kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
2766     cpu->vcpu_dirty = false;
2767 }
2768
2769 void kvm_cpu_synchronize_post_reset(CPUState *cpu)
2770 {
2771     run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
2772 }
2773
2774 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
2775 {
2776     kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
2777     cpu->vcpu_dirty = false;
2778 }
2779
2780 void kvm_cpu_synchronize_post_init(CPUState *cpu)
2781 {
2782     run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
2783 }
2784
2785 static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
2786 {
2787     cpu->vcpu_dirty = true;
2788 }
2789
2790 void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
2791 {
2792     run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
2793 }
2794
2795 #ifdef KVM_HAVE_MCE_INJECTION
2796 static __thread void *pending_sigbus_addr;
2797 static __thread int pending_sigbus_code;
2798 static __thread bool have_sigbus_pending;
2799 #endif
2800
2801 static void kvm_cpu_kick(CPUState *cpu)
2802 {
2803     qatomic_set(&cpu->kvm_run->immediate_exit, 1);
2804 }
2805
2806 static void kvm_cpu_kick_self(void)
2807 {
2808     if (kvm_immediate_exit) {
2809         kvm_cpu_kick(current_cpu);
2810     } else {
2811         qemu_cpu_kick_self();
2812     }
2813 }
2814
2815 static void kvm_eat_signals(CPUState *cpu)
2816 {
2817     struct timespec ts = { 0, 0 };
2818     siginfo_t siginfo;
2819     sigset_t waitset;
2820     sigset_t chkset;
2821     int r;
2822
2823     if (kvm_immediate_exit) {
2824         qatomic_set(&cpu->kvm_run->immediate_exit, 0);
2825         /* Write kvm_run->immediate_exit before the cpu->exit_request
2826          * write in kvm_cpu_exec.
2827          */
2828         smp_wmb();
2829         return;
2830     }
2831
2832     sigemptyset(&waitset);
2833     sigaddset(&waitset, SIG_IPI);
2834
2835     do {
2836         r = sigtimedwait(&waitset, &siginfo, &ts);
2837         if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
2838             perror("sigtimedwait");
2839             exit(1);
2840         }
2841
2842         r = sigpending(&chkset);
2843         if (r == -1) {
2844             perror("sigpending");
2845             exit(1);
2846         }
2847     } while (sigismember(&chkset, SIG_IPI));
2848 }
2849
/*
 * Run @cpu inside KVM until an exit occurs that has to be handled by the
 * caller.
 *
 * The iothread lock is released before the run loop and re-acquired before
 * returning; handlers that need it take it explicitly.  The loop repeats
 * KVM_RUN for as long as the exit handler yields 0.
 *
 * Returns EXCP_HLT if kvm_arch_process_async_events() reports pending work,
 * EXCP_INTERRUPT for interrupted runs and several exit reasons, a negative
 * value on failure (after dumping CPU state and stopping the VM with
 * RUN_STATE_INTERNAL_ERROR), or whatever non-zero value the arch-specific /
 * internal-error handler returned.  cpu->exit_request is cleared on return.
 */
int kvm_cpu_exec(CPUState *cpu)
{
    struct kvm_run *run = cpu->kvm_run;
    int ret, run_ret;

    DPRINTF("kvm_cpu_exec()\n");

    if (kvm_arch_process_async_events(cpu)) {
        qatomic_set(&cpu->exit_request, 0);
        return EXCP_HLT;
    }

    qemu_mutex_unlock_iothread();
    cpu_exec_start(cpu);

    do {
        MemTxAttrs attrs;

        /* Push register state to KVM if it was modified on the QEMU side */
        if (cpu->vcpu_dirty) {
            kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
            cpu->vcpu_dirty = false;
        }

        kvm_arch_pre_run(cpu, run);
        if (qatomic_read(&cpu->exit_request)) {
            DPRINTF("interrupt exit requested\n");
            /*
             * KVM requires us to reenter the kernel after IO exits to complete
             * instruction emulation. This self-signal will ensure that we
             * leave ASAP again.
             */
            kvm_cpu_kick_self();
        }

        /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
         * Matching barrier in kvm_eat_signals.
         */
        smp_rmb();

        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);

        attrs = kvm_arch_post_run(cpu, run);

#ifdef KVM_HAVE_MCE_INJECTION
        /* Deliver a SIGBUS recorded by kvm_on_sigbus_vcpu(), under the BQL */
        if (unlikely(have_sigbus_pending)) {
            qemu_mutex_lock_iothread();
            kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
                                    pending_sigbus_addr);
            have_sigbus_pending = false;
            qemu_mutex_unlock_iothread();
        }
#endif

        if (run_ret < 0) {
            /* An interrupted KVM_RUN is not an error: consume the kick */
            if (run_ret == -EINTR || run_ret == -EAGAIN) {
                DPRINTF("io window exit\n");
                kvm_eat_signals(cpu);
                ret = EXCP_INTERRUPT;
                break;
            }
            fprintf(stderr, "error: kvm run failed %s\n",
                    strerror(-run_ret));
#ifdef TARGET_PPC
            if (run_ret == -EBUSY) {
                fprintf(stderr,
                        "This is probably because your SMT is enabled.\n"
                        "VCPU can only run on primary threads with all "
                        "secondary threads offline.\n");
            }
#endif
            ret = -1;
            break;
        }

        trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
        /* ret == 0 re-enters the guest; anything else leaves the loop */
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            /* Called outside BQL */
            kvm_handle_io(run->io.port, attrs,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            /* Called outside BQL */
            address_space_rw(&address_space_memory,
                             run->mmio.phys_addr, attrs,
                             run->mmio.data,
                             run->mmio.len,
                             run->mmio.is_write);
            ret = 0;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(cpu, run);
            break;
        case KVM_EXIT_DIRTY_RING_FULL:
            /*
             * We shouldn't continue if the dirty ring of this vcpu is
             * still full.  Got kicked by KVM_RESET_DIRTY_RINGS.
             */
            trace_kvm_dirty_ring_full(cpu->cpu_index);
            qemu_mutex_lock_iothread();
            kvm_dirty_ring_reap(kvm_state);
            qemu_mutex_unlock_iothread();
            ret = 0;
            break;
        case KVM_EXIT_SYSTEM_EVENT:
            switch (run->system_event.type) {
            case KVM_SYSTEM_EVENT_SHUTDOWN:
                qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
                ret = EXCP_INTERRUPT;
                break;
            case KVM_SYSTEM_EVENT_RESET:
                qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
                ret = EXCP_INTERRUPT;
                break;
            case KVM_SYSTEM_EVENT_CRASH:
                kvm_cpu_synchronize_state(cpu);
                qemu_mutex_lock_iothread();
                qemu_system_guest_panicked(cpu_get_crash_info(cpu));
                qemu_mutex_unlock_iothread();
                ret = 0;
                break;
            default:
                /* Unrecognised system events go to the arch-specific code */
                DPRINTF("kvm_arch_handle_exit\n");
                ret = kvm_arch_handle_exit(cpu, run);
                break;
            }
            break;
        default:
            /* Every other exit reason is left to the arch-specific code */
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(cpu, run);
            break;
        }
    } while (ret == 0);

    cpu_exec_end(cpu);
    qemu_mutex_lock_iothread();

    /* A negative result is fatal for the VM: dump state and stop it */
    if (ret < 0) {
        cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
        vm_stop(RUN_STATE_INTERNAL_ERROR);
    }

    qatomic_set(&cpu->exit_request, 0);
    return ret;
}
3015
3016 int kvm_ioctl(KVMState *s, int type, ...)
3017 {
3018     int ret;
3019     void *arg;
3020     va_list ap;
3021
3022     va_start(ap, type);
3023     arg = va_arg(ap, void *);
3024     va_end(ap);
3025
3026     trace_kvm_ioctl(type, arg);
3027     ret = ioctl(s->fd, type, arg);
3028     if (ret == -1) {
3029         ret = -errno;
3030     }
3031     return ret;
3032 }
3033
3034 int kvm_vm_ioctl(KVMState *s, int type, ...)
3035 {
3036     int ret;
3037     void *arg;
3038     va_list ap;
3039
3040     va_start(ap, type);
3041     arg = va_arg(ap, void *);
3042     va_end(ap);
3043
3044     trace_kvm_vm_ioctl(type, arg);
3045     ret = ioctl(s->vmfd, type, arg);
3046     if (ret == -1) {
3047         ret = -errno;
3048     }
3049     return ret;
3050 }
3051
3052 int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
3053 {
3054     int ret;
3055     void *arg;
3056     va_list ap;
3057
3058     va_start(ap, type);
3059     arg = va_arg(ap, void *);
3060     va_end(ap);
3061
3062     trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
3063     ret = ioctl(cpu->kvm_fd, type, arg);
3064     if (ret == -1) {
3065         ret = -errno;
3066     }
3067     return ret;
3068 }
3069
/*
 * Issue ioctl request @type on the device file descriptor @fd.
 *
 * Exactly one pointer-sized variadic argument is consumed and forwarded to
 * ioctl().  Returns the ioctl() result, or -errno when ioctl() returned -1.
 */
int kvm_device_ioctl(int fd, int type, ...)
{
    va_list args;
    void *param;
    int rc;

    va_start(args, type);
    param = va_arg(args, void *);
    va_end(args);

    trace_kvm_device_ioctl(fd, type, param);
    rc = ioctl(fd, type, param);
    return rc == -1 ? -errno : rc;
}
3087
3088 int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
3089 {
3090     int ret;
3091     struct kvm_device_attr attribute = {
3092         .group = group,
3093         .attr = attr,
3094     };
3095
3096     if (!kvm_vm_attributes_allowed) {
3097         return 0;
3098     }
3099
3100     ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
3101     /* kvm returns 0 on success for HAS_DEVICE_ATTR */
3102     return ret ? 0 : 1;
3103 }
3104
3105 int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
3106 {
3107     struct kvm_device_attr attribute = {
3108         .group = group,
3109         .attr = attr,
3110         .flags = 0,
3111     };
3112
3113     return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
3114 }
3115
3116 int kvm_device_access(int fd, int group, uint64_t attr,
3117                       void *val, bool write, Error **errp)
3118 {
3119     struct kvm_device_attr kvmattr;
3120     int err;
3121
3122     kvmattr.flags = 0;
3123     kvmattr.group = group;
3124     kvmattr.attr = attr;
3125     kvmattr.addr = (uintptr_t)val;
3126
3127     err = kvm_device_ioctl(fd,
3128                            write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
3129                            &kvmattr);
3130     if (err < 0) {
3131         error_setg_errno(errp, -err,
3132                          "KVM_%s_DEVICE_ATTR failed: Group %d "
3133                          "attr 0x%016" PRIx64,
3134                          write ? "SET" : "GET", group, attr);
3135     }
3136     return err;
3137 }
3138
3139 bool kvm_has_sync_mmu(void)
3140 {
3141     return kvm_state->sync_mmu;
3142 }
3143
3144 int kvm_has_vcpu_events(void)
3145 {
3146     return kvm_state->vcpu_events;
3147 }
3148
3149 int kvm_has_robust_singlestep(void)
3150 {
3151     return kvm_state->robust_singlestep;
3152 }
3153
3154 int kvm_has_debugregs(void)
3155 {
3156     return kvm_state->debugregs;
3157 }
3158
3159 int kvm_max_nested_state_length(void)
3160 {
3161     return kvm_state->max_nested_state_len;
3162 }
3163
3164 int kvm_has_many_ioeventfds(void)
3165 {
3166     if (!kvm_enabled()) {
3167         return 0;
3168     }
3169     return kvm_state->many_ioeventfds;
3170 }
3171
/*
 * Report the result of checking KVM_CAP_IRQ_ROUTING.  Always false when
 * the headers do not define that capability.
 */
int kvm_has_gsi_routing(void)
{
    int supported = false;

#ifdef KVM_CAP_IRQ_ROUTING
    supported = kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#endif
    return supported;
}
3180
3181 int kvm_has_intx_set_mask(void)
3182 {
3183     return kvm_state->intx_set_mask;
3184 }
3185
3186 bool kvm_arm_supports_user_irq(void)
3187 {
3188     return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
3189 }
3190
3191 #ifdef KVM_CAP_SET_GUEST_DEBUG
3192 struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
3193                                                  target_ulong pc)
3194 {
3195     struct kvm_sw_breakpoint *bp;
3196
3197     QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
3198         if (bp->pc == pc) {
3199             return bp;
3200         }
3201     }
3202     return NULL;
3203 }
3204
3205 int kvm_sw_breakpoints_active(CPUState *cpu)
3206 {
3207     return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
3208 }
3209
3210 struct kvm_set_guest_debug_data {
3211     struct kvm_guest_debug dbg;
3212     int err;
3213 };
3214
3215 static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
3216 {
3217     struct kvm_set_guest_debug_data *dbg_data =
3218         (struct kvm_set_guest_debug_data *) data.host_ptr;
3219
3220     dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
3221                                    &dbg_data->dbg);
3222 }
3223
3224 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
3225 {
3226     struct kvm_set_guest_debug_data data;
3227
3228     data.dbg.control = reinject_trap;
3229
3230     if (cpu->singlestep_enabled) {
3231         data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
3232
3233         if (cpu->singlestep_enabled & SSTEP_NOIRQ) {
3234             data.dbg.control |= KVM_GUESTDBG_BLOCKIRQ;
3235         }
3236     }
3237     kvm_arch_update_guest_debug(cpu, &data.dbg);
3238
3239     run_on_cpu(cpu, kvm_invoke_set_guest_debug,
3240                RUN_ON_CPU_HOST_PTR(&data));
3241     return data.err;
3242 }
3243
3244 int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
3245                           target_ulong len, int type)
3246 {
3247     struct kvm_sw_breakpoint *bp;
3248     int err;
3249
3250     if (type == GDB_BREAKPOINT_SW) {
3251         bp = kvm_find_sw_breakpoint(cpu, addr);
3252         if (bp) {
3253             bp->use_count++;
3254             return 0;
3255         }
3256
3257         bp = g_new(struct kvm_sw_breakpoint, 1);
3258         bp->pc = addr;
3259         bp->use_count = 1;
3260         err = kvm_arch_insert_sw_breakpoint(cpu, bp);
3261         if (err) {
3262             g_free(bp);
3263             return err;
3264         }
3265
3266         QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
3267     } else {
3268         err = kvm_arch_insert_hw_breakpoint(addr, len, type);
3269         if (err) {
3270             return err;
3271         }
3272     }
3273
3274     CPU_FOREACH(cpu) {
3275         err = kvm_update_guest_debug(cpu, 0);
3276         if (err) {
3277             return err;
3278         }
3279     }
3280     return 0;
3281 }
3282
3283 int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
3284                           target_ulong len, int type)
3285 {
3286     struct kvm_sw_breakpoint *bp;
3287     int err;
3288
3289     if (type == GDB_BREAKPOINT_SW) {
3290         bp = kvm_find_sw_breakpoint(cpu, addr);
3291         if (!bp) {
3292             return -ENOENT;
3293         }
3294
3295         if (bp->use_count > 1) {
3296             bp->use_count--;
3297             return 0;
3298         }
3299
3300         err = kvm_arch_remove_sw_breakpoint(cpu, bp);
3301         if (err) {
3302             return err;
3303         }
3304
3305         QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
3306         g_free(bp);
3307     } else {
3308         err = kvm_arch_remove_hw_breakpoint(addr, len, type);
3309         if (err) {
3310             return err;
3311         }
3312     }
3313
3314     CPU_FOREACH(cpu) {
3315         err = kvm_update_guest_debug(cpu, 0);
3316         if (err) {
3317             return err;
3318         }
3319     }
3320     return 0;
3321 }
3322
3323 void kvm_remove_all_breakpoints(CPUState *cpu)
3324 {
3325     struct kvm_sw_breakpoint *bp, *next;
3326     KVMState *s = cpu->kvm_state;
3327     CPUState *tmpcpu;
3328
3329     QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
3330         if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
3331             /* Try harder to find a CPU that currently sees the breakpoint. */
3332             CPU_FOREACH(tmpcpu) {
3333                 if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
3334                     break;
3335                 }
3336             }
3337         }
3338         QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
3339         g_free(bp);
3340     }
3341     kvm_arch_remove_all_hw_breakpoints();
3342
3343     CPU_FOREACH(cpu) {
3344         kvm_update_guest_debug(cpu, 0);
3345     }
3346 }
3347
3348 #else /* !KVM_CAP_SET_GUEST_DEBUG */
3349
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: guest debugging
 * is unavailable, so always fail with -EINVAL.
 */
int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    return -EINVAL;
}
3354
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: breakpoints
 * cannot be inserted, so always fail with -EINVAL.
 */
int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}
3360
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: there are no
 * breakpoints to remove, so always fail with -EINVAL.
 */
int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}
3366
/*
 * Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: nothing to do.
 */
void kvm_remove_all_breakpoints(CPUState *cpu)
{
}
3370 #endif /* !KVM_CAP_SET_GUEST_DEBUG */
3371
3372 static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
3373 {
3374     KVMState *s = kvm_state;
3375     struct kvm_signal_mask *sigmask;
3376     int r;
3377
3378     sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));
3379
3380     sigmask->len = s->sigmask_len;
3381     memcpy(sigmask->sigset, sigset, sizeof(*sigset));
3382     r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
3383     g_free(sigmask);
3384
3385     return r;
3386 }
3387
3388 static void kvm_ipi_signal(int sig)
3389 {
3390     if (current_cpu) {
3391         assert(kvm_immediate_exit);
3392         kvm_cpu_kick(current_cpu);
3393     }
3394 }
3395
3396 void kvm_init_cpu_signals(CPUState *cpu)
3397 {
3398     int r;
3399     sigset_t set;
3400     struct sigaction sigact;
3401
3402     memset(&sigact, 0, sizeof(sigact));
3403     sigact.sa_handler = kvm_ipi_signal;
3404     sigaction(SIG_IPI, &sigact, NULL);
3405
3406     pthread_sigmask(SIG_BLOCK, NULL, &set);
3407 #if defined KVM_HAVE_MCE_INJECTION
3408     sigdelset(&set, SIGBUS);
3409     pthread_sigmask(SIG_SETMASK, &set, NULL);
3410 #endif
3411     sigdelset(&set, SIG_IPI);
3412     if (kvm_immediate_exit) {
3413         r = pthread_sigmask(SIG_SETMASK, &set, NULL);
3414     } else {
3415         r = kvm_set_signal_mask(cpu, &set);
3416     }
3417     if (r) {
3418         fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
3419         exit(1);
3420     }
3421 }
3422
3423 /* Called asynchronously in VCPU thread.  */
3424 int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
3425 {
3426 #ifdef KVM_HAVE_MCE_INJECTION
3427     if (have_sigbus_pending) {
3428         return 1;
3429     }
3430     have_sigbus_pending = true;
3431     pending_sigbus_addr = addr;
3432     pending_sigbus_code = code;
3433     qatomic_set(&cpu->exit_request, 1);
3434     return 0;
3435 #else
3436     return 1;
3437 #endif
3438 }
3439
/*
 * Called synchronously (via signalfd) in main thread.
 *
 * Forwards the SIGBUS (@code, @addr) to the arch handler on first_cpu and
 * returns 0, or returns 1 when KVM_HAVE_MCE_INJECTION is not defined.
 */
int kvm_on_sigbus(int code, void *addr)
{
#ifndef KVM_HAVE_MCE_INJECTION
    return 1;
#else
    /* Action required MCE kills the process if SIGBUS is blocked.  Because
     * that's what happens in the I/O thread, where we handle MCE via signalfd,
     * we can only get action optional here.
     */
    assert(code != BUS_MCEERR_AR);
    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
    return 0;
#endif
}
3455
3456 int kvm_create_device(KVMState *s, uint64_t type, bool test)
3457 {
3458     int ret;
3459     struct kvm_create_device create_dev;
3460
3461     create_dev.type = type;
3462     create_dev.fd = -1;
3463     create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;
3464
3465     if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
3466         return -ENOTSUP;
3467     }
3468
3469     ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
3470     if (ret) {
3471         return ret;
3472     }
3473
3474     return test ? 0 : create_dev.fd;
3475 }
3476
3477 bool kvm_device_supported(int vmfd, uint64_t type)
3478 {
3479     struct kvm_create_device create_dev = {
3480         .type = type,
3481         .fd = -1,
3482         .flags = KVM_CREATE_DEVICE_TEST,
3483     };
3484
3485     if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
3486         return false;
3487     }
3488
3489     return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
3490 }
3491
3492 int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
3493 {
3494     struct kvm_one_reg reg;
3495     int r;
3496
3497     reg.id = id;
3498     reg.addr = (uintptr_t) source;
3499     r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
3500     if (r) {
3501         trace_kvm_failed_reg_set(id, strerror(-r));
3502     }
3503     return r;
3504 }
3505
3506 int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
3507 {
3508     struct kvm_one_reg reg;
3509     int r;
3510
3511     reg.id = id;
3512     reg.addr = (uintptr_t) target;
3513     r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
3514     if (r) {
3515         trace_kvm_failed_reg_get(id, strerror(-r));
3516     }
3517     return r;
3518 }
3519
3520 static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
3521                                  hwaddr start_addr, hwaddr size)
3522 {
3523     KVMState *kvm = KVM_STATE(ms->accelerator);
3524     int i;
3525
3526     for (i = 0; i < kvm->nr_as; ++i) {
3527         if (kvm->as[i].as == as && kvm->as[i].ml) {
3528             size = MIN(kvm_max_slot_size, size);
3529             return NULL != kvm_lookup_matching_slot(kvm->as[i].ml,
3530                                                     start_addr, size);
3531         }
3532     }
3533
3534     return false;
3535 }
3536
3537 static void kvm_get_kvm_shadow_mem(Object *obj, Visitor *v,
3538                                    const char *name, void *opaque,
3539                                    Error **errp)
3540 {
3541     KVMState *s = KVM_STATE(obj);
3542     int64_t value = s->kvm_shadow_mem;
3543
3544     visit_type_int(v, name, &value, errp);
3545 }
3546
3547 static void kvm_set_kvm_shadow_mem(Object *obj, Visitor *v,
3548                                    const char *name, void *opaque,
3549                                    Error **errp)
3550 {
3551     KVMState *s = KVM_STATE(obj);
3552     int64_t value;
3553
3554     if (s->fd != -1) {
3555         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3556         return;
3557     }
3558
3559     if (!visit_type_int(v, name, &value, errp)) {
3560         return;
3561     }
3562
3563     s->kvm_shadow_mem = value;
3564 }
3565
3566 static void kvm_set_kernel_irqchip(Object *obj, Visitor *v,
3567                                    const char *name, void *opaque,
3568                                    Error **errp)
3569 {
3570     KVMState *s = KVM_STATE(obj);
3571     OnOffSplit mode;
3572
3573     if (s->fd != -1) {
3574         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3575         return;
3576     }
3577
3578     if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
3579         return;
3580     }
3581     switch (mode) {
3582     case ON_OFF_SPLIT_ON:
3583         s->kernel_irqchip_allowed = true;
3584         s->kernel_irqchip_required = true;
3585         s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
3586         break;
3587     case ON_OFF_SPLIT_OFF:
3588         s->kernel_irqchip_allowed = false;
3589         s->kernel_irqchip_required = false;
3590         s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
3591         break;
3592     case ON_OFF_SPLIT_SPLIT:
3593         s->kernel_irqchip_allowed = true;
3594         s->kernel_irqchip_required = true;
3595         s->kernel_irqchip_split = ON_OFF_AUTO_ON;
3596         break;
3597     default:
3598         /* The value was checked in visit_type_OnOffSplit() above. If
3599          * we get here, then something is wrong in QEMU.
3600          */
3601         abort();
3602     }
3603 }
3604
3605 bool kvm_kernel_irqchip_allowed(void)
3606 {
3607     return kvm_state->kernel_irqchip_allowed;
3608 }
3609
3610 bool kvm_kernel_irqchip_required(void)
3611 {
3612     return kvm_state->kernel_irqchip_required;
3613 }
3614
3615 bool kvm_kernel_irqchip_split(void)
3616 {
3617     return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON;
3618 }
3619
3620 static void kvm_get_dirty_ring_size(Object *obj, Visitor *v,
3621                                     const char *name, void *opaque,
3622                                     Error **errp)
3623 {
3624     KVMState *s = KVM_STATE(obj);
3625     uint32_t value = s->kvm_dirty_ring_size;
3626
3627     visit_type_uint32(v, name, &value, errp);
3628 }
3629
3630 static void kvm_set_dirty_ring_size(Object *obj, Visitor *v,
3631                                     const char *name, void *opaque,
3632                                     Error **errp)
3633 {
3634     KVMState *s = KVM_STATE(obj);
3635     Error *error = NULL;
3636     uint32_t value;
3637
3638     if (s->fd != -1) {
3639         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3640         return;
3641     }
3642
3643     visit_type_uint32(v, name, &value, &error);
3644     if (error) {
3645         error_propagate(errp, error);
3646         return;
3647     }
3648     if (value & (value - 1)) {
3649         error_setg(errp, "dirty-ring-size must be a power of two.");
3650         return;
3651     }
3652
3653     s->kvm_dirty_ring_size = value;
3654 }
3655
3656 static void kvm_accel_instance_init(Object *obj)
3657 {
3658     KVMState *s = KVM_STATE(obj);
3659
3660     s->fd = -1;
3661     s->vmfd = -1;
3662     s->kvm_shadow_mem = -1;
3663     s->kernel_irqchip_allowed = true;
3664     s->kernel_irqchip_split = ON_OFF_AUTO_AUTO;
3665     /* KVM dirty ring is by default off */
3666     s->kvm_dirty_ring_size = 0;
3667 }
3668
3669 static void kvm_accel_class_init(ObjectClass *oc, void *data)
3670 {
3671     AccelClass *ac = ACCEL_CLASS(oc);
3672     ac->name = "KVM";
3673     ac->init_machine = kvm_init;
3674     ac->has_memory = kvm_accel_has_memory;
3675     ac->allowed = &kvm_allowed;
3676
3677     object_class_property_add(oc, "kernel-irqchip", "on|off|split",
3678         NULL, kvm_set_kernel_irqchip,
3679         NULL, NULL);
3680     object_class_property_set_description(oc, "kernel-irqchip",
3681         "Configure KVM in-kernel irqchip");
3682
3683     object_class_property_add(oc, "kvm-shadow-mem", "int",
3684         kvm_get_kvm_shadow_mem, kvm_set_kvm_shadow_mem,
3685         NULL, NULL);
3686     object_class_property_set_description(oc, "kvm-shadow-mem",
3687         "KVM shadow MMU size");
3688
3689     object_class_property_add(oc, "dirty-ring-size", "uint32",
3690         kvm_get_dirty_ring_size, kvm_set_dirty_ring_size,
3691         NULL, NULL);
3692     object_class_property_set_description(oc, "dirty-ring-size",
3693         "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)");
3694 }
3695
3696 static const TypeInfo kvm_accel_type = {
3697     .name = TYPE_KVM_ACCEL,
3698     .parent = TYPE_ACCEL,
3699     .instance_init = kvm_accel_instance_init,
3700     .class_init = kvm_accel_class_init,
3701     .instance_size = sizeof(KVMState),
3702 };
3703
/* Register the KVM accelerator type described by kvm_accel_type. */
static void kvm_type_init(void)
{
    type_register_static(&kvm_accel_type);
}

/* Hook kvm_type_init() into the type_init() registration mechanism. */
type_init(kvm_type_init);
3710
/*
 * Bundle of arguments for a statistics query.
 * NOTE(review): the consumers of this structure are outside the visible
 * part of the file; the per-field notes below are inferred from the types.
 */
typedef struct StatsArgs {
    /* Output location: a stats result list or a schema list */
    union StatsResultsType {
        StatsResultList **stats;
        StatsSchemaList **schema;
    } result;
    /* presumably the stat names to restrict the query to - TODO confirm */
    strList *names;
    /* Where errors are reported */
    Error **errp;
} StatsArgs;
3719
3720 static StatsList *add_kvmstat_entry(struct kvm_stats_desc *pdesc,
3721                                     uint64_t *stats_data,
3722                                     StatsList *stats_list,
3723                                     Error **errp)
3724 {
3725
3726     Stats *stats;
3727     uint64List *val_list = NULL;
3728
3729     /* Only add stats that we understand.  */
3730     switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
3731     case KVM_STATS_TYPE_CUMULATIVE:
3732     case KVM_STATS_TYPE_INSTANT:
3733     case KVM_STATS_TYPE_PEAK:
3734     case KVM_STATS_TYPE_LINEAR_HIST:
3735     case KVM_STATS_TYPE_LOG_HIST:
3736         break;
3737     default:
3738         return stats_list;
3739     }
3740
3741     switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
3742     case KVM_STATS_UNIT_NONE:
3743     case KVM_STATS_UNIT_BYTES:
3744     case KVM_STATS_UNIT_CYCLES:
3745     case KVM_STATS_UNIT_SECONDS:
3746         break;
3747     default:
3748         return stats_list;
3749     }
3750
3751     switch (pdesc->flags & KVM_STATS_BASE_MASK) {
3752     case KVM_STATS_BASE_POW10:
3753     case KVM_STATS_BASE_POW2:
3754         break;
3755     default:
3756         return stats_list;
3757     }
3758
3759     /* Alloc and populate data list */
3760     stats = g_new0(Stats, 1);
3761     stats->name = g_strdup(pdesc->name);
3762     stats->value = g_new0(StatsValue, 1);;
3763
3764     if (pdesc->size == 1) {
3765         stats->value->u.scalar = *stats_data;
3766         stats->value->type = QTYPE_QNUM;
3767     } else {
3768         int i;
3769         for (i = 0; i < pdesc->size; i++) {
3770             QAPI_LIST_PREPEND(val_list, stats_data[i]);
3771         }
3772         stats->value->u.list = val_list;
3773         stats->value->type = QTYPE_QLIST;
3774     }
3775
3776     QAPI_LIST_PREPEND(stats_list, stats);
3777     return stats_list;
3778 }
3779
3780 static StatsSchemaValueList *add_kvmschema_entry(struct kvm_stats_desc *pdesc,
3781                                                  StatsSchemaValueList *list,
3782                                                  Error **errp)
3783 {
3784     StatsSchemaValueList *schema_entry = g_new0(StatsSchemaValueList, 1);
3785     schema_entry->value = g_new0(StatsSchemaValue, 1);
3786
3787     switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
3788     case KVM_STATS_TYPE_CUMULATIVE:
3789         schema_entry->value->type = STATS_TYPE_CUMULATIVE;
3790         break;
3791     case KVM_STATS_TYPE_INSTANT:
3792         schema_entry->value->type = STATS_TYPE_INSTANT;
3793         break;
3794     case KVM_STATS_TYPE_PEAK:
3795         schema_entry->value->type = STATS_TYPE_PEAK;
3796         break;
3797     case KVM_STATS_TYPE_LINEAR_HIST:
3798         schema_entry->value->type = STATS_TYPE_LINEAR_HISTOGRAM;
3799         schema_entry->value->bucket_size = pdesc->bucket_size;
3800         schema_entry->value->has_bucket_size = true;
3801         break;
3802     case KVM_STATS_TYPE_LOG_HIST:
3803         schema_entry->value->type = STATS_TYPE_LOG2_HISTOGRAM;
3804         break;
3805     default:
3806         goto exit;
3807     }
3808
3809     switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
3810     case KVM_STATS_UNIT_NONE:
3811         break;
3812     case KVM_STATS_UNIT_BYTES:
3813         schema_entry->value->has_unit = true;
3814         schema_entry->value->unit = STATS_UNIT_BYTES;
3815         break;
3816     case KVM_STATS_UNIT_CYCLES:
3817         schema_entry->value->has_unit = true;
3818         schema_entry->value->unit = STATS_UNIT_CYCLES;
3819         break;
3820     case KVM_STATS_UNIT_SECONDS:
3821         schema_entry->value->has_unit = true;
3822         schema_entry->value->unit = STATS_UNIT_SECONDS;
3823         break;
3824     default:
3825         goto exit;
3826     }
3827
3828     schema_entry->value->exponent = pdesc->exponent;
3829     if (pdesc->exponent) {
3830         switch (pdesc->flags & KVM_STATS_BASE_MASK) {
3831         case KVM_STATS_BASE_POW10:
3832             schema_entry->value->has_base = true;
3833             schema_entry->value->base = 10;
3834             break;
3835         case KVM_STATS_BASE_POW2:
3836             schema_entry->value->has_base = true;
3837             schema_entry->value->base = 2;
3838             break;
3839         default:
3840             goto exit;
3841         }
3842     }
3843
3844     schema_entry->value->name = g_strdup(pdesc->name);
3845     schema_entry->next = list;
3846     return schema_entry;
3847 exit:
3848     g_free(schema_entry->value);
3849     g_free(schema_entry);
3850     return list;
3851 }
3852
/*
 * Cached stats descriptors.
 *
 * One entry is kept per cache key on the stats_descriptors list so the
 * header and descriptor table are only read once per key.
 */
typedef struct StatsDescriptors {
    const char *ident; /* cache key, currently the StatsTarget */
    struct kvm_stats_desc *kvm_stats_desc;      /* descriptor table read from the stats fd */
    struct kvm_stats_header *kvm_stats_header;  /* header read from the stats fd */
    QTAILQ_ENTRY(StatsDescriptors) next;        /* link in stats_descriptors */
} StatsDescriptors;
3860
/* List of all cached StatsDescriptors, searched by ident. */
static QTAILQ_HEAD(, StatsDescriptors) stats_descriptors =
    QTAILQ_HEAD_INITIALIZER(stats_descriptors);
3863
3864 /*
3865  * Return the descriptors for 'target', that either have already been read
3866  * or are retrieved from 'stats_fd'.
3867  */
3868 static StatsDescriptors *find_stats_descriptors(StatsTarget target, int stats_fd,
3869                                                 Error **errp)
3870 {
3871     StatsDescriptors *descriptors;
3872     const char *ident;
3873     struct kvm_stats_desc *kvm_stats_desc;
3874     struct kvm_stats_header *kvm_stats_header;
3875     size_t size_desc;
3876     ssize_t ret;
3877
3878     ident = StatsTarget_str(target);
3879     QTAILQ_FOREACH(descriptors, &stats_descriptors, next) {
3880         if (g_str_equal(descriptors->ident, ident)) {
3881             return descriptors;
3882         }
3883     }
3884
3885     descriptors = g_new0(StatsDescriptors, 1);
3886
3887     /* Read stats header */
3888     kvm_stats_header = g_malloc(sizeof(*kvm_stats_header));
3889     ret = read(stats_fd, kvm_stats_header, sizeof(*kvm_stats_header));
3890     if (ret != sizeof(*kvm_stats_header)) {
3891         error_setg(errp, "KVM stats: failed to read stats header: "
3892                    "expected %zu actual %zu",
3893                    sizeof(*kvm_stats_header), ret);
3894         return NULL;
3895     }
3896     size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
3897
3898     /* Read stats descriptors */
3899     kvm_stats_desc = g_malloc0_n(kvm_stats_header->num_desc, size_desc);
3900     ret = pread(stats_fd, kvm_stats_desc,
3901                 size_desc * kvm_stats_header->num_desc,
3902                 kvm_stats_header->desc_offset);
3903
3904     if (ret != size_desc * kvm_stats_header->num_desc) {
3905         error_setg(errp, "KVM stats: failed to read stats descriptors: "
3906                    "expected %zu actual %zu",
3907                    size_desc * kvm_stats_header->num_desc, ret);
3908         g_free(descriptors);
3909         g_free(kvm_stats_desc);
3910         return NULL;
3911     }
3912     descriptors->kvm_stats_header = kvm_stats_header;
3913     descriptors->kvm_stats_desc = kvm_stats_desc;
3914     descriptors->ident = ident;
3915     QTAILQ_INSERT_TAIL(&stats_descriptors, descriptors, next);
3916     return descriptors;
3917 }
3918
3919 static void query_stats(StatsResultList **result, StatsTarget target,
3920                         strList *names, int stats_fd, Error **errp)
3921 {
3922     struct kvm_stats_desc *kvm_stats_desc;
3923     struct kvm_stats_header *kvm_stats_header;
3924     StatsDescriptors *descriptors;
3925     g_autofree uint64_t *stats_data = NULL;
3926     struct kvm_stats_desc *pdesc;
3927     StatsList *stats_list = NULL;
3928     size_t size_desc, size_data = 0;
3929     ssize_t ret;
3930     int i;
3931
3932     descriptors = find_stats_descriptors(target, stats_fd, errp);
3933     if (!descriptors) {
3934         return;
3935     }
3936
3937     kvm_stats_header = descriptors->kvm_stats_header;
3938     kvm_stats_desc = descriptors->kvm_stats_desc;
3939     size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
3940
3941     /* Tally the total data size; read schema data */
3942     for (i = 0; i < kvm_stats_header->num_desc; ++i) {
3943         pdesc = (void *)kvm_stats_desc + i * size_desc;
3944         size_data += pdesc->size * sizeof(*stats_data);
3945     }
3946
3947     stats_data = g_malloc0(size_data);
3948     ret = pread(stats_fd, stats_data, size_data, kvm_stats_header->data_offset);
3949
3950     if (ret != size_data) {
3951         error_setg(errp, "KVM stats: failed to read data: "
3952                    "expected %zu actual %zu", size_data, ret);
3953         return;
3954     }
3955
3956     for (i = 0; i < kvm_stats_header->num_desc; ++i) {
3957         uint64_t *stats;
3958         pdesc = (void *)kvm_stats_desc + i * size_desc;
3959
3960         /* Add entry to the list */
3961         stats = (void *)stats_data + pdesc->offset;
3962         if (!apply_str_list_filter(pdesc->name, names)) {
3963             continue;
3964         }
3965         stats_list = add_kvmstat_entry(pdesc, stats, stats_list, errp);
3966     }
3967
3968     if (!stats_list) {
3969         return;
3970     }
3971
3972     switch (target) {
3973     case STATS_TARGET_VM:
3974         add_stats_entry(result, STATS_PROVIDER_KVM, NULL, stats_list);
3975         break;
3976     case STATS_TARGET_VCPU:
3977         add_stats_entry(result, STATS_PROVIDER_KVM,
3978                         current_cpu->parent_obj.canonical_path,
3979                         stats_list);
3980         break;
3981     default:
3982         break;
3983     }
3984 }
3985
3986 static void query_stats_schema(StatsSchemaList **result, StatsTarget target,
3987                                int stats_fd, Error **errp)
3988 {
3989     struct kvm_stats_desc *kvm_stats_desc;
3990     struct kvm_stats_header *kvm_stats_header;
3991     StatsDescriptors *descriptors;
3992     struct kvm_stats_desc *pdesc;
3993     StatsSchemaValueList *stats_list = NULL;
3994     size_t size_desc;
3995     int i;
3996
3997     descriptors = find_stats_descriptors(target, stats_fd, errp);
3998     if (!descriptors) {
3999         return;
4000     }
4001
4002     kvm_stats_header = descriptors->kvm_stats_header;
4003     kvm_stats_desc = descriptors->kvm_stats_desc;
4004     size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
4005
4006     /* Tally the total data size; read schema data */
4007     for (i = 0; i < kvm_stats_header->num_desc; ++i) {
4008         pdesc = (void *)kvm_stats_desc + i * size_desc;
4009         stats_list = add_kvmschema_entry(pdesc, stats_list, errp);
4010     }
4011
4012     add_stats_schema(result, STATS_PROVIDER_KVM, target, stats_list);
4013 }
4014
4015 static void query_stats_vcpu(CPUState *cpu, run_on_cpu_data data)
4016 {
4017     StatsArgs *kvm_stats_args = (StatsArgs *) data.host_ptr;
4018     int stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL);
4019     Error *local_err = NULL;
4020
4021     if (stats_fd == -1) {
4022         error_setg_errno(&local_err, errno, "KVM stats: ioctl failed");
4023         error_propagate(kvm_stats_args->errp, local_err);
4024         return;
4025     }
4026     query_stats(kvm_stats_args->result.stats, STATS_TARGET_VCPU,
4027                 kvm_stats_args->names, stats_fd, kvm_stats_args->errp);
4028     close(stats_fd);
4029 }
4030
4031 static void query_stats_schema_vcpu(CPUState *cpu, run_on_cpu_data data)
4032 {
4033     StatsArgs *kvm_stats_args = (StatsArgs *) data.host_ptr;
4034     int stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL);
4035     Error *local_err = NULL;
4036
4037     if (stats_fd == -1) {
4038         error_setg_errno(&local_err, errno, "KVM stats: ioctl failed");
4039         error_propagate(kvm_stats_args->errp, local_err);
4040         return;
4041     }
4042     query_stats_schema(kvm_stats_args->result.schema, STATS_TARGET_VCPU, stats_fd,
4043                        kvm_stats_args->errp);
4044     close(stats_fd);
4045 }
4046
4047 static void query_stats_cb(StatsResultList **result, StatsTarget target,
4048                            strList *names, strList *targets, Error **errp)
4049 {
4050     KVMState *s = kvm_state;
4051     CPUState *cpu;
4052     int stats_fd;
4053
4054     switch (target) {
4055     case STATS_TARGET_VM:
4056     {
4057         stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
4058         if (stats_fd == -1) {
4059             error_setg_errno(errp, errno, "KVM stats: ioctl failed");
4060             return;
4061         }
4062         query_stats(result, target, names, stats_fd, errp);
4063         close(stats_fd);
4064         break;
4065     }
4066     case STATS_TARGET_VCPU:
4067     {
4068         StatsArgs stats_args;
4069         stats_args.result.stats = result;
4070         stats_args.names = names;
4071         stats_args.errp = errp;
4072         CPU_FOREACH(cpu) {
4073             if (!apply_str_list_filter(cpu->parent_obj.canonical_path, targets)) {
4074                 continue;
4075             }
4076             run_on_cpu(cpu, query_stats_vcpu, RUN_ON_CPU_HOST_PTR(&stats_args));
4077         }
4078         break;
4079     }
4080     default:
4081         break;
4082     }
4083 }
4084
4085 void query_stats_schemas_cb(StatsSchemaList **result, Error **errp)
4086 {
4087     StatsArgs stats_args;
4088     KVMState *s = kvm_state;
4089     int stats_fd;
4090
4091     stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
4092     if (stats_fd == -1) {
4093         error_setg_errno(errp, errno, "KVM stats: ioctl failed");
4094         return;
4095     }
4096     query_stats_schema(result, STATS_TARGET_VM, stats_fd, errp);
4097     close(stats_fd);
4098
4099     stats_args.result.schema = result;
4100     stats_args.errp = errp;
4101     run_on_cpu(first_cpu, query_stats_schema_vcpu, RUN_ON_CPU_HOST_PTR(&stats_args));
4102 }