hw/virtio/vhost.c

   1 /*
   2  * vhost support
   3  *
   4  * Copyright Red Hat, Inc. 2010
   5  *
   6  * Authors:
   7  *  Michael S. Tsirkin <mst@redhat.com>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2.  See
  10  * the COPYING file in the top-level directory.
  11  *
  12  * Contributions after 2012-01-13 are licensed under the terms of the
  13  * GNU GPL, version 2 or (at your option) any later version.
  14  */
  15
  16 #include "qemu/osdep.h"
  17 #include "qapi/error.h"
  18 #include "hw/virtio/vhost.h"
  19 #include "hw/hw.h"
  20 #include "qemu/atomic.h"
  21 #include "qemu/range.h"
  22 #include "qemu/error-report.h"
  23 #include "qemu/memfd.h"
  24 #include <linux/vhost.h>
  25 #include "exec/address-spaces.h"
  26 #include "hw/virtio/virtio-bus.h"
  27 #include "hw/virtio/virtio-access.h"
  28 #include "migration/blocker.h"
  29 #include "sysemu/dma.h"
  30
  31 /* enabled until disconnected backend stabilizes */
  32 #define _VHOST_DEBUG 1
  33
  34 #ifdef _VHOST_DEBUG
  35 #define VHOST_OPS_DEBUG(fmt, ...) \
  36     do { error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
  37                       strerror(errno), errno); } while (0)
  38 #else
  39 #define VHOST_OPS_DEBUG(fmt, ...) \
  40     do { } while (0)
  41 #endif
  42
  43 static struct vhost_log *vhost_log;
  44 static struct vhost_log *vhost_log_shm;
  45
  46 static unsigned int used_memslots;
  47 static QLIST_HEAD(, vhost_dev) vhost_devices =
  48     QLIST_HEAD_INITIALIZER(vhost_devices);
  49
  50 bool vhost_has_free_slot(void)
  51 {
  52     unsigned int slots_limit = ~0U;
  53     struct vhost_dev *hdev;
  54
  55     QLIST_FOREACH(hdev, &vhost_devices, entry) {
  56         unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
  57         slots_limit = MIN(slots_limit, r);
  58     }
  59     return slots_limit > used_memslots;
  60 }
  61
  62 static void vhost_dev_sync_region(struct vhost_dev *dev,
  63                                   MemoryRegionSection *section,
  64                                   uint64_t mfirst, uint64_t mlast,
  65                                   uint64_t rfirst, uint64_t rlast)
  66 {
  67     vhost_log_chunk_t *log = dev->log->log;
  68
  69     uint64_t start = MAX(mfirst, rfirst);
  70     uint64_t end = MIN(mlast, rlast);
  71     vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
  72     vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
  73     uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);
  74
  75     if (end < start) {
  76         return;
  77     }
  78     assert(end / VHOST_LOG_CHUNK < dev->log_size);
  79     assert(start / VHOST_LOG_CHUNK < dev->log_size);
  80
  81     for (;from < to; ++from) {
  82         vhost_log_chunk_t log;
  83         /* We first check with non-atomic: much cheaper,
  84          * and we expect non-dirty to be the common case. */
  85         if (!*from) {
  86             addr += VHOST_LOG_CHUNK;
  87             continue;
  88         }
  89         /* Data must be read atomically. We don't really need barrier semantics
  90          * but it's easier to use atomic_* than roll our own. */
  91         log = atomic_xchg(from, 0);
  92         while (log) {
  93             int bit = ctzl(log);
  94             hwaddr page_addr;
  95             hwaddr section_offset;
  96             hwaddr mr_offset;
  97             page_addr = addr + bit * VHOST_LOG_PAGE;
  98             section_offset = page_addr - section->offset_within_address_space;
  99             mr_offset = section_offset + section->offset_within_region;
 100             memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
 101             log &= ~(0x1ull << bit);
 102         }
 103         addr += VHOST_LOG_CHUNK;
 104     }
 105 }
 106
 107 static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
 108                                    MemoryRegionSection *section,
 109                                    hwaddr first,
 110                                    hwaddr last)
 111 {
 112     int i;
 113     hwaddr start_addr;
 114     hwaddr end_addr;
 115
 116     if (!dev->log_enabled || !dev->started) {
 117         return 0;
 118     }
 119     start_addr = section->offset_within_address_space;
 120     end_addr = range_get_last(start_addr, int128_get64(section->size));
 121     start_addr = MAX(first, start_addr);
 122     end_addr = MIN(last, end_addr);
 123
 124     for (i = 0; i < dev->mem->nregions; ++i) {
 125         struct vhost_memory_region *reg = dev->mem->regions + i;
 126         vhost_dev_sync_region(dev, section, start_addr, end_addr,
 127                               reg->guest_phys_addr,
 128                               range_get_last(reg->guest_phys_addr,
 129                                              reg->memory_size));
 130     }
 131     for (i = 0; i < dev->nvqs; ++i) {
 132         struct vhost_virtqueue *vq = dev->vqs + i;
 133         vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
 134                               range_get_last(vq->used_phys, vq->used_size));
 135     }
 136     return 0;
 137 }
 138
 139 static void vhost_log_sync(MemoryListener *listener,
 140                           MemoryRegionSection *section)
 141 {
 142     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 143                                          memory_listener);
 144     vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
 145 }
 146
 147 static void vhost_log_sync_range(struct vhost_dev *dev,
 148                                  hwaddr first, hwaddr last)
 149 {
 150     int i;
 151     /* FIXME: this is N^2 in number of sections */
 152     for (i = 0; i < dev->n_mem_sections; ++i) {
 153         MemoryRegionSection *section = &dev->mem_sections[i];
 154         vhost_sync_dirty_bitmap(dev, section, first, last);
 155     }
 156 }
 157
 158 /* Assign/unassign. Keep an unsorted array of non-overlapping
 159  * memory regions in dev->mem. */
 160 static void vhost_dev_unassign_memory(struct vhost_dev *dev,
 161                                       uint64_t start_addr,
 162                                       uint64_t size)
 163 {
 164     int from, to, n = dev->mem->nregions;
 165     /* Track overlapping/split regions for sanity checking. */
 166     int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;
 167
 168     for (from = 0, to = 0; from < n; ++from, ++to) {
 169         struct vhost_memory_region *reg = dev->mem->regions + to;
 170         uint64_t reglast;
 171         uint64_t memlast;
 172         uint64_t change;
 173
 174         /* clone old region */
 175         if (to != from) {
 176             memcpy(reg, dev->mem->regions + from, sizeof *reg);
 177         }
 178
 179         /* No overlap is simple */
 180         if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
 181                             start_addr, size)) {
 182             continue;
 183         }
 184
 185         /* Split only happens if supplied region
 186          * is in the middle of an existing one. Thus it can not
 187          * overlap with any other existing region. */
 188         assert(!split);
 189
 190         reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
 191         memlast = range_get_last(start_addr, size);
 192
 193         /* Remove whole region */
 194         if (start_addr <= reg->guest_phys_addr && memlast >= reglast) {
 195             --dev->mem->nregions;
 196             --to;
 197             ++overlap_middle;
 198             continue;
 199         }
 200
 201         /* Shrink region */
 202         if (memlast >= reglast) {
 203             reg->memory_size = start_addr - reg->guest_phys_addr;
 204             assert(reg->memory_size);
 205             assert(!overlap_end);
 206             ++overlap_end;
 207             continue;
 208         }
 209
 210         /* Shift region */
 211         if (start_addr <= reg->guest_phys_addr) {
 212             change = memlast + 1 - reg->guest_phys_addr;
 213             reg->memory_size -= change;
 214             reg->guest_phys_addr += change;
 215             reg->userspace_addr += change;
 216             assert(reg->memory_size);
 217             assert(!overlap_start);
 218             ++overlap_start;
 219             continue;
 220         }
 221
 222         /* This only happens if supplied region
 223          * is in the middle of an existing one. Thus it can not
 224          * overlap with any other existing region. */
 225         assert(!overlap_start);
 226         assert(!overlap_end);
 227         assert(!overlap_middle);
 228         /* Split region: shrink first part, shift second part. */
 229         memcpy(dev->mem->regions + n, reg, sizeof *reg);
 230         reg->memory_size = start_addr - reg->guest_phys_addr;
 231         assert(reg->memory_size);
 232         change = memlast + 1 - reg->guest_phys_addr;
 233         reg = dev->mem->regions + n;
 234         reg->memory_size -= change;
 235         assert(reg->memory_size);
 236         reg->guest_phys_addr += change;
 237         reg->userspace_addr += change;
 238         /* Never add more than 1 region */
 239         assert(dev->mem->nregions == n);
 240         ++dev->mem->nregions;
 241         ++split;
 242     }
 243 }
 244
 245 /* Called after unassign, so no regions overlap the given range. */
 246 static void vhost_dev_assign_memory(struct vhost_dev *dev,
 247                                     uint64_t start_addr,
 248                                     uint64_t size,
 249                                     uint64_t uaddr)
 250 {
 251     int from, to;
 252     struct vhost_memory_region *merged = NULL;
 253     for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
 254         struct vhost_memory_region *reg = dev->mem->regions + to;
 255         uint64_t prlast, urlast;
 256         uint64_t pmlast, umlast;
 257         uint64_t s, e, u;
 258
 259         /* clone old region */
 260         if (to != from) {
 261             memcpy(reg, dev->mem->regions + from, sizeof *reg);
 262         }
 263         prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
 264         pmlast = range_get_last(start_addr, size);
 265         urlast = range_get_last(reg->userspace_addr, reg->memory_size);
 266         umlast = range_get_last(uaddr, size);
 267
 268         /* check for overlapping regions: should never happen. */
 269         assert(prlast < start_addr || pmlast < reg->guest_phys_addr);
 270         /* Not an adjacent or overlapping region - do not merge. */
 271         if ((prlast + 1 != start_addr || urlast + 1 != uaddr) &&
 272             (pmlast + 1 != reg->guest_phys_addr ||
 273              umlast + 1 != reg->userspace_addr)) {
 274             continue;
 275         }
 276
 277         if (dev->vhost_ops->vhost_backend_can_merge &&
 278             !dev->vhost_ops->vhost_backend_can_merge(dev, uaddr, size,
 279                                                      reg->userspace_addr,
 280                                                      reg->memory_size)) {
 281             continue;
 282         }
 283
 284         if (merged) {
 285             --to;
 286             assert(to >= 0);
 287         } else {
 288             merged = reg;
 289         }
 290         u = MIN(uaddr, reg->userspace_addr);
 291         s = MIN(start_addr, reg->guest_phys_addr);
 292         e = MAX(pmlast, prlast);
 293         uaddr = merged->userspace_addr = u;
 294         start_addr = merged->guest_phys_addr = s;
 295         size = merged->memory_size = e - s + 1;
 296         assert(merged->memory_size);
 297     }
 298
 299     if (!merged) {
 300         struct vhost_memory_region *reg = dev->mem->regions + to;
 301         memset(reg, 0, sizeof *reg);
 302         reg->memory_size = size;
 303         assert(reg->memory_size);
 304         reg->guest_phys_addr = start_addr;
 305         reg->userspace_addr = uaddr;
 306         ++to;
 307     }
 308     assert(to <= dev->mem->nregions + 1);
 309     dev->mem->nregions = to;
 310 }
 311
 312 static uint64_t vhost_get_log_size(struct vhost_dev *dev)
 313 {
 314     uint64_t log_size = 0;
 315     int i;
 316     for (i = 0; i < dev->mem->nregions; ++i) {
 317         struct vhost_memory_region *reg = dev->mem->regions + i;
 318         uint64_t last = range_get_last(reg->guest_phys_addr,
 319                                        reg->memory_size);
 320         log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
 321     }
 322     for (i = 0; i < dev->nvqs; ++i) {
 323         struct vhost_virtqueue *vq = dev->vqs + i;
 324         uint64_t last = vq->used_phys + vq->used_size - 1;
 325         log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
 326     }
 327     return log_size;
 328 }
 329
 330 static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
 331 {
 332     struct vhost_log *log;
 333     uint64_t logsize = size * sizeof(*(log->log));
 334     int fd = -1;
 335
 336     log = g_new0(struct vhost_log, 1);
 337     if (share) {
 338         log->log = qemu_memfd_alloc("vhost-log", logsize,
 339                                     F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
 340                                     &fd);
 341         memset(log->log, 0, logsize);
 342     } else {
 343         log->log = g_malloc0(logsize);
 344     }
 345
 346     log->size = size;
 347     log->refcnt = 1;
 348     log->fd = fd;
 349
 350     return log;
 351 }
 352
 353 static struct vhost_log *vhost_log_get(uint64_t size, bool share)
 354 {
 355     struct vhost_log *log = share ? vhost_log_shm : vhost_log;
 356
 357     if (!log || log->size != size) {
 358         log = vhost_log_alloc(size, share);
 359         if (share) {
 360             vhost_log_shm = log;
 361         } else {
 362             vhost_log = log;
 363         }
 364     } else {
 365         ++log->refcnt;
 366     }
 367
 368     return log;
 369 }
 370
 371 static void vhost_log_put(struct vhost_dev *dev, bool sync)
 372 {
 373     struct vhost_log *log = dev->log;
 374
 375     if (!log) {
 376         return;
 377     }
 378
 379     --log->refcnt;
 380     if (log->refcnt == 0) {
 381         /* Sync only the range covered by the old log */
 382         if (dev->log_size && sync) {
 383             vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
 384         }
 385
 386         if (vhost_log == log) {
 387             g_free(log->log);
 388             vhost_log = NULL;
 389         } else if (vhost_log_shm == log) {
 390             qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
 391                             log->fd);
 392             vhost_log_shm = NULL;
 393         }
 394
 395         g_free(log);
 396     }
 397
 398     dev->log = NULL;
 399     dev->log_size = 0;
 400 }
 401
 402 static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
 403 {
 404     return dev->vhost_ops->vhost_requires_shm_log &&
 405            dev->vhost_ops->vhost_requires_shm_log(dev);
 406 }
 407
 408 static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
 409 {
 410     struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
 411     uint64_t log_base = (uintptr_t)log->log;
 412     int r;
 413
 414     /* inform backend of log switching, this must be done before
 415        releasing the current log, to ensure no logging is lost */
 416     r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
 417     if (r < 0) {
 418         VHOST_OPS_DEBUG("vhost_set_log_base failed");
 419     }
 420
 421     vhost_log_put(dev, true);
 422     dev->log = log;
 423     dev->log_size = size;
 424 }
 425
 426 static int vhost_dev_has_iommu(struct vhost_dev *dev)
 427 {
 428     VirtIODevice *vdev = dev->vdev;
 429
 430     return virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
 431 }
 432
 433 static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
 434                               hwaddr *plen, int is_write)
 435 {
 436     if (!vhost_dev_has_iommu(dev)) {
 437         return cpu_physical_memory_map(addr, plen, is_write);
 438     } else {
 439         return (void *)(uintptr_t)addr;
 440     }
 441 }
 442
 443 static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
 444                                hwaddr len, int is_write,
 445                                hwaddr access_len)
 446 {
 447     if (!vhost_dev_has_iommu(dev)) {
 448         cpu_physical_memory_unmap(buffer, len, is_write, access_len);
 449     }
 450 }
 451
 452 static int vhost_verify_ring_part_mapping(struct vhost_dev *dev,
 453                                           void *part,
 454                                           uint64_t part_addr,
 455                                           uint64_t part_size,
 456                                           uint64_t start_addr,
 457                                           uint64_t size)
 458 {
 459     hwaddr l;
 460     void *p;
 461     int r = 0;
 462
 463     if (!ranges_overlap(start_addr, size, part_addr, part_size)) {
 464         return 0;
 465     }
 466     l = part_size;
 467     p = vhost_memory_map(dev, part_addr, &l, 1);
 468     if (!p || l != part_size) {
 469         r = -ENOMEM;
 470     }
 471     if (p != part) {
 472         r = -EBUSY;
 473     }
 474     vhost_memory_unmap(dev, p, l, 0, 0);
 475     return r;
 476 }
 477
 478 static int vhost_verify_ring_mappings(struct vhost_dev *dev,
 479                                       uint64_t start_addr,
 480                                       uint64_t size)
 481 {
 482     int i, j;
 483     int r = 0;
 484     const char *part_name[] = {
 485         "descriptor table",
 486         "available ring",
 487         "used ring"
 488     };
 489
 490     for (i = 0; i < dev->nvqs; ++i) {
 491         struct vhost_virtqueue *vq = dev->vqs + i;
 492
 493         j = 0;
 494         r = vhost_verify_ring_part_mapping(dev, vq->desc, vq->desc_phys,
 495                                            vq->desc_size, start_addr, size);
 496         if (r) {
 497             break;
 498         }
 499
 500         j++;
 501         r = vhost_verify_ring_part_mapping(dev, vq->avail, vq->avail_phys,
 502                                            vq->avail_size, start_addr, size);
 503         if (r) {
 504             break;
 505         }
 506
 507         j++;
 508         r = vhost_verify_ring_part_mapping(dev, vq->used, vq->used_phys,
 509                                            vq->used_size, start_addr, size);
 510         if (r) {
 511             break;
 512         }
 513     }
 514
 515     if (r == -ENOMEM) {
 516         error_report("Unable to map %s for ring %d", part_name[j], i);
 517     } else if (r == -EBUSY) {
 518         error_report("%s relocated for ring %d", part_name[j], i);
 519     }
 520     return r;
 521 }
 522
 523 static struct vhost_memory_region *vhost_dev_find_reg(struct vhost_dev *dev,
 524                                                       uint64_t start_addr,
 525                                                       uint64_t size)
 526 {
 527     int i, n = dev->mem->nregions;
 528     for (i = 0; i < n; ++i) {
 529         struct vhost_memory_region *reg = dev->mem->regions + i;
 530         if (ranges_overlap(reg->guest_phys_addr, reg->memory_size,
 531                            start_addr, size)) {
 532             return reg;
 533         }
 534     }
 535     return NULL;
 536 }
 537
 538 static bool vhost_dev_cmp_memory(struct vhost_dev *dev,
 539                                  uint64_t start_addr,
 540                                  uint64_t size,
 541                                  uint64_t uaddr)
 542 {
 543     struct vhost_memory_region *reg = vhost_dev_find_reg(dev, start_addr, size);
 544     uint64_t reglast;
 545     uint64_t memlast;
 546
 547     if (!reg) {
 548         return true;
 549     }
 550
 551     reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
 552     memlast = range_get_last(start_addr, size);
 553
 554     /* Need to extend region? */
 555     if (start_addr < reg->guest_phys_addr || memlast > reglast) {
 556         return true;
 557     }
 558     /* userspace_addr changed? */
 559     return uaddr != reg->userspace_addr + start_addr - reg->guest_phys_addr;
 560 }
 561
 562 static void vhost_set_memory(MemoryListener *listener,
 563                              MemoryRegionSection *section,
 564                              bool add)
 565 {
 566     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 567                                          memory_listener);
 568     hwaddr start_addr = section->offset_within_address_space;
 569     ram_addr_t size = int128_get64(section->size);
 570     bool log_dirty =
 571         memory_region_get_dirty_log_mask(section->mr) & ~(1 << DIRTY_MEMORY_MIGRATION);
 572     int s = offsetof(struct vhost_memory, regions) +
 573         (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
 574     void *ram;
 575
 576     dev->mem = g_realloc(dev->mem, s);
 577
 578     if (log_dirty) {
 579         add = false;
 580     }
 581
 582     assert(size);
 583
 584     /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
 585     ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
 586     if (add) {
 587         if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
 588             /* Region exists with same address. Nothing to do. */
 589             return;
 590         }
 591     } else {
 592         if (!vhost_dev_find_reg(dev, start_addr, size)) {
 593             /* Removing region that we don't access. Nothing to do. */
 594             return;
 595         }
 596     }
 597
 598     vhost_dev_unassign_memory(dev, start_addr, size);
 599     if (add) {
 600         /* Add given mapping, merging adjacent regions if any */
 601         vhost_dev_assign_memory(dev, start_addr, size, (uintptr_t)ram);
 602     } else {
 603         /* Remove old mapping for this memory, if any. */
 604         vhost_dev_unassign_memory(dev, start_addr, size);
 605     }
 606     dev->mem_changed_start_addr = MIN(dev->mem_changed_start_addr, start_addr);
 607     dev->mem_changed_end_addr = MAX(dev->mem_changed_end_addr, start_addr + size - 1);
 608     dev->memory_changed = true;
 609     used_memslots = dev->mem->nregions;
 610 }
 611
 612 static bool vhost_section(MemoryRegionSection *section)
 613 {
 614     return memory_region_is_ram(section->mr) &&
 615         !memory_region_is_rom(section->mr);
 616 }
 617
 618 static void vhost_begin(MemoryListener *listener)
 619 {
 620     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 621                                          memory_listener);
 622     dev->mem_changed_end_addr = 0;
 623     dev->mem_changed_start_addr = -1;
 624 }
 625
 626 static void vhost_commit(MemoryListener *listener)
 627 {
 628     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 629                                          memory_listener);
 630     hwaddr start_addr = 0;
 631     ram_addr_t size = 0;
 632     uint64_t log_size;
 633     int r;
 634
 635     if (!dev->memory_changed) {
 636         return;
 637     }
 638     if (!dev->started) {
 639         return;
 640     }
 641     if (dev->mem_changed_start_addr > dev->mem_changed_end_addr) {
 642         return;
 643     }
 644
 645     if (dev->started) {
 646         start_addr = dev->mem_changed_start_addr;
 647         size = dev->mem_changed_end_addr - dev->mem_changed_start_addr + 1;
 648
 649         r = vhost_verify_ring_mappings(dev, start_addr, size);
 650         assert(r >= 0);
 651     }
 652
 653     if (!dev->log_enabled) {
 654         r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
 655         if (r < 0) {
 656             VHOST_OPS_DEBUG("vhost_set_mem_table failed");
 657         }
 658         dev->memory_changed = false;
 659         return;
 660     }
 661     log_size = vhost_get_log_size(dev);
 662     /* We allocate an extra 4K bytes to log,
 663      * to reduce the * number of reallocations. */
 664 #define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
 665     /* To log more, must increase log size before table update. */
 666     if (dev->log_size < log_size) {
 667         vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
 668     }
 669     r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
 670     if (r < 0) {
 671         VHOST_OPS_DEBUG("vhost_set_mem_table failed");
 672     }
 673     /* To log less, can only decrease log size after table update. */
 674     if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
 675         vhost_dev_log_resize(dev, log_size);
 676     }
 677     dev->memory_changed = false;
 678 }
 679
 680 static void vhost_region_add(MemoryListener *listener,
 681                              MemoryRegionSection *section)
 682 {
 683     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 684                                          memory_listener);
 685
 686     if (!vhost_section(section)) {
 687         return;
 688     }
 689
 690     ++dev->n_mem_sections;
 691     dev->mem_sections = g_renew(MemoryRegionSection, dev->mem_sections,
 692                                 dev->n_mem_sections);
 693     dev->mem_sections[dev->n_mem_sections - 1] = *section;
 694     memory_region_ref(section->mr);
 695     vhost_set_memory(listener, section, true);
 696 }
 697
 698 static void vhost_region_del(MemoryListener *listener,
 699                              MemoryRegionSection *section)
 700 {
 701     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 702                                          memory_listener);
 703     int i;
 704
 705     if (!vhost_section(section)) {
 706         return;
 707     }
 708
 709     vhost_set_memory(listener, section, false);
 710     memory_region_unref(section->mr);
 711     for (i = 0; i < dev->n_mem_sections; ++i) {
 712         if (dev->mem_sections[i].offset_within_address_space
 713             == section->offset_within_address_space) {
 714             --dev->n_mem_sections;
 715             memmove(&dev->mem_sections[i], &dev->mem_sections[i+1],
 716                     (dev->n_mem_sections - i) * sizeof(*dev->mem_sections));
 717             break;
 718         }
 719     }
 720 }
 721
 722 static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
 723 {
 724     struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
 725     struct vhost_dev *hdev = iommu->hdev;
 726     hwaddr iova = iotlb->iova + iommu->iommu_offset;
 727
 728     if (vhost_backend_invalidate_device_iotlb(hdev, iova,
 729                                               iotlb->addr_mask + 1)) {
 730         error_report("Fail to invalidate device iotlb");
 731     }
 732 }
 733
 734 static void vhost_iommu_region_add(MemoryListener *listener,
 735                                    MemoryRegionSection *section)
 736 {
 737     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 738                                          iommu_listener);
 739     struct vhost_iommu *iommu;
 740     Int128 end;
 741
 742     if (!memory_region_is_iommu(section->mr)) {
 743         return;
 744     }
 745
 746     iommu = g_malloc0(sizeof(*iommu));
 747     end = int128_add(int128_make64(section->offset_within_region),
 748                      section->size);
 749     end = int128_sub(end, int128_one());
 750     iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
 751                         IOMMU_NOTIFIER_UNMAP,
 752                         section->offset_within_region,
 753                         int128_get64(end));
 754     iommu->mr = section->mr;
 755     iommu->iommu_offset = section->offset_within_address_space -
 756                           section->offset_within_region;
 757     iommu->hdev = dev;
 758     memory_region_register_iommu_notifier(section->mr, &iommu->n);
 759     QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
 760     /* TODO: can replay help performance here? */
 761 }
 762
 763 static void vhost_iommu_region_del(MemoryListener *listener,
 764                                    MemoryRegionSection *section)
 765 {
 766     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 767                                          iommu_listener);
 768     struct vhost_iommu *iommu;
 769
 770     if (!memory_region_is_iommu(section->mr)) {
 771         return;
 772     }
 773
 774     QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
 775         if (iommu->mr == section->mr &&
 776             iommu->n.start == section->offset_within_region) {
 777             memory_region_unregister_iommu_notifier(iommu->mr,
 778                                                     &iommu->n);
 779             QLIST_REMOVE(iommu, iommu_next);
 780             g_free(iommu);
 781             break;
 782         }
 783     }
 784 }
 785
 786 static void vhost_region_nop(MemoryListener *listener,
 787                              MemoryRegionSection *section)
 788 {
 789 }
 790
 791 static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
 792                                     struct vhost_virtqueue *vq,
 793                                     unsigned idx, bool enable_log)
 794 {
 795     struct vhost_vring_addr addr = {
 796         .index = idx,
 797         .desc_user_addr = (uint64_t)(unsigned long)vq->desc,
 798         .avail_user_addr = (uint64_t)(unsigned long)vq->avail,
 799         .used_user_addr = (uint64_t)(unsigned long)vq->used,
 800         .log_guest_addr = vq->used_phys,
 801         .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
 802     };
 803     int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
 804     if (r < 0) {
 805         VHOST_OPS_DEBUG("vhost_set_vring_addr failed");
 806         return -errno;
 807     }
 808     return 0;
 809 }
 810
 811 static int vhost_dev_set_features(struct vhost_dev *dev,
 812                                   bool enable_log)
 813 {
 814     uint64_t features = dev->acked_features;
 815     int r;
 816     if (enable_log) {
 817         features |= 0x1ULL << VHOST_F_LOG_ALL;
 818     }
 819     r = dev->vhost_ops->vhost_set_features(dev, features);
 820     if (r < 0) {
 821         VHOST_OPS_DEBUG("vhost_set_features failed");
 822     }
 823     return r < 0 ? -errno : 0;
 824 }
 825
 826 static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
 827 {
 828     int r, i, idx;
 829     r = vhost_dev_set_features(dev, enable_log);
 830     if (r < 0) {
 831         goto err_features;
 832     }
 833     for (i = 0; i < dev->nvqs; ++i) {
 834         idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
 835         r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
 836                                      enable_log);
 837         if (r < 0) {
 838             goto err_vq;
 839         }
 840     }
 841     return 0;
 842 err_vq:
 843     for (; i >= 0; --i) {
 844         idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
 845         vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
 846                                  dev->log_enabled);
 847     }
 848     vhost_dev_set_features(dev, dev->log_enabled);
 849 err_features:
 850     return r;
 851 }
 852
 853 static int vhost_migration_log(MemoryListener *listener, int enable)
 854 {
 855     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 856                                          memory_listener);
 857     int r;
 858     if (!!enable == dev->log_enabled) {
 859         return 0;
 860     }
 861     if (!dev->started) {
 862         dev->log_enabled = enable;
 863         return 0;
 864     }
 865     if (!enable) {
 866         r = vhost_dev_set_log(dev, false);
 867         if (r < 0) {
 868             return r;
 869         }
 870         vhost_log_put(dev, false);
 871     } else {
 872         vhost_dev_log_resize(dev, vhost_get_log_size(dev));
 873         r = vhost_dev_set_log(dev, true);
 874         if (r < 0) {
 875             return r;
 876         }
 877     }
 878     dev->log_enabled = enable;
 879     return 0;
 880 }
 881
 882 static void vhost_log_global_start(MemoryListener *listener)
 883 {
 884     int r;
 885
 886     r = vhost_migration_log(listener, true);
 887     if (r < 0) {
 888         abort();
 889     }
 890 }
 891
 892 static void vhost_log_global_stop(MemoryListener *listener)
 893 {
 894     int r;
 895
 896     r = vhost_migration_log(listener, false);
 897     if (r < 0) {
 898         abort();
 899     }
 900 }
 901
/*
 * log_start callback installed in the vhost memory listener.
 * Currently a no-op placeholder.
 */
static void vhost_log_start(MemoryListener *listener,
                            MemoryRegionSection *section,
                            int old, int new)
{
    /* FIXME: implement */
}
 908
/*
 * log_stop callback installed in the vhost memory listener.
 * Currently a no-op placeholder.
 */
static void vhost_log_stop(MemoryListener *listener,
                           MemoryRegionSection *section,
                           int old, int new)
{
    /* FIXME: implement */
}
 915
 916 /* The vhost driver natively knows how to handle the vrings of non
 917  * cross-endian legacy devices and modern devices. Only legacy devices
 918  * exposed to a bi-endian guest may require the vhost driver to use a
 919  * specific endianness.
 920  */
 921 static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
 922 {
 923     if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
 924         return false;
 925     }
 926 #ifdef HOST_WORDS_BIGENDIAN
 927     return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
 928 #else
 929     return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
 930 #endif
 931 }
 932
 933 static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
 934                                                    bool is_big_endian,
 935                                                    int vhost_vq_index)
 936 {
 937     struct vhost_vring_state s = {
 938         .index = vhost_vq_index,
 939         .num = is_big_endian
 940     };
 941
 942     if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
 943         return 0;
 944     }
 945
 946     VHOST_OPS_DEBUG("vhost_set_vring_endian failed");
 947     if (errno == ENOTTY) {
 948         error_report("vhost does not support cross-endian");
 949         return -ENOSYS;
 950     }
 951
 952     return -errno;
 953 }
 954
 955 static int vhost_memory_region_lookup(struct vhost_dev *hdev,
 956                                       uint64_t gpa, uint64_t *uaddr,
 957                                       uint64_t *len)
 958 {
 959     int i;
 960
 961     for (i = 0; i < hdev->mem->nregions; i++) {
 962         struct vhost_memory_region *reg = hdev->mem->regions + i;
 963
 964         if (gpa >= reg->guest_phys_addr &&
 965             reg->guest_phys_addr + reg->memory_size > gpa) {
 966             *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
 967             *len = reg->guest_phys_addr + reg->memory_size - gpa;
 968             return 0;
 969         }
 970     }
 971
 972     return -EFAULT;
 973 }
 974
 975 int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
 976 {
 977     IOMMUTLBEntry iotlb;
 978     uint64_t uaddr, len;
 979     int ret = -EFAULT;
 980
 981     rcu_read_lock();
 982
 983     iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
 984                                           iova, write);
 985     if (iotlb.target_as != NULL) {
 986         ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
 987                                          &uaddr, &len);
 988         if (ret) {
 989             error_report("Fail to lookup the translated address "
 990                          "%"PRIx64, iotlb.translated_addr);
 991             goto out;
 992         }
 993
 994         len = MIN(iotlb.addr_mask + 1, len);
 995         iova = iova & ~iotlb.addr_mask;
 996
 997         ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
 998                                                 len, iotlb.perm);
 999         if (ret) {
1000             error_report("Fail to update device iotlb");
1001             goto out;
1002         }
1003     }
1004 out:
1005     rcu_read_unlock();
1006
1007     return ret;
1008 }
1009
1010 static int vhost_virtqueue_start(struct vhost_dev *dev,
1011                                 struct VirtIODevice *vdev,
1012                                 struct vhost_virtqueue *vq,
1013                                 unsigned idx)
1014 {
1015     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1016     VirtioBusState *vbus = VIRTIO_BUS(qbus);
1017     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
1018     hwaddr s, l, a;
1019     int r;
1020     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
1021     struct vhost_vring_file file = {
1022         .index = vhost_vq_index
1023     };
1024     struct vhost_vring_state state = {
1025         .index = vhost_vq_index
1026     };
1027     struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
1028
1029
1030     vq->num = state.num = virtio_queue_get_num(vdev, idx);
1031     r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
1032     if (r) {
1033         VHOST_OPS_DEBUG("vhost_set_vring_num failed");
1034         return -errno;
1035     }
1036
1037     state.num = virtio_queue_get_last_avail_idx(vdev, idx);
1038     r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
1039     if (r) {
1040         VHOST_OPS_DEBUG("vhost_set_vring_base failed");
1041         return -errno;
1042     }
1043
1044     if (vhost_needs_vring_endian(vdev)) {
1045         r = vhost_virtqueue_set_vring_endian_legacy(dev,
1046                                                     virtio_is_big_endian(vdev),
1047                                                     vhost_vq_index);
1048         if (r) {
1049             return -errno;
1050         }
1051     }
1052
1053     vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
1054     vq->desc_phys = a = virtio_queue_get_desc_addr(vdev, idx);
1055     vq->desc = vhost_memory_map(dev, a, &l, 0);
1056     if (!vq->desc || l != s) {
1057         r = -ENOMEM;
1058         goto fail_alloc_desc;
1059     }
1060     vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
1061     vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
1062     vq->avail = vhost_memory_map(dev, a, &l, 0);
1063     if (!vq->avail || l != s) {
1064         r = -ENOMEM;
1065         goto fail_alloc_avail;
1066     }
1067     vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
1068     vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
1069     vq->used = vhost_memory_map(dev, a, &l, 1);
1070     if (!vq->used || l != s) {
1071         r = -ENOMEM;
1072         goto fail_alloc_used;
1073     }
1074
1075     r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
1076     if (r < 0) {
1077         r = -errno;
1078         goto fail_alloc;
1079     }
1080
1081     file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
1082     r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
1083     if (r) {
1084         VHOST_OPS_DEBUG("vhost_set_vring_kick failed");
1085         r = -errno;
1086         goto fail_kick;
1087     }
1088
1089     /* Clear and discard previous events if any. */
1090     event_notifier_test_and_clear(&vq->masked_notifier);
1091
1092     /* Init vring in unmasked state, unless guest_notifier_mask
1093      * will do it later.
1094      */
1095     if (!vdev->use_guest_notifier_mask) {
1096         /* TODO: check and handle errors. */
1097         vhost_virtqueue_mask(dev, vdev, idx, false);
1098     }
1099
1100     if (k->query_guest_notifiers &&
1101         k->query_guest_notifiers(qbus->parent) &&
1102         virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
1103         file.fd = -1;
1104         r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
1105         if (r) {
1106             goto fail_vector;
1107         }
1108     }
1109
1110     return 0;
1111
1112 fail_vector:
1113 fail_kick:
1114 fail_alloc:
1115     vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
1116                        0, 0);
1117 fail_alloc_used:
1118     vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
1119                        0, 0);
1120 fail_alloc_avail:
1121     vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
1122                        0, 0);
1123 fail_alloc_desc:
1124     return r;
1125 }
1126
1127 static void vhost_virtqueue_stop(struct vhost_dev *dev,
1128                                     struct VirtIODevice *vdev,
1129                                     struct vhost_virtqueue *vq,
1130                                     unsigned idx)
1131 {
1132     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
1133     struct vhost_vring_state state = {
1134         .index = vhost_vq_index,
1135     };
1136     int r;
1137
1138     r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
1139     if (r < 0) {
1140         VHOST_OPS_DEBUG("vhost VQ %d ring restore failed: %d", idx, r);
1141         /* Connection to the backend is broken, so let's sync internal
1142          * last avail idx to the device used idx.
1143          */
1144         virtio_queue_restore_last_avail_idx(vdev, idx);
1145     } else {
1146         virtio_queue_set_last_avail_idx(vdev, idx, state.num);
1147     }
1148     virtio_queue_invalidate_signalled_used(vdev, idx);
1149     virtio_queue_update_used_idx(vdev, idx);
1150
1151     /* In the cross-endian case, we need to reset the vring endianness to
1152      * native as legacy devices expect so by default.
1153      */
1154     if (vhost_needs_vring_endian(vdev)) {
1155         vhost_virtqueue_set_vring_endian_legacy(dev,
1156                                                 !virtio_is_big_endian(vdev),
1157                                                 vhost_vq_index);
1158     }
1159
1160     vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
1161                        1, virtio_queue_get_used_size(vdev, idx));
1162     vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
1163                        0, virtio_queue_get_avail_size(vdev, idx));
1164     vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
1165                        0, virtio_queue_get_desc_size(vdev, idx));
1166 }
1167
/*
 * eventfd_add callback installed in the vhost memory listener.
 * Intentionally empty: no action is taken here.
 */
static void vhost_eventfd_add(MemoryListener *listener,
                              MemoryRegionSection *section,
                              bool match_data, uint64_t data, EventNotifier *e)
{
}
1173
/*
 * eventfd_del callback installed in the vhost memory listener.
 * Intentionally empty: no action is taken here.
 */
static void vhost_eventfd_del(MemoryListener *listener,
                              MemoryRegionSection *section,
                              bool match_data, uint64_t data, EventNotifier *e)
{
}
1179
1180 static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
1181                                                 int n, uint32_t timeout)
1182 {
1183     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1184     struct vhost_vring_state state = {
1185         .index = vhost_vq_index,
1186         .num = timeout,
1187     };
1188     int r;
1189
1190     if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
1191         return -EINVAL;
1192     }
1193
1194     r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
1195     if (r) {
1196         VHOST_OPS_DEBUG("vhost_set_vring_busyloop_timeout failed");
1197         return r;
1198     }
1199
1200     return 0;
1201 }
1202
1203 static int vhost_virtqueue_init(struct vhost_dev *dev,
1204                                 struct vhost_virtqueue *vq, int n)
1205 {
1206     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1207     struct vhost_vring_file file = {
1208         .index = vhost_vq_index,
1209     };
1210     int r = event_notifier_init(&vq->masked_notifier, 0);
1211     if (r < 0) {
1212         return r;
1213     }
1214
1215     file.fd = event_notifier_get_fd(&vq->masked_notifier);
1216     r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
1217     if (r) {
1218         VHOST_OPS_DEBUG("vhost_set_vring_call failed");
1219         r = -errno;
1220         goto fail_call;
1221     }
1222
1223     vq->dev = dev;
1224
1225     return 0;
1226 fail_call:
1227     event_notifier_cleanup(&vq->masked_notifier);
1228     return r;
1229 }
1230
/* Release the masked notifier set up by vhost_virtqueue_init(). */
static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
{
    event_notifier_cleanup(&vq->masked_notifier);
}
1235
1236 int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
1237                    VhostBackendType backend_type, uint32_t busyloop_timeout)
1238 {
1239     uint64_t features;
1240     int i, r, n_initialized_vqs = 0;
1241     Error *local_err = NULL;
1242
1243     hdev->vdev = NULL;
1244     hdev->migration_blocker = NULL;
1245
1246     r = vhost_set_backend_type(hdev, backend_type);
1247     assert(r >= 0);
1248
1249     r = hdev->vhost_ops->vhost_backend_init(hdev, opaque);
1250     if (r < 0) {
1251         goto fail;
1252     }
1253
1254     if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
1255         error_report("vhost backend memory slots limit is less"
1256                 " than current number of present memory slots");
1257         r = -1;
1258         goto fail;
1259     }
1260
1261     r = hdev->vhost_ops->vhost_set_owner(hdev);
1262     if (r < 0) {
1263         VHOST_OPS_DEBUG("vhost_set_owner failed");
1264         goto fail;
1265     }
1266
1267     r = hdev->vhost_ops->vhost_get_features(hdev, &features);
1268     if (r < 0) {
1269         VHOST_OPS_DEBUG("vhost_get_features failed");
1270         goto fail;
1271     }
1272
1273     for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
1274         r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
1275         if (r < 0) {
1276             goto fail;
1277         }
1278     }
1279
1280     if (busyloop_timeout) {
1281         for (i = 0; i < hdev->nvqs; ++i) {
1282             r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
1283                                                      busyloop_timeout);
1284             if (r < 0) {
1285                 goto fail_busyloop;
1286             }
1287         }
1288     }
1289
1290     hdev->features = features;
1291
1292     hdev->memory_listener = (MemoryListener) {
1293         .begin = vhost_begin,
1294         .commit = vhost_commit,
1295         .region_add = vhost_region_add,
1296         .region_del = vhost_region_del,
1297         .region_nop = vhost_region_nop,
1298         .log_start = vhost_log_start,
1299         .log_stop = vhost_log_stop,
1300         .log_sync = vhost_log_sync,
1301         .log_global_start = vhost_log_global_start,
1302         .log_global_stop = vhost_log_global_stop,
1303         .eventfd_add = vhost_eventfd_add,
1304         .eventfd_del = vhost_eventfd_del,
1305         .priority = 10
1306     };
1307
1308     hdev->iommu_listener = (MemoryListener) {
1309         .region_add = vhost_iommu_region_add,
1310         .region_del = vhost_iommu_region_del,
1311     };
1312
1313     if (hdev->migration_blocker == NULL) {
1314         if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
1315             error_setg(&hdev->migration_blocker,
1316                        "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
1317         } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_check()) {
1318             error_setg(&hdev->migration_blocker,
1319                        "Migration disabled: failed to allocate shared memory");
1320         }
1321     }
1322
1323     if (hdev->migration_blocker != NULL) {
1324         r = migrate_add_blocker(hdev->migration_blocker, &local_err);
1325         if (local_err) {
1326             error_report_err(local_err);
1327             error_free(hdev->migration_blocker);
1328             goto fail_busyloop;
1329         }
1330     }
1331
1332     hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
1333     hdev->n_mem_sections = 0;
1334     hdev->mem_sections = NULL;
1335     hdev->log = NULL;
1336     hdev->log_size = 0;
1337     hdev->log_enabled = false;
1338     hdev->started = false;
1339     hdev->memory_changed = false;
1340     memory_listener_register(&hdev->memory_listener, &address_space_memory);
1341     QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
1342     return 0;
1343
1344 fail_busyloop:
1345     while (--i >= 0) {
1346         vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
1347     }
1348 fail:
1349     hdev->nvqs = n_initialized_vqs;
1350     vhost_dev_cleanup(hdev);
1351     return r;
1352 }
1353
/*
 * Tear down a vhost device and zero the whole structure.
 *
 * Releases the per-virtqueue notifiers, unregisters the memory listener and
 * drops the memory region references (only when hdev->mem is set), removes
 * and frees the migration blocker if one is present, frees the memory
 * tables and finally calls the backend cleanup hook.
 */
void vhost_dev_cleanup(struct vhost_dev *hdev)
{
    int i;

    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_cleanup(hdev->vqs + i);
    }
    if (hdev->mem) {
        /* those are only safe after successful init */
        memory_listener_unregister(&hdev->memory_listener);
        for (i = 0; i < hdev->n_mem_sections; ++i) {
            MemoryRegionSection *section = &hdev->mem_sections[i];
            memory_region_unref(section->mr);
        }
        QLIST_REMOVE(hdev, entry);
    }
    if (hdev->migration_blocker) {
        migrate_del_blocker(hdev->migration_blocker);
        error_free(hdev->migration_blocker);
    }
    g_free(hdev->mem);
    g_free(hdev->mem_sections);
    if (hdev->vhost_ops) {
        hdev->vhost_ops->vhost_backend_cleanup(hdev);
    }
    /* The log must already have been released by this point. */
    assert(!hdev->log);

    /* Leave the structure fully zeroed, including vhost_ops. */
    memset(hdev, 0, sizeof(struct vhost_dev));
}
1383
1384 /* Stop processing guest IO notifications in qemu.
1385  * Start processing them in vhost in kernel.
1386  */
1387 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1388 {
1389     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1390     int i, r, e;
1391
1392     /* We will pass the notifiers to the kernel, make sure that QEMU
1393      * doesn't interfere.
1394      */
1395     r = virtio_device_grab_ioeventfd(vdev);
1396     if (r < 0) {
1397         error_report("binding does not support host notifiers");
1398         goto fail;
1399     }
1400
1401     for (i = 0; i < hdev->nvqs; ++i) {
1402         r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1403                                          true);
1404         if (r < 0) {
1405             error_report("vhost VQ %d notifier binding failed: %d", i, -r);
1406             goto fail_vq;
1407         }
1408     }
1409
1410     return 0;
1411 fail_vq:
1412     while (--i >= 0) {
1413         e = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1414                                          false);
1415         if (e < 0) {
1416             error_report("vhost VQ %d notifier cleanup error: %d", i, -r);
1417         }
1418         assert (e >= 0);
1419     }
1420     virtio_device_release_ioeventfd(vdev);
1421 fail:
1422     return r;
1423 }
1424
1425 /* Stop processing guest IO notifications in vhost.
1426  * Start processing them in qemu.
1427  * This might actually run the qemu handlers right away,
1428  * so virtio in qemu must be completely setup when this is called.
1429  */
1430 void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1431 {
1432     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1433     int i, r;
1434
1435     for (i = 0; i < hdev->nvqs; ++i) {
1436         r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1437                                          false);
1438         if (r < 0) {
1439             error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
1440         }
1441         assert (r >= 0);
1442     }
1443     virtio_device_release_ioeventfd(vdev);
1444 }
1445
1446 /* Test and clear event pending status.
1447  * Should be called after unmask to avoid losing events.
1448  */
1449 bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
1450 {
1451     struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
1452     assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
1453     return event_notifier_test_and_clear(&vq->masked_notifier);
1454 }
1455
1456 /* Mask/unmask events from this vq. */
1457 void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
1458                          bool mask)
1459 {
1460     struct VirtQueue *vvq = virtio_get_queue(vdev, n);
1461     int r, index = n - hdev->vq_index;
1462     struct vhost_vring_file file;
1463
1464     /* should only be called after backend is connected */
1465     assert(hdev->vhost_ops);
1466
1467     if (mask) {
1468         assert(vdev->use_guest_notifier_mask);
1469         file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
1470     } else {
1471         file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
1472     }
1473
1474     file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
1475     r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
1476     if (r < 0) {
1477         VHOST_OPS_DEBUG("vhost_set_vring_call failed");
1478     }
1479 }
1480
1481 uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
1482                             uint64_t features)
1483 {
1484     const int *bit = feature_bits;
1485     while (*bit != VHOST_INVALID_FEATURE_BIT) {
1486         uint64_t bit_mask = (1ULL << *bit);
1487         if (!(hdev->features & bit_mask)) {
1488             features &= ~bit_mask;
1489         }
1490         bit++;
1491     }
1492     return features;
1493 }
1494
1495 void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
1496                         uint64_t features)
1497 {
1498     const int *bit = feature_bits;
1499     while (*bit != VHOST_INVALID_FEATURE_BIT) {
1500         uint64_t bit_mask = (1ULL << *bit);
1501         if (features & bit_mask) {
1502             hdev->acked_features |= bit_mask;
1503         }
1504         bit++;
1505     }
1506 }
1507
1508 /* Host notifiers must be enabled at this point. */
1509 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
1510 {
1511     int i, r;
1512
1513     /* should only be called after backend is connected */
1514     assert(hdev->vhost_ops);
1515
1516     hdev->started = true;
1517     hdev->vdev = vdev;
1518
1519     r = vhost_dev_set_features(hdev, hdev->log_enabled);
1520     if (r < 0) {
1521         goto fail_features;
1522     }
1523
1524     if (vhost_dev_has_iommu(hdev)) {
1525         memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
1526     }
1527
1528     r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
1529     if (r < 0) {
1530         VHOST_OPS_DEBUG("vhost_set_mem_table failed");
1531         r = -errno;
1532         goto fail_mem;
1533     }
1534     for (i = 0; i < hdev->nvqs; ++i) {
1535         r = vhost_virtqueue_start(hdev,
1536                                   vdev,
1537                                   hdev->vqs + i,
1538                                   hdev->vq_index + i);
1539         if (r < 0) {
1540             goto fail_vq;
1541         }
1542     }
1543
1544     if (hdev->log_enabled) {
1545         uint64_t log_base;
1546
1547         hdev->log_size = vhost_get_log_size(hdev);
1548         hdev->log = vhost_log_get(hdev->log_size,
1549                                   vhost_dev_log_is_shared(hdev));
1550         log_base = (uintptr_t)hdev->log->log;
1551         r = hdev->vhost_ops->vhost_set_log_base(hdev,
1552                                                 hdev->log_size ? log_base : 0,
1553                                                 hdev->log);
1554         if (r < 0) {
1555             VHOST_OPS_DEBUG("vhost_set_log_base failed");
1556             r = -errno;
1557             goto fail_log;
1558         }
1559     }
1560
1561     if (vhost_dev_has_iommu(hdev)) {
1562         hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
1563
1564         /* Update used ring information for IOTLB to work correctly,
1565          * vhost-kernel code requires for this.*/
1566         for (i = 0; i < hdev->nvqs; ++i) {
1567             struct vhost_virtqueue *vq = hdev->vqs + i;
1568             vhost_device_iotlb_miss(hdev, vq->used_phys, true);
1569         }
1570     }
1571     return 0;
1572 fail_log:
1573     vhost_log_put(hdev, false);
1574 fail_vq:
1575     while (--i >= 0) {
1576         vhost_virtqueue_stop(hdev,
1577                              vdev,
1578                              hdev->vqs + i,
1579                              hdev->vq_index + i);
1580     }
1581     i = hdev->nvqs;
1582
1583 fail_mem:
1584 fail_features:
1585
1586     hdev->started = false;
1587     return r;
1588 }
1589
1590 /* Host notifiers must be enabled at this point. */
1591 void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
1592 {
1593     int i;
1594
1595     /* should only be called after backend is connected */
1596     assert(hdev->vhost_ops);
1597
1598     for (i = 0; i < hdev->nvqs; ++i) {
1599         vhost_virtqueue_stop(hdev,
1600                              vdev,
1601                              hdev->vqs + i,
1602                              hdev->vq_index + i);
1603     }
1604
1605     if (vhost_dev_has_iommu(hdev)) {
1606         hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
1607         memory_listener_unregister(&hdev->iommu_listener);
1608     }
1609     vhost_log_put(hdev, true);
1610     hdev->started = false;
1611     hdev->vdev = NULL;
1612 }
1613
1614 int vhost_net_set_backend(struct vhost_dev *hdev,
1615                           struct vhost_vring_file *file)
1616 {
1617     if (hdev->vhost_ops->vhost_net_set_backend) {
1618         return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
1619     }
1620
1621     return -1;
1622 }