hw/virtio/vhost.c

   1 /*
   2  * vhost support
   3  *
   4  * Copyright Red Hat, Inc. 2010
   5  *
   6  * Authors:
   7  *  Michael S. Tsirkin <mst@redhat.com>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2.  See
  10  * the COPYING file in the top-level directory.
  11  *
  12  * Contributions after 2012-01-13 are licensed under the terms of the
  13  * GNU GPL, version 2 or (at your option) any later version.
  14  */
  15
  16 #include "qemu/osdep.h"
  17 #include "qapi/error.h"
  18 #include "hw/virtio/vhost.h"
  19 #include "hw/hw.h"
  20 #include "qemu/atomic.h"
  21 #include "qemu/range.h"
  22 #include "qemu/error-report.h"
  23 #include "qemu/memfd.h"
  24 #include <linux/vhost.h>
  25 #include "exec/address-spaces.h"
  26 #include "hw/virtio/virtio-bus.h"
  27 #include "hw/virtio/virtio-access.h"
  28 #include "migration/migration.h"
  29
/* Current private (heap-allocated) dirty log handed out by
 * vhost_log_get(size, false); NULL when none is installed. */
static struct vhost_log *vhost_log;
/* Current shared dirty log handed out by vhost_log_get(size, true);
 * NULL when none is installed. */
static struct vhost_log *vhost_log_shm;

/* Number of memory regions in the table of the device that last went
 * through vhost_set_memory(); compared against backend limits in
 * vhost_has_free_slot(). */
static unsigned int used_memslots;
/* List of vhost devices walked by vhost_has_free_slot(). */
static QLIST_HEAD(, vhost_dev) vhost_devices =
    QLIST_HEAD_INITIALIZER(vhost_devices);
  36
  37 bool vhost_has_free_slot(void)
  38 {
  39     unsigned int slots_limit = ~0U;
  40     struct vhost_dev *hdev;
  41
  42     QLIST_FOREACH(hdev, &vhost_devices, entry) {
  43         unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
  44         slots_limit = MIN(slots_limit, r);
  45     }
  46     return slots_limit > used_memslots;
  47 }
  48
  49 static void vhost_dev_sync_region(struct vhost_dev *dev,
  50                                   MemoryRegionSection *section,
  51                                   uint64_t mfirst, uint64_t mlast,
  52                                   uint64_t rfirst, uint64_t rlast)
  53 {
  54     vhost_log_chunk_t *log = dev->log->log;
  55
  56     uint64_t start = MAX(mfirst, rfirst);
  57     uint64_t end = MIN(mlast, rlast);
  58     vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
  59     vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
  60     uint64_t addr = (start / VHOST_LOG_CHUNK) * VHOST_LOG_CHUNK;
  61
  62     if (end < start) {
  63         return;
  64     }
  65     assert(end / VHOST_LOG_CHUNK < dev->log_size);
  66     assert(start / VHOST_LOG_CHUNK < dev->log_size);
  67
  68     for (;from < to; ++from) {
  69         vhost_log_chunk_t log;
  70         /* We first check with non-atomic: much cheaper,
  71          * and we expect non-dirty to be the common case. */
  72         if (!*from) {
  73             addr += VHOST_LOG_CHUNK;
  74             continue;
  75         }
  76         /* Data must be read atomically. We don't really need barrier semantics
  77          * but it's easier to use atomic_* than roll our own. */
  78         log = atomic_xchg(from, 0);
  79         while (log) {
  80             int bit = ctzl(log);
  81             hwaddr page_addr;
  82             hwaddr section_offset;
  83             hwaddr mr_offset;
  84             page_addr = addr + bit * VHOST_LOG_PAGE;
  85             section_offset = page_addr - section->offset_within_address_space;
  86             mr_offset = section_offset + section->offset_within_region;
  87             memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
  88             log &= ~(0x1ull << bit);
  89         }
  90         addr += VHOST_LOG_CHUNK;
  91     }
  92 }
  93
  94 static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
  95                                    MemoryRegionSection *section,
  96                                    hwaddr first,
  97                                    hwaddr last)
  98 {
  99     int i;
 100     hwaddr start_addr;
 101     hwaddr end_addr;
 102
 103     if (!dev->log_enabled || !dev->started) {
 104         return 0;
 105     }
 106     start_addr = section->offset_within_address_space;
 107     end_addr = range_get_last(start_addr, int128_get64(section->size));
 108     start_addr = MAX(first, start_addr);
 109     end_addr = MIN(last, end_addr);
 110
 111     for (i = 0; i < dev->mem->nregions; ++i) {
 112         struct vhost_memory_region *reg = dev->mem->regions + i;
 113         vhost_dev_sync_region(dev, section, start_addr, end_addr,
 114                               reg->guest_phys_addr,
 115                               range_get_last(reg->guest_phys_addr,
 116                                              reg->memory_size));
 117     }
 118     for (i = 0; i < dev->nvqs; ++i) {
 119         struct vhost_virtqueue *vq = dev->vqs + i;
 120         vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
 121                               range_get_last(vq->used_phys, vq->used_size));
 122     }
 123     return 0;
 124 }
 125
 126 static void vhost_log_sync(MemoryListener *listener,
 127                           MemoryRegionSection *section)
 128 {
 129     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 130                                          memory_listener);
 131     vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
 132 }
 133
 134 static void vhost_log_sync_range(struct vhost_dev *dev,
 135                                  hwaddr first, hwaddr last)
 136 {
 137     int i;
 138     /* FIXME: this is N^2 in number of sections */
 139     for (i = 0; i < dev->n_mem_sections; ++i) {
 140         MemoryRegionSection *section = &dev->mem_sections[i];
 141         vhost_sync_dirty_bitmap(dev, section, first, last);
 142     }
 143 }
 144
 145 /* Assign/unassign. Keep an unsorted array of non-overlapping
 146  * memory regions in dev->mem. */
 147 static void vhost_dev_unassign_memory(struct vhost_dev *dev,
 148                                       uint64_t start_addr,
 149                                       uint64_t size)
 150 {
 151     int from, to, n = dev->mem->nregions;
 152     /* Track overlapping/split regions for sanity checking. */
 153     int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;
 154
 155     for (from = 0, to = 0; from < n; ++from, ++to) {
 156         struct vhost_memory_region *reg = dev->mem->regions + to;
 157         uint64_t reglast;
 158         uint64_t memlast;
 159         uint64_t change;
 160
 161         /* clone old region */
 162         if (to != from) {
 163             memcpy(reg, dev->mem->regions + from, sizeof *reg);
 164         }
 165
 166         /* No overlap is simple */
 167         if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
 168                             start_addr, size)) {
 169             continue;
 170         }
 171
 172         /* Split only happens if supplied region
 173          * is in the middle of an existing one. Thus it can not
 174          * overlap with any other existing region. */
 175         assert(!split);
 176
 177         reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
 178         memlast = range_get_last(start_addr, size);
 179
 180         /* Remove whole region */
 181         if (start_addr <= reg->guest_phys_addr && memlast >= reglast) {
 182             --dev->mem->nregions;
 183             --to;
 184             ++overlap_middle;
 185             continue;
 186         }
 187
 188         /* Shrink region */
 189         if (memlast >= reglast) {
 190             reg->memory_size = start_addr - reg->guest_phys_addr;
 191             assert(reg->memory_size);
 192             assert(!overlap_end);
 193             ++overlap_end;
 194             continue;
 195         }
 196
 197         /* Shift region */
 198         if (start_addr <= reg->guest_phys_addr) {
 199             change = memlast + 1 - reg->guest_phys_addr;
 200             reg->memory_size -= change;
 201             reg->guest_phys_addr += change;
 202             reg->userspace_addr += change;
 203             assert(reg->memory_size);
 204             assert(!overlap_start);
 205             ++overlap_start;
 206             continue;
 207         }
 208
 209         /* This only happens if supplied region
 210          * is in the middle of an existing one. Thus it can not
 211          * overlap with any other existing region. */
 212         assert(!overlap_start);
 213         assert(!overlap_end);
 214         assert(!overlap_middle);
 215         /* Split region: shrink first part, shift second part. */
 216         memcpy(dev->mem->regions + n, reg, sizeof *reg);
 217         reg->memory_size = start_addr - reg->guest_phys_addr;
 218         assert(reg->memory_size);
 219         change = memlast + 1 - reg->guest_phys_addr;
 220         reg = dev->mem->regions + n;
 221         reg->memory_size -= change;
 222         assert(reg->memory_size);
 223         reg->guest_phys_addr += change;
 224         reg->userspace_addr += change;
 225         /* Never add more than 1 region */
 226         assert(dev->mem->nregions == n);
 227         ++dev->mem->nregions;
 228         ++split;
 229     }
 230 }
 231
 232 /* Called after unassign, so no regions overlap the given range. */
 233 static void vhost_dev_assign_memory(struct vhost_dev *dev,
 234                                     uint64_t start_addr,
 235                                     uint64_t size,
 236                                     uint64_t uaddr)
 237 {
 238     int from, to;
 239     struct vhost_memory_region *merged = NULL;
 240     for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
 241         struct vhost_memory_region *reg = dev->mem->regions + to;
 242         uint64_t prlast, urlast;
 243         uint64_t pmlast, umlast;
 244         uint64_t s, e, u;
 245
 246         /* clone old region */
 247         if (to != from) {
 248             memcpy(reg, dev->mem->regions + from, sizeof *reg);
 249         }
 250         prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
 251         pmlast = range_get_last(start_addr, size);
 252         urlast = range_get_last(reg->userspace_addr, reg->memory_size);
 253         umlast = range_get_last(uaddr, size);
 254
 255         /* check for overlapping regions: should never happen. */
 256         assert(prlast < start_addr || pmlast < reg->guest_phys_addr);
 257         /* Not an adjacent or overlapping region - do not merge. */
 258         if ((prlast + 1 != start_addr || urlast + 1 != uaddr) &&
 259             (pmlast + 1 != reg->guest_phys_addr ||
 260              umlast + 1 != reg->userspace_addr)) {
 261             continue;
 262         }
 263
 264         if (dev->vhost_ops->vhost_backend_can_merge &&
 265             !dev->vhost_ops->vhost_backend_can_merge(dev, uaddr, size,
 266                                                      reg->userspace_addr,
 267                                                      reg->memory_size)) {
 268             continue;
 269         }
 270
 271         if (merged) {
 272             --to;
 273             assert(to >= 0);
 274         } else {
 275             merged = reg;
 276         }
 277         u = MIN(uaddr, reg->userspace_addr);
 278         s = MIN(start_addr, reg->guest_phys_addr);
 279         e = MAX(pmlast, prlast);
 280         uaddr = merged->userspace_addr = u;
 281         start_addr = merged->guest_phys_addr = s;
 282         size = merged->memory_size = e - s + 1;
 283         assert(merged->memory_size);
 284     }
 285
 286     if (!merged) {
 287         struct vhost_memory_region *reg = dev->mem->regions + to;
 288         memset(reg, 0, sizeof *reg);
 289         reg->memory_size = size;
 290         assert(reg->memory_size);
 291         reg->guest_phys_addr = start_addr;
 292         reg->userspace_addr = uaddr;
 293         ++to;
 294     }
 295     assert(to <= dev->mem->nregions + 1);
 296     dev->mem->nregions = to;
 297 }
 298
 299 static uint64_t vhost_get_log_size(struct vhost_dev *dev)
 300 {
 301     uint64_t log_size = 0;
 302     int i;
 303     for (i = 0; i < dev->mem->nregions; ++i) {
 304         struct vhost_memory_region *reg = dev->mem->regions + i;
 305         uint64_t last = range_get_last(reg->guest_phys_addr,
 306                                        reg->memory_size);
 307         log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
 308     }
 309     for (i = 0; i < dev->nvqs; ++i) {
 310         struct vhost_virtqueue *vq = dev->vqs + i;
 311         uint64_t last = vq->used_phys + vq->used_size - 1;
 312         log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
 313     }
 314     return log_size;
 315 }
 316
 317 static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
 318 {
 319     struct vhost_log *log;
 320     uint64_t logsize = size * sizeof(*(log->log));
 321     int fd = -1;
 322
 323     log = g_new0(struct vhost_log, 1);
 324     if (share) {
 325         log->log = qemu_memfd_alloc("vhost-log", logsize,
 326                                     F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
 327                                     &fd);
 328         memset(log->log, 0, logsize);
 329     } else {
 330         log->log = g_malloc0(logsize);
 331     }
 332
 333     log->size = size;
 334     log->refcnt = 1;
 335     log->fd = fd;
 336
 337     return log;
 338 }
 339
 340 static struct vhost_log *vhost_log_get(uint64_t size, bool share)
 341 {
 342     struct vhost_log *log = share ? vhost_log_shm : vhost_log;
 343
 344     if (!log || log->size != size) {
 345         log = vhost_log_alloc(size, share);
 346         if (share) {
 347             vhost_log_shm = log;
 348         } else {
 349             vhost_log = log;
 350         }
 351     } else {
 352         ++log->refcnt;
 353     }
 354
 355     return log;
 356 }
 357
 358 static void vhost_log_put(struct vhost_dev *dev, bool sync)
 359 {
 360     struct vhost_log *log = dev->log;
 361
 362     if (!log) {
 363         return;
 364     }
 365
 366     --log->refcnt;
 367     if (log->refcnt == 0) {
 368         /* Sync only the range covered by the old log */
 369         if (dev->log_size && sync) {
 370             vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
 371         }
 372
 373         if (vhost_log == log) {
 374             g_free(log->log);
 375             vhost_log = NULL;
 376         } else if (vhost_log_shm == log) {
 377             qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
 378                             log->fd);
 379             vhost_log_shm = NULL;
 380         }
 381
 382         g_free(log);
 383     }
 384 }
 385
 386 static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
 387 {
 388     return dev->vhost_ops->vhost_requires_shm_log &&
 389            dev->vhost_ops->vhost_requires_shm_log(dev);
 390 }
 391
 392 static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
 393 {
 394     struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
 395     uint64_t log_base = (uintptr_t)log->log;
 396     int r;
 397
 398     /* inform backend of log switching, this must be done before
 399        releasing the current log, to ensure no logging is lost */
 400     r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
 401     assert(r >= 0);
 402     vhost_log_put(dev, true);
 403     dev->log = log;
 404     dev->log_size = size;
 405 }
 406
 407 static int vhost_verify_ring_mappings(struct vhost_dev *dev,
 408                                       uint64_t start_addr,
 409                                       uint64_t size)
 410 {
 411     int i;
 412     int r = 0;
 413
 414     for (i = 0; !r && i < dev->nvqs; ++i) {
 415         struct vhost_virtqueue *vq = dev->vqs + i;
 416         hwaddr l;
 417         void *p;
 418
 419         if (!ranges_overlap(start_addr, size, vq->ring_phys, vq->ring_size)) {
 420             continue;
 421         }
 422         l = vq->ring_size;
 423         p = cpu_physical_memory_map(vq->ring_phys, &l, 1);
 424         if (!p || l != vq->ring_size) {
 425             fprintf(stderr, "Unable to map ring buffer for ring %d\n", i);
 426             r = -ENOMEM;
 427         }
 428         if (p != vq->ring) {
 429             fprintf(stderr, "Ring buffer relocated for ring %d\n", i);
 430             r = -EBUSY;
 431         }
 432         cpu_physical_memory_unmap(p, l, 0, 0);
 433     }
 434     return r;
 435 }
 436
 437 static struct vhost_memory_region *vhost_dev_find_reg(struct vhost_dev *dev,
 438                                                       uint64_t start_addr,
 439                                                       uint64_t size)
 440 {
 441     int i, n = dev->mem->nregions;
 442     for (i = 0; i < n; ++i) {
 443         struct vhost_memory_region *reg = dev->mem->regions + i;
 444         if (ranges_overlap(reg->guest_phys_addr, reg->memory_size,
 445                            start_addr, size)) {
 446             return reg;
 447         }
 448     }
 449     return NULL;
 450 }
 451
 452 static bool vhost_dev_cmp_memory(struct vhost_dev *dev,
 453                                  uint64_t start_addr,
 454                                  uint64_t size,
 455                                  uint64_t uaddr)
 456 {
 457     struct vhost_memory_region *reg = vhost_dev_find_reg(dev, start_addr, size);
 458     uint64_t reglast;
 459     uint64_t memlast;
 460
 461     if (!reg) {
 462         return true;
 463     }
 464
 465     reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
 466     memlast = range_get_last(start_addr, size);
 467
 468     /* Need to extend region? */
 469     if (start_addr < reg->guest_phys_addr || memlast > reglast) {
 470         return true;
 471     }
 472     /* userspace_addr changed? */
 473     return uaddr != reg->userspace_addr + start_addr - reg->guest_phys_addr;
 474 }
 475
 476 static void vhost_set_memory(MemoryListener *listener,
 477                              MemoryRegionSection *section,
 478                              bool add)
 479 {
 480     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 481                                          memory_listener);
 482     hwaddr start_addr = section->offset_within_address_space;
 483     ram_addr_t size = int128_get64(section->size);
 484     bool log_dirty =
 485         memory_region_get_dirty_log_mask(section->mr) & ~(1 << DIRTY_MEMORY_MIGRATION);
 486     int s = offsetof(struct vhost_memory, regions) +
 487         (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
 488     void *ram;
 489
 490     dev->mem = g_realloc(dev->mem, s);
 491
 492     if (log_dirty) {
 493         add = false;
 494     }
 495
 496     assert(size);
 497
 498     /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
 499     ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
 500     if (add) {
 501         if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
 502             /* Region exists with same address. Nothing to do. */
 503             return;
 504         }
 505     } else {
 506         if (!vhost_dev_find_reg(dev, start_addr, size)) {
 507             /* Removing region that we don't access. Nothing to do. */
 508             return;
 509         }
 510     }
 511
 512     vhost_dev_unassign_memory(dev, start_addr, size);
 513     if (add) {
 514         /* Add given mapping, merging adjacent regions if any */
 515         vhost_dev_assign_memory(dev, start_addr, size, (uintptr_t)ram);
 516     } else {
 517         /* Remove old mapping for this memory, if any. */
 518         vhost_dev_unassign_memory(dev, start_addr, size);
 519     }
 520     dev->mem_changed_start_addr = MIN(dev->mem_changed_start_addr, start_addr);
 521     dev->mem_changed_end_addr = MAX(dev->mem_changed_end_addr, start_addr + size - 1);
 522     dev->memory_changed = true;
 523     used_memslots = dev->mem->nregions;
 524 }
 525
 526 static bool vhost_section(MemoryRegionSection *section)
 527 {
 528     return memory_region_is_ram(section->mr);
 529 }
 530
 531 static void vhost_begin(MemoryListener *listener)
 532 {
 533     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 534                                          memory_listener);
 535     dev->mem_changed_end_addr = 0;
 536     dev->mem_changed_start_addr = -1;
 537 }
 538
 539 static void vhost_commit(MemoryListener *listener)
 540 {
 541     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 542                                          memory_listener);
 543     hwaddr start_addr = 0;
 544     ram_addr_t size = 0;
 545     uint64_t log_size;
 546     int r;
 547
 548     if (!dev->memory_changed) {
 549         return;
 550     }
 551     if (!dev->started) {
 552         return;
 553     }
 554     if (dev->mem_changed_start_addr > dev->mem_changed_end_addr) {
 555         return;
 556     }
 557
 558     if (dev->started) {
 559         start_addr = dev->mem_changed_start_addr;
 560         size = dev->mem_changed_end_addr - dev->mem_changed_start_addr + 1;
 561
 562         r = vhost_verify_ring_mappings(dev, start_addr, size);
 563         assert(r >= 0);
 564     }
 565
 566     if (!dev->log_enabled) {
 567         r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
 568         assert(r >= 0);
 569         dev->memory_changed = false;
 570         return;
 571     }
 572     log_size = vhost_get_log_size(dev);
 573     /* We allocate an extra 4K bytes to log,
 574      * to reduce the * number of reallocations. */
 575 #define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
 576     /* To log more, must increase log size before table update. */
 577     if (dev->log_size < log_size) {
 578         vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
 579     }
 580     r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
 581     assert(r >= 0);
 582     /* To log less, can only decrease log size after table update. */
 583     if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
 584         vhost_dev_log_resize(dev, log_size);
 585     }
 586     dev->memory_changed = false;
 587 }
 588
 589 static void vhost_region_add(MemoryListener *listener,
 590                              MemoryRegionSection *section)
 591 {
 592     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 593                                          memory_listener);
 594
 595     if (!vhost_section(section)) {
 596         return;
 597     }
 598
 599     ++dev->n_mem_sections;
 600     dev->mem_sections = g_renew(MemoryRegionSection, dev->mem_sections,
 601                                 dev->n_mem_sections);
 602     dev->mem_sections[dev->n_mem_sections - 1] = *section;
 603     memory_region_ref(section->mr);
 604     vhost_set_memory(listener, section, true);
 605 }
 606
 607 static void vhost_region_del(MemoryListener *listener,
 608                              MemoryRegionSection *section)
 609 {
 610     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 611                                          memory_listener);
 612     int i;
 613
 614     if (!vhost_section(section)) {
 615         return;
 616     }
 617
 618     vhost_set_memory(listener, section, false);
 619     memory_region_unref(section->mr);
 620     for (i = 0; i < dev->n_mem_sections; ++i) {
 621         if (dev->mem_sections[i].offset_within_address_space
 622             == section->offset_within_address_space) {
 623             --dev->n_mem_sections;
 624             memmove(&dev->mem_sections[i], &dev->mem_sections[i+1],
 625                     (dev->n_mem_sections - i) * sizeof(*dev->mem_sections));
 626             break;
 627         }
 628     }
 629 }
 630
 631 static void vhost_region_nop(MemoryListener *listener,
 632                              MemoryRegionSection *section)
 633 {
 634 }
 635
 636 static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
 637                                     struct vhost_virtqueue *vq,
 638                                     unsigned idx, bool enable_log)
 639 {
 640     struct vhost_vring_addr addr = {
 641         .index = idx,
 642         .desc_user_addr = (uint64_t)(unsigned long)vq->desc,
 643         .avail_user_addr = (uint64_t)(unsigned long)vq->avail,
 644         .used_user_addr = (uint64_t)(unsigned long)vq->used,
 645         .log_guest_addr = vq->used_phys,
 646         .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
 647     };
 648     int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
 649     if (r < 0) {
 650         return -errno;
 651     }
 652     return 0;
 653 }
 654
 655 static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log)
 656 {
 657     uint64_t features = dev->acked_features;
 658     int r;
 659     if (enable_log) {
 660         features |= 0x1ULL << VHOST_F_LOG_ALL;
 661     }
 662     r = dev->vhost_ops->vhost_set_features(dev, features);
 663     return r < 0 ? -errno : 0;
 664 }
 665
 666 static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
 667 {
 668     int r, t, i, idx;
 669     r = vhost_dev_set_features(dev, enable_log);
 670     if (r < 0) {
 671         goto err_features;
 672     }
 673     for (i = 0; i < dev->nvqs; ++i) {
 674         idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
 675         r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
 676                                      enable_log);
 677         if (r < 0) {
 678             goto err_vq;
 679         }
 680     }
 681     return 0;
 682 err_vq:
 683     for (; i >= 0; --i) {
 684         idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
 685         t = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
 686                                      dev->log_enabled);
 687         assert(t >= 0);
 688     }
 689     t = vhost_dev_set_features(dev, dev->log_enabled);
 690     assert(t >= 0);
 691 err_features:
 692     return r;
 693 }
 694
 695 static int vhost_migration_log(MemoryListener *listener, int enable)
 696 {
 697     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 698                                          memory_listener);
 699     int r;
 700     if (!!enable == dev->log_enabled) {
 701         return 0;
 702     }
 703     if (!dev->started) {
 704         dev->log_enabled = enable;
 705         return 0;
 706     }
 707     if (!enable) {
 708         r = vhost_dev_set_log(dev, false);
 709         if (r < 0) {
 710             return r;
 711         }
 712         vhost_log_put(dev, false);
 713         dev->log = NULL;
 714         dev->log_size = 0;
 715     } else {
 716         vhost_dev_log_resize(dev, vhost_get_log_size(dev));
 717         r = vhost_dev_set_log(dev, true);
 718         if (r < 0) {
 719             return r;
 720         }
 721     }
 722     dev->log_enabled = enable;
 723     return 0;
 724 }
 725
 726 static void vhost_log_global_start(MemoryListener *listener)
 727 {
 728     int r;
 729
 730     r = vhost_migration_log(listener, true);
 731     if (r < 0) {
 732         abort();
 733     }
 734 }
 735
 736 static void vhost_log_global_stop(MemoryListener *listener)
 737 {
 738     int r;
 739
 740     r = vhost_migration_log(listener, false);
 741     if (r < 0) {
 742         abort();
 743     }
 744 }
 745
 746 static void vhost_log_start(MemoryListener *listener,
 747                             MemoryRegionSection *section,
 748                             int old, int new)
 749 {
 750     /* FIXME: implement */
 751 }
 752
 753 static void vhost_log_stop(MemoryListener *listener,
 754                            MemoryRegionSection *section,
 755                            int old, int new)
 756 {
 757     /* FIXME: implement */
 758 }
 759
 760 /* The vhost driver natively knows how to handle the vrings of non
 761  * cross-endian legacy devices and modern devices. Only legacy devices
 762  * exposed to a bi-endian guest may require the vhost driver to use a
 763  * specific endianness.
 764  */
 765 static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
 766 {
 767     if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
 768         return false;
 769     }
 770 #ifdef HOST_WORDS_BIGENDIAN
 771     return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
 772 #else
 773     return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
 774 #endif
 775 }
 776
 777 static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
 778                                                    bool is_big_endian,
 779                                                    int vhost_vq_index)
 780 {
 781     struct vhost_vring_state s = {
 782         .index = vhost_vq_index,
 783         .num = is_big_endian
 784     };
 785
 786     if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
 787         return 0;
 788     }
 789
 790     if (errno == ENOTTY) {
 791         error_report("vhost does not support cross-endian");
 792         return -ENOSYS;
 793     }
 794
 795     return -errno;
 796 }
 797
 798 static int vhost_virtqueue_start(struct vhost_dev *dev,
 799                                 struct VirtIODevice *vdev,
 800                                 struct vhost_virtqueue *vq,
 801                                 unsigned idx)
 802 {
 803     hwaddr s, l, a;
 804     int r;
 805     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
 806     struct vhost_vring_file file = {
 807         .index = vhost_vq_index
 808     };
 809     struct vhost_vring_state state = {
 810         .index = vhost_vq_index
 811     };
 812     struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
 813
 814
 815     vq->num = state.num = virtio_queue_get_num(vdev, idx);
 816     r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
 817     if (r) {
 818         return -errno;
 819     }
 820
 821     state.num = virtio_queue_get_last_avail_idx(vdev, idx);
 822     r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
 823     if (r) {
 824         return -errno;
 825     }
 826
 827     if (vhost_needs_vring_endian(vdev)) {
 828         r = vhost_virtqueue_set_vring_endian_legacy(dev,
 829                                                     virtio_is_big_endian(vdev),
 830                                                     vhost_vq_index);
 831         if (r) {
 832             return -errno;
 833         }
 834     }
 835
 836     s = l = virtio_queue_get_desc_size(vdev, idx);
 837     a = virtio_queue_get_desc_addr(vdev, idx);
 838     vq->desc = cpu_physical_memory_map(a, &l, 0);
 839     if (!vq->desc || l != s) {
 840         r = -ENOMEM;
 841         goto fail_alloc_desc;
 842     }
 843     s = l = virtio_queue_get_avail_size(vdev, idx);
 844     a = virtio_queue_get_avail_addr(vdev, idx);
 845     vq->avail = cpu_physical_memory_map(a, &l, 0);
 846     if (!vq->avail || l != s) {
 847         r = -ENOMEM;
 848         goto fail_alloc_avail;
 849     }
 850     vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
 851     vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
 852     vq->used = cpu_physical_memory_map(a, &l, 1);
 853     if (!vq->used || l != s) {
 854         r = -ENOMEM;
 855         goto fail_alloc_used;
 856     }
 857
 858     vq->ring_size = s = l = virtio_queue_get_ring_size(vdev, idx);
 859     vq->ring_phys = a = virtio_queue_get_ring_addr(vdev, idx);
 860     vq->ring = cpu_physical_memory_map(a, &l, 1);
 861     if (!vq->ring || l != s) {
 862         r = -ENOMEM;
 863         goto fail_alloc_ring;
 864     }
 865
 866     r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
 867     if (r < 0) {
 868         r = -errno;
 869         goto fail_alloc;
 870     }
 871
 872     file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
 873     r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
 874     if (r) {
 875         r = -errno;
 876         goto fail_kick;
 877     }
 878
 879     /* Clear and discard previous events if any. */
 880     event_notifier_test_and_clear(&vq->masked_notifier);
 881
 882     /* Init vring in unmasked state, unless guest_notifier_mask
 883      * will do it later.
 884      */
 885     if (!vdev->use_guest_notifier_mask) {
 886         /* TODO: check and handle errors. */
 887         vhost_virtqueue_mask(dev, vdev, idx, false);
 888     }
 889
 890     return 0;
 891
 892 fail_kick:
 893 fail_alloc:
 894     cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
 895                               0, 0);
 896 fail_alloc_ring:
 897     cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
 898                               0, 0);
 899 fail_alloc_used:
 900     cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
 901                               0, 0);
 902 fail_alloc_avail:
 903     cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
 904                               0, 0);
 905 fail_alloc_desc:
 906     return r;
 907 }
 908
 909 static void vhost_virtqueue_stop(struct vhost_dev *dev,
 910                                     struct VirtIODevice *vdev,
 911                                     struct vhost_virtqueue *vq,
 912                                     unsigned idx)
 913 {
 914     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
 915     struct vhost_vring_state state = {
 916         .index = vhost_vq_index,
 917     };
 918     int r;
 919
 920     r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
 921     if (r < 0) {
 922         fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r);
 923         fflush(stderr);
 924     }
 925     virtio_queue_set_last_avail_idx(vdev, idx, state.num);
 926     virtio_queue_invalidate_signalled_used(vdev, idx);
 927
 928     /* In the cross-endian case, we need to reset the vring endianness to
 929      * native as legacy devices expect so by default.
 930      */
 931     if (vhost_needs_vring_endian(vdev)) {
 932         r = vhost_virtqueue_set_vring_endian_legacy(dev,
 933                                                     !virtio_is_big_endian(vdev),
 934                                                     vhost_vq_index);
 935         if (r < 0) {
 936             error_report("failed to reset vring endianness");
 937         }
 938     }
 939
 940     assert (r >= 0);
 941     cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
 942                               0, virtio_queue_get_ring_size(vdev, idx));
 943     cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
 944                               1, virtio_queue_get_used_size(vdev, idx));
 945     cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
 946                               0, virtio_queue_get_avail_size(vdev, idx));
 947     cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
 948                               0, virtio_queue_get_desc_size(vdev, idx));
 949 }
 950
 951 static void vhost_eventfd_add(MemoryListener *listener,
 952                               MemoryRegionSection *section,
 953                               bool match_data, uint64_t data, EventNotifier *e)
 954 {
 955 }
 956
 957 static void vhost_eventfd_del(MemoryListener *listener,
 958                               MemoryRegionSection *section,
 959                               bool match_data, uint64_t data, EventNotifier *e)
 960 {
 961 }
 962
 963 static int vhost_virtqueue_init(struct vhost_dev *dev,
 964                                 struct vhost_virtqueue *vq, int n)
 965 {
 966     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
 967     struct vhost_vring_file file = {
 968         .index = vhost_vq_index,
 969     };
 970     int r = event_notifier_init(&vq->masked_notifier, 0);
 971     if (r < 0) {
 972         return r;
 973     }
 974
 975     file.fd = event_notifier_get_fd(&vq->masked_notifier);
 976     r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
 977     if (r) {
 978         r = -errno;
 979         goto fail_call;
 980     }
 981     return 0;
 982 fail_call:
 983     event_notifier_cleanup(&vq->masked_notifier);
 984     return r;
 985 }
 986
 987 static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
 988 {
 989     event_notifier_cleanup(&vq->masked_notifier);
 990 }
 991
 992 int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
 993                    VhostBackendType backend_type)
 994 {
 995     uint64_t features;
 996     int i, r;
 997
 998     hdev->migration_blocker = NULL;
 999
1000     if (vhost_set_backend_type(hdev, backend_type) < 0) {
1001         close((uintptr_t)opaque);
1002         return -1;
1003     }
1004
1005     if (hdev->vhost_ops->vhost_backend_init(hdev, opaque) < 0) {
1006         close((uintptr_t)opaque);
1007         return -errno;
1008     }
1009
1010     if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
1011         fprintf(stderr, "vhost backend memory slots limit is less"
1012                 " than current number of present memory slots\n");
1013         close((uintptr_t)opaque);
1014         return -1;
1015     }
1016     QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
1017
1018     r = hdev->vhost_ops->vhost_set_owner(hdev);
1019     if (r < 0) {
1020         goto fail;
1021     }
1022
1023     r = hdev->vhost_ops->vhost_get_features(hdev, &features);
1024     if (r < 0) {
1025         goto fail;
1026     }
1027
1028     for (i = 0; i < hdev->nvqs; ++i) {
1029         r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
1030         if (r < 0) {
1031             goto fail_vq;
1032         }
1033     }
1034     hdev->features = features;
1035
1036     hdev->memory_listener = (MemoryListener) {
1037         .begin = vhost_begin,
1038         .commit = vhost_commit,
1039         .region_add = vhost_region_add,
1040         .region_del = vhost_region_del,
1041         .region_nop = vhost_region_nop,
1042         .log_start = vhost_log_start,
1043         .log_stop = vhost_log_stop,
1044         .log_sync = vhost_log_sync,
1045         .log_global_start = vhost_log_global_start,
1046         .log_global_stop = vhost_log_global_stop,
1047         .eventfd_add = vhost_eventfd_add,
1048         .eventfd_del = vhost_eventfd_del,
1049         .priority = 10
1050     };
1051
1052     if (hdev->migration_blocker == NULL) {
1053         if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
1054             error_setg(&hdev->migration_blocker,
1055                        "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
1056         } else if (!qemu_memfd_check()) {
1057             error_setg(&hdev->migration_blocker,
1058                        "Migration disabled: failed to allocate shared memory");
1059         }
1060     }
1061
1062     if (hdev->migration_blocker != NULL) {
1063         migrate_add_blocker(hdev->migration_blocker);
1064     }
1065
1066     hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
1067     hdev->n_mem_sections = 0;
1068     hdev->mem_sections = NULL;
1069     hdev->log = NULL;
1070     hdev->log_size = 0;
1071     hdev->log_enabled = false;
1072     hdev->started = false;
1073     hdev->memory_changed = false;
1074     memory_listener_register(&hdev->memory_listener, &address_space_memory);
1075     return 0;
1076 fail_vq:
1077     while (--i >= 0) {
1078         vhost_virtqueue_cleanup(hdev->vqs + i);
1079     }
1080 fail:
1081     r = -errno;
1082     hdev->vhost_ops->vhost_backend_cleanup(hdev);
1083     QLIST_REMOVE(hdev, entry);
1084     return r;
1085 }
1086
1087 void vhost_dev_cleanup(struct vhost_dev *hdev)
1088 {
1089     int i;
1090     for (i = 0; i < hdev->nvqs; ++i) {
1091         vhost_virtqueue_cleanup(hdev->vqs + i);
1092     }
1093     memory_listener_unregister(&hdev->memory_listener);
1094     if (hdev->migration_blocker) {
1095         migrate_del_blocker(hdev->migration_blocker);
1096         error_free(hdev->migration_blocker);
1097     }
1098     g_free(hdev->mem);
1099     g_free(hdev->mem_sections);
1100     hdev->vhost_ops->vhost_backend_cleanup(hdev);
1101     QLIST_REMOVE(hdev, entry);
1102 }
1103
1104 /* Stop processing guest IO notifications in qemu.
1105  * Start processing them in vhost in kernel.
1106  */
1107 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1108 {
1109     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1110     VirtioBusState *vbus = VIRTIO_BUS(qbus);
1111     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
1112     int i, r, e;
1113     if (!k->ioeventfd_started) {
1114         fprintf(stderr, "binding does not support host notifiers\n");
1115         r = -ENOSYS;
1116         goto fail;
1117     }
1118
1119     for (i = 0; i < hdev->nvqs; ++i) {
1120         r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1121                                          true);
1122         if (r < 0) {
1123             fprintf(stderr, "vhost VQ %d notifier binding failed: %d\n", i, -r);
1124             goto fail_vq;
1125         }
1126     }
1127
1128     return 0;
1129 fail_vq:
1130     while (--i >= 0) {
1131         e = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1132                                          false);
1133         if (e < 0) {
1134             fprintf(stderr, "vhost VQ %d notifier cleanup error: %d\n", i, -r);
1135             fflush(stderr);
1136         }
1137         assert (e >= 0);
1138     }
1139 fail:
1140     return r;
1141 }
1142
1143 /* Stop processing guest IO notifications in vhost.
1144  * Start processing them in qemu.
1145  * This might actually run the qemu handlers right away,
1146  * so virtio in qemu must be completely setup when this is called.
1147  */
1148 void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1149 {
1150     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1151     int i, r;
1152
1153     for (i = 0; i < hdev->nvqs; ++i) {
1154         r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1155                                          false);
1156         if (r < 0) {
1157             fprintf(stderr, "vhost VQ %d notifier cleanup failed: %d\n", i, -r);
1158             fflush(stderr);
1159         }
1160         assert (r >= 0);
1161     }
1162 }
1163
1164 /* Test and clear event pending status.
1165  * Should be called after unmask to avoid losing events.
1166  */
1167 bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
1168 {
1169     struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
1170     assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
1171     return event_notifier_test_and_clear(&vq->masked_notifier);
1172 }
1173
1174 /* Mask/unmask events from this vq. */
1175 void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
1176                          bool mask)
1177 {
1178     struct VirtQueue *vvq = virtio_get_queue(vdev, n);
1179     int r, index = n - hdev->vq_index;
1180     struct vhost_vring_file file;
1181
1182     if (mask) {
1183         assert(vdev->use_guest_notifier_mask);
1184         file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
1185     } else {
1186         file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
1187     }
1188
1189     file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
1190     r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
1191     assert(r >= 0);
1192 }
1193
1194 uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
1195                             uint64_t features)
1196 {
1197     const int *bit = feature_bits;
1198     while (*bit != VHOST_INVALID_FEATURE_BIT) {
1199         uint64_t bit_mask = (1ULL << *bit);
1200         if (!(hdev->features & bit_mask)) {
1201             features &= ~bit_mask;
1202         }
1203         bit++;
1204     }
1205     return features;
1206 }
1207
1208 void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
1209                         uint64_t features)
1210 {
1211     const int *bit = feature_bits;
1212     while (*bit != VHOST_INVALID_FEATURE_BIT) {
1213         uint64_t bit_mask = (1ULL << *bit);
1214         if (features & bit_mask) {
1215             hdev->acked_features |= bit_mask;
1216         }
1217         bit++;
1218     }
1219 }
1220
1221 /* Host notifiers must be enabled at this point. */
1222 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
1223 {
1224     int i, r;
1225
1226     hdev->started = true;
1227
1228     r = vhost_dev_set_features(hdev, hdev->log_enabled);
1229     if (r < 0) {
1230         goto fail_features;
1231     }
1232     r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
1233     if (r < 0) {
1234         r = -errno;
1235         goto fail_mem;
1236     }
1237     for (i = 0; i < hdev->nvqs; ++i) {
1238         r = vhost_virtqueue_start(hdev,
1239                                   vdev,
1240                                   hdev->vqs + i,
1241                                   hdev->vq_index + i);
1242         if (r < 0) {
1243             goto fail_vq;
1244         }
1245     }
1246
1247     if (hdev->log_enabled) {
1248         uint64_t log_base;
1249
1250         hdev->log_size = vhost_get_log_size(hdev);
1251         hdev->log = vhost_log_get(hdev->log_size,
1252                                   vhost_dev_log_is_shared(hdev));
1253         log_base = (uintptr_t)hdev->log->log;
1254         r = hdev->vhost_ops->vhost_set_log_base(hdev,
1255                                                 hdev->log_size ? log_base : 0,
1256                                                 hdev->log);
1257         if (r < 0) {
1258             r = -errno;
1259             goto fail_log;
1260         }
1261     }
1262
1263     return 0;
1264 fail_log:
1265     vhost_log_put(hdev, false);
1266 fail_vq:
1267     while (--i >= 0) {
1268         vhost_virtqueue_stop(hdev,
1269                              vdev,
1270                              hdev->vqs + i,
1271                              hdev->vq_index + i);
1272     }
1273     i = hdev->nvqs;
1274 fail_mem:
1275 fail_features:
1276
1277     hdev->started = false;
1278     return r;
1279 }
1280
1281 /* Host notifiers must be enabled at this point. */
1282 void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
1283 {
1284     int i;
1285
1286     for (i = 0; i < hdev->nvqs; ++i) {
1287         vhost_virtqueue_stop(hdev,
1288                              vdev,
1289                              hdev->vqs + i,
1290                              hdev->vq_index + i);
1291     }
1292
1293     vhost_log_put(hdev, true);
1294     hdev->started = false;
1295     hdev->log = NULL;
1296     hdev->log_size = 0;
1297 }
1298