hw/virtio/vhost.c

   1 /*
   2  * vhost support
   3  *
   4  * Copyright Red Hat, Inc. 2010
   5  *
   6  * Authors:
   7  *  Michael S. Tsirkin <mst@redhat.com>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2.  See
  10  * the COPYING file in the top-level directory.
  11  *
  12  * Contributions after 2012-01-13 are licensed under the terms of the
  13  * GNU GPL, version 2 or (at your option) any later version.
  14  */
  15
  16 #include "qemu/osdep.h"
  17 #include "qapi/error.h"
  18 #include "hw/virtio/vhost.h"
  19 #include "hw/hw.h"
  20 #include "qemu/atomic.h"
  21 #include "qemu/range.h"
  22 #include "qemu/error-report.h"
  23 #include "qemu/memfd.h"
  24 #include <linux/vhost.h>
  25 #include "exec/address-spaces.h"
  26 #include "hw/virtio/virtio-bus.h"
  27 #include "hw/virtio/virtio-access.h"
  28 #include "migration/migration.h"
  29
  30 static struct vhost_log *vhost_log;
  31 static struct vhost_log *vhost_log_shm;
  32
  33 static unsigned int used_memslots;
  34 static QLIST_HEAD(, vhost_dev) vhost_devices =
  35     QLIST_HEAD_INITIALIZER(vhost_devices);
  36
  37 bool vhost_has_free_slot(void)
  38 {
  39     unsigned int slots_limit = ~0U;
  40     struct vhost_dev *hdev;
  41
  42     QLIST_FOREACH(hdev, &vhost_devices, entry) {
  43         unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
  44         slots_limit = MIN(slots_limit, r);
  45     }
  46     return slots_limit > used_memslots;
  47 }
  48
  49 static void vhost_dev_sync_region(struct vhost_dev *dev,
  50                                   MemoryRegionSection *section,
  51                                   uint64_t mfirst, uint64_t mlast,
  52                                   uint64_t rfirst, uint64_t rlast)
  53 {
  54     vhost_log_chunk_t *log = dev->log->log;
  55
  56     uint64_t start = MAX(mfirst, rfirst);
  57     uint64_t end = MIN(mlast, rlast);
  58     vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
  59     vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
  60     uint64_t addr = (start / VHOST_LOG_CHUNK) * VHOST_LOG_CHUNK;
  61
  62     if (end < start) {
  63         return;
  64     }
  65     assert(end / VHOST_LOG_CHUNK < dev->log_size);
  66     assert(start / VHOST_LOG_CHUNK < dev->log_size);
  67
  68     for (;from < to; ++from) {
  69         vhost_log_chunk_t log;
  70         /* We first check with non-atomic: much cheaper,
  71          * and we expect non-dirty to be the common case. */
  72         if (!*from) {
  73             addr += VHOST_LOG_CHUNK;
  74             continue;
  75         }
  76         /* Data must be read atomically. We don't really need barrier semantics
  77          * but it's easier to use atomic_* than roll our own. */
  78         log = atomic_xchg(from, 0);
  79         while (log) {
  80             int bit = ctzl(log);
  81             hwaddr page_addr;
  82             hwaddr section_offset;
  83             hwaddr mr_offset;
  84             page_addr = addr + bit * VHOST_LOG_PAGE;
  85             section_offset = page_addr - section->offset_within_address_space;
  86             mr_offset = section_offset + section->offset_within_region;
  87             memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
  88             log &= ~(0x1ull << bit);
  89         }
  90         addr += VHOST_LOG_CHUNK;
  91     }
  92 }
  93
  94 static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
  95                                    MemoryRegionSection *section,
  96                                    hwaddr first,
  97                                    hwaddr last)
  98 {
  99     int i;
 100     hwaddr start_addr;
 101     hwaddr end_addr;
 102
 103     if (!dev->log_enabled || !dev->started) {
 104         return 0;
 105     }
 106     start_addr = section->offset_within_address_space;
 107     end_addr = range_get_last(start_addr, int128_get64(section->size));
 108     start_addr = MAX(first, start_addr);
 109     end_addr = MIN(last, end_addr);
 110
 111     for (i = 0; i < dev->mem->nregions; ++i) {
 112         struct vhost_memory_region *reg = dev->mem->regions + i;
 113         vhost_dev_sync_region(dev, section, start_addr, end_addr,
 114                               reg->guest_phys_addr,
 115                               range_get_last(reg->guest_phys_addr,
 116                                              reg->memory_size));
 117     }
 118     for (i = 0; i < dev->nvqs; ++i) {
 119         struct vhost_virtqueue *vq = dev->vqs + i;
 120         vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
 121                               range_get_last(vq->used_phys, vq->used_size));
 122     }
 123     return 0;
 124 }
 125
 126 static void vhost_log_sync(MemoryListener *listener,
 127                           MemoryRegionSection *section)
 128 {
 129     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 130                                          memory_listener);
 131     vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
 132 }
 133
 134 static void vhost_log_sync_range(struct vhost_dev *dev,
 135                                  hwaddr first, hwaddr last)
 136 {
 137     int i;
 138     /* FIXME: this is N^2 in number of sections */
 139     for (i = 0; i < dev->n_mem_sections; ++i) {
 140         MemoryRegionSection *section = &dev->mem_sections[i];
 141         vhost_sync_dirty_bitmap(dev, section, first, last);
 142     }
 143 }
 144
/* Assign/unassign. Keep an unsorted array of non-overlapping
 * memory regions in dev->mem. */
/*
 * vhost_dev_unassign_memory: remove the guest-physical range
 * [start_addr, start_addr + size - 1] from dev->mem->regions.
 *
 * The array is compacted in place: 'from' is the read index over the
 * original n entries and 'to' is the write index.  Each existing region
 * that overlaps the range is either dropped entirely, shrunk at its end,
 * shifted at its start, or split in two.  A split stores the second half
 * at index n (one past the original count), so the array must have room
 * for n + 1 entries when this is called.
 */
static void vhost_dev_unassign_memory(struct vhost_dev *dev,
                                      uint64_t start_addr,
                                      uint64_t size)
{
    int from, to, n = dev->mem->nregions;
    /* Track overlapping/split regions for sanity checking. */
    int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;

    for (from = 0, to = 0; from < n; ++from, ++to) {
        struct vhost_memory_region *reg = dev->mem->regions + to;
        uint64_t reglast;
        uint64_t memlast;
        uint64_t change;

        /* clone old region */
        if (to != from) {
            memcpy(reg, dev->mem->regions + from, sizeof *reg);
        }

        /* No overlap is simple */
        if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
                            start_addr, size)) {
            continue;
        }

        /* Split only happens if supplied region
         * is in the middle of an existing one. Thus it can not
         * overlap with any other existing region. */
        assert(!split);

        reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
        memlast = range_get_last(start_addr, size);

        /* Remove whole region */
        if (start_addr <= reg->guest_phys_addr && memlast >= reglast) {
            --dev->mem->nregions;
            /* Undo the loop's ++to so the next entry overwrites this slot. */
            --to;
            ++overlap_middle;
            continue;
        }

        /* Shrink region */
        /* The range covers the tail of the region: keep only the head. */
        if (memlast >= reglast) {
            reg->memory_size = start_addr - reg->guest_phys_addr;
            assert(reg->memory_size);
            assert(!overlap_end);
            ++overlap_end;
            continue;
        }

        /* Shift region */
        /* The range covers the head of the region: advance both the guest
         * and userspace start by the number of bytes removed. */
        if (start_addr <= reg->guest_phys_addr) {
            change = memlast + 1 - reg->guest_phys_addr;
            reg->memory_size -= change;
            reg->guest_phys_addr += change;
            reg->userspace_addr += change;
            assert(reg->memory_size);
            assert(!overlap_start);
            ++overlap_start;
            continue;
        }

        /* This only happens if supplied region
         * is in the middle of an existing one. Thus it can not
         * overlap with any other existing region. */
        assert(!overlap_start);
        assert(!overlap_end);
        assert(!overlap_middle);
        /* Split region: shrink first part, shift second part. */
        /* The copy at index n becomes the second part; it is outside the
         * range visited by this loop (from < n). */
        memcpy(dev->mem->regions + n, reg, sizeof *reg);
        reg->memory_size = start_addr - reg->guest_phys_addr;
        assert(reg->memory_size);
        change = memlast + 1 - reg->guest_phys_addr;
        reg = dev->mem->regions + n;
        reg->memory_size -= change;
        assert(reg->memory_size);
        reg->guest_phys_addr += change;
        reg->userspace_addr += change;
        /* Never add more than 1 region */
        assert(dev->mem->nregions == n);
        ++dev->mem->nregions;
        ++split;
    }
}
 231
/* Called after unassign, so no regions overlap the given range. */
/*
 * vhost_dev_assign_memory: add the mapping of guest-physical range
 * [start_addr, start_addr + size - 1] to userspace address uaddr.
 *
 * Existing regions that are directly adjacent to the new range in both
 * guest-physical and userspace address space are merged with it, unless
 * the optional vhost_backend_can_merge() hook refuses.  The array is
 * compacted in place ('from' reads, 'to' writes).  When nothing merges,
 * a new entry is written at index 'to', i.e. one past the existing
 * entries, so the array must have room for nregions + 1 entries.
 */
static void vhost_dev_assign_memory(struct vhost_dev *dev,
                                    uint64_t start_addr,
                                    uint64_t size,
                                    uint64_t uaddr)
{
    int from, to;
    /* First region merged with the new range; later merges fold into it. */
    struct vhost_memory_region *merged = NULL;
    for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
        struct vhost_memory_region *reg = dev->mem->regions + to;
        uint64_t prlast, urlast;
        uint64_t pmlast, umlast;
        uint64_t s, e, u;

        /* clone old region */
        if (to != from) {
            memcpy(reg, dev->mem->regions + from, sizeof *reg);
        }
        /* p* = guest-physical, u* = userspace; *rlast = last byte of the
         * existing region, *mlast = last byte of the range being added. */
        prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
        pmlast = range_get_last(start_addr, size);
        urlast = range_get_last(reg->userspace_addr, reg->memory_size);
        umlast = range_get_last(uaddr, size);

        /* check for overlapping regions: should never happen. */
        assert(prlast < start_addr || pmlast < reg->guest_phys_addr);
        /* Not an adjacent or overlapping region - do not merge. */
        if ((prlast + 1 != start_addr || urlast + 1 != uaddr) &&
            (pmlast + 1 != reg->guest_phys_addr ||
             umlast + 1 != reg->userspace_addr)) {
            continue;
        }

        if (dev->vhost_ops->vhost_backend_can_merge &&
            !dev->vhost_ops->vhost_backend_can_merge(dev, uaddr, size,
                                                     reg->userspace_addr,
                                                     reg->memory_size)) {
            continue;
        }

        if (merged) {
            /* Second merge: this slot is absorbed into 'merged', so step
             * the write index back to reuse it. */
            --to;
            assert(to >= 0);
        } else {
            merged = reg;
        }
        u = MIN(uaddr, reg->userspace_addr);
        s = MIN(start_addr, reg->guest_phys_addr);
        e = MAX(pmlast, prlast);
        /* The parameters are updated too, so that later iterations compare
         * against the grown range. */
        uaddr = merged->userspace_addr = u;
        start_addr = merged->guest_phys_addr = s;
        size = merged->memory_size = e - s + 1;
        assert(merged->memory_size);
    }

    if (!merged) {
        /* No neighbour to merge with: append a fresh region. */
        struct vhost_memory_region *reg = dev->mem->regions + to;
        memset(reg, 0, sizeof *reg);
        reg->memory_size = size;
        assert(reg->memory_size);
        reg->guest_phys_addr = start_addr;
        reg->userspace_addr = uaddr;
        ++to;
    }
    assert(to <= dev->mem->nregions + 1);
    dev->mem->nregions = to;
}
 298
 299 static uint64_t vhost_get_log_size(struct vhost_dev *dev)
 300 {
 301     uint64_t log_size = 0;
 302     int i;
 303     for (i = 0; i < dev->mem->nregions; ++i) {
 304         struct vhost_memory_region *reg = dev->mem->regions + i;
 305         uint64_t last = range_get_last(reg->guest_phys_addr,
 306                                        reg->memory_size);
 307         log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
 308     }
 309     for (i = 0; i < dev->nvqs; ++i) {
 310         struct vhost_virtqueue *vq = dev->vqs + i;
 311         uint64_t last = vq->used_phys + vq->used_size - 1;
 312         log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
 313     }
 314     return log_size;
 315 }
 316
 317 static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
 318 {
 319     struct vhost_log *log;
 320     uint64_t logsize = size * sizeof(*(log->log));
 321     int fd = -1;
 322
 323     log = g_new0(struct vhost_log, 1);
 324     if (share) {
 325         log->log = qemu_memfd_alloc("vhost-log", logsize,
 326                                     F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
 327                                     &fd);
 328         memset(log->log, 0, logsize);
 329     } else {
 330         log->log = g_malloc0(logsize);
 331     }
 332
 333     log->size = size;
 334     log->refcnt = 1;
 335     log->fd = fd;
 336
 337     return log;
 338 }
 339
 340 static struct vhost_log *vhost_log_get(uint64_t size, bool share)
 341 {
 342     struct vhost_log *log = share ? vhost_log_shm : vhost_log;
 343
 344     if (!log || log->size != size) {
 345         log = vhost_log_alloc(size, share);
 346         if (share) {
 347             vhost_log_shm = log;
 348         } else {
 349             vhost_log = log;
 350         }
 351     } else {
 352         ++log->refcnt;
 353     }
 354
 355     return log;
 356 }
 357
 358 static void vhost_log_put(struct vhost_dev *dev, bool sync)
 359 {
 360     struct vhost_log *log = dev->log;
 361
 362     if (!log) {
 363         return;
 364     }
 365
 366     --log->refcnt;
 367     if (log->refcnt == 0) {
 368         /* Sync only the range covered by the old log */
 369         if (dev->log_size && sync) {
 370             vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
 371         }
 372
 373         if (vhost_log == log) {
 374             g_free(log->log);
 375             vhost_log = NULL;
 376         } else if (vhost_log_shm == log) {
 377             qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
 378                             log->fd);
 379             vhost_log_shm = NULL;
 380         }
 381
 382         g_free(log);
 383     }
 384 }
 385
 386 static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
 387 {
 388     return dev->vhost_ops->vhost_requires_shm_log &&
 389            dev->vhost_ops->vhost_requires_shm_log(dev);
 390 }
 391
 392 static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
 393 {
 394     struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
 395     uint64_t log_base = (uintptr_t)log->log;
 396     int r;
 397
 398     /* inform backend of log switching, this must be done before
 399        releasing the current log, to ensure no logging is lost */
 400     r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
 401     assert(r >= 0);
 402     vhost_log_put(dev, true);
 403     dev->log = log;
 404     dev->log_size = size;
 405 }
 406
 407 static int vhost_verify_ring_mappings(struct vhost_dev *dev,
 408                                       uint64_t start_addr,
 409                                       uint64_t size)
 410 {
 411     int i;
 412     int r = 0;
 413
 414     for (i = 0; !r && i < dev->nvqs; ++i) {
 415         struct vhost_virtqueue *vq = dev->vqs + i;
 416         hwaddr l;
 417         void *p;
 418
 419         if (!ranges_overlap(start_addr, size, vq->ring_phys, vq->ring_size)) {
 420             continue;
 421         }
 422         l = vq->ring_size;
 423         p = cpu_physical_memory_map(vq->ring_phys, &l, 1);
 424         if (!p || l != vq->ring_size) {
 425             fprintf(stderr, "Unable to map ring buffer for ring %d\n", i);
 426             r = -ENOMEM;
 427         }
 428         if (p != vq->ring) {
 429             fprintf(stderr, "Ring buffer relocated for ring %d\n", i);
 430             r = -EBUSY;
 431         }
 432         cpu_physical_memory_unmap(p, l, 0, 0);
 433     }
 434     return r;
 435 }
 436
 437 static struct vhost_memory_region *vhost_dev_find_reg(struct vhost_dev *dev,
 438                                                       uint64_t start_addr,
 439                                                       uint64_t size)
 440 {
 441     int i, n = dev->mem->nregions;
 442     for (i = 0; i < n; ++i) {
 443         struct vhost_memory_region *reg = dev->mem->regions + i;
 444         if (ranges_overlap(reg->guest_phys_addr, reg->memory_size,
 445                            start_addr, size)) {
 446             return reg;
 447         }
 448     }
 449     return NULL;
 450 }
 451
 452 static bool vhost_dev_cmp_memory(struct vhost_dev *dev,
 453                                  uint64_t start_addr,
 454                                  uint64_t size,
 455                                  uint64_t uaddr)
 456 {
 457     struct vhost_memory_region *reg = vhost_dev_find_reg(dev, start_addr, size);
 458     uint64_t reglast;
 459     uint64_t memlast;
 460
 461     if (!reg) {
 462         return true;
 463     }
 464
 465     reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
 466     memlast = range_get_last(start_addr, size);
 467
 468     /* Need to extend region? */
 469     if (start_addr < reg->guest_phys_addr || memlast > reglast) {
 470         return true;
 471     }
 472     /* userspace_addr changed? */
 473     return uaddr != reg->userspace_addr + start_addr - reg->guest_phys_addr;
 474 }
 475
 476 static void vhost_set_memory(MemoryListener *listener,
 477                              MemoryRegionSection *section,
 478                              bool add)
 479 {
 480     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 481                                          memory_listener);
 482     hwaddr start_addr = section->offset_within_address_space;
 483     ram_addr_t size = int128_get64(section->size);
 484     bool log_dirty =
 485         memory_region_get_dirty_log_mask(section->mr) & ~(1 << DIRTY_MEMORY_MIGRATION);
 486     int s = offsetof(struct vhost_memory, regions) +
 487         (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
 488     void *ram;
 489
 490     dev->mem = g_realloc(dev->mem, s);
 491
 492     if (log_dirty) {
 493         add = false;
 494     }
 495
 496     assert(size);
 497
 498     /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
 499     ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
 500     if (add) {
 501         if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
 502             /* Region exists with same address. Nothing to do. */
 503             return;
 504         }
 505     } else {
 506         if (!vhost_dev_find_reg(dev, start_addr, size)) {
 507             /* Removing region that we don't access. Nothing to do. */
 508             return;
 509         }
 510     }
 511
 512     vhost_dev_unassign_memory(dev, start_addr, size);
 513     if (add) {
 514         /* Add given mapping, merging adjacent regions if any */
 515         vhost_dev_assign_memory(dev, start_addr, size, (uintptr_t)ram);
 516     } else {
 517         /* Remove old mapping for this memory, if any. */
 518         vhost_dev_unassign_memory(dev, start_addr, size);
 519     }
 520     dev->mem_changed_start_addr = MIN(dev->mem_changed_start_addr, start_addr);
 521     dev->mem_changed_end_addr = MAX(dev->mem_changed_end_addr, start_addr + size - 1);
 522     dev->memory_changed = true;
 523     used_memslots = dev->mem->nregions;
 524 }
 525
 526 static bool vhost_section(MemoryRegionSection *section)
 527 {
 528     return memory_region_is_ram(section->mr);
 529 }
 530
 531 static void vhost_begin(MemoryListener *listener)
 532 {
 533     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 534                                          memory_listener);
 535     dev->mem_changed_end_addr = 0;
 536     dev->mem_changed_start_addr = -1;
 537 }
 538
 539 static void vhost_commit(MemoryListener *listener)
 540 {
 541     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 542                                          memory_listener);
 543     hwaddr start_addr = 0;
 544     ram_addr_t size = 0;
 545     uint64_t log_size;
 546     int r;
 547
 548     if (!dev->memory_changed) {
 549         return;
 550     }
 551     if (!dev->started) {
 552         return;
 553     }
 554     if (dev->mem_changed_start_addr > dev->mem_changed_end_addr) {
 555         return;
 556     }
 557
 558     if (dev->started) {
 559         start_addr = dev->mem_changed_start_addr;
 560         size = dev->mem_changed_end_addr - dev->mem_changed_start_addr + 1;
 561
 562         r = vhost_verify_ring_mappings(dev, start_addr, size);
 563         assert(r >= 0);
 564     }
 565
 566     if (!dev->log_enabled) {
 567         r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
 568         assert(r >= 0);
 569         dev->memory_changed = false;
 570         return;
 571     }
 572     log_size = vhost_get_log_size(dev);
 573     /* We allocate an extra 4K bytes to log,
 574      * to reduce the * number of reallocations. */
 575 #define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
 576     /* To log more, must increase log size before table update. */
 577     if (dev->log_size < log_size) {
 578         vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
 579     }
 580     r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
 581     assert(r >= 0);
 582     /* To log less, can only decrease log size after table update. */
 583     if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
 584         vhost_dev_log_resize(dev, log_size);
 585     }
 586     dev->memory_changed = false;
 587 }
 588
 589 static void vhost_region_add(MemoryListener *listener,
 590                              MemoryRegionSection *section)
 591 {
 592     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 593                                          memory_listener);
 594
 595     if (!vhost_section(section)) {
 596         return;
 597     }
 598
 599     ++dev->n_mem_sections;
 600     dev->mem_sections = g_renew(MemoryRegionSection, dev->mem_sections,
 601                                 dev->n_mem_sections);
 602     dev->mem_sections[dev->n_mem_sections - 1] = *section;
 603     memory_region_ref(section->mr);
 604     vhost_set_memory(listener, section, true);
 605 }
 606
 607 static void vhost_region_del(MemoryListener *listener,
 608                              MemoryRegionSection *section)
 609 {
 610     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 611                                          memory_listener);
 612     int i;
 613
 614     if (!vhost_section(section)) {
 615         return;
 616     }
 617
 618     vhost_set_memory(listener, section, false);
 619     memory_region_unref(section->mr);
 620     for (i = 0; i < dev->n_mem_sections; ++i) {
 621         if (dev->mem_sections[i].offset_within_address_space
 622             == section->offset_within_address_space) {
 623             --dev->n_mem_sections;
 624             memmove(&dev->mem_sections[i], &dev->mem_sections[i+1],
 625                     (dev->n_mem_sections - i) * sizeof(*dev->mem_sections));
 626             break;
 627         }
 628     }
 629 }
 630
/* Memory listener callback that intentionally does nothing. */
static void vhost_region_nop(MemoryListener *listener,
                             MemoryRegionSection *section)
{
}
 635
 636 static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
 637                                     struct vhost_virtqueue *vq,
 638                                     unsigned idx, bool enable_log)
 639 {
 640     struct vhost_vring_addr addr = {
 641         .index = idx,
 642         .desc_user_addr = (uint64_t)(unsigned long)vq->desc,
 643         .avail_user_addr = (uint64_t)(unsigned long)vq->avail,
 644         .used_user_addr = (uint64_t)(unsigned long)vq->used,
 645         .log_guest_addr = vq->used_phys,
 646         .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
 647     };
 648     int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
 649     if (r < 0) {
 650         return -errno;
 651     }
 652     return 0;
 653 }
 654
 655 static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log)
 656 {
 657     uint64_t features = dev->acked_features;
 658     int r;
 659     if (enable_log) {
 660         features |= 0x1ULL << VHOST_F_LOG_ALL;
 661     }
 662     r = dev->vhost_ops->vhost_set_features(dev, features);
 663     return r < 0 ? -errno : 0;
 664 }
 665
 666 static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
 667 {
 668     int r, t, i, idx;
 669     r = vhost_dev_set_features(dev, enable_log);
 670     if (r < 0) {
 671         goto err_features;
 672     }
 673     for (i = 0; i < dev->nvqs; ++i) {
 674         idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
 675         r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
 676                                      enable_log);
 677         if (r < 0) {
 678             goto err_vq;
 679         }
 680     }
 681     return 0;
 682 err_vq:
 683     for (; i >= 0; --i) {
 684         idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
 685         t = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
 686                                      dev->log_enabled);
 687         assert(t >= 0);
 688     }
 689     t = vhost_dev_set_features(dev, dev->log_enabled);
 690     assert(t >= 0);
 691 err_features:
 692     return r;
 693 }
 694
 695 static int vhost_migration_log(MemoryListener *listener, int enable)
 696 {
 697     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 698                                          memory_listener);
 699     int r;
 700     if (!!enable == dev->log_enabled) {
 701         return 0;
 702     }
 703     if (!dev->started) {
 704         dev->log_enabled = enable;
 705         return 0;
 706     }
 707     if (!enable) {
 708         r = vhost_dev_set_log(dev, false);
 709         if (r < 0) {
 710             return r;
 711         }
 712         vhost_log_put(dev, false);
 713         dev->log = NULL;
 714         dev->log_size = 0;
 715     } else {
 716         vhost_dev_log_resize(dev, vhost_get_log_size(dev));
 717         r = vhost_dev_set_log(dev, true);
 718         if (r < 0) {
 719             return r;
 720         }
 721     }
 722     dev->log_enabled = enable;
 723     return 0;
 724 }
 725
 726 static void vhost_log_global_start(MemoryListener *listener)
 727 {
 728     int r;
 729
 730     r = vhost_migration_log(listener, true);
 731     if (r < 0) {
 732         abort();
 733     }
 734 }
 735
 736 static void vhost_log_global_stop(MemoryListener *listener)
 737 {
 738     int r;
 739
 740     r = vhost_migration_log(listener, false);
 741     if (r < 0) {
 742         abort();
 743     }
 744 }
 745
/* Per-section log start callback; currently a stub that does nothing. */
static void vhost_log_start(MemoryListener *listener,
                            MemoryRegionSection *section,
                            int old, int new)
{
    /* FIXME: implement */
}
 752
/* Per-section log stop callback; currently a stub that does nothing. */
static void vhost_log_stop(MemoryListener *listener,
                           MemoryRegionSection *section,
                           int old, int new)
{
    /* FIXME: implement */
}
 759
 760 /* The vhost driver natively knows how to handle the vrings of non
 761  * cross-endian legacy devices and modern devices. Only legacy devices
 762  * exposed to a bi-endian guest may require the vhost driver to use a
 763  * specific endianness.
 764  */
 765 static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
 766 {
 767     if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
 768         return false;
 769     }
 770 #ifdef TARGET_IS_BIENDIAN
 771 #ifdef HOST_WORDS_BIGENDIAN
 772     return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
 773 #else
 774     return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
 775 #endif
 776 #else
 777     return false;
 778 #endif
 779 }
 780
 781 static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
 782                                                    bool is_big_endian,
 783                                                    int vhost_vq_index)
 784 {
 785     struct vhost_vring_state s = {
 786         .index = vhost_vq_index,
 787         .num = is_big_endian
 788     };
 789
 790     if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
 791         return 0;
 792     }
 793
 794     if (errno == ENOTTY) {
 795         error_report("vhost does not support cross-endian");
 796         return -ENOSYS;
 797     }
 798
 799     return -errno;
 800 }
 801
 802 static int vhost_virtqueue_start(struct vhost_dev *dev,
 803                                 struct VirtIODevice *vdev,
 804                                 struct vhost_virtqueue *vq,
 805                                 unsigned idx)
 806 {
 807     hwaddr s, l, a;
 808     int r;
 809     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
 810     struct vhost_vring_file file = {
 811         .index = vhost_vq_index
 812     };
 813     struct vhost_vring_state state = {
 814         .index = vhost_vq_index
 815     };
 816     struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
 817
 818
 819     vq->num = state.num = virtio_queue_get_num(vdev, idx);
 820     r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
 821     if (r) {
 822         return -errno;
 823     }
 824
 825     state.num = virtio_queue_get_last_avail_idx(vdev, idx);
 826     r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
 827     if (r) {
 828         return -errno;
 829     }
 830
 831     if (vhost_needs_vring_endian(vdev)) {
 832         r = vhost_virtqueue_set_vring_endian_legacy(dev,
 833                                                     virtio_is_big_endian(vdev),
 834                                                     vhost_vq_index);
 835         if (r) {
 836             return -errno;
 837         }
 838     }
 839
 840     s = l = virtio_queue_get_desc_size(vdev, idx);
 841     a = virtio_queue_get_desc_addr(vdev, idx);
 842     vq->desc = cpu_physical_memory_map(a, &l, 0);
 843     if (!vq->desc || l != s) {
 844         r = -ENOMEM;
 845         goto fail_alloc_desc;
 846     }
 847     s = l = virtio_queue_get_avail_size(vdev, idx);
 848     a = virtio_queue_get_avail_addr(vdev, idx);
 849     vq->avail = cpu_physical_memory_map(a, &l, 0);
 850     if (!vq->avail || l != s) {
 851         r = -ENOMEM;
 852         goto fail_alloc_avail;
 853     }
 854     vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
 855     vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
 856     vq->used = cpu_physical_memory_map(a, &l, 1);
 857     if (!vq->used || l != s) {
 858         r = -ENOMEM;
 859         goto fail_alloc_used;
 860     }
 861
 862     vq->ring_size = s = l = virtio_queue_get_ring_size(vdev, idx);
 863     vq->ring_phys = a = virtio_queue_get_ring_addr(vdev, idx);
 864     vq->ring = cpu_physical_memory_map(a, &l, 1);
 865     if (!vq->ring || l != s) {
 866         r = -ENOMEM;
 867         goto fail_alloc_ring;
 868     }
 869
 870     r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
 871     if (r < 0) {
 872         r = -errno;
 873         goto fail_alloc;
 874     }
 875
 876     file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
 877     r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
 878     if (r) {
 879         r = -errno;
 880         goto fail_kick;
 881     }
 882
 883     /* Clear and discard previous events if any. */
 884     event_notifier_test_and_clear(&vq->masked_notifier);
 885
 886     /* Init vring in unmasked state, unless guest_notifier_mask
 887      * will do it later.
 888      */
 889     if (!vdev->use_guest_notifier_mask) {
 890         /* TODO: check and handle errors. */
 891         vhost_virtqueue_mask(dev, vdev, idx, false);
 892     }
 893
 894     return 0;
 895
 896 fail_kick:
 897 fail_alloc:
 898     cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
 899                               0, 0);
 900 fail_alloc_ring:
 901     cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
 902                               0, 0);
 903 fail_alloc_used:
 904     cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
 905                               0, 0);
 906 fail_alloc_avail:
 907     cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
 908                               0, 0);
 909 fail_alloc_desc:
 910     return r;
 911 }
 912
 913 static void vhost_virtqueue_stop(struct vhost_dev *dev,
 914                                     struct VirtIODevice *vdev,
 915                                     struct vhost_virtqueue *vq,
 916                                     unsigned idx)
 917 {
 918     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
 919     struct vhost_vring_state state = {
 920         .index = vhost_vq_index,
 921     };
 922     int r;
 923
 924     r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
 925     if (r < 0) {
 926         fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r);
 927         fflush(stderr);
 928     }
 929     virtio_queue_set_last_avail_idx(vdev, idx, state.num);
 930     virtio_queue_invalidate_signalled_used(vdev, idx);
 931
 932     /* In the cross-endian case, we need to reset the vring endianness to
 933      * native as legacy devices expect so by default.
 934      */
 935     if (vhost_needs_vring_endian(vdev)) {
 936         r = vhost_virtqueue_set_vring_endian_legacy(dev,
 937                                                     !virtio_is_big_endian(vdev),
 938                                                     vhost_vq_index);
 939         if (r < 0) {
 940             error_report("failed to reset vring endianness");
 941         }
 942     }
 943
 944     assert (r >= 0);
 945     cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
 946                               0, virtio_queue_get_ring_size(vdev, idx));
 947     cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
 948                               1, virtio_queue_get_used_size(vdev, idx));
 949     cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
 950                               0, virtio_queue_get_avail_size(vdev, idx));
 951     cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
 952                               0, virtio_queue_get_desc_size(vdev, idx));
 953 }
 954
 955 static void vhost_eventfd_add(MemoryListener *listener,
 956                               MemoryRegionSection *section,
 957                               bool match_data, uint64_t data, EventNotifier *e)
 958 {
 959 }
 960
 961 static void vhost_eventfd_del(MemoryListener *listener,
 962                               MemoryRegionSection *section,
 963                               bool match_data, uint64_t data, EventNotifier *e)
 964 {
 965 }
 966
 967 static int vhost_virtqueue_init(struct vhost_dev *dev,
 968                                 struct vhost_virtqueue *vq, int n)
 969 {
 970     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
 971     struct vhost_vring_file file = {
 972         .index = vhost_vq_index,
 973     };
 974     int r = event_notifier_init(&vq->masked_notifier, 0);
 975     if (r < 0) {
 976         return r;
 977     }
 978
 979     file.fd = event_notifier_get_fd(&vq->masked_notifier);
 980     r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
 981     if (r) {
 982         r = -errno;
 983         goto fail_call;
 984     }
 985     return 0;
 986 fail_call:
 987     event_notifier_cleanup(&vq->masked_notifier);
 988     return r;
 989 }
 990
 991 static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
 992 {
 993     event_notifier_cleanup(&vq->masked_notifier);
 994 }
 995
 996 int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
 997                    VhostBackendType backend_type)
 998 {
 999     uint64_t features;
1000     int i, r;
1001
1002     hdev->migration_blocker = NULL;
1003
1004     if (vhost_set_backend_type(hdev, backend_type) < 0) {
1005         close((uintptr_t)opaque);
1006         return -1;
1007     }
1008
1009     if (hdev->vhost_ops->vhost_backend_init(hdev, opaque) < 0) {
1010         close((uintptr_t)opaque);
1011         return -errno;
1012     }
1013
1014     if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
1015         fprintf(stderr, "vhost backend memory slots limit is less"
1016                 " than current number of present memory slots\n");
1017         close((uintptr_t)opaque);
1018         return -1;
1019     }
1020     QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
1021
1022     r = hdev->vhost_ops->vhost_set_owner(hdev);
1023     if (r < 0) {
1024         goto fail;
1025     }
1026
1027     r = hdev->vhost_ops->vhost_get_features(hdev, &features);
1028     if (r < 0) {
1029         goto fail;
1030     }
1031
1032     for (i = 0; i < hdev->nvqs; ++i) {
1033         r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
1034         if (r < 0) {
1035             goto fail_vq;
1036         }
1037     }
1038     hdev->features = features;
1039
1040     hdev->memory_listener = (MemoryListener) {
1041         .begin = vhost_begin,
1042         .commit = vhost_commit,
1043         .region_add = vhost_region_add,
1044         .region_del = vhost_region_del,
1045         .region_nop = vhost_region_nop,
1046         .log_start = vhost_log_start,
1047         .log_stop = vhost_log_stop,
1048         .log_sync = vhost_log_sync,
1049         .log_global_start = vhost_log_global_start,
1050         .log_global_stop = vhost_log_global_stop,
1051         .eventfd_add = vhost_eventfd_add,
1052         .eventfd_del = vhost_eventfd_del,
1053         .priority = 10
1054     };
1055
1056     if (hdev->migration_blocker == NULL) {
1057         if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
1058             error_setg(&hdev->migration_blocker,
1059                        "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
1060         } else if (!qemu_memfd_check()) {
1061             error_setg(&hdev->migration_blocker,
1062                        "Migration disabled: failed to allocate shared memory");
1063         }
1064     }
1065
1066     if (hdev->migration_blocker != NULL) {
1067         migrate_add_blocker(hdev->migration_blocker);
1068     }
1069
1070     hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
1071     hdev->n_mem_sections = 0;
1072     hdev->mem_sections = NULL;
1073     hdev->log = NULL;
1074     hdev->log_size = 0;
1075     hdev->log_enabled = false;
1076     hdev->started = false;
1077     hdev->memory_changed = false;
1078     memory_listener_register(&hdev->memory_listener, &address_space_memory);
1079     return 0;
1080 fail_vq:
1081     while (--i >= 0) {
1082         vhost_virtqueue_cleanup(hdev->vqs + i);
1083     }
1084 fail:
1085     r = -errno;
1086     hdev->vhost_ops->vhost_backend_cleanup(hdev);
1087     QLIST_REMOVE(hdev, entry);
1088     return r;
1089 }
1090
1091 void vhost_dev_cleanup(struct vhost_dev *hdev)
1092 {
1093     int i;
1094     for (i = 0; i < hdev->nvqs; ++i) {
1095         vhost_virtqueue_cleanup(hdev->vqs + i);
1096     }
1097     memory_listener_unregister(&hdev->memory_listener);
1098     if (hdev->migration_blocker) {
1099         migrate_del_blocker(hdev->migration_blocker);
1100         error_free(hdev->migration_blocker);
1101     }
1102     g_free(hdev->mem);
1103     g_free(hdev->mem_sections);
1104     hdev->vhost_ops->vhost_backend_cleanup(hdev);
1105     QLIST_REMOVE(hdev, entry);
1106 }
1107
1108 /* Stop processing guest IO notifications in qemu.
1109  * Start processing them in vhost in kernel.
1110  */
1111 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1112 {
1113     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1114     VirtioBusState *vbus = VIRTIO_BUS(qbus);
1115     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
1116     int i, r, e;
1117     if (!k->set_host_notifier) {
1118         fprintf(stderr, "binding does not support host notifiers\n");
1119         r = -ENOSYS;
1120         goto fail;
1121     }
1122
1123     for (i = 0; i < hdev->nvqs; ++i) {
1124         r = k->set_host_notifier(qbus->parent, hdev->vq_index + i, true);
1125         if (r < 0) {
1126             fprintf(stderr, "vhost VQ %d notifier binding failed: %d\n", i, -r);
1127             goto fail_vq;
1128         }
1129     }
1130
1131     return 0;
1132 fail_vq:
1133     while (--i >= 0) {
1134         e = k->set_host_notifier(qbus->parent, hdev->vq_index + i, false);
1135         if (e < 0) {
1136             fprintf(stderr, "vhost VQ %d notifier cleanup error: %d\n", i, -r);
1137             fflush(stderr);
1138         }
1139         assert (e >= 0);
1140     }
1141 fail:
1142     return r;
1143 }
1144
1145 /* Stop processing guest IO notifications in vhost.
1146  * Start processing them in qemu.
1147  * This might actually run the qemu handlers right away,
1148  * so virtio in qemu must be completely setup when this is called.
1149  */
1150 void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1151 {
1152     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1153     VirtioBusState *vbus = VIRTIO_BUS(qbus);
1154     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
1155     int i, r;
1156
1157     for (i = 0; i < hdev->nvqs; ++i) {
1158         r = k->set_host_notifier(qbus->parent, hdev->vq_index + i, false);
1159         if (r < 0) {
1160             fprintf(stderr, "vhost VQ %d notifier cleanup failed: %d\n", i, -r);
1161             fflush(stderr);
1162         }
1163         assert (r >= 0);
1164     }
1165 }
1166
1167 /* Test and clear event pending status.
1168  * Should be called after unmask to avoid losing events.
1169  */
1170 bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
1171 {
1172     struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
1173     assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
1174     return event_notifier_test_and_clear(&vq->masked_notifier);
1175 }
1176
1177 /* Mask/unmask events from this vq. */
1178 void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
1179                          bool mask)
1180 {
1181     struct VirtQueue *vvq = virtio_get_queue(vdev, n);
1182     int r, index = n - hdev->vq_index;
1183     struct vhost_vring_file file;
1184
1185     if (mask) {
1186         assert(vdev->use_guest_notifier_mask);
1187         file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
1188     } else {
1189         file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
1190     }
1191
1192     file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
1193     r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
1194     assert(r >= 0);
1195 }
1196
1197 uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
1198                             uint64_t features)
1199 {
1200     const int *bit = feature_bits;
1201     while (*bit != VHOST_INVALID_FEATURE_BIT) {
1202         uint64_t bit_mask = (1ULL << *bit);
1203         if (!(hdev->features & bit_mask)) {
1204             features &= ~bit_mask;
1205         }
1206         bit++;
1207     }
1208     return features;
1209 }
1210
1211 void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
1212                         uint64_t features)
1213 {
1214     const int *bit = feature_bits;
1215     while (*bit != VHOST_INVALID_FEATURE_BIT) {
1216         uint64_t bit_mask = (1ULL << *bit);
1217         if (features & bit_mask) {
1218             hdev->acked_features |= bit_mask;
1219         }
1220         bit++;
1221     }
1222 }
1223
1224 /* Host notifiers must be enabled at this point. */
1225 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
1226 {
1227     int i, r;
1228
1229     hdev->started = true;
1230
1231     r = vhost_dev_set_features(hdev, hdev->log_enabled);
1232     if (r < 0) {
1233         goto fail_features;
1234     }
1235     r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
1236     if (r < 0) {
1237         r = -errno;
1238         goto fail_mem;
1239     }
1240     for (i = 0; i < hdev->nvqs; ++i) {
1241         r = vhost_virtqueue_start(hdev,
1242                                   vdev,
1243                                   hdev->vqs + i,
1244                                   hdev->vq_index + i);
1245         if (r < 0) {
1246             goto fail_vq;
1247         }
1248     }
1249
1250     if (hdev->log_enabled) {
1251         uint64_t log_base;
1252
1253         hdev->log_size = vhost_get_log_size(hdev);
1254         hdev->log = vhost_log_get(hdev->log_size,
1255                                   vhost_dev_log_is_shared(hdev));
1256         log_base = (uintptr_t)hdev->log->log;
1257         r = hdev->vhost_ops->vhost_set_log_base(hdev,
1258                                                 hdev->log_size ? log_base : 0,
1259                                                 hdev->log);
1260         if (r < 0) {
1261             r = -errno;
1262             goto fail_log;
1263         }
1264     }
1265
1266     return 0;
1267 fail_log:
1268     vhost_log_put(hdev, false);
1269 fail_vq:
1270     while (--i >= 0) {
1271         vhost_virtqueue_stop(hdev,
1272                              vdev,
1273                              hdev->vqs + i,
1274                              hdev->vq_index + i);
1275     }
1276     i = hdev->nvqs;
1277 fail_mem:
1278 fail_features:
1279
1280     hdev->started = false;
1281     return r;
1282 }
1283
1284 /* Host notifiers must be enabled at this point. */
1285 void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
1286 {
1287     int i;
1288
1289     for (i = 0; i < hdev->nvqs; ++i) {
1290         vhost_virtqueue_stop(hdev,
1291                              vdev,
1292                              hdev->vqs + i,
1293                              hdev->vq_index + i);
1294     }
1295
1296     vhost_log_put(hdev, true);
1297     hdev->started = false;
1298     hdev->log = NULL;
1299     hdev->log_size = 0;
1300 }
1301