virtio: move cross-endian helper to vhost
/*
 * vhost support
 *
 * Copyright Red Hat, Inc. 2010
 *
 * Authors:
 *  Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */
#include "qemu/osdep.h"
#include "hw/virtio/vhost.h"
#include "hw/hw.h"
#include "qemu/atomic.h"
#include "qemu/range.h"
#include "qemu/error-report.h"
#include "qemu/memfd.h"
#include <linux/vhost.h>
#include "exec/address-spaces.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-access.h"
#include "migration/migration.h"
static struct vhost_log *vhost_log;
static struct vhost_log *vhost_log_shm;

static unsigned int used_memslots;
static QLIST_HEAD(, vhost_dev) vhost_devices =
    QLIST_HEAD_INITIALIZER(vhost_devices);
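
/*
 * used_memslots is updated by the memory listener as devices' region
 * tables change; vhost_has_free_slot() below compares it against the
 * smallest per-backend memslot limit across all registered vhost devices.
 * With no devices registered, slots_limit stays at ~0U and the function
 * trivially reports a free slot.
 */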
bool vhost_has_free_slot(void)
{
    unsigned int slots_limit = ~0U;
    struct vhost_dev *hdev;

    QLIST_FOREACH(hdev, &vhost_devices, entry) {
        unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
        slots_limit = MIN(slots_limit, r);
    }
    return slots_limit > used_memslots;
}
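
/*
 * A worked example of the chunk/bit arithmetic in the function below,
 * assuming the definitions in vhost.h (VHOST_LOG_PAGE = 0x1000 and
 * vhost_log_chunk_t an unsigned long, i.e. 64 bits on an LP64 host, so
 * VHOST_LOG_CHUNK = 64 * 0x1000 = 0x40000): a write dirtying guest
 * physical page 0x42000 sets bit (0x42000 % 0x40000) / 0x1000 = 2 in log
 * chunk 0x42000 / 0x40000 = 1. The scan walks the chunks covering
 * [start, end], clears each non-zero chunk atomically, and marks every
 * set bit's page dirty.
 */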
static void vhost_dev_sync_region(struct vhost_dev *dev,
                                  MemoryRegionSection *section,
                                  uint64_t mfirst, uint64_t mlast,
                                  uint64_t rfirst, uint64_t rlast)
{
    vhost_log_chunk_t *log = dev->log->log;
    uint64_t start = MAX(mfirst, rfirst);
    uint64_t end = MIN(mlast, rlast);
    vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
    vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
    uint64_t addr = (start / VHOST_LOG_CHUNK) * VHOST_LOG_CHUNK;

    if (end < start) {
        return;
    }
    assert(end / VHOST_LOG_CHUNK < dev->log_size);
    assert(start / VHOST_LOG_CHUNK < dev->log_size);

    for (; from < to; ++from) {
        vhost_log_chunk_t log;
        /* We first check with non-atomic: much cheaper,
         * and we expect non-dirty to be the common case. */
        if (!*from) {
            addr += VHOST_LOG_CHUNK;
            continue;
        }
        /* Data must be read atomically. We don't really need barrier semantics
         * but it's easier to use atomic_* than roll our own. */
        log = atomic_xchg(from, 0);
        while (log) {
            int bit = ctzl(log);
            hwaddr page_addr;
            hwaddr section_offset;
            hwaddr mr_offset;
            page_addr = addr + bit * VHOST_LOG_PAGE;
            section_offset = page_addr - section->offset_within_address_space;
            mr_offset = section_offset + section->offset_within_region;
            memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
            log &= ~(0x1ull << bit);
        }
        addr += VHOST_LOG_CHUNK;
    }
}
static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
                                   MemoryRegionSection *section,
                                   hwaddr first,
                                   hwaddr last)
{
    int i;
    hwaddr start_addr;
    hwaddr end_addr;

    if (!dev->log_enabled || !dev->started) {
        return 0;
    }
    start_addr = section->offset_within_address_space;
    end_addr = range_get_last(start_addr, int128_get64(section->size));
    start_addr = MAX(first, start_addr);
    end_addr = MIN(last, end_addr);

    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        vhost_dev_sync_region(dev, section, start_addr, end_addr,
                              reg->guest_phys_addr,
                              range_get_last(reg->guest_phys_addr,
                                             reg->memory_size));
    }
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;
        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
                              range_get_last(vq->used_phys, vq->used_size));
    }
    return 0;
}
static void vhost_log_sync(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
}

static void vhost_log_sync_range(struct vhost_dev *dev,
                                 hwaddr first, hwaddr last)
{
    int i;
    /* FIXME: this is N^2 in number of sections */
    for (i = 0; i < dev->n_mem_sections; ++i) {
        MemoryRegionSection *section = &dev->mem_sections[i];
        vhost_sync_dirty_bitmap(dev, section, first, last);
    }
}
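
/*
 * The four overlap cases vhost_dev_unassign_memory() distinguishes below,
 * with E an existing region and R the range being unassigned:
 *
 *   remove:  R covers all of E          -> drop E
 *   shrink:  R overlaps the tail of E   -> cut E's size
 *   shift:   R overlaps the head of E   -> advance E's start
 *   split:   R is strictly inside E     -> E becomes two regions
 */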
/* Assign/unassign. Keep an unsorted array of non-overlapping
 * memory regions in dev->mem. */
static void vhost_dev_unassign_memory(struct vhost_dev *dev,
                                      uint64_t start_addr,
                                      uint64_t size)
{
    int from, to, n = dev->mem->nregions;
    /* Track overlapping/split regions for sanity checking. */
    int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;

    for (from = 0, to = 0; from < n; ++from, ++to) {
        struct vhost_memory_region *reg = dev->mem->regions + to;
        uint64_t reglast;
        uint64_t memlast;
        uint64_t change;

        /* clone old region */
        if (to != from) {
            memcpy(reg, dev->mem->regions + from, sizeof *reg);
        }

        /* No overlap is simple */
        if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
                            start_addr, size)) {
            continue;
        }

        /* Split only happens if supplied region
         * is in the middle of an existing one. Thus it can not
         * overlap with any other existing region. */
        assert(!split);

        reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
        memlast = range_get_last(start_addr, size);

        /* Remove whole region */
        if (start_addr <= reg->guest_phys_addr && memlast >= reglast) {
            --dev->mem->nregions;
            --to;
            ++overlap_middle;
            continue;
        }

        /* Shrink region */
        if (memlast >= reglast) {
            reg->memory_size = start_addr - reg->guest_phys_addr;
            assert(reg->memory_size);
            assert(!overlap_end);
            ++overlap_end;
            continue;
        }

        /* Shift region */
        if (start_addr <= reg->guest_phys_addr) {
            change = memlast + 1 - reg->guest_phys_addr;
            reg->memory_size -= change;
            reg->guest_phys_addr += change;
            reg->userspace_addr += change;
            assert(reg->memory_size);
            assert(!overlap_start);
            ++overlap_start;
            continue;
        }

        /* This only happens if supplied region
         * is in the middle of an existing one. Thus it can not
         * overlap with any other existing region. */
        assert(!overlap_start);
        assert(!overlap_end);
        assert(!overlap_middle);
        /* Split region: shrink first part, shift second part. */
        memcpy(dev->mem->regions + n, reg, sizeof *reg);
        reg->memory_size = start_addr - reg->guest_phys_addr;
        assert(reg->memory_size);
        change = memlast + 1 - reg->guest_phys_addr;
        reg = dev->mem->regions + n;
        reg->memory_size -= change;
        assert(reg->memory_size);
        reg->guest_phys_addr += change;
        reg->userspace_addr += change;
        /* Never add more than 1 region */
        assert(dev->mem->nregions == n);
        ++dev->mem->nregions;
        ++split;
    }
}
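
/*
 * Merging example for vhost_dev_assign_memory() below (addresses are
 * purely illustrative): if dev->mem already holds a region covering GPA
 * [0x0, 0xffff] backed by userspace address 0x7f0000000000, assigning GPA
 * [0x10000, 0x1ffff] backed by 0x7f0000010000 is adjacent in both address
 * spaces, so the two collapse into one 128KiB region. Regions adjacent in
 * guest physical space but not in userspace (or vice versa) are
 * deliberately left unmerged.
 */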
/* Called after unassign, so no regions overlap the given range. */
static void vhost_dev_assign_memory(struct vhost_dev *dev,
                                    uint64_t start_addr,
                                    uint64_t size,
                                    uint64_t uaddr)
{
    int from, to;
    struct vhost_memory_region *merged = NULL;
    for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
        struct vhost_memory_region *reg = dev->mem->regions + to;
        uint64_t prlast, urlast;
        uint64_t pmlast, umlast;
        uint64_t s, e, u;

        /* clone old region */
        if (to != from) {
            memcpy(reg, dev->mem->regions + from, sizeof *reg);
        }
        prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
        pmlast = range_get_last(start_addr, size);
        urlast = range_get_last(reg->userspace_addr, reg->memory_size);
        umlast = range_get_last(uaddr, size);

        /* check for overlapping regions: should never happen. */
        assert(prlast < start_addr || pmlast < reg->guest_phys_addr);
        /* Not an adjacent or overlapping region - do not merge. */
        if ((prlast + 1 != start_addr || urlast + 1 != uaddr) &&
            (pmlast + 1 != reg->guest_phys_addr ||
             umlast + 1 != reg->userspace_addr)) {
            continue;
        }

        if (merged) {
            --to;
            assert(to >= 0);
        } else {
            merged = reg;
        }
        u = MIN(uaddr, reg->userspace_addr);
        s = MIN(start_addr, reg->guest_phys_addr);
        e = MAX(pmlast, prlast);
        uaddr = merged->userspace_addr = u;
        start_addr = merged->guest_phys_addr = s;
        size = merged->memory_size = e - s + 1;
        assert(merged->memory_size);
    }

    if (!merged) {
        struct vhost_memory_region *reg = dev->mem->regions + to;
        memset(reg, 0, sizeof *reg);
        reg->memory_size = size;
        assert(reg->memory_size);
        reg->guest_phys_addr = start_addr;
        reg->userspace_addr = uaddr;
        ++to;
    }
    assert(to <= dev->mem->nregions + 1);
    dev->mem->nregions = to;
}
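
/*
 * Log size example, under the same VHOST_LOG_CHUNK = 0x40000 assumption as
 * above: a single memory region spanning GPA [0, 256MiB) has its last byte
 * at 0xfffffff, so it needs 0xfffffff / 0x40000 + 1 = 1024 chunks of log.
 * Note the result is a chunk count, not a byte count; vhost_log_alloc()
 * multiplies by sizeof(vhost_log_chunk_t) when allocating.
 */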
static uint64_t vhost_get_log_size(struct vhost_dev *dev)
{
    uint64_t log_size = 0;
    int i;
    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        uint64_t last = range_get_last(reg->guest_phys_addr,
                                       reg->memory_size);
        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
    }
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;
        uint64_t last = vq->used_phys + vq->used_size - 1;
        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
    }
    return log_size;
}
static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
{
    struct vhost_log *log;
    uint64_t logsize = size * sizeof(*(log->log));
    int fd = -1;

    log = g_new0(struct vhost_log, 1);
    if (share) {
        log->log = qemu_memfd_alloc("vhost-log", logsize,
                                    F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
                                    &fd);
        memset(log->log, 0, logsize);
    } else {
        log->log = g_malloc0(logsize);
    }
    log->size = size;
    log->refcnt = 1;
    log->fd = fd;

    return log;
}
static struct vhost_log *vhost_log_get(uint64_t size, bool share)
{
    struct vhost_log *log = share ? vhost_log_shm : vhost_log;

    if (!log || log->size != size) {
        log = vhost_log_alloc(size, share);
        if (share) {
            vhost_log_shm = log;
        } else {
            vhost_log = log;
        }
    } else {
        ++log->refcnt;
    }

    return log;
}
static void vhost_log_put(struct vhost_dev *dev, bool sync)
{
    struct vhost_log *log = dev->log;

    if (!log) {
        return;
    }

    --log->refcnt;
    if (log->refcnt == 0) {
        /* Sync only the range covered by the old log */
        if (dev->log_size && sync) {
            vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
        }

        if (vhost_log == log) {
            g_free(log->log);
            vhost_log = NULL;
        } else if (vhost_log_shm == log) {
            qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
                            log->fd);
            vhost_log_shm = NULL;
        }

        g_free(log);
    }
}
static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
{
    return dev->vhost_ops->vhost_requires_shm_log &&
           dev->vhost_ops->vhost_requires_shm_log(dev);
}

static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
{
    struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
    uint64_t log_base = (uintptr_t)log->log;
    int r;

    /* inform backend of log switching, this must be done before
       releasing the current log, to ensure no logging is lost */
    r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
    assert(r >= 0);
    vhost_log_put(dev, true);
    dev->log = log;
    dev->log_size = size;
}
static int vhost_verify_ring_mappings(struct vhost_dev *dev,
                                      uint64_t start_addr,
                                      uint64_t size)
{
    int i;
    int r = 0;

    for (i = 0; !r && i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;
        hwaddr l;
        void *p;

        if (!ranges_overlap(start_addr, size, vq->ring_phys, vq->ring_size)) {
            continue;
        }
        l = vq->ring_size;
        p = cpu_physical_memory_map(vq->ring_phys, &l, 1);
        if (!p || l != vq->ring_size) {
            fprintf(stderr, "Unable to map ring buffer for ring %d\n", i);
            r = -ENOMEM;
        }
        if (p != vq->ring) {
            fprintf(stderr, "Ring buffer relocated for ring %d\n", i);
            r = -EBUSY;
        }
        cpu_physical_memory_unmap(p, l, 0, 0);
    }
    return r;
}
static struct vhost_memory_region *vhost_dev_find_reg(struct vhost_dev *dev,
                                                      uint64_t start_addr,
                                                      uint64_t size)
{
    int i, n = dev->mem->nregions;
    for (i = 0; i < n; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        if (ranges_overlap(reg->guest_phys_addr, reg->memory_size,
                           start_addr, size)) {
            return reg;
        }
    }
    return NULL;
}
static bool vhost_dev_cmp_memory(struct vhost_dev *dev,
                                 uint64_t start_addr,
                                 uint64_t size,
                                 uint64_t uaddr)
{
    struct vhost_memory_region *reg = vhost_dev_find_reg(dev, start_addr, size);
    uint64_t reglast;
    uint64_t memlast;

    if (!reg) {
        return true;
    }

    reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
    memlast = range_get_last(start_addr, size);

    /* Need to extend region? */
    if (start_addr < reg->guest_phys_addr || memlast > reglast) {
        return true;
    }
    /* userspace_addr changed? */
    return uaddr != reg->userspace_addr + start_addr - reg->guest_phys_addr;
}
static void vhost_set_memory(MemoryListener *listener,
                             MemoryRegionSection *section,
                             bool add)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    hwaddr start_addr = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
    bool log_dirty =
        memory_region_get_dirty_log_mask(section->mr) & ~(1 << DIRTY_MEMORY_MIGRATION);
    int s = offsetof(struct vhost_memory, regions) +
        (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
    void *ram;

    dev->mem = g_realloc(dev->mem, s);

    if (log_dirty) {
        add = false;
    }

    assert(size);

    /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
    ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
    if (add) {
        if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
            /* Region exists with same address. Nothing to do. */
            return;
        }
    } else {
        if (!vhost_dev_find_reg(dev, start_addr, size)) {
            /* Removing region that we don't access. Nothing to do. */
            return;
        }
    }

    vhost_dev_unassign_memory(dev, start_addr, size);
    if (add) {
        /* Add given mapping, merging adjacent regions if any */
        vhost_dev_assign_memory(dev, start_addr, size, (uintptr_t)ram);
    } else {
        /* Remove old mapping for this memory, if any. */
        vhost_dev_unassign_memory(dev, start_addr, size);
    }
    dev->mem_changed_start_addr = MIN(dev->mem_changed_start_addr, start_addr);
    dev->mem_changed_end_addr = MAX(dev->mem_changed_end_addr, start_addr + size - 1);
    dev->memory_changed = true;
    used_memslots = dev->mem->nregions;
}
static bool vhost_section(MemoryRegionSection *section)
{
    return memory_region_is_ram(section->mr);
}

static void vhost_begin(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    dev->mem_changed_end_addr = 0;
    dev->mem_changed_start_addr = -1;
}
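
/*
 * Ordering note for the log resize in vhost_commit() below: growing must
 * happen before the memory table update, so the backend never logs beyond
 * the allocated bitmap, while shrinking may only happen after it, so pages
 * dirtied under the old table are still covered. For instance, if memory
 * hotplug grows the log requirement from 8 to 16 chunks, the 16-chunk log
 * is installed first and only then is the new table pushed to the backend.
 */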
static void vhost_commit(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    hwaddr start_addr = 0;
    ram_addr_t size = 0;
    uint64_t log_size;
    int r;

    if (!dev->memory_changed) {
        return;
    }
    if (!dev->started) {
        return;
    }
    if (dev->mem_changed_start_addr > dev->mem_changed_end_addr) {
        return;
    }

    if (dev->started) {
        start_addr = dev->mem_changed_start_addr;
        size = dev->mem_changed_end_addr - dev->mem_changed_start_addr + 1;

        r = vhost_verify_ring_mappings(dev, start_addr, size);
        assert(r >= 0);
    }

    if (!dev->log_enabled) {
        r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
        assert(r >= 0);
        dev->memory_changed = false;
        return;
    }
    log_size = vhost_get_log_size(dev);
    /* We allocate an extra 4K bytes to log,
     * to reduce the number of reallocations. */
#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
    /* To log more, must increase log size before table update. */
    if (dev->log_size < log_size) {
        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
    }
    r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
    assert(r >= 0);
    /* To log less, can only decrease log size after table update. */
    if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
        vhost_dev_log_resize(dev, log_size);
    }
    dev->memory_changed = false;
}
static void vhost_region_add(MemoryListener *listener,
                             MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);

    if (!vhost_section(section)) {
        return;
    }

    ++dev->n_mem_sections;
    dev->mem_sections = g_renew(MemoryRegionSection, dev->mem_sections,
                                dev->n_mem_sections);
    dev->mem_sections[dev->n_mem_sections - 1] = *section;
    memory_region_ref(section->mr);
    vhost_set_memory(listener, section, true);
}
static void vhost_region_del(MemoryListener *listener,
                             MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    int i;

    if (!vhost_section(section)) {
        return;
    }

    vhost_set_memory(listener, section, false);
    memory_region_unref(section->mr);
    for (i = 0; i < dev->n_mem_sections; ++i) {
        if (dev->mem_sections[i].offset_within_address_space
            == section->offset_within_address_space) {
            --dev->n_mem_sections;
            memmove(&dev->mem_sections[i], &dev->mem_sections[i+1],
                    (dev->n_mem_sections - i) * sizeof(*dev->mem_sections));
            break;
        }
    }
}

static void vhost_region_nop(MemoryListener *listener,
                             MemoryRegionSection *section)
{
}
static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
                                    struct vhost_virtqueue *vq,
                                    unsigned idx, bool enable_log)
{
    struct vhost_vring_addr addr = {
        .index = idx,
        .desc_user_addr = (uint64_t)(unsigned long)vq->desc,
        .avail_user_addr = (uint64_t)(unsigned long)vq->avail,
        .used_user_addr = (uint64_t)(unsigned long)vq->used,
        .log_guest_addr = vq->used_phys,
        .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
    };
    int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
    if (r < 0) {
        return -errno;
    }
    return 0;
}
static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log)
{
    uint64_t features = dev->acked_features;
    int r;
    if (enable_log) {
        features |= 0x1ULL << VHOST_F_LOG_ALL;
    }
    r = dev->vhost_ops->vhost_set_features(dev, features);
    return r < 0 ? -errno : 0;
}
static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
{
    int r, t, i, idx;
    r = vhost_dev_set_features(dev, enable_log);
    if (r < 0) {
        goto err_features;
    }
    for (i = 0; i < dev->nvqs; ++i) {
        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
        r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
                                     enable_log);
        if (r < 0) {
            goto err_vq;
        }
    }
    return 0;
err_vq:
    for (; i >= 0; --i) {
        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
        t = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
                                     dev->log_enabled);
        assert(t >= 0);
    }
    t = vhost_dev_set_features(dev, dev->log_enabled);
    assert(t >= 0);
err_features:
    return r;
}
static int vhost_migration_log(MemoryListener *listener, int enable)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    int r;
    if (!!enable == dev->log_enabled) {
        return 0;
    }
    if (!dev->started) {
        dev->log_enabled = enable;
        return 0;
    }
    if (!enable) {
        r = vhost_dev_set_log(dev, false);
        if (r < 0) {
            return r;
        }
        vhost_log_put(dev, false);
        dev->log = NULL;
        dev->log_size = 0;
    } else {
        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
        r = vhost_dev_set_log(dev, true);
        if (r < 0) {
            return r;
        }
    }
    dev->log_enabled = enable;
    return 0;
}
static void vhost_log_global_start(MemoryListener *listener)
{
    int r;

    r = vhost_migration_log(listener, true);
    if (r < 0) {
        abort();
    }
}

static void vhost_log_global_stop(MemoryListener *listener)
{
    int r;

    r = vhost_migration_log(listener, false);
    if (r < 0) {
        abort();
    }
}

static void vhost_log_start(MemoryListener *listener,
                            MemoryRegionSection *section,
                            int old, int new)
{
    /* FIXME: implement */
}

static void vhost_log_stop(MemoryListener *listener,
                           MemoryRegionSection *section,
                           int old, int new)
{
    /* FIXME: implement */
}
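
/*
 * Example of when the helper below returns true: a legacy (pre-1.0) virtio
 * device presented big-endian to the guest, e.g. a big-endian ppc64 guest
 * on a little-endian ppc64le host. Legacy vrings use guest-native
 * endianness, so the vhost backend must be told explicitly which byte
 * order to use; VIRTIO_F_VERSION_1 devices always use little-endian vrings
 * and need no fixup (callers check that feature bit separately).
 */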
static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
{
#ifdef TARGET_IS_BIENDIAN
#ifdef HOST_WORDS_BIGENDIAN
    return !virtio_is_big_endian(vdev);
#else
    return virtio_is_big_endian(vdev);
#endif
#else
    return false;
#endif
}
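
/*
 * The helper below goes through the backend's set_vring_endian operation;
 * for the kernel backend this is the VHOST_SET_VRING_ENDIAN ioctl. An
 * ioctl the kernel does not recognize (e.g. one built without cross-endian
 * legacy support) surfaces as ENOTTY, which is why that errno gets the
 * dedicated "vhost does not support cross-endian" report rather than being
 * treated as a transient failure.
 */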
static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
                                                   bool is_big_endian,
                                                   int vhost_vq_index)
{
    struct vhost_vring_state s = {
        .index = vhost_vq_index,
        .num = is_big_endian
    };

    if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
        return 0;
    }

    if (errno == ENOTTY) {
        error_report("vhost does not support cross-endian");
        return -ENOSYS;
    }

    return -errno;
}
static int vhost_virtqueue_start(struct vhost_dev *dev,
                                 struct VirtIODevice *vdev,
                                 struct vhost_virtqueue *vq,
                                 unsigned idx)
{
    hwaddr s, l, a;
    int r;
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
    struct vhost_vring_file file = {
        .index = vhost_vq_index
    };
    struct vhost_vring_state state = {
        .index = vhost_vq_index
    };
    struct VirtQueue *vvq = virtio_get_queue(vdev, idx);

    vq->num = state.num = virtio_queue_get_num(vdev, idx);
    r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
    if (r) {
        return -errno;
    }

    state.num = virtio_queue_get_last_avail_idx(vdev, idx);
    r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
    if (r) {
        return -errno;
    }

    if (!virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1) &&
        vhost_needs_vring_endian(vdev)) {
        r = vhost_virtqueue_set_vring_endian_legacy(dev,
                                                    virtio_is_big_endian(vdev),
                                                    vhost_vq_index);
        if (r) {
            return -errno;
        }
    }

    s = l = virtio_queue_get_desc_size(vdev, idx);
    a = virtio_queue_get_desc_addr(vdev, idx);
    vq->desc = cpu_physical_memory_map(a, &l, 0);
    if (!vq->desc || l != s) {
        r = -ENOMEM;
        goto fail_alloc_desc;
    }
    s = l = virtio_queue_get_avail_size(vdev, idx);
    a = virtio_queue_get_avail_addr(vdev, idx);
    vq->avail = cpu_physical_memory_map(a, &l, 0);
    if (!vq->avail || l != s) {
        r = -ENOMEM;
        goto fail_alloc_avail;
    }
    vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
    vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
    vq->used = cpu_physical_memory_map(a, &l, 1);
    if (!vq->used || l != s) {
        r = -ENOMEM;
        goto fail_alloc_used;
    }

    vq->ring_size = s = l = virtio_queue_get_ring_size(vdev, idx);
    vq->ring_phys = a = virtio_queue_get_ring_addr(vdev, idx);
    vq->ring = cpu_physical_memory_map(a, &l, 1);
    if (!vq->ring || l != s) {
        r = -ENOMEM;
        goto fail_alloc_ring;
    }

    r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
    if (r < 0) {
        r = -errno;
        goto fail_alloc;
    }

    file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
    if (r) {
        r = -errno;
        goto fail_kick;
    }

    /* Clear and discard previous events if any. */
    event_notifier_test_and_clear(&vq->masked_notifier);

    return 0;

fail_kick:
fail_alloc:
    cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
                              0, 0);
fail_alloc_ring:
    cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
                              0, 0);
fail_alloc_used:
    cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
                              0, 0);
fail_alloc_avail:
    cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
                              0, 0);
fail_alloc_desc:
    return r;
}
static void vhost_virtqueue_stop(struct vhost_dev *dev,
                                 struct VirtIODevice *vdev,
                                 struct vhost_virtqueue *vq,
                                 unsigned idx)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
    struct vhost_vring_state state = {
        .index = vhost_vq_index,
    };
    int r;

    r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
    if (r < 0) {
        fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r);
        fflush(stderr);
    }
    virtio_queue_set_last_avail_idx(vdev, idx, state.num);
    virtio_queue_invalidate_signalled_used(vdev, idx);

    /* In the cross-endian case, we need to reset the vring endianness to
     * native as legacy devices expect so by default.
     */
    if (!virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1) &&
        vhost_needs_vring_endian(vdev)) {
        r = vhost_virtqueue_set_vring_endian_legacy(dev,
                                                    !virtio_is_big_endian(vdev),
                                                    vhost_vq_index);
        if (r < 0) {
            error_report("failed to reset vring endianness");
        }
    }

    assert(r >= 0);
    cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
                              0, virtio_queue_get_ring_size(vdev, idx));
    cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
                              1, virtio_queue_get_used_size(vdev, idx));
    cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
                              0, virtio_queue_get_avail_size(vdev, idx));
    cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
                              0, virtio_queue_get_desc_size(vdev, idx));
}
static void vhost_eventfd_add(MemoryListener *listener,
                              MemoryRegionSection *section,
                              bool match_data, uint64_t data, EventNotifier *e)
{
}

static void vhost_eventfd_del(MemoryListener *listener,
                              MemoryRegionSection *section,
                              bool match_data, uint64_t data, EventNotifier *e)
{
}

static int vhost_virtqueue_init(struct vhost_dev *dev,
                                struct vhost_virtqueue *vq, int n)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_file file = {
        .index = vhost_vq_index,
    };
    int r = event_notifier_init(&vq->masked_notifier, 0);
    if (r < 0) {
        return r;
    }

    file.fd = event_notifier_get_fd(&vq->masked_notifier);
    r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
    if (r) {
        r = -errno;
        goto fail_call;
    }
    return 0;
fail_call:
    event_notifier_cleanup(&vq->masked_notifier);
    return r;
}

static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
{
    event_notifier_cleanup(&vq->masked_notifier);
}
int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
                   VhostBackendType backend_type)
{
    uint64_t features;
    int i, r;

    hdev->migration_blocker = NULL;

    if (vhost_set_backend_type(hdev, backend_type) < 0) {
        close((uintptr_t)opaque);
        return -1;
    }

    if (hdev->vhost_ops->vhost_backend_init(hdev, opaque) < 0) {
        close((uintptr_t)opaque);
        return -errno;
    }

    if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
        fprintf(stderr, "vhost backend memory slots limit is less"
                " than current number of present memory slots\n");
        close((uintptr_t)opaque);
        return -1;
    }
    QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);

    r = hdev->vhost_ops->vhost_set_owner(hdev);
    if (r < 0) {
        goto fail;
    }

    r = hdev->vhost_ops->vhost_get_features(hdev, &features);
    if (r < 0) {
        goto fail;
    }

    for (i = 0; i < hdev->nvqs; ++i) {
        r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
        if (r < 0) {
            goto fail_vq;
        }
    }
    hdev->features = features;

    hdev->memory_listener = (MemoryListener) {
        .begin = vhost_begin,
        .commit = vhost_commit,
        .region_add = vhost_region_add,
        .region_del = vhost_region_del,
        .region_nop = vhost_region_nop,
        .log_start = vhost_log_start,
        .log_stop = vhost_log_stop,
        .log_sync = vhost_log_sync,
        .log_global_start = vhost_log_global_start,
        .log_global_stop = vhost_log_global_stop,
        .eventfd_add = vhost_eventfd_add,
        .eventfd_del = vhost_eventfd_del,
        .priority = 10
    };

    if (hdev->migration_blocker == NULL) {
        if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
        } else if (!qemu_memfd_check()) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: failed to allocate shared memory");
        }
    }

    if (hdev->migration_blocker != NULL) {
        migrate_add_blocker(hdev->migration_blocker);
    }

    hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
    hdev->n_mem_sections = 0;
    hdev->mem_sections = NULL;
    hdev->log = NULL;
    hdev->log_size = 0;
    hdev->log_enabled = false;
    hdev->started = false;
    hdev->memory_changed = false;
    memory_listener_register(&hdev->memory_listener, &address_space_memory);
    return 0;
fail_vq:
    while (--i >= 0) {
        vhost_virtqueue_cleanup(hdev->vqs + i);
    }
fail:
    r = -errno;
    hdev->vhost_ops->vhost_backend_cleanup(hdev);
    QLIST_REMOVE(hdev, entry);
    return r;
}
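
/*
 * Typical life cycle of a vhost device, sketched after the way vhost_net
 * drives this API (illustrative only: error handling is omitted and the
 * setup of backend_fd, vqs and vdev is assumed to have happened):
 *
 *     struct vhost_dev dev = { .nvqs = 2, .vqs = vqs, .vq_index = 0 };
 *
 *     vhost_dev_init(&dev, (void *)(uintptr_t)backend_fd,
 *                    VHOST_BACKEND_TYPE_KERNEL);
 *     vhost_dev_enable_notifiers(&dev, vdev);
 *     vhost_dev_start(&dev, vdev);
 *     ...
 *     vhost_dev_stop(&dev, vdev);
 *     vhost_dev_disable_notifiers(&dev, vdev);
 *     vhost_dev_cleanup(&dev);
 */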
void vhost_dev_cleanup(struct vhost_dev *hdev)
{
    int i;
    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_cleanup(hdev->vqs + i);
    }
    memory_listener_unregister(&hdev->memory_listener);
    if (hdev->migration_blocker) {
        migrate_del_blocker(hdev->migration_blocker);
        error_free(hdev->migration_blocker);
    }
    g_free(hdev->mem);
    g_free(hdev->mem_sections);
    hdev->vhost_ops->vhost_backend_cleanup(hdev);
    QLIST_REMOVE(hdev, entry);
}
/* Stop processing guest IO notifications in qemu.
 * Start processing them in vhost in kernel.
 */
int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    VirtioBusState *vbus = VIRTIO_BUS(qbus);
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
    int i, r, e;
    if (!k->set_host_notifier) {
        fprintf(stderr, "binding does not support host notifiers\n");
        r = -ENOSYS;
        goto fail;
    }

    for (i = 0; i < hdev->nvqs; ++i) {
        r = k->set_host_notifier(qbus->parent, hdev->vq_index + i, true);
        if (r < 0) {
            fprintf(stderr, "vhost VQ %d notifier binding failed: %d\n", i, -r);
            goto fail_vq;
        }
    }

    return 0;
fail_vq:
    while (--i >= 0) {
        e = k->set_host_notifier(qbus->parent, hdev->vq_index + i, false);
        if (e < 0) {
            fprintf(stderr, "vhost VQ %d notifier cleanup error: %d\n", i, -e);
            fflush(stderr);
        }
        assert(e >= 0);
    }
fail:
    return r;
}
/* Stop processing guest IO notifications in vhost.
 * Start processing them in qemu.
 * This might actually run the qemu handlers right away,
 * so virtio in qemu must be completely setup when this is called.
 */
void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    VirtioBusState *vbus = VIRTIO_BUS(qbus);
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
    int i, r;

    for (i = 0; i < hdev->nvqs; ++i) {
        r = k->set_host_notifier(qbus->parent, hdev->vq_index + i, false);
        if (r < 0) {
            fprintf(stderr, "vhost VQ %d notifier cleanup failed: %d\n", i, -r);
            fflush(stderr);
        }
        assert(r >= 0);
    }
}
/* Test and clear event pending status.
 * Should be called after unmask to avoid losing events.
 */
bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
{
    struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
    assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
    return event_notifier_test_and_clear(&vq->masked_notifier);
}

/* Mask/unmask events from this vq. */
void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
                          bool mask)
{
    struct VirtQueue *vvq = virtio_get_queue(vdev, n);
    int r, index = n - hdev->vq_index;
    struct vhost_vring_file file;

    if (mask) {
        file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
    } else {
        file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
    }

    file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
    r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
    assert(r >= 0);
}
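
/*
 * Worked example for the two helpers below, with a hypothetical
 * feature_bits[] = { VIRTIO_RING_F_INDIRECT_DESC, VIRTIO_NET_F_MRG_RXBUF,
 * VHOST_INVALID_FEATURE_BIT }: vhost_get_features() clears each listed bit
 * from the caller's feature word unless the backend offered it in
 * hdev->features; vhost_ack_features() ORs each listed bit the guest acked
 * into hdev->acked_features. Bits not named in the array are never touched
 * by either helper.
 */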
uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
                            uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (!(hdev->features & bit_mask)) {
            features &= ~bit_mask;
        }
        bit++;
    }
    return features;
}
void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
                        uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (features & bit_mask) {
            hdev->acked_features |= bit_mask;
        }
        bit++;
    }
}
/* Host notifiers must be enabled at this point. */
int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    int i, r;

    hdev->started = true;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        goto fail_features;
    }
    r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
    if (r < 0) {
        r = -errno;
        goto fail_mem;
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        r = vhost_virtqueue_start(hdev,
                                  vdev,
                                  hdev->vqs + i,
                                  hdev->vq_index + i);
        if (r < 0) {
            goto fail_vq;
        }
    }

    if (hdev->log_enabled) {
        uint64_t log_base;

        hdev->log_size = vhost_get_log_size(hdev);
        hdev->log = vhost_log_get(hdev->log_size,
                                  vhost_dev_log_is_shared(hdev));
        log_base = (uintptr_t)hdev->log->log;
        r = hdev->vhost_ops->vhost_set_log_base(hdev,
                                                hdev->log_size ? log_base : 0,
                                                hdev->log);
        if (r < 0) {
            r = -errno;
            goto fail_log;
        }
    }

    return 0;
fail_log:
    vhost_log_put(hdev, false);
fail_vq:
    while (--i >= 0) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }
    i = hdev->nvqs;
fail_mem:
fail_features:

    hdev->started = false;
    return r;
}
/* Host notifiers must be enabled at this point. */
void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    int i;

    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }

    vhost_log_put(hdev, true);
    hdev->started = false;
    hdev->log = NULL;
    hdev->log_size = 0;
}