hw/virtio/vhost.c

   1 /*
   2  * vhost support
   3  *
   4  * Copyright Red Hat, Inc. 2010
   5  *
   6  * Authors:
   7  *  Michael S. Tsirkin <mst@redhat.com>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2.  See
  10  * the COPYING file in the top-level directory.
  11  *
  12  * Contributions after 2012-01-13 are licensed under the terms of the
  13  * GNU GPL, version 2 or (at your option) any later version.
  14  */
  15
  16 #include "qemu/osdep.h"
  17 #include "hw/virtio/vhost.h"
  18 #include "hw/hw.h"
  19 #include "qemu/atomic.h"
  20 #include "qemu/range.h"
  21 #include "qemu/error-report.h"
  22 #include "qemu/memfd.h"
  23 #include <linux/vhost.h>
  24 #include "exec/address-spaces.h"
  25 #include "hw/virtio/virtio-bus.h"
  26 #include "hw/virtio/virtio-access.h"
  27 #include "migration/migration.h"
  28
/* Current log handed out by vhost_log_get() for non-shared (heap) logs. */
static struct vhost_log *vhost_log;
/* Current log handed out by vhost_log_get() for shared-memory (memfd) logs. */
static struct vhost_log *vhost_log_shm;

/* Region count of the memory table last modified by vhost_set_memory(). */
static unsigned int used_memslots;
/* Devices consulted by vhost_has_free_slot() for their memslot limits. */
static QLIST_HEAD(, vhost_dev) vhost_devices =
    QLIST_HEAD_INITIALIZER(vhost_devices);
  35
  36 bool vhost_has_free_slot(void)
  37 {
  38     unsigned int slots_limit = ~0U;
  39     struct vhost_dev *hdev;
  40
  41     QLIST_FOREACH(hdev, &vhost_devices, entry) {
  42         unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
  43         slots_limit = MIN(slots_limit, r);
  44     }
  45     return slots_limit > used_memslots;
  46 }
  47
  48 static void vhost_dev_sync_region(struct vhost_dev *dev,
  49                                   MemoryRegionSection *section,
  50                                   uint64_t mfirst, uint64_t mlast,
  51                                   uint64_t rfirst, uint64_t rlast)
  52 {
  53     vhost_log_chunk_t *log = dev->log->log;
  54
  55     uint64_t start = MAX(mfirst, rfirst);
  56     uint64_t end = MIN(mlast, rlast);
  57     vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
  58     vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
  59     uint64_t addr = (start / VHOST_LOG_CHUNK) * VHOST_LOG_CHUNK;
  60
  61     if (end < start) {
  62         return;
  63     }
  64     assert(end / VHOST_LOG_CHUNK < dev->log_size);
  65     assert(start / VHOST_LOG_CHUNK < dev->log_size);
  66
  67     for (;from < to; ++from) {
  68         vhost_log_chunk_t log;
  69         /* We first check with non-atomic: much cheaper,
  70          * and we expect non-dirty to be the common case. */
  71         if (!*from) {
  72             addr += VHOST_LOG_CHUNK;
  73             continue;
  74         }
  75         /* Data must be read atomically. We don't really need barrier semantics
  76          * but it's easier to use atomic_* than roll our own. */
  77         log = atomic_xchg(from, 0);
  78         while (log) {
  79             int bit = ctzl(log);
  80             hwaddr page_addr;
  81             hwaddr section_offset;
  82             hwaddr mr_offset;
  83             page_addr = addr + bit * VHOST_LOG_PAGE;
  84             section_offset = page_addr - section->offset_within_address_space;
  85             mr_offset = section_offset + section->offset_within_region;
  86             memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
  87             log &= ~(0x1ull << bit);
  88         }
  89         addr += VHOST_LOG_CHUNK;
  90     }
  91 }
  92
  93 static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
  94                                    MemoryRegionSection *section,
  95                                    hwaddr first,
  96                                    hwaddr last)
  97 {
  98     int i;
  99     hwaddr start_addr;
 100     hwaddr end_addr;
 101
 102     if (!dev->log_enabled || !dev->started) {
 103         return 0;
 104     }
 105     start_addr = section->offset_within_address_space;
 106     end_addr = range_get_last(start_addr, int128_get64(section->size));
 107     start_addr = MAX(first, start_addr);
 108     end_addr = MIN(last, end_addr);
 109
 110     for (i = 0; i < dev->mem->nregions; ++i) {
 111         struct vhost_memory_region *reg = dev->mem->regions + i;
 112         vhost_dev_sync_region(dev, section, start_addr, end_addr,
 113                               reg->guest_phys_addr,
 114                               range_get_last(reg->guest_phys_addr,
 115                                              reg->memory_size));
 116     }
 117     for (i = 0; i < dev->nvqs; ++i) {
 118         struct vhost_virtqueue *vq = dev->vqs + i;
 119         vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
 120                               range_get_last(vq->used_phys, vq->used_size));
 121     }
 122     return 0;
 123 }
 124
 125 static void vhost_log_sync(MemoryListener *listener,
 126                           MemoryRegionSection *section)
 127 {
 128     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 129                                          memory_listener);
 130     vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
 131 }
 132
 133 static void vhost_log_sync_range(struct vhost_dev *dev,
 134                                  hwaddr first, hwaddr last)
 135 {
 136     int i;
 137     /* FIXME: this is N^2 in number of sections */
 138     for (i = 0; i < dev->n_mem_sections; ++i) {
 139         MemoryRegionSection *section = &dev->mem_sections[i];
 140         vhost_sync_dirty_bitmap(dev, section, first, last);
 141     }
 142 }
 143
 144 /* Assign/unassign. Keep an unsorted array of non-overlapping
 145  * memory regions in dev->mem. */
 146 static void vhost_dev_unassign_memory(struct vhost_dev *dev,
 147                                       uint64_t start_addr,
 148                                       uint64_t size)
 149 {
 150     int from, to, n = dev->mem->nregions;
 151     /* Track overlapping/split regions for sanity checking. */
 152     int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;
 153
 154     for (from = 0, to = 0; from < n; ++from, ++to) {
 155         struct vhost_memory_region *reg = dev->mem->regions + to;
 156         uint64_t reglast;
 157         uint64_t memlast;
 158         uint64_t change;
 159
 160         /* clone old region */
 161         if (to != from) {
 162             memcpy(reg, dev->mem->regions + from, sizeof *reg);
 163         }
 164
 165         /* No overlap is simple */
 166         if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
 167                             start_addr, size)) {
 168             continue;
 169         }
 170
 171         /* Split only happens if supplied region
 172          * is in the middle of an existing one. Thus it can not
 173          * overlap with any other existing region. */
 174         assert(!split);
 175
 176         reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
 177         memlast = range_get_last(start_addr, size);
 178
 179         /* Remove whole region */
 180         if (start_addr <= reg->guest_phys_addr && memlast >= reglast) {
 181             --dev->mem->nregions;
 182             --to;
 183             ++overlap_middle;
 184             continue;
 185         }
 186
 187         /* Shrink region */
 188         if (memlast >= reglast) {
 189             reg->memory_size = start_addr - reg->guest_phys_addr;
 190             assert(reg->memory_size);
 191             assert(!overlap_end);
 192             ++overlap_end;
 193             continue;
 194         }
 195
 196         /* Shift region */
 197         if (start_addr <= reg->guest_phys_addr) {
 198             change = memlast + 1 - reg->guest_phys_addr;
 199             reg->memory_size -= change;
 200             reg->guest_phys_addr += change;
 201             reg->userspace_addr += change;
 202             assert(reg->memory_size);
 203             assert(!overlap_start);
 204             ++overlap_start;
 205             continue;
 206         }
 207
 208         /* This only happens if supplied region
 209          * is in the middle of an existing one. Thus it can not
 210          * overlap with any other existing region. */
 211         assert(!overlap_start);
 212         assert(!overlap_end);
 213         assert(!overlap_middle);
 214         /* Split region: shrink first part, shift second part. */
 215         memcpy(dev->mem->regions + n, reg, sizeof *reg);
 216         reg->memory_size = start_addr - reg->guest_phys_addr;
 217         assert(reg->memory_size);
 218         change = memlast + 1 - reg->guest_phys_addr;
 219         reg = dev->mem->regions + n;
 220         reg->memory_size -= change;
 221         assert(reg->memory_size);
 222         reg->guest_phys_addr += change;
 223         reg->userspace_addr += change;
 224         /* Never add more than 1 region */
 225         assert(dev->mem->nregions == n);
 226         ++dev->mem->nregions;
 227         ++split;
 228     }
 229 }
 230
 231 /* Called after unassign, so no regions overlap the given range. */
 232 static void vhost_dev_assign_memory(struct vhost_dev *dev,
 233                                     uint64_t start_addr,
 234                                     uint64_t size,
 235                                     uint64_t uaddr)
 236 {
 237     int from, to;
 238     struct vhost_memory_region *merged = NULL;
 239     for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
 240         struct vhost_memory_region *reg = dev->mem->regions + to;
 241         uint64_t prlast, urlast;
 242         uint64_t pmlast, umlast;
 243         uint64_t s, e, u;
 244
 245         /* clone old region */
 246         if (to != from) {
 247             memcpy(reg, dev->mem->regions + from, sizeof *reg);
 248         }
 249         prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
 250         pmlast = range_get_last(start_addr, size);
 251         urlast = range_get_last(reg->userspace_addr, reg->memory_size);
 252         umlast = range_get_last(uaddr, size);
 253
 254         /* check for overlapping regions: should never happen. */
 255         assert(prlast < start_addr || pmlast < reg->guest_phys_addr);
 256         /* Not an adjacent or overlapping region - do not merge. */
 257         if ((prlast + 1 != start_addr || urlast + 1 != uaddr) &&
 258             (pmlast + 1 != reg->guest_phys_addr ||
 259              umlast + 1 != reg->userspace_addr)) {
 260             continue;
 261         }
 262
 263         if (merged) {
 264             --to;
 265             assert(to >= 0);
 266         } else {
 267             merged = reg;
 268         }
 269         u = MIN(uaddr, reg->userspace_addr);
 270         s = MIN(start_addr, reg->guest_phys_addr);
 271         e = MAX(pmlast, prlast);
 272         uaddr = merged->userspace_addr = u;
 273         start_addr = merged->guest_phys_addr = s;
 274         size = merged->memory_size = e - s + 1;
 275         assert(merged->memory_size);
 276     }
 277
 278     if (!merged) {
 279         struct vhost_memory_region *reg = dev->mem->regions + to;
 280         memset(reg, 0, sizeof *reg);
 281         reg->memory_size = size;
 282         assert(reg->memory_size);
 283         reg->guest_phys_addr = start_addr;
 284         reg->userspace_addr = uaddr;
 285         ++to;
 286     }
 287     assert(to <= dev->mem->nregions + 1);
 288     dev->mem->nregions = to;
 289 }
 290
 291 static uint64_t vhost_get_log_size(struct vhost_dev *dev)
 292 {
 293     uint64_t log_size = 0;
 294     int i;
 295     for (i = 0; i < dev->mem->nregions; ++i) {
 296         struct vhost_memory_region *reg = dev->mem->regions + i;
 297         uint64_t last = range_get_last(reg->guest_phys_addr,
 298                                        reg->memory_size);
 299         log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
 300     }
 301     for (i = 0; i < dev->nvqs; ++i) {
 302         struct vhost_virtqueue *vq = dev->vqs + i;
 303         uint64_t last = vq->used_phys + vq->used_size - 1;
 304         log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
 305     }
 306     return log_size;
 307 }
 308
 309 static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
 310 {
 311     struct vhost_log *log;
 312     uint64_t logsize = size * sizeof(*(log->log));
 313     int fd = -1;
 314
 315     log = g_new0(struct vhost_log, 1);
 316     if (share) {
 317         log->log = qemu_memfd_alloc("vhost-log", logsize,
 318                                     F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
 319                                     &fd);
 320         memset(log->log, 0, logsize);
 321     } else {
 322         log->log = g_malloc0(logsize);
 323     }
 324
 325     log->size = size;
 326     log->refcnt = 1;
 327     log->fd = fd;
 328
 329     return log;
 330 }
 331
 332 static struct vhost_log *vhost_log_get(uint64_t size, bool share)
 333 {
 334     struct vhost_log *log = share ? vhost_log_shm : vhost_log;
 335
 336     if (!log || log->size != size) {
 337         log = vhost_log_alloc(size, share);
 338         if (share) {
 339             vhost_log_shm = log;
 340         } else {
 341             vhost_log = log;
 342         }
 343     } else {
 344         ++log->refcnt;
 345     }
 346
 347     return log;
 348 }
 349
 350 static void vhost_log_put(struct vhost_dev *dev, bool sync)
 351 {
 352     struct vhost_log *log = dev->log;
 353
 354     if (!log) {
 355         return;
 356     }
 357
 358     --log->refcnt;
 359     if (log->refcnt == 0) {
 360         /* Sync only the range covered by the old log */
 361         if (dev->log_size && sync) {
 362             vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
 363         }
 364
 365         if (vhost_log == log) {
 366             g_free(log->log);
 367             vhost_log = NULL;
 368         } else if (vhost_log_shm == log) {
 369             qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
 370                             log->fd);
 371             vhost_log_shm = NULL;
 372         }
 373
 374         g_free(log);
 375     }
 376 }
 377
 378 static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
 379 {
 380     return dev->vhost_ops->vhost_requires_shm_log &&
 381            dev->vhost_ops->vhost_requires_shm_log(dev);
 382 }
 383
 384 static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
 385 {
 386     struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
 387     uint64_t log_base = (uintptr_t)log->log;
 388     int r;
 389
 390     /* inform backend of log switching, this must be done before
 391        releasing the current log, to ensure no logging is lost */
 392     r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
 393     assert(r >= 0);
 394     vhost_log_put(dev, true);
 395     dev->log = log;
 396     dev->log_size = size;
 397 }
 398
 399 static int vhost_verify_ring_mappings(struct vhost_dev *dev,
 400                                       uint64_t start_addr,
 401                                       uint64_t size)
 402 {
 403     int i;
 404     int r = 0;
 405
 406     for (i = 0; !r && i < dev->nvqs; ++i) {
 407         struct vhost_virtqueue *vq = dev->vqs + i;
 408         hwaddr l;
 409         void *p;
 410
 411         if (!ranges_overlap(start_addr, size, vq->ring_phys, vq->ring_size)) {
 412             continue;
 413         }
 414         l = vq->ring_size;
 415         p = cpu_physical_memory_map(vq->ring_phys, &l, 1);
 416         if (!p || l != vq->ring_size) {
 417             fprintf(stderr, "Unable to map ring buffer for ring %d\n", i);
 418             r = -ENOMEM;
 419         }
 420         if (p != vq->ring) {
 421             fprintf(stderr, "Ring buffer relocated for ring %d\n", i);
 422             r = -EBUSY;
 423         }
 424         cpu_physical_memory_unmap(p, l, 0, 0);
 425     }
 426     return r;
 427 }
 428
 429 static struct vhost_memory_region *vhost_dev_find_reg(struct vhost_dev *dev,
 430                                                       uint64_t start_addr,
 431                                                       uint64_t size)
 432 {
 433     int i, n = dev->mem->nregions;
 434     for (i = 0; i < n; ++i) {
 435         struct vhost_memory_region *reg = dev->mem->regions + i;
 436         if (ranges_overlap(reg->guest_phys_addr, reg->memory_size,
 437                            start_addr, size)) {
 438             return reg;
 439         }
 440     }
 441     return NULL;
 442 }
 443
 444 static bool vhost_dev_cmp_memory(struct vhost_dev *dev,
 445                                  uint64_t start_addr,
 446                                  uint64_t size,
 447                                  uint64_t uaddr)
 448 {
 449     struct vhost_memory_region *reg = vhost_dev_find_reg(dev, start_addr, size);
 450     uint64_t reglast;
 451     uint64_t memlast;
 452
 453     if (!reg) {
 454         return true;
 455     }
 456
 457     reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
 458     memlast = range_get_last(start_addr, size);
 459
 460     /* Need to extend region? */
 461     if (start_addr < reg->guest_phys_addr || memlast > reglast) {
 462         return true;
 463     }
 464     /* userspace_addr changed? */
 465     return uaddr != reg->userspace_addr + start_addr - reg->guest_phys_addr;
 466 }
 467
 468 static void vhost_set_memory(MemoryListener *listener,
 469                              MemoryRegionSection *section,
 470                              bool add)
 471 {
 472     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 473                                          memory_listener);
 474     hwaddr start_addr = section->offset_within_address_space;
 475     ram_addr_t size = int128_get64(section->size);
 476     bool log_dirty =
 477         memory_region_get_dirty_log_mask(section->mr) & ~(1 << DIRTY_MEMORY_MIGRATION);
 478     int s = offsetof(struct vhost_memory, regions) +
 479         (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
 480     void *ram;
 481
 482     dev->mem = g_realloc(dev->mem, s);
 483
 484     if (log_dirty) {
 485         add = false;
 486     }
 487
 488     assert(size);
 489
 490     /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
 491     ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
 492     if (add) {
 493         if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
 494             /* Region exists with same address. Nothing to do. */
 495             return;
 496         }
 497     } else {
 498         if (!vhost_dev_find_reg(dev, start_addr, size)) {
 499             /* Removing region that we don't access. Nothing to do. */
 500             return;
 501         }
 502     }
 503
 504     vhost_dev_unassign_memory(dev, start_addr, size);
 505     if (add) {
 506         /* Add given mapping, merging adjacent regions if any */
 507         vhost_dev_assign_memory(dev, start_addr, size, (uintptr_t)ram);
 508     } else {
 509         /* Remove old mapping for this memory, if any. */
 510         vhost_dev_unassign_memory(dev, start_addr, size);
 511     }
 512     dev->mem_changed_start_addr = MIN(dev->mem_changed_start_addr, start_addr);
 513     dev->mem_changed_end_addr = MAX(dev->mem_changed_end_addr, start_addr + size - 1);
 514     dev->memory_changed = true;
 515     used_memslots = dev->mem->nregions;
 516 }
 517
 518 static bool vhost_section(MemoryRegionSection *section)
 519 {
 520     return memory_region_is_ram(section->mr);
 521 }
 522
 523 static void vhost_begin(MemoryListener *listener)
 524 {
 525     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 526                                          memory_listener);
 527     dev->mem_changed_end_addr = 0;
 528     dev->mem_changed_start_addr = -1;
 529 }
 530
 531 static void vhost_commit(MemoryListener *listener)
 532 {
 533     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 534                                          memory_listener);
 535     hwaddr start_addr = 0;
 536     ram_addr_t size = 0;
 537     uint64_t log_size;
 538     int r;
 539
 540     if (!dev->memory_changed) {
 541         return;
 542     }
 543     if (!dev->started) {
 544         return;
 545     }
 546     if (dev->mem_changed_start_addr > dev->mem_changed_end_addr) {
 547         return;
 548     }
 549
 550     if (dev->started) {
 551         start_addr = dev->mem_changed_start_addr;
 552         size = dev->mem_changed_end_addr - dev->mem_changed_start_addr + 1;
 553
 554         r = vhost_verify_ring_mappings(dev, start_addr, size);
 555         assert(r >= 0);
 556     }
 557
 558     if (!dev->log_enabled) {
 559         r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
 560         assert(r >= 0);
 561         dev->memory_changed = false;
 562         return;
 563     }
 564     log_size = vhost_get_log_size(dev);
 565     /* We allocate an extra 4K bytes to log,
 566      * to reduce the * number of reallocations. */
 567 #define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
 568     /* To log more, must increase log size before table update. */
 569     if (dev->log_size < log_size) {
 570         vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
 571     }
 572     r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
 573     assert(r >= 0);
 574     /* To log less, can only decrease log size after table update. */
 575     if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
 576         vhost_dev_log_resize(dev, log_size);
 577     }
 578     dev->memory_changed = false;
 579 }
 580
 581 static void vhost_region_add(MemoryListener *listener,
 582                              MemoryRegionSection *section)
 583 {
 584     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 585                                          memory_listener);
 586
 587     if (!vhost_section(section)) {
 588         return;
 589     }
 590
 591     ++dev->n_mem_sections;
 592     dev->mem_sections = g_renew(MemoryRegionSection, dev->mem_sections,
 593                                 dev->n_mem_sections);
 594     dev->mem_sections[dev->n_mem_sections - 1] = *section;
 595     memory_region_ref(section->mr);
 596     vhost_set_memory(listener, section, true);
 597 }
 598
 599 static void vhost_region_del(MemoryListener *listener,
 600                              MemoryRegionSection *section)
 601 {
 602     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 603                                          memory_listener);
 604     int i;
 605
 606     if (!vhost_section(section)) {
 607         return;
 608     }
 609
 610     vhost_set_memory(listener, section, false);
 611     memory_region_unref(section->mr);
 612     for (i = 0; i < dev->n_mem_sections; ++i) {
 613         if (dev->mem_sections[i].offset_within_address_space
 614             == section->offset_within_address_space) {
 615             --dev->n_mem_sections;
 616             memmove(&dev->mem_sections[i], &dev->mem_sections[i+1],
 617                     (dev->n_mem_sections - i) * sizeof(*dev->mem_sections));
 618             break;
 619         }
 620     }
 621 }
 622
 623 static void vhost_region_nop(MemoryListener *listener,
 624                              MemoryRegionSection *section)
 625 {
 626 }
 627
 628 static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
 629                                     struct vhost_virtqueue *vq,
 630                                     unsigned idx, bool enable_log)
 631 {
 632     struct vhost_vring_addr addr = {
 633         .index = idx,
 634         .desc_user_addr = (uint64_t)(unsigned long)vq->desc,
 635         .avail_user_addr = (uint64_t)(unsigned long)vq->avail,
 636         .used_user_addr = (uint64_t)(unsigned long)vq->used,
 637         .log_guest_addr = vq->used_phys,
 638         .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
 639     };
 640     int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
 641     if (r < 0) {
 642         return -errno;
 643     }
 644     return 0;
 645 }
 646
 647 static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log)
 648 {
 649     uint64_t features = dev->acked_features;
 650     int r;
 651     if (enable_log) {
 652         features |= 0x1ULL << VHOST_F_LOG_ALL;
 653     }
 654     r = dev->vhost_ops->vhost_set_features(dev, features);
 655     return r < 0 ? -errno : 0;
 656 }
 657
 658 static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
 659 {
 660     int r, t, i, idx;
 661     r = vhost_dev_set_features(dev, enable_log);
 662     if (r < 0) {
 663         goto err_features;
 664     }
 665     for (i = 0; i < dev->nvqs; ++i) {
 666         idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
 667         r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
 668                                      enable_log);
 669         if (r < 0) {
 670             goto err_vq;
 671         }
 672     }
 673     return 0;
 674 err_vq:
 675     for (; i >= 0; --i) {
 676         idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
 677         t = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
 678                                      dev->log_enabled);
 679         assert(t >= 0);
 680     }
 681     t = vhost_dev_set_features(dev, dev->log_enabled);
 682     assert(t >= 0);
 683 err_features:
 684     return r;
 685 }
 686
 687 static int vhost_migration_log(MemoryListener *listener, int enable)
 688 {
 689     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 690                                          memory_listener);
 691     int r;
 692     if (!!enable == dev->log_enabled) {
 693         return 0;
 694     }
 695     if (!dev->started) {
 696         dev->log_enabled = enable;
 697         return 0;
 698     }
 699     if (!enable) {
 700         r = vhost_dev_set_log(dev, false);
 701         if (r < 0) {
 702             return r;
 703         }
 704         vhost_log_put(dev, false);
 705         dev->log = NULL;
 706         dev->log_size = 0;
 707     } else {
 708         vhost_dev_log_resize(dev, vhost_get_log_size(dev));
 709         r = vhost_dev_set_log(dev, true);
 710         if (r < 0) {
 711             return r;
 712         }
 713     }
 714     dev->log_enabled = enable;
 715     return 0;
 716 }
 717
 718 static void vhost_log_global_start(MemoryListener *listener)
 719 {
 720     int r;
 721
 722     r = vhost_migration_log(listener, true);
 723     if (r < 0) {
 724         abort();
 725     }
 726 }
 727
 728 static void vhost_log_global_stop(MemoryListener *listener)
 729 {
 730     int r;
 731
 732     r = vhost_migration_log(listener, false);
 733     if (r < 0) {
 734         abort();
 735     }
 736 }
 737
 738 static void vhost_log_start(MemoryListener *listener,
 739                             MemoryRegionSection *section,
 740                             int old, int new)
 741 {
 742     /* FIXME: implement */
 743 }
 744
 745 static void vhost_log_stop(MemoryListener *listener,
 746                            MemoryRegionSection *section,
 747                            int old, int new)
 748 {
 749     /* FIXME: implement */
 750 }
 751
 752 static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
 753                                                    bool is_big_endian,
 754                                                    int vhost_vq_index)
 755 {
 756     struct vhost_vring_state s = {
 757         .index = vhost_vq_index,
 758         .num = is_big_endian
 759     };
 760
 761     if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
 762         return 0;
 763     }
 764
 765     if (errno == ENOTTY) {
 766         error_report("vhost does not support cross-endian");
 767         return -ENOSYS;
 768     }
 769
 770     return -errno;
 771 }
 772
 773 static int vhost_virtqueue_start(struct vhost_dev *dev,
 774                                 struct VirtIODevice *vdev,
 775                                 struct vhost_virtqueue *vq,
 776                                 unsigned idx)
 777 {
 778     hwaddr s, l, a;
 779     int r;
 780     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
 781     struct vhost_vring_file file = {
 782         .index = vhost_vq_index
 783     };
 784     struct vhost_vring_state state = {
 785         .index = vhost_vq_index
 786     };
 787     struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
 788
 789
 790     vq->num = state.num = virtio_queue_get_num(vdev, idx);
 791     r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
 792     if (r) {
 793         return -errno;
 794     }
 795
 796     state.num = virtio_queue_get_last_avail_idx(vdev, idx);
 797     r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
 798     if (r) {
 799         return -errno;
 800     }
 801
 802     if (!virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1) &&
 803         virtio_legacy_is_cross_endian(vdev)) {
 804         r = vhost_virtqueue_set_vring_endian_legacy(dev,
 805                                                     virtio_is_big_endian(vdev),
 806                                                     vhost_vq_index);
 807         if (r) {
 808             return -errno;
 809         }
 810     }
 811
 812     s = l = virtio_queue_get_desc_size(vdev, idx);
 813     a = virtio_queue_get_desc_addr(vdev, idx);
 814     vq->desc = cpu_physical_memory_map(a, &l, 0);
 815     if (!vq->desc || l != s) {
 816         r = -ENOMEM;
 817         goto fail_alloc_desc;
 818     }
 819     s = l = virtio_queue_get_avail_size(vdev, idx);
 820     a = virtio_queue_get_avail_addr(vdev, idx);
 821     vq->avail = cpu_physical_memory_map(a, &l, 0);
 822     if (!vq->avail || l != s) {
 823         r = -ENOMEM;
 824         goto fail_alloc_avail;
 825     }
 826     vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
 827     vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
 828     vq->used = cpu_physical_memory_map(a, &l, 1);
 829     if (!vq->used || l != s) {
 830         r = -ENOMEM;
 831         goto fail_alloc_used;
 832     }
 833
 834     vq->ring_size = s = l = virtio_queue_get_ring_size(vdev, idx);
 835     vq->ring_phys = a = virtio_queue_get_ring_addr(vdev, idx);
 836     vq->ring = cpu_physical_memory_map(a, &l, 1);
 837     if (!vq->ring || l != s) {
 838         r = -ENOMEM;
 839         goto fail_alloc_ring;
 840     }
 841
 842     r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
 843     if (r < 0) {
 844         r = -errno;
 845         goto fail_alloc;
 846     }
 847
 848     file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
 849     r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
 850     if (r) {
 851         r = -errno;
 852         goto fail_kick;
 853     }
 854
 855     /* Clear and discard previous events if any. */
 856     event_notifier_test_and_clear(&vq->masked_notifier);
 857
 858     return 0;
 859
 860 fail_kick:
 861 fail_alloc:
 862     cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
 863                               0, 0);
 864 fail_alloc_ring:
 865     cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
 866                               0, 0);
 867 fail_alloc_used:
 868     cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
 869                               0, 0);
 870 fail_alloc_avail:
 871     cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
 872                               0, 0);
 873 fail_alloc_desc:
 874     return r;
 875 }
 876
 877 static void vhost_virtqueue_stop(struct vhost_dev *dev,
 878                                     struct VirtIODevice *vdev,
 879                                     struct vhost_virtqueue *vq,
 880                                     unsigned idx)
 881 {
 882     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
 883     struct vhost_vring_state state = {
 884         .index = vhost_vq_index,
 885     };
 886     int r;
 887
 888     r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
 889     if (r < 0) {
 890         fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r);
 891         fflush(stderr);
 892     }
 893     virtio_queue_set_last_avail_idx(vdev, idx, state.num);
 894     virtio_queue_invalidate_signalled_used(vdev, idx);
 895
 896     /* In the cross-endian case, we need to reset the vring endianness to
 897      * native as legacy devices expect so by default.
 898      */
 899     if (!virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1) &&
 900         virtio_legacy_is_cross_endian(vdev)) {
 901         r = vhost_virtqueue_set_vring_endian_legacy(dev,
 902                                                     !virtio_is_big_endian(vdev),
 903                                                     vhost_vq_index);
 904         if (r < 0) {
 905             error_report("failed to reset vring endianness");
 906         }
 907     }
 908
 909     assert (r >= 0);
 910     cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
 911                               0, virtio_queue_get_ring_size(vdev, idx));
 912     cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
 913                               1, virtio_queue_get_used_size(vdev, idx));
 914     cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
 915                               0, virtio_queue_get_avail_size(vdev, idx));
 916     cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
 917                               0, virtio_queue_get_desc_size(vdev, idx));
 918 }
 919
 920 static void vhost_eventfd_add(MemoryListener *listener,
 921                               MemoryRegionSection *section,
 922                               bool match_data, uint64_t data, EventNotifier *e)
 923 {
 924 }
 925
 926 static void vhost_eventfd_del(MemoryListener *listener,
 927                               MemoryRegionSection *section,
 928                               bool match_data, uint64_t data, EventNotifier *e)
 929 {
 930 }
 931
 932 static int vhost_virtqueue_init(struct vhost_dev *dev,
 933                                 struct vhost_virtqueue *vq, int n)
 934 {
 935     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
 936     struct vhost_vring_file file = {
 937         .index = vhost_vq_index,
 938     };
 939     int r = event_notifier_init(&vq->masked_notifier, 0);
 940     if (r < 0) {
 941         return r;
 942     }
 943
 944     file.fd = event_notifier_get_fd(&vq->masked_notifier);
 945     r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
 946     if (r) {
 947         r = -errno;
 948         goto fail_call;
 949     }
 950     return 0;
 951 fail_call:
 952     event_notifier_cleanup(&vq->masked_notifier);
 953     return r;
 954 }
 955
 956 static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
 957 {
 958     event_notifier_cleanup(&vq->masked_notifier);
 959 }
 960
 961 int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
 962                    VhostBackendType backend_type)
 963 {
 964     uint64_t features;
 965     int i, r;
 966
 967     hdev->migration_blocker = NULL;
 968
 969     if (vhost_set_backend_type(hdev, backend_type) < 0) {
 970         close((uintptr_t)opaque);
 971         return -1;
 972     }
 973
 974     if (hdev->vhost_ops->vhost_backend_init(hdev, opaque) < 0) {
 975         close((uintptr_t)opaque);
 976         return -errno;
 977     }
 978
 979     if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
 980         fprintf(stderr, "vhost backend memory slots limit is less"
 981                 " than current number of present memory slots\n");
 982         close((uintptr_t)opaque);
 983         return -1;
 984     }
 985     QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
 986
 987     r = hdev->vhost_ops->vhost_set_owner(hdev);
 988     if (r < 0) {
 989         goto fail;
 990     }
 991
 992     r = hdev->vhost_ops->vhost_get_features(hdev, &features);
 993     if (r < 0) {
 994         goto fail;
 995     }
 996
 997     for (i = 0; i < hdev->nvqs; ++i) {
 998         r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
 999         if (r < 0) {
1000             goto fail_vq;
1001         }
1002     }
1003     hdev->features = features;
1004
1005     hdev->memory_listener = (MemoryListener) {
1006         .begin = vhost_begin,
1007         .commit = vhost_commit,
1008         .region_add = vhost_region_add,
1009         .region_del = vhost_region_del,
1010         .region_nop = vhost_region_nop,
1011         .log_start = vhost_log_start,
1012         .log_stop = vhost_log_stop,
1013         .log_sync = vhost_log_sync,
1014         .log_global_start = vhost_log_global_start,
1015         .log_global_stop = vhost_log_global_stop,
1016         .eventfd_add = vhost_eventfd_add,
1017         .eventfd_del = vhost_eventfd_del,
1018         .priority = 10
1019     };
1020
1021     if (hdev->migration_blocker == NULL) {
1022         if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
1023             error_setg(&hdev->migration_blocker,
1024                        "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
1025         } else if (!qemu_memfd_check()) {
1026             error_setg(&hdev->migration_blocker,
1027                        "Migration disabled: failed to allocate shared memory");
1028         }
1029     }
1030
1031     if (hdev->migration_blocker != NULL) {
1032         migrate_add_blocker(hdev->migration_blocker);
1033     }
1034
1035     hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
1036     hdev->n_mem_sections = 0;
1037     hdev->mem_sections = NULL;
1038     hdev->log = NULL;
1039     hdev->log_size = 0;
1040     hdev->log_enabled = false;
1041     hdev->started = false;
1042     hdev->memory_changed = false;
1043     memory_listener_register(&hdev->memory_listener, &address_space_memory);
1044     return 0;
1045 fail_vq:
1046     while (--i >= 0) {
1047         vhost_virtqueue_cleanup(hdev->vqs + i);
1048     }
1049 fail:
1050     r = -errno;
1051     hdev->vhost_ops->vhost_backend_cleanup(hdev);
1052     QLIST_REMOVE(hdev, entry);
1053     return r;
1054 }
1055
1056 void vhost_dev_cleanup(struct vhost_dev *hdev)
1057 {
1058     int i;
1059     for (i = 0; i < hdev->nvqs; ++i) {
1060         vhost_virtqueue_cleanup(hdev->vqs + i);
1061     }
1062     memory_listener_unregister(&hdev->memory_listener);
1063     if (hdev->migration_blocker) {
1064         migrate_del_blocker(hdev->migration_blocker);
1065         error_free(hdev->migration_blocker);
1066     }
1067     g_free(hdev->mem);
1068     g_free(hdev->mem_sections);
1069     hdev->vhost_ops->vhost_backend_cleanup(hdev);
1070     QLIST_REMOVE(hdev, entry);
1071 }
1072
1073 /* Stop processing guest IO notifications in qemu.
1074  * Start processing them in vhost in kernel.
1075  */
1076 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1077 {
1078     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1079     VirtioBusState *vbus = VIRTIO_BUS(qbus);
1080     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
1081     int i, r, e;
1082     if (!k->set_host_notifier) {
1083         fprintf(stderr, "binding does not support host notifiers\n");
1084         r = -ENOSYS;
1085         goto fail;
1086     }
1087
1088     for (i = 0; i < hdev->nvqs; ++i) {
1089         r = k->set_host_notifier(qbus->parent, hdev->vq_index + i, true);
1090         if (r < 0) {
1091             fprintf(stderr, "vhost VQ %d notifier binding failed: %d\n", i, -r);
1092             goto fail_vq;
1093         }
1094     }
1095
1096     return 0;
1097 fail_vq:
1098     while (--i >= 0) {
1099         e = k->set_host_notifier(qbus->parent, hdev->vq_index + i, false);
1100         if (e < 0) {
1101             fprintf(stderr, "vhost VQ %d notifier cleanup error: %d\n", i, -r);
1102             fflush(stderr);
1103         }
1104         assert (e >= 0);
1105     }
1106 fail:
1107     return r;
1108 }
1109
1110 /* Stop processing guest IO notifications in vhost.
1111  * Start processing them in qemu.
1112  * This might actually run the qemu handlers right away,
1113  * so virtio in qemu must be completely setup when this is called.
1114  */
1115 void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1116 {
1117     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1118     VirtioBusState *vbus = VIRTIO_BUS(qbus);
1119     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
1120     int i, r;
1121
1122     for (i = 0; i < hdev->nvqs; ++i) {
1123         r = k->set_host_notifier(qbus->parent, hdev->vq_index + i, false);
1124         if (r < 0) {
1125             fprintf(stderr, "vhost VQ %d notifier cleanup failed: %d\n", i, -r);
1126             fflush(stderr);
1127         }
1128         assert (r >= 0);
1129     }
1130 }
1131
1132 /* Test and clear event pending status.
1133  * Should be called after unmask to avoid losing events.
1134  */
1135 bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
1136 {
1137     struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
1138     assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
1139     return event_notifier_test_and_clear(&vq->masked_notifier);
1140 }
1141
1142 /* Mask/unmask events from this vq. */
1143 void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
1144                          bool mask)
1145 {
1146     struct VirtQueue *vvq = virtio_get_queue(vdev, n);
1147     int r, index = n - hdev->vq_index;
1148     struct vhost_vring_file file;
1149
1150     if (mask) {
1151         file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
1152     } else {
1153         file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
1154     }
1155
1156     file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
1157     r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
1158     assert(r >= 0);
1159 }
1160
1161 uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
1162                             uint64_t features)
1163 {
1164     const int *bit = feature_bits;
1165     while (*bit != VHOST_INVALID_FEATURE_BIT) {
1166         uint64_t bit_mask = (1ULL << *bit);
1167         if (!(hdev->features & bit_mask)) {
1168             features &= ~bit_mask;
1169         }
1170         bit++;
1171     }
1172     return features;
1173 }
1174
1175 void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
1176                         uint64_t features)
1177 {
1178     const int *bit = feature_bits;
1179     while (*bit != VHOST_INVALID_FEATURE_BIT) {
1180         uint64_t bit_mask = (1ULL << *bit);
1181         if (features & bit_mask) {
1182             hdev->acked_features |= bit_mask;
1183         }
1184         bit++;
1185     }
1186 }
1187
1188 /* Host notifiers must be enabled at this point. */
1189 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
1190 {
1191     int i, r;
1192
1193     hdev->started = true;
1194
1195     r = vhost_dev_set_features(hdev, hdev->log_enabled);
1196     if (r < 0) {
1197         goto fail_features;
1198     }
1199     r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
1200     if (r < 0) {
1201         r = -errno;
1202         goto fail_mem;
1203     }
1204     for (i = 0; i < hdev->nvqs; ++i) {
1205         r = vhost_virtqueue_start(hdev,
1206                                   vdev,
1207                                   hdev->vqs + i,
1208                                   hdev->vq_index + i);
1209         if (r < 0) {
1210             goto fail_vq;
1211         }
1212     }
1213
1214     if (hdev->log_enabled) {
1215         uint64_t log_base;
1216
1217         hdev->log_size = vhost_get_log_size(hdev);
1218         hdev->log = vhost_log_get(hdev->log_size,
1219                                   vhost_dev_log_is_shared(hdev));
1220         log_base = (uintptr_t)hdev->log->log;
1221         r = hdev->vhost_ops->vhost_set_log_base(hdev,
1222                                                 hdev->log_size ? log_base : 0,
1223                                                 hdev->log);
1224         if (r < 0) {
1225             r = -errno;
1226             goto fail_log;
1227         }
1228     }
1229
1230     return 0;
1231 fail_log:
1232     vhost_log_put(hdev, false);
1233 fail_vq:
1234     while (--i >= 0) {
1235         vhost_virtqueue_stop(hdev,
1236                              vdev,
1237                              hdev->vqs + i,
1238                              hdev->vq_index + i);
1239     }
1240     i = hdev->nvqs;
1241 fail_mem:
1242 fail_features:
1243
1244     hdev->started = false;
1245     return r;
1246 }
1247
1248 /* Host notifiers must be enabled at this point. */
1249 void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
1250 {
1251     int i;
1252
1253     for (i = 0; i < hdev->nvqs; ++i) {
1254         vhost_virtqueue_stop(hdev,
1255                              vdev,
1256                              hdev->vqs + i,
1257                              hdev->vq_index + i);
1258     }
1259
1260     vhost_log_put(hdev, true);
1261     hdev->started = false;
1262     hdev->log = NULL;
1263     hdev->log_size = 0;
1264 }
1265