hw/vfio/container.c

   1 /*
   2  * generic functions used by VFIO devices
   3  *
   4  * Copyright Red Hat, Inc. 2012
   5  *
   6  * Authors:
   7  *  Alex Williamson <alex.williamson@redhat.com>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2.  See
  10  * the COPYING file in the top-level directory.
  11  *
  12  * Based on qemu-kvm device-assignment:
  13  *  Adapted for KVM by Qumranet.
  14  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
  15  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
  16  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
  17  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
  18  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
  19  */
  20
  21 #include "qemu/osdep.h"
  22 #include <sys/ioctl.h>
  23 #include <linux/vfio.h>
  24
  25 #include "hw/vfio/vfio-common.h"
  26 #include "exec/address-spaces.h"
  27 #include "exec/memory.h"
  28 #include "exec/ram_addr.h"
  29 #include "qemu/error-report.h"
  30 #include "qemu/range.h"
  31 #include "sysemu/reset.h"
  32 #include "trace.h"
  33 #include "qapi/error.h"
  34 #include "pci.h"
  35
/*
 * File-scope list of VFIO groups.  vfio_get_group() links newly connected
 * groups here and vfio_put_group() unlinks them again.
 */
VFIOGroupList vfio_group_list =
    QLIST_HEAD_INITIALIZER(vfio_group_list);
  38
  39 static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
  40 {
  41     switch (container->iommu_type) {
  42     case VFIO_TYPE1v2_IOMMU:
  43     case VFIO_TYPE1_IOMMU:
  44         /*
  45          * We support coordinated discarding of RAM via the RamDiscardManager.
  46          */
  47         return ram_block_uncoordinated_discard_disable(state);
  48     default:
  49         /*
  50          * VFIO_SPAPR_TCE_IOMMU most probably works just fine with
  51          * RamDiscardManager, however, it is completely untested.
  52          *
  53          * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does
  54          * completely the opposite of managing mapping/pinning dynamically as
  55          * required by RamDiscardManager. We would have to special-case sections
  56          * with a RamDiscardManager.
  57          */
  58         return ram_block_discard_disable(state);
  59     }
  60 }
  61
  62 static int vfio_dma_unmap_bitmap(const VFIOContainer *container,
  63                                  hwaddr iova, ram_addr_t size,
  64                                  IOMMUTLBEntry *iotlb)
  65 {
  66     const VFIOContainerBase *bcontainer = &container->bcontainer;
  67     struct vfio_iommu_type1_dma_unmap *unmap;
  68     struct vfio_bitmap *bitmap;
  69     VFIOBitmap vbmap;
  70     int ret;
  71
  72     ret = vfio_bitmap_alloc(&vbmap, size);
  73     if (ret) {
  74         return ret;
  75     }
  76
  77     unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
  78
  79     unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
  80     unmap->iova = iova;
  81     unmap->size = size;
  82     unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
  83     bitmap = (struct vfio_bitmap *)&unmap->data;
  84
  85     /*
  86      * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
  87      * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
  88      * to qemu_real_host_page_size.
  89      */
  90     bitmap->pgsize = qemu_real_host_page_size();
  91     bitmap->size = vbmap.size;
  92     bitmap->data = (__u64 *)vbmap.bitmap;
  93
  94     if (vbmap.size > bcontainer->max_dirty_bitmap_size) {
  95         error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size);
  96         ret = -E2BIG;
  97         goto unmap_exit;
  98     }
  99
 100     ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
 101     if (!ret) {
 102         cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap,
 103                 iotlb->translated_addr, vbmap.pages);
 104     } else {
 105         error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
 106     }
 107
 108 unmap_exit:
 109     g_free(unmap);
 110     g_free(vbmap.bitmap);
 111
 112     return ret;
 113 }
 114
 115 /*
 116  * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
 117  */
 118 static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
 119                                  hwaddr iova, ram_addr_t size,
 120                                  IOMMUTLBEntry *iotlb)
 121 {
 122     const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
 123                                                   bcontainer);
 124     struct vfio_iommu_type1_dma_unmap unmap = {
 125         .argsz = sizeof(unmap),
 126         .flags = 0,
 127         .iova = iova,
 128         .size = size,
 129     };
 130     bool need_dirty_sync = false;
 131     int ret;
 132     Error *local_err = NULL;
 133
 134     if (iotlb && vfio_devices_all_running_and_mig_active(bcontainer)) {
 135         if (!vfio_devices_all_device_dirty_tracking(bcontainer) &&
 136             bcontainer->dirty_pages_supported) {
 137             return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
 138         }
 139
 140         need_dirty_sync = true;
 141     }
 142
 143     while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
 144         /*
 145          * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
 146          * v4.15) where an overflow in its wrap-around check prevents us from
 147          * unmapping the last page of the address space.  Test for the error
 148          * condition and re-try the unmap excluding the last page.  The
 149          * expectation is that we've never mapped the last page anyway and this
 150          * unmap request comes via vIOMMU support which also makes it unlikely
 151          * that this page is used.  This bug was introduced well after type1 v2
 152          * support was introduced, so we shouldn't need to test for v1.  A fix
 153          * is queued for kernel v5.0 so this workaround can be removed once
 154          * affected kernels are sufficiently deprecated.
 155          */
 156         if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
 157             container->iommu_type == VFIO_TYPE1v2_IOMMU) {
 158             trace_vfio_legacy_dma_unmap_overflow_workaround();
 159             unmap.size -= 1ULL << ctz64(bcontainer->pgsizes);
 160             continue;
 161         }
 162         error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
 163         return -errno;
 164     }
 165
 166     if (need_dirty_sync) {
 167         ret = vfio_get_dirty_bitmap(bcontainer, iova, size,
 168                                     iotlb->translated_addr, &local_err);
 169         if (ret) {
 170             error_report_err(local_err);
 171             return ret;
 172         }
 173     }
 174
 175     return 0;
 176 }
 177
 178 static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
 179                                ram_addr_t size, void *vaddr, bool readonly)
 180 {
 181     const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
 182                                                   bcontainer);
 183     struct vfio_iommu_type1_dma_map map = {
 184         .argsz = sizeof(map),
 185         .flags = VFIO_DMA_MAP_FLAG_READ,
 186         .vaddr = (__u64)(uintptr_t)vaddr,
 187         .iova = iova,
 188         .size = size,
 189     };
 190
 191     if (!readonly) {
 192         map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
 193     }
 194
 195     /*
 196      * Try the mapping, if it fails with EBUSY, unmap the region and try
 197      * again.  This shouldn't be necessary, but we sometimes see it in
 198      * the VGA ROM space.
 199      */
 200     if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
 201         (errno == EBUSY &&
 202          vfio_legacy_dma_unmap(bcontainer, iova, size, NULL) == 0 &&
 203          ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
 204         return 0;
 205     }
 206
 207     error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
 208     return -errno;
 209 }
 210
 211 static int
 212 vfio_legacy_set_dirty_page_tracking(const VFIOContainerBase *bcontainer,
 213                                     bool start, Error **errp)
 214 {
 215     const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
 216                                                   bcontainer);
 217     int ret;
 218     struct vfio_iommu_type1_dirty_bitmap dirty = {
 219         .argsz = sizeof(dirty),
 220     };
 221
 222     if (start) {
 223         dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
 224     } else {
 225         dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
 226     }
 227
 228     ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
 229     if (ret) {
 230         ret = -errno;
 231         error_setg_errno(errp, errno, "Failed to set dirty tracking flag 0x%x",
 232                          dirty.flags);
 233     }
 234
 235     return ret;
 236 }
 237
 238 static int vfio_legacy_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
 239                       VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp)
 240 {
 241     const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
 242                                                   bcontainer);
 243     struct vfio_iommu_type1_dirty_bitmap *dbitmap;
 244     struct vfio_iommu_type1_dirty_bitmap_get *range;
 245     int ret;
 246
 247     dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
 248
 249     dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
 250     dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
 251     range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
 252     range->iova = iova;
 253     range->size = size;
 254
 255     /*
 256      * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
 257      * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize
 258      * to qemu_real_host_page_size.
 259      */
 260     range->bitmap.pgsize = qemu_real_host_page_size();
 261     range->bitmap.size = vbmap->size;
 262     range->bitmap.data = (__u64 *)vbmap->bitmap;
 263
 264     ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
 265     if (ret) {
 266         ret = -errno;
 267         error_setg_errno(errp, errno,
 268                          "Failed to get dirty bitmap for iova: 0x%"PRIx64
 269                          " size: 0x%"PRIx64, (uint64_t)range->iova,
 270                          (uint64_t)range->size);
 271     }
 272
 273     g_free(dbitmap);
 274
 275     return ret;
 276 }
 277
 278 static struct vfio_info_cap_header *
 279 vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
 280 {
 281     if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
 282         return NULL;
 283     }
 284
 285     return vfio_get_cap((void *)info, info->cap_offset, id);
 286 }
 287
 288 bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
 289                              unsigned int *avail)
 290 {
 291     struct vfio_info_cap_header *hdr;
 292     struct vfio_iommu_type1_info_dma_avail *cap;
 293
 294     /* If the capability cannot be found, assume no DMA limiting */
 295     hdr = vfio_get_iommu_type1_info_cap(info,
 296                                         VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
 297     if (!hdr) {
 298         return false;
 299     }
 300
 301     if (avail != NULL) {
 302         cap = (void *) hdr;
 303         *avail = cap->avail;
 304     }
 305
 306     return true;
 307 }
 308
 309 static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info,
 310                                      VFIOContainerBase *bcontainer)
 311 {
 312     struct vfio_info_cap_header *hdr;
 313     struct vfio_iommu_type1_info_cap_iova_range *cap;
 314
 315     hdr = vfio_get_iommu_type1_info_cap(info,
 316                                         VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE);
 317     if (!hdr) {
 318         return false;
 319     }
 320
 321     cap = (void *)hdr;
 322
 323     for (int i = 0; i < cap->nr_iovas; i++) {
 324         Range *range = g_new(Range, 1);
 325
 326         range_set_bounds(range, cap->iova_ranges[i].start,
 327                          cap->iova_ranges[i].end);
 328         bcontainer->iova_ranges =
 329             range_list_insert(bcontainer->iova_ranges, range);
 330     }
 331
 332     return true;
 333 }
 334
 335 static void vfio_kvm_device_add_group(VFIOGroup *group)
 336 {
 337     Error *err = NULL;
 338
 339     if (vfio_kvm_device_add_fd(group->fd, &err)) {
 340         error_reportf_err(err, "group ID %d: ", group->groupid);
 341     }
 342 }
 343
 344 static void vfio_kvm_device_del_group(VFIOGroup *group)
 345 {
 346     Error *err = NULL;
 347
 348     if (vfio_kvm_device_del_fd(group->fd, &err)) {
 349         error_reportf_err(err, "group ID %d: ", group->groupid);
 350     }
 351 }
 352
 353 /*
 354  * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
 355  */
 356 static int vfio_get_iommu_type(int container_fd,
 357                                Error **errp)
 358 {
 359     int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
 360                           VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
 361     int i;
 362
 363     for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
 364         if (ioctl(container_fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
 365             return iommu_types[i];
 366         }
 367     }
 368     error_setg(errp, "No available IOMMU models");
 369     return -EINVAL;
 370 }
 371
 372 /*
 373  * vfio_get_iommu_ops - get a VFIOIOMMUClass associated with a type
 374  */
 375 static const char *vfio_get_iommu_class_name(int iommu_type)
 376 {
 377     switch (iommu_type) {
 378     case VFIO_TYPE1v2_IOMMU:
 379     case VFIO_TYPE1_IOMMU:
 380         return TYPE_VFIO_IOMMU_LEGACY;
 381         break;
 382     case VFIO_SPAPR_TCE_v2_IOMMU:
 383     case VFIO_SPAPR_TCE_IOMMU:
 384         return TYPE_VFIO_IOMMU_SPAPR;
 385         break;
 386     default:
 387         g_assert_not_reached();
 388     };
 389 }
 390
 391 static bool vfio_set_iommu(int container_fd, int group_fd,
 392                            int *iommu_type, Error **errp)
 393 {
 394     if (ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container_fd)) {
 395         error_setg_errno(errp, errno, "Failed to set group container");
 396         return false;
 397     }
 398
 399     while (ioctl(container_fd, VFIO_SET_IOMMU, *iommu_type)) {
 400         if (*iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
 401             /*
 402              * On sPAPR, despite the IOMMU subdriver always advertises v1 and
 403              * v2, the running platform may not support v2 and there is no
 404              * way to guess it until an IOMMU group gets added to the container.
 405              * So in case it fails with v2, try v1 as a fallback.
 406              */
 407             *iommu_type = VFIO_SPAPR_TCE_IOMMU;
 408             continue;
 409         }
 410         error_setg_errno(errp, errno, "Failed to set iommu for container");
 411         return false;
 412     }
 413
 414     return true;
 415 }
 416
 417 static VFIOContainer *vfio_create_container(int fd, VFIOGroup *group,
 418                                             Error **errp)
 419 {
 420     int iommu_type;
 421     const char *vioc_name;
 422     VFIOContainer *container;
 423
 424     iommu_type = vfio_get_iommu_type(fd, errp);
 425     if (iommu_type < 0) {
 426         return NULL;
 427     }
 428
 429     if (!vfio_set_iommu(fd, group->fd, &iommu_type, errp)) {
 430         return NULL;
 431     }
 432
 433     vioc_name = vfio_get_iommu_class_name(iommu_type);
 434
 435     container = VFIO_IOMMU_LEGACY(object_new(vioc_name));
 436     container->fd = fd;
 437     container->iommu_type = iommu_type;
 438     return container;
 439 }
 440
 441 static int vfio_get_iommu_info(VFIOContainer *container,
 442                                struct vfio_iommu_type1_info **info)
 443 {
 444
 445     size_t argsz = sizeof(struct vfio_iommu_type1_info);
 446
 447     *info = g_new0(struct vfio_iommu_type1_info, 1);
 448 again:
 449     (*info)->argsz = argsz;
 450
 451     if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
 452         g_free(*info);
 453         *info = NULL;
 454         return -errno;
 455     }
 456
 457     if (((*info)->argsz > argsz)) {
 458         argsz = (*info)->argsz;
 459         *info = g_realloc(*info, argsz);
 460         goto again;
 461     }
 462
 463     return 0;
 464 }
 465
 466 static struct vfio_info_cap_header *
 467 vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
 468 {
 469     struct vfio_info_cap_header *hdr;
 470     void *ptr = info;
 471
 472     if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
 473         return NULL;
 474     }
 475
 476     for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
 477         if (hdr->id == id) {
 478             return hdr;
 479         }
 480     }
 481
 482     return NULL;
 483 }
 484
 485 static void vfio_get_iommu_info_migration(VFIOContainer *container,
 486                                           struct vfio_iommu_type1_info *info)
 487 {
 488     struct vfio_info_cap_header *hdr;
 489     struct vfio_iommu_type1_info_cap_migration *cap_mig;
 490     VFIOContainerBase *bcontainer = &container->bcontainer;
 491
 492     hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
 493     if (!hdr) {
 494         return;
 495     }
 496
 497     cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
 498                             header);
 499
 500     /*
 501      * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
 502      * qemu_real_host_page_size to mark those dirty.
 503      */
 504     if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
 505         bcontainer->dirty_pages_supported = true;
 506         bcontainer->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
 507         bcontainer->dirty_pgsizes = cap_mig->pgsize_bitmap;
 508     }
 509 }
 510
 511 static bool vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp)
 512 {
 513     VFIOContainer *container = container_of(bcontainer, VFIOContainer,
 514                                             bcontainer);
 515     g_autofree struct vfio_iommu_type1_info *info = NULL;
 516     int ret;
 517
 518     ret = vfio_get_iommu_info(container, &info);
 519     if (ret) {
 520         error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
 521         return false;
 522     }
 523
 524     if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
 525         bcontainer->pgsizes = info->iova_pgsizes;
 526     } else {
 527         bcontainer->pgsizes = qemu_real_host_page_size();
 528     }
 529
 530     if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) {
 531         bcontainer->dma_max_mappings = 65535;
 532     }
 533
 534     vfio_get_info_iova_range(info, bcontainer);
 535
 536     vfio_get_iommu_info_migration(container, info);
 537     return true;
 538 }
 539
/*
 * Attach @group to a container within the address space @as.
 *
 * Containers already present in the address space are tried first: the
 * first one that accepts the group via VFIO_GROUP_SET_CONTAINER is reused.
 * Otherwise a new container is opened on /dev/vfio/vfio, created, set up,
 * inserted into the address space and given a memory listener.
 *
 * Returns true on success.  On failure returns false; the failing step is
 * expected to have filled in @errp.
 */
static bool vfio_connect_container(VFIOGroup *group, AddressSpace *as,
                                   Error **errp)
{
    VFIOContainer *container;
    VFIOContainerBase *bcontainer;
    int ret, fd;
    VFIOAddressSpace *space;
    VFIOIOMMUClass *vioc;

    space = vfio_get_address_space(as);

    /*
     * VFIO is currently incompatible with discarding of RAM insofar as the
     * madvise to purge (zap) the page from QEMU's address space does not
     * interact with the memory API and therefore leaves stale virtual to
     * physical mappings in the IOMMU if the page was previously pinned.  We
     * therefore set discarding broken for each group added to a container,
     * whether the container is used individually or shared.  This provides
     * us with options to allow devices within a group to opt-in and allow
     * discarding, so long as it is done consistently for a group (for instance
     * if the device is an mdev device where it is known that the host vendor
     * driver will never pin pages outside of the working set of the guest
     * driver, which would thus not be discarding candidates).
     *
     * The first opportunity to induce pinning occurs here where we attempt to
     * attach the group to existing containers within the AddressSpace.  If any
     * pages are already zapped from the virtual address space, such as from
     * previous discards, new pinning will cause valid mappings to be
     * re-established.  Likewise, when the overall MemoryListener for a new
     * container is registered, a replay of mappings within the AddressSpace
     * will occur, re-establishing any previously zapped pages as well.
     *
     * Especially virtio-balloon is currently only prevented from discarding
     * new memory, it will not yet set ram_block_discard_set_required() and
     * therefore, neither stops us here or deals with the sudden memory
     * consumption of inflated memory.
     *
     * We do support discarding of memory coordinated via the RamDiscardManager
     * with some IOMMU types. vfio_ram_block_discard_disable() handles the
     * details once we know which type of IOMMU we are using.
     */

    /* Fast path: try to join a container that already exists in @space. */
    QLIST_FOREACH(bcontainer, &space->containers, next) {
        container = container_of(bcontainer, VFIOContainer, bcontainer);
        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
            ret = vfio_ram_block_discard_disable(container, true);
            if (ret) {
                error_setg_errno(errp, -ret,
                                 "Cannot set discarding of RAM broken");
                /* Undo the SET_CONTAINER that just succeeded. */
                if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
                          &container->fd)) {
                    error_report("vfio: error disconnecting group %d from"
                                 " container", group->groupid);
                }
                return false;
            }
            group->container = container;
            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
            vfio_kvm_device_add_group(group);
            return true;
        }
    }

    /* Slow path: no existing container took the group, build a new one. */
    fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp);
    if (fd < 0) {
        goto put_space_exit;
    }

    ret = ioctl(fd, VFIO_GET_API_VERSION);
    if (ret != VFIO_API_VERSION) {
        error_setg(errp, "supported vfio version: %d, "
                   "reported version: %d", VFIO_API_VERSION, ret);
        goto close_fd_exit;
    }

    container = vfio_create_container(fd, group, errp);
    if (!container) {
        goto close_fd_exit;
    }
    bcontainer = &container->bcontainer;

    if (!vfio_cpr_register_container(bcontainer, errp)) {
        goto free_container_exit;
    }

    ret = vfio_ram_block_discard_disable(container, true);
    if (ret) {
        error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
        goto unregister_container_exit;
    }

    vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
    assert(vioc->setup);

    if (!vioc->setup(bcontainer, errp)) {
        goto enable_discards_exit;
    }

    vfio_kvm_device_add_group(group);

    vfio_address_space_insert(space, bcontainer);

    group->container = container;
    QLIST_INSERT_HEAD(&container->group_list, group, container_next);

    bcontainer->listener = vfio_memory_listener;
    memory_listener_register(&bcontainer->listener, bcontainer->space->as);

    /* Checked right after registration to catch listener setup failures. */
    if (bcontainer->error) {
        error_propagate_prepend(errp, bcontainer->error,
            "memory listener initialization failed: ");
        goto listener_release_exit;
    }

    bcontainer->initialized = true;

    return true;

    /*
     * Error unwinding: each label undoes one setup step and falls through
     * to the labels below it, so steps are reverted in reverse order.
     */
listener_release_exit:
    QLIST_REMOVE(group, container_next);
    vfio_kvm_device_del_group(group);
    memory_listener_unregister(&bcontainer->listener);
    if (vioc->release) {
        vioc->release(bcontainer);
    }

enable_discards_exit:
    vfio_ram_block_discard_disable(container, false);

unregister_container_exit:
    vfio_cpr_unregister_container(bcontainer);

free_container_exit:
    object_unref(container);

close_fd_exit:
    close(fd);

put_space_exit:
    vfio_put_address_space(space);

    return false;
}
 682
/*
 * Detach @group from its container.  When @group was the last group on the
 * container, the container itself is torn down as well: its listener is
 * unregistered, its fd closed, its object reference dropped and the address
 * space reference put.
 */
static void vfio_disconnect_container(VFIOGroup *group)
{
    VFIOContainer *container = group->container;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    /*
     * Explicitly release the listener first before unset container,
     * since unset may destroy the backend container if it's the last
     * group.
     */
    if (QLIST_EMPTY(&container->group_list)) {
        memory_listener_unregister(&bcontainer->listener);
        if (vioc->release) {
            vioc->release(bcontainer);
        }
    }

    /* A failure to unset is reported but does not stop the teardown. */
    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container",
                     group->groupid);
    }

    if (QLIST_EMPTY(&container->group_list)) {
        /* Saved up front: bcontainer lives inside the object unref'd below. */
        VFIOAddressSpace *space = bcontainer->space;

        trace_vfio_disconnect_container(container->fd);
        vfio_cpr_unregister_container(bcontainer);
        close(container->fd);
        object_unref(container);

        vfio_put_address_space(space);
    }
}
 720
 721 static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
 722 {
 723     ERRP_GUARD();
 724     VFIOGroup *group;
 725     char path[32];
 726     struct vfio_group_status status = { .argsz = sizeof(status) };
 727
 728     QLIST_FOREACH(group, &vfio_group_list, next) {
 729         if (group->groupid == groupid) {
 730             /* Found it.  Now is it already in the right context? */
 731             if (group->container->bcontainer.space->as == as) {
 732                 return group;
 733             } else {
 734                 error_setg(errp, "group %d used in multiple address spaces",
 735                            group->groupid);
 736                 return NULL;
 737             }
 738         }
 739     }
 740
 741     group = g_malloc0(sizeof(*group));
 742
 743     snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
 744     group->fd = qemu_open(path, O_RDWR, errp);
 745     if (group->fd < 0) {
 746         goto free_group_exit;
 747     }
 748
 749     if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
 750         error_setg_errno(errp, errno, "failed to get group %d status", groupid);
 751         goto close_fd_exit;
 752     }
 753
 754     if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
 755         error_setg(errp, "group %d is not viable", groupid);
 756         error_append_hint(errp,
 757                           "Please ensure all devices within the iommu_group "
 758                           "are bound to their vfio bus driver.\n");
 759         goto close_fd_exit;
 760     }
 761
 762     group->groupid = groupid;
 763     QLIST_INIT(&group->device_list);
 764
 765     if (!vfio_connect_container(group, as, errp)) {
 766         error_prepend(errp, "failed to setup container for group %d: ",
 767                       groupid);
 768         goto close_fd_exit;
 769     }
 770
 771     QLIST_INSERT_HEAD(&vfio_group_list, group, next);
 772
 773     return group;
 774
 775 close_fd_exit:
 776     close(group->fd);
 777
 778 free_group_exit:
 779     g_free(group);
 780
 781     return NULL;
 782 }
 783
 784 static void vfio_put_group(VFIOGroup *group)
 785 {
 786     if (!group || !QLIST_EMPTY(&group->device_list)) {
 787         return;
 788     }
 789
 790     if (!group->ram_block_discard_allowed) {
 791         vfio_ram_block_discard_disable(group->container, false);
 792     }
 793     vfio_kvm_device_del_group(group);
 794     vfio_disconnect_container(group);
 795     QLIST_REMOVE(group, next);
 796     trace_vfio_put_group(group->fd);
 797     close(group->fd);
 798     g_free(group);
 799 }
 800
 801 static bool vfio_get_device(VFIOGroup *group, const char *name,
 802                             VFIODevice *vbasedev, Error **errp)
 803 {
 804     g_autofree struct vfio_device_info *info = NULL;
 805     int fd;
 806
 807     fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
 808     if (fd < 0) {
 809         error_setg_errno(errp, errno, "error getting device from group %d",
 810                          group->groupid);
 811         error_append_hint(errp,
 812                       "Verify all devices in group %d are bound to vfio-<bus> "
 813                       "or pci-stub and not already in use\n", group->groupid);
 814         return false;
 815     }
 816
 817     info = vfio_get_device_info(fd);
 818     if (!info) {
 819         error_setg_errno(errp, errno, "error getting device info");
 820         close(fd);
 821         return false;
 822     }
 823
 824     /*
 825      * Set discarding of RAM as not broken for this group if the driver knows
 826      * the device operates compatibly with discarding.  Setting must be
 827      * consistent per group, but since compatibility is really only possible
 828      * with mdev currently, we expect singleton groups.
 829      */
 830     if (vbasedev->ram_block_discard_allowed !=
 831         group->ram_block_discard_allowed) {
 832         if (!QLIST_EMPTY(&group->device_list)) {
 833             error_setg(errp, "Inconsistent setting of support for discarding "
 834                        "RAM (e.g., balloon) within group");
 835             close(fd);
 836             return false;
 837         }
 838
 839         if (!group->ram_block_discard_allowed) {
 840             group->ram_block_discard_allowed = true;
 841             vfio_ram_block_discard_disable(group->container, false);
 842         }
 843     }
 844
 845     vbasedev->fd = fd;
 846     vbasedev->group = group;
 847     QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
 848
 849     vbasedev->num_irqs = info->num_irqs;
 850     vbasedev->num_regions = info->num_regions;
 851     vbasedev->flags = info->flags;
 852
 853     trace_vfio_get_device(name, info->flags, info->num_regions, info->num_irqs);
 854
 855     vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);
 856
 857     return true;
 858 }
 859
 860 static void vfio_put_base_device(VFIODevice *vbasedev)
 861 {
 862     if (!vbasedev->group) {
 863         return;
 864     }
 865     QLIST_REMOVE(vbasedev, next);
 866     vbasedev->group = NULL;
 867     trace_vfio_put_base_device(vbasedev->fd);
 868     close(vbasedev->fd);
 869 }
 870
 871 static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp)
 872 {
 873     char *tmp, group_path[PATH_MAX];
 874     g_autofree char *group_name = NULL;
 875     int ret, groupid;
 876     ssize_t len;
 877
 878     tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev);
 879     len = readlink(tmp, group_path, sizeof(group_path));
 880     g_free(tmp);
 881
 882     if (len <= 0 || len >= sizeof(group_path)) {
 883         ret = len < 0 ? -errno : -ENAMETOOLONG;
 884         error_setg_errno(errp, -ret, "no iommu_group found");
 885         return ret;
 886     }
 887
 888     group_path[len] = 0;
 889
 890     group_name = g_path_get_basename(group_path);
 891     if (sscanf(group_name, "%d", &groupid) != 1) {
 892         error_setg_errno(errp, errno, "failed to read %s", group_path);
 893         return -errno;
 894     }
 895     return groupid;
 896 }
 897
 898 /*
 899  * vfio_attach_device: attach a device to a security context
 900  * @name and @vbasedev->name are likely to be different depending
 901  * on the type of the device, hence the need for passing @name
 902  */
 903 static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
 904                                       AddressSpace *as, Error **errp)
 905 {
 906     int groupid = vfio_device_groupid(vbasedev, errp);
 907     VFIODevice *vbasedev_iter;
 908     VFIOGroup *group;
 909     VFIOContainerBase *bcontainer;
 910
 911     if (groupid < 0) {
 912         return false;
 913     }
 914
 915     trace_vfio_attach_device(vbasedev->name, groupid);
 916
 917     if (!vfio_device_hiod_realize(vbasedev, errp)) {
 918         return false;
 919     }
 920
 921     group = vfio_get_group(groupid, as, errp);
 922     if (!group) {
 923         return false;
 924     }
 925
 926     QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
 927         if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
 928             error_setg(errp, "device is already attached");
 929             vfio_put_group(group);
 930             return false;
 931         }
 932     }
 933     if (!vfio_get_device(group, name, vbasedev, errp)) {
 934         vfio_put_group(group);
 935         return false;
 936     }
 937
 938     bcontainer = &group->container->bcontainer;
 939     vbasedev->bcontainer = bcontainer;
 940     QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
 941     QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
 942
 943     return true;
 944 }
 945
 946 static void vfio_legacy_detach_device(VFIODevice *vbasedev)
 947 {
 948     VFIOGroup *group = vbasedev->group;
 949
 950     QLIST_REMOVE(vbasedev, global_next);
 951     QLIST_REMOVE(vbasedev, container_next);
 952     vbasedev->bcontainer = NULL;
 953     trace_vfio_detach_device(vbasedev->name, group->groupid);
 954     vfio_put_base_device(vbasedev);
 955     vfio_put_group(group);
 956 }
 957
 958 static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single)
 959 {
 960     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
 961     VFIOGroup *group;
 962     struct vfio_pci_hot_reset_info *info = NULL;
 963     struct vfio_pci_dependent_device *devices;
 964     struct vfio_pci_hot_reset *reset;
 965     int32_t *fds;
 966     int ret, i, count;
 967     bool multi = false;
 968
 969     trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
 970
 971     if (!single) {
 972         vfio_pci_pre_reset(vdev);
 973     }
 974     vdev->vbasedev.needs_reset = false;
 975
 976     ret = vfio_pci_get_pci_hot_reset_info(vdev, &info);
 977
 978     if (ret) {
 979         goto out_single;
 980     }
 981     devices = &info->devices[0];
 982
 983     trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
 984
 985     /* Verify that we have all the groups required */
 986     for (i = 0; i < info->count; i++) {
 987         PCIHostDeviceAddress host;
 988         VFIOPCIDevice *tmp;
 989         VFIODevice *vbasedev_iter;
 990
 991         host.domain = devices[i].segment;
 992         host.bus = devices[i].bus;
 993         host.slot = PCI_SLOT(devices[i].devfn);
 994         host.function = PCI_FUNC(devices[i].devfn);
 995
 996         trace_vfio_pci_hot_reset_dep_devices(host.domain,
 997                 host.bus, host.slot, host.function, devices[i].group_id);
 998
 999         if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
1000             continue;
1001         }
1002
1003         QLIST_FOREACH(group, &vfio_group_list, next) {
1004             if (group->groupid == devices[i].group_id) {
1005                 break;
1006             }
1007         }
1008
1009         if (!group) {
1010             if (!vdev->has_pm_reset) {
1011                 error_report("vfio: Cannot reset device %s, "
1012                              "depends on group %d which is not owned.",
1013                              vdev->vbasedev.name, devices[i].group_id);
1014             }
1015             ret = -EPERM;
1016             goto out;
1017         }
1018
1019         /* Prep dependent devices for reset and clear our marker. */
1020         QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
1021             if (!vbasedev_iter->dev->realized ||
1022                 vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
1023                 continue;
1024             }
1025             tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
1026             if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
1027                 if (single) {
1028                     ret = -EINVAL;
1029                     goto out_single;
1030                 }
1031                 vfio_pci_pre_reset(tmp);
1032                 tmp->vbasedev.needs_reset = false;
1033                 multi = true;
1034                 break;
1035             }
1036         }
1037     }
1038
1039     if (!single && !multi) {
1040         ret = -EINVAL;
1041         goto out_single;
1042     }
1043
1044     /* Determine how many group fds need to be passed */
1045     count = 0;
1046     QLIST_FOREACH(group, &vfio_group_list, next) {
1047         for (i = 0; i < info->count; i++) {
1048             if (group->groupid == devices[i].group_id) {
1049                 count++;
1050                 break;
1051             }
1052         }
1053     }
1054
1055     reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
1056     reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
1057     fds = &reset->group_fds[0];
1058
1059     /* Fill in group fds */
1060     QLIST_FOREACH(group, &vfio_group_list, next) {
1061         for (i = 0; i < info->count; i++) {
1062             if (group->groupid == devices[i].group_id) {
1063                 fds[reset->count++] = group->fd;
1064                 break;
1065             }
1066         }
1067     }
1068
1069     /* Bus reset! */
1070     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
1071     g_free(reset);
1072     if (ret) {
1073         ret = -errno;
1074     }
1075
1076     trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
1077                                     ret ? strerror(errno) : "Success");
1078
1079 out:
1080     /* Re-enable INTx on affected devices */
1081     for (i = 0; i < info->count; i++) {
1082         PCIHostDeviceAddress host;
1083         VFIOPCIDevice *tmp;
1084         VFIODevice *vbasedev_iter;
1085
1086         host.domain = devices[i].segment;
1087         host.bus = devices[i].bus;
1088         host.slot = PCI_SLOT(devices[i].devfn);
1089         host.function = PCI_FUNC(devices[i].devfn);
1090
1091         if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
1092             continue;
1093         }
1094
1095         QLIST_FOREACH(group, &vfio_group_list, next) {
1096             if (group->groupid == devices[i].group_id) {
1097                 break;
1098             }
1099         }
1100
1101         if (!group) {
1102             break;
1103         }
1104
1105         QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
1106             if (!vbasedev_iter->dev->realized ||
1107                 vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
1108                 continue;
1109             }
1110             tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
1111             if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
1112                 vfio_pci_post_reset(tmp);
1113                 break;
1114             }
1115         }
1116     }
1117 out_single:
1118     if (!single) {
1119         vfio_pci_post_reset(vdev);
1120     }
1121     g_free(info);
1122
1123     return ret;
1124 }
1125
1126 static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data)
1127 {
1128     VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
1129
1130     vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO;
1131
1132     vioc->setup = vfio_legacy_setup;
1133     vioc->dma_map = vfio_legacy_dma_map;
1134     vioc->dma_unmap = vfio_legacy_dma_unmap;
1135     vioc->attach_device = vfio_legacy_attach_device;
1136     vioc->detach_device = vfio_legacy_detach_device;
1137     vioc->set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking;
1138     vioc->query_dirty_bitmap = vfio_legacy_query_dirty_bitmap;
1139     vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
1140 };
1141
1142 static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque,
1143                                      Error **errp)
1144 {
1145     VFIODevice *vdev = opaque;
1146
1147     hiod->name = g_strdup(vdev->name);
1148     hiod->agent = opaque;
1149
1150     return true;
1151 }
1152
1153 static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap,
1154                                     Error **errp)
1155 {
1156     switch (cap) {
1157     case HOST_IOMMU_DEVICE_CAP_AW_BITS:
1158         return vfio_device_get_aw_bits(hiod->agent);
1159     default:
1160         error_setg(errp, "%s: unsupported capability %x", hiod->name, cap);
1161         return -EINVAL;
1162     }
1163 }
1164
1165 static GList *
1166 hiod_legacy_vfio_get_iova_ranges(HostIOMMUDevice *hiod)
1167 {
1168     VFIODevice *vdev = hiod->agent;
1169
1170     g_assert(vdev);
1171     return vfio_container_get_iova_ranges(vdev->bcontainer);
1172 }
1173
1174 static uint64_t
1175 hiod_legacy_vfio_get_page_size_mask(HostIOMMUDevice *hiod)
1176 {
1177     VFIODevice *vdev = hiod->agent;
1178
1179     g_assert(vdev);
1180     return vfio_container_get_page_size_mask(vdev->bcontainer);
1181 }
1182
1183 static void vfio_iommu_legacy_instance_init(Object *obj)
1184 {
1185     VFIOContainer *container = VFIO_IOMMU_LEGACY(obj);
1186
1187     QLIST_INIT(&container->group_list);
1188 }
1189
1190 static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data)
1191 {
1192     HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);
1193
1194     hioc->realize = hiod_legacy_vfio_realize;
1195     hioc->get_cap = hiod_legacy_vfio_get_cap;
1196     hioc->get_iova_ranges = hiod_legacy_vfio_get_iova_ranges;
1197     hioc->get_page_size_mask = hiod_legacy_vfio_get_page_size_mask;
1198 };
1199
1200 static const TypeInfo types[] = {
1201     {
1202         .name = TYPE_VFIO_IOMMU_LEGACY,
1203         .parent = TYPE_VFIO_IOMMU,
1204         .instance_init = vfio_iommu_legacy_instance_init,
1205         .instance_size = sizeof(VFIOContainer),
1206         .class_init = vfio_iommu_legacy_class_init,
1207     }, {
1208         .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO,
1209         .parent = TYPE_HOST_IOMMU_DEVICE,
1210         .class_init = hiod_legacy_vfio_class_init,
1211     }
1212 };
1213
1214 DEFINE_TYPES(types)