hw/vfio/container.c

   1 /*
   2  * generic functions used by VFIO devices
   3  *
   4  * Copyright Red Hat, Inc. 2012
   5  *
   6  * Authors:
   7  *  Alex Williamson <alex.williamson@redhat.com>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2.  See
  10  * the COPYING file in the top-level directory.
  11  *
  12  * Based on qemu-kvm device-assignment:
  13  *  Adapted for KVM by Qumranet.
  14  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
  15  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
  16  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
  17  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
  18  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
  19  */
  20
  21 #include "qemu/osdep.h"
  22 #include <sys/ioctl.h>
  23 #include <linux/vfio.h>
  24
  25 #include "hw/vfio/vfio-common.h"
  26 #include "exec/address-spaces.h"
  27 #include "exec/memory.h"
  28 #include "exec/ram_addr.h"
  29 #include "hw/hw.h"
  30 #include "qemu/error-report.h"
  31 #include "qemu/range.h"
  32 #include "sysemu/reset.h"
  33 #include "trace.h"
  34 #include "qapi/error.h"
  35 #include "migration/migration.h"
  36
  37 VFIOGroupList vfio_group_list =
  38     QLIST_HEAD_INITIALIZER(vfio_group_list);
  39
  40 static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
  41 {
  42     switch (container->iommu_type) {
  43     case VFIO_TYPE1v2_IOMMU:
  44     case VFIO_TYPE1_IOMMU:
  45         /*
  46          * We support coordinated discarding of RAM via the RamDiscardManager.
  47          */
  48         return ram_block_uncoordinated_discard_disable(state);
  49     default:
  50         /*
  51          * VFIO_SPAPR_TCE_IOMMU most probably works just fine with
  52          * RamDiscardManager, however, it is completely untested.
  53          *
  54          * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does
  55          * completely the opposite of managing mapping/pinning dynamically as
  56          * required by RamDiscardManager. We would have to special-case sections
  57          * with a RamDiscardManager.
  58          */
  59         return ram_block_discard_disable(state);
  60     }
  61 }
  62
  63 static int vfio_dma_unmap_bitmap(VFIOContainer *container,
  64                                  hwaddr iova, ram_addr_t size,
  65                                  IOMMUTLBEntry *iotlb)
  66 {
  67     struct vfio_iommu_type1_dma_unmap *unmap;
  68     struct vfio_bitmap *bitmap;
  69     VFIOBitmap vbmap;
  70     int ret;
  71
  72     ret = vfio_bitmap_alloc(&vbmap, size);
  73     if (ret) {
  74         return ret;
  75     }
  76
  77     unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
  78
  79     unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
  80     unmap->iova = iova;
  81     unmap->size = size;
  82     unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
  83     bitmap = (struct vfio_bitmap *)&unmap->data;
  84
  85     /*
  86      * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
  87      * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
  88      * to qemu_real_host_page_size.
  89      */
  90     bitmap->pgsize = qemu_real_host_page_size();
  91     bitmap->size = vbmap.size;
  92     bitmap->data = (__u64 *)vbmap.bitmap;
  93
  94     if (vbmap.size > container->max_dirty_bitmap_size) {
  95         error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size);
  96         ret = -E2BIG;
  97         goto unmap_exit;
  98     }
  99
 100     ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
 101     if (!ret) {
 102         cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap,
 103                 iotlb->translated_addr, vbmap.pages);
 104     } else {
 105         error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
 106     }
 107
 108 unmap_exit:
 109     g_free(unmap);
 110     g_free(vbmap.bitmap);
 111
 112     return ret;
 113 }
 114
 115 /*
 116  * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
 117  */
 118 int vfio_dma_unmap(VFIOContainer *container, hwaddr iova,
 119                    ram_addr_t size, IOMMUTLBEntry *iotlb)
 120 {
 121     struct vfio_iommu_type1_dma_unmap unmap = {
 122         .argsz = sizeof(unmap),
 123         .flags = 0,
 124         .iova = iova,
 125         .size = size,
 126     };
 127     bool need_dirty_sync = false;
 128     int ret;
 129
 130     if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
 131         if (!vfio_devices_all_device_dirty_tracking(container) &&
 132             container->dirty_pages_supported) {
 133             return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
 134         }
 135
 136         need_dirty_sync = true;
 137     }
 138
 139     while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
 140         /*
 141          * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
 142          * v4.15) where an overflow in its wrap-around check prevents us from
 143          * unmapping the last page of the address space.  Test for the error
 144          * condition and re-try the unmap excluding the last page.  The
 145          * expectation is that we've never mapped the last page anyway and this
 146          * unmap request comes via vIOMMU support which also makes it unlikely
 147          * that this page is used.  This bug was introduced well after type1 v2
 148          * support was introduced, so we shouldn't need to test for v1.  A fix
 149          * is queued for kernel v5.0 so this workaround can be removed once
 150          * affected kernels are sufficiently deprecated.
 151          */
 152         if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
 153             container->iommu_type == VFIO_TYPE1v2_IOMMU) {
 154             trace_vfio_dma_unmap_overflow_workaround();
 155             unmap.size -= 1ULL << ctz64(container->pgsizes);
 156             continue;
 157         }
 158         error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
 159         return -errno;
 160     }
 161
 162     if (need_dirty_sync) {
 163         ret = vfio_get_dirty_bitmap(container, iova, size,
 164                                     iotlb->translated_addr);
 165         if (ret) {
 166             return ret;
 167         }
 168     }
 169
 170     return 0;
 171 }
 172
 173 int vfio_dma_map(VFIOContainer *container, hwaddr iova,
 174                  ram_addr_t size, void *vaddr, bool readonly)
 175 {
 176     struct vfio_iommu_type1_dma_map map = {
 177         .argsz = sizeof(map),
 178         .flags = VFIO_DMA_MAP_FLAG_READ,
 179         .vaddr = (__u64)(uintptr_t)vaddr,
 180         .iova = iova,
 181         .size = size,
 182     };
 183
 184     if (!readonly) {
 185         map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
 186     }
 187
 188     /*
 189      * Try the mapping, if it fails with EBUSY, unmap the region and try
 190      * again.  This shouldn't be necessary, but we sometimes see it in
 191      * the VGA ROM space.
 192      */
 193     if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
 194         (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
 195          ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
 196         return 0;
 197     }
 198
 199     error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
 200     return -errno;
 201 }
 202
 203 int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
 204 {
 205     int ret;
 206     struct vfio_iommu_type1_dirty_bitmap dirty = {
 207         .argsz = sizeof(dirty),
 208     };
 209
 210     if (!container->dirty_pages_supported) {
 211         return 0;
 212     }
 213
 214     if (start) {
 215         dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
 216     } else {
 217         dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
 218     }
 219
 220     ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
 221     if (ret) {
 222         ret = -errno;
 223         error_report("Failed to set dirty tracking flag 0x%x errno: %d",
 224                      dirty.flags, errno);
 225     }
 226
 227     return ret;
 228 }
 229
 230 int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap,
 231                             hwaddr iova, hwaddr size)
 232 {
 233     struct vfio_iommu_type1_dirty_bitmap *dbitmap;
 234     struct vfio_iommu_type1_dirty_bitmap_get *range;
 235     int ret;
 236
 237     dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
 238
 239     dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
 240     dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
 241     range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
 242     range->iova = iova;
 243     range->size = size;
 244
 245     /*
 246      * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
 247      * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize
 248      * to qemu_real_host_page_size.
 249      */
 250     range->bitmap.pgsize = qemu_real_host_page_size();
 251     range->bitmap.size = vbmap->size;
 252     range->bitmap.data = (__u64 *)vbmap->bitmap;
 253
 254     ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
 255     if (ret) {
 256         ret = -errno;
 257         error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64
 258                 " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
 259                 (uint64_t)range->size, errno);
 260     }
 261
 262     g_free(dbitmap);
 263
 264     return ret;
 265 }
 266
 267 static struct vfio_info_cap_header *
 268 vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
 269 {
 270     if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
 271         return NULL;
 272     }
 273
 274     return vfio_get_cap((void *)info, info->cap_offset, id);
 275 }
 276
 277 bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
 278                              unsigned int *avail)
 279 {
 280     struct vfio_info_cap_header *hdr;
 281     struct vfio_iommu_type1_info_dma_avail *cap;
 282
 283     /* If the capability cannot be found, assume no DMA limiting */
 284     hdr = vfio_get_iommu_type1_info_cap(info,
 285                                         VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
 286     if (!hdr) {
 287         return false;
 288     }
 289
 290     if (avail != NULL) {
 291         cap = (void *) hdr;
 292         *avail = cap->avail;
 293     }
 294
 295     return true;
 296 }
 297
 298 static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info,
 299                                      VFIOContainer *container)
 300 {
 301     struct vfio_info_cap_header *hdr;
 302     struct vfio_iommu_type1_info_cap_iova_range *cap;
 303
 304     hdr = vfio_get_iommu_type1_info_cap(info,
 305                                         VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE);
 306     if (!hdr) {
 307         return false;
 308     }
 309
 310     cap = (void *)hdr;
 311
 312     for (int i = 0; i < cap->nr_iovas; i++) {
 313         Range *range = g_new(Range, 1);
 314
 315         range_set_bounds(range, cap->iova_ranges[i].start,
 316                          cap->iova_ranges[i].end);
 317         container->iova_ranges =
 318             range_list_insert(container->iova_ranges, range);
 319     }
 320
 321     return true;
 322 }
 323
 324 static void vfio_kvm_device_add_group(VFIOGroup *group)
 325 {
 326     Error *err = NULL;
 327
 328     if (vfio_kvm_device_add_fd(group->fd, &err)) {
 329         error_reportf_err(err, "group ID %d: ", group->groupid);
 330     }
 331 }
 332
 333 static void vfio_kvm_device_del_group(VFIOGroup *group)
 334 {
 335     Error *err = NULL;
 336
 337     if (vfio_kvm_device_del_fd(group->fd, &err)) {
 338         error_reportf_err(err, "group ID %d: ", group->groupid);
 339     }
 340 }
 341
 342 /*
 343  * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
 344  */
 345 static int vfio_get_iommu_type(VFIOContainer *container,
 346                                Error **errp)
 347 {
 348     int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
 349                           VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
 350     int i;
 351
 352     for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
 353         if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
 354             return iommu_types[i];
 355         }
 356     }
 357     error_setg(errp, "No available IOMMU models");
 358     return -EINVAL;
 359 }
 360
 361 static int vfio_init_container(VFIOContainer *container, int group_fd,
 362                                Error **errp)
 363 {
 364     int iommu_type, ret;
 365
 366     iommu_type = vfio_get_iommu_type(container, errp);
 367     if (iommu_type < 0) {
 368         return iommu_type;
 369     }
 370
 371     ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
 372     if (ret) {
 373         error_setg_errno(errp, errno, "Failed to set group container");
 374         return -errno;
 375     }
 376
 377     while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
 378         if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
 379             /*
 380              * On sPAPR, despite the IOMMU subdriver always advertises v1 and
 381              * v2, the running platform may not support v2 and there is no
 382              * way to guess it until an IOMMU group gets added to the container.
 383              * So in case it fails with v2, try v1 as a fallback.
 384              */
 385             iommu_type = VFIO_SPAPR_TCE_IOMMU;
 386             continue;
 387         }
 388         error_setg_errno(errp, errno, "Failed to set iommu for container");
 389         return -errno;
 390     }
 391
 392     container->iommu_type = iommu_type;
 393     return 0;
 394 }
 395
 396 static int vfio_get_iommu_info(VFIOContainer *container,
 397                                struct vfio_iommu_type1_info **info)
 398 {
 399
 400     size_t argsz = sizeof(struct vfio_iommu_type1_info);
 401
 402     *info = g_new0(struct vfio_iommu_type1_info, 1);
 403 again:
 404     (*info)->argsz = argsz;
 405
 406     if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
 407         g_free(*info);
 408         *info = NULL;
 409         return -errno;
 410     }
 411
 412     if (((*info)->argsz > argsz)) {
 413         argsz = (*info)->argsz;
 414         *info = g_realloc(*info, argsz);
 415         goto again;
 416     }
 417
 418     return 0;
 419 }
 420
 421 static struct vfio_info_cap_header *
 422 vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
 423 {
 424     struct vfio_info_cap_header *hdr;
 425     void *ptr = info;
 426
 427     if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
 428         return NULL;
 429     }
 430
 431     for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
 432         if (hdr->id == id) {
 433             return hdr;
 434         }
 435     }
 436
 437     return NULL;
 438 }
 439
 440 static void vfio_get_iommu_info_migration(VFIOContainer *container,
 441                                           struct vfio_iommu_type1_info *info)
 442 {
 443     struct vfio_info_cap_header *hdr;
 444     struct vfio_iommu_type1_info_cap_migration *cap_mig;
 445
 446     hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
 447     if (!hdr) {
 448         return;
 449     }
 450
 451     cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
 452                             header);
 453
 454     /*
 455      * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
 456      * qemu_real_host_page_size to mark those dirty.
 457      */
 458     if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
 459         container->dirty_pages_supported = true;
 460         container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
 461         container->dirty_pgsizes = cap_mig->pgsize_bitmap;
 462     }
 463 }
 464
 465 static void vfio_free_container(VFIOContainer *container)
 466 {
 467     g_list_free_full(container->iova_ranges, g_free);
 468     g_free(container);
 469 }
 470
 471 static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
 472                                   Error **errp)
 473 {
 474     VFIOContainer *container;
 475     int ret, fd;
 476     VFIOAddressSpace *space;
 477
 478     space = vfio_get_address_space(as);
 479
 480     /*
 481      * VFIO is currently incompatible with discarding of RAM insofar as the
 482      * madvise to purge (zap) the page from QEMU's address space does not
 483      * interact with the memory API and therefore leaves stale virtual to
 484      * physical mappings in the IOMMU if the page was previously pinned.  We
 485      * therefore set discarding broken for each group added to a container,
 486      * whether the container is used individually or shared.  This provides
 487      * us with options to allow devices within a group to opt-in and allow
 488      * discarding, so long as it is done consistently for a group (for instance
 489      * if the device is an mdev device where it is known that the host vendor
 490      * driver will never pin pages outside of the working set of the guest
 491      * driver, which would thus not be discarding candidates).
 492      *
 493      * The first opportunity to induce pinning occurs here where we attempt to
 494      * attach the group to existing containers within the AddressSpace.  If any
 495      * pages are already zapped from the virtual address space, such as from
 496      * previous discards, new pinning will cause valid mappings to be
 497      * re-established.  Likewise, when the overall MemoryListener for a new
 498      * container is registered, a replay of mappings within the AddressSpace
 499      * will occur, re-establishing any previously zapped pages as well.
 500      *
 501      * Especially virtio-balloon is currently only prevented from discarding
 502      * new memory, it will not yet set ram_block_discard_set_required() and
 503      * therefore, neither stops us here or deals with the sudden memory
 504      * consumption of inflated memory.
 505      *
 506      * We do support discarding of memory coordinated via the RamDiscardManager
 507      * with some IOMMU types. vfio_ram_block_discard_disable() handles the
 508      * details once we know which type of IOMMU we are using.
 509      */
 510
 511     QLIST_FOREACH(container, &space->containers, next) {
 512         if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
 513             ret = vfio_ram_block_discard_disable(container, true);
 514             if (ret) {
 515                 error_setg_errno(errp, -ret,
 516                                  "Cannot set discarding of RAM broken");
 517                 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
 518                           &container->fd)) {
 519                     error_report("vfio: error disconnecting group %d from"
 520                                  " container", group->groupid);
 521                 }
 522                 return ret;
 523             }
 524             group->container = container;
 525             QLIST_INSERT_HEAD(&container->group_list, group, container_next);
 526             vfio_kvm_device_add_group(group);
 527             return 0;
 528         }
 529     }
 530
 531     fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
 532     if (fd < 0) {
 533         error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
 534         ret = -errno;
 535         goto put_space_exit;
 536     }
 537
 538     ret = ioctl(fd, VFIO_GET_API_VERSION);
 539     if (ret != VFIO_API_VERSION) {
 540         error_setg(errp, "supported vfio version: %d, "
 541                    "reported version: %d", VFIO_API_VERSION, ret);
 542         ret = -EINVAL;
 543         goto close_fd_exit;
 544     }
 545
 546     container = g_malloc0(sizeof(*container));
 547     container->space = space;
 548     container->fd = fd;
 549     container->error = NULL;
 550     container->dirty_pages_supported = false;
 551     container->dma_max_mappings = 0;
 552     container->iova_ranges = NULL;
 553     QLIST_INIT(&container->giommu_list);
 554     QLIST_INIT(&container->vrdl_list);
 555
 556     ret = vfio_init_container(container, group->fd, errp);
 557     if (ret) {
 558         goto free_container_exit;
 559     }
 560
 561     ret = vfio_ram_block_discard_disable(container, true);
 562     if (ret) {
 563         error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
 564         goto free_container_exit;
 565     }
 566
 567     switch (container->iommu_type) {
 568     case VFIO_TYPE1v2_IOMMU:
 569     case VFIO_TYPE1_IOMMU:
 570     {
 571         struct vfio_iommu_type1_info *info;
 572
 573         ret = vfio_get_iommu_info(container, &info);
 574         if (ret) {
 575             error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
 576             goto enable_discards_exit;
 577         }
 578
 579         if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
 580             container->pgsizes = info->iova_pgsizes;
 581         } else {
 582             container->pgsizes = qemu_real_host_page_size();
 583         }
 584
 585         if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) {
 586             container->dma_max_mappings = 65535;
 587         }
 588
 589         vfio_get_info_iova_range(info, container);
 590
 591         vfio_get_iommu_info_migration(container, info);
 592         g_free(info);
 593         break;
 594     }
 595     case VFIO_SPAPR_TCE_v2_IOMMU:
 596     case VFIO_SPAPR_TCE_IOMMU:
 597     {
 598         ret = vfio_spapr_container_init(container, errp);
 599         if (ret) {
 600             goto enable_discards_exit;
 601         }
 602         break;
 603     }
 604     }
 605
 606     vfio_kvm_device_add_group(group);
 607
 608     QLIST_INIT(&container->group_list);
 609     QLIST_INSERT_HEAD(&space->containers, container, next);
 610
 611     group->container = container;
 612     QLIST_INSERT_HEAD(&container->group_list, group, container_next);
 613
 614     container->listener = vfio_memory_listener;
 615
 616     memory_listener_register(&container->listener, container->space->as);
 617
 618     if (container->error) {
 619         ret = -1;
 620         error_propagate_prepend(errp, container->error,
 621             "memory listener initialization failed: ");
 622         goto listener_release_exit;
 623     }
 624
 625     container->initialized = true;
 626
 627     return 0;
 628 listener_release_exit:
 629     QLIST_REMOVE(group, container_next);
 630     QLIST_REMOVE(container, next);
 631     vfio_kvm_device_del_group(group);
 632     memory_listener_unregister(&container->listener);
 633     if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
 634         container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
 635         vfio_spapr_container_deinit(container);
 636     }
 637
 638 enable_discards_exit:
 639     vfio_ram_block_discard_disable(container, false);
 640
 641 free_container_exit:
 642     vfio_free_container(container);
 643
 644 close_fd_exit:
 645     close(fd);
 646
 647 put_space_exit:
 648     vfio_put_address_space(space);
 649
 650     return ret;
 651 }
 652
 653 static void vfio_disconnect_container(VFIOGroup *group)
 654 {
 655     VFIOContainer *container = group->container;
 656
 657     QLIST_REMOVE(group, container_next);
 658     group->container = NULL;
 659
 660     /*
 661      * Explicitly release the listener first before unset container,
 662      * since unset may destroy the backend container if it's the last
 663      * group.
 664      */
 665     if (QLIST_EMPTY(&container->group_list)) {
 666         memory_listener_unregister(&container->listener);
 667         if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
 668             container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
 669             vfio_spapr_container_deinit(container);
 670         }
 671     }
 672
 673     if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
 674         error_report("vfio: error disconnecting group %d from container",
 675                      group->groupid);
 676     }
 677
 678     if (QLIST_EMPTY(&container->group_list)) {
 679         VFIOAddressSpace *space = container->space;
 680         VFIOGuestIOMMU *giommu, *tmp;
 681
 682         QLIST_REMOVE(container, next);
 683
 684         QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
 685             memory_region_unregister_iommu_notifier(
 686                     MEMORY_REGION(giommu->iommu_mr), &giommu->n);
 687             QLIST_REMOVE(giommu, giommu_next);
 688             g_free(giommu);
 689         }
 690
 691         trace_vfio_disconnect_container(container->fd);
 692         close(container->fd);
 693         vfio_free_container(container);
 694
 695         vfio_put_address_space(space);
 696     }
 697 }
 698
 699 static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
 700 {
 701     VFIOGroup *group;
 702     char path[32];
 703     struct vfio_group_status status = { .argsz = sizeof(status) };
 704
 705     QLIST_FOREACH(group, &vfio_group_list, next) {
 706         if (group->groupid == groupid) {
 707             /* Found it.  Now is it already in the right context? */
 708             if (group->container->space->as == as) {
 709                 return group;
 710             } else {
 711                 error_setg(errp, "group %d used in multiple address spaces",
 712                            group->groupid);
 713                 return NULL;
 714             }
 715         }
 716     }
 717
 718     group = g_malloc0(sizeof(*group));
 719
 720     snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
 721     group->fd = qemu_open_old(path, O_RDWR);
 722     if (group->fd < 0) {
 723         error_setg_errno(errp, errno, "failed to open %s", path);
 724         goto free_group_exit;
 725     }
 726
 727     if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
 728         error_setg_errno(errp, errno, "failed to get group %d status", groupid);
 729         goto close_fd_exit;
 730     }
 731
 732     if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
 733         error_setg(errp, "group %d is not viable", groupid);
 734         error_append_hint(errp,
 735                           "Please ensure all devices within the iommu_group "
 736                           "are bound to their vfio bus driver.\n");
 737         goto close_fd_exit;
 738     }
 739
 740     group->groupid = groupid;
 741     QLIST_INIT(&group->device_list);
 742
 743     if (vfio_connect_container(group, as, errp)) {
 744         error_prepend(errp, "failed to setup container for group %d: ",
 745                       groupid);
 746         goto close_fd_exit;
 747     }
 748
 749     QLIST_INSERT_HEAD(&vfio_group_list, group, next);
 750
 751     return group;
 752
 753 close_fd_exit:
 754     close(group->fd);
 755
 756 free_group_exit:
 757     g_free(group);
 758
 759     return NULL;
 760 }
 761
 762 static void vfio_put_group(VFIOGroup *group)
 763 {
 764     if (!group || !QLIST_EMPTY(&group->device_list)) {
 765         return;
 766     }
 767
 768     if (!group->ram_block_discard_allowed) {
 769         vfio_ram_block_discard_disable(group->container, false);
 770     }
 771     vfio_kvm_device_del_group(group);
 772     vfio_disconnect_container(group);
 773     QLIST_REMOVE(group, next);
 774     trace_vfio_put_group(group->fd);
 775     close(group->fd);
 776     g_free(group);
 777 }
 778
 779 static int vfio_get_device(VFIOGroup *group, const char *name,
 780                            VFIODevice *vbasedev, Error **errp)
 781 {
 782     g_autofree struct vfio_device_info *info = NULL;
 783     int fd;
 784
 785     fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
 786     if (fd < 0) {
 787         error_setg_errno(errp, errno, "error getting device from group %d",
 788                          group->groupid);
 789         error_append_hint(errp,
 790                       "Verify all devices in group %d are bound to vfio-<bus> "
 791                       "or pci-stub and not already in use\n", group->groupid);
 792         return fd;
 793     }
 794
 795     info = vfio_get_device_info(fd);
 796     if (!info) {
 797         error_setg_errno(errp, errno, "error getting device info");
 798         close(fd);
 799         return -1;
 800     }
 801
 802     /*
 803      * Set discarding of RAM as not broken for this group if the driver knows
 804      * the device operates compatibly with discarding.  Setting must be
 805      * consistent per group, but since compatibility is really only possible
 806      * with mdev currently, we expect singleton groups.
 807      */
 808     if (vbasedev->ram_block_discard_allowed !=
 809         group->ram_block_discard_allowed) {
 810         if (!QLIST_EMPTY(&group->device_list)) {
 811             error_setg(errp, "Inconsistent setting of support for discarding "
 812                        "RAM (e.g., balloon) within group");
 813             close(fd);
 814             return -1;
 815         }
 816
 817         if (!group->ram_block_discard_allowed) {
 818             group->ram_block_discard_allowed = true;
 819             vfio_ram_block_discard_disable(group->container, false);
 820         }
 821     }
 822
 823     vbasedev->fd = fd;
 824     vbasedev->group = group;
 825     QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
 826
 827     vbasedev->num_irqs = info->num_irqs;
 828     vbasedev->num_regions = info->num_regions;
 829     vbasedev->flags = info->flags;
 830
 831     trace_vfio_get_device(name, info->flags, info->num_regions, info->num_irqs);
 832
 833     vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);
 834
 835     return 0;
 836 }
 837
 838 static void vfio_put_base_device(VFIODevice *vbasedev)
 839 {
 840     if (!vbasedev->group) {
 841         return;
 842     }
 843     QLIST_REMOVE(vbasedev, next);
 844     vbasedev->group = NULL;
 845     trace_vfio_put_base_device(vbasedev->fd);
 846     close(vbasedev->fd);
 847 }
 848
 849 static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp)
 850 {
 851     char *tmp, group_path[PATH_MAX], *group_name;
 852     int ret, groupid;
 853     ssize_t len;
 854
 855     tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev);
 856     len = readlink(tmp, group_path, sizeof(group_path));
 857     g_free(tmp);
 858
 859     if (len <= 0 || len >= sizeof(group_path)) {
 860         ret = len < 0 ? -errno : -ENAMETOOLONG;
 861         error_setg_errno(errp, -ret, "no iommu_group found");
 862         return ret;
 863     }
 864
 865     group_path[len] = 0;
 866
 867     group_name = basename(group_path);
 868     if (sscanf(group_name, "%d", &groupid) != 1) {
 869         error_setg_errno(errp, errno, "failed to read %s", group_path);
 870         return -errno;
 871     }
 872     return groupid;
 873 }
 874
 875 /*
 876  * vfio_attach_device: attach a device to a security context
 877  * @name and @vbasedev->name are likely to be different depending
 878  * on the type of the device, hence the need for passing @name
 879  */
 880 int vfio_attach_device(char *name, VFIODevice *vbasedev,
 881                        AddressSpace *as, Error **errp)
 882 {
 883     int groupid = vfio_device_groupid(vbasedev, errp);
 884     VFIODevice *vbasedev_iter;
 885     VFIOGroup *group;
 886     VFIOContainer *container;
 887     int ret;
 888
 889     if (groupid < 0) {
 890         return groupid;
 891     }
 892
 893     trace_vfio_attach_device(vbasedev->name, groupid);
 894
 895     group = vfio_get_group(groupid, as, errp);
 896     if (!group) {
 897         return -ENOENT;
 898     }
 899
 900     QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
 901         if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
 902             error_setg(errp, "device is already attached");
 903             vfio_put_group(group);
 904             return -EBUSY;
 905         }
 906     }
 907     ret = vfio_get_device(group, name, vbasedev, errp);
 908     if (ret) {
 909         vfio_put_group(group);
 910         return ret;
 911     }
 912
 913     container = group->container;
 914     vbasedev->container = container;
 915     QLIST_INSERT_HEAD(&container->device_list, vbasedev, container_next);
 916     QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
 917
 918     return ret;
 919 }
 920
 921 void vfio_detach_device(VFIODevice *vbasedev)
 922 {
 923     VFIOGroup *group = vbasedev->group;
 924
 925     if (!vbasedev->container) {
 926         return;
 927     }
 928
 929     QLIST_REMOVE(vbasedev, global_next);
 930     QLIST_REMOVE(vbasedev, container_next);
 931     vbasedev->container = NULL;
 932     trace_vfio_detach_device(vbasedev->name, group->groupid);
 933     vfio_put_base_device(vbasedev);
 934     vfio_put_group(group);
 935 }